diff --git "a/lora_model/trainer_state.json" "b/lora_model/trainer_state.json" new file mode 100644--- /dev/null +++ "b/lora_model/trainer_state.json" @@ -0,0 +1,220533 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.8538424950737245, + "eval_steps": 500, + "global_step": 31500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 9.05981744467849e-05, + "grad_norm": 1.427058458328247, + "learning_rate": 4e-05, + "loss": 3.3886, + "step": 1 + }, + { + "epoch": 0.0001811963488935698, + "grad_norm": 1.4716960191726685, + "learning_rate": 8e-05, + "loss": 3.3668, + "step": 2 + }, + { + "epoch": 0.0002717945233403547, + "grad_norm": 1.3622956275939941, + "learning_rate": 0.00012, + "loss": 3.3405, + "step": 3 + }, + { + "epoch": 0.0003623926977871396, + "grad_norm": 1.0335720777511597, + "learning_rate": 0.00016, + "loss": 3.4894, + "step": 4 + }, + { + "epoch": 0.0004529908722339245, + "grad_norm": 0.7868813872337341, + "learning_rate": 0.0002, + "loss": 3.437, + "step": 5 + }, + { + "epoch": 0.0005435890466807093, + "grad_norm": 0.5582625269889832, + "learning_rate": 0.00019999395879900927, + "loss": 3.2075, + "step": 6 + }, + { + "epoch": 0.0006341872211274943, + "grad_norm": 0.5376262068748474, + "learning_rate": 0.0001999879175980185, + "loss": 2.9002, + "step": 7 + }, + { + "epoch": 0.0007247853955742792, + "grad_norm": 0.6543612480163574, + "learning_rate": 0.00019998187639702774, + "loss": 3.3262, + "step": 8 + }, + { + "epoch": 0.000815383570021064, + "grad_norm": 0.5691677927970886, + "learning_rate": 0.00019997583519603697, + "loss": 3.3152, + "step": 9 + }, + { + "epoch": 0.000905981744467849, + "grad_norm": 0.618198812007904, + "learning_rate": 0.00019996979399504623, + "loss": 3.0659, + "step": 10 + }, + { + "epoch": 0.0009965799189146339, + "grad_norm": 0.6271547079086304, + "learning_rate": 0.00019996375279405547, + "loss": 2.9909, + "step": 11 + }, + { + "epoch": 0.0010871780933614187, + "grad_norm": 0.49664175510406494, + "learning_rate": 0.0001999577115930647, + "loss": 2.9858, + "step": 12 + }, + { + "epoch": 0.0011777762678082037, + "grad_norm": 0.4814058244228363, + "learning_rate": 0.00019995167039207396, + "loss": 3.0527, + "step": 13 + }, + { + "epoch": 0.0012683744422549886, + "grad_norm": 0.5140668153762817, + "learning_rate": 0.0001999456291910832, + "loss": 2.9155, + "step": 14 + }, + { + "epoch": 0.0013589726167017734, + "grad_norm": 0.513937771320343, + "learning_rate": 0.00019993958799009246, + "loss": 3.0776, + "step": 15 + }, + { + "epoch": 0.0014495707911485584, + "grad_norm": 0.5271328091621399, + "learning_rate": 0.00019993354678910167, + "loss": 2.8493, + "step": 16 + }, + { + "epoch": 0.0015401689655953432, + "grad_norm": 0.4912465810775757, + "learning_rate": 0.00019992750558811093, + "loss": 2.9306, + "step": 17 + }, + { + "epoch": 0.001630767140042128, + "grad_norm": 0.5179139375686646, + "learning_rate": 0.00019992146438712016, + "loss": 2.9674, + "step": 18 + }, + { + "epoch": 0.001721365314488913, + "grad_norm": 0.5023850798606873, + "learning_rate": 0.00019991542318612942, + "loss": 2.9437, + "step": 19 + }, + { + "epoch": 0.001811963488935698, + "grad_norm": 0.4648284614086151, + "learning_rate": 0.00019990938198513866, + "loss": 3.0561, + "step": 20 + }, + { + "epoch": 0.001902561663382483, + "grad_norm": 0.4750891327857971, + "learning_rate": 0.0001999033407841479, + "loss": 3.096, + "step": 21 + }, + { + "epoch": 0.0019931598378292678, + "grad_norm": 0.49825677275657654, + "learning_rate": 0.00019989729958315715, + "loss": 3.0668, + "step": 22 + }, + { + "epoch": 0.002083758012276053, + "grad_norm": 0.5512535572052002, + "learning_rate": 0.00019989125838216639, + "loss": 2.919, + "step": 23 + }, + { + "epoch": 0.0021743561867228374, + "grad_norm": 0.5149639248847961, + "learning_rate": 0.00019988521718117562, + "loss": 2.9641, + "step": 24 + }, + { + "epoch": 0.0022649543611696224, + "grad_norm": 0.5550311207771301, + "learning_rate": 0.00019987917598018485, + "loss": 3.0554, + "step": 25 + }, + { + "epoch": 0.0023555525356164075, + "grad_norm": 0.6160560846328735, + "learning_rate": 0.00019987313477919411, + "loss": 3.0635, + "step": 26 + }, + { + "epoch": 0.002446150710063192, + "grad_norm": 0.574560821056366, + "learning_rate": 0.00019986709357820335, + "loss": 3.3721, + "step": 27 + }, + { + "epoch": 0.002536748884509977, + "grad_norm": 0.4951487183570862, + "learning_rate": 0.0001998610523772126, + "loss": 2.918, + "step": 28 + }, + { + "epoch": 0.002627347058956762, + "grad_norm": 0.5972598791122437, + "learning_rate": 0.00019985501117622184, + "loss": 2.8386, + "step": 29 + }, + { + "epoch": 0.0027179452334035467, + "grad_norm": 0.509295642375946, + "learning_rate": 0.00019984896997523108, + "loss": 2.8933, + "step": 30 + }, + { + "epoch": 0.0028085434078503318, + "grad_norm": 0.5437778234481812, + "learning_rate": 0.00019984292877424034, + "loss": 3.086, + "step": 31 + }, + { + "epoch": 0.002899141582297117, + "grad_norm": 0.550157368183136, + "learning_rate": 0.00019983688757324957, + "loss": 2.9319, + "step": 32 + }, + { + "epoch": 0.0029897397567439014, + "grad_norm": 0.6062225103378296, + "learning_rate": 0.0001998308463722588, + "loss": 3.177, + "step": 33 + }, + { + "epoch": 0.0030803379311906865, + "grad_norm": 0.6056679487228394, + "learning_rate": 0.00019982480517126804, + "loss": 2.92, + "step": 34 + }, + { + "epoch": 0.0031709361056374715, + "grad_norm": 0.5432312488555908, + "learning_rate": 0.0001998187639702773, + "loss": 2.9591, + "step": 35 + }, + { + "epoch": 0.003261534280084256, + "grad_norm": 0.6323334574699402, + "learning_rate": 0.00019981272276928656, + "loss": 2.8306, + "step": 36 + }, + { + "epoch": 0.003352132454531041, + "grad_norm": 0.5385962128639221, + "learning_rate": 0.00019980668156829577, + "loss": 3.1731, + "step": 37 + }, + { + "epoch": 0.003442730628977826, + "grad_norm": 0.5677192211151123, + "learning_rate": 0.00019980064036730503, + "loss": 2.7439, + "step": 38 + }, + { + "epoch": 0.003533328803424611, + "grad_norm": 0.5695681571960449, + "learning_rate": 0.00019979459916631427, + "loss": 3.0921, + "step": 39 + }, + { + "epoch": 0.003623926977871396, + "grad_norm": 0.60052090883255, + "learning_rate": 0.00019978855796532353, + "loss": 3.0496, + "step": 40 + }, + { + "epoch": 0.003714525152318181, + "grad_norm": 0.5882009863853455, + "learning_rate": 0.00019978251676433276, + "loss": 2.9791, + "step": 41 + }, + { + "epoch": 0.003805123326764966, + "grad_norm": 0.5420721769332886, + "learning_rate": 0.000199776475563342, + "loss": 2.8156, + "step": 42 + }, + { + "epoch": 0.0038957215012117505, + "grad_norm": 0.5006678700447083, + "learning_rate": 0.00019977043436235126, + "loss": 2.8538, + "step": 43 + }, + { + "epoch": 0.0039863196756585355, + "grad_norm": 0.5474886894226074, + "learning_rate": 0.0001997643931613605, + "loss": 3.0178, + "step": 44 + }, + { + "epoch": 0.0040769178501053206, + "grad_norm": 0.5596446990966797, + "learning_rate": 0.00019975835196036975, + "loss": 3.3123, + "step": 45 + }, + { + "epoch": 0.004167516024552106, + "grad_norm": 0.6223183870315552, + "learning_rate": 0.00019975231075937896, + "loss": 3.2187, + "step": 46 + }, + { + "epoch": 0.00425811419899889, + "grad_norm": 0.5224504470825195, + "learning_rate": 0.00019974626955838822, + "loss": 2.8958, + "step": 47 + }, + { + "epoch": 0.004348712373445675, + "grad_norm": 0.5689476132392883, + "learning_rate": 0.00019974022835739745, + "loss": 2.8815, + "step": 48 + }, + { + "epoch": 0.00443931054789246, + "grad_norm": 0.5161295533180237, + "learning_rate": 0.00019973418715640671, + "loss": 2.8205, + "step": 49 + }, + { + "epoch": 0.004529908722339245, + "grad_norm": 0.537505030632019, + "learning_rate": 0.00019972814595541595, + "loss": 3.2635, + "step": 50 + }, + { + "epoch": 0.00462050689678603, + "grad_norm": 0.5462870001792908, + "learning_rate": 0.00019972210475442518, + "loss": 3.0835, + "step": 51 + }, + { + "epoch": 0.004711105071232815, + "grad_norm": 0.5316328406333923, + "learning_rate": 0.00019971606355343444, + "loss": 2.7372, + "step": 52 + }, + { + "epoch": 0.0048017032456796, + "grad_norm": 0.5843604207038879, + "learning_rate": 0.00019971002235244368, + "loss": 2.9475, + "step": 53 + }, + { + "epoch": 0.004892301420126384, + "grad_norm": 0.5239959955215454, + "learning_rate": 0.0001997039811514529, + "loss": 3.0477, + "step": 54 + }, + { + "epoch": 0.004982899594573169, + "grad_norm": 0.5794640779495239, + "learning_rate": 0.00019969793995046215, + "loss": 3.1692, + "step": 55 + }, + { + "epoch": 0.005073497769019954, + "grad_norm": 0.5477221608161926, + "learning_rate": 0.0001996918987494714, + "loss": 2.7049, + "step": 56 + }, + { + "epoch": 0.005164095943466739, + "grad_norm": 0.5956740975379944, + "learning_rate": 0.00019968585754848064, + "loss": 3.0422, + "step": 57 + }, + { + "epoch": 0.005254694117913524, + "grad_norm": 0.6243427395820618, + "learning_rate": 0.0001996798163474899, + "loss": 2.7136, + "step": 58 + }, + { + "epoch": 0.005345292292360309, + "grad_norm": 0.5694064497947693, + "learning_rate": 0.00019967377514649914, + "loss": 3.1383, + "step": 59 + }, + { + "epoch": 0.0054358904668070935, + "grad_norm": 0.7337490320205688, + "learning_rate": 0.00019966773394550837, + "loss": 2.9984, + "step": 60 + }, + { + "epoch": 0.0055264886412538785, + "grad_norm": 0.5578823685646057, + "learning_rate": 0.00019966169274451763, + "loss": 3.0583, + "step": 61 + }, + { + "epoch": 0.0056170868157006636, + "grad_norm": 0.5746583342552185, + "learning_rate": 0.00019965565154352687, + "loss": 2.8293, + "step": 62 + }, + { + "epoch": 0.005707684990147449, + "grad_norm": 0.5938541889190674, + "learning_rate": 0.0001996496103425361, + "loss": 3.1412, + "step": 63 + }, + { + "epoch": 0.005798283164594234, + "grad_norm": 0.6199020147323608, + "learning_rate": 0.00019964356914154533, + "loss": 2.7255, + "step": 64 + }, + { + "epoch": 0.005888881339041019, + "grad_norm": 0.5543753504753113, + "learning_rate": 0.0001996375279405546, + "loss": 2.8783, + "step": 65 + }, + { + "epoch": 0.005979479513487803, + "grad_norm": 0.588051974773407, + "learning_rate": 0.00019963148673956386, + "loss": 2.8509, + "step": 66 + }, + { + "epoch": 0.006070077687934588, + "grad_norm": 0.7401419281959534, + "learning_rate": 0.00019962544553857306, + "loss": 2.9297, + "step": 67 + }, + { + "epoch": 0.006160675862381373, + "grad_norm": 0.5526180267333984, + "learning_rate": 0.00019961940433758232, + "loss": 2.9459, + "step": 68 + }, + { + "epoch": 0.006251274036828158, + "grad_norm": 0.5567218065261841, + "learning_rate": 0.00019961336313659156, + "loss": 2.8688, + "step": 69 + }, + { + "epoch": 0.006341872211274943, + "grad_norm": 0.5389745831489563, + "learning_rate": 0.00019960732193560082, + "loss": 3.0383, + "step": 70 + }, + { + "epoch": 0.006432470385721728, + "grad_norm": 0.5673227906227112, + "learning_rate": 0.00019960128073461005, + "loss": 2.816, + "step": 71 + }, + { + "epoch": 0.006523068560168512, + "grad_norm": 0.7025559544563293, + "learning_rate": 0.0001995952395336193, + "loss": 2.9395, + "step": 72 + }, + { + "epoch": 0.006613666734615297, + "grad_norm": 0.5031510591506958, + "learning_rate": 0.00019958919833262855, + "loss": 2.8786, + "step": 73 + }, + { + "epoch": 0.006704264909062082, + "grad_norm": 0.5166547298431396, + "learning_rate": 0.00019958315713163778, + "loss": 2.8614, + "step": 74 + }, + { + "epoch": 0.006794863083508867, + "grad_norm": 0.530877411365509, + "learning_rate": 0.00019957711593064702, + "loss": 3.076, + "step": 75 + }, + { + "epoch": 0.006885461257955652, + "grad_norm": 0.5549136996269226, + "learning_rate": 0.00019957107472965625, + "loss": 3.1066, + "step": 76 + }, + { + "epoch": 0.006976059432402437, + "grad_norm": 0.4837000072002411, + "learning_rate": 0.0001995650335286655, + "loss": 2.6785, + "step": 77 + }, + { + "epoch": 0.007066657606849222, + "grad_norm": 0.530819296836853, + "learning_rate": 0.00019955899232767475, + "loss": 2.8009, + "step": 78 + }, + { + "epoch": 0.007157255781296007, + "grad_norm": 0.5464468002319336, + "learning_rate": 0.000199552951126684, + "loss": 2.9518, + "step": 79 + }, + { + "epoch": 0.007247853955742792, + "grad_norm": 0.5115012526512146, + "learning_rate": 0.00019954690992569324, + "loss": 2.9564, + "step": 80 + }, + { + "epoch": 0.007338452130189577, + "grad_norm": 0.5436180233955383, + "learning_rate": 0.00019954086872470248, + "loss": 2.8659, + "step": 81 + }, + { + "epoch": 0.007429050304636362, + "grad_norm": 0.5453410148620605, + "learning_rate": 0.00019953482752371174, + "loss": 3.2191, + "step": 82 + }, + { + "epoch": 0.007519648479083147, + "grad_norm": 0.5827404260635376, + "learning_rate": 0.00019952878632272097, + "loss": 3.0825, + "step": 83 + }, + { + "epoch": 0.007610246653529932, + "grad_norm": 0.528242290019989, + "learning_rate": 0.0001995227451217302, + "loss": 2.9942, + "step": 84 + }, + { + "epoch": 0.007700844827976716, + "grad_norm": 0.5175663828849792, + "learning_rate": 0.00019951670392073944, + "loss": 2.8026, + "step": 85 + }, + { + "epoch": 0.007791443002423501, + "grad_norm": 0.597581684589386, + "learning_rate": 0.0001995106627197487, + "loss": 2.8578, + "step": 86 + }, + { + "epoch": 0.007882041176870287, + "grad_norm": 0.6200677752494812, + "learning_rate": 0.00019950462151875793, + "loss": 3.0611, + "step": 87 + }, + { + "epoch": 0.007972639351317071, + "grad_norm": 0.5239960551261902, + "learning_rate": 0.00019949858031776717, + "loss": 2.9717, + "step": 88 + }, + { + "epoch": 0.008063237525763855, + "grad_norm": 0.5968737006187439, + "learning_rate": 0.00019949253911677643, + "loss": 2.9112, + "step": 89 + }, + { + "epoch": 0.008153835700210641, + "grad_norm": 0.5439954400062561, + "learning_rate": 0.00019948649791578566, + "loss": 2.8387, + "step": 90 + }, + { + "epoch": 0.008244433874657425, + "grad_norm": 0.5373879075050354, + "learning_rate": 0.00019948045671479492, + "loss": 3.044, + "step": 91 + }, + { + "epoch": 0.008335032049104211, + "grad_norm": 0.4913642108440399, + "learning_rate": 0.00019947441551380416, + "loss": 2.791, + "step": 92 + }, + { + "epoch": 0.008425630223550995, + "grad_norm": 0.5506791472434998, + "learning_rate": 0.0001994683743128134, + "loss": 2.6827, + "step": 93 + }, + { + "epoch": 0.00851622839799778, + "grad_norm": 0.5538280606269836, + "learning_rate": 0.00019946233311182263, + "loss": 3.1089, + "step": 94 + }, + { + "epoch": 0.008606826572444565, + "grad_norm": 0.5151476263999939, + "learning_rate": 0.0001994562919108319, + "loss": 2.9196, + "step": 95 + }, + { + "epoch": 0.00869742474689135, + "grad_norm": 0.6380730867385864, + "learning_rate": 0.00019945025070984115, + "loss": 2.841, + "step": 96 + }, + { + "epoch": 0.008788022921338135, + "grad_norm": 0.5157920718193054, + "learning_rate": 0.00019944420950885036, + "loss": 2.6108, + "step": 97 + }, + { + "epoch": 0.00887862109578492, + "grad_norm": 0.5093569755554199, + "learning_rate": 0.00019943816830785962, + "loss": 2.9993, + "step": 98 + }, + { + "epoch": 0.008969219270231706, + "grad_norm": 0.5530297756195068, + "learning_rate": 0.00019943212710686885, + "loss": 2.9668, + "step": 99 + }, + { + "epoch": 0.00905981744467849, + "grad_norm": 0.5907820463180542, + "learning_rate": 0.0001994260859058781, + "loss": 2.9235, + "step": 100 + }, + { + "epoch": 0.009150415619125274, + "grad_norm": 0.596688449382782, + "learning_rate": 0.00019942004470488732, + "loss": 2.7305, + "step": 101 + }, + { + "epoch": 0.00924101379357206, + "grad_norm": 0.5594925880432129, + "learning_rate": 0.00019941400350389658, + "loss": 3.0993, + "step": 102 + }, + { + "epoch": 0.009331611968018844, + "grad_norm": 0.5407630801200867, + "learning_rate": 0.00019940796230290584, + "loss": 2.8648, + "step": 103 + }, + { + "epoch": 0.00942221014246563, + "grad_norm": 0.5662005543708801, + "learning_rate": 0.00019940192110191508, + "loss": 2.8824, + "step": 104 + }, + { + "epoch": 0.009512808316912414, + "grad_norm": 0.5584583878517151, + "learning_rate": 0.0001993958799009243, + "loss": 2.915, + "step": 105 + }, + { + "epoch": 0.0096034064913592, + "grad_norm": 0.5593752861022949, + "learning_rate": 0.00019938983869993354, + "loss": 2.9326, + "step": 106 + }, + { + "epoch": 0.009694004665805984, + "grad_norm": 0.5400118231773376, + "learning_rate": 0.0001993837974989428, + "loss": 2.7608, + "step": 107 + }, + { + "epoch": 0.009784602840252768, + "grad_norm": 0.5732753872871399, + "learning_rate": 0.00019937775629795204, + "loss": 3.1522, + "step": 108 + }, + { + "epoch": 0.009875201014699554, + "grad_norm": 0.744576096534729, + "learning_rate": 0.0001993717150969613, + "loss": 2.8243, + "step": 109 + }, + { + "epoch": 0.009965799189146338, + "grad_norm": 0.5742713809013367, + "learning_rate": 0.00019936567389597053, + "loss": 2.9443, + "step": 110 + }, + { + "epoch": 0.010056397363593124, + "grad_norm": 0.5512884855270386, + "learning_rate": 0.00019935963269497977, + "loss": 2.8002, + "step": 111 + }, + { + "epoch": 0.010146995538039908, + "grad_norm": 0.5320309996604919, + "learning_rate": 0.00019935359149398903, + "loss": 2.8759, + "step": 112 + }, + { + "epoch": 0.010237593712486693, + "grad_norm": 0.5177123546600342, + "learning_rate": 0.00019934755029299826, + "loss": 2.8314, + "step": 113 + }, + { + "epoch": 0.010328191886933479, + "grad_norm": 0.5322889685630798, + "learning_rate": 0.0001993415090920075, + "loss": 3.03, + "step": 114 + }, + { + "epoch": 0.010418790061380263, + "grad_norm": 0.676275908946991, + "learning_rate": 0.00019933546789101673, + "loss": 2.6245, + "step": 115 + }, + { + "epoch": 0.010509388235827049, + "grad_norm": 0.5471661686897278, + "learning_rate": 0.000199329426690026, + "loss": 3.0456, + "step": 116 + }, + { + "epoch": 0.010599986410273833, + "grad_norm": 0.562449038028717, + "learning_rate": 0.00019932338548903523, + "loss": 3.1899, + "step": 117 + }, + { + "epoch": 0.010690584584720619, + "grad_norm": 0.7496834993362427, + "learning_rate": 0.00019931734428804446, + "loss": 2.9677, + "step": 118 + }, + { + "epoch": 0.010781182759167403, + "grad_norm": 0.5872028470039368, + "learning_rate": 0.00019931130308705372, + "loss": 3.0412, + "step": 119 + }, + { + "epoch": 0.010871780933614187, + "grad_norm": 0.5896986722946167, + "learning_rate": 0.00019930526188606296, + "loss": 3.1885, + "step": 120 + }, + { + "epoch": 0.010962379108060973, + "grad_norm": 0.647800862789154, + "learning_rate": 0.00019929922068507222, + "loss": 2.6299, + "step": 121 + }, + { + "epoch": 0.011052977282507757, + "grad_norm": 0.5394775867462158, + "learning_rate": 0.00019929317948408145, + "loss": 2.7744, + "step": 122 + }, + { + "epoch": 0.011143575456954543, + "grad_norm": 0.5200536847114563, + "learning_rate": 0.00019928713828309068, + "loss": 2.8276, + "step": 123 + }, + { + "epoch": 0.011234173631401327, + "grad_norm": 0.5355072617530823, + "learning_rate": 0.00019928109708209992, + "loss": 3.1172, + "step": 124 + }, + { + "epoch": 0.011324771805848111, + "grad_norm": 0.5322816967964172, + "learning_rate": 0.00019927505588110918, + "loss": 2.8691, + "step": 125 + }, + { + "epoch": 0.011415369980294897, + "grad_norm": 0.6345838308334351, + "learning_rate": 0.00019926901468011841, + "loss": 2.9856, + "step": 126 + }, + { + "epoch": 0.011505968154741681, + "grad_norm": 0.6060804724693298, + "learning_rate": 0.00019926297347912765, + "loss": 2.9877, + "step": 127 + }, + { + "epoch": 0.011596566329188467, + "grad_norm": 0.5863826870918274, + "learning_rate": 0.0001992569322781369, + "loss": 2.9337, + "step": 128 + }, + { + "epoch": 0.011687164503635251, + "grad_norm": 0.7752948999404907, + "learning_rate": 0.00019925089107714614, + "loss": 2.8049, + "step": 129 + }, + { + "epoch": 0.011777762678082037, + "grad_norm": 0.5464982390403748, + "learning_rate": 0.0001992448498761554, + "loss": 2.9878, + "step": 130 + }, + { + "epoch": 0.011868360852528822, + "grad_norm": 0.5578551888465881, + "learning_rate": 0.0001992388086751646, + "loss": 2.9246, + "step": 131 + }, + { + "epoch": 0.011958959026975606, + "grad_norm": 0.7007773518562317, + "learning_rate": 0.00019923276747417387, + "loss": 2.6197, + "step": 132 + }, + { + "epoch": 0.012049557201422392, + "grad_norm": 0.512057900428772, + "learning_rate": 0.00019922672627318313, + "loss": 2.7643, + "step": 133 + }, + { + "epoch": 0.012140155375869176, + "grad_norm": 0.5611405372619629, + "learning_rate": 0.00019922068507219237, + "loss": 2.8162, + "step": 134 + }, + { + "epoch": 0.012230753550315962, + "grad_norm": 0.7192994952201843, + "learning_rate": 0.0001992146438712016, + "loss": 3.0663, + "step": 135 + }, + { + "epoch": 0.012321351724762746, + "grad_norm": 0.606450617313385, + "learning_rate": 0.00019920860267021084, + "loss": 3.0751, + "step": 136 + }, + { + "epoch": 0.012411949899209532, + "grad_norm": 0.6123049855232239, + "learning_rate": 0.0001992025614692201, + "loss": 3.1555, + "step": 137 + }, + { + "epoch": 0.012502548073656316, + "grad_norm": 0.6062910556793213, + "learning_rate": 0.00019919652026822933, + "loss": 2.912, + "step": 138 + }, + { + "epoch": 0.0125931462481031, + "grad_norm": 0.5946769714355469, + "learning_rate": 0.00019919047906723857, + "loss": 2.8832, + "step": 139 + }, + { + "epoch": 0.012683744422549886, + "grad_norm": 0.5771217346191406, + "learning_rate": 0.00019918443786624783, + "loss": 2.5484, + "step": 140 + }, + { + "epoch": 0.01277434259699667, + "grad_norm": 0.5538449287414551, + "learning_rate": 0.00019917839666525706, + "loss": 2.7799, + "step": 141 + }, + { + "epoch": 0.012864940771443456, + "grad_norm": 0.7904812097549438, + "learning_rate": 0.00019917235546426632, + "loss": 2.6317, + "step": 142 + }, + { + "epoch": 0.01295553894589024, + "grad_norm": 0.5655574202537537, + "learning_rate": 0.00019916631426327556, + "loss": 2.7767, + "step": 143 + }, + { + "epoch": 0.013046137120337024, + "grad_norm": 0.5401138067245483, + "learning_rate": 0.0001991602730622848, + "loss": 2.7607, + "step": 144 + }, + { + "epoch": 0.01313673529478381, + "grad_norm": 0.5633625984191895, + "learning_rate": 0.00019915423186129402, + "loss": 2.8511, + "step": 145 + }, + { + "epoch": 0.013227333469230594, + "grad_norm": 0.7231683731079102, + "learning_rate": 0.00019914819066030329, + "loss": 3.2865, + "step": 146 + }, + { + "epoch": 0.01331793164367738, + "grad_norm": 0.7945908308029175, + "learning_rate": 0.00019914214945931252, + "loss": 2.7866, + "step": 147 + }, + { + "epoch": 0.013408529818124165, + "grad_norm": 0.5799762606620789, + "learning_rate": 0.00019913610825832175, + "loss": 2.9432, + "step": 148 + }, + { + "epoch": 0.01349912799257095, + "grad_norm": 0.6002523899078369, + "learning_rate": 0.00019913006705733101, + "loss": 2.8678, + "step": 149 + }, + { + "epoch": 0.013589726167017735, + "grad_norm": 0.5365378856658936, + "learning_rate": 0.00019912402585634025, + "loss": 2.737, + "step": 150 + }, + { + "epoch": 0.013680324341464519, + "grad_norm": 0.9506891965866089, + "learning_rate": 0.0001991179846553495, + "loss": 2.6481, + "step": 151 + }, + { + "epoch": 0.013770922515911305, + "grad_norm": 0.5951514840126038, + "learning_rate": 0.00019911194345435872, + "loss": 2.9318, + "step": 152 + }, + { + "epoch": 0.013861520690358089, + "grad_norm": 0.6373233199119568, + "learning_rate": 0.00019910590225336798, + "loss": 2.9268, + "step": 153 + }, + { + "epoch": 0.013952118864804875, + "grad_norm": 0.5818977355957031, + "learning_rate": 0.0001990998610523772, + "loss": 3.2324, + "step": 154 + }, + { + "epoch": 0.014042717039251659, + "grad_norm": 0.5520113110542297, + "learning_rate": 0.00019909381985138647, + "loss": 2.9268, + "step": 155 + }, + { + "epoch": 0.014133315213698445, + "grad_norm": 0.5670979619026184, + "learning_rate": 0.0001990877786503957, + "loss": 3.0432, + "step": 156 + }, + { + "epoch": 0.014223913388145229, + "grad_norm": 0.6086367964744568, + "learning_rate": 0.00019908173744940494, + "loss": 3.1779, + "step": 157 + }, + { + "epoch": 0.014314511562592013, + "grad_norm": 0.5234999060630798, + "learning_rate": 0.0001990756962484142, + "loss": 2.9771, + "step": 158 + }, + { + "epoch": 0.014405109737038799, + "grad_norm": 0.543624997138977, + "learning_rate": 0.00019906965504742344, + "loss": 2.8251, + "step": 159 + }, + { + "epoch": 0.014495707911485583, + "grad_norm": 0.5949727296829224, + "learning_rate": 0.0001990636138464327, + "loss": 2.975, + "step": 160 + }, + { + "epoch": 0.014586306085932369, + "grad_norm": 0.574351966381073, + "learning_rate": 0.0001990575726454419, + "loss": 3.1739, + "step": 161 + }, + { + "epoch": 0.014676904260379153, + "grad_norm": 0.6096818447113037, + "learning_rate": 0.00019905153144445117, + "loss": 2.9111, + "step": 162 + }, + { + "epoch": 0.014767502434825937, + "grad_norm": 0.6009861826896667, + "learning_rate": 0.00019904549024346043, + "loss": 2.9002, + "step": 163 + }, + { + "epoch": 0.014858100609272723, + "grad_norm": 0.5822124481201172, + "learning_rate": 0.00019903944904246966, + "loss": 2.9321, + "step": 164 + }, + { + "epoch": 0.014948698783719508, + "grad_norm": 0.533924400806427, + "learning_rate": 0.0001990334078414789, + "loss": 2.8924, + "step": 165 + }, + { + "epoch": 0.015039296958166293, + "grad_norm": 0.6317586302757263, + "learning_rate": 0.00019902736664048813, + "loss": 2.6528, + "step": 166 + }, + { + "epoch": 0.015129895132613078, + "grad_norm": 0.6309795379638672, + "learning_rate": 0.0001990213254394974, + "loss": 3.0228, + "step": 167 + }, + { + "epoch": 0.015220493307059864, + "grad_norm": 0.5771666169166565, + "learning_rate": 0.00019901528423850662, + "loss": 2.9413, + "step": 168 + }, + { + "epoch": 0.015311091481506648, + "grad_norm": 0.5660306215286255, + "learning_rate": 0.00019900924303751586, + "loss": 2.9689, + "step": 169 + }, + { + "epoch": 0.015401689655953432, + "grad_norm": 0.5587407350540161, + "learning_rate": 0.00019900320183652512, + "loss": 2.544, + "step": 170 + }, + { + "epoch": 0.015492287830400218, + "grad_norm": 0.5251724720001221, + "learning_rate": 0.00019899716063553435, + "loss": 2.7581, + "step": 171 + }, + { + "epoch": 0.015582886004847002, + "grad_norm": 0.6316602826118469, + "learning_rate": 0.00019899111943454361, + "loss": 2.8545, + "step": 172 + }, + { + "epoch": 0.015673484179293786, + "grad_norm": 0.6610509753227234, + "learning_rate": 0.00019898507823355285, + "loss": 2.8099, + "step": 173 + }, + { + "epoch": 0.015764082353740574, + "grad_norm": 2.171680450439453, + "learning_rate": 0.00019897903703256208, + "loss": 2.574, + "step": 174 + }, + { + "epoch": 0.015854680528187358, + "grad_norm": 0.5818649530410767, + "learning_rate": 0.00019897299583157132, + "loss": 2.89, + "step": 175 + }, + { + "epoch": 0.015945278702634142, + "grad_norm": 0.5864630341529846, + "learning_rate": 0.00019896695463058058, + "loss": 2.9443, + "step": 176 + }, + { + "epoch": 0.016035876877080926, + "grad_norm": 0.5892198085784912, + "learning_rate": 0.0001989609134295898, + "loss": 2.726, + "step": 177 + }, + { + "epoch": 0.01612647505152771, + "grad_norm": 0.8631972074508667, + "learning_rate": 0.00019895487222859905, + "loss": 2.8465, + "step": 178 + }, + { + "epoch": 0.016217073225974498, + "grad_norm": 0.611740231513977, + "learning_rate": 0.0001989488310276083, + "loss": 2.9615, + "step": 179 + }, + { + "epoch": 0.016307671400421282, + "grad_norm": 0.5878658294677734, + "learning_rate": 0.00019894278982661754, + "loss": 3.1804, + "step": 180 + }, + { + "epoch": 0.016398269574868066, + "grad_norm": 0.5828810930252075, + "learning_rate": 0.0001989367486256268, + "loss": 3.0843, + "step": 181 + }, + { + "epoch": 0.01648886774931485, + "grad_norm": 0.603821873664856, + "learning_rate": 0.000198930707424636, + "loss": 3.0694, + "step": 182 + }, + { + "epoch": 0.016579465923761635, + "grad_norm": 0.703693687915802, + "learning_rate": 0.00019892466622364527, + "loss": 2.8608, + "step": 183 + }, + { + "epoch": 0.016670064098208422, + "grad_norm": 0.5583374500274658, + "learning_rate": 0.0001989186250226545, + "loss": 3.028, + "step": 184 + }, + { + "epoch": 0.016760662272655207, + "grad_norm": 0.5035538077354431, + "learning_rate": 0.00019891258382166377, + "loss": 2.8812, + "step": 185 + }, + { + "epoch": 0.01685126044710199, + "grad_norm": 0.5564166903495789, + "learning_rate": 0.000198906542620673, + "loss": 3.2191, + "step": 186 + }, + { + "epoch": 0.016941858621548775, + "grad_norm": 0.505743145942688, + "learning_rate": 0.00019890050141968223, + "loss": 2.9645, + "step": 187 + }, + { + "epoch": 0.01703245679599556, + "grad_norm": 1.1751450300216675, + "learning_rate": 0.0001988944602186915, + "loss": 2.6328, + "step": 188 + }, + { + "epoch": 0.017123054970442347, + "grad_norm": 0.5974453091621399, + "learning_rate": 0.00019888841901770073, + "loss": 2.9685, + "step": 189 + }, + { + "epoch": 0.01721365314488913, + "grad_norm": 0.7737918496131897, + "learning_rate": 0.00019888237781670996, + "loss": 2.9707, + "step": 190 + }, + { + "epoch": 0.017304251319335915, + "grad_norm": 0.5711212754249573, + "learning_rate": 0.0001988763366157192, + "loss": 2.9706, + "step": 191 + }, + { + "epoch": 0.0173948494937827, + "grad_norm": 0.5811641216278076, + "learning_rate": 0.00019887029541472846, + "loss": 3.026, + "step": 192 + }, + { + "epoch": 0.017485447668229487, + "grad_norm": 1.368725299835205, + "learning_rate": 0.00019886425421373772, + "loss": 2.1896, + "step": 193 + }, + { + "epoch": 0.01757604584267627, + "grad_norm": 0.5172442197799683, + "learning_rate": 0.00019885821301274695, + "loss": 3.1111, + "step": 194 + }, + { + "epoch": 0.017666644017123055, + "grad_norm": 0.588866651058197, + "learning_rate": 0.0001988521718117562, + "loss": 2.9071, + "step": 195 + }, + { + "epoch": 0.01775724219156984, + "grad_norm": 0.5775222778320312, + "learning_rate": 0.00019884613061076542, + "loss": 3.1337, + "step": 196 + }, + { + "epoch": 0.017847840366016623, + "grad_norm": 0.610349178314209, + "learning_rate": 0.00019884008940977468, + "loss": 3.0372, + "step": 197 + }, + { + "epoch": 0.01793843854046341, + "grad_norm": 0.6514378786087036, + "learning_rate": 0.00019883404820878392, + "loss": 3.0047, + "step": 198 + }, + { + "epoch": 0.018029036714910195, + "grad_norm": 1.0208864212036133, + "learning_rate": 0.00019882800700779315, + "loss": 2.5097, + "step": 199 + }, + { + "epoch": 0.01811963488935698, + "grad_norm": 0.5262575149536133, + "learning_rate": 0.0001988219658068024, + "loss": 2.6704, + "step": 200 + }, + { + "epoch": 0.018210233063803764, + "grad_norm": 0.686953604221344, + "learning_rate": 0.00019881592460581165, + "loss": 2.954, + "step": 201 + }, + { + "epoch": 0.018300831238250548, + "grad_norm": 0.6062977910041809, + "learning_rate": 0.0001988098834048209, + "loss": 2.9568, + "step": 202 + }, + { + "epoch": 0.018391429412697335, + "grad_norm": 0.5835559368133545, + "learning_rate": 0.00019880384220383011, + "loss": 2.902, + "step": 203 + }, + { + "epoch": 0.01848202758714412, + "grad_norm": 0.5907248854637146, + "learning_rate": 0.00019879780100283938, + "loss": 2.9875, + "step": 204 + }, + { + "epoch": 0.018572625761590904, + "grad_norm": 0.5114157795906067, + "learning_rate": 0.0001987917598018486, + "loss": 2.7794, + "step": 205 + }, + { + "epoch": 0.018663223936037688, + "grad_norm": 0.6112310290336609, + "learning_rate": 0.00019878571860085787, + "loss": 3.0813, + "step": 206 + }, + { + "epoch": 0.018753822110484472, + "grad_norm": 1.5335782766342163, + "learning_rate": 0.0001987796773998671, + "loss": 2.2483, + "step": 207 + }, + { + "epoch": 0.01884442028493126, + "grad_norm": 0.5796396732330322, + "learning_rate": 0.00019877363619887634, + "loss": 2.938, + "step": 208 + }, + { + "epoch": 0.018935018459378044, + "grad_norm": 0.8027403950691223, + "learning_rate": 0.0001987675949978856, + "loss": 3.0854, + "step": 209 + }, + { + "epoch": 0.019025616633824828, + "grad_norm": 0.6970934271812439, + "learning_rate": 0.00019876155379689483, + "loss": 2.8724, + "step": 210 + }, + { + "epoch": 0.019116214808271612, + "grad_norm": 0.5825811624526978, + "learning_rate": 0.00019875551259590407, + "loss": 3.0438, + "step": 211 + }, + { + "epoch": 0.0192068129827184, + "grad_norm": 0.5468299984931946, + "learning_rate": 0.0001987494713949133, + "loss": 2.8607, + "step": 212 + }, + { + "epoch": 0.019297411157165184, + "grad_norm": 0.5744978785514832, + "learning_rate": 0.00019874343019392256, + "loss": 2.6879, + "step": 213 + }, + { + "epoch": 0.019388009331611968, + "grad_norm": 0.6291070580482483, + "learning_rate": 0.0001987373889929318, + "loss": 2.9937, + "step": 214 + }, + { + "epoch": 0.019478607506058752, + "grad_norm": 0.5601966381072998, + "learning_rate": 0.00019873134779194106, + "loss": 2.9556, + "step": 215 + }, + { + "epoch": 0.019569205680505537, + "grad_norm": 0.6879293322563171, + "learning_rate": 0.0001987253065909503, + "loss": 2.8058, + "step": 216 + }, + { + "epoch": 0.019659803854952324, + "grad_norm": 1.4367856979370117, + "learning_rate": 0.00019871926538995953, + "loss": 2.4689, + "step": 217 + }, + { + "epoch": 0.01975040202939911, + "grad_norm": 0.6096290349960327, + "learning_rate": 0.0001987132241889688, + "loss": 3.1247, + "step": 218 + }, + { + "epoch": 0.019841000203845893, + "grad_norm": 1.0879380702972412, + "learning_rate": 0.00019870718298797802, + "loss": 2.6619, + "step": 219 + }, + { + "epoch": 0.019931598378292677, + "grad_norm": 0.7959235906600952, + "learning_rate": 0.00019870114178698726, + "loss": 2.3105, + "step": 220 + }, + { + "epoch": 0.02002219655273946, + "grad_norm": 0.577045202255249, + "learning_rate": 0.0001986951005859965, + "loss": 2.8894, + "step": 221 + }, + { + "epoch": 0.02011279472718625, + "grad_norm": 0.7103781700134277, + "learning_rate": 0.00019868905938500575, + "loss": 3.0516, + "step": 222 + }, + { + "epoch": 0.020203392901633033, + "grad_norm": 0.6142095327377319, + "learning_rate": 0.000198683018184015, + "loss": 2.7485, + "step": 223 + }, + { + "epoch": 0.020293991076079817, + "grad_norm": 0.5749015212059021, + "learning_rate": 0.00019867697698302422, + "loss": 2.8043, + "step": 224 + }, + { + "epoch": 0.0203845892505266, + "grad_norm": 0.6267493963241577, + "learning_rate": 0.00019867093578203348, + "loss": 3.0525, + "step": 225 + }, + { + "epoch": 0.020475187424973385, + "grad_norm": 0.5802502036094666, + "learning_rate": 0.00019866489458104271, + "loss": 2.6768, + "step": 226 + }, + { + "epoch": 0.020565785599420173, + "grad_norm": 0.5425853729248047, + "learning_rate": 0.00019865885338005198, + "loss": 2.7483, + "step": 227 + }, + { + "epoch": 0.020656383773866957, + "grad_norm": 0.5742504000663757, + "learning_rate": 0.0001986528121790612, + "loss": 2.8972, + "step": 228 + }, + { + "epoch": 0.02074698194831374, + "grad_norm": 0.5865948796272278, + "learning_rate": 0.00019864677097807044, + "loss": 2.9132, + "step": 229 + }, + { + "epoch": 0.020837580122760525, + "grad_norm": 0.56313556432724, + "learning_rate": 0.0001986407297770797, + "loss": 3.0166, + "step": 230 + }, + { + "epoch": 0.020928178297207313, + "grad_norm": 0.5435063242912292, + "learning_rate": 0.00019863468857608894, + "loss": 2.9509, + "step": 231 + }, + { + "epoch": 0.021018776471654097, + "grad_norm": 0.5518949627876282, + "learning_rate": 0.0001986286473750982, + "loss": 2.8048, + "step": 232 + }, + { + "epoch": 0.02110937464610088, + "grad_norm": 0.5296286344528198, + "learning_rate": 0.0001986226061741074, + "loss": 2.5399, + "step": 233 + }, + { + "epoch": 0.021199972820547665, + "grad_norm": 0.5888904929161072, + "learning_rate": 0.00019861656497311667, + "loss": 2.9343, + "step": 234 + }, + { + "epoch": 0.02129057099499445, + "grad_norm": 3.573333501815796, + "learning_rate": 0.0001986105237721259, + "loss": 2.2582, + "step": 235 + }, + { + "epoch": 0.021381169169441237, + "grad_norm": 0.6426015496253967, + "learning_rate": 0.00019860448257113516, + "loss": 2.9717, + "step": 236 + }, + { + "epoch": 0.02147176734388802, + "grad_norm": 1.1919986009597778, + "learning_rate": 0.0001985984413701444, + "loss": 2.2631, + "step": 237 + }, + { + "epoch": 0.021562365518334806, + "grad_norm": 0.5354192852973938, + "learning_rate": 0.00019859240016915363, + "loss": 2.7441, + "step": 238 + }, + { + "epoch": 0.02165296369278159, + "grad_norm": 1.0508028268814087, + "learning_rate": 0.0001985863589681629, + "loss": 2.2147, + "step": 239 + }, + { + "epoch": 0.021743561867228374, + "grad_norm": 0.5973533987998962, + "learning_rate": 0.00019858031776717213, + "loss": 2.8204, + "step": 240 + }, + { + "epoch": 0.02183416004167516, + "grad_norm": 0.7447154521942139, + "learning_rate": 0.00019857427656618136, + "loss": 2.8454, + "step": 241 + }, + { + "epoch": 0.021924758216121946, + "grad_norm": 0.5640422701835632, + "learning_rate": 0.0001985682353651906, + "loss": 2.9165, + "step": 242 + }, + { + "epoch": 0.02201535639056873, + "grad_norm": 0.5891057252883911, + "learning_rate": 0.00019856219416419986, + "loss": 2.5148, + "step": 243 + }, + { + "epoch": 0.022105954565015514, + "grad_norm": 0.5524282455444336, + "learning_rate": 0.0001985561529632091, + "loss": 2.8784, + "step": 244 + }, + { + "epoch": 0.0221965527394623, + "grad_norm": 0.5633473992347717, + "learning_rate": 0.00019855011176221835, + "loss": 2.7582, + "step": 245 + }, + { + "epoch": 0.022287150913909086, + "grad_norm": 0.5757750272750854, + "learning_rate": 0.00019854407056122758, + "loss": 2.8946, + "step": 246 + }, + { + "epoch": 0.02237774908835587, + "grad_norm": 0.6586316823959351, + "learning_rate": 0.00019853802936023682, + "loss": 3.2528, + "step": 247 + }, + { + "epoch": 0.022468347262802654, + "grad_norm": 0.6226139664649963, + "learning_rate": 0.00019853198815924608, + "loss": 3.1611, + "step": 248 + }, + { + "epoch": 0.02255894543724944, + "grad_norm": 0.5637702941894531, + "learning_rate": 0.00019852594695825531, + "loss": 2.9776, + "step": 249 + }, + { + "epoch": 0.022649543611696223, + "grad_norm": 0.5927480459213257, + "learning_rate": 0.00019851990575726455, + "loss": 2.6213, + "step": 250 + }, + { + "epoch": 0.02274014178614301, + "grad_norm": 1.9146262407302856, + "learning_rate": 0.00019851386455627378, + "loss": 2.6934, + "step": 251 + }, + { + "epoch": 0.022830739960589794, + "grad_norm": 2.382108688354492, + "learning_rate": 0.00019850782335528304, + "loss": 2.4224, + "step": 252 + }, + { + "epoch": 0.02292133813503658, + "grad_norm": 0.5924305319786072, + "learning_rate": 0.0001985017821542923, + "loss": 2.7687, + "step": 253 + }, + { + "epoch": 0.023011936309483363, + "grad_norm": 0.551910400390625, + "learning_rate": 0.0001984957409533015, + "loss": 2.749, + "step": 254 + }, + { + "epoch": 0.02310253448393015, + "grad_norm": 0.6015335917472839, + "learning_rate": 0.00019848969975231077, + "loss": 2.8739, + "step": 255 + }, + { + "epoch": 0.023193132658376935, + "grad_norm": 0.6015594601631165, + "learning_rate": 0.00019848365855132, + "loss": 3.0242, + "step": 256 + }, + { + "epoch": 0.02328373083282372, + "grad_norm": 0.5846293568611145, + "learning_rate": 0.00019847761735032927, + "loss": 2.8974, + "step": 257 + }, + { + "epoch": 0.023374329007270503, + "grad_norm": 1.5237374305725098, + "learning_rate": 0.0001984715761493385, + "loss": 2.3218, + "step": 258 + }, + { + "epoch": 0.023464927181717287, + "grad_norm": 0.5673334002494812, + "learning_rate": 0.00019846553494834774, + "loss": 2.9879, + "step": 259 + }, + { + "epoch": 0.023555525356164075, + "grad_norm": 0.5820329189300537, + "learning_rate": 0.000198459493747357, + "loss": 2.9296, + "step": 260 + }, + { + "epoch": 0.02364612353061086, + "grad_norm": 1.114565134048462, + "learning_rate": 0.00019845345254636623, + "loss": 2.4354, + "step": 261 + }, + { + "epoch": 0.023736721705057643, + "grad_norm": 0.5502650141716003, + "learning_rate": 0.00019844741134537547, + "loss": 2.8911, + "step": 262 + }, + { + "epoch": 0.023827319879504427, + "grad_norm": 0.6676936745643616, + "learning_rate": 0.0001984413701443847, + "loss": 3.2288, + "step": 263 + }, + { + "epoch": 0.02391791805395121, + "grad_norm": 0.6613924503326416, + "learning_rate": 0.00019843532894339396, + "loss": 2.8763, + "step": 264 + }, + { + "epoch": 0.024008516228398, + "grad_norm": 0.6020938754081726, + "learning_rate": 0.0001984292877424032, + "loss": 2.6298, + "step": 265 + }, + { + "epoch": 0.024099114402844783, + "grad_norm": 0.6066319346427917, + "learning_rate": 0.00019842324654141246, + "loss": 2.6562, + "step": 266 + }, + { + "epoch": 0.024189712577291567, + "grad_norm": 1.0243449211120605, + "learning_rate": 0.00019841720534042166, + "loss": 2.3405, + "step": 267 + }, + { + "epoch": 0.02428031075173835, + "grad_norm": 1.7800813913345337, + "learning_rate": 0.00019841116413943092, + "loss": 3.1385, + "step": 268 + }, + { + "epoch": 0.024370908926185136, + "grad_norm": 1.9688515663146973, + "learning_rate": 0.00019840512293844018, + "loss": 2.978, + "step": 269 + }, + { + "epoch": 0.024461507100631923, + "grad_norm": 0.6281402111053467, + "learning_rate": 0.00019839908173744942, + "loss": 2.9633, + "step": 270 + }, + { + "epoch": 0.024552105275078707, + "grad_norm": 1.1632791757583618, + "learning_rate": 0.00019839304053645865, + "loss": 2.4421, + "step": 271 + }, + { + "epoch": 0.02464270344952549, + "grad_norm": 0.6404205560684204, + "learning_rate": 0.0001983869993354679, + "loss": 3.0752, + "step": 272 + }, + { + "epoch": 0.024733301623972276, + "grad_norm": 0.5493985414505005, + "learning_rate": 0.00019838095813447715, + "loss": 2.5832, + "step": 273 + }, + { + "epoch": 0.024823899798419063, + "grad_norm": 0.5534141063690186, + "learning_rate": 0.00019837491693348638, + "loss": 3.0911, + "step": 274 + }, + { + "epoch": 0.024914497972865848, + "grad_norm": 0.6242668628692627, + "learning_rate": 0.00019836887573249562, + "loss": 3.0809, + "step": 275 + }, + { + "epoch": 0.025005096147312632, + "grad_norm": 1.1323773860931396, + "learning_rate": 0.00019836283453150488, + "loss": 2.9214, + "step": 276 + }, + { + "epoch": 0.025095694321759416, + "grad_norm": 0.6035593748092651, + "learning_rate": 0.0001983567933305141, + "loss": 2.7562, + "step": 277 + }, + { + "epoch": 0.0251862924962062, + "grad_norm": 0.6210799217224121, + "learning_rate": 0.00019835075212952337, + "loss": 2.9056, + "step": 278 + }, + { + "epoch": 0.025276890670652988, + "grad_norm": 0.5759990215301514, + "learning_rate": 0.0001983447109285326, + "loss": 2.8055, + "step": 279 + }, + { + "epoch": 0.025367488845099772, + "grad_norm": 0.5651792883872986, + "learning_rate": 0.00019833866972754184, + "loss": 2.8902, + "step": 280 + }, + { + "epoch": 0.025458087019546556, + "grad_norm": 1.097611427307129, + "learning_rate": 0.00019833262852655107, + "loss": 2.2916, + "step": 281 + }, + { + "epoch": 0.02554868519399334, + "grad_norm": 0.6029875874519348, + "learning_rate": 0.00019832658732556034, + "loss": 3.1944, + "step": 282 + }, + { + "epoch": 0.025639283368440124, + "grad_norm": 0.87055903673172, + "learning_rate": 0.0001983205461245696, + "loss": 2.3002, + "step": 283 + }, + { + "epoch": 0.025729881542886912, + "grad_norm": 0.5558617115020752, + "learning_rate": 0.0001983145049235788, + "loss": 2.8694, + "step": 284 + }, + { + "epoch": 0.025820479717333696, + "grad_norm": 0.6017692685127258, + "learning_rate": 0.00019830846372258807, + "loss": 2.8778, + "step": 285 + }, + { + "epoch": 0.02591107789178048, + "grad_norm": 0.6405017971992493, + "learning_rate": 0.0001983024225215973, + "loss": 2.9887, + "step": 286 + }, + { + "epoch": 0.026001676066227265, + "grad_norm": 0.5733552575111389, + "learning_rate": 0.00019829638132060656, + "loss": 2.9946, + "step": 287 + }, + { + "epoch": 0.02609227424067405, + "grad_norm": 0.5978521108627319, + "learning_rate": 0.00019829034011961577, + "loss": 3.0803, + "step": 288 + }, + { + "epoch": 0.026182872415120836, + "grad_norm": 0.5500574707984924, + "learning_rate": 0.00019828429891862503, + "loss": 2.9547, + "step": 289 + }, + { + "epoch": 0.02627347058956762, + "grad_norm": 0.5971173048019409, + "learning_rate": 0.0001982782577176343, + "loss": 2.8549, + "step": 290 + }, + { + "epoch": 0.026364068764014405, + "grad_norm": 0.6822018027305603, + "learning_rate": 0.00019827221651664352, + "loss": 2.6464, + "step": 291 + }, + { + "epoch": 0.02645466693846119, + "grad_norm": 0.5820775628089905, + "learning_rate": 0.00019826617531565276, + "loss": 2.8752, + "step": 292 + }, + { + "epoch": 0.026545265112907977, + "grad_norm": 0.6240231990814209, + "learning_rate": 0.000198260134114662, + "loss": 2.8898, + "step": 293 + }, + { + "epoch": 0.02663586328735476, + "grad_norm": 0.5456892848014832, + "learning_rate": 0.00019825409291367125, + "loss": 2.635, + "step": 294 + }, + { + "epoch": 0.026726461461801545, + "grad_norm": 0.5686516165733337, + "learning_rate": 0.0001982480517126805, + "loss": 3.053, + "step": 295 + }, + { + "epoch": 0.02681705963624833, + "grad_norm": 0.5976006388664246, + "learning_rate": 0.00019824201051168975, + "loss": 3.1814, + "step": 296 + }, + { + "epoch": 0.026907657810695113, + "grad_norm": 0.5703667402267456, + "learning_rate": 0.00019823596931069896, + "loss": 2.8675, + "step": 297 + }, + { + "epoch": 0.0269982559851419, + "grad_norm": 1.2945358753204346, + "learning_rate": 0.00019822992810970822, + "loss": 2.2708, + "step": 298 + }, + { + "epoch": 0.027088854159588685, + "grad_norm": 0.5459473729133606, + "learning_rate": 0.00019822388690871748, + "loss": 2.7045, + "step": 299 + }, + { + "epoch": 0.02717945233403547, + "grad_norm": 0.6488409042358398, + "learning_rate": 0.0001982178457077267, + "loss": 2.0524, + "step": 300 + }, + { + "epoch": 0.027270050508482253, + "grad_norm": 0.5617200136184692, + "learning_rate": 0.00019821180450673595, + "loss": 2.8037, + "step": 301 + }, + { + "epoch": 0.027360648682929038, + "grad_norm": 0.5922620892524719, + "learning_rate": 0.00019820576330574518, + "loss": 2.9484, + "step": 302 + }, + { + "epoch": 0.027451246857375825, + "grad_norm": 0.5944216847419739, + "learning_rate": 0.00019819972210475444, + "loss": 2.958, + "step": 303 + }, + { + "epoch": 0.02754184503182261, + "grad_norm": 0.5633590817451477, + "learning_rate": 0.00019819368090376367, + "loss": 2.8967, + "step": 304 + }, + { + "epoch": 0.027632443206269394, + "grad_norm": 0.6911365389823914, + "learning_rate": 0.0001981876397027729, + "loss": 2.8042, + "step": 305 + }, + { + "epoch": 0.027723041380716178, + "grad_norm": 0.5873891711235046, + "learning_rate": 0.00019818159850178217, + "loss": 3.0862, + "step": 306 + }, + { + "epoch": 0.027813639555162962, + "grad_norm": 0.6066146492958069, + "learning_rate": 0.0001981755573007914, + "loss": 3.0712, + "step": 307 + }, + { + "epoch": 0.02790423772960975, + "grad_norm": 0.6365353465080261, + "learning_rate": 0.00019816951609980067, + "loss": 2.7941, + "step": 308 + }, + { + "epoch": 0.027994835904056534, + "grad_norm": 0.5868875980377197, + "learning_rate": 0.0001981634748988099, + "loss": 2.9413, + "step": 309 + }, + { + "epoch": 0.028085434078503318, + "grad_norm": 0.6682957410812378, + "learning_rate": 0.00019815743369781913, + "loss": 2.8797, + "step": 310 + }, + { + "epoch": 0.028176032252950102, + "grad_norm": 0.6267266273498535, + "learning_rate": 0.00019815139249682837, + "loss": 3.0698, + "step": 311 + }, + { + "epoch": 0.02826663042739689, + "grad_norm": 0.6602296233177185, + "learning_rate": 0.00019814535129583763, + "loss": 2.6708, + "step": 312 + }, + { + "epoch": 0.028357228601843674, + "grad_norm": 0.5856702327728271, + "learning_rate": 0.00019813931009484686, + "loss": 2.8211, + "step": 313 + }, + { + "epoch": 0.028447826776290458, + "grad_norm": 0.579447329044342, + "learning_rate": 0.0001981332688938561, + "loss": 3.0413, + "step": 314 + }, + { + "epoch": 0.028538424950737242, + "grad_norm": 0.5926731824874878, + "learning_rate": 0.00019812722769286536, + "loss": 3.0478, + "step": 315 + }, + { + "epoch": 0.028629023125184026, + "grad_norm": 1.8981019258499146, + "learning_rate": 0.0001981211864918746, + "loss": 2.3345, + "step": 316 + }, + { + "epoch": 0.028719621299630814, + "grad_norm": 1.206287145614624, + "learning_rate": 0.00019811514529088385, + "loss": 2.311, + "step": 317 + }, + { + "epoch": 0.028810219474077598, + "grad_norm": 0.5678691267967224, + "learning_rate": 0.00019810910408989306, + "loss": 2.7596, + "step": 318 + }, + { + "epoch": 0.028900817648524382, + "grad_norm": 0.8940849304199219, + "learning_rate": 0.00019810306288890232, + "loss": 2.3754, + "step": 319 + }, + { + "epoch": 0.028991415822971166, + "grad_norm": 0.5867103338241577, + "learning_rate": 0.00019809702168791158, + "loss": 3.0275, + "step": 320 + }, + { + "epoch": 0.02908201399741795, + "grad_norm": 0.5729684829711914, + "learning_rate": 0.00019809098048692082, + "loss": 2.8778, + "step": 321 + }, + { + "epoch": 0.029172612171864738, + "grad_norm": 0.5760530233383179, + "learning_rate": 0.00019808493928593005, + "loss": 2.834, + "step": 322 + }, + { + "epoch": 0.029263210346311522, + "grad_norm": 0.7054018974304199, + "learning_rate": 0.00019807889808493928, + "loss": 2.1917, + "step": 323 + }, + { + "epoch": 0.029353808520758307, + "grad_norm": 0.6232660412788391, + "learning_rate": 0.00019807285688394855, + "loss": 2.9062, + "step": 324 + }, + { + "epoch": 0.02944440669520509, + "grad_norm": 0.6074373722076416, + "learning_rate": 0.00019806681568295778, + "loss": 3.0787, + "step": 325 + }, + { + "epoch": 0.029535004869651875, + "grad_norm": 1.1413437128067017, + "learning_rate": 0.00019806077448196701, + "loss": 2.3856, + "step": 326 + }, + { + "epoch": 0.029625603044098663, + "grad_norm": 0.5929679870605469, + "learning_rate": 0.00019805473328097625, + "loss": 2.9031, + "step": 327 + }, + { + "epoch": 0.029716201218545447, + "grad_norm": 0.6306084394454956, + "learning_rate": 0.0001980486920799855, + "loss": 2.9277, + "step": 328 + }, + { + "epoch": 0.02980679939299223, + "grad_norm": 0.6765310764312744, + "learning_rate": 0.00019804265087899477, + "loss": 2.9772, + "step": 329 + }, + { + "epoch": 0.029897397567439015, + "grad_norm": 1.0381203889846802, + "learning_rate": 0.000198036609678004, + "loss": 2.567, + "step": 330 + }, + { + "epoch": 0.029987995741885803, + "grad_norm": 0.6065115332603455, + "learning_rate": 0.00019803056847701324, + "loss": 2.6574, + "step": 331 + }, + { + "epoch": 0.030078593916332587, + "grad_norm": 0.6100816130638123, + "learning_rate": 0.00019802452727602247, + "loss": 2.8741, + "step": 332 + }, + { + "epoch": 0.03016919209077937, + "grad_norm": 0.5973966121673584, + "learning_rate": 0.00019801848607503173, + "loss": 2.8516, + "step": 333 + }, + { + "epoch": 0.030259790265226155, + "grad_norm": 0.6704645156860352, + "learning_rate": 0.00019801244487404097, + "loss": 2.9074, + "step": 334 + }, + { + "epoch": 0.03035038843967294, + "grad_norm": 0.74257892370224, + "learning_rate": 0.0001980064036730502, + "loss": 2.9131, + "step": 335 + }, + { + "epoch": 0.030440986614119727, + "grad_norm": 0.6257579326629639, + "learning_rate": 0.00019800036247205946, + "loss": 2.8697, + "step": 336 + }, + { + "epoch": 0.03053158478856651, + "grad_norm": 0.6590521931648254, + "learning_rate": 0.0001979943212710687, + "loss": 2.8413, + "step": 337 + }, + { + "epoch": 0.030622182963013295, + "grad_norm": 0.6020402312278748, + "learning_rate": 0.00019798828007007796, + "loss": 2.9141, + "step": 338 + }, + { + "epoch": 0.03071278113746008, + "grad_norm": 0.6040177941322327, + "learning_rate": 0.00019798223886908716, + "loss": 2.9667, + "step": 339 + }, + { + "epoch": 0.030803379311906864, + "grad_norm": 0.5973387956619263, + "learning_rate": 0.00019797619766809643, + "loss": 2.8466, + "step": 340 + }, + { + "epoch": 0.03089397748635365, + "grad_norm": 0.6222031712532043, + "learning_rate": 0.00019797015646710566, + "loss": 2.8509, + "step": 341 + }, + { + "epoch": 0.030984575660800436, + "grad_norm": 0.6122547388076782, + "learning_rate": 0.00019796411526611492, + "loss": 3.0214, + "step": 342 + }, + { + "epoch": 0.03107517383524722, + "grad_norm": 1.8787816762924194, + "learning_rate": 0.00019795807406512416, + "loss": 2.7388, + "step": 343 + }, + { + "epoch": 0.031165772009694004, + "grad_norm": 0.6349554657936096, + "learning_rate": 0.0001979520328641334, + "loss": 3.108, + "step": 344 + }, + { + "epoch": 0.03125637018414079, + "grad_norm": 0.6022644639015198, + "learning_rate": 0.00019794599166314265, + "loss": 2.8021, + "step": 345 + }, + { + "epoch": 0.03134696835858757, + "grad_norm": 2.0471301078796387, + "learning_rate": 0.00019793995046215188, + "loss": 2.9446, + "step": 346 + }, + { + "epoch": 0.03143756653303436, + "grad_norm": 0.5993468165397644, + "learning_rate": 0.00019793390926116115, + "loss": 2.9262, + "step": 347 + }, + { + "epoch": 0.03152816470748115, + "grad_norm": 0.5827695727348328, + "learning_rate": 0.00019792786806017035, + "loss": 2.966, + "step": 348 + }, + { + "epoch": 0.03161876288192793, + "grad_norm": 0.5896158218383789, + "learning_rate": 0.00019792182685917961, + "loss": 3.0097, + "step": 349 + }, + { + "epoch": 0.031709361056374716, + "grad_norm": 0.5854161977767944, + "learning_rate": 0.00019791578565818888, + "loss": 3.0092, + "step": 350 + }, + { + "epoch": 0.031799959230821497, + "grad_norm": 0.568681538105011, + "learning_rate": 0.0001979097444571981, + "loss": 2.9469, + "step": 351 + }, + { + "epoch": 0.031890557405268284, + "grad_norm": 0.6497570872306824, + "learning_rate": 0.00019790370325620734, + "loss": 2.8839, + "step": 352 + }, + { + "epoch": 0.03198115557971507, + "grad_norm": 0.5902096629142761, + "learning_rate": 0.00019789766205521658, + "loss": 2.7044, + "step": 353 + }, + { + "epoch": 0.03207175375416185, + "grad_norm": 0.6248335838317871, + "learning_rate": 0.00019789162085422584, + "loss": 2.744, + "step": 354 + }, + { + "epoch": 0.03216235192860864, + "grad_norm": 0.6208926439285278, + "learning_rate": 0.00019788557965323507, + "loss": 2.9112, + "step": 355 + }, + { + "epoch": 0.03225295010305542, + "grad_norm": 0.6436765789985657, + "learning_rate": 0.0001978795384522443, + "loss": 2.952, + "step": 356 + }, + { + "epoch": 0.03234354827750221, + "grad_norm": 0.6082020998001099, + "learning_rate": 0.00019787349725125354, + "loss": 2.8313, + "step": 357 + }, + { + "epoch": 0.032434146451948996, + "grad_norm": 1.021414875984192, + "learning_rate": 0.0001978674560502628, + "loss": 2.4678, + "step": 358 + }, + { + "epoch": 0.03252474462639578, + "grad_norm": 0.6159375905990601, + "learning_rate": 0.00019786141484927206, + "loss": 2.951, + "step": 359 + }, + { + "epoch": 0.032615342800842564, + "grad_norm": 0.5819122195243835, + "learning_rate": 0.0001978553736482813, + "loss": 2.8885, + "step": 360 + }, + { + "epoch": 0.032705940975289345, + "grad_norm": 0.6379526257514954, + "learning_rate": 0.00019784933244729053, + "loss": 2.9654, + "step": 361 + }, + { + "epoch": 0.03279653914973613, + "grad_norm": 0.8152068257331848, + "learning_rate": 0.00019784329124629976, + "loss": 2.932, + "step": 362 + }, + { + "epoch": 0.03288713732418292, + "grad_norm": 0.6006295680999756, + "learning_rate": 0.00019783725004530903, + "loss": 2.948, + "step": 363 + }, + { + "epoch": 0.0329777354986297, + "grad_norm": 0.6008549332618713, + "learning_rate": 0.00019783120884431826, + "loss": 2.7857, + "step": 364 + }, + { + "epoch": 0.03306833367307649, + "grad_norm": 0.5681292414665222, + "learning_rate": 0.0001978251676433275, + "loss": 2.8763, + "step": 365 + }, + { + "epoch": 0.03315893184752327, + "grad_norm": 0.583747923374176, + "learning_rate": 0.00019781912644233676, + "loss": 2.9315, + "step": 366 + }, + { + "epoch": 0.03324953002197006, + "grad_norm": 0.6070590615272522, + "learning_rate": 0.000197813085241346, + "loss": 2.8931, + "step": 367 + }, + { + "epoch": 0.033340128196416845, + "grad_norm": 0.6398596167564392, + "learning_rate": 0.00019780704404035525, + "loss": 3.0585, + "step": 368 + }, + { + "epoch": 0.033430726370863625, + "grad_norm": 0.5716537237167358, + "learning_rate": 0.00019780100283936446, + "loss": 3.0111, + "step": 369 + }, + { + "epoch": 0.03352132454531041, + "grad_norm": 0.6128444075584412, + "learning_rate": 0.00019779496163837372, + "loss": 2.9616, + "step": 370 + }, + { + "epoch": 0.033611922719757194, + "grad_norm": 0.6246538162231445, + "learning_rate": 0.00019778892043738295, + "loss": 2.9881, + "step": 371 + }, + { + "epoch": 0.03370252089420398, + "grad_norm": 0.976046621799469, + "learning_rate": 0.00019778287923639221, + "loss": 2.6589, + "step": 372 + }, + { + "epoch": 0.03379311906865077, + "grad_norm": 0.5666386485099792, + "learning_rate": 0.00019777683803540145, + "loss": 3.025, + "step": 373 + }, + { + "epoch": 0.03388371724309755, + "grad_norm": 0.5846790075302124, + "learning_rate": 0.00019777079683441068, + "loss": 2.7826, + "step": 374 + }, + { + "epoch": 0.03397431541754434, + "grad_norm": 0.5933738350868225, + "learning_rate": 0.00019776475563341994, + "loss": 2.7992, + "step": 375 + }, + { + "epoch": 0.03406491359199112, + "grad_norm": 0.6034766435623169, + "learning_rate": 0.00019775871443242918, + "loss": 2.6744, + "step": 376 + }, + { + "epoch": 0.034155511766437906, + "grad_norm": 0.6233522891998291, + "learning_rate": 0.0001977526732314384, + "loss": 2.8682, + "step": 377 + }, + { + "epoch": 0.03424610994088469, + "grad_norm": 0.5797932744026184, + "learning_rate": 0.00019774663203044765, + "loss": 2.9856, + "step": 378 + }, + { + "epoch": 0.034336708115331474, + "grad_norm": 0.6306365728378296, + "learning_rate": 0.0001977405908294569, + "loss": 2.9097, + "step": 379 + }, + { + "epoch": 0.03442730628977826, + "grad_norm": 0.5810677409172058, + "learning_rate": 0.00019773454962846617, + "loss": 2.4807, + "step": 380 + }, + { + "epoch": 0.03451790446422505, + "grad_norm": 0.5914978384971619, + "learning_rate": 0.0001977285084274754, + "loss": 3.0773, + "step": 381 + }, + { + "epoch": 0.03460850263867183, + "grad_norm": 0.9140338897705078, + "learning_rate": 0.00019772246722648464, + "loss": 2.2873, + "step": 382 + }, + { + "epoch": 0.03469910081311862, + "grad_norm": 0.6451712250709534, + "learning_rate": 0.00019771642602549387, + "loss": 3.1696, + "step": 383 + }, + { + "epoch": 0.0347896989875654, + "grad_norm": 0.6249769330024719, + "learning_rate": 0.00019771038482450313, + "loss": 3.0236, + "step": 384 + }, + { + "epoch": 0.034880297162012186, + "grad_norm": 0.5646182894706726, + "learning_rate": 0.00019770434362351237, + "loss": 2.7982, + "step": 385 + }, + { + "epoch": 0.034970895336458974, + "grad_norm": 0.5649774670600891, + "learning_rate": 0.0001976983024225216, + "loss": 2.7527, + "step": 386 + }, + { + "epoch": 0.035061493510905754, + "grad_norm": 0.6000802516937256, + "learning_rate": 0.00019769226122153083, + "loss": 2.9396, + "step": 387 + }, + { + "epoch": 0.03515209168535254, + "grad_norm": 0.5909450650215149, + "learning_rate": 0.0001976862200205401, + "loss": 2.0397, + "step": 388 + }, + { + "epoch": 0.03524268985979932, + "grad_norm": 0.6275204420089722, + "learning_rate": 0.00019768017881954936, + "loss": 2.8011, + "step": 389 + }, + { + "epoch": 0.03533328803424611, + "grad_norm": 0.6057722568511963, + "learning_rate": 0.00019767413761855856, + "loss": 2.9493, + "step": 390 + }, + { + "epoch": 0.0354238862086929, + "grad_norm": 0.5909808874130249, + "learning_rate": 0.00019766809641756782, + "loss": 2.9203, + "step": 391 + }, + { + "epoch": 0.03551448438313968, + "grad_norm": 0.5977863073348999, + "learning_rate": 0.00019766205521657706, + "loss": 3.1183, + "step": 392 + }, + { + "epoch": 0.035605082557586466, + "grad_norm": 0.5978794693946838, + "learning_rate": 0.00019765601401558632, + "loss": 3.0629, + "step": 393 + }, + { + "epoch": 0.03569568073203325, + "grad_norm": 1.0908485651016235, + "learning_rate": 0.00019764997281459555, + "loss": 2.5672, + "step": 394 + }, + { + "epoch": 0.035786278906480035, + "grad_norm": 0.5692805647850037, + "learning_rate": 0.0001976439316136048, + "loss": 2.8807, + "step": 395 + }, + { + "epoch": 0.03587687708092682, + "grad_norm": 0.5865841507911682, + "learning_rate": 0.00019763789041261405, + "loss": 2.8005, + "step": 396 + }, + { + "epoch": 0.0359674752553736, + "grad_norm": 0.6093242168426514, + "learning_rate": 0.00019763184921162328, + "loss": 2.0591, + "step": 397 + }, + { + "epoch": 0.03605807342982039, + "grad_norm": 0.6634935736656189, + "learning_rate": 0.00019762580801063252, + "loss": 3.0162, + "step": 398 + }, + { + "epoch": 0.03614867160426717, + "grad_norm": 0.6214368343353271, + "learning_rate": 0.00019761976680964175, + "loss": 2.8948, + "step": 399 + }, + { + "epoch": 0.03623926977871396, + "grad_norm": 0.5586468577384949, + "learning_rate": 0.000197613725608651, + "loss": 2.8565, + "step": 400 + }, + { + "epoch": 0.03632986795316075, + "grad_norm": 0.6205968856811523, + "learning_rate": 0.00019760768440766025, + "loss": 2.9985, + "step": 401 + }, + { + "epoch": 0.03642046612760753, + "grad_norm": 0.6129301190376282, + "learning_rate": 0.0001976016432066695, + "loss": 2.9748, + "step": 402 + }, + { + "epoch": 0.036511064302054315, + "grad_norm": 0.6438882350921631, + "learning_rate": 0.00019759560200567874, + "loss": 2.0535, + "step": 403 + }, + { + "epoch": 0.036601662476501096, + "grad_norm": 0.584683895111084, + "learning_rate": 0.00019758956080468797, + "loss": 2.9783, + "step": 404 + }, + { + "epoch": 0.03669226065094788, + "grad_norm": 0.5873886942863464, + "learning_rate": 0.00019758351960369724, + "loss": 2.8089, + "step": 405 + }, + { + "epoch": 0.03678285882539467, + "grad_norm": 0.507704496383667, + "learning_rate": 0.00019757747840270647, + "loss": 2.068, + "step": 406 + }, + { + "epoch": 0.03687345699984145, + "grad_norm": 0.5493074059486389, + "learning_rate": 0.0001975714372017157, + "loss": 2.6085, + "step": 407 + }, + { + "epoch": 0.03696405517428824, + "grad_norm": 0.6046673059463501, + "learning_rate": 0.00019756539600072494, + "loss": 2.8811, + "step": 408 + }, + { + "epoch": 0.03705465334873502, + "grad_norm": 0.6056495308876038, + "learning_rate": 0.0001975593547997342, + "loss": 3.1095, + "step": 409 + }, + { + "epoch": 0.03714525152318181, + "grad_norm": 0.6570762991905212, + "learning_rate": 0.00019755331359874346, + "loss": 3.0457, + "step": 410 + }, + { + "epoch": 0.037235849697628595, + "grad_norm": 0.6229265332221985, + "learning_rate": 0.00019754727239775267, + "loss": 2.9025, + "step": 411 + }, + { + "epoch": 0.037326447872075376, + "grad_norm": 0.7143917083740234, + "learning_rate": 0.00019754123119676193, + "loss": 2.8444, + "step": 412 + }, + { + "epoch": 0.037417046046522164, + "grad_norm": 0.5648350715637207, + "learning_rate": 0.00019753518999577116, + "loss": 2.8469, + "step": 413 + }, + { + "epoch": 0.037507644220968944, + "grad_norm": 0.5790063738822937, + "learning_rate": 0.00019752914879478042, + "loss": 3.1697, + "step": 414 + }, + { + "epoch": 0.03759824239541573, + "grad_norm": 0.576087474822998, + "learning_rate": 0.00019752310759378966, + "loss": 2.7088, + "step": 415 + }, + { + "epoch": 0.03768884056986252, + "grad_norm": 0.8382322192192078, + "learning_rate": 0.0001975170663927989, + "loss": 2.1755, + "step": 416 + }, + { + "epoch": 0.0377794387443093, + "grad_norm": 0.6118147969245911, + "learning_rate": 0.00019751102519180813, + "loss": 2.8584, + "step": 417 + }, + { + "epoch": 0.03787003691875609, + "grad_norm": 0.5963465571403503, + "learning_rate": 0.0001975049839908174, + "loss": 2.9859, + "step": 418 + }, + { + "epoch": 0.03796063509320287, + "grad_norm": 0.5949907302856445, + "learning_rate": 0.00019749894278982665, + "loss": 2.7651, + "step": 419 + }, + { + "epoch": 0.038051233267649656, + "grad_norm": 0.5947472453117371, + "learning_rate": 0.00019749290158883586, + "loss": 3.2281, + "step": 420 + }, + { + "epoch": 0.038141831442096444, + "grad_norm": 0.6592260599136353, + "learning_rate": 0.00019748686038784512, + "loss": 2.9226, + "step": 421 + }, + { + "epoch": 0.038232429616543225, + "grad_norm": 0.6384617686271667, + "learning_rate": 0.00019748081918685435, + "loss": 3.0871, + "step": 422 + }, + { + "epoch": 0.03832302779099001, + "grad_norm": 0.5702937841415405, + "learning_rate": 0.0001974747779858636, + "loss": 2.76, + "step": 423 + }, + { + "epoch": 0.0384136259654368, + "grad_norm": 0.6137324571609497, + "learning_rate": 0.00019746873678487285, + "loss": 2.8364, + "step": 424 + }, + { + "epoch": 0.03850422413988358, + "grad_norm": 0.6068158745765686, + "learning_rate": 0.00019746269558388208, + "loss": 2.8084, + "step": 425 + }, + { + "epoch": 0.03859482231433037, + "grad_norm": 0.6323428153991699, + "learning_rate": 0.00019745665438289134, + "loss": 2.8719, + "step": 426 + }, + { + "epoch": 0.03868542048877715, + "grad_norm": 0.6038642525672913, + "learning_rate": 0.00019745061318190057, + "loss": 2.7988, + "step": 427 + }, + { + "epoch": 0.038776018663223936, + "grad_norm": 1.1401736736297607, + "learning_rate": 0.0001974445719809098, + "loss": 2.8843, + "step": 428 + }, + { + "epoch": 0.038866616837670724, + "grad_norm": 0.6236101388931274, + "learning_rate": 0.00019743853077991904, + "loss": 2.8479, + "step": 429 + }, + { + "epoch": 0.038957215012117505, + "grad_norm": 0.628302812576294, + "learning_rate": 0.0001974324895789283, + "loss": 3.043, + "step": 430 + }, + { + "epoch": 0.03904781318656429, + "grad_norm": 0.6242570877075195, + "learning_rate": 0.00019742644837793754, + "loss": 3.024, + "step": 431 + }, + { + "epoch": 0.03913841136101107, + "grad_norm": 0.6163428425788879, + "learning_rate": 0.0001974204071769468, + "loss": 2.9567, + "step": 432 + }, + { + "epoch": 0.03922900953545786, + "grad_norm": 0.5918708443641663, + "learning_rate": 0.00019741436597595603, + "loss": 3.0307, + "step": 433 + }, + { + "epoch": 0.03931960770990465, + "grad_norm": 0.5780099630355835, + "learning_rate": 0.00019740832477496527, + "loss": 2.9264, + "step": 434 + }, + { + "epoch": 0.03941020588435143, + "grad_norm": 0.6021382808685303, + "learning_rate": 0.00019740228357397453, + "loss": 3.0468, + "step": 435 + }, + { + "epoch": 0.03950080405879822, + "grad_norm": 0.6144555807113647, + "learning_rate": 0.00019739624237298376, + "loss": 2.9342, + "step": 436 + }, + { + "epoch": 0.039591402233245, + "grad_norm": 0.6050962209701538, + "learning_rate": 0.000197390201171993, + "loss": 2.668, + "step": 437 + }, + { + "epoch": 0.039682000407691785, + "grad_norm": 0.64592045545578, + "learning_rate": 0.00019738415997100223, + "loss": 2.926, + "step": 438 + }, + { + "epoch": 0.03977259858213857, + "grad_norm": 0.6361914277076721, + "learning_rate": 0.0001973781187700115, + "loss": 2.8373, + "step": 439 + }, + { + "epoch": 0.03986319675658535, + "grad_norm": 0.6577926278114319, + "learning_rate": 0.00019737207756902075, + "loss": 2.8529, + "step": 440 + }, + { + "epoch": 0.03995379493103214, + "grad_norm": 0.6022813320159912, + "learning_rate": 0.00019736603636802996, + "loss": 2.6749, + "step": 441 + }, + { + "epoch": 0.04004439310547892, + "grad_norm": 0.6203047633171082, + "learning_rate": 0.00019735999516703922, + "loss": 2.8004, + "step": 442 + }, + { + "epoch": 0.04013499127992571, + "grad_norm": 0.6028339266777039, + "learning_rate": 0.00019735395396604846, + "loss": 3.0568, + "step": 443 + }, + { + "epoch": 0.0402255894543725, + "grad_norm": 0.7107727527618408, + "learning_rate": 0.00019734791276505772, + "loss": 3.0139, + "step": 444 + }, + { + "epoch": 0.04031618762881928, + "grad_norm": 0.6401599049568176, + "learning_rate": 0.00019734187156406695, + "loss": 3.0551, + "step": 445 + }, + { + "epoch": 0.040406785803266065, + "grad_norm": 0.6279411911964417, + "learning_rate": 0.00019733583036307618, + "loss": 2.884, + "step": 446 + }, + { + "epoch": 0.040497383977712846, + "grad_norm": 0.6120744943618774, + "learning_rate": 0.00019732978916208542, + "loss": 2.9648, + "step": 447 + }, + { + "epoch": 0.040587982152159634, + "grad_norm": 0.6754257678985596, + "learning_rate": 0.00019732374796109468, + "loss": 2.7577, + "step": 448 + }, + { + "epoch": 0.04067858032660642, + "grad_norm": 0.6690700054168701, + "learning_rate": 0.00019731770676010391, + "loss": 2.9345, + "step": 449 + }, + { + "epoch": 0.0407691785010532, + "grad_norm": 0.5675467252731323, + "learning_rate": 0.00019731166555911315, + "loss": 2.8529, + "step": 450 + }, + { + "epoch": 0.04085977667549999, + "grad_norm": 0.6255645751953125, + "learning_rate": 0.0001973056243581224, + "loss": 3.1765, + "step": 451 + }, + { + "epoch": 0.04095037484994677, + "grad_norm": 0.6278911828994751, + "learning_rate": 0.00019729958315713164, + "loss": 2.9796, + "step": 452 + }, + { + "epoch": 0.04104097302439356, + "grad_norm": 0.628866970539093, + "learning_rate": 0.0001972935419561409, + "loss": 3.1113, + "step": 453 + }, + { + "epoch": 0.041131571198840346, + "grad_norm": 0.6264688372612, + "learning_rate": 0.0001972875007551501, + "loss": 2.9835, + "step": 454 + }, + { + "epoch": 0.041222169373287126, + "grad_norm": 0.6123127937316895, + "learning_rate": 0.00019728145955415937, + "loss": 2.8382, + "step": 455 + }, + { + "epoch": 0.041312767547733914, + "grad_norm": 0.6114105582237244, + "learning_rate": 0.00019727541835316863, + "loss": 2.9788, + "step": 456 + }, + { + "epoch": 0.041403365722180695, + "grad_norm": 0.5846150517463684, + "learning_rate": 0.00019726937715217787, + "loss": 2.9678, + "step": 457 + }, + { + "epoch": 0.04149396389662748, + "grad_norm": 1.0835028886795044, + "learning_rate": 0.0001972633359511871, + "loss": 2.0753, + "step": 458 + }, + { + "epoch": 0.04158456207107427, + "grad_norm": 0.7427387833595276, + "learning_rate": 0.00019725729475019634, + "loss": 2.8978, + "step": 459 + }, + { + "epoch": 0.04167516024552105, + "grad_norm": 0.6263230443000793, + "learning_rate": 0.0001972512535492056, + "loss": 2.8306, + "step": 460 + }, + { + "epoch": 0.04176575841996784, + "grad_norm": 0.6554079651832581, + "learning_rate": 0.00019724521234821483, + "loss": 2.8966, + "step": 461 + }, + { + "epoch": 0.041856356594414626, + "grad_norm": 0.6395089030265808, + "learning_rate": 0.00019723917114722406, + "loss": 2.8523, + "step": 462 + }, + { + "epoch": 0.04194695476886141, + "grad_norm": 0.6253343820571899, + "learning_rate": 0.00019723312994623333, + "loss": 2.9201, + "step": 463 + }, + { + "epoch": 0.042037552943308194, + "grad_norm": 0.6207980513572693, + "learning_rate": 0.00019722708874524256, + "loss": 2.6654, + "step": 464 + }, + { + "epoch": 0.042128151117754975, + "grad_norm": 0.5914795398712158, + "learning_rate": 0.00019722104754425182, + "loss": 2.8745, + "step": 465 + }, + { + "epoch": 0.04221874929220176, + "grad_norm": 0.6707492470741272, + "learning_rate": 0.00019721500634326106, + "loss": 2.7358, + "step": 466 + }, + { + "epoch": 0.04230934746664855, + "grad_norm": 0.8591467142105103, + "learning_rate": 0.0001972089651422703, + "loss": 3.0883, + "step": 467 + }, + { + "epoch": 0.04239994564109533, + "grad_norm": 0.6103144288063049, + "learning_rate": 0.00019720292394127952, + "loss": 3.0348, + "step": 468 + }, + { + "epoch": 0.04249054381554212, + "grad_norm": 1.9862529039382935, + "learning_rate": 0.00019719688274028878, + "loss": 1.591, + "step": 469 + }, + { + "epoch": 0.0425811419899889, + "grad_norm": 0.6738000512123108, + "learning_rate": 0.00019719084153929805, + "loss": 2.7788, + "step": 470 + }, + { + "epoch": 0.04267174016443569, + "grad_norm": 0.6331595182418823, + "learning_rate": 0.00019718480033830725, + "loss": 2.868, + "step": 471 + }, + { + "epoch": 0.042762338338882475, + "grad_norm": 0.5978282690048218, + "learning_rate": 0.00019717875913731651, + "loss": 2.8243, + "step": 472 + }, + { + "epoch": 0.042852936513329255, + "grad_norm": 0.628258228302002, + "learning_rate": 0.00019717271793632575, + "loss": 2.3982, + "step": 473 + }, + { + "epoch": 0.04294353468777604, + "grad_norm": 0.6176953911781311, + "learning_rate": 0.000197166676735335, + "loss": 3.058, + "step": 474 + }, + { + "epoch": 0.043034132862222824, + "grad_norm": 0.6084462404251099, + "learning_rate": 0.00019716063553434422, + "loss": 2.9291, + "step": 475 + }, + { + "epoch": 0.04312473103666961, + "grad_norm": 0.5947203040122986, + "learning_rate": 0.00019715459433335348, + "loss": 2.7937, + "step": 476 + }, + { + "epoch": 0.0432153292111164, + "grad_norm": 0.5778854489326477, + "learning_rate": 0.0001971485531323627, + "loss": 2.7699, + "step": 477 + }, + { + "epoch": 0.04330592738556318, + "grad_norm": 0.6988486051559448, + "learning_rate": 0.00019714251193137197, + "loss": 2.6836, + "step": 478 + }, + { + "epoch": 0.04339652556000997, + "grad_norm": 0.6272429823875427, + "learning_rate": 0.0001971364707303812, + "loss": 2.8436, + "step": 479 + }, + { + "epoch": 0.04348712373445675, + "grad_norm": 0.5914053320884705, + "learning_rate": 0.00019713042952939044, + "loss": 2.5563, + "step": 480 + }, + { + "epoch": 0.043577721908903536, + "grad_norm": 0.5957232713699341, + "learning_rate": 0.0001971243883283997, + "loss": 2.8271, + "step": 481 + }, + { + "epoch": 0.04366832008335032, + "grad_norm": 0.6350996494293213, + "learning_rate": 0.00019711834712740894, + "loss": 3.1166, + "step": 482 + }, + { + "epoch": 0.043758918257797104, + "grad_norm": 0.6090084314346313, + "learning_rate": 0.0001971123059264182, + "loss": 2.6607, + "step": 483 + }, + { + "epoch": 0.04384951643224389, + "grad_norm": 0.5967637896537781, + "learning_rate": 0.0001971062647254274, + "loss": 2.912, + "step": 484 + }, + { + "epoch": 0.04394011460669067, + "grad_norm": 0.5884254574775696, + "learning_rate": 0.00019710022352443666, + "loss": 2.8559, + "step": 485 + }, + { + "epoch": 0.04403071278113746, + "grad_norm": 0.7153838872909546, + "learning_rate": 0.00019709418232344593, + "loss": 2.5556, + "step": 486 + }, + { + "epoch": 0.04412131095558425, + "grad_norm": 0.6987786293029785, + "learning_rate": 0.00019708814112245516, + "loss": 3.0064, + "step": 487 + }, + { + "epoch": 0.04421190913003103, + "grad_norm": 0.5958338975906372, + "learning_rate": 0.0001970820999214644, + "loss": 2.7293, + "step": 488 + }, + { + "epoch": 0.044302507304477816, + "grad_norm": 0.5834402441978455, + "learning_rate": 0.00019707605872047363, + "loss": 2.9193, + "step": 489 + }, + { + "epoch": 0.0443931054789246, + "grad_norm": 0.6119613647460938, + "learning_rate": 0.0001970700175194829, + "loss": 2.9198, + "step": 490 + }, + { + "epoch": 0.044483703653371384, + "grad_norm": 0.6048297882080078, + "learning_rate": 0.00019706397631849212, + "loss": 2.7882, + "step": 491 + }, + { + "epoch": 0.04457430182781817, + "grad_norm": 0.6408690214157104, + "learning_rate": 0.00019705793511750136, + "loss": 2.8714, + "step": 492 + }, + { + "epoch": 0.04466490000226495, + "grad_norm": 0.5954799056053162, + "learning_rate": 0.00019705189391651062, + "loss": 2.8265, + "step": 493 + }, + { + "epoch": 0.04475549817671174, + "grad_norm": 0.5790184140205383, + "learning_rate": 0.00019704585271551985, + "loss": 2.9545, + "step": 494 + }, + { + "epoch": 0.04484609635115852, + "grad_norm": 0.6234376430511475, + "learning_rate": 0.00019703981151452911, + "loss": 2.9772, + "step": 495 + }, + { + "epoch": 0.04493669452560531, + "grad_norm": 0.691173791885376, + "learning_rate": 0.00019703377031353835, + "loss": 2.6783, + "step": 496 + }, + { + "epoch": 0.045027292700052096, + "grad_norm": 0.6251050233840942, + "learning_rate": 0.00019702772911254758, + "loss": 3.1257, + "step": 497 + }, + { + "epoch": 0.04511789087449888, + "grad_norm": 0.5943994522094727, + "learning_rate": 0.00019702168791155682, + "loss": 2.8224, + "step": 498 + }, + { + "epoch": 0.045208489048945665, + "grad_norm": 0.7485010027885437, + "learning_rate": 0.00019701564671056608, + "loss": 3.0018, + "step": 499 + }, + { + "epoch": 0.045299087223392445, + "grad_norm": 0.5951609015464783, + "learning_rate": 0.0001970096055095753, + "loss": 2.9232, + "step": 500 + }, + { + "epoch": 0.04538968539783923, + "grad_norm": 0.6242930889129639, + "learning_rate": 0.00019700356430858455, + "loss": 2.8278, + "step": 501 + }, + { + "epoch": 0.04548028357228602, + "grad_norm": 0.5610213279724121, + "learning_rate": 0.0001969975231075938, + "loss": 2.8193, + "step": 502 + }, + { + "epoch": 0.0455708817467328, + "grad_norm": 0.5818517804145813, + "learning_rate": 0.00019699148190660304, + "loss": 3.0717, + "step": 503 + }, + { + "epoch": 0.04566147992117959, + "grad_norm": 0.5793395638465881, + "learning_rate": 0.0001969854407056123, + "loss": 3.0565, + "step": 504 + }, + { + "epoch": 0.045752078095626376, + "grad_norm": 0.5714972019195557, + "learning_rate": 0.0001969793995046215, + "loss": 2.2253, + "step": 505 + }, + { + "epoch": 0.04584267627007316, + "grad_norm": 0.623180627822876, + "learning_rate": 0.00019697335830363077, + "loss": 3.0239, + "step": 506 + }, + { + "epoch": 0.045933274444519945, + "grad_norm": 0.6228992342948914, + "learning_rate": 0.00019696731710264, + "loss": 3.098, + "step": 507 + }, + { + "epoch": 0.046023872618966725, + "grad_norm": 0.6125683784484863, + "learning_rate": 0.00019696127590164926, + "loss": 2.8813, + "step": 508 + }, + { + "epoch": 0.04611447079341351, + "grad_norm": 0.5980958342552185, + "learning_rate": 0.0001969552347006585, + "loss": 3.0062, + "step": 509 + }, + { + "epoch": 0.0462050689678603, + "grad_norm": 0.5956356525421143, + "learning_rate": 0.00019694919349966773, + "loss": 2.8744, + "step": 510 + }, + { + "epoch": 0.04629566714230708, + "grad_norm": 0.7594051957130432, + "learning_rate": 0.000196943152298677, + "loss": 1.963, + "step": 511 + }, + { + "epoch": 0.04638626531675387, + "grad_norm": 0.681073784828186, + "learning_rate": 0.00019693711109768623, + "loss": 2.853, + "step": 512 + }, + { + "epoch": 0.04647686349120065, + "grad_norm": 0.5120400786399841, + "learning_rate": 0.00019693106989669546, + "loss": 2.1841, + "step": 513 + }, + { + "epoch": 0.04656746166564744, + "grad_norm": 0.6820033192634583, + "learning_rate": 0.0001969250286957047, + "loss": 2.6892, + "step": 514 + }, + { + "epoch": 0.046658059840094225, + "grad_norm": 0.6144059896469116, + "learning_rate": 0.00019691898749471396, + "loss": 2.6829, + "step": 515 + }, + { + "epoch": 0.046748658014541006, + "grad_norm": 1.5551859140396118, + "learning_rate": 0.00019691294629372322, + "loss": 2.3021, + "step": 516 + }, + { + "epoch": 0.04683925618898779, + "grad_norm": 0.5971378087997437, + "learning_rate": 0.00019690690509273245, + "loss": 2.8966, + "step": 517 + }, + { + "epoch": 0.046929854363434574, + "grad_norm": 0.594224214553833, + "learning_rate": 0.0001969008638917417, + "loss": 2.8679, + "step": 518 + }, + { + "epoch": 0.04702045253788136, + "grad_norm": 0.513327419757843, + "learning_rate": 0.00019689482269075092, + "loss": 1.9364, + "step": 519 + }, + { + "epoch": 0.04711105071232815, + "grad_norm": 0.5885871052742004, + "learning_rate": 0.00019688878148976018, + "loss": 2.9275, + "step": 520 + }, + { + "epoch": 0.04720164888677493, + "grad_norm": 0.596716046333313, + "learning_rate": 0.00019688274028876942, + "loss": 3.0352, + "step": 521 + }, + { + "epoch": 0.04729224706122172, + "grad_norm": 0.6523781418800354, + "learning_rate": 0.00019687669908777865, + "loss": 2.9477, + "step": 522 + }, + { + "epoch": 0.0473828452356685, + "grad_norm": 0.6513498425483704, + "learning_rate": 0.0001968706578867879, + "loss": 2.9361, + "step": 523 + }, + { + "epoch": 0.047473443410115286, + "grad_norm": 0.6392056345939636, + "learning_rate": 0.00019686461668579715, + "loss": 3.072, + "step": 524 + }, + { + "epoch": 0.047564041584562074, + "grad_norm": 0.6047998070716858, + "learning_rate": 0.0001968585754848064, + "loss": 2.8653, + "step": 525 + }, + { + "epoch": 0.047654639759008854, + "grad_norm": 0.7065925598144531, + "learning_rate": 0.0001968525342838156, + "loss": 2.2444, + "step": 526 + }, + { + "epoch": 0.04774523793345564, + "grad_norm": 0.5646654367446899, + "learning_rate": 0.00019684649308282487, + "loss": 2.7388, + "step": 527 + }, + { + "epoch": 0.04783583610790242, + "grad_norm": 0.5809914469718933, + "learning_rate": 0.0001968404518818341, + "loss": 2.6327, + "step": 528 + }, + { + "epoch": 0.04792643428234921, + "grad_norm": 0.7150706052780151, + "learning_rate": 0.00019683441068084337, + "loss": 3.082, + "step": 529 + }, + { + "epoch": 0.048017032456796, + "grad_norm": 0.6248458027839661, + "learning_rate": 0.0001968283694798526, + "loss": 2.9004, + "step": 530 + }, + { + "epoch": 0.04810763063124278, + "grad_norm": 0.6077522039413452, + "learning_rate": 0.00019682232827886184, + "loss": 2.8568, + "step": 531 + }, + { + "epoch": 0.048198228805689566, + "grad_norm": 0.7653781175613403, + "learning_rate": 0.0001968162870778711, + "loss": 1.7737, + "step": 532 + }, + { + "epoch": 0.04828882698013635, + "grad_norm": 0.6187346577644348, + "learning_rate": 0.00019681024587688033, + "loss": 2.1113, + "step": 533 + }, + { + "epoch": 0.048379425154583135, + "grad_norm": 0.6186344027519226, + "learning_rate": 0.0001968042046758896, + "loss": 3.0325, + "step": 534 + }, + { + "epoch": 0.04847002332902992, + "grad_norm": 0.6114354729652405, + "learning_rate": 0.0001967981634748988, + "loss": 2.7135, + "step": 535 + }, + { + "epoch": 0.0485606215034767, + "grad_norm": 0.5974509119987488, + "learning_rate": 0.00019679212227390806, + "loss": 2.7092, + "step": 536 + }, + { + "epoch": 0.04865121967792349, + "grad_norm": 0.6368694305419922, + "learning_rate": 0.0001967860810729173, + "loss": 3.0339, + "step": 537 + }, + { + "epoch": 0.04874181785237027, + "grad_norm": 1.2969398498535156, + "learning_rate": 0.00019678003987192656, + "loss": 2.8726, + "step": 538 + }, + { + "epoch": 0.04883241602681706, + "grad_norm": 0.6270187497138977, + "learning_rate": 0.0001967739986709358, + "loss": 2.657, + "step": 539 + }, + { + "epoch": 0.04892301420126385, + "grad_norm": 0.662196934223175, + "learning_rate": 0.00019676795746994503, + "loss": 3.0685, + "step": 540 + }, + { + "epoch": 0.04901361237571063, + "grad_norm": 0.6316245198249817, + "learning_rate": 0.0001967619162689543, + "loss": 3.0285, + "step": 541 + }, + { + "epoch": 0.049104210550157415, + "grad_norm": 0.6539645195007324, + "learning_rate": 0.00019675587506796352, + "loss": 2.8245, + "step": 542 + }, + { + "epoch": 0.0491948087246042, + "grad_norm": 0.6135842204093933, + "learning_rate": 0.00019674983386697275, + "loss": 3.0618, + "step": 543 + }, + { + "epoch": 0.04928540689905098, + "grad_norm": 0.6064906120300293, + "learning_rate": 0.000196743792665982, + "loss": 2.8626, + "step": 544 + }, + { + "epoch": 0.04937600507349777, + "grad_norm": 0.5752968192100525, + "learning_rate": 0.00019673775146499125, + "loss": 2.7812, + "step": 545 + }, + { + "epoch": 0.04946660324794455, + "grad_norm": 0.5703963041305542, + "learning_rate": 0.0001967317102640005, + "loss": 2.9102, + "step": 546 + }, + { + "epoch": 0.04955720142239134, + "grad_norm": 0.7414675354957581, + "learning_rate": 0.00019672566906300975, + "loss": 2.8619, + "step": 547 + }, + { + "epoch": 0.04964779959683813, + "grad_norm": 0.6123768091201782, + "learning_rate": 0.00019671962786201898, + "loss": 2.8463, + "step": 548 + }, + { + "epoch": 0.04973839777128491, + "grad_norm": 0.5913803577423096, + "learning_rate": 0.0001967135866610282, + "loss": 3.0387, + "step": 549 + }, + { + "epoch": 0.049828995945731695, + "grad_norm": 0.6176103949546814, + "learning_rate": 0.00019670754546003747, + "loss": 2.9448, + "step": 550 + }, + { + "epoch": 0.049919594120178476, + "grad_norm": 0.6342665553092957, + "learning_rate": 0.0001967015042590467, + "loss": 2.8327, + "step": 551 + }, + { + "epoch": 0.050010192294625264, + "grad_norm": 0.734687864780426, + "learning_rate": 0.00019669546305805594, + "loss": 2.4003, + "step": 552 + }, + { + "epoch": 0.05010079046907205, + "grad_norm": 0.5702248811721802, + "learning_rate": 0.0001966894218570652, + "loss": 2.8521, + "step": 553 + }, + { + "epoch": 0.05019138864351883, + "grad_norm": 0.6741928458213806, + "learning_rate": 0.00019668338065607444, + "loss": 2.8988, + "step": 554 + }, + { + "epoch": 0.05028198681796562, + "grad_norm": 0.593833327293396, + "learning_rate": 0.0001966773394550837, + "loss": 2.9039, + "step": 555 + }, + { + "epoch": 0.0503725849924124, + "grad_norm": 0.606051504611969, + "learning_rate": 0.0001966712982540929, + "loss": 2.8652, + "step": 556 + }, + { + "epoch": 0.05046318316685919, + "grad_norm": 0.5508158206939697, + "learning_rate": 0.00019666525705310217, + "loss": 1.4666, + "step": 557 + }, + { + "epoch": 0.050553781341305976, + "grad_norm": 0.6184675693511963, + "learning_rate": 0.0001966592158521114, + "loss": 2.9681, + "step": 558 + }, + { + "epoch": 0.050644379515752756, + "grad_norm": 0.6421597003936768, + "learning_rate": 0.00019665317465112066, + "loss": 2.3727, + "step": 559 + }, + { + "epoch": 0.050734977690199544, + "grad_norm": 0.6133762001991272, + "learning_rate": 0.0001966471334501299, + "loss": 2.9764, + "step": 560 + }, + { + "epoch": 0.050825575864646325, + "grad_norm": 0.6110644340515137, + "learning_rate": 0.00019664109224913913, + "loss": 2.9328, + "step": 561 + }, + { + "epoch": 0.05091617403909311, + "grad_norm": 0.7012593150138855, + "learning_rate": 0.0001966350510481484, + "loss": 2.801, + "step": 562 + }, + { + "epoch": 0.0510067722135399, + "grad_norm": 0.6649789214134216, + "learning_rate": 0.00019662900984715763, + "loss": 3.1025, + "step": 563 + }, + { + "epoch": 0.05109737038798668, + "grad_norm": 0.829812228679657, + "learning_rate": 0.00019662296864616686, + "loss": 2.8908, + "step": 564 + }, + { + "epoch": 0.05118796856243347, + "grad_norm": 0.6023240685462952, + "learning_rate": 0.0001966169274451761, + "loss": 2.2533, + "step": 565 + }, + { + "epoch": 0.05127856673688025, + "grad_norm": 0.6532763242721558, + "learning_rate": 0.00019661088624418535, + "loss": 2.6739, + "step": 566 + }, + { + "epoch": 0.05136916491132704, + "grad_norm": 0.5853382349014282, + "learning_rate": 0.0001966048450431946, + "loss": 2.669, + "step": 567 + }, + { + "epoch": 0.051459763085773824, + "grad_norm": 0.6554734706878662, + "learning_rate": 0.00019659880384220385, + "loss": 3.0213, + "step": 568 + }, + { + "epoch": 0.051550361260220605, + "grad_norm": 0.6268676519393921, + "learning_rate": 0.00019659276264121308, + "loss": 2.8569, + "step": 569 + }, + { + "epoch": 0.05164095943466739, + "grad_norm": 0.6099889278411865, + "learning_rate": 0.00019658672144022232, + "loss": 2.8879, + "step": 570 + }, + { + "epoch": 0.05173155760911417, + "grad_norm": 0.6597902774810791, + "learning_rate": 0.00019658068023923158, + "loss": 2.8744, + "step": 571 + }, + { + "epoch": 0.05182215578356096, + "grad_norm": 0.6344771385192871, + "learning_rate": 0.0001965746390382408, + "loss": 3.0114, + "step": 572 + }, + { + "epoch": 0.05191275395800775, + "grad_norm": 0.622954249382019, + "learning_rate": 0.00019656859783725005, + "loss": 2.8162, + "step": 573 + }, + { + "epoch": 0.05200335213245453, + "grad_norm": 0.6444194316864014, + "learning_rate": 0.00019656255663625928, + "loss": 3.0502, + "step": 574 + }, + { + "epoch": 0.05209395030690132, + "grad_norm": 0.5985990762710571, + "learning_rate": 0.00019655651543526854, + "loss": 2.8901, + "step": 575 + }, + { + "epoch": 0.0521845484813481, + "grad_norm": 0.7186000347137451, + "learning_rate": 0.0001965504742342778, + "loss": 2.2903, + "step": 576 + }, + { + "epoch": 0.052275146655794885, + "grad_norm": 0.5860978960990906, + "learning_rate": 0.000196544433033287, + "loss": 2.9586, + "step": 577 + }, + { + "epoch": 0.05236574483024167, + "grad_norm": 0.5975435376167297, + "learning_rate": 0.00019653839183229627, + "loss": 2.8832, + "step": 578 + }, + { + "epoch": 0.052456343004688454, + "grad_norm": 0.6136968731880188, + "learning_rate": 0.0001965323506313055, + "loss": 2.7889, + "step": 579 + }, + { + "epoch": 0.05254694117913524, + "grad_norm": 0.6094182133674622, + "learning_rate": 0.00019652630943031477, + "loss": 2.8955, + "step": 580 + }, + { + "epoch": 0.05263753935358203, + "grad_norm": 0.6266052722930908, + "learning_rate": 0.000196520268229324, + "loss": 2.9186, + "step": 581 + }, + { + "epoch": 0.05272813752802881, + "grad_norm": 0.6527964472770691, + "learning_rate": 0.00019651422702833324, + "loss": 3.1593, + "step": 582 + }, + { + "epoch": 0.0528187357024756, + "grad_norm": 0.602929949760437, + "learning_rate": 0.0001965081858273425, + "loss": 3.0373, + "step": 583 + }, + { + "epoch": 0.05290933387692238, + "grad_norm": 0.6097888946533203, + "learning_rate": 0.00019650214462635173, + "loss": 2.8409, + "step": 584 + }, + { + "epoch": 0.052999932051369165, + "grad_norm": 0.6112160086631775, + "learning_rate": 0.00019649610342536096, + "loss": 3.0269, + "step": 585 + }, + { + "epoch": 0.05309053022581595, + "grad_norm": 0.6273267865180969, + "learning_rate": 0.0001964900622243702, + "loss": 2.9329, + "step": 586 + }, + { + "epoch": 0.053181128400262734, + "grad_norm": 0.6076829433441162, + "learning_rate": 0.00019648402102337946, + "loss": 2.5808, + "step": 587 + }, + { + "epoch": 0.05327172657470952, + "grad_norm": 0.6495882272720337, + "learning_rate": 0.0001964779798223887, + "loss": 3.088, + "step": 588 + }, + { + "epoch": 0.0533623247491563, + "grad_norm": 0.5928909182548523, + "learning_rate": 0.00019647193862139796, + "loss": 2.8359, + "step": 589 + }, + { + "epoch": 0.05345292292360309, + "grad_norm": 0.7667100429534912, + "learning_rate": 0.0001964658974204072, + "loss": 1.5782, + "step": 590 + }, + { + "epoch": 0.05354352109804988, + "grad_norm": 0.5859053134918213, + "learning_rate": 0.00019645985621941642, + "loss": 2.7202, + "step": 591 + }, + { + "epoch": 0.05363411927249666, + "grad_norm": 0.6322861313819885, + "learning_rate": 0.00019645381501842568, + "loss": 2.9735, + "step": 592 + }, + { + "epoch": 0.053724717446943446, + "grad_norm": 0.6219518184661865, + "learning_rate": 0.00019644777381743492, + "loss": 2.956, + "step": 593 + }, + { + "epoch": 0.053815315621390226, + "grad_norm": 0.5699515342712402, + "learning_rate": 0.00019644173261644415, + "loss": 2.6237, + "step": 594 + }, + { + "epoch": 0.053905913795837014, + "grad_norm": 0.737498939037323, + "learning_rate": 0.0001964356914154534, + "loss": 3.1049, + "step": 595 + }, + { + "epoch": 0.0539965119702838, + "grad_norm": 0.6148526668548584, + "learning_rate": 0.00019642965021446265, + "loss": 2.8537, + "step": 596 + }, + { + "epoch": 0.05408711014473058, + "grad_norm": 0.6301900744438171, + "learning_rate": 0.0001964236090134719, + "loss": 2.43, + "step": 597 + }, + { + "epoch": 0.05417770831917737, + "grad_norm": 0.5718515515327454, + "learning_rate": 0.00019641756781248112, + "loss": 2.6132, + "step": 598 + }, + { + "epoch": 0.05426830649362415, + "grad_norm": 0.599317729473114, + "learning_rate": 0.00019641152661149038, + "loss": 2.935, + "step": 599 + }, + { + "epoch": 0.05435890466807094, + "grad_norm": 0.6303460001945496, + "learning_rate": 0.0001964054854104996, + "loss": 2.9483, + "step": 600 + }, + { + "epoch": 0.054449502842517726, + "grad_norm": 0.6382936239242554, + "learning_rate": 0.00019639944420950887, + "loss": 2.9325, + "step": 601 + }, + { + "epoch": 0.05454010101696451, + "grad_norm": 0.6218470335006714, + "learning_rate": 0.0001963934030085181, + "loss": 2.8489, + "step": 602 + }, + { + "epoch": 0.054630699191411294, + "grad_norm": 0.6080501675605774, + "learning_rate": 0.00019638736180752734, + "loss": 2.9842, + "step": 603 + }, + { + "epoch": 0.054721297365858075, + "grad_norm": 0.5986387133598328, + "learning_rate": 0.00019638132060653657, + "loss": 2.8438, + "step": 604 + }, + { + "epoch": 0.05481189554030486, + "grad_norm": 0.5964513421058655, + "learning_rate": 0.00019637527940554584, + "loss": 3.0091, + "step": 605 + }, + { + "epoch": 0.05490249371475165, + "grad_norm": 0.6343696117401123, + "learning_rate": 0.0001963692382045551, + "loss": 2.8343, + "step": 606 + }, + { + "epoch": 0.05499309188919843, + "grad_norm": 0.5807555317878723, + "learning_rate": 0.0001963631970035643, + "loss": 2.7854, + "step": 607 + }, + { + "epoch": 0.05508369006364522, + "grad_norm": 0.6808061003684998, + "learning_rate": 0.00019635715580257356, + "loss": 2.8824, + "step": 608 + }, + { + "epoch": 0.055174288238092, + "grad_norm": 0.8604089617729187, + "learning_rate": 0.0001963511146015828, + "loss": 2.2595, + "step": 609 + }, + { + "epoch": 0.05526488641253879, + "grad_norm": 0.581558346748352, + "learning_rate": 0.00019634507340059206, + "loss": 3.0499, + "step": 610 + }, + { + "epoch": 0.055355484586985575, + "grad_norm": 0.6463174819946289, + "learning_rate": 0.0001963390321996013, + "loss": 3.0654, + "step": 611 + }, + { + "epoch": 0.055446082761432355, + "grad_norm": 0.5751915574073792, + "learning_rate": 0.00019633299099861053, + "loss": 2.1597, + "step": 612 + }, + { + "epoch": 0.05553668093587914, + "grad_norm": 0.6467388272285461, + "learning_rate": 0.0001963269497976198, + "loss": 2.6604, + "step": 613 + }, + { + "epoch": 0.055627279110325924, + "grad_norm": 0.625277042388916, + "learning_rate": 0.00019632090859662902, + "loss": 2.8877, + "step": 614 + }, + { + "epoch": 0.05571787728477271, + "grad_norm": 0.6574413776397705, + "learning_rate": 0.00019631486739563826, + "loss": 3.3024, + "step": 615 + }, + { + "epoch": 0.0558084754592195, + "grad_norm": 0.720551609992981, + "learning_rate": 0.0001963088261946475, + "loss": 2.3305, + "step": 616 + }, + { + "epoch": 0.05589907363366628, + "grad_norm": 0.705237865447998, + "learning_rate": 0.00019630278499365675, + "loss": 2.7659, + "step": 617 + }, + { + "epoch": 0.05598967180811307, + "grad_norm": 0.7519891262054443, + "learning_rate": 0.000196296743792666, + "loss": 2.938, + "step": 618 + }, + { + "epoch": 0.05608026998255985, + "grad_norm": 0.6679311394691467, + "learning_rate": 0.00019629070259167525, + "loss": 2.8285, + "step": 619 + }, + { + "epoch": 0.056170868157006636, + "grad_norm": 0.6420069932937622, + "learning_rate": 0.00019628466139068448, + "loss": 2.7955, + "step": 620 + }, + { + "epoch": 0.05626146633145342, + "grad_norm": 0.6193233132362366, + "learning_rate": 0.00019627862018969372, + "loss": 3.1158, + "step": 621 + }, + { + "epoch": 0.056352064505900204, + "grad_norm": 0.5966829061508179, + "learning_rate": 0.00019627257898870298, + "loss": 2.9997, + "step": 622 + }, + { + "epoch": 0.05644266268034699, + "grad_norm": 0.5895545482635498, + "learning_rate": 0.0001962665377877122, + "loss": 2.9038, + "step": 623 + }, + { + "epoch": 0.05653326085479378, + "grad_norm": 0.6285210847854614, + "learning_rate": 0.00019626049658672145, + "loss": 3.0929, + "step": 624 + }, + { + "epoch": 0.05662385902924056, + "grad_norm": 0.622058629989624, + "learning_rate": 0.00019625445538573068, + "loss": 2.7821, + "step": 625 + }, + { + "epoch": 0.05671445720368735, + "grad_norm": 0.637732207775116, + "learning_rate": 0.00019624841418473994, + "loss": 2.9193, + "step": 626 + }, + { + "epoch": 0.05680505537813413, + "grad_norm": 0.6202144026756287, + "learning_rate": 0.0001962423729837492, + "loss": 3.0005, + "step": 627 + }, + { + "epoch": 0.056895653552580916, + "grad_norm": 0.6393392086029053, + "learning_rate": 0.0001962363317827584, + "loss": 2.7548, + "step": 628 + }, + { + "epoch": 0.056986251727027704, + "grad_norm": 0.6237674951553345, + "learning_rate": 0.00019623029058176767, + "loss": 2.7966, + "step": 629 + }, + { + "epoch": 0.057076849901474484, + "grad_norm": 0.6309906244277954, + "learning_rate": 0.0001962242493807769, + "loss": 2.8117, + "step": 630 + }, + { + "epoch": 0.05716744807592127, + "grad_norm": 0.6361286640167236, + "learning_rate": 0.00019621820817978616, + "loss": 2.9384, + "step": 631 + }, + { + "epoch": 0.05725804625036805, + "grad_norm": 0.6244622468948364, + "learning_rate": 0.0001962121669787954, + "loss": 2.8966, + "step": 632 + }, + { + "epoch": 0.05734864442481484, + "grad_norm": 0.59430992603302, + "learning_rate": 0.00019620612577780463, + "loss": 2.343, + "step": 633 + }, + { + "epoch": 0.05743924259926163, + "grad_norm": 0.6292974948883057, + "learning_rate": 0.00019620008457681387, + "loss": 2.9663, + "step": 634 + }, + { + "epoch": 0.05752984077370841, + "grad_norm": 0.5815742611885071, + "learning_rate": 0.00019619404337582313, + "loss": 2.6303, + "step": 635 + }, + { + "epoch": 0.057620438948155196, + "grad_norm": 0.6654151678085327, + "learning_rate": 0.00019618800217483236, + "loss": 2.8793, + "step": 636 + }, + { + "epoch": 0.05771103712260198, + "grad_norm": 0.6319230794906616, + "learning_rate": 0.0001961819609738416, + "loss": 2.8959, + "step": 637 + }, + { + "epoch": 0.057801635297048765, + "grad_norm": 0.598869800567627, + "learning_rate": 0.00019617591977285086, + "loss": 2.8797, + "step": 638 + }, + { + "epoch": 0.05789223347149555, + "grad_norm": 0.6503645777702332, + "learning_rate": 0.0001961698785718601, + "loss": 2.7765, + "step": 639 + }, + { + "epoch": 0.05798283164594233, + "grad_norm": 0.6164289116859436, + "learning_rate": 0.00019616383737086935, + "loss": 2.924, + "step": 640 + }, + { + "epoch": 0.05807342982038912, + "grad_norm": 0.6202648282051086, + "learning_rate": 0.00019615779616987856, + "loss": 3.0362, + "step": 641 + }, + { + "epoch": 0.0581640279948359, + "grad_norm": 0.6067506074905396, + "learning_rate": 0.00019615175496888782, + "loss": 2.8883, + "step": 642 + }, + { + "epoch": 0.05825462616928269, + "grad_norm": 0.6467812061309814, + "learning_rate": 0.00019614571376789708, + "loss": 2.7506, + "step": 643 + }, + { + "epoch": 0.058345224343729477, + "grad_norm": 0.6004661917686462, + "learning_rate": 0.00019613967256690632, + "loss": 3.1021, + "step": 644 + }, + { + "epoch": 0.05843582251817626, + "grad_norm": 0.752157986164093, + "learning_rate": 0.00019613363136591555, + "loss": 2.2823, + "step": 645 + }, + { + "epoch": 0.058526420692623045, + "grad_norm": 0.6502413153648376, + "learning_rate": 0.00019612759016492478, + "loss": 2.745, + "step": 646 + }, + { + "epoch": 0.058617018867069826, + "grad_norm": 0.6339449286460876, + "learning_rate": 0.00019612154896393405, + "loss": 2.9491, + "step": 647 + }, + { + "epoch": 0.05870761704151661, + "grad_norm": 0.6548113226890564, + "learning_rate": 0.00019611550776294328, + "loss": 3.2439, + "step": 648 + }, + { + "epoch": 0.0587982152159634, + "grad_norm": 0.6549171209335327, + "learning_rate": 0.0001961094665619525, + "loss": 2.8839, + "step": 649 + }, + { + "epoch": 0.05888881339041018, + "grad_norm": 0.6079529523849487, + "learning_rate": 0.00019610342536096177, + "loss": 2.8589, + "step": 650 + }, + { + "epoch": 0.05897941156485697, + "grad_norm": 0.7953592538833618, + "learning_rate": 0.000196097384159971, + "loss": 2.2952, + "step": 651 + }, + { + "epoch": 0.05907000973930375, + "grad_norm": 0.6396510004997253, + "learning_rate": 0.00019609134295898027, + "loss": 2.8937, + "step": 652 + }, + { + "epoch": 0.05916060791375054, + "grad_norm": 0.63138347864151, + "learning_rate": 0.0001960853017579895, + "loss": 2.9627, + "step": 653 + }, + { + "epoch": 0.059251206088197325, + "grad_norm": 0.6153807640075684, + "learning_rate": 0.00019607926055699874, + "loss": 3.0584, + "step": 654 + }, + { + "epoch": 0.059341804262644106, + "grad_norm": 0.6293632388114929, + "learning_rate": 0.00019607321935600797, + "loss": 2.8651, + "step": 655 + }, + { + "epoch": 0.059432402437090893, + "grad_norm": 0.6323422193527222, + "learning_rate": 0.00019606717815501723, + "loss": 2.7082, + "step": 656 + }, + { + "epoch": 0.059523000611537674, + "grad_norm": 0.676231861114502, + "learning_rate": 0.0001960611369540265, + "loss": 2.7271, + "step": 657 + }, + { + "epoch": 0.05961359878598446, + "grad_norm": 0.6075476408004761, + "learning_rate": 0.0001960550957530357, + "loss": 2.8427, + "step": 658 + }, + { + "epoch": 0.05970419696043125, + "grad_norm": 0.6186105012893677, + "learning_rate": 0.00019604905455204496, + "loss": 2.6346, + "step": 659 + }, + { + "epoch": 0.05979479513487803, + "grad_norm": 0.6413179636001587, + "learning_rate": 0.0001960430133510542, + "loss": 2.8886, + "step": 660 + }, + { + "epoch": 0.05988539330932482, + "grad_norm": 0.6246578693389893, + "learning_rate": 0.00019603697215006346, + "loss": 2.777, + "step": 661 + }, + { + "epoch": 0.059975991483771605, + "grad_norm": 0.6412546038627625, + "learning_rate": 0.00019603093094907266, + "loss": 2.7512, + "step": 662 + }, + { + "epoch": 0.060066589658218386, + "grad_norm": 0.6057112812995911, + "learning_rate": 0.00019602488974808193, + "loss": 2.9345, + "step": 663 + }, + { + "epoch": 0.060157187832665174, + "grad_norm": 0.5894813537597656, + "learning_rate": 0.00019601884854709116, + "loss": 2.2114, + "step": 664 + }, + { + "epoch": 0.060247786007111954, + "grad_norm": 0.6757852435112, + "learning_rate": 0.00019601280734610042, + "loss": 2.8222, + "step": 665 + }, + { + "epoch": 0.06033838418155874, + "grad_norm": 0.625087559223175, + "learning_rate": 0.00019600676614510965, + "loss": 2.1651, + "step": 666 + }, + { + "epoch": 0.06042898235600553, + "grad_norm": 0.6691910028457642, + "learning_rate": 0.0001960007249441189, + "loss": 2.6279, + "step": 667 + }, + { + "epoch": 0.06051958053045231, + "grad_norm": 0.6628668308258057, + "learning_rate": 0.00019599468374312815, + "loss": 2.7881, + "step": 668 + }, + { + "epoch": 0.0606101787048991, + "grad_norm": 0.6445559859275818, + "learning_rate": 0.00019598864254213738, + "loss": 2.8472, + "step": 669 + }, + { + "epoch": 0.06070077687934588, + "grad_norm": 0.6285856366157532, + "learning_rate": 0.00019598260134114665, + "loss": 2.8341, + "step": 670 + }, + { + "epoch": 0.060791375053792666, + "grad_norm": 0.6249302625656128, + "learning_rate": 0.00019597656014015585, + "loss": 2.8916, + "step": 671 + }, + { + "epoch": 0.060881973228239454, + "grad_norm": 0.7041561007499695, + "learning_rate": 0.0001959705189391651, + "loss": 2.9888, + "step": 672 + }, + { + "epoch": 0.060972571402686235, + "grad_norm": 0.706998884677887, + "learning_rate": 0.00019596447773817437, + "loss": 2.8361, + "step": 673 + }, + { + "epoch": 0.06106316957713302, + "grad_norm": 0.6398859024047852, + "learning_rate": 0.0001959584365371836, + "loss": 2.9281, + "step": 674 + }, + { + "epoch": 0.0611537677515798, + "grad_norm": 0.6218152642250061, + "learning_rate": 0.00019595239533619284, + "loss": 2.9616, + "step": 675 + }, + { + "epoch": 0.06124436592602659, + "grad_norm": 0.6453343033790588, + "learning_rate": 0.00019594635413520208, + "loss": 2.909, + "step": 676 + }, + { + "epoch": 0.06133496410047338, + "grad_norm": 0.6696106195449829, + "learning_rate": 0.00019594031293421134, + "loss": 2.797, + "step": 677 + }, + { + "epoch": 0.06142556227492016, + "grad_norm": 0.5859602689743042, + "learning_rate": 0.00019593427173322057, + "loss": 2.7539, + "step": 678 + }, + { + "epoch": 0.06151616044936695, + "grad_norm": 0.6216243505477905, + "learning_rate": 0.0001959282305322298, + "loss": 2.9939, + "step": 679 + }, + { + "epoch": 0.06160675862381373, + "grad_norm": 0.5957897901535034, + "learning_rate": 0.00019592218933123907, + "loss": 2.7298, + "step": 680 + }, + { + "epoch": 0.061697356798260515, + "grad_norm": 0.616606593132019, + "learning_rate": 0.0001959161481302483, + "loss": 2.7123, + "step": 681 + }, + { + "epoch": 0.0617879549727073, + "grad_norm": 0.6284594535827637, + "learning_rate": 0.00019591010692925756, + "loss": 2.1762, + "step": 682 + }, + { + "epoch": 0.06187855314715408, + "grad_norm": 0.5968490242958069, + "learning_rate": 0.0001959040657282668, + "loss": 2.8365, + "step": 683 + }, + { + "epoch": 0.06196915132160087, + "grad_norm": 0.6055828332901001, + "learning_rate": 0.00019589802452727603, + "loss": 2.9113, + "step": 684 + }, + { + "epoch": 0.06205974949604765, + "grad_norm": 0.6430021524429321, + "learning_rate": 0.00019589198332628526, + "loss": 2.9131, + "step": 685 + }, + { + "epoch": 0.06215034767049444, + "grad_norm": 0.5973049402236938, + "learning_rate": 0.00019588594212529453, + "loss": 2.9448, + "step": 686 + }, + { + "epoch": 0.06224094584494123, + "grad_norm": 0.6428831219673157, + "learning_rate": 0.00019587990092430376, + "loss": 2.0988, + "step": 687 + }, + { + "epoch": 0.06233154401938801, + "grad_norm": 0.6567192077636719, + "learning_rate": 0.000195873859723313, + "loss": 2.8535, + "step": 688 + }, + { + "epoch": 0.062422142193834795, + "grad_norm": 0.6435191035270691, + "learning_rate": 0.00019586781852232225, + "loss": 3.1342, + "step": 689 + }, + { + "epoch": 0.06251274036828158, + "grad_norm": 0.6329708099365234, + "learning_rate": 0.0001958617773213315, + "loss": 2.8911, + "step": 690 + }, + { + "epoch": 0.06260333854272837, + "grad_norm": 0.6363168358802795, + "learning_rate": 0.00019585573612034075, + "loss": 2.9645, + "step": 691 + }, + { + "epoch": 0.06269393671717514, + "grad_norm": 0.6670746803283691, + "learning_rate": 0.00019584969491934996, + "loss": 2.8076, + "step": 692 + }, + { + "epoch": 0.06278453489162193, + "grad_norm": 0.6386610865592957, + "learning_rate": 0.00019584365371835922, + "loss": 2.9983, + "step": 693 + }, + { + "epoch": 0.06287513306606872, + "grad_norm": 0.6171874403953552, + "learning_rate": 0.00019583761251736845, + "loss": 2.2204, + "step": 694 + }, + { + "epoch": 0.06296573124051551, + "grad_norm": 0.630914032459259, + "learning_rate": 0.0001958315713163777, + "loss": 3.0333, + "step": 695 + }, + { + "epoch": 0.0630563294149623, + "grad_norm": 0.7345241904258728, + "learning_rate": 0.00019582553011538695, + "loss": 3.0072, + "step": 696 + }, + { + "epoch": 0.06314692758940907, + "grad_norm": 0.6355704665184021, + "learning_rate": 0.00019581948891439618, + "loss": 2.9049, + "step": 697 + }, + { + "epoch": 0.06323752576385586, + "grad_norm": 0.6208865642547607, + "learning_rate": 0.00019581344771340544, + "loss": 3.055, + "step": 698 + }, + { + "epoch": 0.06332812393830264, + "grad_norm": 0.6472211480140686, + "learning_rate": 0.00019580740651241468, + "loss": 2.2303, + "step": 699 + }, + { + "epoch": 0.06341872211274943, + "grad_norm": 0.5964230895042419, + "learning_rate": 0.0001958013653114239, + "loss": 3.0859, + "step": 700 + }, + { + "epoch": 0.06350932028719622, + "grad_norm": 0.6241071224212646, + "learning_rate": 0.00019579532411043314, + "loss": 2.9484, + "step": 701 + }, + { + "epoch": 0.06359991846164299, + "grad_norm": 0.5706894993782043, + "learning_rate": 0.0001957892829094424, + "loss": 2.2155, + "step": 702 + }, + { + "epoch": 0.06369051663608978, + "grad_norm": 0.6108623147010803, + "learning_rate": 0.00019578324170845167, + "loss": 3.0118, + "step": 703 + }, + { + "epoch": 0.06378111481053657, + "grad_norm": 0.6143243312835693, + "learning_rate": 0.0001957772005074609, + "loss": 2.8264, + "step": 704 + }, + { + "epoch": 0.06387171298498336, + "grad_norm": 0.6026588678359985, + "learning_rate": 0.00019577115930647014, + "loss": 2.8397, + "step": 705 + }, + { + "epoch": 0.06396231115943014, + "grad_norm": 0.6183353662490845, + "learning_rate": 0.00019576511810547937, + "loss": 2.8507, + "step": 706 + }, + { + "epoch": 0.06405290933387692, + "grad_norm": 0.7235807180404663, + "learning_rate": 0.00019575907690448863, + "loss": 2.9471, + "step": 707 + }, + { + "epoch": 0.0641435075083237, + "grad_norm": 0.6599359512329102, + "learning_rate": 0.00019575303570349786, + "loss": 3.0691, + "step": 708 + }, + { + "epoch": 0.06423410568277049, + "grad_norm": 0.6761690378189087, + "learning_rate": 0.0001957469945025071, + "loss": 2.8684, + "step": 709 + }, + { + "epoch": 0.06432470385721728, + "grad_norm": 0.6177396774291992, + "learning_rate": 0.00019574095330151636, + "loss": 2.9133, + "step": 710 + }, + { + "epoch": 0.06441530203166407, + "grad_norm": 0.6361250877380371, + "learning_rate": 0.0001957349121005256, + "loss": 2.8112, + "step": 711 + }, + { + "epoch": 0.06450590020611084, + "grad_norm": 0.689286470413208, + "learning_rate": 0.00019572887089953485, + "loss": 3.1214, + "step": 712 + }, + { + "epoch": 0.06459649838055763, + "grad_norm": 0.6681810021400452, + "learning_rate": 0.00019572282969854406, + "loss": 2.8333, + "step": 713 + }, + { + "epoch": 0.06468709655500442, + "grad_norm": 0.6038305759429932, + "learning_rate": 0.00019571678849755332, + "loss": 2.7984, + "step": 714 + }, + { + "epoch": 0.0647776947294512, + "grad_norm": 0.6343244314193726, + "learning_rate": 0.00019571074729656256, + "loss": 2.7309, + "step": 715 + }, + { + "epoch": 0.06486829290389799, + "grad_norm": 0.6442089676856995, + "learning_rate": 0.00019570470609557182, + "loss": 2.6839, + "step": 716 + }, + { + "epoch": 0.06495889107834477, + "grad_norm": 0.6438726186752319, + "learning_rate": 0.00019569866489458105, + "loss": 2.981, + "step": 717 + }, + { + "epoch": 0.06504948925279155, + "grad_norm": 0.6059059500694275, + "learning_rate": 0.00019569262369359029, + "loss": 2.9203, + "step": 718 + }, + { + "epoch": 0.06514008742723834, + "grad_norm": 0.6218549013137817, + "learning_rate": 0.00019568658249259955, + "loss": 2.8026, + "step": 719 + }, + { + "epoch": 0.06523068560168513, + "grad_norm": 0.63478684425354, + "learning_rate": 0.00019568054129160878, + "loss": 2.7488, + "step": 720 + }, + { + "epoch": 0.06532128377613192, + "grad_norm": 0.5639198422431946, + "learning_rate": 0.00019567450009061804, + "loss": 2.0424, + "step": 721 + }, + { + "epoch": 0.06541188195057869, + "grad_norm": 0.6009072065353394, + "learning_rate": 0.00019566845888962725, + "loss": 3.0057, + "step": 722 + }, + { + "epoch": 0.06550248012502548, + "grad_norm": 0.6767693758010864, + "learning_rate": 0.0001956624176886365, + "loss": 3.0585, + "step": 723 + }, + { + "epoch": 0.06559307829947227, + "grad_norm": 0.6423208713531494, + "learning_rate": 0.00019565637648764574, + "loss": 2.2272, + "step": 724 + }, + { + "epoch": 0.06568367647391905, + "grad_norm": 0.6711210608482361, + "learning_rate": 0.000195650335286655, + "loss": 2.9858, + "step": 725 + }, + { + "epoch": 0.06577427464836584, + "grad_norm": 0.624876856803894, + "learning_rate": 0.00019564429408566424, + "loss": 3.1219, + "step": 726 + }, + { + "epoch": 0.06586487282281261, + "grad_norm": 0.6325036287307739, + "learning_rate": 0.00019563825288467347, + "loss": 2.7794, + "step": 727 + }, + { + "epoch": 0.0659554709972594, + "grad_norm": 0.5756207704544067, + "learning_rate": 0.00019563221168368274, + "loss": 2.0641, + "step": 728 + }, + { + "epoch": 0.06604606917170619, + "grad_norm": 0.62553870677948, + "learning_rate": 0.00019562617048269197, + "loss": 3.2133, + "step": 729 + }, + { + "epoch": 0.06613666734615298, + "grad_norm": 0.6667864918708801, + "learning_rate": 0.0001956201292817012, + "loss": 2.8121, + "step": 730 + }, + { + "epoch": 0.06622726552059977, + "grad_norm": 0.6537067890167236, + "learning_rate": 0.00019561408808071044, + "loss": 2.6667, + "step": 731 + }, + { + "epoch": 0.06631786369504654, + "grad_norm": 0.6298943758010864, + "learning_rate": 0.0001956080468797197, + "loss": 2.2249, + "step": 732 + }, + { + "epoch": 0.06640846186949333, + "grad_norm": 0.6417508721351624, + "learning_rate": 0.00019560200567872896, + "loss": 2.8135, + "step": 733 + }, + { + "epoch": 0.06649906004394011, + "grad_norm": 0.6363499760627747, + "learning_rate": 0.0001955959644777382, + "loss": 3.1879, + "step": 734 + }, + { + "epoch": 0.0665896582183869, + "grad_norm": 0.6459006071090698, + "learning_rate": 0.00019558992327674743, + "loss": 2.8859, + "step": 735 + }, + { + "epoch": 0.06668025639283369, + "grad_norm": 0.6350418925285339, + "learning_rate": 0.00019558388207575666, + "loss": 2.6935, + "step": 736 + }, + { + "epoch": 0.06677085456728046, + "grad_norm": 0.5899524092674255, + "learning_rate": 0.00019557784087476592, + "loss": 2.8721, + "step": 737 + }, + { + "epoch": 0.06686145274172725, + "grad_norm": 0.685865044593811, + "learning_rate": 0.00019557179967377516, + "loss": 2.8946, + "step": 738 + }, + { + "epoch": 0.06695205091617404, + "grad_norm": 0.64324951171875, + "learning_rate": 0.0001955657584727844, + "loss": 2.7285, + "step": 739 + }, + { + "epoch": 0.06704264909062083, + "grad_norm": 0.657703161239624, + "learning_rate": 0.00019555971727179365, + "loss": 2.9345, + "step": 740 + }, + { + "epoch": 0.06713324726506761, + "grad_norm": 0.6385548710823059, + "learning_rate": 0.00019555367607080289, + "loss": 2.9501, + "step": 741 + }, + { + "epoch": 0.06722384543951439, + "grad_norm": 0.6747270822525024, + "learning_rate": 0.00019554763486981215, + "loss": 2.6825, + "step": 742 + }, + { + "epoch": 0.06731444361396118, + "grad_norm": 0.6449041962623596, + "learning_rate": 0.00019554159366882135, + "loss": 2.8981, + "step": 743 + }, + { + "epoch": 0.06740504178840796, + "grad_norm": 0.726801335811615, + "learning_rate": 0.00019553555246783062, + "loss": 3.0596, + "step": 744 + }, + { + "epoch": 0.06749563996285475, + "grad_norm": 0.6070284247398376, + "learning_rate": 0.00019552951126683985, + "loss": 2.9752, + "step": 745 + }, + { + "epoch": 0.06758623813730154, + "grad_norm": 0.6228007078170776, + "learning_rate": 0.0001955234700658491, + "loss": 3.1066, + "step": 746 + }, + { + "epoch": 0.06767683631174831, + "grad_norm": 0.6405165195465088, + "learning_rate": 0.00019551742886485834, + "loss": 3.0388, + "step": 747 + }, + { + "epoch": 0.0677674344861951, + "grad_norm": 0.6080182194709778, + "learning_rate": 0.00019551138766386758, + "loss": 2.9269, + "step": 748 + }, + { + "epoch": 0.06785803266064189, + "grad_norm": 0.5327296257019043, + "learning_rate": 0.00019550534646287684, + "loss": 2.1552, + "step": 749 + }, + { + "epoch": 0.06794863083508867, + "grad_norm": 0.7093128561973572, + "learning_rate": 0.00019549930526188607, + "loss": 3.0536, + "step": 750 + }, + { + "epoch": 0.06803922900953546, + "grad_norm": 0.5703115463256836, + "learning_rate": 0.0001954932640608953, + "loss": 2.8444, + "step": 751 + }, + { + "epoch": 0.06812982718398224, + "grad_norm": 0.6048662066459656, + "learning_rate": 0.00019548722285990454, + "loss": 2.8015, + "step": 752 + }, + { + "epoch": 0.06822042535842902, + "grad_norm": 0.7090009450912476, + "learning_rate": 0.0001954811816589138, + "loss": 3.0078, + "step": 753 + }, + { + "epoch": 0.06831102353287581, + "grad_norm": 0.624772310256958, + "learning_rate": 0.00019547514045792304, + "loss": 3.0881, + "step": 754 + }, + { + "epoch": 0.0684016217073226, + "grad_norm": 0.6473666429519653, + "learning_rate": 0.0001954690992569323, + "loss": 3.023, + "step": 755 + }, + { + "epoch": 0.06849221988176939, + "grad_norm": 0.6881373524665833, + "learning_rate": 0.00019546305805594153, + "loss": 3.249, + "step": 756 + }, + { + "epoch": 0.06858281805621616, + "grad_norm": 0.6461966037750244, + "learning_rate": 0.00019545701685495077, + "loss": 2.8076, + "step": 757 + }, + { + "epoch": 0.06867341623066295, + "grad_norm": 0.7672095894813538, + "learning_rate": 0.00019545097565396003, + "loss": 2.9221, + "step": 758 + }, + { + "epoch": 0.06876401440510974, + "grad_norm": 0.5894637703895569, + "learning_rate": 0.00019544493445296926, + "loss": 1.4896, + "step": 759 + }, + { + "epoch": 0.06885461257955652, + "grad_norm": 0.5898556113243103, + "learning_rate": 0.0001954388932519785, + "loss": 2.6479, + "step": 760 + }, + { + "epoch": 0.06894521075400331, + "grad_norm": 0.6556459665298462, + "learning_rate": 0.00019543285205098773, + "loss": 2.8774, + "step": 761 + }, + { + "epoch": 0.0690358089284501, + "grad_norm": 0.6452184319496155, + "learning_rate": 0.000195426810849997, + "loss": 2.9291, + "step": 762 + }, + { + "epoch": 0.06912640710289687, + "grad_norm": 0.593367874622345, + "learning_rate": 0.00019542076964900625, + "loss": 2.8252, + "step": 763 + }, + { + "epoch": 0.06921700527734366, + "grad_norm": 0.671766996383667, + "learning_rate": 0.00019541472844801546, + "loss": 3.05, + "step": 764 + }, + { + "epoch": 0.06930760345179045, + "grad_norm": 0.6063161492347717, + "learning_rate": 0.00019540868724702472, + "loss": 2.9323, + "step": 765 + }, + { + "epoch": 0.06939820162623724, + "grad_norm": 0.6180351376533508, + "learning_rate": 0.00019540264604603395, + "loss": 2.9389, + "step": 766 + }, + { + "epoch": 0.06948879980068402, + "grad_norm": 0.6207804083824158, + "learning_rate": 0.00019539660484504322, + "loss": 2.8631, + "step": 767 + }, + { + "epoch": 0.0695793979751308, + "grad_norm": 0.6537436246871948, + "learning_rate": 0.00019539056364405245, + "loss": 3.0047, + "step": 768 + }, + { + "epoch": 0.06966999614957758, + "grad_norm": 0.5955865979194641, + "learning_rate": 0.00019538452244306168, + "loss": 2.6198, + "step": 769 + }, + { + "epoch": 0.06976059432402437, + "grad_norm": 0.6724928617477417, + "learning_rate": 0.00019537848124207094, + "loss": 3.0224, + "step": 770 + }, + { + "epoch": 0.06985119249847116, + "grad_norm": 0.5875077247619629, + "learning_rate": 0.00019537244004108018, + "loss": 2.6263, + "step": 771 + }, + { + "epoch": 0.06994179067291795, + "grad_norm": 0.6257995367050171, + "learning_rate": 0.0001953663988400894, + "loss": 2.6392, + "step": 772 + }, + { + "epoch": 0.07003238884736472, + "grad_norm": 0.7739941477775574, + "learning_rate": 0.00019536035763909865, + "loss": 3.0144, + "step": 773 + }, + { + "epoch": 0.07012298702181151, + "grad_norm": 0.6303402781486511, + "learning_rate": 0.0001953543164381079, + "loss": 3.0087, + "step": 774 + }, + { + "epoch": 0.0702135851962583, + "grad_norm": 0.6012027263641357, + "learning_rate": 0.00019534827523711714, + "loss": 2.7836, + "step": 775 + }, + { + "epoch": 0.07030418337070508, + "grad_norm": 0.6020978093147278, + "learning_rate": 0.0001953422340361264, + "loss": 2.8513, + "step": 776 + }, + { + "epoch": 0.07039478154515187, + "grad_norm": 0.6101019978523254, + "learning_rate": 0.00019533619283513564, + "loss": 2.7842, + "step": 777 + }, + { + "epoch": 0.07048537971959865, + "grad_norm": 0.5578672885894775, + "learning_rate": 0.00019533015163414487, + "loss": 2.1953, + "step": 778 + }, + { + "epoch": 0.07057597789404543, + "grad_norm": 0.6411239504814148, + "learning_rate": 0.00019532411043315413, + "loss": 2.7761, + "step": 779 + }, + { + "epoch": 0.07066657606849222, + "grad_norm": 0.6788807511329651, + "learning_rate": 0.00019531806923216337, + "loss": 2.9811, + "step": 780 + }, + { + "epoch": 0.07075717424293901, + "grad_norm": 0.5958601832389832, + "learning_rate": 0.0001953120280311726, + "loss": 2.6375, + "step": 781 + }, + { + "epoch": 0.0708477724173858, + "grad_norm": 0.9769750237464905, + "learning_rate": 0.00019530598683018183, + "loss": 2.7376, + "step": 782 + }, + { + "epoch": 0.07093837059183257, + "grad_norm": 0.608953595161438, + "learning_rate": 0.0001952999456291911, + "loss": 2.9147, + "step": 783 + }, + { + "epoch": 0.07102896876627936, + "grad_norm": 0.6491851806640625, + "learning_rate": 0.00019529390442820033, + "loss": 2.8145, + "step": 784 + }, + { + "epoch": 0.07111956694072614, + "grad_norm": 0.5694431066513062, + "learning_rate": 0.00019528786322720956, + "loss": 2.756, + "step": 785 + }, + { + "epoch": 0.07121016511517293, + "grad_norm": 0.6129596829414368, + "learning_rate": 0.00019528182202621883, + "loss": 2.7911, + "step": 786 + }, + { + "epoch": 0.07130076328961972, + "grad_norm": 0.6488217711448669, + "learning_rate": 0.00019527578082522806, + "loss": 2.9029, + "step": 787 + }, + { + "epoch": 0.0713913614640665, + "grad_norm": 0.6079372763633728, + "learning_rate": 0.00019526973962423732, + "loss": 2.808, + "step": 788 + }, + { + "epoch": 0.07148195963851328, + "grad_norm": 0.625235378742218, + "learning_rate": 0.00019526369842324655, + "loss": 2.8959, + "step": 789 + }, + { + "epoch": 0.07157255781296007, + "grad_norm": 0.7416746616363525, + "learning_rate": 0.0001952576572222558, + "loss": 2.7873, + "step": 790 + }, + { + "epoch": 0.07166315598740686, + "grad_norm": 0.611610472202301, + "learning_rate": 0.00019525161602126502, + "loss": 3.0387, + "step": 791 + }, + { + "epoch": 0.07175375416185364, + "grad_norm": 0.6518426537513733, + "learning_rate": 0.00019524557482027428, + "loss": 2.9412, + "step": 792 + }, + { + "epoch": 0.07184435233630042, + "grad_norm": 0.679958164691925, + "learning_rate": 0.00019523953361928355, + "loss": 2.785, + "step": 793 + }, + { + "epoch": 0.0719349505107472, + "grad_norm": 0.621355414390564, + "learning_rate": 0.00019523349241829275, + "loss": 2.8208, + "step": 794 + }, + { + "epoch": 0.072025548685194, + "grad_norm": 0.5956329703330994, + "learning_rate": 0.000195227451217302, + "loss": 2.2441, + "step": 795 + }, + { + "epoch": 0.07211614685964078, + "grad_norm": 0.6781240701675415, + "learning_rate": 0.00019522141001631125, + "loss": 2.8253, + "step": 796 + }, + { + "epoch": 0.07220674503408757, + "grad_norm": 0.6138700842857361, + "learning_rate": 0.0001952153688153205, + "loss": 2.6542, + "step": 797 + }, + { + "epoch": 0.07229734320853434, + "grad_norm": 0.6697364449501038, + "learning_rate": 0.00019520932761432974, + "loss": 2.0647, + "step": 798 + }, + { + "epoch": 0.07238794138298113, + "grad_norm": 0.6133973598480225, + "learning_rate": 0.00019520328641333898, + "loss": 2.7577, + "step": 799 + }, + { + "epoch": 0.07247853955742792, + "grad_norm": 0.5915447473526001, + "learning_rate": 0.00019519724521234824, + "loss": 3.0069, + "step": 800 + }, + { + "epoch": 0.0725691377318747, + "grad_norm": 0.6910145878791809, + "learning_rate": 0.00019519120401135747, + "loss": 2.8613, + "step": 801 + }, + { + "epoch": 0.0726597359063215, + "grad_norm": 0.6220174431800842, + "learning_rate": 0.0001951851628103667, + "loss": 3.1229, + "step": 802 + }, + { + "epoch": 0.07275033408076827, + "grad_norm": 0.6309188008308411, + "learning_rate": 0.00019517912160937594, + "loss": 3.2036, + "step": 803 + }, + { + "epoch": 0.07284093225521505, + "grad_norm": 0.6074507832527161, + "learning_rate": 0.0001951730804083852, + "loss": 2.5657, + "step": 804 + }, + { + "epoch": 0.07293153042966184, + "grad_norm": 0.6680125594139099, + "learning_rate": 0.00019516703920739444, + "loss": 2.9565, + "step": 805 + }, + { + "epoch": 0.07302212860410863, + "grad_norm": 0.7237497568130493, + "learning_rate": 0.0001951609980064037, + "loss": 2.7358, + "step": 806 + }, + { + "epoch": 0.07311272677855542, + "grad_norm": 0.6686829328536987, + "learning_rate": 0.00019515495680541293, + "loss": 3.0487, + "step": 807 + }, + { + "epoch": 0.07320332495300219, + "grad_norm": 0.667779266834259, + "learning_rate": 0.00019514891560442216, + "loss": 3.0868, + "step": 808 + }, + { + "epoch": 0.07329392312744898, + "grad_norm": 0.6244952082633972, + "learning_rate": 0.00019514287440343143, + "loss": 2.9267, + "step": 809 + }, + { + "epoch": 0.07338452130189577, + "grad_norm": 0.6745501756668091, + "learning_rate": 0.00019513683320244066, + "loss": 2.9467, + "step": 810 + }, + { + "epoch": 0.07347511947634255, + "grad_norm": 0.589719295501709, + "learning_rate": 0.0001951307920014499, + "loss": 1.5579, + "step": 811 + }, + { + "epoch": 0.07356571765078934, + "grad_norm": 0.6935045719146729, + "learning_rate": 0.00019512475080045913, + "loss": 2.8364, + "step": 812 + }, + { + "epoch": 0.07365631582523612, + "grad_norm": 0.6358436942100525, + "learning_rate": 0.0001951187095994684, + "loss": 2.1109, + "step": 813 + }, + { + "epoch": 0.0737469139996829, + "grad_norm": 0.6458626985549927, + "learning_rate": 0.00019511266839847762, + "loss": 3.002, + "step": 814 + }, + { + "epoch": 0.07383751217412969, + "grad_norm": 0.6256845593452454, + "learning_rate": 0.00019510662719748686, + "loss": 2.3092, + "step": 815 + }, + { + "epoch": 0.07392811034857648, + "grad_norm": 0.679996132850647, + "learning_rate": 0.00019510058599649612, + "loss": 3.256, + "step": 816 + }, + { + "epoch": 0.07401870852302327, + "grad_norm": 0.5374927520751953, + "learning_rate": 0.00019509454479550535, + "loss": 2.1524, + "step": 817 + }, + { + "epoch": 0.07410930669747004, + "grad_norm": 0.6546684503555298, + "learning_rate": 0.0001950885035945146, + "loss": 2.7821, + "step": 818 + }, + { + "epoch": 0.07419990487191683, + "grad_norm": 0.648958146572113, + "learning_rate": 0.00019508246239352385, + "loss": 2.7588, + "step": 819 + }, + { + "epoch": 0.07429050304636362, + "grad_norm": 0.6478660702705383, + "learning_rate": 0.00019507642119253308, + "loss": 2.8053, + "step": 820 + }, + { + "epoch": 0.0743811012208104, + "grad_norm": 0.6444147229194641, + "learning_rate": 0.00019507037999154232, + "loss": 2.7716, + "step": 821 + }, + { + "epoch": 0.07447169939525719, + "grad_norm": 0.6493471264839172, + "learning_rate": 0.00019506433879055158, + "loss": 2.9193, + "step": 822 + }, + { + "epoch": 0.07456229756970396, + "grad_norm": 0.7104253768920898, + "learning_rate": 0.0001950582975895608, + "loss": 2.2053, + "step": 823 + }, + { + "epoch": 0.07465289574415075, + "grad_norm": 0.6116175651550293, + "learning_rate": 0.00019505225638857004, + "loss": 2.8794, + "step": 824 + }, + { + "epoch": 0.07474349391859754, + "grad_norm": 0.7245681881904602, + "learning_rate": 0.0001950462151875793, + "loss": 2.983, + "step": 825 + }, + { + "epoch": 0.07483409209304433, + "grad_norm": 0.6701933741569519, + "learning_rate": 0.00019504017398658854, + "loss": 3.0731, + "step": 826 + }, + { + "epoch": 0.07492469026749111, + "grad_norm": 0.6922124028205872, + "learning_rate": 0.0001950341327855978, + "loss": 2.8765, + "step": 827 + }, + { + "epoch": 0.07501528844193789, + "grad_norm": 0.5996986031532288, + "learning_rate": 0.000195028091584607, + "loss": 2.8509, + "step": 828 + }, + { + "epoch": 0.07510588661638468, + "grad_norm": 0.6038746237754822, + "learning_rate": 0.00019502205038361627, + "loss": 2.7066, + "step": 829 + }, + { + "epoch": 0.07519648479083146, + "grad_norm": 0.6635264158248901, + "learning_rate": 0.00019501600918262553, + "loss": 2.8149, + "step": 830 + }, + { + "epoch": 0.07528708296527825, + "grad_norm": 0.7117771506309509, + "learning_rate": 0.00019500996798163476, + "loss": 2.9188, + "step": 831 + }, + { + "epoch": 0.07537768113972504, + "grad_norm": 0.6944715976715088, + "learning_rate": 0.000195003926780644, + "loss": 2.9633, + "step": 832 + }, + { + "epoch": 0.07546827931417181, + "grad_norm": 0.6386058926582336, + "learning_rate": 0.00019499788557965323, + "loss": 2.9613, + "step": 833 + }, + { + "epoch": 0.0755588774886186, + "grad_norm": 0.6159153580665588, + "learning_rate": 0.0001949918443786625, + "loss": 2.9983, + "step": 834 + }, + { + "epoch": 0.07564947566306539, + "grad_norm": 0.6457597017288208, + "learning_rate": 0.00019498580317767173, + "loss": 2.9219, + "step": 835 + }, + { + "epoch": 0.07574007383751218, + "grad_norm": 0.6346065998077393, + "learning_rate": 0.00019497976197668096, + "loss": 2.899, + "step": 836 + }, + { + "epoch": 0.07583067201195896, + "grad_norm": 0.6082325577735901, + "learning_rate": 0.00019497372077569022, + "loss": 2.6587, + "step": 837 + }, + { + "epoch": 0.07592127018640574, + "grad_norm": 0.6426273584365845, + "learning_rate": 0.00019496767957469946, + "loss": 2.641, + "step": 838 + }, + { + "epoch": 0.07601186836085252, + "grad_norm": 0.653953492641449, + "learning_rate": 0.00019496163837370872, + "loss": 3.0128, + "step": 839 + }, + { + "epoch": 0.07610246653529931, + "grad_norm": 0.5368183255195618, + "learning_rate": 0.00019495559717271795, + "loss": 2.2941, + "step": 840 + }, + { + "epoch": 0.0761930647097461, + "grad_norm": 0.5406140685081482, + "learning_rate": 0.00019494955597172719, + "loss": 2.0578, + "step": 841 + }, + { + "epoch": 0.07628366288419289, + "grad_norm": 0.6983000636100769, + "learning_rate": 0.00019494351477073642, + "loss": 2.9592, + "step": 842 + }, + { + "epoch": 0.07637426105863968, + "grad_norm": 0.6715800166130066, + "learning_rate": 0.00019493747356974568, + "loss": 2.971, + "step": 843 + }, + { + "epoch": 0.07646485923308645, + "grad_norm": 0.6647594571113586, + "learning_rate": 0.00019493143236875492, + "loss": 2.9553, + "step": 844 + }, + { + "epoch": 0.07655545740753324, + "grad_norm": 0.6261590719223022, + "learning_rate": 0.00019492539116776415, + "loss": 2.7466, + "step": 845 + }, + { + "epoch": 0.07664605558198002, + "grad_norm": 0.5649932026863098, + "learning_rate": 0.0001949193499667734, + "loss": 2.4113, + "step": 846 + }, + { + "epoch": 0.07673665375642681, + "grad_norm": 0.5250148773193359, + "learning_rate": 0.00019491330876578264, + "loss": 2.2942, + "step": 847 + }, + { + "epoch": 0.0768272519308736, + "grad_norm": 0.6494271755218506, + "learning_rate": 0.0001949072675647919, + "loss": 2.8357, + "step": 848 + }, + { + "epoch": 0.07691785010532037, + "grad_norm": 0.6108782887458801, + "learning_rate": 0.0001949012263638011, + "loss": 2.9326, + "step": 849 + }, + { + "epoch": 0.07700844827976716, + "grad_norm": 0.6145034432411194, + "learning_rate": 0.00019489518516281037, + "loss": 2.1098, + "step": 850 + }, + { + "epoch": 0.07709904645421395, + "grad_norm": 0.5981634855270386, + "learning_rate": 0.0001948891439618196, + "loss": 2.8006, + "step": 851 + }, + { + "epoch": 0.07718964462866074, + "grad_norm": 0.6889778971672058, + "learning_rate": 0.00019488310276082887, + "loss": 3.0521, + "step": 852 + }, + { + "epoch": 0.07728024280310752, + "grad_norm": 0.6076495051383972, + "learning_rate": 0.0001948770615598381, + "loss": 2.8684, + "step": 853 + }, + { + "epoch": 0.0773708409775543, + "grad_norm": 0.6410755515098572, + "learning_rate": 0.00019487102035884734, + "loss": 2.6367, + "step": 854 + }, + { + "epoch": 0.07746143915200109, + "grad_norm": 0.6666664481163025, + "learning_rate": 0.0001948649791578566, + "loss": 3.1489, + "step": 855 + }, + { + "epoch": 0.07755203732644787, + "grad_norm": 0.5955137610435486, + "learning_rate": 0.00019485893795686583, + "loss": 2.7529, + "step": 856 + }, + { + "epoch": 0.07764263550089466, + "grad_norm": 0.6279938817024231, + "learning_rate": 0.0001948528967558751, + "loss": 2.7927, + "step": 857 + }, + { + "epoch": 0.07773323367534145, + "grad_norm": 0.6153497695922852, + "learning_rate": 0.0001948468555548843, + "loss": 2.8355, + "step": 858 + }, + { + "epoch": 0.07782383184978822, + "grad_norm": 0.7447534203529358, + "learning_rate": 0.00019484081435389356, + "loss": 3.2243, + "step": 859 + }, + { + "epoch": 0.07791443002423501, + "grad_norm": 0.6535675525665283, + "learning_rate": 0.00019483477315290282, + "loss": 2.9595, + "step": 860 + }, + { + "epoch": 0.0780050281986818, + "grad_norm": 0.6294531226158142, + "learning_rate": 0.00019482873195191206, + "loss": 2.8649, + "step": 861 + }, + { + "epoch": 0.07809562637312858, + "grad_norm": 0.6668590307235718, + "learning_rate": 0.0001948226907509213, + "loss": 2.4825, + "step": 862 + }, + { + "epoch": 0.07818622454757537, + "grad_norm": 0.6616989970207214, + "learning_rate": 0.00019481664954993053, + "loss": 2.7616, + "step": 863 + }, + { + "epoch": 0.07827682272202215, + "grad_norm": 0.6202505826950073, + "learning_rate": 0.00019481060834893979, + "loss": 2.9221, + "step": 864 + }, + { + "epoch": 0.07836742089646893, + "grad_norm": 0.6633656620979309, + "learning_rate": 0.00019480456714794902, + "loss": 2.9138, + "step": 865 + }, + { + "epoch": 0.07845801907091572, + "grad_norm": 0.6639587879180908, + "learning_rate": 0.00019479852594695825, + "loss": 3.0063, + "step": 866 + }, + { + "epoch": 0.07854861724536251, + "grad_norm": 0.6817421913146973, + "learning_rate": 0.00019479248474596752, + "loss": 3.0847, + "step": 867 + }, + { + "epoch": 0.0786392154198093, + "grad_norm": 0.6131521463394165, + "learning_rate": 0.00019478644354497675, + "loss": 2.4065, + "step": 868 + }, + { + "epoch": 0.07872981359425607, + "grad_norm": 0.6092349886894226, + "learning_rate": 0.000194780402343986, + "loss": 2.8531, + "step": 869 + }, + { + "epoch": 0.07882041176870286, + "grad_norm": 0.634029746055603, + "learning_rate": 0.00019477436114299524, + "loss": 2.7502, + "step": 870 + }, + { + "epoch": 0.07891100994314965, + "grad_norm": 0.6322289109230042, + "learning_rate": 0.00019476831994200448, + "loss": 2.8249, + "step": 871 + }, + { + "epoch": 0.07900160811759643, + "grad_norm": 0.6423193216323853, + "learning_rate": 0.0001947622787410137, + "loss": 2.9128, + "step": 872 + }, + { + "epoch": 0.07909220629204322, + "grad_norm": 0.6628682017326355, + "learning_rate": 0.00019475623754002297, + "loss": 3.0852, + "step": 873 + }, + { + "epoch": 0.07918280446649, + "grad_norm": 0.5928162336349487, + "learning_rate": 0.0001947501963390322, + "loss": 2.682, + "step": 874 + }, + { + "epoch": 0.07927340264093678, + "grad_norm": 0.6969744563102722, + "learning_rate": 0.00019474415513804144, + "loss": 2.8695, + "step": 875 + }, + { + "epoch": 0.07936400081538357, + "grad_norm": 0.6265190839767456, + "learning_rate": 0.0001947381139370507, + "loss": 2.8348, + "step": 876 + }, + { + "epoch": 0.07945459898983036, + "grad_norm": 0.6487648487091064, + "learning_rate": 0.00019473207273605994, + "loss": 2.9728, + "step": 877 + }, + { + "epoch": 0.07954519716427715, + "grad_norm": 0.6418008208274841, + "learning_rate": 0.0001947260315350692, + "loss": 2.8889, + "step": 878 + }, + { + "epoch": 0.07963579533872392, + "grad_norm": 0.6136557459831238, + "learning_rate": 0.0001947199903340784, + "loss": 2.8279, + "step": 879 + }, + { + "epoch": 0.0797263935131707, + "grad_norm": 0.7223480939865112, + "learning_rate": 0.00019471394913308767, + "loss": 2.8684, + "step": 880 + }, + { + "epoch": 0.0798169916876175, + "grad_norm": 0.6309278607368469, + "learning_rate": 0.0001947079079320969, + "loss": 2.8897, + "step": 881 + }, + { + "epoch": 0.07990758986206428, + "grad_norm": 0.6812548041343689, + "learning_rate": 0.00019470186673110616, + "loss": 3.0359, + "step": 882 + }, + { + "epoch": 0.07999818803651107, + "grad_norm": 0.6149899363517761, + "learning_rate": 0.0001946958255301154, + "loss": 2.9133, + "step": 883 + }, + { + "epoch": 0.08008878621095784, + "grad_norm": 0.6296762228012085, + "learning_rate": 0.00019468978432912463, + "loss": 2.8273, + "step": 884 + }, + { + "epoch": 0.08017938438540463, + "grad_norm": 0.6080557107925415, + "learning_rate": 0.0001946837431281339, + "loss": 2.7767, + "step": 885 + }, + { + "epoch": 0.08026998255985142, + "grad_norm": 0.6151219010353088, + "learning_rate": 0.00019467770192714313, + "loss": 2.7259, + "step": 886 + }, + { + "epoch": 0.0803605807342982, + "grad_norm": 0.6046298146247864, + "learning_rate": 0.00019467166072615236, + "loss": 2.3211, + "step": 887 + }, + { + "epoch": 0.080451178908745, + "grad_norm": 0.6153748631477356, + "learning_rate": 0.0001946656195251616, + "loss": 2.0439, + "step": 888 + }, + { + "epoch": 0.08054177708319177, + "grad_norm": 0.6208068132400513, + "learning_rate": 0.00019465957832417085, + "loss": 2.8538, + "step": 889 + }, + { + "epoch": 0.08063237525763856, + "grad_norm": 0.5950608849525452, + "learning_rate": 0.00019465353712318012, + "loss": 2.7607, + "step": 890 + }, + { + "epoch": 0.08072297343208534, + "grad_norm": 0.601008951663971, + "learning_rate": 0.00019464749592218935, + "loss": 2.8678, + "step": 891 + }, + { + "epoch": 0.08081357160653213, + "grad_norm": 0.6425362229347229, + "learning_rate": 0.00019464145472119858, + "loss": 2.9427, + "step": 892 + }, + { + "epoch": 0.08090416978097892, + "grad_norm": 0.631397545337677, + "learning_rate": 0.00019463541352020782, + "loss": 3.0106, + "step": 893 + }, + { + "epoch": 0.08099476795542569, + "grad_norm": 0.5734562873840332, + "learning_rate": 0.00019462937231921708, + "loss": 2.1941, + "step": 894 + }, + { + "epoch": 0.08108536612987248, + "grad_norm": 0.6504313945770264, + "learning_rate": 0.0001946233311182263, + "loss": 2.393, + "step": 895 + }, + { + "epoch": 0.08117596430431927, + "grad_norm": 0.6467355489730835, + "learning_rate": 0.00019461728991723555, + "loss": 2.8439, + "step": 896 + }, + { + "epoch": 0.08126656247876606, + "grad_norm": 0.6670868992805481, + "learning_rate": 0.0001946112487162448, + "loss": 3.0639, + "step": 897 + }, + { + "epoch": 0.08135716065321284, + "grad_norm": 0.731669545173645, + "learning_rate": 0.00019460520751525404, + "loss": 2.8613, + "step": 898 + }, + { + "epoch": 0.08144775882765962, + "grad_norm": 0.6952593922615051, + "learning_rate": 0.0001945991663142633, + "loss": 2.683, + "step": 899 + }, + { + "epoch": 0.0815383570021064, + "grad_norm": 0.6425861716270447, + "learning_rate": 0.0001945931251132725, + "loss": 3.0253, + "step": 900 + }, + { + "epoch": 0.08162895517655319, + "grad_norm": 0.5657290816307068, + "learning_rate": 0.00019458708391228177, + "loss": 2.249, + "step": 901 + }, + { + "epoch": 0.08171955335099998, + "grad_norm": 0.6693564653396606, + "learning_rate": 0.000194581042711291, + "loss": 2.881, + "step": 902 + }, + { + "epoch": 0.08181015152544677, + "grad_norm": 0.649796187877655, + "learning_rate": 0.00019457500151030027, + "loss": 2.7787, + "step": 903 + }, + { + "epoch": 0.08190074969989354, + "grad_norm": 0.6429502367973328, + "learning_rate": 0.0001945689603093095, + "loss": 3.1295, + "step": 904 + }, + { + "epoch": 0.08199134787434033, + "grad_norm": 0.6723189353942871, + "learning_rate": 0.00019456291910831873, + "loss": 2.888, + "step": 905 + }, + { + "epoch": 0.08208194604878712, + "grad_norm": 0.6303548216819763, + "learning_rate": 0.000194556877907328, + "loss": 2.861, + "step": 906 + }, + { + "epoch": 0.0821725442232339, + "grad_norm": 0.626791775226593, + "learning_rate": 0.00019455083670633723, + "loss": 2.9492, + "step": 907 + }, + { + "epoch": 0.08226314239768069, + "grad_norm": 0.6611778140068054, + "learning_rate": 0.0001945447955053465, + "loss": 2.6729, + "step": 908 + }, + { + "epoch": 0.08235374057212747, + "grad_norm": 0.6462215185165405, + "learning_rate": 0.0001945387543043557, + "loss": 2.716, + "step": 909 + }, + { + "epoch": 0.08244433874657425, + "grad_norm": 0.6164008378982544, + "learning_rate": 0.00019453271310336496, + "loss": 2.8743, + "step": 910 + }, + { + "epoch": 0.08253493692102104, + "grad_norm": 0.6456943154335022, + "learning_rate": 0.0001945266719023742, + "loss": 2.7734, + "step": 911 + }, + { + "epoch": 0.08262553509546783, + "grad_norm": 0.6108839511871338, + "learning_rate": 0.00019452063070138345, + "loss": 2.7806, + "step": 912 + }, + { + "epoch": 0.08271613326991462, + "grad_norm": 0.6300186514854431, + "learning_rate": 0.0001945145895003927, + "loss": 3.0125, + "step": 913 + }, + { + "epoch": 0.08280673144436139, + "grad_norm": 0.5451338291168213, + "learning_rate": 0.00019450854829940192, + "loss": 2.2878, + "step": 914 + }, + { + "epoch": 0.08289732961880818, + "grad_norm": 0.6872933506965637, + "learning_rate": 0.00019450250709841118, + "loss": 3.0194, + "step": 915 + }, + { + "epoch": 0.08298792779325496, + "grad_norm": 0.6850318312644958, + "learning_rate": 0.00019449646589742042, + "loss": 2.7241, + "step": 916 + }, + { + "epoch": 0.08307852596770175, + "grad_norm": 0.716901421546936, + "learning_rate": 0.00019449042469642965, + "loss": 2.9065, + "step": 917 + }, + { + "epoch": 0.08316912414214854, + "grad_norm": 0.6702343821525574, + "learning_rate": 0.00019448438349543889, + "loss": 2.7944, + "step": 918 + }, + { + "epoch": 0.08325972231659531, + "grad_norm": 0.6379563808441162, + "learning_rate": 0.00019447834229444815, + "loss": 2.6319, + "step": 919 + }, + { + "epoch": 0.0833503204910421, + "grad_norm": 0.6316542625427246, + "learning_rate": 0.0001944723010934574, + "loss": 2.8485, + "step": 920 + }, + { + "epoch": 0.08344091866548889, + "grad_norm": 0.6413071155548096, + "learning_rate": 0.00019446625989246664, + "loss": 2.947, + "step": 921 + }, + { + "epoch": 0.08353151683993568, + "grad_norm": 0.6376447677612305, + "learning_rate": 0.00019446021869147588, + "loss": 2.8053, + "step": 922 + }, + { + "epoch": 0.08362211501438246, + "grad_norm": 0.7321935892105103, + "learning_rate": 0.0001944541774904851, + "loss": 2.0965, + "step": 923 + }, + { + "epoch": 0.08371271318882925, + "grad_norm": 0.6051363348960876, + "learning_rate": 0.00019444813628949437, + "loss": 2.4209, + "step": 924 + }, + { + "epoch": 0.08380331136327603, + "grad_norm": 0.6909669041633606, + "learning_rate": 0.0001944420950885036, + "loss": 2.7513, + "step": 925 + }, + { + "epoch": 0.08389390953772281, + "grad_norm": 0.6581807732582092, + "learning_rate": 0.00019443605388751284, + "loss": 2.845, + "step": 926 + }, + { + "epoch": 0.0839845077121696, + "grad_norm": 0.6653894782066345, + "learning_rate": 0.0001944300126865221, + "loss": 2.8949, + "step": 927 + }, + { + "epoch": 0.08407510588661639, + "grad_norm": 0.5935025215148926, + "learning_rate": 0.00019442397148553133, + "loss": 2.2269, + "step": 928 + }, + { + "epoch": 0.08416570406106318, + "grad_norm": 0.6478018164634705, + "learning_rate": 0.0001944179302845406, + "loss": 2.8261, + "step": 929 + }, + { + "epoch": 0.08425630223550995, + "grad_norm": 0.6806797981262207, + "learning_rate": 0.0001944118890835498, + "loss": 2.2027, + "step": 930 + }, + { + "epoch": 0.08434690040995674, + "grad_norm": 0.5824924111366272, + "learning_rate": 0.00019440584788255906, + "loss": 1.421, + "step": 931 + }, + { + "epoch": 0.08443749858440353, + "grad_norm": 0.6405988335609436, + "learning_rate": 0.0001943998066815683, + "loss": 2.7155, + "step": 932 + }, + { + "epoch": 0.08452809675885031, + "grad_norm": 0.6868276596069336, + "learning_rate": 0.00019439376548057756, + "loss": 2.8112, + "step": 933 + }, + { + "epoch": 0.0846186949332971, + "grad_norm": 0.665158212184906, + "learning_rate": 0.0001943877242795868, + "loss": 2.9264, + "step": 934 + }, + { + "epoch": 0.08470929310774387, + "grad_norm": 0.664697527885437, + "learning_rate": 0.00019438168307859603, + "loss": 2.9462, + "step": 935 + }, + { + "epoch": 0.08479989128219066, + "grad_norm": 0.6608562469482422, + "learning_rate": 0.0001943756418776053, + "loss": 2.8791, + "step": 936 + }, + { + "epoch": 0.08489048945663745, + "grad_norm": 0.5580142140388489, + "learning_rate": 0.00019436960067661452, + "loss": 2.1887, + "step": 937 + }, + { + "epoch": 0.08498108763108424, + "grad_norm": 0.5480343103408813, + "learning_rate": 0.00019436355947562376, + "loss": 2.043, + "step": 938 + }, + { + "epoch": 0.08507168580553102, + "grad_norm": 0.6118952035903931, + "learning_rate": 0.000194357518274633, + "loss": 2.8273, + "step": 939 + }, + { + "epoch": 0.0851622839799778, + "grad_norm": 0.683514416217804, + "learning_rate": 0.00019435147707364225, + "loss": 2.8774, + "step": 940 + }, + { + "epoch": 0.08525288215442459, + "grad_norm": 0.536401093006134, + "learning_rate": 0.00019434543587265149, + "loss": 2.0191, + "step": 941 + }, + { + "epoch": 0.08534348032887137, + "grad_norm": 0.6515129804611206, + "learning_rate": 0.00019433939467166075, + "loss": 2.774, + "step": 942 + }, + { + "epoch": 0.08543407850331816, + "grad_norm": 0.6690383553504944, + "learning_rate": 0.00019433335347066998, + "loss": 2.9756, + "step": 943 + }, + { + "epoch": 0.08552467667776495, + "grad_norm": 0.4596555531024933, + "learning_rate": 0.00019432731226967922, + "loss": 1.4099, + "step": 944 + }, + { + "epoch": 0.08561527485221172, + "grad_norm": 0.6455658674240112, + "learning_rate": 0.00019432127106868848, + "loss": 3.0295, + "step": 945 + }, + { + "epoch": 0.08570587302665851, + "grad_norm": 0.6757920384407043, + "learning_rate": 0.0001943152298676977, + "loss": 2.7797, + "step": 946 + }, + { + "epoch": 0.0857964712011053, + "grad_norm": 0.6263949275016785, + "learning_rate": 0.00019430918866670694, + "loss": 3.0188, + "step": 947 + }, + { + "epoch": 0.08588706937555209, + "grad_norm": 0.6597122550010681, + "learning_rate": 0.00019430314746571618, + "loss": 2.8768, + "step": 948 + }, + { + "epoch": 0.08597766754999887, + "grad_norm": 0.6750842332839966, + "learning_rate": 0.00019429710626472544, + "loss": 2.6135, + "step": 949 + }, + { + "epoch": 0.08606826572444565, + "grad_norm": 0.7063095569610596, + "learning_rate": 0.0001942910650637347, + "loss": 2.5337, + "step": 950 + }, + { + "epoch": 0.08615886389889243, + "grad_norm": 0.6431129574775696, + "learning_rate": 0.0001942850238627439, + "loss": 2.6465, + "step": 951 + }, + { + "epoch": 0.08624946207333922, + "grad_norm": 0.6950246095657349, + "learning_rate": 0.00019427898266175317, + "loss": 3.0628, + "step": 952 + }, + { + "epoch": 0.08634006024778601, + "grad_norm": 0.6734197735786438, + "learning_rate": 0.0001942729414607624, + "loss": 3.0109, + "step": 953 + }, + { + "epoch": 0.0864306584222328, + "grad_norm": 0.6371310949325562, + "learning_rate": 0.00019426690025977166, + "loss": 2.9998, + "step": 954 + }, + { + "epoch": 0.08652125659667957, + "grad_norm": 0.8092522621154785, + "learning_rate": 0.0001942608590587809, + "loss": 2.8351, + "step": 955 + }, + { + "epoch": 0.08661185477112636, + "grad_norm": 0.6227730512619019, + "learning_rate": 0.00019425481785779013, + "loss": 2.8203, + "step": 956 + }, + { + "epoch": 0.08670245294557315, + "grad_norm": 0.6625285148620605, + "learning_rate": 0.0001942487766567994, + "loss": 3.0345, + "step": 957 + }, + { + "epoch": 0.08679305112001993, + "grad_norm": 0.6616975665092468, + "learning_rate": 0.00019424273545580863, + "loss": 3.2995, + "step": 958 + }, + { + "epoch": 0.08688364929446672, + "grad_norm": 0.7158017158508301, + "learning_rate": 0.00019423669425481786, + "loss": 2.8626, + "step": 959 + }, + { + "epoch": 0.0869742474689135, + "grad_norm": 0.6579412817955017, + "learning_rate": 0.0001942306530538271, + "loss": 2.663, + "step": 960 + }, + { + "epoch": 0.08706484564336028, + "grad_norm": 0.691875159740448, + "learning_rate": 0.00019422461185283636, + "loss": 3.1192, + "step": 961 + }, + { + "epoch": 0.08715544381780707, + "grad_norm": 0.6255929470062256, + "learning_rate": 0.0001942185706518456, + "loss": 2.8703, + "step": 962 + }, + { + "epoch": 0.08724604199225386, + "grad_norm": 0.6047539710998535, + "learning_rate": 0.00019421252945085485, + "loss": 2.5376, + "step": 963 + }, + { + "epoch": 0.08733664016670065, + "grad_norm": 0.616875946521759, + "learning_rate": 0.00019420648824986406, + "loss": 2.6698, + "step": 964 + }, + { + "epoch": 0.08742723834114742, + "grad_norm": 0.5192081332206726, + "learning_rate": 0.00019420044704887332, + "loss": 2.0974, + "step": 965 + }, + { + "epoch": 0.08751783651559421, + "grad_norm": 0.6251482367515564, + "learning_rate": 0.00019419440584788258, + "loss": 2.2858, + "step": 966 + }, + { + "epoch": 0.087608434690041, + "grad_norm": 0.6069703102111816, + "learning_rate": 0.00019418836464689182, + "loss": 2.9036, + "step": 967 + }, + { + "epoch": 0.08769903286448778, + "grad_norm": 0.6334385871887207, + "learning_rate": 0.00019418232344590105, + "loss": 1.9837, + "step": 968 + }, + { + "epoch": 0.08778963103893457, + "grad_norm": 0.5860599875450134, + "learning_rate": 0.00019417628224491028, + "loss": 2.6684, + "step": 969 + }, + { + "epoch": 0.08788022921338134, + "grad_norm": 0.6246519684791565, + "learning_rate": 0.00019417024104391954, + "loss": 2.9327, + "step": 970 + }, + { + "epoch": 0.08797082738782813, + "grad_norm": 0.6789910793304443, + "learning_rate": 0.00019416419984292878, + "loss": 2.9954, + "step": 971 + }, + { + "epoch": 0.08806142556227492, + "grad_norm": 0.6745011210441589, + "learning_rate": 0.000194158158641938, + "loss": 3.0196, + "step": 972 + }, + { + "epoch": 0.08815202373672171, + "grad_norm": 0.6641865372657776, + "learning_rate": 0.00019415211744094727, + "loss": 3.2527, + "step": 973 + }, + { + "epoch": 0.0882426219111685, + "grad_norm": 0.6615995764732361, + "learning_rate": 0.0001941460762399565, + "loss": 2.7851, + "step": 974 + }, + { + "epoch": 0.08833322008561527, + "grad_norm": 0.6779003739356995, + "learning_rate": 0.00019414003503896577, + "loss": 3.1247, + "step": 975 + }, + { + "epoch": 0.08842381826006206, + "grad_norm": 0.6661621928215027, + "learning_rate": 0.000194133993837975, + "loss": 2.8811, + "step": 976 + }, + { + "epoch": 0.08851441643450884, + "grad_norm": 0.6650860905647278, + "learning_rate": 0.00019412795263698424, + "loss": 2.9046, + "step": 977 + }, + { + "epoch": 0.08860501460895563, + "grad_norm": 0.7035446166992188, + "learning_rate": 0.00019412191143599347, + "loss": 2.9679, + "step": 978 + }, + { + "epoch": 0.08869561278340242, + "grad_norm": 0.6680140495300293, + "learning_rate": 0.00019411587023500273, + "loss": 2.6798, + "step": 979 + }, + { + "epoch": 0.0887862109578492, + "grad_norm": 0.6275416016578674, + "learning_rate": 0.000194109829034012, + "loss": 2.8514, + "step": 980 + }, + { + "epoch": 0.08887680913229598, + "grad_norm": 0.6481673121452332, + "learning_rate": 0.0001941037878330212, + "loss": 2.8365, + "step": 981 + }, + { + "epoch": 0.08896740730674277, + "grad_norm": 0.6227960586547852, + "learning_rate": 0.00019409774663203046, + "loss": 3.0352, + "step": 982 + }, + { + "epoch": 0.08905800548118956, + "grad_norm": 0.6359560489654541, + "learning_rate": 0.0001940917054310397, + "loss": 2.9314, + "step": 983 + }, + { + "epoch": 0.08914860365563634, + "grad_norm": 0.6531352996826172, + "learning_rate": 0.00019408566423004896, + "loss": 2.9632, + "step": 984 + }, + { + "epoch": 0.08923920183008312, + "grad_norm": 0.6356728076934814, + "learning_rate": 0.0001940796230290582, + "loss": 2.8902, + "step": 985 + }, + { + "epoch": 0.0893298000045299, + "grad_norm": 0.6331402659416199, + "learning_rate": 0.00019407358182806742, + "loss": 3.0978, + "step": 986 + }, + { + "epoch": 0.08942039817897669, + "grad_norm": 0.6221935153007507, + "learning_rate": 0.00019406754062707669, + "loss": 2.8484, + "step": 987 + }, + { + "epoch": 0.08951099635342348, + "grad_norm": 0.6263952851295471, + "learning_rate": 0.00019406149942608592, + "loss": 2.3598, + "step": 988 + }, + { + "epoch": 0.08960159452787027, + "grad_norm": 0.6230846643447876, + "learning_rate": 0.00019405545822509515, + "loss": 2.9201, + "step": 989 + }, + { + "epoch": 0.08969219270231704, + "grad_norm": 0.6414622068405151, + "learning_rate": 0.0001940494170241044, + "loss": 3.0974, + "step": 990 + }, + { + "epoch": 0.08978279087676383, + "grad_norm": 0.6584955453872681, + "learning_rate": 0.00019404337582311365, + "loss": 2.9739, + "step": 991 + }, + { + "epoch": 0.08987338905121062, + "grad_norm": 0.6124423742294312, + "learning_rate": 0.00019403733462212288, + "loss": 2.7093, + "step": 992 + }, + { + "epoch": 0.0899639872256574, + "grad_norm": 0.6348642110824585, + "learning_rate": 0.00019403129342113214, + "loss": 2.7533, + "step": 993 + }, + { + "epoch": 0.09005458540010419, + "grad_norm": 0.6821113228797913, + "learning_rate": 0.00019402525222014135, + "loss": 2.8628, + "step": 994 + }, + { + "epoch": 0.09014518357455097, + "grad_norm": 0.613321840763092, + "learning_rate": 0.0001940192110191506, + "loss": 2.5798, + "step": 995 + }, + { + "epoch": 0.09023578174899775, + "grad_norm": 0.6431601643562317, + "learning_rate": 0.00019401316981815987, + "loss": 2.7329, + "step": 996 + }, + { + "epoch": 0.09032637992344454, + "grad_norm": 0.6535138487815857, + "learning_rate": 0.0001940071286171691, + "loss": 3.0137, + "step": 997 + }, + { + "epoch": 0.09041697809789133, + "grad_norm": 0.6823118329048157, + "learning_rate": 0.00019400108741617834, + "loss": 2.9179, + "step": 998 + }, + { + "epoch": 0.09050757627233812, + "grad_norm": 0.6387560367584229, + "learning_rate": 0.00019399504621518758, + "loss": 2.7396, + "step": 999 + }, + { + "epoch": 0.09059817444678489, + "grad_norm": 0.6159787178039551, + "learning_rate": 0.00019398900501419684, + "loss": 2.1827, + "step": 1000 + }, + { + "epoch": 0.09068877262123168, + "grad_norm": 0.8123766779899597, + "learning_rate": 0.00019398296381320607, + "loss": 3.0052, + "step": 1001 + }, + { + "epoch": 0.09077937079567847, + "grad_norm": 0.6354693174362183, + "learning_rate": 0.0001939769226122153, + "loss": 2.9035, + "step": 1002 + }, + { + "epoch": 0.09086996897012525, + "grad_norm": 0.6959117650985718, + "learning_rate": 0.00019397088141122457, + "loss": 3.0743, + "step": 1003 + }, + { + "epoch": 0.09096056714457204, + "grad_norm": 0.6462547779083252, + "learning_rate": 0.0001939648402102338, + "loss": 2.5766, + "step": 1004 + }, + { + "epoch": 0.09105116531901883, + "grad_norm": 0.6976358294487, + "learning_rate": 0.00019395879900924306, + "loss": 3.083, + "step": 1005 + }, + { + "epoch": 0.0911417634934656, + "grad_norm": 0.6979856491088867, + "learning_rate": 0.0001939527578082523, + "loss": 2.9856, + "step": 1006 + }, + { + "epoch": 0.09123236166791239, + "grad_norm": 0.628957211971283, + "learning_rate": 0.00019394671660726153, + "loss": 2.7694, + "step": 1007 + }, + { + "epoch": 0.09132295984235918, + "grad_norm": 0.6178279519081116, + "learning_rate": 0.00019394067540627076, + "loss": 2.0783, + "step": 1008 + }, + { + "epoch": 0.09141355801680597, + "grad_norm": 0.6566629409790039, + "learning_rate": 0.00019393463420528003, + "loss": 2.9662, + "step": 1009 + }, + { + "epoch": 0.09150415619125275, + "grad_norm": 0.652391791343689, + "learning_rate": 0.00019392859300428926, + "loss": 2.8516, + "step": 1010 + }, + { + "epoch": 0.09159475436569953, + "grad_norm": 0.6774464845657349, + "learning_rate": 0.0001939225518032985, + "loss": 3.0981, + "step": 1011 + }, + { + "epoch": 0.09168535254014631, + "grad_norm": 0.611379086971283, + "learning_rate": 0.00019391651060230775, + "loss": 2.8705, + "step": 1012 + }, + { + "epoch": 0.0917759507145931, + "grad_norm": 0.579620897769928, + "learning_rate": 0.000193910469401317, + "loss": 2.1506, + "step": 1013 + }, + { + "epoch": 0.09186654888903989, + "grad_norm": 0.5508917570114136, + "learning_rate": 0.00019390442820032625, + "loss": 1.4923, + "step": 1014 + }, + { + "epoch": 0.09195714706348668, + "grad_norm": 0.6607322692871094, + "learning_rate": 0.00019389838699933546, + "loss": 2.7466, + "step": 1015 + }, + { + "epoch": 0.09204774523793345, + "grad_norm": 0.6609116196632385, + "learning_rate": 0.00019389234579834472, + "loss": 3.0165, + "step": 1016 + }, + { + "epoch": 0.09213834341238024, + "grad_norm": 0.6853147745132446, + "learning_rate": 0.00019388630459735398, + "loss": 2.9354, + "step": 1017 + }, + { + "epoch": 0.09222894158682703, + "grad_norm": 0.6575244665145874, + "learning_rate": 0.0001938802633963632, + "loss": 2.8641, + "step": 1018 + }, + { + "epoch": 0.09231953976127381, + "grad_norm": 0.6607403755187988, + "learning_rate": 0.00019387422219537245, + "loss": 2.9105, + "step": 1019 + }, + { + "epoch": 0.0924101379357206, + "grad_norm": 0.6298147439956665, + "learning_rate": 0.00019386818099438168, + "loss": 2.8573, + "step": 1020 + }, + { + "epoch": 0.09250073611016738, + "grad_norm": 0.6288976073265076, + "learning_rate": 0.00019386213979339094, + "loss": 2.6955, + "step": 1021 + }, + { + "epoch": 0.09259133428461416, + "grad_norm": 0.6665705442428589, + "learning_rate": 0.00019385609859240018, + "loss": 2.8773, + "step": 1022 + }, + { + "epoch": 0.09268193245906095, + "grad_norm": 0.6976231932640076, + "learning_rate": 0.0001938500573914094, + "loss": 2.8963, + "step": 1023 + }, + { + "epoch": 0.09277253063350774, + "grad_norm": 0.795809805393219, + "learning_rate": 0.00019384401619041864, + "loss": 3.0196, + "step": 1024 + }, + { + "epoch": 0.09286312880795453, + "grad_norm": 0.6672821044921875, + "learning_rate": 0.0001938379749894279, + "loss": 2.9775, + "step": 1025 + }, + { + "epoch": 0.0929537269824013, + "grad_norm": 0.6160702109336853, + "learning_rate": 0.00019383193378843717, + "loss": 2.22, + "step": 1026 + }, + { + "epoch": 0.09304432515684809, + "grad_norm": 0.618100106716156, + "learning_rate": 0.0001938258925874464, + "loss": 2.7883, + "step": 1027 + }, + { + "epoch": 0.09313492333129487, + "grad_norm": 0.6601377129554749, + "learning_rate": 0.00019381985138645563, + "loss": 2.9631, + "step": 1028 + }, + { + "epoch": 0.09322552150574166, + "grad_norm": 0.6824899315834045, + "learning_rate": 0.00019381381018546487, + "loss": 3.167, + "step": 1029 + }, + { + "epoch": 0.09331611968018845, + "grad_norm": 0.6398523449897766, + "learning_rate": 0.00019380776898447413, + "loss": 2.9526, + "step": 1030 + }, + { + "epoch": 0.09340671785463522, + "grad_norm": 0.6455298066139221, + "learning_rate": 0.00019380172778348336, + "loss": 2.9332, + "step": 1031 + }, + { + "epoch": 0.09349731602908201, + "grad_norm": 0.6023834347724915, + "learning_rate": 0.0001937956865824926, + "loss": 2.1567, + "step": 1032 + }, + { + "epoch": 0.0935879142035288, + "grad_norm": 0.6273362636566162, + "learning_rate": 0.00019378964538150186, + "loss": 2.9557, + "step": 1033 + }, + { + "epoch": 0.09367851237797559, + "grad_norm": 0.6166684031486511, + "learning_rate": 0.0001937836041805111, + "loss": 2.9354, + "step": 1034 + }, + { + "epoch": 0.09376911055242237, + "grad_norm": 0.6350855827331543, + "learning_rate": 0.00019377756297952035, + "loss": 2.7304, + "step": 1035 + }, + { + "epoch": 0.09385970872686915, + "grad_norm": 0.6797176599502563, + "learning_rate": 0.00019377152177852956, + "loss": 2.6702, + "step": 1036 + }, + { + "epoch": 0.09395030690131594, + "grad_norm": 0.6061334013938904, + "learning_rate": 0.00019376548057753882, + "loss": 2.8936, + "step": 1037 + }, + { + "epoch": 0.09404090507576272, + "grad_norm": 0.626879870891571, + "learning_rate": 0.00019375943937654806, + "loss": 2.8826, + "step": 1038 + }, + { + "epoch": 0.09413150325020951, + "grad_norm": 0.6657402515411377, + "learning_rate": 0.00019375339817555732, + "loss": 3.0584, + "step": 1039 + }, + { + "epoch": 0.0942221014246563, + "grad_norm": 0.5728873014450073, + "learning_rate": 0.00019374735697456655, + "loss": 2.0622, + "step": 1040 + }, + { + "epoch": 0.09431269959910307, + "grad_norm": 0.6686079502105713, + "learning_rate": 0.00019374131577357579, + "loss": 2.8385, + "step": 1041 + }, + { + "epoch": 0.09440329777354986, + "grad_norm": 0.6601256728172302, + "learning_rate": 0.00019373527457258505, + "loss": 2.8136, + "step": 1042 + }, + { + "epoch": 0.09449389594799665, + "grad_norm": 0.6673895120620728, + "learning_rate": 0.00019372923337159428, + "loss": 2.9973, + "step": 1043 + }, + { + "epoch": 0.09458449412244344, + "grad_norm": 0.9200048446655273, + "learning_rate": 0.00019372319217060354, + "loss": 2.9174, + "step": 1044 + }, + { + "epoch": 0.09467509229689022, + "grad_norm": 0.5371161699295044, + "learning_rate": 0.00019371715096961275, + "loss": 1.9769, + "step": 1045 + }, + { + "epoch": 0.094765690471337, + "grad_norm": 0.6585938334465027, + "learning_rate": 0.000193711109768622, + "loss": 2.7804, + "step": 1046 + }, + { + "epoch": 0.09485628864578378, + "grad_norm": 0.6452536582946777, + "learning_rate": 0.00019370506856763127, + "loss": 3.3557, + "step": 1047 + }, + { + "epoch": 0.09494688682023057, + "grad_norm": 0.5390822291374207, + "learning_rate": 0.0001936990273666405, + "loss": 2.2256, + "step": 1048 + }, + { + "epoch": 0.09503748499467736, + "grad_norm": 0.7551419734954834, + "learning_rate": 0.00019369298616564974, + "loss": 2.6977, + "step": 1049 + }, + { + "epoch": 0.09512808316912415, + "grad_norm": 0.6292244791984558, + "learning_rate": 0.00019368694496465897, + "loss": 2.7157, + "step": 1050 + }, + { + "epoch": 0.09521868134357092, + "grad_norm": 0.657782256603241, + "learning_rate": 0.00019368090376366823, + "loss": 2.7039, + "step": 1051 + }, + { + "epoch": 0.09530927951801771, + "grad_norm": 0.6161039471626282, + "learning_rate": 0.00019367486256267747, + "loss": 2.7043, + "step": 1052 + }, + { + "epoch": 0.0953998776924645, + "grad_norm": 0.6576135158538818, + "learning_rate": 0.0001936688213616867, + "loss": 2.7274, + "step": 1053 + }, + { + "epoch": 0.09549047586691128, + "grad_norm": 0.7575075030326843, + "learning_rate": 0.00019366278016069594, + "loss": 3.0775, + "step": 1054 + }, + { + "epoch": 0.09558107404135807, + "grad_norm": 0.6737843751907349, + "learning_rate": 0.0001936567389597052, + "loss": 3.0228, + "step": 1055 + }, + { + "epoch": 0.09567167221580485, + "grad_norm": 0.805599570274353, + "learning_rate": 0.00019365069775871446, + "loss": 2.7853, + "step": 1056 + }, + { + "epoch": 0.09576227039025163, + "grad_norm": 0.650851845741272, + "learning_rate": 0.0001936446565577237, + "loss": 2.8344, + "step": 1057 + }, + { + "epoch": 0.09585286856469842, + "grad_norm": 0.6584105491638184, + "learning_rate": 0.00019363861535673293, + "loss": 2.7641, + "step": 1058 + }, + { + "epoch": 0.09594346673914521, + "grad_norm": 0.5790988802909851, + "learning_rate": 0.00019363257415574216, + "loss": 2.1259, + "step": 1059 + }, + { + "epoch": 0.096034064913592, + "grad_norm": 0.6929723620414734, + "learning_rate": 0.00019362653295475142, + "loss": 2.8336, + "step": 1060 + }, + { + "epoch": 0.09612466308803877, + "grad_norm": 0.5846459865570068, + "learning_rate": 0.00019362049175376066, + "loss": 2.276, + "step": 1061 + }, + { + "epoch": 0.09621526126248556, + "grad_norm": 0.5711168050765991, + "learning_rate": 0.0001936144505527699, + "loss": 2.1044, + "step": 1062 + }, + { + "epoch": 0.09630585943693235, + "grad_norm": 0.6814278364181519, + "learning_rate": 0.00019360840935177915, + "loss": 3.0702, + "step": 1063 + }, + { + "epoch": 0.09639645761137913, + "grad_norm": 0.6972371339797974, + "learning_rate": 0.00019360236815078839, + "loss": 2.9241, + "step": 1064 + }, + { + "epoch": 0.09648705578582592, + "grad_norm": 0.6976823806762695, + "learning_rate": 0.00019359632694979765, + "loss": 2.9096, + "step": 1065 + }, + { + "epoch": 0.0965776539602727, + "grad_norm": 0.6736823916435242, + "learning_rate": 0.00019359028574880685, + "loss": 2.7117, + "step": 1066 + }, + { + "epoch": 0.09666825213471948, + "grad_norm": 0.7167977690696716, + "learning_rate": 0.00019358424454781612, + "loss": 2.9445, + "step": 1067 + }, + { + "epoch": 0.09675885030916627, + "grad_norm": 0.610100269317627, + "learning_rate": 0.00019357820334682535, + "loss": 2.7139, + "step": 1068 + }, + { + "epoch": 0.09684944848361306, + "grad_norm": 0.7985059022903442, + "learning_rate": 0.0001935721621458346, + "loss": 3.0246, + "step": 1069 + }, + { + "epoch": 0.09694004665805984, + "grad_norm": 0.7057963609695435, + "learning_rate": 0.00019356612094484384, + "loss": 3.0929, + "step": 1070 + }, + { + "epoch": 0.09703064483250662, + "grad_norm": 0.6933151483535767, + "learning_rate": 0.00019356007974385308, + "loss": 2.8381, + "step": 1071 + }, + { + "epoch": 0.0971212430069534, + "grad_norm": 0.6482039093971252, + "learning_rate": 0.00019355403854286234, + "loss": 2.9013, + "step": 1072 + }, + { + "epoch": 0.0972118411814002, + "grad_norm": 0.6305868029594421, + "learning_rate": 0.00019354799734187157, + "loss": 2.5969, + "step": 1073 + }, + { + "epoch": 0.09730243935584698, + "grad_norm": 0.6367174386978149, + "learning_rate": 0.0001935419561408808, + "loss": 2.8568, + "step": 1074 + }, + { + "epoch": 0.09739303753029377, + "grad_norm": 0.6487560868263245, + "learning_rate": 0.00019353591493989004, + "loss": 2.8434, + "step": 1075 + }, + { + "epoch": 0.09748363570474054, + "grad_norm": 0.6874869465827942, + "learning_rate": 0.0001935298737388993, + "loss": 2.8956, + "step": 1076 + }, + { + "epoch": 0.09757423387918733, + "grad_norm": 0.6953914761543274, + "learning_rate": 0.00019352383253790856, + "loss": 3.2051, + "step": 1077 + }, + { + "epoch": 0.09766483205363412, + "grad_norm": 0.6648039817810059, + "learning_rate": 0.0001935177913369178, + "loss": 2.6079, + "step": 1078 + }, + { + "epoch": 0.0977554302280809, + "grad_norm": 0.7543134093284607, + "learning_rate": 0.00019351175013592703, + "loss": 3.0518, + "step": 1079 + }, + { + "epoch": 0.0978460284025277, + "grad_norm": 0.628548800945282, + "learning_rate": 0.00019350570893493627, + "loss": 3.0459, + "step": 1080 + }, + { + "epoch": 0.09793662657697447, + "grad_norm": 0.6690722107887268, + "learning_rate": 0.00019349966773394553, + "loss": 2.9241, + "step": 1081 + }, + { + "epoch": 0.09802722475142125, + "grad_norm": 0.628681480884552, + "learning_rate": 0.00019349362653295476, + "loss": 2.8738, + "step": 1082 + }, + { + "epoch": 0.09811782292586804, + "grad_norm": 0.6599707007408142, + "learning_rate": 0.000193487585331964, + "loss": 2.6515, + "step": 1083 + }, + { + "epoch": 0.09820842110031483, + "grad_norm": 0.6634184718132019, + "learning_rate": 0.00019348154413097323, + "loss": 3.1223, + "step": 1084 + }, + { + "epoch": 0.09829901927476162, + "grad_norm": 0.6628363132476807, + "learning_rate": 0.0001934755029299825, + "loss": 2.7989, + "step": 1085 + }, + { + "epoch": 0.0983896174492084, + "grad_norm": 0.6707492470741272, + "learning_rate": 0.00019346946172899175, + "loss": 2.9184, + "step": 1086 + }, + { + "epoch": 0.09848021562365518, + "grad_norm": 0.6396737098693848, + "learning_rate": 0.00019346342052800096, + "loss": 2.7521, + "step": 1087 + }, + { + "epoch": 0.09857081379810197, + "grad_norm": 0.5988925695419312, + "learning_rate": 0.00019345737932701022, + "loss": 2.5626, + "step": 1088 + }, + { + "epoch": 0.09866141197254875, + "grad_norm": 0.6248605251312256, + "learning_rate": 0.00019345133812601945, + "loss": 2.594, + "step": 1089 + }, + { + "epoch": 0.09875201014699554, + "grad_norm": 0.6734086871147156, + "learning_rate": 0.00019344529692502872, + "loss": 2.8771, + "step": 1090 + }, + { + "epoch": 0.09884260832144233, + "grad_norm": 0.6773838996887207, + "learning_rate": 0.00019343925572403795, + "loss": 2.9847, + "step": 1091 + }, + { + "epoch": 0.0989332064958891, + "grad_norm": 0.5756858587265015, + "learning_rate": 0.00019343321452304718, + "loss": 2.1766, + "step": 1092 + }, + { + "epoch": 0.09902380467033589, + "grad_norm": 0.6838502287864685, + "learning_rate": 0.00019342717332205644, + "loss": 2.9194, + "step": 1093 + }, + { + "epoch": 0.09911440284478268, + "grad_norm": 1.4917737245559692, + "learning_rate": 0.00019342113212106568, + "loss": 3.0627, + "step": 1094 + }, + { + "epoch": 0.09920500101922947, + "grad_norm": 0.6581339240074158, + "learning_rate": 0.00019341509092007494, + "loss": 2.837, + "step": 1095 + }, + { + "epoch": 0.09929559919367625, + "grad_norm": 0.582639217376709, + "learning_rate": 0.00019340904971908415, + "loss": 2.6435, + "step": 1096 + }, + { + "epoch": 0.09938619736812303, + "grad_norm": 0.6781558394432068, + "learning_rate": 0.0001934030085180934, + "loss": 2.8709, + "step": 1097 + }, + { + "epoch": 0.09947679554256982, + "grad_norm": 0.586860179901123, + "learning_rate": 0.00019339696731710264, + "loss": 2.2823, + "step": 1098 + }, + { + "epoch": 0.0995673937170166, + "grad_norm": 0.6563278436660767, + "learning_rate": 0.0001933909261161119, + "loss": 3.1009, + "step": 1099 + }, + { + "epoch": 0.09965799189146339, + "grad_norm": 0.6121872067451477, + "learning_rate": 0.00019338488491512114, + "loss": 2.8837, + "step": 1100 + }, + { + "epoch": 0.09974859006591018, + "grad_norm": 0.6290223002433777, + "learning_rate": 0.00019337884371413037, + "loss": 2.8786, + "step": 1101 + }, + { + "epoch": 0.09983918824035695, + "grad_norm": 0.6976778507232666, + "learning_rate": 0.00019337280251313963, + "loss": 2.9828, + "step": 1102 + }, + { + "epoch": 0.09992978641480374, + "grad_norm": 0.6702451705932617, + "learning_rate": 0.00019336676131214887, + "loss": 2.8944, + "step": 1103 + }, + { + "epoch": 0.10002038458925053, + "grad_norm": 0.7323374152183533, + "learning_rate": 0.0001933607201111581, + "loss": 2.8054, + "step": 1104 + }, + { + "epoch": 0.10011098276369731, + "grad_norm": 0.6674189567565918, + "learning_rate": 0.00019335467891016733, + "loss": 2.7609, + "step": 1105 + }, + { + "epoch": 0.1002015809381441, + "grad_norm": 0.6149994134902954, + "learning_rate": 0.0001933486377091766, + "loss": 2.867, + "step": 1106 + }, + { + "epoch": 0.10029217911259088, + "grad_norm": 0.6569970846176147, + "learning_rate": 0.00019334259650818586, + "loss": 2.6544, + "step": 1107 + }, + { + "epoch": 0.10038277728703766, + "grad_norm": 0.6552954316139221, + "learning_rate": 0.0001933365553071951, + "loss": 3.0672, + "step": 1108 + }, + { + "epoch": 0.10047337546148445, + "grad_norm": 0.6588140726089478, + "learning_rate": 0.00019333051410620432, + "loss": 2.9292, + "step": 1109 + }, + { + "epoch": 0.10056397363593124, + "grad_norm": 0.6452319025993347, + "learning_rate": 0.00019332447290521356, + "loss": 2.7738, + "step": 1110 + }, + { + "epoch": 0.10065457181037803, + "grad_norm": 0.61896812915802, + "learning_rate": 0.00019331843170422282, + "loss": 2.9531, + "step": 1111 + }, + { + "epoch": 0.1007451699848248, + "grad_norm": 0.6338852047920227, + "learning_rate": 0.00019331239050323205, + "loss": 1.8735, + "step": 1112 + }, + { + "epoch": 0.10083576815927159, + "grad_norm": 0.6890206933021545, + "learning_rate": 0.0001933063493022413, + "loss": 3.0582, + "step": 1113 + }, + { + "epoch": 0.10092636633371838, + "grad_norm": 0.6522872447967529, + "learning_rate": 0.00019330030810125052, + "loss": 2.8564, + "step": 1114 + }, + { + "epoch": 0.10101696450816516, + "grad_norm": 0.49074551463127136, + "learning_rate": 0.00019329426690025978, + "loss": 1.3897, + "step": 1115 + }, + { + "epoch": 0.10110756268261195, + "grad_norm": 0.6579478979110718, + "learning_rate": 0.00019328822569926904, + "loss": 2.7955, + "step": 1116 + }, + { + "epoch": 0.10119816085705872, + "grad_norm": 0.6918851733207703, + "learning_rate": 0.00019328218449827825, + "loss": 3.0311, + "step": 1117 + }, + { + "epoch": 0.10128875903150551, + "grad_norm": 0.6806893944740295, + "learning_rate": 0.0001932761432972875, + "loss": 3.02, + "step": 1118 + }, + { + "epoch": 0.1013793572059523, + "grad_norm": 0.648586630821228, + "learning_rate": 0.00019327010209629675, + "loss": 2.977, + "step": 1119 + }, + { + "epoch": 0.10146995538039909, + "grad_norm": 0.669166624546051, + "learning_rate": 0.000193264060895306, + "loss": 3.1156, + "step": 1120 + }, + { + "epoch": 0.10156055355484588, + "grad_norm": 0.6239954829216003, + "learning_rate": 0.00019325801969431524, + "loss": 2.8019, + "step": 1121 + }, + { + "epoch": 0.10165115172929265, + "grad_norm": 0.683656632900238, + "learning_rate": 0.00019325197849332448, + "loss": 2.6742, + "step": 1122 + }, + { + "epoch": 0.10174174990373944, + "grad_norm": 0.6935083866119385, + "learning_rate": 0.00019324593729233374, + "loss": 2.8021, + "step": 1123 + }, + { + "epoch": 0.10183234807818622, + "grad_norm": 0.6539504528045654, + "learning_rate": 0.00019323989609134297, + "loss": 2.8947, + "step": 1124 + }, + { + "epoch": 0.10192294625263301, + "grad_norm": 0.6321094632148743, + "learning_rate": 0.0001932338548903522, + "loss": 2.8511, + "step": 1125 + }, + { + "epoch": 0.1020135444270798, + "grad_norm": 0.63988196849823, + "learning_rate": 0.00019322781368936144, + "loss": 2.6973, + "step": 1126 + }, + { + "epoch": 0.10210414260152657, + "grad_norm": 0.6652947664260864, + "learning_rate": 0.0001932217724883707, + "loss": 3.0316, + "step": 1127 + }, + { + "epoch": 0.10219474077597336, + "grad_norm": 0.7177282571792603, + "learning_rate": 0.00019321573128737993, + "loss": 2.8175, + "step": 1128 + }, + { + "epoch": 0.10228533895042015, + "grad_norm": 0.7015251517295837, + "learning_rate": 0.0001932096900863892, + "loss": 3.0735, + "step": 1129 + }, + { + "epoch": 0.10237593712486694, + "grad_norm": 0.6205381155014038, + "learning_rate": 0.00019320364888539843, + "loss": 2.8767, + "step": 1130 + }, + { + "epoch": 0.10246653529931372, + "grad_norm": 0.672719419002533, + "learning_rate": 0.00019319760768440766, + "loss": 2.9183, + "step": 1131 + }, + { + "epoch": 0.1025571334737605, + "grad_norm": 0.6674543619155884, + "learning_rate": 0.00019319156648341692, + "loss": 2.8547, + "step": 1132 + }, + { + "epoch": 0.10264773164820729, + "grad_norm": 0.684343695640564, + "learning_rate": 0.00019318552528242616, + "loss": 3.0385, + "step": 1133 + }, + { + "epoch": 0.10273832982265407, + "grad_norm": 0.6519675850868225, + "learning_rate": 0.0001931794840814354, + "loss": 2.8219, + "step": 1134 + }, + { + "epoch": 0.10282892799710086, + "grad_norm": 0.6523305773735046, + "learning_rate": 0.00019317344288044463, + "loss": 2.8567, + "step": 1135 + }, + { + "epoch": 0.10291952617154765, + "grad_norm": 0.6247394680976868, + "learning_rate": 0.0001931674016794539, + "loss": 2.7621, + "step": 1136 + }, + { + "epoch": 0.10301012434599442, + "grad_norm": 0.6607104539871216, + "learning_rate": 0.00019316136047846315, + "loss": 2.9103, + "step": 1137 + }, + { + "epoch": 0.10310072252044121, + "grad_norm": 0.6504005789756775, + "learning_rate": 0.00019315531927747236, + "loss": 2.8385, + "step": 1138 + }, + { + "epoch": 0.103191320694888, + "grad_norm": 0.6496227979660034, + "learning_rate": 0.00019314927807648162, + "loss": 2.7326, + "step": 1139 + }, + { + "epoch": 0.10328191886933479, + "grad_norm": 0.694034218788147, + "learning_rate": 0.00019314323687549085, + "loss": 3.1581, + "step": 1140 + }, + { + "epoch": 0.10337251704378157, + "grad_norm": 0.6419540643692017, + "learning_rate": 0.0001931371956745001, + "loss": 2.8315, + "step": 1141 + }, + { + "epoch": 0.10346311521822835, + "grad_norm": 0.6527175307273865, + "learning_rate": 0.00019313115447350935, + "loss": 2.9529, + "step": 1142 + }, + { + "epoch": 0.10355371339267513, + "grad_norm": 0.6474694013595581, + "learning_rate": 0.00019312511327251858, + "loss": 2.7286, + "step": 1143 + }, + { + "epoch": 0.10364431156712192, + "grad_norm": 0.7440828084945679, + "learning_rate": 0.00019311907207152781, + "loss": 2.9058, + "step": 1144 + }, + { + "epoch": 0.10373490974156871, + "grad_norm": 0.6414902806282043, + "learning_rate": 0.00019311303087053708, + "loss": 2.9381, + "step": 1145 + }, + { + "epoch": 0.1038255079160155, + "grad_norm": 0.6789979338645935, + "learning_rate": 0.0001931069896695463, + "loss": 2.8668, + "step": 1146 + }, + { + "epoch": 0.10391610609046227, + "grad_norm": 0.651534378528595, + "learning_rate": 0.00019310094846855554, + "loss": 3.0536, + "step": 1147 + }, + { + "epoch": 0.10400670426490906, + "grad_norm": 0.6866450905799866, + "learning_rate": 0.0001930949072675648, + "loss": 3.0973, + "step": 1148 + }, + { + "epoch": 0.10409730243935585, + "grad_norm": 0.834514856338501, + "learning_rate": 0.00019308886606657404, + "loss": 2.2848, + "step": 1149 + }, + { + "epoch": 0.10418790061380263, + "grad_norm": 0.6696861386299133, + "learning_rate": 0.0001930828248655833, + "loss": 2.7399, + "step": 1150 + }, + { + "epoch": 0.10427849878824942, + "grad_norm": 0.648078441619873, + "learning_rate": 0.0001930767836645925, + "loss": 2.6381, + "step": 1151 + }, + { + "epoch": 0.1043690969626962, + "grad_norm": 0.6604339480400085, + "learning_rate": 0.00019307074246360177, + "loss": 3.1841, + "step": 1152 + }, + { + "epoch": 0.10445969513714298, + "grad_norm": 0.6359032988548279, + "learning_rate": 0.00019306470126261103, + "loss": 2.5484, + "step": 1153 + }, + { + "epoch": 0.10455029331158977, + "grad_norm": 0.7891672849655151, + "learning_rate": 0.00019305866006162026, + "loss": 3.0093, + "step": 1154 + }, + { + "epoch": 0.10464089148603656, + "grad_norm": 0.8097211122512817, + "learning_rate": 0.0001930526188606295, + "loss": 2.9502, + "step": 1155 + }, + { + "epoch": 0.10473148966048335, + "grad_norm": 0.6539204120635986, + "learning_rate": 0.00019304657765963873, + "loss": 2.0483, + "step": 1156 + }, + { + "epoch": 0.10482208783493012, + "grad_norm": 0.6856598258018494, + "learning_rate": 0.000193040536458648, + "loss": 3.175, + "step": 1157 + }, + { + "epoch": 0.10491268600937691, + "grad_norm": 0.6337206959724426, + "learning_rate": 0.00019303449525765723, + "loss": 2.6429, + "step": 1158 + }, + { + "epoch": 0.1050032841838237, + "grad_norm": 0.6701324582099915, + "learning_rate": 0.00019302845405666646, + "loss": 3.0328, + "step": 1159 + }, + { + "epoch": 0.10509388235827048, + "grad_norm": 0.6624292135238647, + "learning_rate": 0.00019302241285567572, + "loss": 3.1409, + "step": 1160 + }, + { + "epoch": 0.10518448053271727, + "grad_norm": 0.6200407147407532, + "learning_rate": 0.00019301637165468496, + "loss": 2.9013, + "step": 1161 + }, + { + "epoch": 0.10527507870716406, + "grad_norm": 0.6629658937454224, + "learning_rate": 0.00019301033045369422, + "loss": 2.8595, + "step": 1162 + }, + { + "epoch": 0.10536567688161083, + "grad_norm": 0.6348064541816711, + "learning_rate": 0.00019300428925270345, + "loss": 2.9522, + "step": 1163 + }, + { + "epoch": 0.10545627505605762, + "grad_norm": 0.6590244770050049, + "learning_rate": 0.00019299824805171269, + "loss": 2.9083, + "step": 1164 + }, + { + "epoch": 0.1055468732305044, + "grad_norm": 0.6969545483589172, + "learning_rate": 0.00019299220685072192, + "loss": 3.0735, + "step": 1165 + }, + { + "epoch": 0.1056374714049512, + "grad_norm": 0.7223598957061768, + "learning_rate": 0.00019298616564973118, + "loss": 2.6144, + "step": 1166 + }, + { + "epoch": 0.10572806957939798, + "grad_norm": 0.6468748450279236, + "learning_rate": 0.00019298012444874044, + "loss": 2.8812, + "step": 1167 + }, + { + "epoch": 0.10581866775384476, + "grad_norm": 0.6984209418296814, + "learning_rate": 0.00019297408324774965, + "loss": 2.8222, + "step": 1168 + }, + { + "epoch": 0.10590926592829154, + "grad_norm": 0.6958436965942383, + "learning_rate": 0.0001929680420467589, + "loss": 3.0356, + "step": 1169 + }, + { + "epoch": 0.10599986410273833, + "grad_norm": 0.6825007796287537, + "learning_rate": 0.00019296200084576814, + "loss": 3.0959, + "step": 1170 + }, + { + "epoch": 0.10609046227718512, + "grad_norm": 0.7153446078300476, + "learning_rate": 0.0001929559596447774, + "loss": 3.0184, + "step": 1171 + }, + { + "epoch": 0.1061810604516319, + "grad_norm": 0.838881254196167, + "learning_rate": 0.00019294991844378664, + "loss": 1.5212, + "step": 1172 + }, + { + "epoch": 0.10627165862607868, + "grad_norm": 0.7225100994110107, + "learning_rate": 0.00019294387724279587, + "loss": 3.1239, + "step": 1173 + }, + { + "epoch": 0.10636225680052547, + "grad_norm": 0.6834574937820435, + "learning_rate": 0.00019293783604180513, + "loss": 2.8772, + "step": 1174 + }, + { + "epoch": 0.10645285497497226, + "grad_norm": 0.6447376012802124, + "learning_rate": 0.00019293179484081437, + "loss": 2.673, + "step": 1175 + }, + { + "epoch": 0.10654345314941904, + "grad_norm": 0.8551467061042786, + "learning_rate": 0.0001929257536398236, + "loss": 2.8141, + "step": 1176 + }, + { + "epoch": 0.10663405132386583, + "grad_norm": 0.6961013078689575, + "learning_rate": 0.00019291971243883284, + "loss": 2.6121, + "step": 1177 + }, + { + "epoch": 0.1067246494983126, + "grad_norm": 0.6855564117431641, + "learning_rate": 0.0001929136712378421, + "loss": 3.1233, + "step": 1178 + }, + { + "epoch": 0.10681524767275939, + "grad_norm": 0.6983346343040466, + "learning_rate": 0.00019290763003685133, + "loss": 2.9642, + "step": 1179 + }, + { + "epoch": 0.10690584584720618, + "grad_norm": 0.7252920866012573, + "learning_rate": 0.0001929015888358606, + "loss": 2.9002, + "step": 1180 + }, + { + "epoch": 0.10699644402165297, + "grad_norm": 0.6841357946395874, + "learning_rate": 0.0001928955476348698, + "loss": 3.2747, + "step": 1181 + }, + { + "epoch": 0.10708704219609975, + "grad_norm": 0.7172092795372009, + "learning_rate": 0.00019288950643387906, + "loss": 2.8322, + "step": 1182 + }, + { + "epoch": 0.10717764037054653, + "grad_norm": 0.7096037268638611, + "learning_rate": 0.00019288346523288832, + "loss": 2.7105, + "step": 1183 + }, + { + "epoch": 0.10726823854499332, + "grad_norm": 0.7320834994316101, + "learning_rate": 0.00019287742403189756, + "loss": 2.8808, + "step": 1184 + }, + { + "epoch": 0.1073588367194401, + "grad_norm": 0.6657477021217346, + "learning_rate": 0.0001928713828309068, + "loss": 3.0929, + "step": 1185 + }, + { + "epoch": 0.10744943489388689, + "grad_norm": 0.6932780146598816, + "learning_rate": 0.00019286534162991602, + "loss": 2.742, + "step": 1186 + }, + { + "epoch": 0.10754003306833368, + "grad_norm": 0.616130530834198, + "learning_rate": 0.00019285930042892529, + "loss": 2.8672, + "step": 1187 + }, + { + "epoch": 0.10763063124278045, + "grad_norm": 0.6251799464225769, + "learning_rate": 0.00019285325922793452, + "loss": 2.9179, + "step": 1188 + }, + { + "epoch": 0.10772122941722724, + "grad_norm": 0.6399232745170593, + "learning_rate": 0.00019284721802694375, + "loss": 2.9243, + "step": 1189 + }, + { + "epoch": 0.10781182759167403, + "grad_norm": 0.675484299659729, + "learning_rate": 0.00019284117682595301, + "loss": 2.7921, + "step": 1190 + }, + { + "epoch": 0.10790242576612082, + "grad_norm": 0.6166763305664062, + "learning_rate": 0.00019283513562496225, + "loss": 2.9766, + "step": 1191 + }, + { + "epoch": 0.1079930239405676, + "grad_norm": 0.692867636680603, + "learning_rate": 0.0001928290944239715, + "loss": 2.8122, + "step": 1192 + }, + { + "epoch": 0.10808362211501438, + "grad_norm": 0.7877358198165894, + "learning_rate": 0.00019282305322298074, + "loss": 2.8835, + "step": 1193 + }, + { + "epoch": 0.10817422028946116, + "grad_norm": 0.6889457702636719, + "learning_rate": 0.00019281701202198998, + "loss": 3.0372, + "step": 1194 + }, + { + "epoch": 0.10826481846390795, + "grad_norm": 0.6390143036842346, + "learning_rate": 0.0001928109708209992, + "loss": 2.9542, + "step": 1195 + }, + { + "epoch": 0.10835541663835474, + "grad_norm": 0.9855954051017761, + "learning_rate": 0.00019280492962000847, + "loss": 2.2502, + "step": 1196 + }, + { + "epoch": 0.10844601481280153, + "grad_norm": 0.681797981262207, + "learning_rate": 0.0001927988884190177, + "loss": 2.7106, + "step": 1197 + }, + { + "epoch": 0.1085366129872483, + "grad_norm": 0.6702122688293457, + "learning_rate": 0.00019279284721802694, + "loss": 2.7675, + "step": 1198 + }, + { + "epoch": 0.10862721116169509, + "grad_norm": 0.6669275760650635, + "learning_rate": 0.0001927868060170362, + "loss": 2.5698, + "step": 1199 + }, + { + "epoch": 0.10871780933614188, + "grad_norm": 0.6990645527839661, + "learning_rate": 0.00019278076481604544, + "loss": 2.7948, + "step": 1200 + }, + { + "epoch": 0.10880840751058866, + "grad_norm": 0.6808047294616699, + "learning_rate": 0.0001927747236150547, + "loss": 3.0812, + "step": 1201 + }, + { + "epoch": 0.10889900568503545, + "grad_norm": 0.6539149880409241, + "learning_rate": 0.0001927686824140639, + "loss": 3.0024, + "step": 1202 + }, + { + "epoch": 0.10898960385948223, + "grad_norm": 0.6841357350349426, + "learning_rate": 0.00019276264121307317, + "loss": 3.0145, + "step": 1203 + }, + { + "epoch": 0.10908020203392901, + "grad_norm": 0.6377124190330505, + "learning_rate": 0.00019275660001208243, + "loss": 2.8974, + "step": 1204 + }, + { + "epoch": 0.1091708002083758, + "grad_norm": 0.6816244125366211, + "learning_rate": 0.00019275055881109166, + "loss": 2.9339, + "step": 1205 + }, + { + "epoch": 0.10926139838282259, + "grad_norm": 0.8163827061653137, + "learning_rate": 0.0001927445176101009, + "loss": 2.769, + "step": 1206 + }, + { + "epoch": 0.10935199655726938, + "grad_norm": 0.6581295132637024, + "learning_rate": 0.00019273847640911013, + "loss": 2.6701, + "step": 1207 + }, + { + "epoch": 0.10944259473171615, + "grad_norm": 0.8156266808509827, + "learning_rate": 0.0001927324352081194, + "loss": 2.1692, + "step": 1208 + }, + { + "epoch": 0.10953319290616294, + "grad_norm": 0.734043300151825, + "learning_rate": 0.00019272639400712862, + "loss": 2.9253, + "step": 1209 + }, + { + "epoch": 0.10962379108060973, + "grad_norm": 0.6459394097328186, + "learning_rate": 0.00019272035280613786, + "loss": 2.9876, + "step": 1210 + }, + { + "epoch": 0.10971438925505651, + "grad_norm": 0.6705288887023926, + "learning_rate": 0.0001927143116051471, + "loss": 2.8089, + "step": 1211 + }, + { + "epoch": 0.1098049874295033, + "grad_norm": 0.7221845984458923, + "learning_rate": 0.00019270827040415635, + "loss": 2.3783, + "step": 1212 + }, + { + "epoch": 0.10989558560395007, + "grad_norm": 0.7356142401695251, + "learning_rate": 0.00019270222920316562, + "loss": 2.7411, + "step": 1213 + }, + { + "epoch": 0.10998618377839686, + "grad_norm": 0.6573693156242371, + "learning_rate": 0.00019269618800217485, + "loss": 2.1708, + "step": 1214 + }, + { + "epoch": 0.11007678195284365, + "grad_norm": 0.676846444606781, + "learning_rate": 0.00019269014680118408, + "loss": 2.7733, + "step": 1215 + }, + { + "epoch": 0.11016738012729044, + "grad_norm": 0.6813239455223083, + "learning_rate": 0.00019268410560019332, + "loss": 2.8597, + "step": 1216 + }, + { + "epoch": 0.11025797830173723, + "grad_norm": 0.7115343809127808, + "learning_rate": 0.00019267806439920258, + "loss": 2.858, + "step": 1217 + }, + { + "epoch": 0.110348576476184, + "grad_norm": 0.6833627820014954, + "learning_rate": 0.0001926720231982118, + "loss": 3.0983, + "step": 1218 + }, + { + "epoch": 0.11043917465063079, + "grad_norm": 0.7018303871154785, + "learning_rate": 0.00019266598199722105, + "loss": 2.7585, + "step": 1219 + }, + { + "epoch": 0.11052977282507757, + "grad_norm": 0.6328460574150085, + "learning_rate": 0.0001926599407962303, + "loss": 2.7938, + "step": 1220 + }, + { + "epoch": 0.11062037099952436, + "grad_norm": 0.6208484768867493, + "learning_rate": 0.00019265389959523954, + "loss": 2.7617, + "step": 1221 + }, + { + "epoch": 0.11071096917397115, + "grad_norm": 0.6389867663383484, + "learning_rate": 0.0001926478583942488, + "loss": 2.7616, + "step": 1222 + }, + { + "epoch": 0.11080156734841792, + "grad_norm": 0.6371797323226929, + "learning_rate": 0.000192641817193258, + "loss": 2.8633, + "step": 1223 + }, + { + "epoch": 0.11089216552286471, + "grad_norm": 0.8169556856155396, + "learning_rate": 0.00019263577599226727, + "loss": 2.7358, + "step": 1224 + }, + { + "epoch": 0.1109827636973115, + "grad_norm": 0.6434078812599182, + "learning_rate": 0.0001926297347912765, + "loss": 2.7174, + "step": 1225 + }, + { + "epoch": 0.11107336187175829, + "grad_norm": 0.6640909910202026, + "learning_rate": 0.00019262369359028577, + "loss": 2.8008, + "step": 1226 + }, + { + "epoch": 0.11116396004620507, + "grad_norm": 0.6427316069602966, + "learning_rate": 0.000192617652389295, + "loss": 2.8267, + "step": 1227 + }, + { + "epoch": 0.11125455822065185, + "grad_norm": 0.6518564224243164, + "learning_rate": 0.00019261161118830423, + "loss": 2.8479, + "step": 1228 + }, + { + "epoch": 0.11134515639509864, + "grad_norm": 0.6828272342681885, + "learning_rate": 0.0001926055699873135, + "loss": 3.0121, + "step": 1229 + }, + { + "epoch": 0.11143575456954542, + "grad_norm": 0.6678755879402161, + "learning_rate": 0.00019259952878632273, + "loss": 2.9204, + "step": 1230 + }, + { + "epoch": 0.11152635274399221, + "grad_norm": 0.6771903038024902, + "learning_rate": 0.000192593487585332, + "loss": 2.7806, + "step": 1231 + }, + { + "epoch": 0.111616950918439, + "grad_norm": 0.6389148831367493, + "learning_rate": 0.0001925874463843412, + "loss": 2.9009, + "step": 1232 + }, + { + "epoch": 0.11170754909288577, + "grad_norm": 0.6714478135108948, + "learning_rate": 0.00019258140518335046, + "loss": 2.8311, + "step": 1233 + }, + { + "epoch": 0.11179814726733256, + "grad_norm": 0.63259357213974, + "learning_rate": 0.00019257536398235972, + "loss": 2.8713, + "step": 1234 + }, + { + "epoch": 0.11188874544177935, + "grad_norm": 0.6545016765594482, + "learning_rate": 0.00019256932278136895, + "loss": 3.1901, + "step": 1235 + }, + { + "epoch": 0.11197934361622613, + "grad_norm": 0.658771276473999, + "learning_rate": 0.0001925632815803782, + "loss": 2.7487, + "step": 1236 + }, + { + "epoch": 0.11206994179067292, + "grad_norm": 0.6330112218856812, + "learning_rate": 0.00019255724037938742, + "loss": 2.6826, + "step": 1237 + }, + { + "epoch": 0.1121605399651197, + "grad_norm": 0.6122382879257202, + "learning_rate": 0.00019255119917839668, + "loss": 2.7054, + "step": 1238 + }, + { + "epoch": 0.11225113813956648, + "grad_norm": 0.6099393367767334, + "learning_rate": 0.00019254515797740592, + "loss": 2.584, + "step": 1239 + }, + { + "epoch": 0.11234173631401327, + "grad_norm": 0.71757972240448, + "learning_rate": 0.00019253911677641515, + "loss": 2.2915, + "step": 1240 + }, + { + "epoch": 0.11243233448846006, + "grad_norm": 0.6432989835739136, + "learning_rate": 0.00019253307557542439, + "loss": 2.6818, + "step": 1241 + }, + { + "epoch": 0.11252293266290685, + "grad_norm": 0.7923279404640198, + "learning_rate": 0.00019252703437443365, + "loss": 2.7742, + "step": 1242 + }, + { + "epoch": 0.11261353083735363, + "grad_norm": 0.5906824469566345, + "learning_rate": 0.0001925209931734429, + "loss": 2.7544, + "step": 1243 + }, + { + "epoch": 0.11270412901180041, + "grad_norm": 0.6799190640449524, + "learning_rate": 0.00019251495197245214, + "loss": 2.7853, + "step": 1244 + }, + { + "epoch": 0.1127947271862472, + "grad_norm": 0.6477970480918884, + "learning_rate": 0.00019250891077146138, + "loss": 2.8309, + "step": 1245 + }, + { + "epoch": 0.11288532536069398, + "grad_norm": 0.6399624347686768, + "learning_rate": 0.0001925028695704706, + "loss": 3.0285, + "step": 1246 + }, + { + "epoch": 0.11297592353514077, + "grad_norm": 0.6563894152641296, + "learning_rate": 0.00019249682836947987, + "loss": 2.8882, + "step": 1247 + }, + { + "epoch": 0.11306652170958756, + "grad_norm": 0.7091249823570251, + "learning_rate": 0.0001924907871684891, + "loss": 2.8959, + "step": 1248 + }, + { + "epoch": 0.11315711988403433, + "grad_norm": 0.7253236770629883, + "learning_rate": 0.00019248474596749834, + "loss": 2.8311, + "step": 1249 + }, + { + "epoch": 0.11324771805848112, + "grad_norm": 0.6907981634140015, + "learning_rate": 0.0001924787047665076, + "loss": 2.8132, + "step": 1250 + }, + { + "epoch": 0.11333831623292791, + "grad_norm": 0.6514201164245605, + "learning_rate": 0.00019247266356551683, + "loss": 2.8366, + "step": 1251 + }, + { + "epoch": 0.1134289144073747, + "grad_norm": 0.6180839538574219, + "learning_rate": 0.0001924666223645261, + "loss": 2.9985, + "step": 1252 + }, + { + "epoch": 0.11351951258182148, + "grad_norm": 0.6930957436561584, + "learning_rate": 0.0001924605811635353, + "loss": 3.1002, + "step": 1253 + }, + { + "epoch": 0.11361011075626826, + "grad_norm": 0.7288793921470642, + "learning_rate": 0.00019245453996254456, + "loss": 2.212, + "step": 1254 + }, + { + "epoch": 0.11370070893071504, + "grad_norm": 0.7025304436683655, + "learning_rate": 0.0001924484987615538, + "loss": 2.6877, + "step": 1255 + }, + { + "epoch": 0.11379130710516183, + "grad_norm": 0.6318988800048828, + "learning_rate": 0.00019244245756056306, + "loss": 2.8803, + "step": 1256 + }, + { + "epoch": 0.11388190527960862, + "grad_norm": 0.5813159942626953, + "learning_rate": 0.0001924364163595723, + "loss": 2.0261, + "step": 1257 + }, + { + "epoch": 0.11397250345405541, + "grad_norm": 0.6250331401824951, + "learning_rate": 0.00019243037515858153, + "loss": 2.3955, + "step": 1258 + }, + { + "epoch": 0.11406310162850218, + "grad_norm": 0.6191512942314148, + "learning_rate": 0.0001924243339575908, + "loss": 2.8591, + "step": 1259 + }, + { + "epoch": 0.11415369980294897, + "grad_norm": 0.7180635929107666, + "learning_rate": 0.00019241829275660002, + "loss": 2.8884, + "step": 1260 + }, + { + "epoch": 0.11424429797739576, + "grad_norm": 0.7134669423103333, + "learning_rate": 0.00019241225155560926, + "loss": 2.975, + "step": 1261 + }, + { + "epoch": 0.11433489615184254, + "grad_norm": 0.6258230805397034, + "learning_rate": 0.0001924062103546185, + "loss": 2.2274, + "step": 1262 + }, + { + "epoch": 0.11442549432628933, + "grad_norm": 0.6793698668479919, + "learning_rate": 0.00019240016915362775, + "loss": 2.7976, + "step": 1263 + }, + { + "epoch": 0.1145160925007361, + "grad_norm": 0.574339747428894, + "learning_rate": 0.000192394127952637, + "loss": 2.2173, + "step": 1264 + }, + { + "epoch": 0.11460669067518289, + "grad_norm": 0.7244732975959778, + "learning_rate": 0.00019238808675164625, + "loss": 2.7819, + "step": 1265 + }, + { + "epoch": 0.11469728884962968, + "grad_norm": 0.6611797213554382, + "learning_rate": 0.00019238204555065548, + "loss": 3.0536, + "step": 1266 + }, + { + "epoch": 0.11478788702407647, + "grad_norm": 0.6471265554428101, + "learning_rate": 0.00019237600434966471, + "loss": 2.9226, + "step": 1267 + }, + { + "epoch": 0.11487848519852326, + "grad_norm": 0.6897923946380615, + "learning_rate": 0.00019236996314867398, + "loss": 2.7612, + "step": 1268 + }, + { + "epoch": 0.11496908337297003, + "grad_norm": 0.6951904296875, + "learning_rate": 0.0001923639219476832, + "loss": 3.0326, + "step": 1269 + }, + { + "epoch": 0.11505968154741682, + "grad_norm": 0.6353381872177124, + "learning_rate": 0.00019235788074669244, + "loss": 2.7436, + "step": 1270 + }, + { + "epoch": 0.1151502797218636, + "grad_norm": 0.715382993221283, + "learning_rate": 0.00019235183954570168, + "loss": 3.1035, + "step": 1271 + }, + { + "epoch": 0.11524087789631039, + "grad_norm": 0.6938648223876953, + "learning_rate": 0.00019234579834471094, + "loss": 2.8796, + "step": 1272 + }, + { + "epoch": 0.11533147607075718, + "grad_norm": 0.6478023529052734, + "learning_rate": 0.0001923397571437202, + "loss": 2.8794, + "step": 1273 + }, + { + "epoch": 0.11542207424520395, + "grad_norm": 0.6169307827949524, + "learning_rate": 0.0001923337159427294, + "loss": 2.0779, + "step": 1274 + }, + { + "epoch": 0.11551267241965074, + "grad_norm": 0.6557042002677917, + "learning_rate": 0.00019232767474173867, + "loss": 2.7681, + "step": 1275 + }, + { + "epoch": 0.11560327059409753, + "grad_norm": 0.6559971570968628, + "learning_rate": 0.0001923216335407479, + "loss": 2.8019, + "step": 1276 + }, + { + "epoch": 0.11569386876854432, + "grad_norm": 0.6630891561508179, + "learning_rate": 0.00019231559233975716, + "loss": 2.6901, + "step": 1277 + }, + { + "epoch": 0.1157844669429911, + "grad_norm": 0.7003583312034607, + "learning_rate": 0.0001923095511387664, + "loss": 2.7799, + "step": 1278 + }, + { + "epoch": 0.11587506511743788, + "grad_norm": 0.6387620568275452, + "learning_rate": 0.00019230350993777563, + "loss": 2.7993, + "step": 1279 + }, + { + "epoch": 0.11596566329188467, + "grad_norm": 0.7032034993171692, + "learning_rate": 0.0001922974687367849, + "loss": 3.1062, + "step": 1280 + }, + { + "epoch": 0.11605626146633145, + "grad_norm": 0.6841272711753845, + "learning_rate": 0.00019229142753579413, + "loss": 3.0677, + "step": 1281 + }, + { + "epoch": 0.11614685964077824, + "grad_norm": 0.6752796769142151, + "learning_rate": 0.0001922853863348034, + "loss": 3.0395, + "step": 1282 + }, + { + "epoch": 0.11623745781522503, + "grad_norm": 0.6590606570243835, + "learning_rate": 0.0001922793451338126, + "loss": 3.1517, + "step": 1283 + }, + { + "epoch": 0.1163280559896718, + "grad_norm": 0.7661881446838379, + "learning_rate": 0.00019227330393282186, + "loss": 3.0483, + "step": 1284 + }, + { + "epoch": 0.11641865416411859, + "grad_norm": 0.6730051636695862, + "learning_rate": 0.0001922672627318311, + "loss": 2.7692, + "step": 1285 + }, + { + "epoch": 0.11650925233856538, + "grad_norm": 0.6365759372711182, + "learning_rate": 0.00019226122153084035, + "loss": 2.9296, + "step": 1286 + }, + { + "epoch": 0.11659985051301217, + "grad_norm": 0.6815587878227234, + "learning_rate": 0.00019225518032984959, + "loss": 2.8743, + "step": 1287 + }, + { + "epoch": 0.11669044868745895, + "grad_norm": 0.6866446137428284, + "learning_rate": 0.00019224913912885882, + "loss": 2.7971, + "step": 1288 + }, + { + "epoch": 0.11678104686190573, + "grad_norm": 0.6845930218696594, + "learning_rate": 0.00019224309792786808, + "loss": 2.8507, + "step": 1289 + }, + { + "epoch": 0.11687164503635251, + "grad_norm": 0.6227009296417236, + "learning_rate": 0.00019223705672687731, + "loss": 2.5354, + "step": 1290 + }, + { + "epoch": 0.1169622432107993, + "grad_norm": 0.8095417618751526, + "learning_rate": 0.00019223101552588655, + "loss": 2.9743, + "step": 1291 + }, + { + "epoch": 0.11705284138524609, + "grad_norm": 0.6270948648452759, + "learning_rate": 0.00019222497432489578, + "loss": 2.6193, + "step": 1292 + }, + { + "epoch": 0.11714343955969288, + "grad_norm": 0.6858212947845459, + "learning_rate": 0.00019221893312390504, + "loss": 2.8448, + "step": 1293 + }, + { + "epoch": 0.11723403773413965, + "grad_norm": 0.6251046061515808, + "learning_rate": 0.0001922128919229143, + "loss": 2.7545, + "step": 1294 + }, + { + "epoch": 0.11732463590858644, + "grad_norm": 0.6429486274719238, + "learning_rate": 0.00019220685072192354, + "loss": 2.813, + "step": 1295 + }, + { + "epoch": 0.11741523408303323, + "grad_norm": 0.6531355977058411, + "learning_rate": 0.00019220080952093277, + "loss": 2.7189, + "step": 1296 + }, + { + "epoch": 0.11750583225748001, + "grad_norm": 0.6375554203987122, + "learning_rate": 0.000192194768319942, + "loss": 2.9492, + "step": 1297 + }, + { + "epoch": 0.1175964304319268, + "grad_norm": 0.7140889763832092, + "learning_rate": 0.00019218872711895127, + "loss": 2.8413, + "step": 1298 + }, + { + "epoch": 0.11768702860637358, + "grad_norm": 0.6994892954826355, + "learning_rate": 0.0001921826859179605, + "loss": 2.9734, + "step": 1299 + }, + { + "epoch": 0.11777762678082036, + "grad_norm": 0.691584050655365, + "learning_rate": 0.00019217664471696974, + "loss": 2.9518, + "step": 1300 + }, + { + "epoch": 0.11786822495526715, + "grad_norm": 0.8516950607299805, + "learning_rate": 0.00019217060351597897, + "loss": 2.3376, + "step": 1301 + }, + { + "epoch": 0.11795882312971394, + "grad_norm": 0.6629716157913208, + "learning_rate": 0.00019216456231498823, + "loss": 2.6401, + "step": 1302 + }, + { + "epoch": 0.11804942130416073, + "grad_norm": 0.6427046060562134, + "learning_rate": 0.0001921585211139975, + "loss": 2.3027, + "step": 1303 + }, + { + "epoch": 0.1181400194786075, + "grad_norm": 0.6664748787879944, + "learning_rate": 0.0001921524799130067, + "loss": 2.7375, + "step": 1304 + }, + { + "epoch": 0.11823061765305429, + "grad_norm": 0.6727922558784485, + "learning_rate": 0.00019214643871201596, + "loss": 2.7375, + "step": 1305 + }, + { + "epoch": 0.11832121582750108, + "grad_norm": 0.6679578423500061, + "learning_rate": 0.0001921403975110252, + "loss": 2.7796, + "step": 1306 + }, + { + "epoch": 0.11841181400194786, + "grad_norm": 0.7054652571678162, + "learning_rate": 0.00019213435631003446, + "loss": 2.7929, + "step": 1307 + }, + { + "epoch": 0.11850241217639465, + "grad_norm": 0.6660186648368835, + "learning_rate": 0.0001921283151090437, + "loss": 2.932, + "step": 1308 + }, + { + "epoch": 0.11859301035084142, + "grad_norm": 0.7625333666801453, + "learning_rate": 0.00019212227390805292, + "loss": 2.9044, + "step": 1309 + }, + { + "epoch": 0.11868360852528821, + "grad_norm": 0.6621032357215881, + "learning_rate": 0.00019211623270706219, + "loss": 3.0019, + "step": 1310 + }, + { + "epoch": 0.118774206699735, + "grad_norm": 0.6565437912940979, + "learning_rate": 0.00019211019150607142, + "loss": 2.7464, + "step": 1311 + }, + { + "epoch": 0.11886480487418179, + "grad_norm": 0.5665323138237, + "learning_rate": 0.00019210415030508065, + "loss": 2.3038, + "step": 1312 + }, + { + "epoch": 0.11895540304862857, + "grad_norm": 0.6716530919075012, + "learning_rate": 0.0001920981091040899, + "loss": 2.76, + "step": 1313 + }, + { + "epoch": 0.11904600122307535, + "grad_norm": 0.6934084296226501, + "learning_rate": 0.00019209206790309915, + "loss": 2.8245, + "step": 1314 + }, + { + "epoch": 0.11913659939752214, + "grad_norm": 0.6639965176582336, + "learning_rate": 0.00019208602670210838, + "loss": 3.0248, + "step": 1315 + }, + { + "epoch": 0.11922719757196892, + "grad_norm": 0.7004680633544922, + "learning_rate": 0.00019207998550111764, + "loss": 2.9634, + "step": 1316 + }, + { + "epoch": 0.11931779574641571, + "grad_norm": 0.6507301330566406, + "learning_rate": 0.00019207394430012688, + "loss": 2.726, + "step": 1317 + }, + { + "epoch": 0.1194083939208625, + "grad_norm": 0.6519638895988464, + "learning_rate": 0.0001920679030991361, + "loss": 2.7815, + "step": 1318 + }, + { + "epoch": 0.11949899209530927, + "grad_norm": 0.6747040748596191, + "learning_rate": 0.00019206186189814537, + "loss": 2.8943, + "step": 1319 + }, + { + "epoch": 0.11958959026975606, + "grad_norm": 0.6360780000686646, + "learning_rate": 0.0001920558206971546, + "loss": 2.752, + "step": 1320 + }, + { + "epoch": 0.11968018844420285, + "grad_norm": 0.674669086933136, + "learning_rate": 0.00019204977949616384, + "loss": 2.8676, + "step": 1321 + }, + { + "epoch": 0.11977078661864964, + "grad_norm": 0.6601211428642273, + "learning_rate": 0.00019204373829517308, + "loss": 2.873, + "step": 1322 + }, + { + "epoch": 0.11986138479309642, + "grad_norm": 0.6855031847953796, + "learning_rate": 0.00019203769709418234, + "loss": 2.854, + "step": 1323 + }, + { + "epoch": 0.11995198296754321, + "grad_norm": 0.7120263576507568, + "learning_rate": 0.0001920316558931916, + "loss": 2.8366, + "step": 1324 + }, + { + "epoch": 0.12004258114198998, + "grad_norm": 0.6400977373123169, + "learning_rate": 0.0001920256146922008, + "loss": 2.1426, + "step": 1325 + }, + { + "epoch": 0.12013317931643677, + "grad_norm": 0.6922060251235962, + "learning_rate": 0.00019201957349121007, + "loss": 2.9624, + "step": 1326 + }, + { + "epoch": 0.12022377749088356, + "grad_norm": 0.6878177523612976, + "learning_rate": 0.0001920135322902193, + "loss": 2.842, + "step": 1327 + }, + { + "epoch": 0.12031437566533035, + "grad_norm": 0.6578871607780457, + "learning_rate": 0.00019200749108922856, + "loss": 2.9138, + "step": 1328 + }, + { + "epoch": 0.12040497383977714, + "grad_norm": 0.7548350095748901, + "learning_rate": 0.0001920014498882378, + "loss": 3.2835, + "step": 1329 + }, + { + "epoch": 0.12049557201422391, + "grad_norm": 0.7376602292060852, + "learning_rate": 0.00019199540868724703, + "loss": 3.2113, + "step": 1330 + }, + { + "epoch": 0.1205861701886707, + "grad_norm": 0.6643527746200562, + "learning_rate": 0.00019198936748625626, + "loss": 2.9085, + "step": 1331 + }, + { + "epoch": 0.12067676836311748, + "grad_norm": 0.682968020439148, + "learning_rate": 0.00019198332628526552, + "loss": 2.4756, + "step": 1332 + }, + { + "epoch": 0.12076736653756427, + "grad_norm": 0.6661275625228882, + "learning_rate": 0.00019197728508427476, + "loss": 2.8262, + "step": 1333 + }, + { + "epoch": 0.12085796471201106, + "grad_norm": 0.6318115592002869, + "learning_rate": 0.000191971243883284, + "loss": 2.686, + "step": 1334 + }, + { + "epoch": 0.12094856288645783, + "grad_norm": 0.6582939624786377, + "learning_rate": 0.00019196520268229325, + "loss": 2.7235, + "step": 1335 + }, + { + "epoch": 0.12103916106090462, + "grad_norm": 0.6501491069793701, + "learning_rate": 0.0001919591614813025, + "loss": 3.0282, + "step": 1336 + }, + { + "epoch": 0.12112975923535141, + "grad_norm": 0.6807935237884521, + "learning_rate": 0.00019195312028031175, + "loss": 2.8564, + "step": 1337 + }, + { + "epoch": 0.1212203574097982, + "grad_norm": 0.7666577100753784, + "learning_rate": 0.00019194707907932096, + "loss": 2.8184, + "step": 1338 + }, + { + "epoch": 0.12131095558424498, + "grad_norm": 0.6237675547599792, + "learning_rate": 0.00019194103787833022, + "loss": 2.6512, + "step": 1339 + }, + { + "epoch": 0.12140155375869176, + "grad_norm": 0.6214484572410583, + "learning_rate": 0.00019193499667733948, + "loss": 2.7654, + "step": 1340 + }, + { + "epoch": 0.12149215193313855, + "grad_norm": 0.7819243669509888, + "learning_rate": 0.0001919289554763487, + "loss": 2.9882, + "step": 1341 + }, + { + "epoch": 0.12158275010758533, + "grad_norm": 0.6530365347862244, + "learning_rate": 0.00019192291427535795, + "loss": 2.9673, + "step": 1342 + }, + { + "epoch": 0.12167334828203212, + "grad_norm": 0.6302875280380249, + "learning_rate": 0.00019191687307436718, + "loss": 2.6078, + "step": 1343 + }, + { + "epoch": 0.12176394645647891, + "grad_norm": 0.6492059826850891, + "learning_rate": 0.00019191083187337644, + "loss": 2.792, + "step": 1344 + }, + { + "epoch": 0.12185454463092568, + "grad_norm": 0.7192707657814026, + "learning_rate": 0.00019190479067238568, + "loss": 2.7639, + "step": 1345 + }, + { + "epoch": 0.12194514280537247, + "grad_norm": 0.6414003372192383, + "learning_rate": 0.0001918987494713949, + "loss": 2.903, + "step": 1346 + }, + { + "epoch": 0.12203574097981926, + "grad_norm": 0.6419739723205566, + "learning_rate": 0.00019189270827040417, + "loss": 2.9952, + "step": 1347 + }, + { + "epoch": 0.12212633915426604, + "grad_norm": 0.7055956125259399, + "learning_rate": 0.0001918866670694134, + "loss": 2.9284, + "step": 1348 + }, + { + "epoch": 0.12221693732871283, + "grad_norm": 0.6710270643234253, + "learning_rate": 0.00019188062586842267, + "loss": 2.2224, + "step": 1349 + }, + { + "epoch": 0.1223075355031596, + "grad_norm": 0.6358209848403931, + "learning_rate": 0.0001918745846674319, + "loss": 2.9835, + "step": 1350 + }, + { + "epoch": 0.1223981336776064, + "grad_norm": 0.6690958738327026, + "learning_rate": 0.00019186854346644113, + "loss": 2.4157, + "step": 1351 + }, + { + "epoch": 0.12248873185205318, + "grad_norm": 0.7448469400405884, + "learning_rate": 0.00019186250226545037, + "loss": 2.7888, + "step": 1352 + }, + { + "epoch": 0.12257933002649997, + "grad_norm": 0.6583822965621948, + "learning_rate": 0.00019185646106445963, + "loss": 2.8419, + "step": 1353 + }, + { + "epoch": 0.12266992820094676, + "grad_norm": 0.7402820587158203, + "learning_rate": 0.0001918504198634689, + "loss": 2.3417, + "step": 1354 + }, + { + "epoch": 0.12276052637539353, + "grad_norm": 0.692140519618988, + "learning_rate": 0.0001918443786624781, + "loss": 2.7307, + "step": 1355 + }, + { + "epoch": 0.12285112454984032, + "grad_norm": 0.6827000379562378, + "learning_rate": 0.00019183833746148736, + "loss": 2.9014, + "step": 1356 + }, + { + "epoch": 0.1229417227242871, + "grad_norm": 0.6859697103500366, + "learning_rate": 0.0001918322962604966, + "loss": 2.7255, + "step": 1357 + }, + { + "epoch": 0.1230323208987339, + "grad_norm": 0.69193434715271, + "learning_rate": 0.00019182625505950585, + "loss": 2.7539, + "step": 1358 + }, + { + "epoch": 0.12312291907318068, + "grad_norm": 0.7015984654426575, + "learning_rate": 0.0001918202138585151, + "loss": 2.8591, + "step": 1359 + }, + { + "epoch": 0.12321351724762745, + "grad_norm": 0.6539053320884705, + "learning_rate": 0.00019181417265752432, + "loss": 2.257, + "step": 1360 + }, + { + "epoch": 0.12330411542207424, + "grad_norm": 0.7041307687759399, + "learning_rate": 0.00019180813145653356, + "loss": 2.939, + "step": 1361 + }, + { + "epoch": 0.12339471359652103, + "grad_norm": 0.6681571006774902, + "learning_rate": 0.00019180209025554282, + "loss": 2.2471, + "step": 1362 + }, + { + "epoch": 0.12348531177096782, + "grad_norm": 0.6989859342575073, + "learning_rate": 0.00019179604905455205, + "loss": 3.0083, + "step": 1363 + }, + { + "epoch": 0.1235759099454146, + "grad_norm": 0.647887647151947, + "learning_rate": 0.00019179000785356129, + "loss": 2.5951, + "step": 1364 + }, + { + "epoch": 0.12366650811986138, + "grad_norm": 0.6791573762893677, + "learning_rate": 0.00019178396665257055, + "loss": 2.8484, + "step": 1365 + }, + { + "epoch": 0.12375710629430817, + "grad_norm": 0.6797759532928467, + "learning_rate": 0.00019177792545157978, + "loss": 3.0614, + "step": 1366 + }, + { + "epoch": 0.12384770446875495, + "grad_norm": 0.7113217711448669, + "learning_rate": 0.00019177188425058904, + "loss": 3.0083, + "step": 1367 + }, + { + "epoch": 0.12393830264320174, + "grad_norm": 0.6462563276290894, + "learning_rate": 0.00019176584304959825, + "loss": 2.5873, + "step": 1368 + }, + { + "epoch": 0.12402890081764853, + "grad_norm": 0.6733567714691162, + "learning_rate": 0.0001917598018486075, + "loss": 2.9665, + "step": 1369 + }, + { + "epoch": 0.1241194989920953, + "grad_norm": 0.7001383304595947, + "learning_rate": 0.00019175376064761677, + "loss": 3.1758, + "step": 1370 + }, + { + "epoch": 0.12421009716654209, + "grad_norm": 0.6220247149467468, + "learning_rate": 0.000191747719446626, + "loss": 2.1624, + "step": 1371 + }, + { + "epoch": 0.12430069534098888, + "grad_norm": 0.7186247110366821, + "learning_rate": 0.00019174167824563524, + "loss": 3.1131, + "step": 1372 + }, + { + "epoch": 0.12439129351543567, + "grad_norm": 0.68864905834198, + "learning_rate": 0.00019173563704464447, + "loss": 2.2646, + "step": 1373 + }, + { + "epoch": 0.12448189168988245, + "grad_norm": 0.6660684943199158, + "learning_rate": 0.00019172959584365373, + "loss": 2.7869, + "step": 1374 + }, + { + "epoch": 0.12457248986432923, + "grad_norm": 0.7076602578163147, + "learning_rate": 0.00019172355464266297, + "loss": 3.2098, + "step": 1375 + }, + { + "epoch": 0.12466308803877602, + "grad_norm": 0.6893709301948547, + "learning_rate": 0.0001917175134416722, + "loss": 2.849, + "step": 1376 + }, + { + "epoch": 0.1247536862132228, + "grad_norm": 0.613897979259491, + "learning_rate": 0.00019171147224068146, + "loss": 2.6488, + "step": 1377 + }, + { + "epoch": 0.12484428438766959, + "grad_norm": 0.6503050923347473, + "learning_rate": 0.0001917054310396907, + "loss": 2.8544, + "step": 1378 + }, + { + "epoch": 0.12493488256211638, + "grad_norm": 0.6816533803939819, + "learning_rate": 0.00019169938983869996, + "loss": 2.7957, + "step": 1379 + }, + { + "epoch": 0.12502548073656317, + "grad_norm": 0.6847785115242004, + "learning_rate": 0.0001916933486377092, + "loss": 2.9691, + "step": 1380 + }, + { + "epoch": 0.12511607891100995, + "grad_norm": 0.6540956497192383, + "learning_rate": 0.00019168730743671843, + "loss": 2.7666, + "step": 1381 + }, + { + "epoch": 0.12520667708545674, + "grad_norm": 0.6613627672195435, + "learning_rate": 0.00019168126623572766, + "loss": 2.7287, + "step": 1382 + }, + { + "epoch": 0.1252972752599035, + "grad_norm": 0.6405002474784851, + "learning_rate": 0.00019167522503473692, + "loss": 3.0093, + "step": 1383 + }, + { + "epoch": 0.1253878734343503, + "grad_norm": 0.6797839999198914, + "learning_rate": 0.00019166918383374616, + "loss": 2.7952, + "step": 1384 + }, + { + "epoch": 0.12547847160879708, + "grad_norm": 0.656480073928833, + "learning_rate": 0.0001916631426327554, + "loss": 2.6767, + "step": 1385 + }, + { + "epoch": 0.12556906978324386, + "grad_norm": 0.7008855938911438, + "learning_rate": 0.00019165710143176465, + "loss": 2.979, + "step": 1386 + }, + { + "epoch": 0.12565966795769065, + "grad_norm": 0.5639016628265381, + "learning_rate": 0.00019165106023077389, + "loss": 2.0939, + "step": 1387 + }, + { + "epoch": 0.12575026613213744, + "grad_norm": 0.6805626749992371, + "learning_rate": 0.00019164501902978315, + "loss": 2.8612, + "step": 1388 + }, + { + "epoch": 0.12584086430658423, + "grad_norm": 0.6690181493759155, + "learning_rate": 0.00019163897782879235, + "loss": 2.9142, + "step": 1389 + }, + { + "epoch": 0.12593146248103101, + "grad_norm": 0.7438037991523743, + "learning_rate": 0.00019163293662780161, + "loss": 2.8345, + "step": 1390 + }, + { + "epoch": 0.1260220606554778, + "grad_norm": 0.7093648314476013, + "learning_rate": 0.00019162689542681085, + "loss": 2.9872, + "step": 1391 + }, + { + "epoch": 0.1261126588299246, + "grad_norm": 0.6512764692306519, + "learning_rate": 0.0001916208542258201, + "loss": 2.277, + "step": 1392 + }, + { + "epoch": 0.12620325700437135, + "grad_norm": 0.6697981953620911, + "learning_rate": 0.00019161481302482934, + "loss": 3.0138, + "step": 1393 + }, + { + "epoch": 0.12629385517881814, + "grad_norm": 0.696577250957489, + "learning_rate": 0.00019160877182383858, + "loss": 2.7982, + "step": 1394 + }, + { + "epoch": 0.12638445335326493, + "grad_norm": 0.6509019732475281, + "learning_rate": 0.00019160273062284784, + "loss": 2.8625, + "step": 1395 + }, + { + "epoch": 0.1264750515277117, + "grad_norm": 0.6615455746650696, + "learning_rate": 0.00019159668942185707, + "loss": 2.7022, + "step": 1396 + }, + { + "epoch": 0.1265656497021585, + "grad_norm": 0.7237778306007385, + "learning_rate": 0.0001915906482208663, + "loss": 2.997, + "step": 1397 + }, + { + "epoch": 0.1266562478766053, + "grad_norm": 0.6934956312179565, + "learning_rate": 0.00019158460701987554, + "loss": 2.0439, + "step": 1398 + }, + { + "epoch": 0.12674684605105208, + "grad_norm": 0.6474635601043701, + "learning_rate": 0.0001915785658188848, + "loss": 3.0464, + "step": 1399 + }, + { + "epoch": 0.12683744422549886, + "grad_norm": 0.6458698511123657, + "learning_rate": 0.00019157252461789406, + "loss": 2.6955, + "step": 1400 + }, + { + "epoch": 0.12692804239994565, + "grad_norm": 0.6268688440322876, + "learning_rate": 0.0001915664834169033, + "loss": 2.6776, + "step": 1401 + }, + { + "epoch": 0.12701864057439244, + "grad_norm": 0.6471112966537476, + "learning_rate": 0.00019156044221591253, + "loss": 2.8009, + "step": 1402 + }, + { + "epoch": 0.1271092387488392, + "grad_norm": 0.6045379042625427, + "learning_rate": 0.00019155440101492177, + "loss": 2.6385, + "step": 1403 + }, + { + "epoch": 0.12719983692328599, + "grad_norm": 0.6658498048782349, + "learning_rate": 0.00019154835981393103, + "loss": 2.9742, + "step": 1404 + }, + { + "epoch": 0.12729043509773277, + "grad_norm": 0.7324777245521545, + "learning_rate": 0.00019154231861294026, + "loss": 3.0645, + "step": 1405 + }, + { + "epoch": 0.12738103327217956, + "grad_norm": 0.661698043346405, + "learning_rate": 0.0001915362774119495, + "loss": 3.0385, + "step": 1406 + }, + { + "epoch": 0.12747163144662635, + "grad_norm": 0.7198633551597595, + "learning_rate": 0.00019153023621095876, + "loss": 2.9272, + "step": 1407 + }, + { + "epoch": 0.12756222962107314, + "grad_norm": 0.6339722275733948, + "learning_rate": 0.000191524195009968, + "loss": 2.7442, + "step": 1408 + }, + { + "epoch": 0.12765282779551992, + "grad_norm": 0.7115992307662964, + "learning_rate": 0.00019151815380897725, + "loss": 2.9126, + "step": 1409 + }, + { + "epoch": 0.1277434259699667, + "grad_norm": 0.655548095703125, + "learning_rate": 0.00019151211260798646, + "loss": 2.7051, + "step": 1410 + }, + { + "epoch": 0.1278340241444135, + "grad_norm": 0.6991486549377441, + "learning_rate": 0.00019150607140699572, + "loss": 2.7327, + "step": 1411 + }, + { + "epoch": 0.1279246223188603, + "grad_norm": 0.6894722580909729, + "learning_rate": 0.00019150003020600495, + "loss": 3.1938, + "step": 1412 + }, + { + "epoch": 0.12801522049330705, + "grad_norm": 0.6584382653236389, + "learning_rate": 0.00019149398900501421, + "loss": 2.9342, + "step": 1413 + }, + { + "epoch": 0.12810581866775383, + "grad_norm": 0.7611597180366516, + "learning_rate": 0.00019148794780402345, + "loss": 2.734, + "step": 1414 + }, + { + "epoch": 0.12819641684220062, + "grad_norm": 0.6345563530921936, + "learning_rate": 0.00019148190660303268, + "loss": 2.0273, + "step": 1415 + }, + { + "epoch": 0.1282870150166474, + "grad_norm": 0.7010446190834045, + "learning_rate": 0.00019147586540204194, + "loss": 2.7685, + "step": 1416 + }, + { + "epoch": 0.1283776131910942, + "grad_norm": 0.6740787625312805, + "learning_rate": 0.00019146982420105118, + "loss": 2.7394, + "step": 1417 + }, + { + "epoch": 0.12846821136554099, + "grad_norm": 0.6525139808654785, + "learning_rate": 0.00019146378300006044, + "loss": 2.712, + "step": 1418 + }, + { + "epoch": 0.12855880953998777, + "grad_norm": 0.7234054207801819, + "learning_rate": 0.00019145774179906965, + "loss": 2.8083, + "step": 1419 + }, + { + "epoch": 0.12864940771443456, + "grad_norm": 0.6179513931274414, + "learning_rate": 0.0001914517005980789, + "loss": 2.2113, + "step": 1420 + }, + { + "epoch": 0.12874000588888135, + "grad_norm": 0.6928163170814514, + "learning_rate": 0.00019144565939708814, + "loss": 2.744, + "step": 1421 + }, + { + "epoch": 0.12883060406332814, + "grad_norm": 0.6785053610801697, + "learning_rate": 0.0001914396181960974, + "loss": 3.2041, + "step": 1422 + }, + { + "epoch": 0.1289212022377749, + "grad_norm": 0.679302453994751, + "learning_rate": 0.00019143357699510664, + "loss": 2.7025, + "step": 1423 + }, + { + "epoch": 0.12901180041222168, + "grad_norm": 0.6616453528404236, + "learning_rate": 0.00019142753579411587, + "loss": 2.1935, + "step": 1424 + }, + { + "epoch": 0.12910239858666847, + "grad_norm": 0.6829967498779297, + "learning_rate": 0.00019142149459312513, + "loss": 2.9618, + "step": 1425 + }, + { + "epoch": 0.12919299676111526, + "grad_norm": 0.6340128779411316, + "learning_rate": 0.00019141545339213437, + "loss": 2.78, + "step": 1426 + }, + { + "epoch": 0.12928359493556205, + "grad_norm": 0.7361663579940796, + "learning_rate": 0.0001914094121911436, + "loss": 2.9055, + "step": 1427 + }, + { + "epoch": 0.12937419311000883, + "grad_norm": 0.6568117737770081, + "learning_rate": 0.00019140337099015283, + "loss": 2.8414, + "step": 1428 + }, + { + "epoch": 0.12946479128445562, + "grad_norm": 0.6503693461418152, + "learning_rate": 0.0001913973297891621, + "loss": 3.176, + "step": 1429 + }, + { + "epoch": 0.1295553894589024, + "grad_norm": 0.6994544267654419, + "learning_rate": 0.00019139128858817136, + "loss": 2.2393, + "step": 1430 + }, + { + "epoch": 0.1296459876333492, + "grad_norm": 0.6652103066444397, + "learning_rate": 0.0001913852473871806, + "loss": 2.6791, + "step": 1431 + }, + { + "epoch": 0.12973658580779598, + "grad_norm": 0.7135443091392517, + "learning_rate": 0.00019137920618618982, + "loss": 3.0833, + "step": 1432 + }, + { + "epoch": 0.12982718398224274, + "grad_norm": 0.6952136158943176, + "learning_rate": 0.00019137316498519906, + "loss": 2.7613, + "step": 1433 + }, + { + "epoch": 0.12991778215668953, + "grad_norm": 0.6421719193458557, + "learning_rate": 0.00019136712378420832, + "loss": 2.8842, + "step": 1434 + }, + { + "epoch": 0.13000838033113632, + "grad_norm": 0.6027471423149109, + "learning_rate": 0.00019136108258321755, + "loss": 2.0184, + "step": 1435 + }, + { + "epoch": 0.1300989785055831, + "grad_norm": 0.6707736253738403, + "learning_rate": 0.0001913550413822268, + "loss": 2.8516, + "step": 1436 + }, + { + "epoch": 0.1301895766800299, + "grad_norm": 0.674345076084137, + "learning_rate": 0.00019134900018123605, + "loss": 2.6878, + "step": 1437 + }, + { + "epoch": 0.13028017485447668, + "grad_norm": 0.6823990345001221, + "learning_rate": 0.00019134295898024528, + "loss": 2.8758, + "step": 1438 + }, + { + "epoch": 0.13037077302892347, + "grad_norm": 0.6445717215538025, + "learning_rate": 0.00019133691777925454, + "loss": 2.9614, + "step": 1439 + }, + { + "epoch": 0.13046137120337026, + "grad_norm": 0.6728843450546265, + "learning_rate": 0.00019133087657826375, + "loss": 2.9608, + "step": 1440 + }, + { + "epoch": 0.13055196937781705, + "grad_norm": 0.7179891467094421, + "learning_rate": 0.000191324835377273, + "loss": 2.9324, + "step": 1441 + }, + { + "epoch": 0.13064256755226383, + "grad_norm": 0.6587281823158264, + "learning_rate": 0.00019131879417628225, + "loss": 2.7295, + "step": 1442 + }, + { + "epoch": 0.13073316572671062, + "grad_norm": 0.6773433089256287, + "learning_rate": 0.0001913127529752915, + "loss": 2.9049, + "step": 1443 + }, + { + "epoch": 0.13082376390115738, + "grad_norm": 0.661461591720581, + "learning_rate": 0.00019130671177430074, + "loss": 2.9261, + "step": 1444 + }, + { + "epoch": 0.13091436207560417, + "grad_norm": 0.5776041746139526, + "learning_rate": 0.00019130067057330998, + "loss": 2.0861, + "step": 1445 + }, + { + "epoch": 0.13100496025005096, + "grad_norm": 0.7189454436302185, + "learning_rate": 0.00019129462937231924, + "loss": 2.8427, + "step": 1446 + }, + { + "epoch": 0.13109555842449774, + "grad_norm": 0.6541622281074524, + "learning_rate": 0.00019128858817132847, + "loss": 2.7336, + "step": 1447 + }, + { + "epoch": 0.13118615659894453, + "grad_norm": 0.6780993342399597, + "learning_rate": 0.0001912825469703377, + "loss": 2.864, + "step": 1448 + }, + { + "epoch": 0.13127675477339132, + "grad_norm": 0.6672424674034119, + "learning_rate": 0.00019127650576934694, + "loss": 2.9661, + "step": 1449 + }, + { + "epoch": 0.1313673529478381, + "grad_norm": 0.6554668545722961, + "learning_rate": 0.0001912704645683562, + "loss": 2.756, + "step": 1450 + }, + { + "epoch": 0.1314579511222849, + "grad_norm": 0.6905433535575867, + "learning_rate": 0.00019126442336736543, + "loss": 2.5502, + "step": 1451 + }, + { + "epoch": 0.13154854929673168, + "grad_norm": 0.7589731812477112, + "learning_rate": 0.0001912583821663747, + "loss": 2.784, + "step": 1452 + }, + { + "epoch": 0.13163914747117847, + "grad_norm": 0.6877279877662659, + "learning_rate": 0.00019125234096538393, + "loss": 2.8964, + "step": 1453 + }, + { + "epoch": 0.13172974564562523, + "grad_norm": 0.6758426427841187, + "learning_rate": 0.00019124629976439316, + "loss": 2.6499, + "step": 1454 + }, + { + "epoch": 0.13182034382007202, + "grad_norm": 0.662931501865387, + "learning_rate": 0.00019124025856340242, + "loss": 2.6434, + "step": 1455 + }, + { + "epoch": 0.1319109419945188, + "grad_norm": 0.6705886125564575, + "learning_rate": 0.00019123421736241166, + "loss": 2.8932, + "step": 1456 + }, + { + "epoch": 0.1320015401689656, + "grad_norm": 0.6516028642654419, + "learning_rate": 0.0001912281761614209, + "loss": 2.5799, + "step": 1457 + }, + { + "epoch": 0.13209213834341238, + "grad_norm": 0.668663501739502, + "learning_rate": 0.00019122213496043013, + "loss": 2.6427, + "step": 1458 + }, + { + "epoch": 0.13218273651785917, + "grad_norm": 0.788392961025238, + "learning_rate": 0.0001912160937594394, + "loss": 2.9471, + "step": 1459 + }, + { + "epoch": 0.13227333469230595, + "grad_norm": 0.673636794090271, + "learning_rate": 0.00019121005255844865, + "loss": 2.9375, + "step": 1460 + }, + { + "epoch": 0.13236393286675274, + "grad_norm": 0.6777180433273315, + "learning_rate": 0.00019120401135745786, + "loss": 2.4254, + "step": 1461 + }, + { + "epoch": 0.13245453104119953, + "grad_norm": 0.6786038875579834, + "learning_rate": 0.00019119797015646712, + "loss": 2.7728, + "step": 1462 + }, + { + "epoch": 0.13254512921564632, + "grad_norm": 0.7715746164321899, + "learning_rate": 0.00019119192895547635, + "loss": 2.9258, + "step": 1463 + }, + { + "epoch": 0.13263572739009308, + "grad_norm": 0.6408798694610596, + "learning_rate": 0.0001911858877544856, + "loss": 2.6069, + "step": 1464 + }, + { + "epoch": 0.13272632556453987, + "grad_norm": 0.7054091095924377, + "learning_rate": 0.00019117984655349485, + "loss": 2.8923, + "step": 1465 + }, + { + "epoch": 0.13281692373898665, + "grad_norm": 0.6664813756942749, + "learning_rate": 0.00019117380535250408, + "loss": 2.9235, + "step": 1466 + }, + { + "epoch": 0.13290752191343344, + "grad_norm": 0.6770068407058716, + "learning_rate": 0.00019116776415151334, + "loss": 2.9441, + "step": 1467 + }, + { + "epoch": 0.13299812008788023, + "grad_norm": 0.6925097703933716, + "learning_rate": 0.00019116172295052258, + "loss": 3.1163, + "step": 1468 + }, + { + "epoch": 0.13308871826232702, + "grad_norm": 0.651202380657196, + "learning_rate": 0.00019115568174953184, + "loss": 2.6953, + "step": 1469 + }, + { + "epoch": 0.1331793164367738, + "grad_norm": 0.6821592450141907, + "learning_rate": 0.00019114964054854104, + "loss": 2.7872, + "step": 1470 + }, + { + "epoch": 0.1332699146112206, + "grad_norm": 0.6838065385818481, + "learning_rate": 0.0001911435993475503, + "loss": 2.9085, + "step": 1471 + }, + { + "epoch": 0.13336051278566738, + "grad_norm": 0.6934853196144104, + "learning_rate": 0.00019113755814655954, + "loss": 2.8487, + "step": 1472 + }, + { + "epoch": 0.13345111096011417, + "grad_norm": 0.6684731245040894, + "learning_rate": 0.0001911315169455688, + "loss": 2.8233, + "step": 1473 + }, + { + "epoch": 0.13354170913456093, + "grad_norm": 0.6718436479568481, + "learning_rate": 0.00019112547574457803, + "loss": 2.5525, + "step": 1474 + }, + { + "epoch": 0.13363230730900771, + "grad_norm": 0.6737833023071289, + "learning_rate": 0.00019111943454358727, + "loss": 2.8315, + "step": 1475 + }, + { + "epoch": 0.1337229054834545, + "grad_norm": 0.6799876093864441, + "learning_rate": 0.00019111339334259653, + "loss": 2.9876, + "step": 1476 + }, + { + "epoch": 0.1338135036579013, + "grad_norm": 0.6593426465988159, + "learning_rate": 0.00019110735214160576, + "loss": 2.9111, + "step": 1477 + }, + { + "epoch": 0.13390410183234808, + "grad_norm": 0.7752467393875122, + "learning_rate": 0.000191101310940615, + "loss": 3.2155, + "step": 1478 + }, + { + "epoch": 0.13399470000679486, + "grad_norm": 0.7424026727676392, + "learning_rate": 0.00019109526973962423, + "loss": 3.1646, + "step": 1479 + }, + { + "epoch": 0.13408529818124165, + "grad_norm": 0.6719716787338257, + "learning_rate": 0.0001910892285386335, + "loss": 3.0412, + "step": 1480 + }, + { + "epoch": 0.13417589635568844, + "grad_norm": 0.6803700923919678, + "learning_rate": 0.00019108318733764273, + "loss": 2.8336, + "step": 1481 + }, + { + "epoch": 0.13426649453013523, + "grad_norm": 0.6334998607635498, + "learning_rate": 0.000191077146136652, + "loss": 2.6318, + "step": 1482 + }, + { + "epoch": 0.13435709270458202, + "grad_norm": 0.6599361896514893, + "learning_rate": 0.00019107110493566122, + "loss": 2.8312, + "step": 1483 + }, + { + "epoch": 0.13444769087902878, + "grad_norm": 0.8059895634651184, + "learning_rate": 0.00019106506373467046, + "loss": 2.888, + "step": 1484 + }, + { + "epoch": 0.13453828905347556, + "grad_norm": 0.6564418077468872, + "learning_rate": 0.00019105902253367972, + "loss": 2.7033, + "step": 1485 + }, + { + "epoch": 0.13462888722792235, + "grad_norm": 0.6865137219429016, + "learning_rate": 0.00019105298133268895, + "loss": 2.7612, + "step": 1486 + }, + { + "epoch": 0.13471948540236914, + "grad_norm": 0.6758403182029724, + "learning_rate": 0.00019104694013169819, + "loss": 2.8433, + "step": 1487 + }, + { + "epoch": 0.13481008357681593, + "grad_norm": 2.1667187213897705, + "learning_rate": 0.00019104089893070742, + "loss": 2.8197, + "step": 1488 + }, + { + "epoch": 0.1349006817512627, + "grad_norm": 0.6453744173049927, + "learning_rate": 0.00019103485772971668, + "loss": 2.3127, + "step": 1489 + }, + { + "epoch": 0.1349912799257095, + "grad_norm": 0.6457774043083191, + "learning_rate": 0.00019102881652872594, + "loss": 2.7407, + "step": 1490 + }, + { + "epoch": 0.1350818781001563, + "grad_norm": 0.7094092965126038, + "learning_rate": 0.00019102277532773515, + "loss": 2.8819, + "step": 1491 + }, + { + "epoch": 0.13517247627460308, + "grad_norm": 0.8159560561180115, + "learning_rate": 0.0001910167341267444, + "loss": 2.2801, + "step": 1492 + }, + { + "epoch": 0.13526307444904986, + "grad_norm": 0.67573082447052, + "learning_rate": 0.00019101069292575364, + "loss": 3.0621, + "step": 1493 + }, + { + "epoch": 0.13535367262349662, + "grad_norm": 0.6609558463096619, + "learning_rate": 0.0001910046517247629, + "loss": 2.9453, + "step": 1494 + }, + { + "epoch": 0.1354442707979434, + "grad_norm": 0.6822428107261658, + "learning_rate": 0.00019099861052377214, + "loss": 2.824, + "step": 1495 + }, + { + "epoch": 0.1355348689723902, + "grad_norm": 0.7021757364273071, + "learning_rate": 0.00019099256932278137, + "loss": 2.8205, + "step": 1496 + }, + { + "epoch": 0.135625467146837, + "grad_norm": 0.6162211894989014, + "learning_rate": 0.00019098652812179063, + "loss": 2.7814, + "step": 1497 + }, + { + "epoch": 0.13571606532128377, + "grad_norm": 0.7311915755271912, + "learning_rate": 0.00019098048692079987, + "loss": 2.8683, + "step": 1498 + }, + { + "epoch": 0.13580666349573056, + "grad_norm": 0.6517004370689392, + "learning_rate": 0.0001909744457198091, + "loss": 2.8261, + "step": 1499 + }, + { + "epoch": 0.13589726167017735, + "grad_norm": 0.6775630712509155, + "learning_rate": 0.00019096840451881834, + "loss": 2.6686, + "step": 1500 + }, + { + "epoch": 0.13598785984462414, + "grad_norm": 0.7497110962867737, + "learning_rate": 0.0001909623633178276, + "loss": 2.9824, + "step": 1501 + }, + { + "epoch": 0.13607845801907092, + "grad_norm": 0.6272714138031006, + "learning_rate": 0.00019095632211683683, + "loss": 2.712, + "step": 1502 + }, + { + "epoch": 0.1361690561935177, + "grad_norm": 0.6581370830535889, + "learning_rate": 0.0001909502809158461, + "loss": 2.8047, + "step": 1503 + }, + { + "epoch": 0.13625965436796447, + "grad_norm": 0.6668086051940918, + "learning_rate": 0.00019094423971485533, + "loss": 2.9685, + "step": 1504 + }, + { + "epoch": 0.13635025254241126, + "grad_norm": 0.6806427836418152, + "learning_rate": 0.00019093819851386456, + "loss": 2.819, + "step": 1505 + }, + { + "epoch": 0.13644085071685805, + "grad_norm": 0.673358142375946, + "learning_rate": 0.00019093215731287382, + "loss": 2.9154, + "step": 1506 + }, + { + "epoch": 0.13653144889130484, + "grad_norm": 0.6677250862121582, + "learning_rate": 0.00019092611611188306, + "loss": 2.8751, + "step": 1507 + }, + { + "epoch": 0.13662204706575162, + "grad_norm": 0.6289308667182922, + "learning_rate": 0.0001909200749108923, + "loss": 2.8192, + "step": 1508 + }, + { + "epoch": 0.1367126452401984, + "grad_norm": 0.7156805992126465, + "learning_rate": 0.00019091403370990152, + "loss": 2.921, + "step": 1509 + }, + { + "epoch": 0.1368032434146452, + "grad_norm": 0.6693958044052124, + "learning_rate": 0.00019090799250891079, + "loss": 2.9419, + "step": 1510 + }, + { + "epoch": 0.13689384158909199, + "grad_norm": 0.6882167458534241, + "learning_rate": 0.00019090195130792002, + "loss": 2.7334, + "step": 1511 + }, + { + "epoch": 0.13698443976353877, + "grad_norm": 0.6988106966018677, + "learning_rate": 0.00019089591010692925, + "loss": 2.8518, + "step": 1512 + }, + { + "epoch": 0.13707503793798556, + "grad_norm": 0.6563965082168579, + "learning_rate": 0.00019088986890593851, + "loss": 2.5705, + "step": 1513 + }, + { + "epoch": 0.13716563611243232, + "grad_norm": 0.6493036150932312, + "learning_rate": 0.00019088382770494775, + "loss": 2.6299, + "step": 1514 + }, + { + "epoch": 0.1372562342868791, + "grad_norm": 0.7367354035377502, + "learning_rate": 0.000190877786503957, + "loss": 2.8727, + "step": 1515 + }, + { + "epoch": 0.1373468324613259, + "grad_norm": 0.6465295553207397, + "learning_rate": 0.00019087174530296624, + "loss": 2.8855, + "step": 1516 + }, + { + "epoch": 0.13743743063577268, + "grad_norm": 0.6605355143547058, + "learning_rate": 0.00019086570410197548, + "loss": 2.7112, + "step": 1517 + }, + { + "epoch": 0.13752802881021947, + "grad_norm": 0.6636186838150024, + "learning_rate": 0.0001908596629009847, + "loss": 2.8281, + "step": 1518 + }, + { + "epoch": 0.13761862698466626, + "grad_norm": 0.6602878570556641, + "learning_rate": 0.00019085362169999397, + "loss": 2.3643, + "step": 1519 + }, + { + "epoch": 0.13770922515911305, + "grad_norm": 0.6525195240974426, + "learning_rate": 0.0001908475804990032, + "loss": 1.9416, + "step": 1520 + }, + { + "epoch": 0.13779982333355983, + "grad_norm": 0.7129232883453369, + "learning_rate": 0.00019084153929801244, + "loss": 2.8141, + "step": 1521 + }, + { + "epoch": 0.13789042150800662, + "grad_norm": 0.6334636807441711, + "learning_rate": 0.0001908354980970217, + "loss": 2.7583, + "step": 1522 + }, + { + "epoch": 0.1379810196824534, + "grad_norm": 0.6857644319534302, + "learning_rate": 0.00019082945689603094, + "loss": 2.8602, + "step": 1523 + }, + { + "epoch": 0.1380716178569002, + "grad_norm": 0.7273374199867249, + "learning_rate": 0.0001908234156950402, + "loss": 3.2489, + "step": 1524 + }, + { + "epoch": 0.13816221603134696, + "grad_norm": 0.652541995048523, + "learning_rate": 0.0001908173744940494, + "loss": 2.7099, + "step": 1525 + }, + { + "epoch": 0.13825281420579374, + "grad_norm": 0.7306389212608337, + "learning_rate": 0.00019081133329305867, + "loss": 2.8473, + "step": 1526 + }, + { + "epoch": 0.13834341238024053, + "grad_norm": 0.6905577778816223, + "learning_rate": 0.00019080529209206793, + "loss": 2.9113, + "step": 1527 + }, + { + "epoch": 0.13843401055468732, + "grad_norm": 0.651276171207428, + "learning_rate": 0.00019079925089107716, + "loss": 2.8113, + "step": 1528 + }, + { + "epoch": 0.1385246087291341, + "grad_norm": 0.6482608914375305, + "learning_rate": 0.0001907932096900864, + "loss": 2.1972, + "step": 1529 + }, + { + "epoch": 0.1386152069035809, + "grad_norm": 0.6994456648826599, + "learning_rate": 0.00019078716848909563, + "loss": 2.897, + "step": 1530 + }, + { + "epoch": 0.13870580507802768, + "grad_norm": 0.7326186895370483, + "learning_rate": 0.0001907811272881049, + "loss": 2.7438, + "step": 1531 + }, + { + "epoch": 0.13879640325247447, + "grad_norm": 0.704274594783783, + "learning_rate": 0.00019077508608711412, + "loss": 3.0348, + "step": 1532 + }, + { + "epoch": 0.13888700142692126, + "grad_norm": 0.6791133880615234, + "learning_rate": 0.00019076904488612336, + "loss": 2.9509, + "step": 1533 + }, + { + "epoch": 0.13897759960136805, + "grad_norm": 0.7161508202552795, + "learning_rate": 0.00019076300368513262, + "loss": 3.111, + "step": 1534 + }, + { + "epoch": 0.1390681977758148, + "grad_norm": 0.6044840812683105, + "learning_rate": 0.00019075696248414185, + "loss": 2.2405, + "step": 1535 + }, + { + "epoch": 0.1391587959502616, + "grad_norm": 0.7379668951034546, + "learning_rate": 0.00019075092128315111, + "loss": 3.0498, + "step": 1536 + }, + { + "epoch": 0.13924939412470838, + "grad_norm": 0.6940715312957764, + "learning_rate": 0.00019074488008216035, + "loss": 2.7497, + "step": 1537 + }, + { + "epoch": 0.13933999229915517, + "grad_norm": 0.7267511487007141, + "learning_rate": 0.00019073883888116958, + "loss": 2.6753, + "step": 1538 + }, + { + "epoch": 0.13943059047360196, + "grad_norm": 0.8068050742149353, + "learning_rate": 0.00019073279768017882, + "loss": 2.8203, + "step": 1539 + }, + { + "epoch": 0.13952118864804874, + "grad_norm": 0.710664689540863, + "learning_rate": 0.00019072675647918808, + "loss": 2.7692, + "step": 1540 + }, + { + "epoch": 0.13961178682249553, + "grad_norm": 0.6728500723838806, + "learning_rate": 0.0001907207152781973, + "loss": 2.7163, + "step": 1541 + }, + { + "epoch": 0.13970238499694232, + "grad_norm": 0.5898039937019348, + "learning_rate": 0.00019071467407720655, + "loss": 2.0811, + "step": 1542 + }, + { + "epoch": 0.1397929831713891, + "grad_norm": 0.6768251657485962, + "learning_rate": 0.0001907086328762158, + "loss": 2.7783, + "step": 1543 + }, + { + "epoch": 0.1398835813458359, + "grad_norm": 0.7494379878044128, + "learning_rate": 0.00019070259167522504, + "loss": 2.9937, + "step": 1544 + }, + { + "epoch": 0.13997417952028265, + "grad_norm": 0.7537257671356201, + "learning_rate": 0.0001906965504742343, + "loss": 3.1641, + "step": 1545 + }, + { + "epoch": 0.14006477769472944, + "grad_norm": 0.6703888773918152, + "learning_rate": 0.00019069050927324354, + "loss": 2.2155, + "step": 1546 + }, + { + "epoch": 0.14015537586917623, + "grad_norm": 0.6872087121009827, + "learning_rate": 0.00019068446807225277, + "loss": 2.6971, + "step": 1547 + }, + { + "epoch": 0.14024597404362302, + "grad_norm": 0.7250022888183594, + "learning_rate": 0.000190678426871262, + "loss": 3.129, + "step": 1548 + }, + { + "epoch": 0.1403365722180698, + "grad_norm": 0.7330519556999207, + "learning_rate": 0.00019067238567027127, + "loss": 2.8971, + "step": 1549 + }, + { + "epoch": 0.1404271703925166, + "grad_norm": 0.6485843658447266, + "learning_rate": 0.0001906663444692805, + "loss": 2.7295, + "step": 1550 + }, + { + "epoch": 0.14051776856696338, + "grad_norm": 0.756965160369873, + "learning_rate": 0.00019066030326828973, + "loss": 2.9218, + "step": 1551 + }, + { + "epoch": 0.14060836674141017, + "grad_norm": 0.6893330812454224, + "learning_rate": 0.000190654262067299, + "loss": 3.0816, + "step": 1552 + }, + { + "epoch": 0.14069896491585696, + "grad_norm": 0.7178177237510681, + "learning_rate": 0.00019064822086630823, + "loss": 2.7062, + "step": 1553 + }, + { + "epoch": 0.14078956309030374, + "grad_norm": 0.7377982139587402, + "learning_rate": 0.0001906421796653175, + "loss": 2.8157, + "step": 1554 + }, + { + "epoch": 0.1408801612647505, + "grad_norm": 0.7180757522583008, + "learning_rate": 0.0001906361384643267, + "loss": 2.8662, + "step": 1555 + }, + { + "epoch": 0.1409707594391973, + "grad_norm": 0.7748134136199951, + "learning_rate": 0.00019063009726333596, + "loss": 3.0629, + "step": 1556 + }, + { + "epoch": 0.14106135761364408, + "grad_norm": 0.693242609500885, + "learning_rate": 0.00019062405606234522, + "loss": 2.693, + "step": 1557 + }, + { + "epoch": 0.14115195578809087, + "grad_norm": 0.6820663809776306, + "learning_rate": 0.00019061801486135445, + "loss": 2.6684, + "step": 1558 + }, + { + "epoch": 0.14124255396253765, + "grad_norm": 0.6658575534820557, + "learning_rate": 0.0001906119736603637, + "loss": 3.0241, + "step": 1559 + }, + { + "epoch": 0.14133315213698444, + "grad_norm": 0.670298159122467, + "learning_rate": 0.00019060593245937292, + "loss": 2.8222, + "step": 1560 + }, + { + "epoch": 0.14142375031143123, + "grad_norm": 0.6854199171066284, + "learning_rate": 0.00019059989125838218, + "loss": 2.9204, + "step": 1561 + }, + { + "epoch": 0.14151434848587802, + "grad_norm": 0.6936894059181213, + "learning_rate": 0.00019059385005739142, + "loss": 2.8022, + "step": 1562 + }, + { + "epoch": 0.1416049466603248, + "grad_norm": 0.7043157815933228, + "learning_rate": 0.00019058780885640065, + "loss": 3.0285, + "step": 1563 + }, + { + "epoch": 0.1416955448347716, + "grad_norm": 0.7862153053283691, + "learning_rate": 0.0001905817676554099, + "loss": 3.146, + "step": 1564 + }, + { + "epoch": 0.14178614300921835, + "grad_norm": 0.6594341993331909, + "learning_rate": 0.00019057572645441915, + "loss": 2.7453, + "step": 1565 + }, + { + "epoch": 0.14187674118366514, + "grad_norm": 0.7149715423583984, + "learning_rate": 0.0001905696852534284, + "loss": 2.7663, + "step": 1566 + }, + { + "epoch": 0.14196733935811193, + "grad_norm": 0.6360280513763428, + "learning_rate": 0.00019056364405243764, + "loss": 2.6039, + "step": 1567 + }, + { + "epoch": 0.14205793753255871, + "grad_norm": 0.7271155714988708, + "learning_rate": 0.00019055760285144688, + "loss": 2.835, + "step": 1568 + }, + { + "epoch": 0.1421485357070055, + "grad_norm": 0.7327834963798523, + "learning_rate": 0.0001905515616504561, + "loss": 3.1007, + "step": 1569 + }, + { + "epoch": 0.1422391338814523, + "grad_norm": 0.7146656513214111, + "learning_rate": 0.00019054552044946537, + "loss": 2.3572, + "step": 1570 + }, + { + "epoch": 0.14232973205589908, + "grad_norm": 0.6446316838264465, + "learning_rate": 0.0001905394792484746, + "loss": 2.8499, + "step": 1571 + }, + { + "epoch": 0.14242033023034587, + "grad_norm": 0.6375436186790466, + "learning_rate": 0.00019053343804748384, + "loss": 2.2688, + "step": 1572 + }, + { + "epoch": 0.14251092840479265, + "grad_norm": 0.6934496164321899, + "learning_rate": 0.0001905273968464931, + "loss": 2.9066, + "step": 1573 + }, + { + "epoch": 0.14260152657923944, + "grad_norm": 0.6941413283348083, + "learning_rate": 0.00019052135564550233, + "loss": 2.7875, + "step": 1574 + }, + { + "epoch": 0.1426921247536862, + "grad_norm": 0.6505498886108398, + "learning_rate": 0.0001905153144445116, + "loss": 2.6833, + "step": 1575 + }, + { + "epoch": 0.142782722928133, + "grad_norm": 0.6393505930900574, + "learning_rate": 0.0001905092732435208, + "loss": 2.6357, + "step": 1576 + }, + { + "epoch": 0.14287332110257978, + "grad_norm": 0.772020697593689, + "learning_rate": 0.00019050323204253006, + "loss": 2.8824, + "step": 1577 + }, + { + "epoch": 0.14296391927702656, + "grad_norm": 0.6423460245132446, + "learning_rate": 0.0001904971908415393, + "loss": 2.7056, + "step": 1578 + }, + { + "epoch": 0.14305451745147335, + "grad_norm": 0.7087236046791077, + "learning_rate": 0.00019049114964054856, + "loss": 2.7529, + "step": 1579 + }, + { + "epoch": 0.14314511562592014, + "grad_norm": 0.6716104745864868, + "learning_rate": 0.0001904851084395578, + "loss": 2.2614, + "step": 1580 + }, + { + "epoch": 0.14323571380036693, + "grad_norm": 0.6607643961906433, + "learning_rate": 0.00019047906723856703, + "loss": 2.697, + "step": 1581 + }, + { + "epoch": 0.1433263119748137, + "grad_norm": 0.6719159483909607, + "learning_rate": 0.0001904730260375763, + "loss": 2.7719, + "step": 1582 + }, + { + "epoch": 0.1434169101492605, + "grad_norm": 0.7394698858261108, + "learning_rate": 0.00019046698483658552, + "loss": 2.937, + "step": 1583 + }, + { + "epoch": 0.1435075083237073, + "grad_norm": 0.697624921798706, + "learning_rate": 0.00019046094363559476, + "loss": 2.8148, + "step": 1584 + }, + { + "epoch": 0.14359810649815405, + "grad_norm": 0.7373396158218384, + "learning_rate": 0.000190454902434604, + "loss": 3.0661, + "step": 1585 + }, + { + "epoch": 0.14368870467260084, + "grad_norm": 0.6497548222541809, + "learning_rate": 0.00019044886123361325, + "loss": 2.2897, + "step": 1586 + }, + { + "epoch": 0.14377930284704762, + "grad_norm": 0.7084668874740601, + "learning_rate": 0.0001904428200326225, + "loss": 2.6986, + "step": 1587 + }, + { + "epoch": 0.1438699010214944, + "grad_norm": 0.6802512407302856, + "learning_rate": 0.00019043677883163175, + "loss": 2.7469, + "step": 1588 + }, + { + "epoch": 0.1439604991959412, + "grad_norm": 0.6687962412834167, + "learning_rate": 0.00019043073763064098, + "loss": 2.8637, + "step": 1589 + }, + { + "epoch": 0.144051097370388, + "grad_norm": 0.6368609070777893, + "learning_rate": 0.00019042469642965021, + "loss": 2.6582, + "step": 1590 + }, + { + "epoch": 0.14414169554483477, + "grad_norm": 0.7223345041275024, + "learning_rate": 0.00019041865522865948, + "loss": 1.8543, + "step": 1591 + }, + { + "epoch": 0.14423229371928156, + "grad_norm": 0.7248435020446777, + "learning_rate": 0.0001904126140276687, + "loss": 2.6414, + "step": 1592 + }, + { + "epoch": 0.14432289189372835, + "grad_norm": 0.6934962868690491, + "learning_rate": 0.00019040657282667794, + "loss": 2.8484, + "step": 1593 + }, + { + "epoch": 0.14441349006817514, + "grad_norm": 0.7284290790557861, + "learning_rate": 0.0001904005316256872, + "loss": 2.8401, + "step": 1594 + }, + { + "epoch": 0.1445040882426219, + "grad_norm": 0.7388444542884827, + "learning_rate": 0.00019039449042469644, + "loss": 2.7963, + "step": 1595 + }, + { + "epoch": 0.14459468641706869, + "grad_norm": 0.6952456831932068, + "learning_rate": 0.0001903884492237057, + "loss": 2.6728, + "step": 1596 + }, + { + "epoch": 0.14468528459151547, + "grad_norm": 0.7061296105384827, + "learning_rate": 0.0001903824080227149, + "loss": 2.989, + "step": 1597 + }, + { + "epoch": 0.14477588276596226, + "grad_norm": 0.6693350672721863, + "learning_rate": 0.00019037636682172417, + "loss": 2.6282, + "step": 1598 + }, + { + "epoch": 0.14486648094040905, + "grad_norm": 0.7078139185905457, + "learning_rate": 0.0001903703256207334, + "loss": 2.9667, + "step": 1599 + }, + { + "epoch": 0.14495707911485584, + "grad_norm": 0.7486432194709778, + "learning_rate": 0.00019036428441974266, + "loss": 1.9717, + "step": 1600 + }, + { + "epoch": 0.14504767728930262, + "grad_norm": 0.6460002064704895, + "learning_rate": 0.0001903582432187519, + "loss": 2.6996, + "step": 1601 + }, + { + "epoch": 0.1451382754637494, + "grad_norm": 0.7626601457595825, + "learning_rate": 0.00019035220201776113, + "loss": 2.764, + "step": 1602 + }, + { + "epoch": 0.1452288736381962, + "grad_norm": 0.696399986743927, + "learning_rate": 0.0001903461608167704, + "loss": 2.9938, + "step": 1603 + }, + { + "epoch": 0.145319471812643, + "grad_norm": 0.6365994215011597, + "learning_rate": 0.00019034011961577963, + "loss": 2.8426, + "step": 1604 + }, + { + "epoch": 0.14541006998708977, + "grad_norm": 0.6798645853996277, + "learning_rate": 0.0001903340784147889, + "loss": 2.8169, + "step": 1605 + }, + { + "epoch": 0.14550066816153653, + "grad_norm": 0.713316798210144, + "learning_rate": 0.0001903280372137981, + "loss": 2.7156, + "step": 1606 + }, + { + "epoch": 0.14559126633598332, + "grad_norm": 0.7810934782028198, + "learning_rate": 0.00019032199601280736, + "loss": 3.0199, + "step": 1607 + }, + { + "epoch": 0.1456818645104301, + "grad_norm": 0.6829586029052734, + "learning_rate": 0.0001903159548118166, + "loss": 2.7717, + "step": 1608 + }, + { + "epoch": 0.1457724626848769, + "grad_norm": 0.6983271241188049, + "learning_rate": 0.00019030991361082585, + "loss": 2.8905, + "step": 1609 + }, + { + "epoch": 0.14586306085932368, + "grad_norm": 0.6581229567527771, + "learning_rate": 0.00019030387240983508, + "loss": 2.225, + "step": 1610 + }, + { + "epoch": 0.14595365903377047, + "grad_norm": 0.6649163365364075, + "learning_rate": 0.00019029783120884432, + "loss": 2.9873, + "step": 1611 + }, + { + "epoch": 0.14604425720821726, + "grad_norm": 0.7060375213623047, + "learning_rate": 0.00019029179000785358, + "loss": 2.9676, + "step": 1612 + }, + { + "epoch": 0.14613485538266405, + "grad_norm": 0.6610205769538879, + "learning_rate": 0.00019028574880686281, + "loss": 2.8252, + "step": 1613 + }, + { + "epoch": 0.14622545355711083, + "grad_norm": 0.7069781422615051, + "learning_rate": 0.00019027970760587205, + "loss": 2.7508, + "step": 1614 + }, + { + "epoch": 0.14631605173155762, + "grad_norm": 0.6865363121032715, + "learning_rate": 0.00019027366640488128, + "loss": 2.8427, + "step": 1615 + }, + { + "epoch": 0.14640664990600438, + "grad_norm": 0.6937883496284485, + "learning_rate": 0.00019026762520389054, + "loss": 2.9477, + "step": 1616 + }, + { + "epoch": 0.14649724808045117, + "grad_norm": 0.6637274026870728, + "learning_rate": 0.0001902615840028998, + "loss": 2.0678, + "step": 1617 + }, + { + "epoch": 0.14658784625489796, + "grad_norm": 0.6765585541725159, + "learning_rate": 0.00019025554280190904, + "loss": 3.048, + "step": 1618 + }, + { + "epoch": 0.14667844442934475, + "grad_norm": 0.7054051756858826, + "learning_rate": 0.00019024950160091827, + "loss": 2.9535, + "step": 1619 + }, + { + "epoch": 0.14676904260379153, + "grad_norm": 0.6322129368782043, + "learning_rate": 0.0001902434603999275, + "loss": 2.475, + "step": 1620 + }, + { + "epoch": 0.14685964077823832, + "grad_norm": 0.8007577061653137, + "learning_rate": 0.00019023741919893677, + "loss": 2.9498, + "step": 1621 + }, + { + "epoch": 0.1469502389526851, + "grad_norm": 0.6569220423698425, + "learning_rate": 0.000190231377997946, + "loss": 2.6564, + "step": 1622 + }, + { + "epoch": 0.1470408371271319, + "grad_norm": 0.5933565497398376, + "learning_rate": 0.00019022533679695524, + "loss": 2.2174, + "step": 1623 + }, + { + "epoch": 0.14713143530157868, + "grad_norm": 0.6881754398345947, + "learning_rate": 0.0001902192955959645, + "loss": 2.6503, + "step": 1624 + }, + { + "epoch": 0.14722203347602547, + "grad_norm": 0.6914350986480713, + "learning_rate": 0.00019021325439497373, + "loss": 2.1927, + "step": 1625 + }, + { + "epoch": 0.14731263165047223, + "grad_norm": 0.7576904296875, + "learning_rate": 0.000190207213193983, + "loss": 3.1783, + "step": 1626 + }, + { + "epoch": 0.14740322982491902, + "grad_norm": 0.6308307647705078, + "learning_rate": 0.0001902011719929922, + "loss": 2.7074, + "step": 1627 + }, + { + "epoch": 0.1474938279993658, + "grad_norm": 0.6930027008056641, + "learning_rate": 0.00019019513079200146, + "loss": 3.0252, + "step": 1628 + }, + { + "epoch": 0.1475844261738126, + "grad_norm": 0.7236186265945435, + "learning_rate": 0.0001901890895910107, + "loss": 2.7227, + "step": 1629 + }, + { + "epoch": 0.14767502434825938, + "grad_norm": 0.6629323959350586, + "learning_rate": 0.00019018304839001996, + "loss": 2.7408, + "step": 1630 + }, + { + "epoch": 0.14776562252270617, + "grad_norm": 0.6689577102661133, + "learning_rate": 0.0001901770071890292, + "loss": 2.5894, + "step": 1631 + }, + { + "epoch": 0.14785622069715296, + "grad_norm": 0.74839186668396, + "learning_rate": 0.00019017096598803842, + "loss": 2.8919, + "step": 1632 + }, + { + "epoch": 0.14794681887159974, + "grad_norm": 0.655875027179718, + "learning_rate": 0.00019016492478704768, + "loss": 2.6992, + "step": 1633 + }, + { + "epoch": 0.14803741704604653, + "grad_norm": 0.664948046207428, + "learning_rate": 0.00019015888358605692, + "loss": 2.8606, + "step": 1634 + }, + { + "epoch": 0.14812801522049332, + "grad_norm": 0.6432386636734009, + "learning_rate": 0.00019015284238506615, + "loss": 2.6892, + "step": 1635 + }, + { + "epoch": 0.14821861339494008, + "grad_norm": 0.6763776540756226, + "learning_rate": 0.0001901468011840754, + "loss": 2.8775, + "step": 1636 + }, + { + "epoch": 0.14830921156938687, + "grad_norm": 0.6713539958000183, + "learning_rate": 0.00019014075998308465, + "loss": 2.6895, + "step": 1637 + }, + { + "epoch": 0.14839980974383365, + "grad_norm": 0.6772746443748474, + "learning_rate": 0.00019013471878209388, + "loss": 3.0224, + "step": 1638 + }, + { + "epoch": 0.14849040791828044, + "grad_norm": 0.6998366117477417, + "learning_rate": 0.00019012867758110314, + "loss": 2.9473, + "step": 1639 + }, + { + "epoch": 0.14858100609272723, + "grad_norm": 0.694682240486145, + "learning_rate": 0.00019012263638011238, + "loss": 2.9818, + "step": 1640 + }, + { + "epoch": 0.14867160426717402, + "grad_norm": 0.6737142205238342, + "learning_rate": 0.0001901165951791216, + "loss": 2.8393, + "step": 1641 + }, + { + "epoch": 0.1487622024416208, + "grad_norm": 0.6908707618713379, + "learning_rate": 0.00019011055397813087, + "loss": 2.7105, + "step": 1642 + }, + { + "epoch": 0.1488528006160676, + "grad_norm": 0.6496729254722595, + "learning_rate": 0.0001901045127771401, + "loss": 2.8457, + "step": 1643 + }, + { + "epoch": 0.14894339879051438, + "grad_norm": 0.662663996219635, + "learning_rate": 0.00019009847157614934, + "loss": 2.9385, + "step": 1644 + }, + { + "epoch": 0.14903399696496117, + "grad_norm": 0.6543089747428894, + "learning_rate": 0.00019009243037515857, + "loss": 2.1585, + "step": 1645 + }, + { + "epoch": 0.14912459513940793, + "grad_norm": 0.7752383947372437, + "learning_rate": 0.00019008638917416784, + "loss": 2.9229, + "step": 1646 + }, + { + "epoch": 0.14921519331385472, + "grad_norm": 0.7101108431816101, + "learning_rate": 0.0001900803479731771, + "loss": 2.9425, + "step": 1647 + }, + { + "epoch": 0.1493057914883015, + "grad_norm": 0.6801576614379883, + "learning_rate": 0.0001900743067721863, + "loss": 2.8103, + "step": 1648 + }, + { + "epoch": 0.1493963896627483, + "grad_norm": 0.7037857174873352, + "learning_rate": 0.00019006826557119557, + "loss": 2.8297, + "step": 1649 + }, + { + "epoch": 0.14948698783719508, + "grad_norm": 0.650145411491394, + "learning_rate": 0.0001900622243702048, + "loss": 2.7996, + "step": 1650 + }, + { + "epoch": 0.14957758601164187, + "grad_norm": 0.6904132962226868, + "learning_rate": 0.00019005618316921406, + "loss": 3.0433, + "step": 1651 + }, + { + "epoch": 0.14966818418608865, + "grad_norm": 0.6708207130432129, + "learning_rate": 0.0001900501419682233, + "loss": 2.9319, + "step": 1652 + }, + { + "epoch": 0.14975878236053544, + "grad_norm": 0.6960120797157288, + "learning_rate": 0.00019004410076723253, + "loss": 2.5092, + "step": 1653 + }, + { + "epoch": 0.14984938053498223, + "grad_norm": 0.6680272817611694, + "learning_rate": 0.0001900380595662418, + "loss": 2.9139, + "step": 1654 + }, + { + "epoch": 0.14993997870942902, + "grad_norm": 0.6698387265205383, + "learning_rate": 0.00019003201836525102, + "loss": 2.7552, + "step": 1655 + }, + { + "epoch": 0.15003057688387578, + "grad_norm": 0.728391170501709, + "learning_rate": 0.00019002597716426029, + "loss": 2.8851, + "step": 1656 + }, + { + "epoch": 0.15012117505832256, + "grad_norm": 0.6896248459815979, + "learning_rate": 0.0001900199359632695, + "loss": 2.6504, + "step": 1657 + }, + { + "epoch": 0.15021177323276935, + "grad_norm": 0.6738779544830322, + "learning_rate": 0.00019001389476227875, + "loss": 2.6354, + "step": 1658 + }, + { + "epoch": 0.15030237140721614, + "grad_norm": 0.6925609707832336, + "learning_rate": 0.000190007853561288, + "loss": 2.7663, + "step": 1659 + }, + { + "epoch": 0.15039296958166293, + "grad_norm": 0.739187479019165, + "learning_rate": 0.00019000181236029725, + "loss": 2.8748, + "step": 1660 + }, + { + "epoch": 0.15048356775610972, + "grad_norm": 0.6933877468109131, + "learning_rate": 0.00018999577115930646, + "loss": 2.9298, + "step": 1661 + }, + { + "epoch": 0.1505741659305565, + "grad_norm": 0.7075664401054382, + "learning_rate": 0.00018998972995831572, + "loss": 2.8864, + "step": 1662 + }, + { + "epoch": 0.1506647641050033, + "grad_norm": 0.7427495121955872, + "learning_rate": 0.00018998368875732498, + "loss": 3.0761, + "step": 1663 + }, + { + "epoch": 0.15075536227945008, + "grad_norm": 0.6787596940994263, + "learning_rate": 0.0001899776475563342, + "loss": 2.8144, + "step": 1664 + }, + { + "epoch": 0.15084596045389687, + "grad_norm": 0.7162798643112183, + "learning_rate": 0.00018997160635534345, + "loss": 2.6217, + "step": 1665 + }, + { + "epoch": 0.15093655862834363, + "grad_norm": 0.7435238361358643, + "learning_rate": 0.00018996556515435268, + "loss": 2.0553, + "step": 1666 + }, + { + "epoch": 0.1510271568027904, + "grad_norm": 0.6817489862442017, + "learning_rate": 0.00018995952395336194, + "loss": 2.9657, + "step": 1667 + }, + { + "epoch": 0.1511177549772372, + "grad_norm": 0.681178867816925, + "learning_rate": 0.00018995348275237118, + "loss": 2.9239, + "step": 1668 + }, + { + "epoch": 0.151208353151684, + "grad_norm": 0.6556907892227173, + "learning_rate": 0.00018994744155138044, + "loss": 2.7199, + "step": 1669 + }, + { + "epoch": 0.15129895132613078, + "grad_norm": 0.6986088156700134, + "learning_rate": 0.00018994140035038967, + "loss": 2.8994, + "step": 1670 + }, + { + "epoch": 0.15138954950057756, + "grad_norm": 0.6822662353515625, + "learning_rate": 0.0001899353591493989, + "loss": 2.7743, + "step": 1671 + }, + { + "epoch": 0.15148014767502435, + "grad_norm": 0.7058797478675842, + "learning_rate": 0.00018992931794840817, + "loss": 2.8534, + "step": 1672 + }, + { + "epoch": 0.15157074584947114, + "grad_norm": 0.6669792532920837, + "learning_rate": 0.0001899232767474174, + "loss": 2.935, + "step": 1673 + }, + { + "epoch": 0.15166134402391793, + "grad_norm": 0.7040063142776489, + "learning_rate": 0.00018991723554642663, + "loss": 2.8396, + "step": 1674 + }, + { + "epoch": 0.15175194219836471, + "grad_norm": 0.6920583844184875, + "learning_rate": 0.00018991119434543587, + "loss": 2.9386, + "step": 1675 + }, + { + "epoch": 0.15184254037281147, + "grad_norm": 0.7210387587547302, + "learning_rate": 0.00018990515314444513, + "loss": 2.939, + "step": 1676 + }, + { + "epoch": 0.15193313854725826, + "grad_norm": 0.6854342222213745, + "learning_rate": 0.0001898991119434544, + "loss": 3.206, + "step": 1677 + }, + { + "epoch": 0.15202373672170505, + "grad_norm": 0.6671371459960938, + "learning_rate": 0.0001898930707424636, + "loss": 3.3565, + "step": 1678 + }, + { + "epoch": 0.15211433489615184, + "grad_norm": 0.7380814552307129, + "learning_rate": 0.00018988702954147286, + "loss": 2.9844, + "step": 1679 + }, + { + "epoch": 0.15220493307059862, + "grad_norm": 0.6474877595901489, + "learning_rate": 0.0001898809883404821, + "loss": 2.7055, + "step": 1680 + }, + { + "epoch": 0.1522955312450454, + "grad_norm": 0.6690481305122375, + "learning_rate": 0.00018987494713949135, + "loss": 2.6023, + "step": 1681 + }, + { + "epoch": 0.1523861294194922, + "grad_norm": 0.6402303576469421, + "learning_rate": 0.0001898689059385006, + "loss": 2.8177, + "step": 1682 + }, + { + "epoch": 0.152476727593939, + "grad_norm": 0.6863406896591187, + "learning_rate": 0.00018986286473750982, + "loss": 2.9002, + "step": 1683 + }, + { + "epoch": 0.15256732576838578, + "grad_norm": 0.7753783464431763, + "learning_rate": 0.00018985682353651908, + "loss": 2.6653, + "step": 1684 + }, + { + "epoch": 0.15265792394283256, + "grad_norm": 0.6653502583503723, + "learning_rate": 0.00018985078233552832, + "loss": 2.8185, + "step": 1685 + }, + { + "epoch": 0.15274852211727935, + "grad_norm": 0.6846883893013, + "learning_rate": 0.00018984474113453755, + "loss": 2.8157, + "step": 1686 + }, + { + "epoch": 0.1528391202917261, + "grad_norm": 0.6993995308876038, + "learning_rate": 0.00018983869993354678, + "loss": 2.7938, + "step": 1687 + }, + { + "epoch": 0.1529297184661729, + "grad_norm": 0.7665882706642151, + "learning_rate": 0.00018983265873255605, + "loss": 3.0066, + "step": 1688 + }, + { + "epoch": 0.15302031664061969, + "grad_norm": 0.6328266263008118, + "learning_rate": 0.00018982661753156528, + "loss": 2.7033, + "step": 1689 + }, + { + "epoch": 0.15311091481506647, + "grad_norm": 0.6965563297271729, + "learning_rate": 0.00018982057633057454, + "loss": 2.8967, + "step": 1690 + }, + { + "epoch": 0.15320151298951326, + "grad_norm": 0.7262848019599915, + "learning_rate": 0.00018981453512958375, + "loss": 2.8408, + "step": 1691 + }, + { + "epoch": 0.15329211116396005, + "grad_norm": 0.6925971508026123, + "learning_rate": 0.000189808493928593, + "loss": 2.75, + "step": 1692 + }, + { + "epoch": 0.15338270933840684, + "grad_norm": 0.7640866637229919, + "learning_rate": 0.00018980245272760227, + "loss": 2.7942, + "step": 1693 + }, + { + "epoch": 0.15347330751285362, + "grad_norm": 0.7492073178291321, + "learning_rate": 0.0001897964115266115, + "loss": 3.0224, + "step": 1694 + }, + { + "epoch": 0.1535639056873004, + "grad_norm": 0.7067092061042786, + "learning_rate": 0.00018979037032562074, + "loss": 2.818, + "step": 1695 + }, + { + "epoch": 0.1536545038617472, + "grad_norm": 0.726138710975647, + "learning_rate": 0.00018978432912462997, + "loss": 3.0551, + "step": 1696 + }, + { + "epoch": 0.15374510203619396, + "grad_norm": 0.6872488856315613, + "learning_rate": 0.00018977828792363923, + "loss": 2.5486, + "step": 1697 + }, + { + "epoch": 0.15383570021064075, + "grad_norm": 0.711746096611023, + "learning_rate": 0.00018977224672264847, + "loss": 2.8617, + "step": 1698 + }, + { + "epoch": 0.15392629838508753, + "grad_norm": 0.6629757881164551, + "learning_rate": 0.0001897662055216577, + "loss": 2.9677, + "step": 1699 + }, + { + "epoch": 0.15401689655953432, + "grad_norm": 0.6929475665092468, + "learning_rate": 0.00018976016432066696, + "loss": 2.7371, + "step": 1700 + }, + { + "epoch": 0.1541074947339811, + "grad_norm": 0.6942415237426758, + "learning_rate": 0.0001897541231196762, + "loss": 2.8605, + "step": 1701 + }, + { + "epoch": 0.1541980929084279, + "grad_norm": 0.6826038360595703, + "learning_rate": 0.00018974808191868546, + "loss": 2.6838, + "step": 1702 + }, + { + "epoch": 0.15428869108287468, + "grad_norm": 0.6763452887535095, + "learning_rate": 0.0001897420407176947, + "loss": 2.9689, + "step": 1703 + }, + { + "epoch": 0.15437928925732147, + "grad_norm": 0.6559908390045166, + "learning_rate": 0.00018973599951670393, + "loss": 2.6835, + "step": 1704 + }, + { + "epoch": 0.15446988743176826, + "grad_norm": 0.6853094696998596, + "learning_rate": 0.00018972995831571316, + "loss": 2.5368, + "step": 1705 + }, + { + "epoch": 0.15456048560621505, + "grad_norm": 0.7099838852882385, + "learning_rate": 0.00018972391711472242, + "loss": 3.3609, + "step": 1706 + }, + { + "epoch": 0.1546510837806618, + "grad_norm": 0.6758812665939331, + "learning_rate": 0.00018971787591373166, + "loss": 2.8703, + "step": 1707 + }, + { + "epoch": 0.1547416819551086, + "grad_norm": 0.7032632827758789, + "learning_rate": 0.0001897118347127409, + "loss": 2.6957, + "step": 1708 + }, + { + "epoch": 0.15483228012955538, + "grad_norm": 0.6624237298965454, + "learning_rate": 0.00018970579351175015, + "loss": 2.655, + "step": 1709 + }, + { + "epoch": 0.15492287830400217, + "grad_norm": 0.6892712712287903, + "learning_rate": 0.00018969975231075938, + "loss": 3.0626, + "step": 1710 + }, + { + "epoch": 0.15501347647844896, + "grad_norm": 0.6768119931221008, + "learning_rate": 0.00018969371110976865, + "loss": 2.8265, + "step": 1711 + }, + { + "epoch": 0.15510407465289575, + "grad_norm": 0.7069239616394043, + "learning_rate": 0.00018968766990877785, + "loss": 2.9924, + "step": 1712 + }, + { + "epoch": 0.15519467282734253, + "grad_norm": 0.6960119009017944, + "learning_rate": 0.00018968162870778711, + "loss": 2.7089, + "step": 1713 + }, + { + "epoch": 0.15528527100178932, + "grad_norm": 0.672008216381073, + "learning_rate": 0.00018967558750679638, + "loss": 2.8106, + "step": 1714 + }, + { + "epoch": 0.1553758691762361, + "grad_norm": 0.6970182061195374, + "learning_rate": 0.0001896695463058056, + "loss": 2.8075, + "step": 1715 + }, + { + "epoch": 0.1554664673506829, + "grad_norm": 0.58013516664505, + "learning_rate": 0.00018966350510481484, + "loss": 1.9767, + "step": 1716 + }, + { + "epoch": 0.15555706552512966, + "grad_norm": 0.6679020524024963, + "learning_rate": 0.00018965746390382408, + "loss": 2.8796, + "step": 1717 + }, + { + "epoch": 0.15564766369957644, + "grad_norm": 0.6676114201545715, + "learning_rate": 0.00018965142270283334, + "loss": 2.8911, + "step": 1718 + }, + { + "epoch": 0.15573826187402323, + "grad_norm": 0.6531954407691956, + "learning_rate": 0.00018964538150184257, + "loss": 2.9759, + "step": 1719 + }, + { + "epoch": 0.15582886004847002, + "grad_norm": 0.670265793800354, + "learning_rate": 0.0001896393403008518, + "loss": 2.747, + "step": 1720 + }, + { + "epoch": 0.1559194582229168, + "grad_norm": 0.6917432546615601, + "learning_rate": 0.00018963329909986107, + "loss": 2.6913, + "step": 1721 + }, + { + "epoch": 0.1560100563973636, + "grad_norm": 0.7170355916023254, + "learning_rate": 0.0001896272578988703, + "loss": 2.9933, + "step": 1722 + }, + { + "epoch": 0.15610065457181038, + "grad_norm": 0.6713494658470154, + "learning_rate": 0.00018962121669787956, + "loss": 2.8074, + "step": 1723 + }, + { + "epoch": 0.15619125274625717, + "grad_norm": 0.6634004712104797, + "learning_rate": 0.0001896151754968888, + "loss": 2.6011, + "step": 1724 + }, + { + "epoch": 0.15628185092070396, + "grad_norm": 0.7131709456443787, + "learning_rate": 0.00018960913429589803, + "loss": 3.1077, + "step": 1725 + }, + { + "epoch": 0.15637244909515075, + "grad_norm": 0.666289210319519, + "learning_rate": 0.00018960309309490727, + "loss": 2.7972, + "step": 1726 + }, + { + "epoch": 0.1564630472695975, + "grad_norm": 0.6996729969978333, + "learning_rate": 0.00018959705189391653, + "loss": 2.9526, + "step": 1727 + }, + { + "epoch": 0.1565536454440443, + "grad_norm": 0.6931668519973755, + "learning_rate": 0.00018959101069292576, + "loss": 2.8751, + "step": 1728 + }, + { + "epoch": 0.15664424361849108, + "grad_norm": 0.7031682133674622, + "learning_rate": 0.000189584969491935, + "loss": 2.9752, + "step": 1729 + }, + { + "epoch": 0.15673484179293787, + "grad_norm": 0.687158465385437, + "learning_rate": 0.00018957892829094426, + "loss": 2.7632, + "step": 1730 + }, + { + "epoch": 0.15682543996738466, + "grad_norm": 0.7157454490661621, + "learning_rate": 0.0001895728870899535, + "loss": 3.065, + "step": 1731 + }, + { + "epoch": 0.15691603814183144, + "grad_norm": 0.6771743297576904, + "learning_rate": 0.00018956684588896275, + "loss": 2.0768, + "step": 1732 + }, + { + "epoch": 0.15700663631627823, + "grad_norm": 0.6970723271369934, + "learning_rate": 0.00018956080468797196, + "loss": 2.7931, + "step": 1733 + }, + { + "epoch": 0.15709723449072502, + "grad_norm": 0.6973064541816711, + "learning_rate": 0.00018955476348698122, + "loss": 3.2304, + "step": 1734 + }, + { + "epoch": 0.1571878326651718, + "grad_norm": 0.6570242643356323, + "learning_rate": 0.00018954872228599045, + "loss": 2.8382, + "step": 1735 + }, + { + "epoch": 0.1572784308396186, + "grad_norm": 0.6950442790985107, + "learning_rate": 0.00018954268108499971, + "loss": 2.93, + "step": 1736 + }, + { + "epoch": 0.15736902901406535, + "grad_norm": 0.6587425470352173, + "learning_rate": 0.00018953663988400895, + "loss": 2.9971, + "step": 1737 + }, + { + "epoch": 0.15745962718851214, + "grad_norm": 0.6628928780555725, + "learning_rate": 0.00018953059868301818, + "loss": 2.474, + "step": 1738 + }, + { + "epoch": 0.15755022536295893, + "grad_norm": 0.6766679883003235, + "learning_rate": 0.00018952455748202744, + "loss": 2.9844, + "step": 1739 + }, + { + "epoch": 0.15764082353740572, + "grad_norm": 0.6573672294616699, + "learning_rate": 0.00018951851628103668, + "loss": 2.7149, + "step": 1740 + }, + { + "epoch": 0.1577314217118525, + "grad_norm": 0.5972061157226562, + "learning_rate": 0.00018951247508004594, + "loss": 2.0007, + "step": 1741 + }, + { + "epoch": 0.1578220198862993, + "grad_norm": 0.6948111653327942, + "learning_rate": 0.00018950643387905515, + "loss": 2.8829, + "step": 1742 + }, + { + "epoch": 0.15791261806074608, + "grad_norm": 0.6439802050590515, + "learning_rate": 0.0001895003926780644, + "loss": 2.287, + "step": 1743 + }, + { + "epoch": 0.15800321623519287, + "grad_norm": 0.6691917777061462, + "learning_rate": 0.00018949435147707367, + "loss": 2.8082, + "step": 1744 + }, + { + "epoch": 0.15809381440963965, + "grad_norm": 0.6092406511306763, + "learning_rate": 0.0001894883102760829, + "loss": 2.6579, + "step": 1745 + }, + { + "epoch": 0.15818441258408644, + "grad_norm": 0.6819893717765808, + "learning_rate": 0.00018948226907509214, + "loss": 2.7528, + "step": 1746 + }, + { + "epoch": 0.1582750107585332, + "grad_norm": 0.6861570477485657, + "learning_rate": 0.00018947622787410137, + "loss": 3.0676, + "step": 1747 + }, + { + "epoch": 0.15836560893298, + "grad_norm": 0.7086118459701538, + "learning_rate": 0.00018947018667311063, + "loss": 3.0595, + "step": 1748 + }, + { + "epoch": 0.15845620710742678, + "grad_norm": 0.717301070690155, + "learning_rate": 0.00018946414547211987, + "loss": 2.9433, + "step": 1749 + }, + { + "epoch": 0.15854680528187357, + "grad_norm": 0.7145136594772339, + "learning_rate": 0.0001894581042711291, + "loss": 2.8179, + "step": 1750 + }, + { + "epoch": 0.15863740345632035, + "grad_norm": 0.6721550226211548, + "learning_rate": 0.00018945206307013836, + "loss": 3.1252, + "step": 1751 + }, + { + "epoch": 0.15872800163076714, + "grad_norm": 0.7204983234405518, + "learning_rate": 0.0001894460218691476, + "loss": 2.8923, + "step": 1752 + }, + { + "epoch": 0.15881859980521393, + "grad_norm": 0.5745464563369751, + "learning_rate": 0.00018943998066815686, + "loss": 2.0204, + "step": 1753 + }, + { + "epoch": 0.15890919797966072, + "grad_norm": 0.6942198276519775, + "learning_rate": 0.0001894339394671661, + "loss": 2.6257, + "step": 1754 + }, + { + "epoch": 0.1589997961541075, + "grad_norm": 0.6095190048217773, + "learning_rate": 0.00018942789826617532, + "loss": 1.9373, + "step": 1755 + }, + { + "epoch": 0.1590903943285543, + "grad_norm": 0.6722897291183472, + "learning_rate": 0.00018942185706518456, + "loss": 2.7451, + "step": 1756 + }, + { + "epoch": 0.15918099250300105, + "grad_norm": 0.6962360143661499, + "learning_rate": 0.00018941581586419382, + "loss": 2.8884, + "step": 1757 + }, + { + "epoch": 0.15927159067744784, + "grad_norm": 0.717995285987854, + "learning_rate": 0.00018940977466320305, + "loss": 2.8063, + "step": 1758 + }, + { + "epoch": 0.15936218885189463, + "grad_norm": 0.6445525884628296, + "learning_rate": 0.0001894037334622123, + "loss": 1.9745, + "step": 1759 + }, + { + "epoch": 0.1594527870263414, + "grad_norm": 0.6761828660964966, + "learning_rate": 0.00018939769226122155, + "loss": 2.7223, + "step": 1760 + }, + { + "epoch": 0.1595433852007882, + "grad_norm": 0.7058863043785095, + "learning_rate": 0.00018939165106023078, + "loss": 2.7588, + "step": 1761 + }, + { + "epoch": 0.159633983375235, + "grad_norm": 0.64863520860672, + "learning_rate": 0.00018938560985924004, + "loss": 2.688, + "step": 1762 + }, + { + "epoch": 0.15972458154968178, + "grad_norm": 0.7352786660194397, + "learning_rate": 0.00018937956865824925, + "loss": 2.8279, + "step": 1763 + }, + { + "epoch": 0.15981517972412856, + "grad_norm": 0.6746910810470581, + "learning_rate": 0.0001893735274572585, + "loss": 2.7887, + "step": 1764 + }, + { + "epoch": 0.15990577789857535, + "grad_norm": 1.855915904045105, + "learning_rate": 0.00018936748625626775, + "loss": 2.657, + "step": 1765 + }, + { + "epoch": 0.15999637607302214, + "grad_norm": 0.7373146414756775, + "learning_rate": 0.000189361445055277, + "loss": 3.2264, + "step": 1766 + }, + { + "epoch": 0.16008697424746893, + "grad_norm": 0.6798224449157715, + "learning_rate": 0.00018935540385428624, + "loss": 2.8611, + "step": 1767 + }, + { + "epoch": 0.1601775724219157, + "grad_norm": 0.6887388825416565, + "learning_rate": 0.00018934936265329547, + "loss": 2.2746, + "step": 1768 + }, + { + "epoch": 0.16026817059636247, + "grad_norm": 0.6805263757705688, + "learning_rate": 0.00018934332145230474, + "loss": 2.2208, + "step": 1769 + }, + { + "epoch": 0.16035876877080926, + "grad_norm": 0.7082176804542542, + "learning_rate": 0.00018933728025131397, + "loss": 2.9448, + "step": 1770 + }, + { + "epoch": 0.16044936694525605, + "grad_norm": 0.6636806130409241, + "learning_rate": 0.0001893312390503232, + "loss": 2.5762, + "step": 1771 + }, + { + "epoch": 0.16053996511970284, + "grad_norm": 0.7024235725402832, + "learning_rate": 0.00018932519784933244, + "loss": 3.0382, + "step": 1772 + }, + { + "epoch": 0.16063056329414963, + "grad_norm": 0.6620694994926453, + "learning_rate": 0.0001893191566483417, + "loss": 2.653, + "step": 1773 + }, + { + "epoch": 0.1607211614685964, + "grad_norm": 0.6839483976364136, + "learning_rate": 0.00018931311544735096, + "loss": 2.3704, + "step": 1774 + }, + { + "epoch": 0.1608117596430432, + "grad_norm": 0.7590746879577637, + "learning_rate": 0.0001893070742463602, + "loss": 2.7931, + "step": 1775 + }, + { + "epoch": 0.16090235781749, + "grad_norm": 0.6706879138946533, + "learning_rate": 0.00018930103304536943, + "loss": 2.9101, + "step": 1776 + }, + { + "epoch": 0.16099295599193678, + "grad_norm": 0.49140843749046326, + "learning_rate": 0.00018929499184437866, + "loss": 1.5175, + "step": 1777 + }, + { + "epoch": 0.16108355416638354, + "grad_norm": 0.7667207717895508, + "learning_rate": 0.00018928895064338792, + "loss": 2.7643, + "step": 1778 + }, + { + "epoch": 0.16117415234083032, + "grad_norm": 0.694218099117279, + "learning_rate": 0.00018928290944239716, + "loss": 3.3192, + "step": 1779 + }, + { + "epoch": 0.1612647505152771, + "grad_norm": 0.5812278389930725, + "learning_rate": 0.0001892768682414064, + "loss": 1.9506, + "step": 1780 + }, + { + "epoch": 0.1613553486897239, + "grad_norm": 0.7491797804832458, + "learning_rate": 0.00018927082704041565, + "loss": 3.1377, + "step": 1781 + }, + { + "epoch": 0.1614459468641707, + "grad_norm": 0.7408038973808289, + "learning_rate": 0.0001892647858394249, + "loss": 2.6883, + "step": 1782 + }, + { + "epoch": 0.16153654503861747, + "grad_norm": 0.8017481565475464, + "learning_rate": 0.00018925874463843415, + "loss": 2.8862, + "step": 1783 + }, + { + "epoch": 0.16162714321306426, + "grad_norm": 0.6766853928565979, + "learning_rate": 0.00018925270343744336, + "loss": 2.8383, + "step": 1784 + }, + { + "epoch": 0.16171774138751105, + "grad_norm": 0.6592667102813721, + "learning_rate": 0.00018924666223645262, + "loss": 2.9152, + "step": 1785 + }, + { + "epoch": 0.16180833956195784, + "grad_norm": 0.6909355521202087, + "learning_rate": 0.00018924062103546185, + "loss": 2.731, + "step": 1786 + }, + { + "epoch": 0.16189893773640462, + "grad_norm": 0.6764008402824402, + "learning_rate": 0.0001892345798344711, + "loss": 2.9289, + "step": 1787 + }, + { + "epoch": 0.16198953591085138, + "grad_norm": 1.9008827209472656, + "learning_rate": 0.00018922853863348035, + "loss": 2.9164, + "step": 1788 + }, + { + "epoch": 0.16208013408529817, + "grad_norm": 0.5823180079460144, + "learning_rate": 0.00018922249743248958, + "loss": 2.0512, + "step": 1789 + }, + { + "epoch": 0.16217073225974496, + "grad_norm": 0.6664131283760071, + "learning_rate": 0.00018921645623149884, + "loss": 2.9435, + "step": 1790 + }, + { + "epoch": 0.16226133043419175, + "grad_norm": 0.6832851767539978, + "learning_rate": 0.00018921041503050807, + "loss": 2.6567, + "step": 1791 + }, + { + "epoch": 0.16235192860863853, + "grad_norm": 0.6478071808815002, + "learning_rate": 0.00018920437382951734, + "loss": 2.9402, + "step": 1792 + }, + { + "epoch": 0.16244252678308532, + "grad_norm": 0.659089207649231, + "learning_rate": 0.00018919833262852654, + "loss": 2.9034, + "step": 1793 + }, + { + "epoch": 0.1625331249575321, + "grad_norm": 2.3474414348602295, + "learning_rate": 0.0001891922914275358, + "loss": 2.3801, + "step": 1794 + }, + { + "epoch": 0.1626237231319789, + "grad_norm": 0.7255907654762268, + "learning_rate": 0.00018918625022654504, + "loss": 2.907, + "step": 1795 + }, + { + "epoch": 0.16271432130642569, + "grad_norm": 0.679290235042572, + "learning_rate": 0.0001891802090255543, + "loss": 2.8413, + "step": 1796 + }, + { + "epoch": 0.16280491948087247, + "grad_norm": 0.8322634100914001, + "learning_rate": 0.00018917416782456353, + "loss": 3.0543, + "step": 1797 + }, + { + "epoch": 0.16289551765531923, + "grad_norm": 0.6304685473442078, + "learning_rate": 0.00018916812662357277, + "loss": 1.9414, + "step": 1798 + }, + { + "epoch": 0.16298611582976602, + "grad_norm": 0.7317205667495728, + "learning_rate": 0.00018916208542258203, + "loss": 2.7391, + "step": 1799 + }, + { + "epoch": 0.1630767140042128, + "grad_norm": 1.1768157482147217, + "learning_rate": 0.00018915604422159126, + "loss": 2.9036, + "step": 1800 + }, + { + "epoch": 0.1631673121786596, + "grad_norm": 0.6908194422721863, + "learning_rate": 0.0001891500030206005, + "loss": 2.9784, + "step": 1801 + }, + { + "epoch": 0.16325791035310638, + "grad_norm": 0.7176612615585327, + "learning_rate": 0.00018914396181960973, + "loss": 2.9039, + "step": 1802 + }, + { + "epoch": 0.16334850852755317, + "grad_norm": 0.6496519446372986, + "learning_rate": 0.000189137920618619, + "loss": 2.9089, + "step": 1803 + }, + { + "epoch": 0.16343910670199996, + "grad_norm": 0.6715891361236572, + "learning_rate": 0.00018913187941762825, + "loss": 2.772, + "step": 1804 + }, + { + "epoch": 0.16352970487644675, + "grad_norm": 0.6939947009086609, + "learning_rate": 0.0001891258382166375, + "loss": 2.8859, + "step": 1805 + }, + { + "epoch": 0.16362030305089353, + "grad_norm": 0.6814590096473694, + "learning_rate": 0.00018911979701564672, + "loss": 2.8774, + "step": 1806 + }, + { + "epoch": 0.16371090122534032, + "grad_norm": 0.7549017071723938, + "learning_rate": 0.00018911375581465596, + "loss": 3.0141, + "step": 1807 + }, + { + "epoch": 0.16380149939978708, + "grad_norm": 0.6557106375694275, + "learning_rate": 0.00018910771461366522, + "loss": 2.8725, + "step": 1808 + }, + { + "epoch": 0.16389209757423387, + "grad_norm": 0.6492998600006104, + "learning_rate": 0.00018910167341267445, + "loss": 2.9176, + "step": 1809 + }, + { + "epoch": 0.16398269574868066, + "grad_norm": 0.6637278199195862, + "learning_rate": 0.00018909563221168368, + "loss": 2.9164, + "step": 1810 + }, + { + "epoch": 0.16407329392312744, + "grad_norm": 0.6704312562942505, + "learning_rate": 0.00018908959101069295, + "loss": 2.7401, + "step": 1811 + }, + { + "epoch": 0.16416389209757423, + "grad_norm": 0.6618905663490295, + "learning_rate": 0.00018908354980970218, + "loss": 2.9429, + "step": 1812 + }, + { + "epoch": 0.16425449027202102, + "grad_norm": 0.7354364395141602, + "learning_rate": 0.00018907750860871144, + "loss": 3.0884, + "step": 1813 + }, + { + "epoch": 0.1643450884464678, + "grad_norm": 0.6667284965515137, + "learning_rate": 0.00018907146740772065, + "loss": 2.7781, + "step": 1814 + }, + { + "epoch": 0.1644356866209146, + "grad_norm": 0.6804100275039673, + "learning_rate": 0.0001890654262067299, + "loss": 2.784, + "step": 1815 + }, + { + "epoch": 0.16452628479536138, + "grad_norm": 0.7060055732727051, + "learning_rate": 0.00018905938500573914, + "loss": 2.9298, + "step": 1816 + }, + { + "epoch": 0.16461688296980817, + "grad_norm": 0.6444797515869141, + "learning_rate": 0.0001890533438047484, + "loss": 2.4635, + "step": 1817 + }, + { + "epoch": 0.16470748114425493, + "grad_norm": 0.6976041793823242, + "learning_rate": 0.00018904730260375764, + "loss": 2.9079, + "step": 1818 + }, + { + "epoch": 0.16479807931870172, + "grad_norm": 0.686178982257843, + "learning_rate": 0.00018904126140276687, + "loss": 2.9263, + "step": 1819 + }, + { + "epoch": 0.1648886774931485, + "grad_norm": 0.6905245184898376, + "learning_rate": 0.00018903522020177613, + "loss": 3.0039, + "step": 1820 + }, + { + "epoch": 0.1649792756675953, + "grad_norm": 0.6687036752700806, + "learning_rate": 0.00018902917900078537, + "loss": 2.7255, + "step": 1821 + }, + { + "epoch": 0.16506987384204208, + "grad_norm": 0.7400180697441101, + "learning_rate": 0.0001890231377997946, + "loss": 2.8072, + "step": 1822 + }, + { + "epoch": 0.16516047201648887, + "grad_norm": 1.0227566957473755, + "learning_rate": 0.00018901709659880384, + "loss": 2.3749, + "step": 1823 + }, + { + "epoch": 0.16525107019093566, + "grad_norm": 0.6873925924301147, + "learning_rate": 0.0001890110553978131, + "loss": 2.8316, + "step": 1824 + }, + { + "epoch": 0.16534166836538244, + "grad_norm": 0.6979085803031921, + "learning_rate": 0.00018900501419682233, + "loss": 2.9954, + "step": 1825 + }, + { + "epoch": 0.16543226653982923, + "grad_norm": 0.6873084902763367, + "learning_rate": 0.0001889989729958316, + "loss": 2.8654, + "step": 1826 + }, + { + "epoch": 0.16552286471427602, + "grad_norm": 0.7386538982391357, + "learning_rate": 0.00018899293179484083, + "loss": 2.7755, + "step": 1827 + }, + { + "epoch": 0.16561346288872278, + "grad_norm": 0.7213725447654724, + "learning_rate": 0.00018898689059385006, + "loss": 3.0206, + "step": 1828 + }, + { + "epoch": 0.16570406106316957, + "grad_norm": 0.8148163557052612, + "learning_rate": 0.00018898084939285932, + "loss": 2.6999, + "step": 1829 + }, + { + "epoch": 0.16579465923761635, + "grad_norm": 0.7245307564735413, + "learning_rate": 0.00018897480819186856, + "loss": 3.0169, + "step": 1830 + }, + { + "epoch": 0.16588525741206314, + "grad_norm": 0.7089089751243591, + "learning_rate": 0.0001889687669908778, + "loss": 2.848, + "step": 1831 + }, + { + "epoch": 0.16597585558650993, + "grad_norm": 2.414358139038086, + "learning_rate": 0.00018896272578988702, + "loss": 2.2062, + "step": 1832 + }, + { + "epoch": 0.16606645376095672, + "grad_norm": 0.7071884274482727, + "learning_rate": 0.00018895668458889628, + "loss": 2.6918, + "step": 1833 + }, + { + "epoch": 0.1661570519354035, + "grad_norm": 0.6805866956710815, + "learning_rate": 0.00018895064338790555, + "loss": 2.7904, + "step": 1834 + }, + { + "epoch": 0.1662476501098503, + "grad_norm": 0.7265628576278687, + "learning_rate": 0.00018894460218691475, + "loss": 3.0384, + "step": 1835 + }, + { + "epoch": 0.16633824828429708, + "grad_norm": 0.7083374261856079, + "learning_rate": 0.00018893856098592401, + "loss": 2.6744, + "step": 1836 + }, + { + "epoch": 0.16642884645874387, + "grad_norm": 0.7388215065002441, + "learning_rate": 0.00018893251978493325, + "loss": 2.5653, + "step": 1837 + }, + { + "epoch": 0.16651944463319063, + "grad_norm": 0.6217671036720276, + "learning_rate": 0.0001889264785839425, + "loss": 1.8043, + "step": 1838 + }, + { + "epoch": 0.16661004280763742, + "grad_norm": 0.6456065773963928, + "learning_rate": 0.00018892043738295174, + "loss": 2.8805, + "step": 1839 + }, + { + "epoch": 0.1667006409820842, + "grad_norm": 0.6920424103736877, + "learning_rate": 0.00018891439618196098, + "loss": 2.7279, + "step": 1840 + }, + { + "epoch": 0.166791239156531, + "grad_norm": 0.6755111217498779, + "learning_rate": 0.00018890835498097024, + "loss": 2.6723, + "step": 1841 + }, + { + "epoch": 0.16688183733097778, + "grad_norm": 0.6809356212615967, + "learning_rate": 0.00018890231377997947, + "loss": 2.0738, + "step": 1842 + }, + { + "epoch": 0.16697243550542457, + "grad_norm": 0.6793537735939026, + "learning_rate": 0.00018889627257898873, + "loss": 2.896, + "step": 1843 + }, + { + "epoch": 0.16706303367987135, + "grad_norm": 0.7015562653541565, + "learning_rate": 0.00018889023137799794, + "loss": 2.6113, + "step": 1844 + }, + { + "epoch": 0.16715363185431814, + "grad_norm": 0.7539548873901367, + "learning_rate": 0.0001888841901770072, + "loss": 3.2109, + "step": 1845 + }, + { + "epoch": 0.16724423002876493, + "grad_norm": 0.6516135931015015, + "learning_rate": 0.00018887814897601644, + "loss": 3.0056, + "step": 1846 + }, + { + "epoch": 0.16733482820321172, + "grad_norm": 0.6802234649658203, + "learning_rate": 0.0001888721077750257, + "loss": 2.8323, + "step": 1847 + }, + { + "epoch": 0.1674254263776585, + "grad_norm": 0.6912335753440857, + "learning_rate": 0.0001888660665740349, + "loss": 2.8772, + "step": 1848 + }, + { + "epoch": 0.16751602455210526, + "grad_norm": 0.6652773022651672, + "learning_rate": 0.00018886002537304416, + "loss": 2.648, + "step": 1849 + }, + { + "epoch": 0.16760662272655205, + "grad_norm": 0.7486647963523865, + "learning_rate": 0.00018885398417205343, + "loss": 3.1272, + "step": 1850 + }, + { + "epoch": 0.16769722090099884, + "grad_norm": 0.6805304884910583, + "learning_rate": 0.00018884794297106266, + "loss": 2.9573, + "step": 1851 + }, + { + "epoch": 0.16778781907544563, + "grad_norm": 0.6819382309913635, + "learning_rate": 0.0001888419017700719, + "loss": 3.1215, + "step": 1852 + }, + { + "epoch": 0.16787841724989241, + "grad_norm": 0.655046284198761, + "learning_rate": 0.00018883586056908113, + "loss": 2.7065, + "step": 1853 + }, + { + "epoch": 0.1679690154243392, + "grad_norm": 0.6943725347518921, + "learning_rate": 0.0001888298193680904, + "loss": 3.2658, + "step": 1854 + }, + { + "epoch": 0.168059613598786, + "grad_norm": 0.6639479398727417, + "learning_rate": 0.00018882377816709962, + "loss": 2.7498, + "step": 1855 + }, + { + "epoch": 0.16815021177323278, + "grad_norm": 0.7008872628211975, + "learning_rate": 0.00018881773696610888, + "loss": 2.7414, + "step": 1856 + }, + { + "epoch": 0.16824080994767956, + "grad_norm": 0.6773698925971985, + "learning_rate": 0.00018881169576511812, + "loss": 2.9298, + "step": 1857 + }, + { + "epoch": 0.16833140812212635, + "grad_norm": 0.698351263999939, + "learning_rate": 0.00018880565456412735, + "loss": 2.8838, + "step": 1858 + }, + { + "epoch": 0.1684220062965731, + "grad_norm": 0.6663510799407959, + "learning_rate": 0.00018879961336313661, + "loss": 3.094, + "step": 1859 + }, + { + "epoch": 0.1685126044710199, + "grad_norm": 0.6962144374847412, + "learning_rate": 0.00018879357216214585, + "loss": 2.9126, + "step": 1860 + }, + { + "epoch": 0.1686032026454667, + "grad_norm": 0.6647769212722778, + "learning_rate": 0.00018878753096115508, + "loss": 3.1264, + "step": 1861 + }, + { + "epoch": 0.16869380081991348, + "grad_norm": 0.7308076024055481, + "learning_rate": 0.00018878148976016432, + "loss": 2.9406, + "step": 1862 + }, + { + "epoch": 0.16878439899436026, + "grad_norm": 0.6595136523246765, + "learning_rate": 0.00018877544855917358, + "loss": 2.7745, + "step": 1863 + }, + { + "epoch": 0.16887499716880705, + "grad_norm": 0.669111430644989, + "learning_rate": 0.00018876940735818284, + "loss": 2.6552, + "step": 1864 + }, + { + "epoch": 0.16896559534325384, + "grad_norm": 0.6749942898750305, + "learning_rate": 0.00018876336615719205, + "loss": 2.7906, + "step": 1865 + }, + { + "epoch": 0.16905619351770063, + "grad_norm": 0.710063099861145, + "learning_rate": 0.0001887573249562013, + "loss": 2.7294, + "step": 1866 + }, + { + "epoch": 0.1691467916921474, + "grad_norm": 0.7538104057312012, + "learning_rate": 0.00018875128375521054, + "loss": 2.8247, + "step": 1867 + }, + { + "epoch": 0.1692373898665942, + "grad_norm": 0.6650628447532654, + "learning_rate": 0.0001887452425542198, + "loss": 2.7426, + "step": 1868 + }, + { + "epoch": 0.16932798804104096, + "grad_norm": 0.7001957297325134, + "learning_rate": 0.00018873920135322904, + "loss": 2.8705, + "step": 1869 + }, + { + "epoch": 0.16941858621548775, + "grad_norm": 0.6725601553916931, + "learning_rate": 0.00018873316015223827, + "loss": 2.6542, + "step": 1870 + }, + { + "epoch": 0.16950918438993454, + "grad_norm": 0.7215427756309509, + "learning_rate": 0.00018872711895124753, + "loss": 3.0292, + "step": 1871 + }, + { + "epoch": 0.16959978256438132, + "grad_norm": 0.7034088373184204, + "learning_rate": 0.00018872107775025677, + "loss": 2.711, + "step": 1872 + }, + { + "epoch": 0.1696903807388281, + "grad_norm": 0.6686945557594299, + "learning_rate": 0.000188715036549266, + "loss": 2.9528, + "step": 1873 + }, + { + "epoch": 0.1697809789132749, + "grad_norm": 0.7074219584465027, + "learning_rate": 0.00018870899534827523, + "loss": 2.9284, + "step": 1874 + }, + { + "epoch": 0.1698715770877217, + "grad_norm": 0.7057182192802429, + "learning_rate": 0.0001887029541472845, + "loss": 2.9016, + "step": 1875 + }, + { + "epoch": 0.16996217526216847, + "grad_norm": 0.7996147871017456, + "learning_rate": 0.00018869691294629373, + "loss": 2.8507, + "step": 1876 + }, + { + "epoch": 0.17005277343661526, + "grad_norm": 0.6815887093544006, + "learning_rate": 0.000188690871745303, + "loss": 2.9602, + "step": 1877 + }, + { + "epoch": 0.17014337161106205, + "grad_norm": 0.7415056824684143, + "learning_rate": 0.0001886848305443122, + "loss": 2.737, + "step": 1878 + }, + { + "epoch": 0.1702339697855088, + "grad_norm": 0.6503691077232361, + "learning_rate": 0.00018867878934332146, + "loss": 2.8263, + "step": 1879 + }, + { + "epoch": 0.1703245679599556, + "grad_norm": 0.6708121299743652, + "learning_rate": 0.00018867274814233072, + "loss": 2.8084, + "step": 1880 + }, + { + "epoch": 0.17041516613440238, + "grad_norm": 0.6818743348121643, + "learning_rate": 0.00018866670694133995, + "loss": 2.9372, + "step": 1881 + }, + { + "epoch": 0.17050576430884917, + "grad_norm": 0.7937660813331604, + "learning_rate": 0.0001886606657403492, + "loss": 2.8748, + "step": 1882 + }, + { + "epoch": 0.17059636248329596, + "grad_norm": 0.6993054151535034, + "learning_rate": 0.00018865462453935842, + "loss": 2.7845, + "step": 1883 + }, + { + "epoch": 0.17068696065774275, + "grad_norm": 0.6955051422119141, + "learning_rate": 0.00018864858333836768, + "loss": 2.8912, + "step": 1884 + }, + { + "epoch": 0.17077755883218954, + "grad_norm": 0.6592618227005005, + "learning_rate": 0.00018864254213737692, + "loss": 2.2289, + "step": 1885 + }, + { + "epoch": 0.17086815700663632, + "grad_norm": 0.6744357943534851, + "learning_rate": 0.00018863650093638615, + "loss": 2.9143, + "step": 1886 + }, + { + "epoch": 0.1709587551810831, + "grad_norm": 0.6915192008018494, + "learning_rate": 0.0001886304597353954, + "loss": 3.0506, + "step": 1887 + }, + { + "epoch": 0.1710493533555299, + "grad_norm": 0.6997753381729126, + "learning_rate": 0.00018862441853440465, + "loss": 2.8784, + "step": 1888 + }, + { + "epoch": 0.17113995152997666, + "grad_norm": 0.7103668451309204, + "learning_rate": 0.0001886183773334139, + "loss": 2.891, + "step": 1889 + }, + { + "epoch": 0.17123054970442345, + "grad_norm": 0.6496731042861938, + "learning_rate": 0.00018861233613242314, + "loss": 2.2797, + "step": 1890 + }, + { + "epoch": 0.17132114787887023, + "grad_norm": 0.6058412194252014, + "learning_rate": 0.00018860629493143237, + "loss": 2.1463, + "step": 1891 + }, + { + "epoch": 0.17141174605331702, + "grad_norm": 0.708358883857727, + "learning_rate": 0.0001886002537304416, + "loss": 2.6622, + "step": 1892 + }, + { + "epoch": 0.1715023442277638, + "grad_norm": 0.7137901782989502, + "learning_rate": 0.00018859421252945087, + "loss": 2.9055, + "step": 1893 + }, + { + "epoch": 0.1715929424022106, + "grad_norm": 0.794567346572876, + "learning_rate": 0.0001885881713284601, + "loss": 2.9204, + "step": 1894 + }, + { + "epoch": 0.17168354057665738, + "grad_norm": 0.6719245910644531, + "learning_rate": 0.00018858213012746934, + "loss": 2.8156, + "step": 1895 + }, + { + "epoch": 0.17177413875110417, + "grad_norm": 0.7069176435470581, + "learning_rate": 0.0001885760889264786, + "loss": 2.7524, + "step": 1896 + }, + { + "epoch": 0.17186473692555096, + "grad_norm": 0.7255394458770752, + "learning_rate": 0.00018857004772548783, + "loss": 2.6667, + "step": 1897 + }, + { + "epoch": 0.17195533509999775, + "grad_norm": 0.6919969320297241, + "learning_rate": 0.0001885640065244971, + "loss": 2.9537, + "step": 1898 + }, + { + "epoch": 0.1720459332744445, + "grad_norm": 0.7482000589370728, + "learning_rate": 0.0001885579653235063, + "loss": 2.6945, + "step": 1899 + }, + { + "epoch": 0.1721365314488913, + "grad_norm": 0.724449634552002, + "learning_rate": 0.00018855192412251556, + "loss": 3.2271, + "step": 1900 + }, + { + "epoch": 0.17222712962333808, + "grad_norm": 0.687785804271698, + "learning_rate": 0.00018854588292152482, + "loss": 2.8672, + "step": 1901 + }, + { + "epoch": 0.17231772779778487, + "grad_norm": 0.6910447478294373, + "learning_rate": 0.00018853984172053406, + "loss": 2.742, + "step": 1902 + }, + { + "epoch": 0.17240832597223166, + "grad_norm": 0.7471134066581726, + "learning_rate": 0.0001885338005195433, + "loss": 2.9205, + "step": 1903 + }, + { + "epoch": 0.17249892414667845, + "grad_norm": 0.6706297993659973, + "learning_rate": 0.00018852775931855253, + "loss": 2.8122, + "step": 1904 + }, + { + "epoch": 0.17258952232112523, + "grad_norm": 0.6833136081695557, + "learning_rate": 0.0001885217181175618, + "loss": 2.876, + "step": 1905 + }, + { + "epoch": 0.17268012049557202, + "grad_norm": 0.7041476368904114, + "learning_rate": 0.00018851567691657102, + "loss": 2.8572, + "step": 1906 + }, + { + "epoch": 0.1727707186700188, + "grad_norm": 0.779798150062561, + "learning_rate": 0.00018850963571558026, + "loss": 2.895, + "step": 1907 + }, + { + "epoch": 0.1728613168444656, + "grad_norm": 0.6951949000358582, + "learning_rate": 0.0001885035945145895, + "loss": 2.8489, + "step": 1908 + }, + { + "epoch": 0.17295191501891236, + "grad_norm": 0.6469982266426086, + "learning_rate": 0.00018849755331359875, + "loss": 2.7519, + "step": 1909 + }, + { + "epoch": 0.17304251319335914, + "grad_norm": 0.6960009336471558, + "learning_rate": 0.000188491512112608, + "loss": 2.6782, + "step": 1910 + }, + { + "epoch": 0.17313311136780593, + "grad_norm": 0.6405553817749023, + "learning_rate": 0.00018848547091161725, + "loss": 2.5604, + "step": 1911 + }, + { + "epoch": 0.17322370954225272, + "grad_norm": 0.724568784236908, + "learning_rate": 0.00018847942971062648, + "loss": 2.8748, + "step": 1912 + }, + { + "epoch": 0.1733143077166995, + "grad_norm": 0.7220951914787292, + "learning_rate": 0.00018847338850963571, + "loss": 2.9056, + "step": 1913 + }, + { + "epoch": 0.1734049058911463, + "grad_norm": 0.5684880614280701, + "learning_rate": 0.00018846734730864497, + "loss": 1.9696, + "step": 1914 + }, + { + "epoch": 0.17349550406559308, + "grad_norm": 0.7601586580276489, + "learning_rate": 0.0001884613061076542, + "loss": 2.7481, + "step": 1915 + }, + { + "epoch": 0.17358610224003987, + "grad_norm": 0.6505663990974426, + "learning_rate": 0.00018845526490666344, + "loss": 2.6609, + "step": 1916 + }, + { + "epoch": 0.17367670041448666, + "grad_norm": 0.7277441024780273, + "learning_rate": 0.0001884492237056727, + "loss": 2.6538, + "step": 1917 + }, + { + "epoch": 0.17376729858893344, + "grad_norm": 0.6841938495635986, + "learning_rate": 0.00018844318250468194, + "loss": 2.9491, + "step": 1918 + }, + { + "epoch": 0.1738578967633802, + "grad_norm": 0.7037449479103088, + "learning_rate": 0.0001884371413036912, + "loss": 2.9134, + "step": 1919 + }, + { + "epoch": 0.173948494937827, + "grad_norm": 0.734760046005249, + "learning_rate": 0.0001884311001027004, + "loss": 3.1123, + "step": 1920 + }, + { + "epoch": 0.17403909311227378, + "grad_norm": 0.6916555166244507, + "learning_rate": 0.00018842505890170967, + "loss": 2.8811, + "step": 1921 + }, + { + "epoch": 0.17412969128672057, + "grad_norm": 0.6825840473175049, + "learning_rate": 0.0001884190177007189, + "loss": 3.0326, + "step": 1922 + }, + { + "epoch": 0.17422028946116735, + "grad_norm": 0.7465177178382874, + "learning_rate": 0.00018841297649972816, + "loss": 3.0818, + "step": 1923 + }, + { + "epoch": 0.17431088763561414, + "grad_norm": 0.7009222507476807, + "learning_rate": 0.0001884069352987374, + "loss": 2.8486, + "step": 1924 + }, + { + "epoch": 0.17440148581006093, + "grad_norm": 0.9892986416816711, + "learning_rate": 0.00018840089409774663, + "loss": 2.9979, + "step": 1925 + }, + { + "epoch": 0.17449208398450772, + "grad_norm": 0.671654224395752, + "learning_rate": 0.0001883948528967559, + "loss": 2.8521, + "step": 1926 + }, + { + "epoch": 0.1745826821589545, + "grad_norm": 0.6913911700248718, + "learning_rate": 0.00018838881169576513, + "loss": 3.1831, + "step": 1927 + }, + { + "epoch": 0.1746732803334013, + "grad_norm": 0.7092026472091675, + "learning_rate": 0.0001883827704947744, + "loss": 2.8778, + "step": 1928 + }, + { + "epoch": 0.17476387850784808, + "grad_norm": 0.7210507392883301, + "learning_rate": 0.0001883767292937836, + "loss": 3.277, + "step": 1929 + }, + { + "epoch": 0.17485447668229484, + "grad_norm": 0.8376246094703674, + "learning_rate": 0.00018837068809279286, + "loss": 2.9283, + "step": 1930 + }, + { + "epoch": 0.17494507485674163, + "grad_norm": 0.7063891291618347, + "learning_rate": 0.00018836464689180212, + "loss": 2.8459, + "step": 1931 + }, + { + "epoch": 0.17503567303118842, + "grad_norm": 0.7020247578620911, + "learning_rate": 0.00018835860569081135, + "loss": 2.9635, + "step": 1932 + }, + { + "epoch": 0.1751262712056352, + "grad_norm": 0.659967839717865, + "learning_rate": 0.00018835256448982058, + "loss": 2.8136, + "step": 1933 + }, + { + "epoch": 0.175216869380082, + "grad_norm": 0.6644399166107178, + "learning_rate": 0.00018834652328882982, + "loss": 2.7835, + "step": 1934 + }, + { + "epoch": 0.17530746755452878, + "grad_norm": 0.7075510621070862, + "learning_rate": 0.00018834048208783908, + "loss": 2.9446, + "step": 1935 + }, + { + "epoch": 0.17539806572897557, + "grad_norm": 0.6934614181518555, + "learning_rate": 0.00018833444088684831, + "loss": 2.6702, + "step": 1936 + }, + { + "epoch": 0.17548866390342235, + "grad_norm": 0.6464255452156067, + "learning_rate": 0.00018832839968585755, + "loss": 2.9184, + "step": 1937 + }, + { + "epoch": 0.17557926207786914, + "grad_norm": 0.6697532534599304, + "learning_rate": 0.00018832235848486678, + "loss": 2.6658, + "step": 1938 + }, + { + "epoch": 0.17566986025231593, + "grad_norm": 0.7328280210494995, + "learning_rate": 0.00018831631728387604, + "loss": 2.8227, + "step": 1939 + }, + { + "epoch": 0.1757604584267627, + "grad_norm": 0.7353438138961792, + "learning_rate": 0.0001883102760828853, + "loss": 2.9098, + "step": 1940 + }, + { + "epoch": 0.17585105660120948, + "grad_norm": 0.6990056037902832, + "learning_rate": 0.00018830423488189454, + "loss": 2.6876, + "step": 1941 + }, + { + "epoch": 0.17594165477565626, + "grad_norm": 0.687783420085907, + "learning_rate": 0.00018829819368090377, + "loss": 3.0099, + "step": 1942 + }, + { + "epoch": 0.17603225295010305, + "grad_norm": 0.702710747718811, + "learning_rate": 0.000188292152479913, + "loss": 2.8817, + "step": 1943 + }, + { + "epoch": 0.17612285112454984, + "grad_norm": 0.7126391530036926, + "learning_rate": 0.00018828611127892227, + "loss": 2.8645, + "step": 1944 + }, + { + "epoch": 0.17621344929899663, + "grad_norm": 0.675449788570404, + "learning_rate": 0.0001882800700779315, + "loss": 2.661, + "step": 1945 + }, + { + "epoch": 0.17630404747344341, + "grad_norm": 0.6859119534492493, + "learning_rate": 0.00018827402887694074, + "loss": 2.9463, + "step": 1946 + }, + { + "epoch": 0.1763946456478902, + "grad_norm": 0.706142783164978, + "learning_rate": 0.00018826798767595, + "loss": 2.663, + "step": 1947 + }, + { + "epoch": 0.176485243822337, + "grad_norm": 0.8187311887741089, + "learning_rate": 0.00018826194647495923, + "loss": 3.009, + "step": 1948 + }, + { + "epoch": 0.17657584199678378, + "grad_norm": 0.7141663432121277, + "learning_rate": 0.0001882559052739685, + "loss": 2.7761, + "step": 1949 + }, + { + "epoch": 0.17666644017123054, + "grad_norm": 0.6907605528831482, + "learning_rate": 0.0001882498640729777, + "loss": 2.8931, + "step": 1950 + }, + { + "epoch": 0.17675703834567733, + "grad_norm": 0.6395601034164429, + "learning_rate": 0.00018824382287198696, + "loss": 2.3386, + "step": 1951 + }, + { + "epoch": 0.1768476365201241, + "grad_norm": 0.685382604598999, + "learning_rate": 0.0001882377816709962, + "loss": 2.7259, + "step": 1952 + }, + { + "epoch": 0.1769382346945709, + "grad_norm": 0.6692869067192078, + "learning_rate": 0.00018823174047000546, + "loss": 2.6956, + "step": 1953 + }, + { + "epoch": 0.1770288328690177, + "grad_norm": 0.6898842453956604, + "learning_rate": 0.0001882256992690147, + "loss": 2.9521, + "step": 1954 + }, + { + "epoch": 0.17711943104346448, + "grad_norm": 0.7222834825515747, + "learning_rate": 0.00018821965806802392, + "loss": 2.8403, + "step": 1955 + }, + { + "epoch": 0.17721002921791126, + "grad_norm": 0.7121821641921997, + "learning_rate": 0.00018821361686703318, + "loss": 2.6743, + "step": 1956 + }, + { + "epoch": 0.17730062739235805, + "grad_norm": 0.6821378469467163, + "learning_rate": 0.00018820757566604242, + "loss": 2.8177, + "step": 1957 + }, + { + "epoch": 0.17739122556680484, + "grad_norm": 0.7164502739906311, + "learning_rate": 0.00018820153446505165, + "loss": 3.1564, + "step": 1958 + }, + { + "epoch": 0.17748182374125163, + "grad_norm": 0.6371967792510986, + "learning_rate": 0.0001881954932640609, + "loss": 2.5737, + "step": 1959 + }, + { + "epoch": 0.1775724219156984, + "grad_norm": 0.692501425743103, + "learning_rate": 0.00018818945206307015, + "loss": 2.9043, + "step": 1960 + }, + { + "epoch": 0.17766302009014517, + "grad_norm": 0.6791485548019409, + "learning_rate": 0.0001881834108620794, + "loss": 2.9032, + "step": 1961 + }, + { + "epoch": 0.17775361826459196, + "grad_norm": 0.6610190868377686, + "learning_rate": 0.00018817736966108864, + "loss": 2.8285, + "step": 1962 + }, + { + "epoch": 0.17784421643903875, + "grad_norm": 0.6964738368988037, + "learning_rate": 0.00018817132846009788, + "loss": 2.8754, + "step": 1963 + }, + { + "epoch": 0.17793481461348554, + "grad_norm": 0.6227304935455322, + "learning_rate": 0.0001881652872591071, + "loss": 2.1984, + "step": 1964 + }, + { + "epoch": 0.17802541278793232, + "grad_norm": 0.7049603462219238, + "learning_rate": 0.00018815924605811637, + "loss": 3.0235, + "step": 1965 + }, + { + "epoch": 0.1781160109623791, + "grad_norm": 0.6319432258605957, + "learning_rate": 0.0001881532048571256, + "loss": 2.867, + "step": 1966 + }, + { + "epoch": 0.1782066091368259, + "grad_norm": 0.7472440004348755, + "learning_rate": 0.00018814716365613484, + "loss": 2.816, + "step": 1967 + }, + { + "epoch": 0.1782972073112727, + "grad_norm": 0.6891170740127563, + "learning_rate": 0.00018814112245514407, + "loss": 2.8358, + "step": 1968 + }, + { + "epoch": 0.17838780548571948, + "grad_norm": 0.6788110136985779, + "learning_rate": 0.00018813508125415334, + "loss": 2.6582, + "step": 1969 + }, + { + "epoch": 0.17847840366016623, + "grad_norm": 0.7436486482620239, + "learning_rate": 0.0001881290400531626, + "loss": 3.2057, + "step": 1970 + }, + { + "epoch": 0.17856900183461302, + "grad_norm": 0.674879252910614, + "learning_rate": 0.0001881229988521718, + "loss": 2.1742, + "step": 1971 + }, + { + "epoch": 0.1786596000090598, + "grad_norm": 0.669167160987854, + "learning_rate": 0.00018811695765118106, + "loss": 2.7458, + "step": 1972 + }, + { + "epoch": 0.1787501981835066, + "grad_norm": 0.7709216475486755, + "learning_rate": 0.0001881109164501903, + "loss": 2.911, + "step": 1973 + }, + { + "epoch": 0.17884079635795339, + "grad_norm": 0.677720308303833, + "learning_rate": 0.00018810487524919956, + "loss": 2.6627, + "step": 1974 + }, + { + "epoch": 0.17893139453240017, + "grad_norm": 0.7040088176727295, + "learning_rate": 0.0001880988340482088, + "loss": 2.7276, + "step": 1975 + }, + { + "epoch": 0.17902199270684696, + "grad_norm": 0.7065890431404114, + "learning_rate": 0.00018809279284721803, + "loss": 3.014, + "step": 1976 + }, + { + "epoch": 0.17911259088129375, + "grad_norm": 0.6939875483512878, + "learning_rate": 0.0001880867516462273, + "loss": 2.9034, + "step": 1977 + }, + { + "epoch": 0.17920318905574054, + "grad_norm": 0.7156928777694702, + "learning_rate": 0.00018808071044523652, + "loss": 2.9682, + "step": 1978 + }, + { + "epoch": 0.17929378723018732, + "grad_norm": 0.7246935367584229, + "learning_rate": 0.00018807466924424578, + "loss": 2.9632, + "step": 1979 + }, + { + "epoch": 0.17938438540463408, + "grad_norm": 0.7169985175132751, + "learning_rate": 0.000188068628043255, + "loss": 3.0962, + "step": 1980 + }, + { + "epoch": 0.17947498357908087, + "grad_norm": 0.6985291242599487, + "learning_rate": 0.00018806258684226425, + "loss": 2.6825, + "step": 1981 + }, + { + "epoch": 0.17956558175352766, + "grad_norm": 0.6657827496528625, + "learning_rate": 0.0001880565456412735, + "loss": 2.7211, + "step": 1982 + }, + { + "epoch": 0.17965617992797445, + "grad_norm": 0.7942516207695007, + "learning_rate": 0.00018805050444028275, + "loss": 2.677, + "step": 1983 + }, + { + "epoch": 0.17974677810242123, + "grad_norm": 0.8184764385223389, + "learning_rate": 0.00018804446323929198, + "loss": 2.9083, + "step": 1984 + }, + { + "epoch": 0.17983737627686802, + "grad_norm": 0.6385408043861389, + "learning_rate": 0.00018803842203830122, + "loss": 2.8075, + "step": 1985 + }, + { + "epoch": 0.1799279744513148, + "grad_norm": 0.6875623464584351, + "learning_rate": 0.00018803238083731048, + "loss": 2.9359, + "step": 1986 + }, + { + "epoch": 0.1800185726257616, + "grad_norm": 0.6940048933029175, + "learning_rate": 0.0001880263396363197, + "loss": 3.014, + "step": 1987 + }, + { + "epoch": 0.18010917080020838, + "grad_norm": 0.7112208008766174, + "learning_rate": 0.00018802029843532895, + "loss": 2.8011, + "step": 1988 + }, + { + "epoch": 0.18019976897465517, + "grad_norm": 0.6626676321029663, + "learning_rate": 0.00018801425723433818, + "loss": 2.6733, + "step": 1989 + }, + { + "epoch": 0.18029036714910193, + "grad_norm": 0.7074275016784668, + "learning_rate": 0.00018800821603334744, + "loss": 2.7736, + "step": 1990 + }, + { + "epoch": 0.18038096532354872, + "grad_norm": 0.7350373864173889, + "learning_rate": 0.0001880021748323567, + "loss": 3.032, + "step": 1991 + }, + { + "epoch": 0.1804715634979955, + "grad_norm": 0.7486196160316467, + "learning_rate": 0.00018799613363136594, + "loss": 3.0239, + "step": 1992 + }, + { + "epoch": 0.1805621616724423, + "grad_norm": 0.6731139421463013, + "learning_rate": 0.00018799009243037517, + "loss": 2.6223, + "step": 1993 + }, + { + "epoch": 0.18065275984688908, + "grad_norm": 0.7594361305236816, + "learning_rate": 0.0001879840512293844, + "loss": 2.5835, + "step": 1994 + }, + { + "epoch": 0.18074335802133587, + "grad_norm": 0.7048203945159912, + "learning_rate": 0.00018797801002839366, + "loss": 3.0465, + "step": 1995 + }, + { + "epoch": 0.18083395619578266, + "grad_norm": 0.6806381344795227, + "learning_rate": 0.0001879719688274029, + "loss": 3.0603, + "step": 1996 + }, + { + "epoch": 0.18092455437022945, + "grad_norm": 0.698864221572876, + "learning_rate": 0.00018796592762641213, + "loss": 2.5711, + "step": 1997 + }, + { + "epoch": 0.18101515254467623, + "grad_norm": 0.6338801980018616, + "learning_rate": 0.00018795988642542137, + "loss": 2.4876, + "step": 1998 + }, + { + "epoch": 0.18110575071912302, + "grad_norm": 0.6775919198989868, + "learning_rate": 0.00018795384522443063, + "loss": 2.7568, + "step": 1999 + }, + { + "epoch": 0.18119634889356978, + "grad_norm": 0.8023726940155029, + "learning_rate": 0.0001879478040234399, + "loss": 2.8495, + "step": 2000 + }, + { + "epoch": 0.18128694706801657, + "grad_norm": 0.7301543951034546, + "learning_rate": 0.0001879417628224491, + "loss": 2.9674, + "step": 2001 + }, + { + "epoch": 0.18137754524246336, + "grad_norm": 0.6551960110664368, + "learning_rate": 0.00018793572162145836, + "loss": 2.2419, + "step": 2002 + }, + { + "epoch": 0.18146814341691014, + "grad_norm": 0.7229650020599365, + "learning_rate": 0.0001879296804204676, + "loss": 2.9268, + "step": 2003 + }, + { + "epoch": 0.18155874159135693, + "grad_norm": 0.6534097194671631, + "learning_rate": 0.00018792363921947685, + "loss": 2.1072, + "step": 2004 + }, + { + "epoch": 0.18164933976580372, + "grad_norm": 0.6774135828018188, + "learning_rate": 0.0001879175980184861, + "loss": 2.8223, + "step": 2005 + }, + { + "epoch": 0.1817399379402505, + "grad_norm": 0.7127194404602051, + "learning_rate": 0.00018791155681749532, + "loss": 2.9196, + "step": 2006 + }, + { + "epoch": 0.1818305361146973, + "grad_norm": 0.5789762139320374, + "learning_rate": 0.00018790551561650458, + "loss": 2.2314, + "step": 2007 + }, + { + "epoch": 0.18192113428914408, + "grad_norm": 0.7023323774337769, + "learning_rate": 0.00018789947441551382, + "loss": 2.7344, + "step": 2008 + }, + { + "epoch": 0.18201173246359087, + "grad_norm": 0.7655918598175049, + "learning_rate": 0.00018789343321452305, + "loss": 2.9081, + "step": 2009 + }, + { + "epoch": 0.18210233063803766, + "grad_norm": 0.7169873118400574, + "learning_rate": 0.00018788739201353228, + "loss": 2.6666, + "step": 2010 + }, + { + "epoch": 0.18219292881248442, + "grad_norm": 0.6465685367584229, + "learning_rate": 0.00018788135081254155, + "loss": 2.6205, + "step": 2011 + }, + { + "epoch": 0.1822835269869312, + "grad_norm": 0.7680977582931519, + "learning_rate": 0.00018787530961155078, + "loss": 3.0062, + "step": 2012 + }, + { + "epoch": 0.182374125161378, + "grad_norm": 0.571382999420166, + "learning_rate": 0.00018786926841056004, + "loss": 1.9347, + "step": 2013 + }, + { + "epoch": 0.18246472333582478, + "grad_norm": 0.6862770318984985, + "learning_rate": 0.00018786322720956927, + "loss": 2.8718, + "step": 2014 + }, + { + "epoch": 0.18255532151027157, + "grad_norm": 0.7884668707847595, + "learning_rate": 0.0001878571860085785, + "loss": 1.5095, + "step": 2015 + }, + { + "epoch": 0.18264591968471836, + "grad_norm": 0.7676835656166077, + "learning_rate": 0.00018785114480758777, + "loss": 2.9361, + "step": 2016 + }, + { + "epoch": 0.18273651785916514, + "grad_norm": 0.6586138010025024, + "learning_rate": 0.000187845103606597, + "loss": 2.8221, + "step": 2017 + }, + { + "epoch": 0.18282711603361193, + "grad_norm": 0.6722764372825623, + "learning_rate": 0.00018783906240560624, + "loss": 2.7415, + "step": 2018 + }, + { + "epoch": 0.18291771420805872, + "grad_norm": 0.7208830118179321, + "learning_rate": 0.00018783302120461547, + "loss": 3.0857, + "step": 2019 + }, + { + "epoch": 0.1830083123825055, + "grad_norm": 0.7229007482528687, + "learning_rate": 0.00018782698000362473, + "loss": 2.891, + "step": 2020 + }, + { + "epoch": 0.18309891055695227, + "grad_norm": 0.7169089317321777, + "learning_rate": 0.000187820938802634, + "loss": 2.8768, + "step": 2021 + }, + { + "epoch": 0.18318950873139905, + "grad_norm": 0.9924443960189819, + "learning_rate": 0.0001878148976016432, + "loss": 2.9269, + "step": 2022 + }, + { + "epoch": 0.18328010690584584, + "grad_norm": 0.6819146871566772, + "learning_rate": 0.00018780885640065246, + "loss": 2.8341, + "step": 2023 + }, + { + "epoch": 0.18337070508029263, + "grad_norm": 0.7493606209754944, + "learning_rate": 0.0001878028151996617, + "loss": 3.0646, + "step": 2024 + }, + { + "epoch": 0.18346130325473942, + "grad_norm": 0.7061737179756165, + "learning_rate": 0.00018779677399867096, + "loss": 3.1174, + "step": 2025 + }, + { + "epoch": 0.1835519014291862, + "grad_norm": 0.737607479095459, + "learning_rate": 0.0001877907327976802, + "loss": 2.7023, + "step": 2026 + }, + { + "epoch": 0.183642499603633, + "grad_norm": 0.6909933686256409, + "learning_rate": 0.00018778469159668943, + "loss": 2.9896, + "step": 2027 + }, + { + "epoch": 0.18373309777807978, + "grad_norm": 0.732199490070343, + "learning_rate": 0.00018777865039569866, + "loss": 3.2429, + "step": 2028 + }, + { + "epoch": 0.18382369595252657, + "grad_norm": 0.6720015406608582, + "learning_rate": 0.00018777260919470792, + "loss": 2.8242, + "step": 2029 + }, + { + "epoch": 0.18391429412697335, + "grad_norm": 0.6645336151123047, + "learning_rate": 0.00018776656799371715, + "loss": 2.8631, + "step": 2030 + }, + { + "epoch": 0.18400489230142011, + "grad_norm": 0.6778448224067688, + "learning_rate": 0.0001877605267927264, + "loss": 2.6898, + "step": 2031 + }, + { + "epoch": 0.1840954904758669, + "grad_norm": 0.6987476348876953, + "learning_rate": 0.00018775448559173565, + "loss": 2.9989, + "step": 2032 + }, + { + "epoch": 0.1841860886503137, + "grad_norm": 0.6578554511070251, + "learning_rate": 0.00018774844439074488, + "loss": 2.7485, + "step": 2033 + }, + { + "epoch": 0.18427668682476048, + "grad_norm": 0.6954371333122253, + "learning_rate": 0.00018774240318975415, + "loss": 2.8623, + "step": 2034 + }, + { + "epoch": 0.18436728499920726, + "grad_norm": 0.7500630617141724, + "learning_rate": 0.00018773636198876335, + "loss": 3.0146, + "step": 2035 + }, + { + "epoch": 0.18445788317365405, + "grad_norm": 0.7930296063423157, + "learning_rate": 0.0001877303207877726, + "loss": 2.808, + "step": 2036 + }, + { + "epoch": 0.18454848134810084, + "grad_norm": 0.6728442311286926, + "learning_rate": 0.00018772427958678187, + "loss": 2.9944, + "step": 2037 + }, + { + "epoch": 0.18463907952254763, + "grad_norm": 0.7023773193359375, + "learning_rate": 0.0001877182383857911, + "loss": 2.979, + "step": 2038 + }, + { + "epoch": 0.18472967769699442, + "grad_norm": 0.7432235479354858, + "learning_rate": 0.00018771219718480034, + "loss": 2.7792, + "step": 2039 + }, + { + "epoch": 0.1848202758714412, + "grad_norm": 0.7078784108161926, + "learning_rate": 0.00018770615598380958, + "loss": 2.6767, + "step": 2040 + }, + { + "epoch": 0.18491087404588796, + "grad_norm": 0.8145111799240112, + "learning_rate": 0.00018770011478281884, + "loss": 2.2452, + "step": 2041 + }, + { + "epoch": 0.18500147222033475, + "grad_norm": 0.6984264850616455, + "learning_rate": 0.00018769407358182807, + "loss": 2.8013, + "step": 2042 + }, + { + "epoch": 0.18509207039478154, + "grad_norm": 0.6665598750114441, + "learning_rate": 0.00018768803238083733, + "loss": 2.5878, + "step": 2043 + }, + { + "epoch": 0.18518266856922833, + "grad_norm": 0.7360962629318237, + "learning_rate": 0.00018768199117984657, + "loss": 2.8357, + "step": 2044 + }, + { + "epoch": 0.1852732667436751, + "grad_norm": 0.672551155090332, + "learning_rate": 0.0001876759499788558, + "loss": 2.7242, + "step": 2045 + }, + { + "epoch": 0.1853638649181219, + "grad_norm": 0.6874333024024963, + "learning_rate": 0.00018766990877786506, + "loss": 2.8872, + "step": 2046 + }, + { + "epoch": 0.1854544630925687, + "grad_norm": 0.7419270277023315, + "learning_rate": 0.0001876638675768743, + "loss": 2.9577, + "step": 2047 + }, + { + "epoch": 0.18554506126701548, + "grad_norm": 0.6640570759773254, + "learning_rate": 0.00018765782637588353, + "loss": 2.83, + "step": 2048 + }, + { + "epoch": 0.18563565944146226, + "grad_norm": 0.6742091774940491, + "learning_rate": 0.00018765178517489276, + "loss": 2.382, + "step": 2049 + }, + { + "epoch": 0.18572625761590905, + "grad_norm": 0.7498068809509277, + "learning_rate": 0.00018764574397390203, + "loss": 2.7144, + "step": 2050 + }, + { + "epoch": 0.1858168557903558, + "grad_norm": 0.7473189830780029, + "learning_rate": 0.0001876397027729113, + "loss": 2.7929, + "step": 2051 + }, + { + "epoch": 0.1859074539648026, + "grad_norm": 0.683255136013031, + "learning_rate": 0.0001876336615719205, + "loss": 2.6663, + "step": 2052 + }, + { + "epoch": 0.1859980521392494, + "grad_norm": 0.7494534850120544, + "learning_rate": 0.00018762762037092975, + "loss": 3.0319, + "step": 2053 + }, + { + "epoch": 0.18608865031369617, + "grad_norm": 0.7067404389381409, + "learning_rate": 0.000187621579169939, + "loss": 3.1428, + "step": 2054 + }, + { + "epoch": 0.18617924848814296, + "grad_norm": 0.7154858708381653, + "learning_rate": 0.00018761553796894825, + "loss": 2.8784, + "step": 2055 + }, + { + "epoch": 0.18626984666258975, + "grad_norm": 0.6834127902984619, + "learning_rate": 0.00018760949676795748, + "loss": 2.9177, + "step": 2056 + }, + { + "epoch": 0.18636044483703654, + "grad_norm": 0.6586530208587646, + "learning_rate": 0.00018760345556696672, + "loss": 2.5721, + "step": 2057 + }, + { + "epoch": 0.18645104301148333, + "grad_norm": 0.711496114730835, + "learning_rate": 0.00018759741436597595, + "loss": 2.9651, + "step": 2058 + }, + { + "epoch": 0.1865416411859301, + "grad_norm": 0.6926085352897644, + "learning_rate": 0.0001875913731649852, + "loss": 2.8984, + "step": 2059 + }, + { + "epoch": 0.1866322393603769, + "grad_norm": 0.7030723094940186, + "learning_rate": 0.00018758533196399445, + "loss": 2.9409, + "step": 2060 + }, + { + "epoch": 0.18672283753482366, + "grad_norm": 0.7300137281417847, + "learning_rate": 0.00018757929076300368, + "loss": 2.7447, + "step": 2061 + }, + { + "epoch": 0.18681343570927045, + "grad_norm": 0.7633447051048279, + "learning_rate": 0.00018757324956201294, + "loss": 3.0363, + "step": 2062 + }, + { + "epoch": 0.18690403388371724, + "grad_norm": 0.6667068004608154, + "learning_rate": 0.00018756720836102218, + "loss": 3.2238, + "step": 2063 + }, + { + "epoch": 0.18699463205816402, + "grad_norm": 0.6857883334159851, + "learning_rate": 0.00018756116716003144, + "loss": 2.7999, + "step": 2064 + }, + { + "epoch": 0.1870852302326108, + "grad_norm": 0.6518551707267761, + "learning_rate": 0.00018755512595904064, + "loss": 2.7849, + "step": 2065 + }, + { + "epoch": 0.1871758284070576, + "grad_norm": 0.7061705589294434, + "learning_rate": 0.0001875490847580499, + "loss": 2.8717, + "step": 2066 + }, + { + "epoch": 0.18726642658150439, + "grad_norm": 0.6997491717338562, + "learning_rate": 0.00018754304355705917, + "loss": 2.8332, + "step": 2067 + }, + { + "epoch": 0.18735702475595117, + "grad_norm": 0.7430413961410522, + "learning_rate": 0.0001875370023560684, + "loss": 3.0262, + "step": 2068 + }, + { + "epoch": 0.18744762293039796, + "grad_norm": 0.7395188212394714, + "learning_rate": 0.00018753096115507764, + "loss": 3.0052, + "step": 2069 + }, + { + "epoch": 0.18753822110484475, + "grad_norm": 0.7332494854927063, + "learning_rate": 0.00018752491995408687, + "loss": 2.8278, + "step": 2070 + }, + { + "epoch": 0.1876288192792915, + "grad_norm": 0.7137176394462585, + "learning_rate": 0.00018751887875309613, + "loss": 3.1913, + "step": 2071 + }, + { + "epoch": 0.1877194174537383, + "grad_norm": 0.6857671737670898, + "learning_rate": 0.00018751283755210536, + "loss": 2.8167, + "step": 2072 + }, + { + "epoch": 0.18781001562818508, + "grad_norm": 0.6940906047821045, + "learning_rate": 0.0001875067963511146, + "loss": 2.9042, + "step": 2073 + }, + { + "epoch": 0.18790061380263187, + "grad_norm": 0.8246904611587524, + "learning_rate": 0.00018750075515012386, + "loss": 2.9443, + "step": 2074 + }, + { + "epoch": 0.18799121197707866, + "grad_norm": 0.7003452181816101, + "learning_rate": 0.0001874947139491331, + "loss": 2.9506, + "step": 2075 + }, + { + "epoch": 0.18808181015152545, + "grad_norm": 0.7082295417785645, + "learning_rate": 0.00018748867274814236, + "loss": 3.1525, + "step": 2076 + }, + { + "epoch": 0.18817240832597223, + "grad_norm": 0.6847960352897644, + "learning_rate": 0.0001874826315471516, + "loss": 2.6999, + "step": 2077 + }, + { + "epoch": 0.18826300650041902, + "grad_norm": 0.6984232068061829, + "learning_rate": 0.00018747659034616082, + "loss": 2.9707, + "step": 2078 + }, + { + "epoch": 0.1883536046748658, + "grad_norm": 0.6748023629188538, + "learning_rate": 0.00018747054914517006, + "loss": 2.7099, + "step": 2079 + }, + { + "epoch": 0.1884442028493126, + "grad_norm": 0.8820870518684387, + "learning_rate": 0.00018746450794417932, + "loss": 2.894, + "step": 2080 + }, + { + "epoch": 0.18853480102375936, + "grad_norm": 0.7368427515029907, + "learning_rate": 0.00018745846674318855, + "loss": 2.9314, + "step": 2081 + }, + { + "epoch": 0.18862539919820615, + "grad_norm": 0.6998025178909302, + "learning_rate": 0.00018745242554219779, + "loss": 2.9176, + "step": 2082 + }, + { + "epoch": 0.18871599737265293, + "grad_norm": 0.7905574440956116, + "learning_rate": 0.00018744638434120705, + "loss": 2.8391, + "step": 2083 + }, + { + "epoch": 0.18880659554709972, + "grad_norm": 0.743667721748352, + "learning_rate": 0.00018744034314021628, + "loss": 2.7473, + "step": 2084 + }, + { + "epoch": 0.1888971937215465, + "grad_norm": 0.7767605781555176, + "learning_rate": 0.00018743430193922554, + "loss": 2.8646, + "step": 2085 + }, + { + "epoch": 0.1889877918959933, + "grad_norm": 0.6815322041511536, + "learning_rate": 0.00018742826073823475, + "loss": 2.831, + "step": 2086 + }, + { + "epoch": 0.18907839007044008, + "grad_norm": 0.7157419323921204, + "learning_rate": 0.000187422219537244, + "loss": 2.7152, + "step": 2087 + }, + { + "epoch": 0.18916898824488687, + "grad_norm": 0.6742491126060486, + "learning_rate": 0.00018741617833625324, + "loss": 2.7306, + "step": 2088 + }, + { + "epoch": 0.18925958641933366, + "grad_norm": 0.7345736026763916, + "learning_rate": 0.0001874101371352625, + "loss": 2.6234, + "step": 2089 + }, + { + "epoch": 0.18935018459378045, + "grad_norm": 0.6675182580947876, + "learning_rate": 0.00018740409593427174, + "loss": 2.8172, + "step": 2090 + }, + { + "epoch": 0.18944078276822723, + "grad_norm": 0.6906988024711609, + "learning_rate": 0.00018739805473328097, + "loss": 2.8224, + "step": 2091 + }, + { + "epoch": 0.189531380942674, + "grad_norm": 0.74661785364151, + "learning_rate": 0.00018739201353229024, + "loss": 2.6857, + "step": 2092 + }, + { + "epoch": 0.18962197911712078, + "grad_norm": 0.6706123352050781, + "learning_rate": 0.00018738597233129947, + "loss": 2.8109, + "step": 2093 + }, + { + "epoch": 0.18971257729156757, + "grad_norm": 0.66983962059021, + "learning_rate": 0.0001873799311303087, + "loss": 2.7486, + "step": 2094 + }, + { + "epoch": 0.18980317546601436, + "grad_norm": 0.7220048308372498, + "learning_rate": 0.00018737388992931794, + "loss": 3.0041, + "step": 2095 + }, + { + "epoch": 0.18989377364046114, + "grad_norm": 0.6101451516151428, + "learning_rate": 0.0001873678487283272, + "loss": 2.127, + "step": 2096 + }, + { + "epoch": 0.18998437181490793, + "grad_norm": 0.8526487946510315, + "learning_rate": 0.00018736180752733646, + "loss": 2.8199, + "step": 2097 + }, + { + "epoch": 0.19007496998935472, + "grad_norm": 0.8542416095733643, + "learning_rate": 0.0001873557663263457, + "loss": 2.8015, + "step": 2098 + }, + { + "epoch": 0.1901655681638015, + "grad_norm": 0.7247262597084045, + "learning_rate": 0.00018734972512535493, + "loss": 2.7977, + "step": 2099 + }, + { + "epoch": 0.1902561663382483, + "grad_norm": 0.6884271502494812, + "learning_rate": 0.00018734368392436416, + "loss": 2.939, + "step": 2100 + }, + { + "epoch": 0.19034676451269508, + "grad_norm": 0.6805243492126465, + "learning_rate": 0.00018733764272337342, + "loss": 2.7225, + "step": 2101 + }, + { + "epoch": 0.19043736268714184, + "grad_norm": 0.740060031414032, + "learning_rate": 0.00018733160152238266, + "loss": 3.038, + "step": 2102 + }, + { + "epoch": 0.19052796086158863, + "grad_norm": 0.7189950346946716, + "learning_rate": 0.0001873255603213919, + "loss": 2.9629, + "step": 2103 + }, + { + "epoch": 0.19061855903603542, + "grad_norm": 0.7132756114006042, + "learning_rate": 0.00018731951912040115, + "loss": 2.7809, + "step": 2104 + }, + { + "epoch": 0.1907091572104822, + "grad_norm": 0.7097617983818054, + "learning_rate": 0.0001873134779194104, + "loss": 2.8499, + "step": 2105 + }, + { + "epoch": 0.190799755384929, + "grad_norm": 0.7645124197006226, + "learning_rate": 0.00018730743671841965, + "loss": 2.6907, + "step": 2106 + }, + { + "epoch": 0.19089035355937578, + "grad_norm": 0.7274599075317383, + "learning_rate": 0.00018730139551742885, + "loss": 2.9465, + "step": 2107 + }, + { + "epoch": 0.19098095173382257, + "grad_norm": 0.6730350255966187, + "learning_rate": 0.00018729535431643812, + "loss": 2.849, + "step": 2108 + }, + { + "epoch": 0.19107154990826936, + "grad_norm": 0.6490524411201477, + "learning_rate": 0.00018728931311544735, + "loss": 2.8569, + "step": 2109 + }, + { + "epoch": 0.19116214808271614, + "grad_norm": 0.7126410603523254, + "learning_rate": 0.0001872832719144566, + "loss": 3.1468, + "step": 2110 + }, + { + "epoch": 0.19125274625716293, + "grad_norm": 0.685493528842926, + "learning_rate": 0.00018727723071346585, + "loss": 2.7147, + "step": 2111 + }, + { + "epoch": 0.1913433444316097, + "grad_norm": 0.6763296127319336, + "learning_rate": 0.00018727118951247508, + "loss": 2.5262, + "step": 2112 + }, + { + "epoch": 0.19143394260605648, + "grad_norm": 0.6819366812705994, + "learning_rate": 0.00018726514831148434, + "loss": 2.735, + "step": 2113 + }, + { + "epoch": 0.19152454078050327, + "grad_norm": 0.7054847478866577, + "learning_rate": 0.00018725910711049357, + "loss": 2.6289, + "step": 2114 + }, + { + "epoch": 0.19161513895495005, + "grad_norm": 0.6826362609863281, + "learning_rate": 0.00018725306590950284, + "loss": 2.808, + "step": 2115 + }, + { + "epoch": 0.19170573712939684, + "grad_norm": 0.7747598886489868, + "learning_rate": 0.00018724702470851204, + "loss": 3.0733, + "step": 2116 + }, + { + "epoch": 0.19179633530384363, + "grad_norm": 0.7281854748725891, + "learning_rate": 0.0001872409835075213, + "loss": 2.9358, + "step": 2117 + }, + { + "epoch": 0.19188693347829042, + "grad_norm": 0.7827380299568176, + "learning_rate": 0.00018723494230653054, + "loss": 2.8407, + "step": 2118 + }, + { + "epoch": 0.1919775316527372, + "grad_norm": 0.6769294738769531, + "learning_rate": 0.0001872289011055398, + "loss": 2.809, + "step": 2119 + }, + { + "epoch": 0.192068129827184, + "grad_norm": 0.7760857939720154, + "learning_rate": 0.00018722285990454903, + "loss": 2.7698, + "step": 2120 + }, + { + "epoch": 0.19215872800163078, + "grad_norm": 0.7494017481803894, + "learning_rate": 0.00018721681870355827, + "loss": 2.8036, + "step": 2121 + }, + { + "epoch": 0.19224932617607754, + "grad_norm": 0.6896548867225647, + "learning_rate": 0.00018721077750256753, + "loss": 2.9314, + "step": 2122 + }, + { + "epoch": 0.19233992435052433, + "grad_norm": 0.6759308576583862, + "learning_rate": 0.00018720473630157676, + "loss": 2.8634, + "step": 2123 + }, + { + "epoch": 0.19243052252497111, + "grad_norm": 0.7204221487045288, + "learning_rate": 0.000187198695100586, + "loss": 2.708, + "step": 2124 + }, + { + "epoch": 0.1925211206994179, + "grad_norm": 0.7855488061904907, + "learning_rate": 0.00018719265389959523, + "loss": 2.1723, + "step": 2125 + }, + { + "epoch": 0.1926117188738647, + "grad_norm": 0.7578520774841309, + "learning_rate": 0.0001871866126986045, + "loss": 3.1556, + "step": 2126 + }, + { + "epoch": 0.19270231704831148, + "grad_norm": 0.6991135478019714, + "learning_rate": 0.00018718057149761375, + "loss": 2.8811, + "step": 2127 + }, + { + "epoch": 0.19279291522275827, + "grad_norm": 0.7119289636611938, + "learning_rate": 0.000187174530296623, + "loss": 2.7961, + "step": 2128 + }, + { + "epoch": 0.19288351339720505, + "grad_norm": 0.6917804479598999, + "learning_rate": 0.00018716848909563222, + "loss": 2.6668, + "step": 2129 + }, + { + "epoch": 0.19297411157165184, + "grad_norm": 0.7102370858192444, + "learning_rate": 0.00018716244789464145, + "loss": 2.8431, + "step": 2130 + }, + { + "epoch": 0.19306470974609863, + "grad_norm": 0.6713622212409973, + "learning_rate": 0.00018715640669365072, + "loss": 2.886, + "step": 2131 + }, + { + "epoch": 0.1931553079205454, + "grad_norm": 0.6915928721427917, + "learning_rate": 0.00018715036549265995, + "loss": 2.7157, + "step": 2132 + }, + { + "epoch": 0.19324590609499218, + "grad_norm": 0.7511667609214783, + "learning_rate": 0.00018714432429166918, + "loss": 2.9203, + "step": 2133 + }, + { + "epoch": 0.19333650426943896, + "grad_norm": 0.7107693552970886, + "learning_rate": 0.00018713828309067845, + "loss": 2.1786, + "step": 2134 + }, + { + "epoch": 0.19342710244388575, + "grad_norm": 0.7042957544326782, + "learning_rate": 0.00018713224188968768, + "loss": 2.9444, + "step": 2135 + }, + { + "epoch": 0.19351770061833254, + "grad_norm": 0.676999032497406, + "learning_rate": 0.00018712620068869694, + "loss": 2.8857, + "step": 2136 + }, + { + "epoch": 0.19360829879277933, + "grad_norm": 0.695420503616333, + "learning_rate": 0.00018712015948770615, + "loss": 2.804, + "step": 2137 + }, + { + "epoch": 0.19369889696722611, + "grad_norm": 0.6891962885856628, + "learning_rate": 0.0001871141182867154, + "loss": 2.8922, + "step": 2138 + }, + { + "epoch": 0.1937894951416729, + "grad_norm": 0.6497898101806641, + "learning_rate": 0.00018710807708572464, + "loss": 2.8226, + "step": 2139 + }, + { + "epoch": 0.1938800933161197, + "grad_norm": 0.6259655356407166, + "learning_rate": 0.0001871020358847339, + "loss": 2.1466, + "step": 2140 + }, + { + "epoch": 0.19397069149056648, + "grad_norm": 0.7386878728866577, + "learning_rate": 0.00018709599468374314, + "loss": 2.8298, + "step": 2141 + }, + { + "epoch": 0.19406128966501324, + "grad_norm": 0.7140002846717834, + "learning_rate": 0.00018708995348275237, + "loss": 2.849, + "step": 2142 + }, + { + "epoch": 0.19415188783946002, + "grad_norm": 0.7427158951759338, + "learning_rate": 0.00018708391228176163, + "loss": 3.0279, + "step": 2143 + }, + { + "epoch": 0.1942424860139068, + "grad_norm": 0.7194265127182007, + "learning_rate": 0.00018707787108077087, + "loss": 2.872, + "step": 2144 + }, + { + "epoch": 0.1943330841883536, + "grad_norm": 0.7027137875556946, + "learning_rate": 0.0001870718298797801, + "loss": 3.0193, + "step": 2145 + }, + { + "epoch": 0.1944236823628004, + "grad_norm": 0.7182713150978088, + "learning_rate": 0.00018706578867878934, + "loss": 2.6634, + "step": 2146 + }, + { + "epoch": 0.19451428053724718, + "grad_norm": 0.6570948362350464, + "learning_rate": 0.0001870597474777986, + "loss": 2.7196, + "step": 2147 + }, + { + "epoch": 0.19460487871169396, + "grad_norm": 0.6986480355262756, + "learning_rate": 0.00018705370627680783, + "loss": 2.6835, + "step": 2148 + }, + { + "epoch": 0.19469547688614075, + "grad_norm": 0.7203723788261414, + "learning_rate": 0.0001870476650758171, + "loss": 2.9498, + "step": 2149 + }, + { + "epoch": 0.19478607506058754, + "grad_norm": 0.6401680707931519, + "learning_rate": 0.00018704162387482633, + "loss": 2.3353, + "step": 2150 + }, + { + "epoch": 0.19487667323503433, + "grad_norm": 0.693153440952301, + "learning_rate": 0.00018703558267383556, + "loss": 2.7539, + "step": 2151 + }, + { + "epoch": 0.19496727140948109, + "grad_norm": 0.7039570212364197, + "learning_rate": 0.00018702954147284482, + "loss": 2.662, + "step": 2152 + }, + { + "epoch": 0.19505786958392787, + "grad_norm": 0.5903106331825256, + "learning_rate": 0.00018702350027185405, + "loss": 2.2876, + "step": 2153 + }, + { + "epoch": 0.19514846775837466, + "grad_norm": 0.6959506273269653, + "learning_rate": 0.0001870174590708633, + "loss": 2.9737, + "step": 2154 + }, + { + "epoch": 0.19523906593282145, + "grad_norm": 0.7880547642707825, + "learning_rate": 0.00018701141786987252, + "loss": 2.7923, + "step": 2155 + }, + { + "epoch": 0.19532966410726824, + "grad_norm": 0.6975780129432678, + "learning_rate": 0.00018700537666888178, + "loss": 2.7671, + "step": 2156 + }, + { + "epoch": 0.19542026228171502, + "grad_norm": 0.7563682198524475, + "learning_rate": 0.00018699933546789105, + "loss": 2.9918, + "step": 2157 + }, + { + "epoch": 0.1955108604561618, + "grad_norm": 0.6791977882385254, + "learning_rate": 0.00018699329426690025, + "loss": 2.5061, + "step": 2158 + }, + { + "epoch": 0.1956014586306086, + "grad_norm": 0.6693214178085327, + "learning_rate": 0.0001869872530659095, + "loss": 2.9698, + "step": 2159 + }, + { + "epoch": 0.1956920568050554, + "grad_norm": 0.7112331986427307, + "learning_rate": 0.00018698121186491875, + "loss": 2.8723, + "step": 2160 + }, + { + "epoch": 0.19578265497950217, + "grad_norm": 0.7241517305374146, + "learning_rate": 0.000186975170663928, + "loss": 2.8094, + "step": 2161 + }, + { + "epoch": 0.19587325315394893, + "grad_norm": 0.8329454064369202, + "learning_rate": 0.00018696912946293724, + "loss": 2.7938, + "step": 2162 + }, + { + "epoch": 0.19596385132839572, + "grad_norm": 0.7106068730354309, + "learning_rate": 0.00018696308826194648, + "loss": 3.0074, + "step": 2163 + }, + { + "epoch": 0.1960544495028425, + "grad_norm": 0.7780077457427979, + "learning_rate": 0.00018695704706095574, + "loss": 2.8463, + "step": 2164 + }, + { + "epoch": 0.1961450476772893, + "grad_norm": 0.6832414269447327, + "learning_rate": 0.00018695100585996497, + "loss": 2.8784, + "step": 2165 + }, + { + "epoch": 0.19623564585173608, + "grad_norm": 0.6883964538574219, + "learning_rate": 0.00018694496465897423, + "loss": 2.9045, + "step": 2166 + }, + { + "epoch": 0.19632624402618287, + "grad_norm": 0.7001375555992126, + "learning_rate": 0.00018693892345798344, + "loss": 2.6629, + "step": 2167 + }, + { + "epoch": 0.19641684220062966, + "grad_norm": 0.6536831855773926, + "learning_rate": 0.0001869328822569927, + "loss": 2.8196, + "step": 2168 + }, + { + "epoch": 0.19650744037507645, + "grad_norm": 0.7096866965293884, + "learning_rate": 0.00018692684105600194, + "loss": 2.7913, + "step": 2169 + }, + { + "epoch": 0.19659803854952324, + "grad_norm": 0.589722752571106, + "learning_rate": 0.0001869207998550112, + "loss": 2.1106, + "step": 2170 + }, + { + "epoch": 0.19668863672397002, + "grad_norm": 0.6198018193244934, + "learning_rate": 0.00018691475865402043, + "loss": 2.0487, + "step": 2171 + }, + { + "epoch": 0.1967792348984168, + "grad_norm": 0.6564381122589111, + "learning_rate": 0.00018690871745302966, + "loss": 2.6886, + "step": 2172 + }, + { + "epoch": 0.19686983307286357, + "grad_norm": 0.7359564304351807, + "learning_rate": 0.00018690267625203893, + "loss": 2.9047, + "step": 2173 + }, + { + "epoch": 0.19696043124731036, + "grad_norm": 0.6538374423980713, + "learning_rate": 0.00018689663505104816, + "loss": 2.7228, + "step": 2174 + }, + { + "epoch": 0.19705102942175715, + "grad_norm": 0.6908231973648071, + "learning_rate": 0.0001868905938500574, + "loss": 3.1244, + "step": 2175 + }, + { + "epoch": 0.19714162759620393, + "grad_norm": 0.7029678225517273, + "learning_rate": 0.00018688455264906663, + "loss": 3.0226, + "step": 2176 + }, + { + "epoch": 0.19723222577065072, + "grad_norm": 0.7387351393699646, + "learning_rate": 0.0001868785114480759, + "loss": 2.665, + "step": 2177 + }, + { + "epoch": 0.1973228239450975, + "grad_norm": 0.7225033044815063, + "learning_rate": 0.00018687247024708512, + "loss": 2.6807, + "step": 2178 + }, + { + "epoch": 0.1974134221195443, + "grad_norm": 0.6528799533843994, + "learning_rate": 0.00018686642904609438, + "loss": 2.7382, + "step": 2179 + }, + { + "epoch": 0.19750402029399108, + "grad_norm": 0.6146752834320068, + "learning_rate": 0.00018686038784510362, + "loss": 2.2291, + "step": 2180 + }, + { + "epoch": 0.19759461846843787, + "grad_norm": 0.7079707980155945, + "learning_rate": 0.00018685434664411285, + "loss": 2.6975, + "step": 2181 + }, + { + "epoch": 0.19768521664288466, + "grad_norm": 0.7643851637840271, + "learning_rate": 0.0001868483054431221, + "loss": 2.826, + "step": 2182 + }, + { + "epoch": 0.19777581481733142, + "grad_norm": 0.6733944416046143, + "learning_rate": 0.00018684226424213135, + "loss": 2.8126, + "step": 2183 + }, + { + "epoch": 0.1978664129917782, + "grad_norm": 0.6463570594787598, + "learning_rate": 0.00018683622304114058, + "loss": 2.8972, + "step": 2184 + }, + { + "epoch": 0.197957011166225, + "grad_norm": 0.6763724684715271, + "learning_rate": 0.00018683018184014982, + "loss": 2.7815, + "step": 2185 + }, + { + "epoch": 0.19804760934067178, + "grad_norm": 0.6665053367614746, + "learning_rate": 0.00018682414063915908, + "loss": 2.744, + "step": 2186 + }, + { + "epoch": 0.19813820751511857, + "grad_norm": 0.6926366090774536, + "learning_rate": 0.00018681809943816834, + "loss": 3.0112, + "step": 2187 + }, + { + "epoch": 0.19822880568956536, + "grad_norm": 0.6765273809432983, + "learning_rate": 0.00018681205823717754, + "loss": 2.6903, + "step": 2188 + }, + { + "epoch": 0.19831940386401214, + "grad_norm": 0.7311291694641113, + "learning_rate": 0.0001868060170361868, + "loss": 3.116, + "step": 2189 + }, + { + "epoch": 0.19841000203845893, + "grad_norm": 0.6974352598190308, + "learning_rate": 0.00018679997583519604, + "loss": 3.0541, + "step": 2190 + }, + { + "epoch": 0.19850060021290572, + "grad_norm": 0.6798532605171204, + "learning_rate": 0.0001867939346342053, + "loss": 2.859, + "step": 2191 + }, + { + "epoch": 0.1985911983873525, + "grad_norm": 0.6718156933784485, + "learning_rate": 0.00018678789343321454, + "loss": 2.7264, + "step": 2192 + }, + { + "epoch": 0.19868179656179927, + "grad_norm": 0.7218011021614075, + "learning_rate": 0.00018678185223222377, + "loss": 2.9486, + "step": 2193 + }, + { + "epoch": 0.19877239473624606, + "grad_norm": 0.7103990316390991, + "learning_rate": 0.00018677581103123303, + "loss": 2.5679, + "step": 2194 + }, + { + "epoch": 0.19886299291069284, + "grad_norm": 0.6878862380981445, + "learning_rate": 0.00018676976983024226, + "loss": 3.0283, + "step": 2195 + }, + { + "epoch": 0.19895359108513963, + "grad_norm": 0.7009598612785339, + "learning_rate": 0.0001867637286292515, + "loss": 3.0558, + "step": 2196 + }, + { + "epoch": 0.19904418925958642, + "grad_norm": 0.7059707641601562, + "learning_rate": 0.00018675768742826073, + "loss": 3.1375, + "step": 2197 + }, + { + "epoch": 0.1991347874340332, + "grad_norm": 0.6683478355407715, + "learning_rate": 0.00018675164622727, + "loss": 2.695, + "step": 2198 + }, + { + "epoch": 0.19922538560848, + "grad_norm": 0.6847866177558899, + "learning_rate": 0.00018674560502627923, + "loss": 2.8483, + "step": 2199 + }, + { + "epoch": 0.19931598378292678, + "grad_norm": 0.7443883419036865, + "learning_rate": 0.0001867395638252885, + "loss": 2.8026, + "step": 2200 + }, + { + "epoch": 0.19940658195737357, + "grad_norm": 0.7182314395904541, + "learning_rate": 0.00018673352262429772, + "loss": 2.7533, + "step": 2201 + }, + { + "epoch": 0.19949718013182036, + "grad_norm": 0.6398686766624451, + "learning_rate": 0.00018672748142330696, + "loss": 2.2388, + "step": 2202 + }, + { + "epoch": 0.19958777830626712, + "grad_norm": 0.6786192059516907, + "learning_rate": 0.00018672144022231622, + "loss": 2.8527, + "step": 2203 + }, + { + "epoch": 0.1996783764807139, + "grad_norm": 0.6693204641342163, + "learning_rate": 0.00018671539902132545, + "loss": 3.1487, + "step": 2204 + }, + { + "epoch": 0.1997689746551607, + "grad_norm": 0.6439124345779419, + "learning_rate": 0.00018670935782033469, + "loss": 2.5219, + "step": 2205 + }, + { + "epoch": 0.19985957282960748, + "grad_norm": 0.7165475487709045, + "learning_rate": 0.00018670331661934392, + "loss": 2.7448, + "step": 2206 + }, + { + "epoch": 0.19995017100405427, + "grad_norm": 0.7139800786972046, + "learning_rate": 0.00018669727541835318, + "loss": 2.7525, + "step": 2207 + }, + { + "epoch": 0.20004076917850105, + "grad_norm": 0.7316243052482605, + "learning_rate": 0.00018669123421736242, + "loss": 2.7373, + "step": 2208 + }, + { + "epoch": 0.20013136735294784, + "grad_norm": 0.7048583030700684, + "learning_rate": 0.00018668519301637165, + "loss": 2.6235, + "step": 2209 + }, + { + "epoch": 0.20022196552739463, + "grad_norm": 0.7227122783660889, + "learning_rate": 0.0001866791518153809, + "loss": 2.7812, + "step": 2210 + }, + { + "epoch": 0.20031256370184142, + "grad_norm": 0.6635390520095825, + "learning_rate": 0.00018667311061439014, + "loss": 2.1828, + "step": 2211 + }, + { + "epoch": 0.2004031618762882, + "grad_norm": 0.7045010328292847, + "learning_rate": 0.0001866670694133994, + "loss": 2.9437, + "step": 2212 + }, + { + "epoch": 0.20049376005073496, + "grad_norm": 0.7670987844467163, + "learning_rate": 0.00018666102821240864, + "loss": 2.8469, + "step": 2213 + }, + { + "epoch": 0.20058435822518175, + "grad_norm": 0.7147826552391052, + "learning_rate": 0.00018665498701141787, + "loss": 2.9181, + "step": 2214 + }, + { + "epoch": 0.20067495639962854, + "grad_norm": 0.7333393692970276, + "learning_rate": 0.0001866489458104271, + "loss": 2.8024, + "step": 2215 + }, + { + "epoch": 0.20076555457407533, + "grad_norm": 0.728905975818634, + "learning_rate": 0.00018664290460943637, + "loss": 2.7034, + "step": 2216 + }, + { + "epoch": 0.20085615274852212, + "grad_norm": 0.7115161418914795, + "learning_rate": 0.0001866368634084456, + "loss": 2.9227, + "step": 2217 + }, + { + "epoch": 0.2009467509229689, + "grad_norm": 0.7485663890838623, + "learning_rate": 0.00018663082220745484, + "loss": 2.9779, + "step": 2218 + }, + { + "epoch": 0.2010373490974157, + "grad_norm": 0.7670645713806152, + "learning_rate": 0.0001866247810064641, + "loss": 2.8733, + "step": 2219 + }, + { + "epoch": 0.20112794727186248, + "grad_norm": 0.6747551560401917, + "learning_rate": 0.00018661873980547333, + "loss": 2.9621, + "step": 2220 + }, + { + "epoch": 0.20121854544630927, + "grad_norm": 0.7023265957832336, + "learning_rate": 0.0001866126986044826, + "loss": 2.8768, + "step": 2221 + }, + { + "epoch": 0.20130914362075605, + "grad_norm": 0.6782286763191223, + "learning_rate": 0.0001866066574034918, + "loss": 2.9066, + "step": 2222 + }, + { + "epoch": 0.2013997417952028, + "grad_norm": 0.6703528761863708, + "learning_rate": 0.00018660061620250106, + "loss": 2.6481, + "step": 2223 + }, + { + "epoch": 0.2014903399696496, + "grad_norm": 0.6956326365470886, + "learning_rate": 0.00018659457500151032, + "loss": 2.9437, + "step": 2224 + }, + { + "epoch": 0.2015809381440964, + "grad_norm": 0.7328132390975952, + "learning_rate": 0.00018658853380051956, + "loss": 2.7684, + "step": 2225 + }, + { + "epoch": 0.20167153631854318, + "grad_norm": 0.6678996086120605, + "learning_rate": 0.0001865824925995288, + "loss": 2.6943, + "step": 2226 + }, + { + "epoch": 0.20176213449298996, + "grad_norm": 0.7059155106544495, + "learning_rate": 0.00018657645139853803, + "loss": 2.8717, + "step": 2227 + }, + { + "epoch": 0.20185273266743675, + "grad_norm": 0.7211463451385498, + "learning_rate": 0.00018657041019754729, + "loss": 2.6819, + "step": 2228 + }, + { + "epoch": 0.20194333084188354, + "grad_norm": 0.6877346634864807, + "learning_rate": 0.00018656436899655652, + "loss": 2.875, + "step": 2229 + }, + { + "epoch": 0.20203392901633033, + "grad_norm": 0.6628211140632629, + "learning_rate": 0.00018655832779556578, + "loss": 2.9564, + "step": 2230 + }, + { + "epoch": 0.20212452719077711, + "grad_norm": 0.7357572317123413, + "learning_rate": 0.00018655228659457502, + "loss": 2.8506, + "step": 2231 + }, + { + "epoch": 0.2022151253652239, + "grad_norm": 0.7483315467834473, + "learning_rate": 0.00018654624539358425, + "loss": 2.9388, + "step": 2232 + }, + { + "epoch": 0.20230572353967066, + "grad_norm": 0.7011100649833679, + "learning_rate": 0.0001865402041925935, + "loss": 2.832, + "step": 2233 + }, + { + "epoch": 0.20239632171411745, + "grad_norm": 0.6579597592353821, + "learning_rate": 0.00018653416299160274, + "loss": 3.0288, + "step": 2234 + }, + { + "epoch": 0.20248691988856424, + "grad_norm": 0.685224175453186, + "learning_rate": 0.00018652812179061198, + "loss": 3.0238, + "step": 2235 + }, + { + "epoch": 0.20257751806301103, + "grad_norm": 0.7103403806686401, + "learning_rate": 0.0001865220805896212, + "loss": 3.1239, + "step": 2236 + }, + { + "epoch": 0.2026681162374578, + "grad_norm": 0.8136425018310547, + "learning_rate": 0.00018651603938863047, + "loss": 2.8803, + "step": 2237 + }, + { + "epoch": 0.2027587144119046, + "grad_norm": 0.6628795862197876, + "learning_rate": 0.0001865099981876397, + "loss": 2.2983, + "step": 2238 + }, + { + "epoch": 0.2028493125863514, + "grad_norm": 0.729324996471405, + "learning_rate": 0.00018650395698664894, + "loss": 2.7961, + "step": 2239 + }, + { + "epoch": 0.20293991076079818, + "grad_norm": 0.7199239134788513, + "learning_rate": 0.0001864979157856582, + "loss": 2.6461, + "step": 2240 + }, + { + "epoch": 0.20303050893524496, + "grad_norm": 0.690627932548523, + "learning_rate": 0.00018649187458466744, + "loss": 2.7541, + "step": 2241 + }, + { + "epoch": 0.20312110710969175, + "grad_norm": 0.6648432612419128, + "learning_rate": 0.0001864858333836767, + "loss": 2.5225, + "step": 2242 + }, + { + "epoch": 0.2032117052841385, + "grad_norm": 0.7397660613059998, + "learning_rate": 0.00018647979218268593, + "loss": 2.7092, + "step": 2243 + }, + { + "epoch": 0.2033023034585853, + "grad_norm": 0.7920216917991638, + "learning_rate": 0.00018647375098169517, + "loss": 2.7367, + "step": 2244 + }, + { + "epoch": 0.20339290163303209, + "grad_norm": 0.7327650785446167, + "learning_rate": 0.0001864677097807044, + "loss": 2.9385, + "step": 2245 + }, + { + "epoch": 0.20348349980747887, + "grad_norm": 0.7276092767715454, + "learning_rate": 0.00018646166857971366, + "loss": 2.7559, + "step": 2246 + }, + { + "epoch": 0.20357409798192566, + "grad_norm": 0.74410080909729, + "learning_rate": 0.0001864556273787229, + "loss": 2.8371, + "step": 2247 + }, + { + "epoch": 0.20366469615637245, + "grad_norm": 0.6977006793022156, + "learning_rate": 0.00018644958617773213, + "loss": 2.8239, + "step": 2248 + }, + { + "epoch": 0.20375529433081924, + "grad_norm": 0.6810033917427063, + "learning_rate": 0.0001864435449767414, + "loss": 3.0024, + "step": 2249 + }, + { + "epoch": 0.20384589250526602, + "grad_norm": 0.704293966293335, + "learning_rate": 0.00018643750377575063, + "loss": 3.0987, + "step": 2250 + }, + { + "epoch": 0.2039364906797128, + "grad_norm": 0.7193096876144409, + "learning_rate": 0.00018643146257475989, + "loss": 2.9362, + "step": 2251 + }, + { + "epoch": 0.2040270888541596, + "grad_norm": 0.7812249660491943, + "learning_rate": 0.0001864254213737691, + "loss": 2.7727, + "step": 2252 + }, + { + "epoch": 0.2041176870286064, + "grad_norm": 0.7113869786262512, + "learning_rate": 0.00018641938017277835, + "loss": 3.1215, + "step": 2253 + }, + { + "epoch": 0.20420828520305315, + "grad_norm": 0.7073994278907776, + "learning_rate": 0.00018641333897178762, + "loss": 2.9194, + "step": 2254 + }, + { + "epoch": 0.20429888337749993, + "grad_norm": 0.7129987478256226, + "learning_rate": 0.00018640729777079685, + "loss": 2.7956, + "step": 2255 + }, + { + "epoch": 0.20438948155194672, + "grad_norm": 0.6862993240356445, + "learning_rate": 0.00018640125656980608, + "loss": 2.7387, + "step": 2256 + }, + { + "epoch": 0.2044800797263935, + "grad_norm": 0.786662220954895, + "learning_rate": 0.00018639521536881532, + "loss": 2.0718, + "step": 2257 + }, + { + "epoch": 0.2045706779008403, + "grad_norm": 0.665635883808136, + "learning_rate": 0.00018638917416782458, + "loss": 2.6017, + "step": 2258 + }, + { + "epoch": 0.20466127607528709, + "grad_norm": 0.6886976957321167, + "learning_rate": 0.0001863831329668338, + "loss": 2.9317, + "step": 2259 + }, + { + "epoch": 0.20475187424973387, + "grad_norm": 0.771779477596283, + "learning_rate": 0.00018637709176584305, + "loss": 3.0695, + "step": 2260 + }, + { + "epoch": 0.20484247242418066, + "grad_norm": 0.7322935461997986, + "learning_rate": 0.0001863710505648523, + "loss": 2.7409, + "step": 2261 + }, + { + "epoch": 0.20493307059862745, + "grad_norm": 0.7993381023406982, + "learning_rate": 0.00018636500936386154, + "loss": 2.8231, + "step": 2262 + }, + { + "epoch": 0.20502366877307424, + "grad_norm": 0.7425085306167603, + "learning_rate": 0.0001863589681628708, + "loss": 2.7881, + "step": 2263 + }, + { + "epoch": 0.205114266947521, + "grad_norm": 0.8526290655136108, + "learning_rate": 0.00018635292696188004, + "loss": 2.9662, + "step": 2264 + }, + { + "epoch": 0.20520486512196778, + "grad_norm": 0.6747987866401672, + "learning_rate": 0.00018634688576088927, + "loss": 2.6527, + "step": 2265 + }, + { + "epoch": 0.20529546329641457, + "grad_norm": 0.8626695871353149, + "learning_rate": 0.0001863408445598985, + "loss": 2.9923, + "step": 2266 + }, + { + "epoch": 0.20538606147086136, + "grad_norm": 0.7418804168701172, + "learning_rate": 0.00018633480335890777, + "loss": 3.0871, + "step": 2267 + }, + { + "epoch": 0.20547665964530815, + "grad_norm": 0.7161554098129272, + "learning_rate": 0.000186328762157917, + "loss": 2.9983, + "step": 2268 + }, + { + "epoch": 0.20556725781975493, + "grad_norm": 0.7124241590499878, + "learning_rate": 0.00018632272095692623, + "loss": 2.6585, + "step": 2269 + }, + { + "epoch": 0.20565785599420172, + "grad_norm": 0.6569780111312866, + "learning_rate": 0.0001863166797559355, + "loss": 2.2381, + "step": 2270 + }, + { + "epoch": 0.2057484541686485, + "grad_norm": 0.7441844940185547, + "learning_rate": 0.00018631063855494473, + "loss": 2.5686, + "step": 2271 + }, + { + "epoch": 0.2058390523430953, + "grad_norm": 0.743302583694458, + "learning_rate": 0.000186304597353954, + "loss": 2.818, + "step": 2272 + }, + { + "epoch": 0.20592965051754208, + "grad_norm": 0.6518080830574036, + "learning_rate": 0.0001862985561529632, + "loss": 2.3228, + "step": 2273 + }, + { + "epoch": 0.20602024869198884, + "grad_norm": 0.7116888165473938, + "learning_rate": 0.00018629251495197246, + "loss": 2.7792, + "step": 2274 + }, + { + "epoch": 0.20611084686643563, + "grad_norm": 0.6975538730621338, + "learning_rate": 0.0001862864737509817, + "loss": 2.6832, + "step": 2275 + }, + { + "epoch": 0.20620144504088242, + "grad_norm": 0.6406773924827576, + "learning_rate": 0.00018628043254999095, + "loss": 2.0672, + "step": 2276 + }, + { + "epoch": 0.2062920432153292, + "grad_norm": 0.6879306435585022, + "learning_rate": 0.0001862743913490002, + "loss": 2.7898, + "step": 2277 + }, + { + "epoch": 0.206382641389776, + "grad_norm": 0.5943479537963867, + "learning_rate": 0.00018626835014800942, + "loss": 2.1561, + "step": 2278 + }, + { + "epoch": 0.20647323956422278, + "grad_norm": 0.7427793145179749, + "learning_rate": 0.00018626230894701868, + "loss": 2.836, + "step": 2279 + }, + { + "epoch": 0.20656383773866957, + "grad_norm": 0.6345300674438477, + "learning_rate": 0.00018625626774602792, + "loss": 2.2242, + "step": 2280 + }, + { + "epoch": 0.20665443591311636, + "grad_norm": 0.7344943881034851, + "learning_rate": 0.00018625022654503715, + "loss": 2.9947, + "step": 2281 + }, + { + "epoch": 0.20674503408756315, + "grad_norm": 0.7529107928276062, + "learning_rate": 0.00018624418534404639, + "loss": 2.8463, + "step": 2282 + }, + { + "epoch": 0.20683563226200993, + "grad_norm": 0.761718213558197, + "learning_rate": 0.00018623814414305565, + "loss": 2.8221, + "step": 2283 + }, + { + "epoch": 0.2069262304364567, + "grad_norm": 0.7415899038314819, + "learning_rate": 0.0001862321029420649, + "loss": 2.9513, + "step": 2284 + }, + { + "epoch": 0.20701682861090348, + "grad_norm": 1.3653355836868286, + "learning_rate": 0.00018622606174107414, + "loss": 2.7994, + "step": 2285 + }, + { + "epoch": 0.20710742678535027, + "grad_norm": 0.6470566391944885, + "learning_rate": 0.00018622002054008338, + "loss": 2.8375, + "step": 2286 + }, + { + "epoch": 0.20719802495979706, + "grad_norm": 0.6176044344902039, + "learning_rate": 0.0001862139793390926, + "loss": 1.9679, + "step": 2287 + }, + { + "epoch": 0.20728862313424384, + "grad_norm": 0.6774952411651611, + "learning_rate": 0.00018620793813810187, + "loss": 2.6394, + "step": 2288 + }, + { + "epoch": 0.20737922130869063, + "grad_norm": 0.7421861886978149, + "learning_rate": 0.0001862018969371111, + "loss": 3.0012, + "step": 2289 + }, + { + "epoch": 0.20746981948313742, + "grad_norm": 0.7530388236045837, + "learning_rate": 0.00018619585573612034, + "loss": 2.8516, + "step": 2290 + }, + { + "epoch": 0.2075604176575842, + "grad_norm": 0.6845012903213501, + "learning_rate": 0.0001861898145351296, + "loss": 2.9703, + "step": 2291 + }, + { + "epoch": 0.207651015832031, + "grad_norm": 0.6732359528541565, + "learning_rate": 0.00018618377333413883, + "loss": 2.9007, + "step": 2292 + }, + { + "epoch": 0.20774161400647778, + "grad_norm": 0.7928762435913086, + "learning_rate": 0.0001861777321331481, + "loss": 2.2554, + "step": 2293 + }, + { + "epoch": 0.20783221218092454, + "grad_norm": 0.7287194132804871, + "learning_rate": 0.0001861716909321573, + "loss": 2.702, + "step": 2294 + }, + { + "epoch": 0.20792281035537133, + "grad_norm": 0.7180762887001038, + "learning_rate": 0.00018616564973116656, + "loss": 2.8891, + "step": 2295 + }, + { + "epoch": 0.20801340852981812, + "grad_norm": 0.6851478815078735, + "learning_rate": 0.0001861596085301758, + "loss": 2.8998, + "step": 2296 + }, + { + "epoch": 0.2081040067042649, + "grad_norm": 0.7193267941474915, + "learning_rate": 0.00018615356732918506, + "loss": 2.621, + "step": 2297 + }, + { + "epoch": 0.2081946048787117, + "grad_norm": 0.7539020776748657, + "learning_rate": 0.0001861475261281943, + "loss": 2.9117, + "step": 2298 + }, + { + "epoch": 0.20828520305315848, + "grad_norm": 0.6550323963165283, + "learning_rate": 0.00018614148492720353, + "loss": 2.6564, + "step": 2299 + }, + { + "epoch": 0.20837580122760527, + "grad_norm": 0.6814616918563843, + "learning_rate": 0.0001861354437262128, + "loss": 2.6275, + "step": 2300 + }, + { + "epoch": 0.20846639940205205, + "grad_norm": 0.7263222932815552, + "learning_rate": 0.00018612940252522202, + "loss": 2.8105, + "step": 2301 + }, + { + "epoch": 0.20855699757649884, + "grad_norm": 0.7202855348587036, + "learning_rate": 0.00018612336132423128, + "loss": 2.943, + "step": 2302 + }, + { + "epoch": 0.20864759575094563, + "grad_norm": 0.7948154211044312, + "learning_rate": 0.0001861173201232405, + "loss": 2.8349, + "step": 2303 + }, + { + "epoch": 0.2087381939253924, + "grad_norm": 0.6502124667167664, + "learning_rate": 0.00018611127892224975, + "loss": 2.7834, + "step": 2304 + }, + { + "epoch": 0.20882879209983918, + "grad_norm": 0.6930100321769714, + "learning_rate": 0.00018610523772125899, + "loss": 2.5021, + "step": 2305 + }, + { + "epoch": 0.20891939027428597, + "grad_norm": 0.7291468977928162, + "learning_rate": 0.00018609919652026825, + "loss": 3.0122, + "step": 2306 + }, + { + "epoch": 0.20900998844873275, + "grad_norm": 0.6094863414764404, + "learning_rate": 0.00018609315531927748, + "loss": 1.9675, + "step": 2307 + }, + { + "epoch": 0.20910058662317954, + "grad_norm": 0.8200351595878601, + "learning_rate": 0.00018608711411828672, + "loss": 2.7112, + "step": 2308 + }, + { + "epoch": 0.20919118479762633, + "grad_norm": 0.7280641794204712, + "learning_rate": 0.00018608107291729598, + "loss": 2.8463, + "step": 2309 + }, + { + "epoch": 0.20928178297207312, + "grad_norm": 0.7191116213798523, + "learning_rate": 0.0001860750317163052, + "loss": 2.8801, + "step": 2310 + }, + { + "epoch": 0.2093723811465199, + "grad_norm": 0.6900824904441833, + "learning_rate": 0.00018606899051531444, + "loss": 2.7464, + "step": 2311 + }, + { + "epoch": 0.2094629793209667, + "grad_norm": 0.7646921873092651, + "learning_rate": 0.00018606294931432368, + "loss": 2.2834, + "step": 2312 + }, + { + "epoch": 0.20955357749541348, + "grad_norm": 0.7691820859909058, + "learning_rate": 0.00018605690811333294, + "loss": 2.8987, + "step": 2313 + }, + { + "epoch": 0.20964417566986024, + "grad_norm": 0.7508541941642761, + "learning_rate": 0.0001860508669123422, + "loss": 2.9889, + "step": 2314 + }, + { + "epoch": 0.20973477384430703, + "grad_norm": 0.6966472864151001, + "learning_rate": 0.00018604482571135144, + "loss": 2.7584, + "step": 2315 + }, + { + "epoch": 0.20982537201875381, + "grad_norm": 0.7142229080200195, + "learning_rate": 0.00018603878451036067, + "loss": 2.8362, + "step": 2316 + }, + { + "epoch": 0.2099159701932006, + "grad_norm": 0.788076639175415, + "learning_rate": 0.0001860327433093699, + "loss": 2.944, + "step": 2317 + }, + { + "epoch": 0.2100065683676474, + "grad_norm": 0.7717854976654053, + "learning_rate": 0.00018602670210837916, + "loss": 2.8896, + "step": 2318 + }, + { + "epoch": 0.21009716654209418, + "grad_norm": 0.7208637595176697, + "learning_rate": 0.0001860206609073884, + "loss": 3.0492, + "step": 2319 + }, + { + "epoch": 0.21018776471654096, + "grad_norm": 0.7216143608093262, + "learning_rate": 0.00018601461970639763, + "loss": 3.1563, + "step": 2320 + }, + { + "epoch": 0.21027836289098775, + "grad_norm": 0.6920773386955261, + "learning_rate": 0.0001860085785054069, + "loss": 2.7923, + "step": 2321 + }, + { + "epoch": 0.21036896106543454, + "grad_norm": 0.7235106825828552, + "learning_rate": 0.00018600253730441613, + "loss": 2.7149, + "step": 2322 + }, + { + "epoch": 0.21045955923988133, + "grad_norm": 0.7111606597900391, + "learning_rate": 0.0001859964961034254, + "loss": 2.7655, + "step": 2323 + }, + { + "epoch": 0.21055015741432812, + "grad_norm": 0.6610028743743896, + "learning_rate": 0.0001859904549024346, + "loss": 2.2477, + "step": 2324 + }, + { + "epoch": 0.21064075558877488, + "grad_norm": 0.7166464328765869, + "learning_rate": 0.00018598441370144386, + "loss": 2.8423, + "step": 2325 + }, + { + "epoch": 0.21073135376322166, + "grad_norm": 0.7462083697319031, + "learning_rate": 0.0001859783725004531, + "loss": 2.8379, + "step": 2326 + }, + { + "epoch": 0.21082195193766845, + "grad_norm": 0.6893724799156189, + "learning_rate": 0.00018597233129946235, + "loss": 2.7786, + "step": 2327 + }, + { + "epoch": 0.21091255011211524, + "grad_norm": 0.8347352147102356, + "learning_rate": 0.00018596629009847159, + "loss": 2.4888, + "step": 2328 + }, + { + "epoch": 0.21100314828656203, + "grad_norm": 0.7819471955299377, + "learning_rate": 0.00018596024889748082, + "loss": 3.0129, + "step": 2329 + }, + { + "epoch": 0.2110937464610088, + "grad_norm": 0.8158562183380127, + "learning_rate": 0.00018595420769649008, + "loss": 2.9427, + "step": 2330 + }, + { + "epoch": 0.2111843446354556, + "grad_norm": 0.6807525157928467, + "learning_rate": 0.00018594816649549932, + "loss": 2.908, + "step": 2331 + }, + { + "epoch": 0.2112749428099024, + "grad_norm": 0.6893100142478943, + "learning_rate": 0.00018594212529450855, + "loss": 2.6351, + "step": 2332 + }, + { + "epoch": 0.21136554098434918, + "grad_norm": 0.7755491137504578, + "learning_rate": 0.00018593608409351778, + "loss": 2.8251, + "step": 2333 + }, + { + "epoch": 0.21145613915879596, + "grad_norm": 0.7163729071617126, + "learning_rate": 0.00018593004289252704, + "loss": 2.612, + "step": 2334 + }, + { + "epoch": 0.21154673733324272, + "grad_norm": 0.7290970683097839, + "learning_rate": 0.00018592400169153628, + "loss": 3.0692, + "step": 2335 + }, + { + "epoch": 0.2116373355076895, + "grad_norm": 0.7312525510787964, + "learning_rate": 0.00018591796049054554, + "loss": 2.7929, + "step": 2336 + }, + { + "epoch": 0.2117279336821363, + "grad_norm": 0.678330659866333, + "learning_rate": 0.00018591191928955477, + "loss": 2.7477, + "step": 2337 + }, + { + "epoch": 0.2118185318565831, + "grad_norm": 0.6961781978607178, + "learning_rate": 0.000185905878088564, + "loss": 2.9828, + "step": 2338 + }, + { + "epoch": 0.21190913003102987, + "grad_norm": 0.7575347423553467, + "learning_rate": 0.00018589983688757327, + "loss": 3.0418, + "step": 2339 + }, + { + "epoch": 0.21199972820547666, + "grad_norm": 0.7654361724853516, + "learning_rate": 0.0001858937956865825, + "loss": 2.98, + "step": 2340 + }, + { + "epoch": 0.21209032637992345, + "grad_norm": 0.7117661237716675, + "learning_rate": 0.00018588775448559174, + "loss": 2.8118, + "step": 2341 + }, + { + "epoch": 0.21218092455437024, + "grad_norm": 0.7075299620628357, + "learning_rate": 0.00018588171328460097, + "loss": 3.0275, + "step": 2342 + }, + { + "epoch": 0.21227152272881702, + "grad_norm": 0.7795016169548035, + "learning_rate": 0.00018587567208361023, + "loss": 2.7659, + "step": 2343 + }, + { + "epoch": 0.2123621209032638, + "grad_norm": 0.7264392375946045, + "learning_rate": 0.0001858696308826195, + "loss": 2.6911, + "step": 2344 + }, + { + "epoch": 0.21245271907771057, + "grad_norm": 0.7041574716567993, + "learning_rate": 0.0001858635896816287, + "loss": 3.1092, + "step": 2345 + }, + { + "epoch": 0.21254331725215736, + "grad_norm": 0.7590469121932983, + "learning_rate": 0.00018585754848063796, + "loss": 2.9641, + "step": 2346 + }, + { + "epoch": 0.21263391542660415, + "grad_norm": 0.7375516295433044, + "learning_rate": 0.0001858515072796472, + "loss": 2.9638, + "step": 2347 + }, + { + "epoch": 0.21272451360105094, + "grad_norm": 0.7147176861763, + "learning_rate": 0.00018584546607865646, + "loss": 2.723, + "step": 2348 + }, + { + "epoch": 0.21281511177549772, + "grad_norm": 0.7119570970535278, + "learning_rate": 0.0001858394248776657, + "loss": 2.1697, + "step": 2349 + }, + { + "epoch": 0.2129057099499445, + "grad_norm": 0.688774049282074, + "learning_rate": 0.00018583338367667493, + "loss": 2.9937, + "step": 2350 + }, + { + "epoch": 0.2129963081243913, + "grad_norm": 0.6837130784988403, + "learning_rate": 0.00018582734247568419, + "loss": 2.7906, + "step": 2351 + }, + { + "epoch": 0.21308690629883809, + "grad_norm": 0.744667112827301, + "learning_rate": 0.00018582130127469342, + "loss": 3.1072, + "step": 2352 + }, + { + "epoch": 0.21317750447328487, + "grad_norm": 0.737325131893158, + "learning_rate": 0.00018581526007370268, + "loss": 2.9385, + "step": 2353 + }, + { + "epoch": 0.21326810264773166, + "grad_norm": 0.7877694964408875, + "learning_rate": 0.0001858092188727119, + "loss": 2.8857, + "step": 2354 + }, + { + "epoch": 0.21335870082217842, + "grad_norm": 0.7134546637535095, + "learning_rate": 0.00018580317767172115, + "loss": 2.9955, + "step": 2355 + }, + { + "epoch": 0.2134492989966252, + "grad_norm": 0.7007724642753601, + "learning_rate": 0.00018579713647073038, + "loss": 2.8305, + "step": 2356 + }, + { + "epoch": 0.213539897171072, + "grad_norm": 0.7088493704795837, + "learning_rate": 0.00018579109526973964, + "loss": 2.7976, + "step": 2357 + }, + { + "epoch": 0.21363049534551878, + "grad_norm": 0.5251830220222473, + "learning_rate": 0.00018578505406874888, + "loss": 1.3032, + "step": 2358 + }, + { + "epoch": 0.21372109351996557, + "grad_norm": 0.6887710094451904, + "learning_rate": 0.0001857790128677581, + "loss": 2.941, + "step": 2359 + }, + { + "epoch": 0.21381169169441236, + "grad_norm": 0.7250288724899292, + "learning_rate": 0.00018577297166676737, + "loss": 2.9275, + "step": 2360 + }, + { + "epoch": 0.21390228986885915, + "grad_norm": 0.6789807081222534, + "learning_rate": 0.0001857669304657766, + "loss": 2.3735, + "step": 2361 + }, + { + "epoch": 0.21399288804330593, + "grad_norm": 0.7349227070808411, + "learning_rate": 0.00018576088926478584, + "loss": 2.7832, + "step": 2362 + }, + { + "epoch": 0.21408348621775272, + "grad_norm": 0.7498029470443726, + "learning_rate": 0.00018575484806379508, + "loss": 2.932, + "step": 2363 + }, + { + "epoch": 0.2141740843921995, + "grad_norm": 0.7350810766220093, + "learning_rate": 0.00018574880686280434, + "loss": 3.1836, + "step": 2364 + }, + { + "epoch": 0.21426468256664627, + "grad_norm": 0.6792356371879578, + "learning_rate": 0.00018574276566181357, + "loss": 2.4749, + "step": 2365 + }, + { + "epoch": 0.21435528074109306, + "grad_norm": 0.73301100730896, + "learning_rate": 0.00018573672446082283, + "loss": 2.827, + "step": 2366 + }, + { + "epoch": 0.21444587891553984, + "grad_norm": 0.7460314631462097, + "learning_rate": 0.00018573068325983207, + "loss": 2.9719, + "step": 2367 + }, + { + "epoch": 0.21453647708998663, + "grad_norm": 0.7572743892669678, + "learning_rate": 0.0001857246420588413, + "loss": 2.8107, + "step": 2368 + }, + { + "epoch": 0.21462707526443342, + "grad_norm": 0.7874836921691895, + "learning_rate": 0.00018571860085785056, + "loss": 2.4945, + "step": 2369 + }, + { + "epoch": 0.2147176734388802, + "grad_norm": 0.6679650545120239, + "learning_rate": 0.0001857125596568598, + "loss": 2.8864, + "step": 2370 + }, + { + "epoch": 0.214808271613327, + "grad_norm": 0.746231198310852, + "learning_rate": 0.00018570651845586903, + "loss": 2.6989, + "step": 2371 + }, + { + "epoch": 0.21489886978777378, + "grad_norm": 0.6171021461486816, + "learning_rate": 0.00018570047725487826, + "loss": 1.8212, + "step": 2372 + }, + { + "epoch": 0.21498946796222057, + "grad_norm": 0.7179608941078186, + "learning_rate": 0.00018569443605388753, + "loss": 2.9713, + "step": 2373 + }, + { + "epoch": 0.21508006613666736, + "grad_norm": 0.7343255877494812, + "learning_rate": 0.00018568839485289679, + "loss": 2.9826, + "step": 2374 + }, + { + "epoch": 0.21517066431111412, + "grad_norm": 0.6862093806266785, + "learning_rate": 0.000185682353651906, + "loss": 2.715, + "step": 2375 + }, + { + "epoch": 0.2152612624855609, + "grad_norm": 0.6825699806213379, + "learning_rate": 0.00018567631245091525, + "loss": 2.7166, + "step": 2376 + }, + { + "epoch": 0.2153518606600077, + "grad_norm": 0.7209079265594482, + "learning_rate": 0.0001856702712499245, + "loss": 3.0073, + "step": 2377 + }, + { + "epoch": 0.21544245883445448, + "grad_norm": 0.6688474416732788, + "learning_rate": 0.00018566423004893375, + "loss": 2.8938, + "step": 2378 + }, + { + "epoch": 0.21553305700890127, + "grad_norm": 0.7045822739601135, + "learning_rate": 0.00018565818884794298, + "loss": 2.708, + "step": 2379 + }, + { + "epoch": 0.21562365518334806, + "grad_norm": 0.8097989559173584, + "learning_rate": 0.00018565214764695222, + "loss": 2.7278, + "step": 2380 + }, + { + "epoch": 0.21571425335779484, + "grad_norm": 0.7088711857795715, + "learning_rate": 0.00018564610644596148, + "loss": 2.879, + "step": 2381 + }, + { + "epoch": 0.21580485153224163, + "grad_norm": 0.7135819792747498, + "learning_rate": 0.0001856400652449707, + "loss": 2.7271, + "step": 2382 + }, + { + "epoch": 0.21589544970668842, + "grad_norm": 0.7231793403625488, + "learning_rate": 0.00018563402404397995, + "loss": 2.7919, + "step": 2383 + }, + { + "epoch": 0.2159860478811352, + "grad_norm": 0.7153540253639221, + "learning_rate": 0.00018562798284298918, + "loss": 2.887, + "step": 2384 + }, + { + "epoch": 0.21607664605558197, + "grad_norm": 0.7557317018508911, + "learning_rate": 0.00018562194164199844, + "loss": 2.8388, + "step": 2385 + }, + { + "epoch": 0.21616724423002875, + "grad_norm": 0.7108250856399536, + "learning_rate": 0.00018561590044100768, + "loss": 2.8766, + "step": 2386 + }, + { + "epoch": 0.21625784240447554, + "grad_norm": 0.7310614585876465, + "learning_rate": 0.00018560985924001694, + "loss": 3.2695, + "step": 2387 + }, + { + "epoch": 0.21634844057892233, + "grad_norm": 0.6934235095977783, + "learning_rate": 0.00018560381803902617, + "loss": 3.0088, + "step": 2388 + }, + { + "epoch": 0.21643903875336912, + "grad_norm": 0.7213839292526245, + "learning_rate": 0.0001855977768380354, + "loss": 2.958, + "step": 2389 + }, + { + "epoch": 0.2165296369278159, + "grad_norm": 0.8543673753738403, + "learning_rate": 0.00018559173563704467, + "loss": 2.6879, + "step": 2390 + }, + { + "epoch": 0.2166202351022627, + "grad_norm": 0.6265016794204712, + "learning_rate": 0.0001855856944360539, + "loss": 2.2561, + "step": 2391 + }, + { + "epoch": 0.21671083327670948, + "grad_norm": 0.7197193503379822, + "learning_rate": 0.00018557965323506313, + "loss": 2.9132, + "step": 2392 + }, + { + "epoch": 0.21680143145115627, + "grad_norm": 0.691511869430542, + "learning_rate": 0.00018557361203407237, + "loss": 2.859, + "step": 2393 + }, + { + "epoch": 0.21689202962560306, + "grad_norm": 0.6785048842430115, + "learning_rate": 0.00018556757083308163, + "loss": 2.8774, + "step": 2394 + }, + { + "epoch": 0.21698262780004982, + "grad_norm": 0.6883125901222229, + "learning_rate": 0.00018556152963209086, + "loss": 2.8045, + "step": 2395 + }, + { + "epoch": 0.2170732259744966, + "grad_norm": 0.6909405589103699, + "learning_rate": 0.0001855554884311001, + "loss": 2.1445, + "step": 2396 + }, + { + "epoch": 0.2171638241489434, + "grad_norm": 0.7032665014266968, + "learning_rate": 0.00018554944723010936, + "loss": 2.9763, + "step": 2397 + }, + { + "epoch": 0.21725442232339018, + "grad_norm": 0.723744809627533, + "learning_rate": 0.0001855434060291186, + "loss": 2.9521, + "step": 2398 + }, + { + "epoch": 0.21734502049783697, + "grad_norm": 0.6810991168022156, + "learning_rate": 0.00018553736482812785, + "loss": 2.8496, + "step": 2399 + }, + { + "epoch": 0.21743561867228375, + "grad_norm": 0.7596989274024963, + "learning_rate": 0.0001855313236271371, + "loss": 2.9148, + "step": 2400 + }, + { + "epoch": 0.21752621684673054, + "grad_norm": 0.7428629994392395, + "learning_rate": 0.00018552528242614632, + "loss": 2.7438, + "step": 2401 + }, + { + "epoch": 0.21761681502117733, + "grad_norm": 0.7521803975105286, + "learning_rate": 0.00018551924122515556, + "loss": 2.8999, + "step": 2402 + }, + { + "epoch": 0.21770741319562412, + "grad_norm": 0.7946981191635132, + "learning_rate": 0.00018551320002416482, + "loss": 2.8523, + "step": 2403 + }, + { + "epoch": 0.2177980113700709, + "grad_norm": 0.6961906552314758, + "learning_rate": 0.00018550715882317405, + "loss": 2.8064, + "step": 2404 + }, + { + "epoch": 0.2178886095445177, + "grad_norm": 0.6857337951660156, + "learning_rate": 0.00018550111762218329, + "loss": 2.9599, + "step": 2405 + }, + { + "epoch": 0.21797920771896445, + "grad_norm": 0.6558403968811035, + "learning_rate": 0.00018549507642119255, + "loss": 2.6004, + "step": 2406 + }, + { + "epoch": 0.21806980589341124, + "grad_norm": 0.7089956998825073, + "learning_rate": 0.00018548903522020178, + "loss": 2.8179, + "step": 2407 + }, + { + "epoch": 0.21816040406785803, + "grad_norm": 0.7465911507606506, + "learning_rate": 0.00018548299401921104, + "loss": 3.0877, + "step": 2408 + }, + { + "epoch": 0.21825100224230481, + "grad_norm": 0.6970751285552979, + "learning_rate": 0.00018547695281822025, + "loss": 2.7086, + "step": 2409 + }, + { + "epoch": 0.2183416004167516, + "grad_norm": 0.6694798469543457, + "learning_rate": 0.0001854709116172295, + "loss": 2.7153, + "step": 2410 + }, + { + "epoch": 0.2184321985911984, + "grad_norm": 0.7468941807746887, + "learning_rate": 0.00018546487041623877, + "loss": 2.8659, + "step": 2411 + }, + { + "epoch": 0.21852279676564518, + "grad_norm": 0.7352182269096375, + "learning_rate": 0.000185458829215248, + "loss": 3.115, + "step": 2412 + }, + { + "epoch": 0.21861339494009197, + "grad_norm": 0.6897891759872437, + "learning_rate": 0.00018545278801425724, + "loss": 2.8603, + "step": 2413 + }, + { + "epoch": 0.21870399311453875, + "grad_norm": 0.7451123595237732, + "learning_rate": 0.00018544674681326647, + "loss": 3.0104, + "step": 2414 + }, + { + "epoch": 0.21879459128898554, + "grad_norm": 0.7395373582839966, + "learning_rate": 0.00018544070561227573, + "loss": 3.0818, + "step": 2415 + }, + { + "epoch": 0.2188851894634323, + "grad_norm": 0.7227393984794617, + "learning_rate": 0.00018543466441128497, + "loss": 2.8902, + "step": 2416 + }, + { + "epoch": 0.2189757876378791, + "grad_norm": 0.6929055452346802, + "learning_rate": 0.00018542862321029423, + "loss": 2.8388, + "step": 2417 + }, + { + "epoch": 0.21906638581232588, + "grad_norm": 0.6829860210418701, + "learning_rate": 0.00018542258200930346, + "loss": 2.5954, + "step": 2418 + }, + { + "epoch": 0.21915698398677266, + "grad_norm": 0.7035228610038757, + "learning_rate": 0.0001854165408083127, + "loss": 2.8031, + "step": 2419 + }, + { + "epoch": 0.21924758216121945, + "grad_norm": 0.7210418581962585, + "learning_rate": 0.00018541049960732196, + "loss": 2.8594, + "step": 2420 + }, + { + "epoch": 0.21933818033566624, + "grad_norm": 0.6199190616607666, + "learning_rate": 0.0001854044584063312, + "loss": 1.9373, + "step": 2421 + }, + { + "epoch": 0.21942877851011303, + "grad_norm": 0.5922998189926147, + "learning_rate": 0.00018539841720534043, + "loss": 2.1526, + "step": 2422 + }, + { + "epoch": 0.2195193766845598, + "grad_norm": 0.7017244696617126, + "learning_rate": 0.00018539237600434966, + "loss": 2.9446, + "step": 2423 + }, + { + "epoch": 0.2196099748590066, + "grad_norm": 0.6813300251960754, + "learning_rate": 0.00018538633480335892, + "loss": 2.9882, + "step": 2424 + }, + { + "epoch": 0.2197005730334534, + "grad_norm": 0.7489095330238342, + "learning_rate": 0.00018538029360236816, + "loss": 3.027, + "step": 2425 + }, + { + "epoch": 0.21979117120790015, + "grad_norm": 0.6930796504020691, + "learning_rate": 0.0001853742524013774, + "loss": 2.7793, + "step": 2426 + }, + { + "epoch": 0.21988176938234694, + "grad_norm": 0.7236893773078918, + "learning_rate": 0.00018536821120038665, + "loss": 2.9201, + "step": 2427 + }, + { + "epoch": 0.21997236755679372, + "grad_norm": 0.6659985780715942, + "learning_rate": 0.00018536216999939589, + "loss": 2.4755, + "step": 2428 + }, + { + "epoch": 0.2200629657312405, + "grad_norm": 0.6997527480125427, + "learning_rate": 0.00018535612879840515, + "loss": 2.6902, + "step": 2429 + }, + { + "epoch": 0.2201535639056873, + "grad_norm": 0.7110966444015503, + "learning_rate": 0.00018535008759741438, + "loss": 2.878, + "step": 2430 + }, + { + "epoch": 0.2202441620801341, + "grad_norm": 0.656507670879364, + "learning_rate": 0.00018534404639642362, + "loss": 2.18, + "step": 2431 + }, + { + "epoch": 0.22033476025458087, + "grad_norm": 0.7007695436477661, + "learning_rate": 0.00018533800519543285, + "loss": 2.9814, + "step": 2432 + }, + { + "epoch": 0.22042535842902766, + "grad_norm": 0.7005746960639954, + "learning_rate": 0.0001853319639944421, + "loss": 2.9483, + "step": 2433 + }, + { + "epoch": 0.22051595660347445, + "grad_norm": 0.6981417536735535, + "learning_rate": 0.00018532592279345134, + "loss": 2.9212, + "step": 2434 + }, + { + "epoch": 0.22060655477792124, + "grad_norm": 0.6950781941413879, + "learning_rate": 0.00018531988159246058, + "loss": 2.7764, + "step": 2435 + }, + { + "epoch": 0.220697152952368, + "grad_norm": 0.8080678582191467, + "learning_rate": 0.00018531384039146984, + "loss": 2.8664, + "step": 2436 + }, + { + "epoch": 0.22078775112681479, + "grad_norm": 0.7865836024284363, + "learning_rate": 0.00018530779919047907, + "loss": 2.8634, + "step": 2437 + }, + { + "epoch": 0.22087834930126157, + "grad_norm": 0.7052735686302185, + "learning_rate": 0.00018530175798948833, + "loss": 2.8705, + "step": 2438 + }, + { + "epoch": 0.22096894747570836, + "grad_norm": 0.7250115871429443, + "learning_rate": 0.00018529571678849754, + "loss": 2.7922, + "step": 2439 + }, + { + "epoch": 0.22105954565015515, + "grad_norm": 0.8686721324920654, + "learning_rate": 0.0001852896755875068, + "loss": 2.9066, + "step": 2440 + }, + { + "epoch": 0.22115014382460194, + "grad_norm": 0.7393770217895508, + "learning_rate": 0.00018528363438651606, + "loss": 2.7768, + "step": 2441 + }, + { + "epoch": 0.22124074199904872, + "grad_norm": 0.7021489143371582, + "learning_rate": 0.0001852775931855253, + "loss": 2.8989, + "step": 2442 + }, + { + "epoch": 0.2213313401734955, + "grad_norm": 0.7005136609077454, + "learning_rate": 0.00018527155198453453, + "loss": 2.6978, + "step": 2443 + }, + { + "epoch": 0.2214219383479423, + "grad_norm": 0.7022773027420044, + "learning_rate": 0.00018526551078354377, + "loss": 2.7932, + "step": 2444 + }, + { + "epoch": 0.2215125365223891, + "grad_norm": 0.6251278519630432, + "learning_rate": 0.00018525946958255303, + "loss": 2.0742, + "step": 2445 + }, + { + "epoch": 0.22160313469683585, + "grad_norm": 0.6868295669555664, + "learning_rate": 0.00018525342838156226, + "loss": 2.8535, + "step": 2446 + }, + { + "epoch": 0.22169373287128263, + "grad_norm": 0.7869505882263184, + "learning_rate": 0.0001852473871805715, + "loss": 2.7631, + "step": 2447 + }, + { + "epoch": 0.22178433104572942, + "grad_norm": 0.7377862930297852, + "learning_rate": 0.00018524134597958076, + "loss": 2.8393, + "step": 2448 + }, + { + "epoch": 0.2218749292201762, + "grad_norm": 0.6932787299156189, + "learning_rate": 0.00018523530477859, + "loss": 2.8583, + "step": 2449 + }, + { + "epoch": 0.221965527394623, + "grad_norm": 0.7164531350135803, + "learning_rate": 0.00018522926357759925, + "loss": 2.8305, + "step": 2450 + }, + { + "epoch": 0.22205612556906978, + "grad_norm": 0.6603787541389465, + "learning_rate": 0.00018522322237660849, + "loss": 2.2986, + "step": 2451 + }, + { + "epoch": 0.22214672374351657, + "grad_norm": 0.7650856375694275, + "learning_rate": 0.00018521718117561772, + "loss": 2.8709, + "step": 2452 + }, + { + "epoch": 0.22223732191796336, + "grad_norm": 0.6231502294540405, + "learning_rate": 0.00018521113997462695, + "loss": 2.0106, + "step": 2453 + }, + { + "epoch": 0.22232792009241015, + "grad_norm": 0.7083138227462769, + "learning_rate": 0.00018520509877363622, + "loss": 2.9195, + "step": 2454 + }, + { + "epoch": 0.22241851826685693, + "grad_norm": 0.7349159121513367, + "learning_rate": 0.00018519905757264545, + "loss": 2.8616, + "step": 2455 + }, + { + "epoch": 0.2225091164413037, + "grad_norm": 0.8161576986312866, + "learning_rate": 0.00018519301637165468, + "loss": 2.6309, + "step": 2456 + }, + { + "epoch": 0.22259971461575048, + "grad_norm": 0.6273926496505737, + "learning_rate": 0.00018518697517066394, + "loss": 2.0599, + "step": 2457 + }, + { + "epoch": 0.22269031279019727, + "grad_norm": 0.677998960018158, + "learning_rate": 0.00018518093396967318, + "loss": 2.4975, + "step": 2458 + }, + { + "epoch": 0.22278091096464406, + "grad_norm": 0.7139729857444763, + "learning_rate": 0.00018517489276868244, + "loss": 2.7898, + "step": 2459 + }, + { + "epoch": 0.22287150913909085, + "grad_norm": 0.7299656271934509, + "learning_rate": 0.00018516885156769165, + "loss": 2.9658, + "step": 2460 + }, + { + "epoch": 0.22296210731353763, + "grad_norm": 0.7989616990089417, + "learning_rate": 0.0001851628103667009, + "loss": 2.9428, + "step": 2461 + }, + { + "epoch": 0.22305270548798442, + "grad_norm": 0.6144214868545532, + "learning_rate": 0.00018515676916571014, + "loss": 2.2372, + "step": 2462 + }, + { + "epoch": 0.2231433036624312, + "grad_norm": 0.7516847848892212, + "learning_rate": 0.0001851507279647194, + "loss": 3.2134, + "step": 2463 + }, + { + "epoch": 0.223233901836878, + "grad_norm": 0.7157848477363586, + "learning_rate": 0.00018514468676372864, + "loss": 2.9963, + "step": 2464 + }, + { + "epoch": 0.22332450001132478, + "grad_norm": 0.7309585809707642, + "learning_rate": 0.00018513864556273787, + "loss": 2.7573, + "step": 2465 + }, + { + "epoch": 0.22341509818577154, + "grad_norm": 0.7218735218048096, + "learning_rate": 0.00018513260436174713, + "loss": 2.7768, + "step": 2466 + }, + { + "epoch": 0.22350569636021833, + "grad_norm": 0.6344693899154663, + "learning_rate": 0.00018512656316075637, + "loss": 2.2651, + "step": 2467 + }, + { + "epoch": 0.22359629453466512, + "grad_norm": 0.7364321947097778, + "learning_rate": 0.0001851205219597656, + "loss": 2.7019, + "step": 2468 + }, + { + "epoch": 0.2236868927091119, + "grad_norm": 0.7139759063720703, + "learning_rate": 0.00018511448075877483, + "loss": 2.8392, + "step": 2469 + }, + { + "epoch": 0.2237774908835587, + "grad_norm": 0.733789324760437, + "learning_rate": 0.0001851084395577841, + "loss": 3.0681, + "step": 2470 + }, + { + "epoch": 0.22386808905800548, + "grad_norm": 0.683150053024292, + "learning_rate": 0.00018510239835679336, + "loss": 2.6989, + "step": 2471 + }, + { + "epoch": 0.22395868723245227, + "grad_norm": 0.6912267804145813, + "learning_rate": 0.0001850963571558026, + "loss": 2.8791, + "step": 2472 + }, + { + "epoch": 0.22404928540689906, + "grad_norm": 0.6961430311203003, + "learning_rate": 0.00018509031595481182, + "loss": 2.7701, + "step": 2473 + }, + { + "epoch": 0.22413988358134584, + "grad_norm": 0.6875411868095398, + "learning_rate": 0.00018508427475382106, + "loss": 2.7626, + "step": 2474 + }, + { + "epoch": 0.22423048175579263, + "grad_norm": 0.7374860048294067, + "learning_rate": 0.00018507823355283032, + "loss": 3.0343, + "step": 2475 + }, + { + "epoch": 0.2243210799302394, + "grad_norm": 0.6956738829612732, + "learning_rate": 0.00018507219235183955, + "loss": 2.8386, + "step": 2476 + }, + { + "epoch": 0.22441167810468618, + "grad_norm": 0.6885871291160583, + "learning_rate": 0.0001850661511508488, + "loss": 2.7444, + "step": 2477 + }, + { + "epoch": 0.22450227627913297, + "grad_norm": 0.7100074291229248, + "learning_rate": 0.00018506010994985805, + "loss": 2.9058, + "step": 2478 + }, + { + "epoch": 0.22459287445357975, + "grad_norm": 0.6182975172996521, + "learning_rate": 0.00018505406874886728, + "loss": 2.0318, + "step": 2479 + }, + { + "epoch": 0.22468347262802654, + "grad_norm": 0.7493990659713745, + "learning_rate": 0.00018504802754787654, + "loss": 3.2417, + "step": 2480 + }, + { + "epoch": 0.22477407080247333, + "grad_norm": 0.7091223001480103, + "learning_rate": 0.00018504198634688575, + "loss": 2.9015, + "step": 2481 + }, + { + "epoch": 0.22486466897692012, + "grad_norm": 0.6739444136619568, + "learning_rate": 0.000185035945145895, + "loss": 2.8391, + "step": 2482 + }, + { + "epoch": 0.2249552671513669, + "grad_norm": 0.7194665670394897, + "learning_rate": 0.00018502990394490425, + "loss": 2.9385, + "step": 2483 + }, + { + "epoch": 0.2250458653258137, + "grad_norm": 0.656641960144043, + "learning_rate": 0.0001850238627439135, + "loss": 2.6638, + "step": 2484 + }, + { + "epoch": 0.22513646350026048, + "grad_norm": 0.6920175552368164, + "learning_rate": 0.00018501782154292274, + "loss": 2.946, + "step": 2485 + }, + { + "epoch": 0.22522706167470727, + "grad_norm": 0.6546301245689392, + "learning_rate": 0.00018501178034193198, + "loss": 2.0571, + "step": 2486 + }, + { + "epoch": 0.22531765984915403, + "grad_norm": 0.7111167311668396, + "learning_rate": 0.00018500573914094124, + "loss": 2.8179, + "step": 2487 + }, + { + "epoch": 0.22540825802360082, + "grad_norm": 0.6729758381843567, + "learning_rate": 0.00018499969793995047, + "loss": 2.859, + "step": 2488 + }, + { + "epoch": 0.2254988561980476, + "grad_norm": 0.7012884616851807, + "learning_rate": 0.00018499365673895973, + "loss": 2.8121, + "step": 2489 + }, + { + "epoch": 0.2255894543724944, + "grad_norm": 0.7229039669036865, + "learning_rate": 0.00018498761553796894, + "loss": 2.7748, + "step": 2490 + }, + { + "epoch": 0.22568005254694118, + "grad_norm": 0.6864594221115112, + "learning_rate": 0.0001849815743369782, + "loss": 2.881, + "step": 2491 + }, + { + "epoch": 0.22577065072138797, + "grad_norm": 0.7128796577453613, + "learning_rate": 0.00018497553313598743, + "loss": 2.8985, + "step": 2492 + }, + { + "epoch": 0.22586124889583475, + "grad_norm": 0.6935428380966187, + "learning_rate": 0.0001849694919349967, + "loss": 2.6696, + "step": 2493 + }, + { + "epoch": 0.22595184707028154, + "grad_norm": 0.7077720761299133, + "learning_rate": 0.00018496345073400593, + "loss": 2.8849, + "step": 2494 + }, + { + "epoch": 0.22604244524472833, + "grad_norm": 0.6903194785118103, + "learning_rate": 0.00018495740953301516, + "loss": 2.7611, + "step": 2495 + }, + { + "epoch": 0.22613304341917512, + "grad_norm": 0.7148558497428894, + "learning_rate": 0.00018495136833202442, + "loss": 2.6438, + "step": 2496 + }, + { + "epoch": 0.22622364159362188, + "grad_norm": 0.7048721313476562, + "learning_rate": 0.00018494532713103366, + "loss": 2.7456, + "step": 2497 + }, + { + "epoch": 0.22631423976806866, + "grad_norm": 0.6968353986740112, + "learning_rate": 0.0001849392859300429, + "loss": 2.8606, + "step": 2498 + }, + { + "epoch": 0.22640483794251545, + "grad_norm": 0.7540696263313293, + "learning_rate": 0.00018493324472905213, + "loss": 2.8544, + "step": 2499 + }, + { + "epoch": 0.22649543611696224, + "grad_norm": 0.7847883701324463, + "learning_rate": 0.0001849272035280614, + "loss": 2.9755, + "step": 2500 + }, + { + "epoch": 0.22658603429140903, + "grad_norm": 0.6827471852302551, + "learning_rate": 0.00018492116232707065, + "loss": 2.458, + "step": 2501 + }, + { + "epoch": 0.22667663246585582, + "grad_norm": 0.7244457602500916, + "learning_rate": 0.00018491512112607988, + "loss": 2.8034, + "step": 2502 + }, + { + "epoch": 0.2267672306403026, + "grad_norm": 0.6717630624771118, + "learning_rate": 0.00018490907992508912, + "loss": 2.8396, + "step": 2503 + }, + { + "epoch": 0.2268578288147494, + "grad_norm": 0.7409170269966125, + "learning_rate": 0.00018490303872409835, + "loss": 2.8412, + "step": 2504 + }, + { + "epoch": 0.22694842698919618, + "grad_norm": 0.7440128922462463, + "learning_rate": 0.0001848969975231076, + "loss": 2.8123, + "step": 2505 + }, + { + "epoch": 0.22703902516364297, + "grad_norm": 0.6312365531921387, + "learning_rate": 0.00018489095632211685, + "loss": 2.2728, + "step": 2506 + }, + { + "epoch": 0.22712962333808973, + "grad_norm": 0.6822347044944763, + "learning_rate": 0.00018488491512112608, + "loss": 2.7923, + "step": 2507 + }, + { + "epoch": 0.2272202215125365, + "grad_norm": 0.7417981028556824, + "learning_rate": 0.00018487887392013534, + "loss": 2.9454, + "step": 2508 + }, + { + "epoch": 0.2273108196869833, + "grad_norm": 0.6984794735908508, + "learning_rate": 0.00018487283271914458, + "loss": 2.8606, + "step": 2509 + }, + { + "epoch": 0.2274014178614301, + "grad_norm": 0.7372334003448486, + "learning_rate": 0.00018486679151815384, + "loss": 2.964, + "step": 2510 + }, + { + "epoch": 0.22749201603587688, + "grad_norm": 0.7236067056655884, + "learning_rate": 0.00018486075031716304, + "loss": 2.7432, + "step": 2511 + }, + { + "epoch": 0.22758261421032366, + "grad_norm": 0.8358088135719299, + "learning_rate": 0.0001848547091161723, + "loss": 3.0356, + "step": 2512 + }, + { + "epoch": 0.22767321238477045, + "grad_norm": 0.677582323551178, + "learning_rate": 0.00018484866791518154, + "loss": 2.9552, + "step": 2513 + }, + { + "epoch": 0.22776381055921724, + "grad_norm": 0.7312824130058289, + "learning_rate": 0.0001848426267141908, + "loss": 2.7706, + "step": 2514 + }, + { + "epoch": 0.22785440873366403, + "grad_norm": 0.6771348118782043, + "learning_rate": 0.00018483658551320003, + "loss": 2.658, + "step": 2515 + }, + { + "epoch": 0.22794500690811081, + "grad_norm": 0.7439092993736267, + "learning_rate": 0.00018483054431220927, + "loss": 2.9491, + "step": 2516 + }, + { + "epoch": 0.22803560508255757, + "grad_norm": 0.7628560662269592, + "learning_rate": 0.00018482450311121853, + "loss": 3.1405, + "step": 2517 + }, + { + "epoch": 0.22812620325700436, + "grad_norm": 0.7767862677574158, + "learning_rate": 0.00018481846191022776, + "loss": 2.7245, + "step": 2518 + }, + { + "epoch": 0.22821680143145115, + "grad_norm": 0.6862272024154663, + "learning_rate": 0.000184812420709237, + "loss": 2.927, + "step": 2519 + }, + { + "epoch": 0.22830739960589794, + "grad_norm": 0.7187618613243103, + "learning_rate": 0.00018480637950824623, + "loss": 2.8222, + "step": 2520 + }, + { + "epoch": 0.22839799778034472, + "grad_norm": 0.7411353588104248, + "learning_rate": 0.0001848003383072555, + "loss": 2.7734, + "step": 2521 + }, + { + "epoch": 0.2284885959547915, + "grad_norm": 0.7650907039642334, + "learning_rate": 0.00018479429710626473, + "loss": 2.8056, + "step": 2522 + }, + { + "epoch": 0.2285791941292383, + "grad_norm": 0.8124939203262329, + "learning_rate": 0.000184788255905274, + "loss": 3.0639, + "step": 2523 + }, + { + "epoch": 0.2286697923036851, + "grad_norm": 0.7390390634536743, + "learning_rate": 0.00018478221470428322, + "loss": 2.9977, + "step": 2524 + }, + { + "epoch": 0.22876039047813188, + "grad_norm": 0.7068085670471191, + "learning_rate": 0.00018477617350329246, + "loss": 2.7939, + "step": 2525 + }, + { + "epoch": 0.22885098865257866, + "grad_norm": 0.6959830522537231, + "learning_rate": 0.00018477013230230172, + "loss": 2.6886, + "step": 2526 + }, + { + "epoch": 0.22894158682702542, + "grad_norm": 0.7190982699394226, + "learning_rate": 0.00018476409110131095, + "loss": 2.6338, + "step": 2527 + }, + { + "epoch": 0.2290321850014722, + "grad_norm": 0.7398291826248169, + "learning_rate": 0.00018475804990032019, + "loss": 2.997, + "step": 2528 + }, + { + "epoch": 0.229122783175919, + "grad_norm": 0.6877185106277466, + "learning_rate": 0.00018475200869932942, + "loss": 2.7662, + "step": 2529 + }, + { + "epoch": 0.22921338135036579, + "grad_norm": 0.6061257719993591, + "learning_rate": 0.00018474596749833868, + "loss": 2.0569, + "step": 2530 + }, + { + "epoch": 0.22930397952481257, + "grad_norm": 0.7124534845352173, + "learning_rate": 0.00018473992629734794, + "loss": 2.98, + "step": 2531 + }, + { + "epoch": 0.22939457769925936, + "grad_norm": 0.7336921095848083, + "learning_rate": 0.00018473388509635715, + "loss": 2.7804, + "step": 2532 + }, + { + "epoch": 0.22948517587370615, + "grad_norm": 0.7038832306861877, + "learning_rate": 0.0001847278438953664, + "loss": 2.7461, + "step": 2533 + }, + { + "epoch": 0.22957577404815294, + "grad_norm": 0.6286860108375549, + "learning_rate": 0.00018472180269437564, + "loss": 2.207, + "step": 2534 + }, + { + "epoch": 0.22966637222259972, + "grad_norm": 0.8338354229927063, + "learning_rate": 0.0001847157614933849, + "loss": 2.992, + "step": 2535 + }, + { + "epoch": 0.2297569703970465, + "grad_norm": 0.7605085372924805, + "learning_rate": 0.00018470972029239414, + "loss": 2.8729, + "step": 2536 + }, + { + "epoch": 0.22984756857149327, + "grad_norm": 0.7245063781738281, + "learning_rate": 0.00018470367909140337, + "loss": 2.7154, + "step": 2537 + }, + { + "epoch": 0.22993816674594006, + "grad_norm": 0.7677305936813354, + "learning_rate": 0.00018469763789041263, + "loss": 2.8333, + "step": 2538 + }, + { + "epoch": 0.23002876492038685, + "grad_norm": 0.6997949481010437, + "learning_rate": 0.00018469159668942187, + "loss": 2.6814, + "step": 2539 + }, + { + "epoch": 0.23011936309483363, + "grad_norm": 0.6630221009254456, + "learning_rate": 0.00018468555548843113, + "loss": 2.6822, + "step": 2540 + }, + { + "epoch": 0.23020996126928042, + "grad_norm": 0.7088043093681335, + "learning_rate": 0.00018467951428744034, + "loss": 2.7794, + "step": 2541 + }, + { + "epoch": 0.2303005594437272, + "grad_norm": 0.7684569358825684, + "learning_rate": 0.0001846734730864496, + "loss": 2.9219, + "step": 2542 + }, + { + "epoch": 0.230391157618174, + "grad_norm": 0.7292912006378174, + "learning_rate": 0.00018466743188545883, + "loss": 3.0005, + "step": 2543 + }, + { + "epoch": 0.23048175579262078, + "grad_norm": 0.6942715048789978, + "learning_rate": 0.0001846613906844681, + "loss": 2.8003, + "step": 2544 + }, + { + "epoch": 0.23057235396706757, + "grad_norm": 0.6743761301040649, + "learning_rate": 0.0001846553494834773, + "loss": 2.002, + "step": 2545 + }, + { + "epoch": 0.23066295214151436, + "grad_norm": 0.6991512179374695, + "learning_rate": 0.00018464930828248656, + "loss": 2.7166, + "step": 2546 + }, + { + "epoch": 0.23075355031596112, + "grad_norm": 0.7480571269989014, + "learning_rate": 0.00018464326708149582, + "loss": 3.0304, + "step": 2547 + }, + { + "epoch": 0.2308441484904079, + "grad_norm": 0.7213049530982971, + "learning_rate": 0.00018463722588050506, + "loss": 3.0543, + "step": 2548 + }, + { + "epoch": 0.2309347466648547, + "grad_norm": 0.7210211157798767, + "learning_rate": 0.0001846311846795143, + "loss": 2.9336, + "step": 2549 + }, + { + "epoch": 0.23102534483930148, + "grad_norm": 0.6851895451545715, + "learning_rate": 0.00018462514347852352, + "loss": 2.7447, + "step": 2550 + }, + { + "epoch": 0.23111594301374827, + "grad_norm": 0.7333120107650757, + "learning_rate": 0.00018461910227753279, + "loss": 2.7302, + "step": 2551 + }, + { + "epoch": 0.23120654118819506, + "grad_norm": 0.7416210770606995, + "learning_rate": 0.00018461306107654202, + "loss": 3.0174, + "step": 2552 + }, + { + "epoch": 0.23129713936264185, + "grad_norm": 0.8178603053092957, + "learning_rate": 0.00018460701987555128, + "loss": 2.7605, + "step": 2553 + }, + { + "epoch": 0.23138773753708863, + "grad_norm": 0.7212764620780945, + "learning_rate": 0.00018460097867456052, + "loss": 3.0698, + "step": 2554 + }, + { + "epoch": 0.23147833571153542, + "grad_norm": 0.6999642848968506, + "learning_rate": 0.00018459493747356975, + "loss": 2.532, + "step": 2555 + }, + { + "epoch": 0.2315689338859822, + "grad_norm": 0.7102242112159729, + "learning_rate": 0.000184588896272579, + "loss": 2.6858, + "step": 2556 + }, + { + "epoch": 0.23165953206042897, + "grad_norm": 0.7703959345817566, + "learning_rate": 0.00018458285507158824, + "loss": 2.8995, + "step": 2557 + }, + { + "epoch": 0.23175013023487576, + "grad_norm": 0.6697355508804321, + "learning_rate": 0.00018457681387059748, + "loss": 1.921, + "step": 2558 + }, + { + "epoch": 0.23184072840932254, + "grad_norm": 0.7005141973495483, + "learning_rate": 0.0001845707726696067, + "loss": 2.687, + "step": 2559 + }, + { + "epoch": 0.23193132658376933, + "grad_norm": 0.7832239270210266, + "learning_rate": 0.00018456473146861597, + "loss": 2.9245, + "step": 2560 + }, + { + "epoch": 0.23202192475821612, + "grad_norm": 0.7035310864448547, + "learning_rate": 0.00018455869026762523, + "loss": 2.6752, + "step": 2561 + }, + { + "epoch": 0.2321125229326629, + "grad_norm": 0.6745244264602661, + "learning_rate": 0.00018455264906663444, + "loss": 2.6861, + "step": 2562 + }, + { + "epoch": 0.2322031211071097, + "grad_norm": 0.6804468035697937, + "learning_rate": 0.0001845466078656437, + "loss": 2.631, + "step": 2563 + }, + { + "epoch": 0.23229371928155648, + "grad_norm": 0.7089593410491943, + "learning_rate": 0.00018454056666465294, + "loss": 2.9847, + "step": 2564 + }, + { + "epoch": 0.23238431745600327, + "grad_norm": 0.7905791997909546, + "learning_rate": 0.0001845345254636622, + "loss": 3.0969, + "step": 2565 + }, + { + "epoch": 0.23247491563045006, + "grad_norm": 0.7902697920799255, + "learning_rate": 0.00018452848426267143, + "loss": 3.019, + "step": 2566 + }, + { + "epoch": 0.23256551380489685, + "grad_norm": 0.786371648311615, + "learning_rate": 0.00018452244306168067, + "loss": 2.8117, + "step": 2567 + }, + { + "epoch": 0.2326561119793436, + "grad_norm": 0.6947291493415833, + "learning_rate": 0.00018451640186068993, + "loss": 2.6421, + "step": 2568 + }, + { + "epoch": 0.2327467101537904, + "grad_norm": 0.6842586398124695, + "learning_rate": 0.00018451036065969916, + "loss": 2.728, + "step": 2569 + }, + { + "epoch": 0.23283730832823718, + "grad_norm": 0.7389134168624878, + "learning_rate": 0.0001845043194587084, + "loss": 3.085, + "step": 2570 + }, + { + "epoch": 0.23292790650268397, + "grad_norm": 0.7164199352264404, + "learning_rate": 0.00018449827825771763, + "loss": 2.7476, + "step": 2571 + }, + { + "epoch": 0.23301850467713076, + "grad_norm": 0.7218042016029358, + "learning_rate": 0.0001844922370567269, + "loss": 2.8846, + "step": 2572 + }, + { + "epoch": 0.23310910285157754, + "grad_norm": 0.6913018822669983, + "learning_rate": 0.00018448619585573612, + "loss": 2.7408, + "step": 2573 + }, + { + "epoch": 0.23319970102602433, + "grad_norm": 0.7478128671646118, + "learning_rate": 0.00018448015465474539, + "loss": 2.9837, + "step": 2574 + }, + { + "epoch": 0.23329029920047112, + "grad_norm": 0.7012351155281067, + "learning_rate": 0.0001844741134537546, + "loss": 3.115, + "step": 2575 + }, + { + "epoch": 0.2333808973749179, + "grad_norm": 0.6921268105506897, + "learning_rate": 0.00018446807225276385, + "loss": 2.9087, + "step": 2576 + }, + { + "epoch": 0.2334714955493647, + "grad_norm": 0.7357571125030518, + "learning_rate": 0.00018446203105177312, + "loss": 2.8255, + "step": 2577 + }, + { + "epoch": 0.23356209372381145, + "grad_norm": 0.6991948485374451, + "learning_rate": 0.00018445598985078235, + "loss": 2.7764, + "step": 2578 + }, + { + "epoch": 0.23365269189825824, + "grad_norm": 0.7018862366676331, + "learning_rate": 0.00018444994864979158, + "loss": 2.7525, + "step": 2579 + }, + { + "epoch": 0.23374329007270503, + "grad_norm": 0.7489219903945923, + "learning_rate": 0.00018444390744880082, + "loss": 2.9817, + "step": 2580 + }, + { + "epoch": 0.23383388824715182, + "grad_norm": 0.7038818001747131, + "learning_rate": 0.00018443786624781008, + "loss": 2.7999, + "step": 2581 + }, + { + "epoch": 0.2339244864215986, + "grad_norm": 0.7633290886878967, + "learning_rate": 0.0001844318250468193, + "loss": 3.0942, + "step": 2582 + }, + { + "epoch": 0.2340150845960454, + "grad_norm": 0.7029465436935425, + "learning_rate": 0.00018442578384582855, + "loss": 2.8222, + "step": 2583 + }, + { + "epoch": 0.23410568277049218, + "grad_norm": 0.706231951713562, + "learning_rate": 0.0001844197426448378, + "loss": 2.9788, + "step": 2584 + }, + { + "epoch": 0.23419628094493897, + "grad_norm": 0.7272340655326843, + "learning_rate": 0.00018441370144384704, + "loss": 2.7533, + "step": 2585 + }, + { + "epoch": 0.23428687911938575, + "grad_norm": 0.6924691200256348, + "learning_rate": 0.0001844076602428563, + "loss": 2.7875, + "step": 2586 + }, + { + "epoch": 0.23437747729383254, + "grad_norm": 0.7070139050483704, + "learning_rate": 0.00018440161904186554, + "loss": 2.9177, + "step": 2587 + }, + { + "epoch": 0.2344680754682793, + "grad_norm": 0.7410513162612915, + "learning_rate": 0.00018439557784087477, + "loss": 2.8171, + "step": 2588 + }, + { + "epoch": 0.2345586736427261, + "grad_norm": 0.7535166144371033, + "learning_rate": 0.000184389536639884, + "loss": 2.8988, + "step": 2589 + }, + { + "epoch": 0.23464927181717288, + "grad_norm": 0.7105211019515991, + "learning_rate": 0.00018438349543889327, + "loss": 2.6977, + "step": 2590 + }, + { + "epoch": 0.23473986999161967, + "grad_norm": 0.7746644020080566, + "learning_rate": 0.0001843774542379025, + "loss": 3.1393, + "step": 2591 + }, + { + "epoch": 0.23483046816606645, + "grad_norm": 0.7116461396217346, + "learning_rate": 0.00018437141303691173, + "loss": 2.1293, + "step": 2592 + }, + { + "epoch": 0.23492106634051324, + "grad_norm": 0.7052039504051208, + "learning_rate": 0.000184365371835921, + "loss": 2.8234, + "step": 2593 + }, + { + "epoch": 0.23501166451496003, + "grad_norm": 0.7079706788063049, + "learning_rate": 0.00018435933063493023, + "loss": 2.8631, + "step": 2594 + }, + { + "epoch": 0.23510226268940682, + "grad_norm": 0.6493161916732788, + "learning_rate": 0.0001843532894339395, + "loss": 2.0251, + "step": 2595 + }, + { + "epoch": 0.2351928608638536, + "grad_norm": 0.7219967842102051, + "learning_rate": 0.0001843472482329487, + "loss": 2.8048, + "step": 2596 + }, + { + "epoch": 0.2352834590383004, + "grad_norm": 0.7462196946144104, + "learning_rate": 0.00018434120703195796, + "loss": 2.8116, + "step": 2597 + }, + { + "epoch": 0.23537405721274715, + "grad_norm": 0.7187264561653137, + "learning_rate": 0.00018433516583096722, + "loss": 2.811, + "step": 2598 + }, + { + "epoch": 0.23546465538719394, + "grad_norm": 0.7235012650489807, + "learning_rate": 0.00018432912462997645, + "loss": 2.816, + "step": 2599 + }, + { + "epoch": 0.23555525356164073, + "grad_norm": 0.6288800239562988, + "learning_rate": 0.0001843230834289857, + "loss": 2.0375, + "step": 2600 + }, + { + "epoch": 0.2356458517360875, + "grad_norm": 0.7519357204437256, + "learning_rate": 0.00018431704222799492, + "loss": 3.0167, + "step": 2601 + }, + { + "epoch": 0.2357364499105343, + "grad_norm": 0.6661613583564758, + "learning_rate": 0.00018431100102700418, + "loss": 2.0417, + "step": 2602 + }, + { + "epoch": 0.2358270480849811, + "grad_norm": 0.7345326542854309, + "learning_rate": 0.00018430495982601342, + "loss": 2.7617, + "step": 2603 + }, + { + "epoch": 0.23591764625942788, + "grad_norm": 0.6287458539009094, + "learning_rate": 0.00018429891862502268, + "loss": 2.1278, + "step": 2604 + }, + { + "epoch": 0.23600824443387466, + "grad_norm": 0.834009051322937, + "learning_rate": 0.00018429287742403189, + "loss": 2.8131, + "step": 2605 + }, + { + "epoch": 0.23609884260832145, + "grad_norm": 0.6292681097984314, + "learning_rate": 0.00018428683622304115, + "loss": 2.075, + "step": 2606 + }, + { + "epoch": 0.23618944078276824, + "grad_norm": 0.6830093264579773, + "learning_rate": 0.0001842807950220504, + "loss": 2.7124, + "step": 2607 + }, + { + "epoch": 0.236280038957215, + "grad_norm": 0.7618962526321411, + "learning_rate": 0.00018427475382105964, + "loss": 3.1375, + "step": 2608 + }, + { + "epoch": 0.2363706371316618, + "grad_norm": 0.7286775708198547, + "learning_rate": 0.00018426871262006888, + "loss": 2.7802, + "step": 2609 + }, + { + "epoch": 0.23646123530610857, + "grad_norm": 0.7602386474609375, + "learning_rate": 0.0001842626714190781, + "loss": 3.0754, + "step": 2610 + }, + { + "epoch": 0.23655183348055536, + "grad_norm": 0.6769362688064575, + "learning_rate": 0.00018425663021808737, + "loss": 2.7167, + "step": 2611 + }, + { + "epoch": 0.23664243165500215, + "grad_norm": 0.7046197056770325, + "learning_rate": 0.0001842505890170966, + "loss": 2.6342, + "step": 2612 + }, + { + "epoch": 0.23673302982944894, + "grad_norm": 0.7281774282455444, + "learning_rate": 0.00018424454781610584, + "loss": 2.7967, + "step": 2613 + }, + { + "epoch": 0.23682362800389573, + "grad_norm": 0.6979345679283142, + "learning_rate": 0.0001842385066151151, + "loss": 2.6929, + "step": 2614 + }, + { + "epoch": 0.2369142261783425, + "grad_norm": 0.6923596858978271, + "learning_rate": 0.00018423246541412433, + "loss": 2.7369, + "step": 2615 + }, + { + "epoch": 0.2370048243527893, + "grad_norm": 0.6767503023147583, + "learning_rate": 0.0001842264242131336, + "loss": 2.6077, + "step": 2616 + }, + { + "epoch": 0.2370954225272361, + "grad_norm": 0.725975513458252, + "learning_rate": 0.00018422038301214283, + "loss": 2.8427, + "step": 2617 + }, + { + "epoch": 0.23718602070168285, + "grad_norm": 0.8079999089241028, + "learning_rate": 0.00018421434181115206, + "loss": 3.0081, + "step": 2618 + }, + { + "epoch": 0.23727661887612964, + "grad_norm": 0.6836512088775635, + "learning_rate": 0.0001842083006101613, + "loss": 2.8714, + "step": 2619 + }, + { + "epoch": 0.23736721705057642, + "grad_norm": 0.7433481216430664, + "learning_rate": 0.00018420225940917056, + "loss": 2.6944, + "step": 2620 + }, + { + "epoch": 0.2374578152250232, + "grad_norm": 0.6157035231590271, + "learning_rate": 0.0001841962182081798, + "loss": 2.1765, + "step": 2621 + }, + { + "epoch": 0.23754841339947, + "grad_norm": 0.7372806668281555, + "learning_rate": 0.00018419017700718903, + "loss": 2.6567, + "step": 2622 + }, + { + "epoch": 0.2376390115739168, + "grad_norm": 0.732394278049469, + "learning_rate": 0.0001841841358061983, + "loss": 2.8504, + "step": 2623 + }, + { + "epoch": 0.23772960974836357, + "grad_norm": 0.740371823310852, + "learning_rate": 0.00018417809460520752, + "loss": 3.0715, + "step": 2624 + }, + { + "epoch": 0.23782020792281036, + "grad_norm": 0.5732728838920593, + "learning_rate": 0.00018417205340421678, + "loss": 2.0912, + "step": 2625 + }, + { + "epoch": 0.23791080609725715, + "grad_norm": 0.7168335914611816, + "learning_rate": 0.000184166012203226, + "loss": 2.9698, + "step": 2626 + }, + { + "epoch": 0.23800140427170394, + "grad_norm": 0.7064235806465149, + "learning_rate": 0.00018415997100223525, + "loss": 2.9037, + "step": 2627 + }, + { + "epoch": 0.2380920024461507, + "grad_norm": 0.7225418090820312, + "learning_rate": 0.0001841539298012445, + "loss": 2.8333, + "step": 2628 + }, + { + "epoch": 0.23818260062059748, + "grad_norm": 0.7403895854949951, + "learning_rate": 0.00018414788860025375, + "loss": 2.6947, + "step": 2629 + }, + { + "epoch": 0.23827319879504427, + "grad_norm": 0.6792954802513123, + "learning_rate": 0.00018414184739926298, + "loss": 2.7746, + "step": 2630 + }, + { + "epoch": 0.23836379696949106, + "grad_norm": 0.7700604796409607, + "learning_rate": 0.00018413580619827221, + "loss": 2.8209, + "step": 2631 + }, + { + "epoch": 0.23845439514393785, + "grad_norm": 0.7334296703338623, + "learning_rate": 0.00018412976499728148, + "loss": 2.9851, + "step": 2632 + }, + { + "epoch": 0.23854499331838463, + "grad_norm": 0.7665122151374817, + "learning_rate": 0.0001841237237962907, + "loss": 2.7537, + "step": 2633 + }, + { + "epoch": 0.23863559149283142, + "grad_norm": 0.7318230271339417, + "learning_rate": 0.00018411768259529994, + "loss": 3.2429, + "step": 2634 + }, + { + "epoch": 0.2387261896672782, + "grad_norm": 0.7539876103401184, + "learning_rate": 0.00018411164139430918, + "loss": 2.5473, + "step": 2635 + }, + { + "epoch": 0.238816787841725, + "grad_norm": 0.7099336385726929, + "learning_rate": 0.00018410560019331844, + "loss": 2.7458, + "step": 2636 + }, + { + "epoch": 0.23890738601617179, + "grad_norm": 0.5990596413612366, + "learning_rate": 0.0001840995589923277, + "loss": 2.1917, + "step": 2637 + }, + { + "epoch": 0.23899798419061855, + "grad_norm": 0.6996987462043762, + "learning_rate": 0.00018409351779133693, + "loss": 2.8061, + "step": 2638 + }, + { + "epoch": 0.23908858236506533, + "grad_norm": 0.6988723874092102, + "learning_rate": 0.00018408747659034617, + "loss": 2.6599, + "step": 2639 + }, + { + "epoch": 0.23917918053951212, + "grad_norm": 0.6946936249732971, + "learning_rate": 0.0001840814353893554, + "loss": 2.7926, + "step": 2640 + }, + { + "epoch": 0.2392697787139589, + "grad_norm": 0.6939013600349426, + "learning_rate": 0.00018407539418836466, + "loss": 2.9333, + "step": 2641 + }, + { + "epoch": 0.2393603768884057, + "grad_norm": 0.7051838040351868, + "learning_rate": 0.0001840693529873739, + "loss": 2.7506, + "step": 2642 + }, + { + "epoch": 0.23945097506285248, + "grad_norm": 0.7652677893638611, + "learning_rate": 0.00018406331178638313, + "loss": 3.039, + "step": 2643 + }, + { + "epoch": 0.23954157323729927, + "grad_norm": 0.6239226460456848, + "learning_rate": 0.0001840572705853924, + "loss": 1.983, + "step": 2644 + }, + { + "epoch": 0.23963217141174606, + "grad_norm": 0.7338651418685913, + "learning_rate": 0.00018405122938440163, + "loss": 3.0873, + "step": 2645 + }, + { + "epoch": 0.23972276958619285, + "grad_norm": 0.7061101198196411, + "learning_rate": 0.0001840451881834109, + "loss": 2.775, + "step": 2646 + }, + { + "epoch": 0.23981336776063963, + "grad_norm": 0.7003567814826965, + "learning_rate": 0.0001840391469824201, + "loss": 2.8726, + "step": 2647 + }, + { + "epoch": 0.23990396593508642, + "grad_norm": 0.6685833930969238, + "learning_rate": 0.00018403310578142936, + "loss": 2.6612, + "step": 2648 + }, + { + "epoch": 0.23999456410953318, + "grad_norm": 0.7369577288627625, + "learning_rate": 0.0001840270645804386, + "loss": 2.8683, + "step": 2649 + }, + { + "epoch": 0.24008516228397997, + "grad_norm": 0.6633711457252502, + "learning_rate": 0.00018402102337944785, + "loss": 2.7239, + "step": 2650 + }, + { + "epoch": 0.24017576045842676, + "grad_norm": 0.8036003708839417, + "learning_rate": 0.00018401498217845709, + "loss": 2.9011, + "step": 2651 + }, + { + "epoch": 0.24026635863287354, + "grad_norm": 0.7825168967247009, + "learning_rate": 0.00018400894097746632, + "loss": 2.8835, + "step": 2652 + }, + { + "epoch": 0.24035695680732033, + "grad_norm": 0.7456740736961365, + "learning_rate": 0.00018400289977647558, + "loss": 2.2914, + "step": 2653 + }, + { + "epoch": 0.24044755498176712, + "grad_norm": 0.7074794173240662, + "learning_rate": 0.00018399685857548481, + "loss": 3.074, + "step": 2654 + }, + { + "epoch": 0.2405381531562139, + "grad_norm": 0.706536591053009, + "learning_rate": 0.00018399081737449405, + "loss": 2.813, + "step": 2655 + }, + { + "epoch": 0.2406287513306607, + "grad_norm": 0.7043260335922241, + "learning_rate": 0.00018398477617350328, + "loss": 2.8232, + "step": 2656 + }, + { + "epoch": 0.24071934950510748, + "grad_norm": 0.7898746728897095, + "learning_rate": 0.00018397873497251254, + "loss": 2.9967, + "step": 2657 + }, + { + "epoch": 0.24080994767955427, + "grad_norm": 0.7734653353691101, + "learning_rate": 0.0001839726937715218, + "loss": 2.9291, + "step": 2658 + }, + { + "epoch": 0.24090054585400103, + "grad_norm": 0.7122647166252136, + "learning_rate": 0.00018396665257053104, + "loss": 2.5961, + "step": 2659 + }, + { + "epoch": 0.24099114402844782, + "grad_norm": 0.6838452816009521, + "learning_rate": 0.00018396061136954027, + "loss": 2.7704, + "step": 2660 + }, + { + "epoch": 0.2410817422028946, + "grad_norm": 0.6138921976089478, + "learning_rate": 0.0001839545701685495, + "loss": 2.1018, + "step": 2661 + }, + { + "epoch": 0.2411723403773414, + "grad_norm": 0.845843493938446, + "learning_rate": 0.00018394852896755877, + "loss": 2.8842, + "step": 2662 + }, + { + "epoch": 0.24126293855178818, + "grad_norm": 0.6897029876708984, + "learning_rate": 0.000183942487766568, + "loss": 2.7432, + "step": 2663 + }, + { + "epoch": 0.24135353672623497, + "grad_norm": 0.7224735021591187, + "learning_rate": 0.00018393644656557724, + "loss": 2.1147, + "step": 2664 + }, + { + "epoch": 0.24144413490068176, + "grad_norm": 0.7464970350265503, + "learning_rate": 0.00018393040536458647, + "loss": 2.9051, + "step": 2665 + }, + { + "epoch": 0.24153473307512854, + "grad_norm": 0.7290517687797546, + "learning_rate": 0.00018392436416359573, + "loss": 2.9898, + "step": 2666 + }, + { + "epoch": 0.24162533124957533, + "grad_norm": 0.6948810815811157, + "learning_rate": 0.000183918322962605, + "loss": 2.185, + "step": 2667 + }, + { + "epoch": 0.24171592942402212, + "grad_norm": 0.6242077946662903, + "learning_rate": 0.0001839122817616142, + "loss": 2.2691, + "step": 2668 + }, + { + "epoch": 0.24180652759846888, + "grad_norm": 0.679047167301178, + "learning_rate": 0.00018390624056062346, + "loss": 2.5946, + "step": 2669 + }, + { + "epoch": 0.24189712577291567, + "grad_norm": 0.7447897791862488, + "learning_rate": 0.0001839001993596327, + "loss": 2.841, + "step": 2670 + }, + { + "epoch": 0.24198772394736245, + "grad_norm": 0.7376993298530579, + "learning_rate": 0.00018389415815864196, + "loss": 2.5215, + "step": 2671 + }, + { + "epoch": 0.24207832212180924, + "grad_norm": 0.6895816326141357, + "learning_rate": 0.0001838881169576512, + "loss": 2.627, + "step": 2672 + }, + { + "epoch": 0.24216892029625603, + "grad_norm": 0.6803039312362671, + "learning_rate": 0.00018388207575666042, + "loss": 2.6755, + "step": 2673 + }, + { + "epoch": 0.24225951847070282, + "grad_norm": 0.7240241169929504, + "learning_rate": 0.00018387603455566969, + "loss": 2.8287, + "step": 2674 + }, + { + "epoch": 0.2423501166451496, + "grad_norm": 0.7122670412063599, + "learning_rate": 0.00018386999335467892, + "loss": 2.8481, + "step": 2675 + }, + { + "epoch": 0.2424407148195964, + "grad_norm": 0.7475500106811523, + "learning_rate": 0.00018386395215368818, + "loss": 2.5975, + "step": 2676 + }, + { + "epoch": 0.24253131299404318, + "grad_norm": 0.756255030632019, + "learning_rate": 0.0001838579109526974, + "loss": 2.9467, + "step": 2677 + }, + { + "epoch": 0.24262191116848997, + "grad_norm": 0.6975583434104919, + "learning_rate": 0.00018385186975170665, + "loss": 2.9137, + "step": 2678 + }, + { + "epoch": 0.24271250934293673, + "grad_norm": 0.6992195844650269, + "learning_rate": 0.00018384582855071588, + "loss": 3.0237, + "step": 2679 + }, + { + "epoch": 0.24280310751738352, + "grad_norm": 0.7498393058776855, + "learning_rate": 0.00018383978734972514, + "loss": 2.6735, + "step": 2680 + }, + { + "epoch": 0.2428937056918303, + "grad_norm": 0.7545500993728638, + "learning_rate": 0.00018383374614873438, + "loss": 3.0351, + "step": 2681 + }, + { + "epoch": 0.2429843038662771, + "grad_norm": 0.7241052389144897, + "learning_rate": 0.0001838277049477436, + "loss": 2.9934, + "step": 2682 + }, + { + "epoch": 0.24307490204072388, + "grad_norm": 0.7581866383552551, + "learning_rate": 0.00018382166374675287, + "loss": 2.8309, + "step": 2683 + }, + { + "epoch": 0.24316550021517067, + "grad_norm": 0.6946201920509338, + "learning_rate": 0.0001838156225457621, + "loss": 2.8941, + "step": 2684 + }, + { + "epoch": 0.24325609838961745, + "grad_norm": 0.7209396362304688, + "learning_rate": 0.00018380958134477134, + "loss": 2.8115, + "step": 2685 + }, + { + "epoch": 0.24334669656406424, + "grad_norm": 0.7359237670898438, + "learning_rate": 0.00018380354014378058, + "loss": 2.6918, + "step": 2686 + }, + { + "epoch": 0.24343729473851103, + "grad_norm": 0.7258530259132385, + "learning_rate": 0.00018379749894278984, + "loss": 2.8419, + "step": 2687 + }, + { + "epoch": 0.24352789291295782, + "grad_norm": 0.6759199500083923, + "learning_rate": 0.0001837914577417991, + "loss": 2.756, + "step": 2688 + }, + { + "epoch": 0.24361849108740458, + "grad_norm": 0.731989324092865, + "learning_rate": 0.00018378541654080833, + "loss": 2.9721, + "step": 2689 + }, + { + "epoch": 0.24370908926185136, + "grad_norm": 0.7330361008644104, + "learning_rate": 0.00018377937533981757, + "loss": 2.166, + "step": 2690 + }, + { + "epoch": 0.24379968743629815, + "grad_norm": 0.6827707886695862, + "learning_rate": 0.0001837733341388268, + "loss": 2.8844, + "step": 2691 + }, + { + "epoch": 0.24389028561074494, + "grad_norm": 0.725470244884491, + "learning_rate": 0.00018376729293783606, + "loss": 2.8152, + "step": 2692 + }, + { + "epoch": 0.24398088378519173, + "grad_norm": 0.6822322010993958, + "learning_rate": 0.0001837612517368453, + "loss": 2.4567, + "step": 2693 + }, + { + "epoch": 0.24407148195963851, + "grad_norm": 0.7518590092658997, + "learning_rate": 0.00018375521053585453, + "loss": 3.1679, + "step": 2694 + }, + { + "epoch": 0.2441620801340853, + "grad_norm": 0.6859571933746338, + "learning_rate": 0.00018374916933486376, + "loss": 2.8498, + "step": 2695 + }, + { + "epoch": 0.2442526783085321, + "grad_norm": 0.7143861055374146, + "learning_rate": 0.00018374312813387302, + "loss": 2.9326, + "step": 2696 + }, + { + "epoch": 0.24434327648297888, + "grad_norm": 0.7225805521011353, + "learning_rate": 0.00018373708693288229, + "loss": 3.0288, + "step": 2697 + }, + { + "epoch": 0.24443387465742566, + "grad_norm": 0.7745459079742432, + "learning_rate": 0.0001837310457318915, + "loss": 2.811, + "step": 2698 + }, + { + "epoch": 0.24452447283187242, + "grad_norm": 0.7184926867485046, + "learning_rate": 0.00018372500453090075, + "loss": 3.0061, + "step": 2699 + }, + { + "epoch": 0.2446150710063192, + "grad_norm": 0.8034253120422363, + "learning_rate": 0.00018371896332991, + "loss": 2.8397, + "step": 2700 + }, + { + "epoch": 0.244705669180766, + "grad_norm": 0.6919952034950256, + "learning_rate": 0.00018371292212891925, + "loss": 2.7687, + "step": 2701 + }, + { + "epoch": 0.2447962673552128, + "grad_norm": 0.6123999357223511, + "learning_rate": 0.00018370688092792848, + "loss": 2.0303, + "step": 2702 + }, + { + "epoch": 0.24488686552965958, + "grad_norm": 0.7386684417724609, + "learning_rate": 0.00018370083972693772, + "loss": 2.7427, + "step": 2703 + }, + { + "epoch": 0.24497746370410636, + "grad_norm": 0.6821755766868591, + "learning_rate": 0.00018369479852594698, + "loss": 2.108, + "step": 2704 + }, + { + "epoch": 0.24506806187855315, + "grad_norm": 0.7267730832099915, + "learning_rate": 0.0001836887573249562, + "loss": 2.9205, + "step": 2705 + }, + { + "epoch": 0.24515866005299994, + "grad_norm": 0.8288866877555847, + "learning_rate": 0.00018368271612396545, + "loss": 2.8099, + "step": 2706 + }, + { + "epoch": 0.24524925822744673, + "grad_norm": 0.70851731300354, + "learning_rate": 0.00018367667492297468, + "loss": 2.9408, + "step": 2707 + }, + { + "epoch": 0.2453398564018935, + "grad_norm": 0.7990567088127136, + "learning_rate": 0.00018367063372198394, + "loss": 2.9773, + "step": 2708 + }, + { + "epoch": 0.24543045457634027, + "grad_norm": 0.6195018887519836, + "learning_rate": 0.00018366459252099318, + "loss": 2.0717, + "step": 2709 + }, + { + "epoch": 0.24552105275078706, + "grad_norm": 0.7577852606773376, + "learning_rate": 0.00018365855132000244, + "loss": 3.0188, + "step": 2710 + }, + { + "epoch": 0.24561165092523385, + "grad_norm": 0.69371497631073, + "learning_rate": 0.00018365251011901167, + "loss": 2.7543, + "step": 2711 + }, + { + "epoch": 0.24570224909968064, + "grad_norm": 0.7273867726325989, + "learning_rate": 0.0001836464689180209, + "loss": 2.8536, + "step": 2712 + }, + { + "epoch": 0.24579284727412742, + "grad_norm": 0.7666154503822327, + "learning_rate": 0.00018364042771703017, + "loss": 3.0546, + "step": 2713 + }, + { + "epoch": 0.2458834454485742, + "grad_norm": 0.7215895652770996, + "learning_rate": 0.0001836343865160394, + "loss": 2.8422, + "step": 2714 + }, + { + "epoch": 0.245974043623021, + "grad_norm": 0.6261933445930481, + "learning_rate": 0.00018362834531504863, + "loss": 2.1901, + "step": 2715 + }, + { + "epoch": 0.2460646417974678, + "grad_norm": 0.6914533972740173, + "learning_rate": 0.00018362230411405787, + "loss": 3.0505, + "step": 2716 + }, + { + "epoch": 0.24615523997191457, + "grad_norm": 0.6451778411865234, + "learning_rate": 0.00018361626291306713, + "loss": 2.1443, + "step": 2717 + }, + { + "epoch": 0.24624583814636136, + "grad_norm": 0.7584173679351807, + "learning_rate": 0.0001836102217120764, + "loss": 3.0227, + "step": 2718 + }, + { + "epoch": 0.24633643632080812, + "grad_norm": 0.7375680208206177, + "learning_rate": 0.0001836041805110856, + "loss": 3.085, + "step": 2719 + }, + { + "epoch": 0.2464270344952549, + "grad_norm": 0.7709214687347412, + "learning_rate": 0.00018359813931009486, + "loss": 3.0693, + "step": 2720 + }, + { + "epoch": 0.2465176326697017, + "grad_norm": 0.7045944929122925, + "learning_rate": 0.0001835920981091041, + "loss": 2.5773, + "step": 2721 + }, + { + "epoch": 0.24660823084414848, + "grad_norm": 0.6882445216178894, + "learning_rate": 0.00018358605690811335, + "loss": 2.7829, + "step": 2722 + }, + { + "epoch": 0.24669882901859527, + "grad_norm": 0.7938804626464844, + "learning_rate": 0.0001835800157071226, + "loss": 3.0115, + "step": 2723 + }, + { + "epoch": 0.24678942719304206, + "grad_norm": 0.7548543810844421, + "learning_rate": 0.00018357397450613182, + "loss": 2.9444, + "step": 2724 + }, + { + "epoch": 0.24688002536748885, + "grad_norm": 0.6531888842582703, + "learning_rate": 0.00018356793330514106, + "loss": 2.3664, + "step": 2725 + }, + { + "epoch": 0.24697062354193564, + "grad_norm": 0.6253365278244019, + "learning_rate": 0.00018356189210415032, + "loss": 2.3049, + "step": 2726 + }, + { + "epoch": 0.24706122171638242, + "grad_norm": 0.6126700639724731, + "learning_rate": 0.00018355585090315958, + "loss": 2.071, + "step": 2727 + }, + { + "epoch": 0.2471518198908292, + "grad_norm": 0.8654010891914368, + "learning_rate": 0.00018354980970216879, + "loss": 2.5546, + "step": 2728 + }, + { + "epoch": 0.247242418065276, + "grad_norm": 0.6912684440612793, + "learning_rate": 0.00018354376850117805, + "loss": 2.5729, + "step": 2729 + }, + { + "epoch": 0.24733301623972276, + "grad_norm": 0.6669019460678101, + "learning_rate": 0.00018353772730018728, + "loss": 2.9193, + "step": 2730 + }, + { + "epoch": 0.24742361441416955, + "grad_norm": 0.7746150493621826, + "learning_rate": 0.00018353168609919654, + "loss": 2.885, + "step": 2731 + }, + { + "epoch": 0.24751421258861633, + "grad_norm": 0.7277629971504211, + "learning_rate": 0.00018352564489820575, + "loss": 2.714, + "step": 2732 + }, + { + "epoch": 0.24760481076306312, + "grad_norm": 0.6949571371078491, + "learning_rate": 0.000183519603697215, + "loss": 2.8773, + "step": 2733 + }, + { + "epoch": 0.2476954089375099, + "grad_norm": 0.7376929521560669, + "learning_rate": 0.00018351356249622427, + "loss": 2.6845, + "step": 2734 + }, + { + "epoch": 0.2477860071119567, + "grad_norm": 0.6464071869850159, + "learning_rate": 0.0001835075212952335, + "loss": 2.1217, + "step": 2735 + }, + { + "epoch": 0.24787660528640348, + "grad_norm": 0.7567276358604431, + "learning_rate": 0.00018350148009424274, + "loss": 3.0158, + "step": 2736 + }, + { + "epoch": 0.24796720346085027, + "grad_norm": 0.7131013870239258, + "learning_rate": 0.00018349543889325197, + "loss": 2.637, + "step": 2737 + }, + { + "epoch": 0.24805780163529706, + "grad_norm": 0.7128570079803467, + "learning_rate": 0.00018348939769226123, + "loss": 2.7949, + "step": 2738 + }, + { + "epoch": 0.24814839980974385, + "grad_norm": 0.7101006507873535, + "learning_rate": 0.00018348335649127047, + "loss": 2.8103, + "step": 2739 + }, + { + "epoch": 0.2482389979841906, + "grad_norm": 0.6929042935371399, + "learning_rate": 0.00018347731529027973, + "loss": 2.7271, + "step": 2740 + }, + { + "epoch": 0.2483295961586374, + "grad_norm": 0.7589413523674011, + "learning_rate": 0.00018347127408928896, + "loss": 2.7565, + "step": 2741 + }, + { + "epoch": 0.24842019433308418, + "grad_norm": 0.7500877976417542, + "learning_rate": 0.0001834652328882982, + "loss": 2.9113, + "step": 2742 + }, + { + "epoch": 0.24851079250753097, + "grad_norm": 0.7492992281913757, + "learning_rate": 0.00018345919168730746, + "loss": 2.9054, + "step": 2743 + }, + { + "epoch": 0.24860139068197776, + "grad_norm": 0.6976815462112427, + "learning_rate": 0.0001834531504863167, + "loss": 2.712, + "step": 2744 + }, + { + "epoch": 0.24869198885642455, + "grad_norm": 0.7316451668739319, + "learning_rate": 0.00018344710928532593, + "loss": 2.7266, + "step": 2745 + }, + { + "epoch": 0.24878258703087133, + "grad_norm": 0.790882408618927, + "learning_rate": 0.00018344106808433516, + "loss": 2.7388, + "step": 2746 + }, + { + "epoch": 0.24887318520531812, + "grad_norm": 0.7314101457595825, + "learning_rate": 0.00018343502688334442, + "loss": 2.9648, + "step": 2747 + }, + { + "epoch": 0.2489637833797649, + "grad_norm": 0.828776478767395, + "learning_rate": 0.00018342898568235368, + "loss": 2.7811, + "step": 2748 + }, + { + "epoch": 0.2490543815542117, + "grad_norm": 0.7074742317199707, + "learning_rate": 0.0001834229444813629, + "loss": 2.7202, + "step": 2749 + }, + { + "epoch": 0.24914497972865846, + "grad_norm": 0.7352871894836426, + "learning_rate": 0.00018341690328037215, + "loss": 2.7837, + "step": 2750 + }, + { + "epoch": 0.24923557790310524, + "grad_norm": 0.7247578501701355, + "learning_rate": 0.00018341086207938139, + "loss": 2.7553, + "step": 2751 + }, + { + "epoch": 0.24932617607755203, + "grad_norm": 0.754871666431427, + "learning_rate": 0.00018340482087839065, + "loss": 2.6417, + "step": 2752 + }, + { + "epoch": 0.24941677425199882, + "grad_norm": 0.7111714482307434, + "learning_rate": 0.00018339877967739988, + "loss": 2.9751, + "step": 2753 + }, + { + "epoch": 0.2495073724264456, + "grad_norm": 0.7315115332603455, + "learning_rate": 0.00018339273847640911, + "loss": 3.0861, + "step": 2754 + }, + { + "epoch": 0.2495979706008924, + "grad_norm": 0.7285873889923096, + "learning_rate": 0.00018338669727541835, + "loss": 2.7117, + "step": 2755 + }, + { + "epoch": 0.24968856877533918, + "grad_norm": 0.759922444820404, + "learning_rate": 0.0001833806560744276, + "loss": 2.9869, + "step": 2756 + }, + { + "epoch": 0.24977916694978597, + "grad_norm": 0.7631885409355164, + "learning_rate": 0.00018337461487343684, + "loss": 3.0276, + "step": 2757 + }, + { + "epoch": 0.24986976512423276, + "grad_norm": 0.8238527178764343, + "learning_rate": 0.00018336857367244608, + "loss": 2.7446, + "step": 2758 + }, + { + "epoch": 0.24996036329867954, + "grad_norm": 0.7362590432167053, + "learning_rate": 0.00018336253247145534, + "loss": 2.6668, + "step": 2759 + }, + { + "epoch": 0.25005096147312633, + "grad_norm": 0.708091139793396, + "learning_rate": 0.00018335649127046457, + "loss": 2.9744, + "step": 2760 + }, + { + "epoch": 0.2501415596475731, + "grad_norm": 0.7234849333763123, + "learning_rate": 0.00018335045006947383, + "loss": 2.9349, + "step": 2761 + }, + { + "epoch": 0.2502321578220199, + "grad_norm": 0.6932078003883362, + "learning_rate": 0.00018334440886848304, + "loss": 2.9661, + "step": 2762 + }, + { + "epoch": 0.2503227559964667, + "grad_norm": 0.7747700810432434, + "learning_rate": 0.0001833383676674923, + "loss": 2.8895, + "step": 2763 + }, + { + "epoch": 0.2504133541709135, + "grad_norm": 0.6791675090789795, + "learning_rate": 0.00018333232646650156, + "loss": 2.1837, + "step": 2764 + }, + { + "epoch": 0.2505039523453602, + "grad_norm": 0.770524263381958, + "learning_rate": 0.0001833262852655108, + "loss": 2.778, + "step": 2765 + }, + { + "epoch": 0.250594550519807, + "grad_norm": 0.702750027179718, + "learning_rate": 0.00018332024406452003, + "loss": 2.7639, + "step": 2766 + }, + { + "epoch": 0.2506851486942538, + "grad_norm": 0.7295138835906982, + "learning_rate": 0.00018331420286352927, + "loss": 2.9634, + "step": 2767 + }, + { + "epoch": 0.2507757468687006, + "grad_norm": 0.6841800212860107, + "learning_rate": 0.00018330816166253853, + "loss": 2.777, + "step": 2768 + }, + { + "epoch": 0.25086634504314737, + "grad_norm": 0.712119996547699, + "learning_rate": 0.00018330212046154776, + "loss": 2.7966, + "step": 2769 + }, + { + "epoch": 0.25095694321759415, + "grad_norm": 0.7174539566040039, + "learning_rate": 0.000183296079260557, + "loss": 2.6603, + "step": 2770 + }, + { + "epoch": 0.25104754139204094, + "grad_norm": 0.7135065793991089, + "learning_rate": 0.00018329003805956626, + "loss": 2.8684, + "step": 2771 + }, + { + "epoch": 0.25113813956648773, + "grad_norm": 0.7084863185882568, + "learning_rate": 0.0001832839968585755, + "loss": 2.9302, + "step": 2772 + }, + { + "epoch": 0.2512287377409345, + "grad_norm": 0.7258697748184204, + "learning_rate": 0.00018327795565758475, + "loss": 2.7905, + "step": 2773 + }, + { + "epoch": 0.2513193359153813, + "grad_norm": 0.749137282371521, + "learning_rate": 0.00018327191445659399, + "loss": 2.8552, + "step": 2774 + }, + { + "epoch": 0.2514099340898281, + "grad_norm": 0.7040445804595947, + "learning_rate": 0.00018326587325560322, + "loss": 2.6267, + "step": 2775 + }, + { + "epoch": 0.2515005322642749, + "grad_norm": 0.7918089628219604, + "learning_rate": 0.00018325983205461245, + "loss": 2.9345, + "step": 2776 + }, + { + "epoch": 0.25159113043872167, + "grad_norm": 0.626758337020874, + "learning_rate": 0.00018325379085362171, + "loss": 2.2133, + "step": 2777 + }, + { + "epoch": 0.25168172861316845, + "grad_norm": 0.6465883851051331, + "learning_rate": 0.00018324774965263095, + "loss": 1.9787, + "step": 2778 + }, + { + "epoch": 0.25177232678761524, + "grad_norm": 0.7743313908576965, + "learning_rate": 0.00018324170845164018, + "loss": 2.8729, + "step": 2779 + }, + { + "epoch": 0.25186292496206203, + "grad_norm": 0.6860612630844116, + "learning_rate": 0.00018323566725064944, + "loss": 2.7794, + "step": 2780 + }, + { + "epoch": 0.2519535231365088, + "grad_norm": 0.7296053767204285, + "learning_rate": 0.00018322962604965868, + "loss": 2.9547, + "step": 2781 + }, + { + "epoch": 0.2520441213109556, + "grad_norm": 0.7249394655227661, + "learning_rate": 0.00018322358484866794, + "loss": 2.8082, + "step": 2782 + }, + { + "epoch": 0.2521347194854024, + "grad_norm": 0.7285077571868896, + "learning_rate": 0.00018321754364767715, + "loss": 2.6822, + "step": 2783 + }, + { + "epoch": 0.2522253176598492, + "grad_norm": 0.7677240967750549, + "learning_rate": 0.0001832115024466864, + "loss": 2.8717, + "step": 2784 + }, + { + "epoch": 0.2523159158342959, + "grad_norm": 0.7319473028182983, + "learning_rate": 0.00018320546124569564, + "loss": 3.1002, + "step": 2785 + }, + { + "epoch": 0.2524065140087427, + "grad_norm": 0.7966619729995728, + "learning_rate": 0.0001831994200447049, + "loss": 3.1805, + "step": 2786 + }, + { + "epoch": 0.2524971121831895, + "grad_norm": 0.737023115158081, + "learning_rate": 0.00018319337884371414, + "loss": 2.8576, + "step": 2787 + }, + { + "epoch": 0.2525877103576363, + "grad_norm": 0.8097476363182068, + "learning_rate": 0.00018318733764272337, + "loss": 2.7172, + "step": 2788 + }, + { + "epoch": 0.25267830853208306, + "grad_norm": 0.7374716997146606, + "learning_rate": 0.00018318129644173263, + "loss": 2.7362, + "step": 2789 + }, + { + "epoch": 0.25276890670652985, + "grad_norm": 0.6950068473815918, + "learning_rate": 0.00018317525524074187, + "loss": 2.8816, + "step": 2790 + }, + { + "epoch": 0.25285950488097664, + "grad_norm": 0.6423763036727905, + "learning_rate": 0.00018316921403975113, + "loss": 2.1117, + "step": 2791 + }, + { + "epoch": 0.2529501030554234, + "grad_norm": 0.7019621133804321, + "learning_rate": 0.00018316317283876033, + "loss": 2.2014, + "step": 2792 + }, + { + "epoch": 0.2530407012298702, + "grad_norm": 0.6985130310058594, + "learning_rate": 0.0001831571316377696, + "loss": 2.8217, + "step": 2793 + }, + { + "epoch": 0.253131299404317, + "grad_norm": 0.7162309288978577, + "learning_rate": 0.00018315109043677886, + "loss": 2.9074, + "step": 2794 + }, + { + "epoch": 0.2532218975787638, + "grad_norm": 0.7165883183479309, + "learning_rate": 0.0001831450492357881, + "loss": 2.7309, + "step": 2795 + }, + { + "epoch": 0.2533124957532106, + "grad_norm": 0.7748153209686279, + "learning_rate": 0.00018313900803479732, + "loss": 3.0174, + "step": 2796 + }, + { + "epoch": 0.25340309392765736, + "grad_norm": 0.7622590661048889, + "learning_rate": 0.00018313296683380656, + "loss": 2.6679, + "step": 2797 + }, + { + "epoch": 0.25349369210210415, + "grad_norm": 0.7363407611846924, + "learning_rate": 0.00018312692563281582, + "loss": 2.6408, + "step": 2798 + }, + { + "epoch": 0.25358429027655094, + "grad_norm": 0.812126100063324, + "learning_rate": 0.00018312088443182505, + "loss": 2.9377, + "step": 2799 + }, + { + "epoch": 0.2536748884509977, + "grad_norm": 0.7586735486984253, + "learning_rate": 0.0001831148432308343, + "loss": 2.8168, + "step": 2800 + }, + { + "epoch": 0.2537654866254445, + "grad_norm": 0.7167832255363464, + "learning_rate": 0.00018310880202984355, + "loss": 2.623, + "step": 2801 + }, + { + "epoch": 0.2538560847998913, + "grad_norm": 0.810723066329956, + "learning_rate": 0.00018310276082885278, + "loss": 2.8695, + "step": 2802 + }, + { + "epoch": 0.2539466829743381, + "grad_norm": 0.6988620758056641, + "learning_rate": 0.00018309671962786204, + "loss": 2.7627, + "step": 2803 + }, + { + "epoch": 0.2540372811487849, + "grad_norm": 0.6933172941207886, + "learning_rate": 0.00018309067842687128, + "loss": 2.7627, + "step": 2804 + }, + { + "epoch": 0.25412787932323166, + "grad_norm": 0.773481011390686, + "learning_rate": 0.0001830846372258805, + "loss": 2.8377, + "step": 2805 + }, + { + "epoch": 0.2542184774976784, + "grad_norm": 0.6963708996772766, + "learning_rate": 0.00018307859602488975, + "loss": 2.8745, + "step": 2806 + }, + { + "epoch": 0.2543090756721252, + "grad_norm": 0.8172209858894348, + "learning_rate": 0.000183072554823899, + "loss": 2.9925, + "step": 2807 + }, + { + "epoch": 0.25439967384657197, + "grad_norm": 0.7077677845954895, + "learning_rate": 0.00018306651362290824, + "loss": 2.8409, + "step": 2808 + }, + { + "epoch": 0.25449027202101876, + "grad_norm": 0.7807940244674683, + "learning_rate": 0.00018306047242191748, + "loss": 2.6516, + "step": 2809 + }, + { + "epoch": 0.25458087019546555, + "grad_norm": 0.765400767326355, + "learning_rate": 0.00018305443122092674, + "loss": 2.8468, + "step": 2810 + }, + { + "epoch": 0.25467146836991233, + "grad_norm": 0.7514704465866089, + "learning_rate": 0.00018304839001993597, + "loss": 3.3217, + "step": 2811 + }, + { + "epoch": 0.2547620665443591, + "grad_norm": 0.6895456314086914, + "learning_rate": 0.00018304234881894523, + "loss": 2.7706, + "step": 2812 + }, + { + "epoch": 0.2548526647188059, + "grad_norm": 0.7266420722007751, + "learning_rate": 0.00018303630761795444, + "loss": 2.4063, + "step": 2813 + }, + { + "epoch": 0.2549432628932527, + "grad_norm": 0.8219252228736877, + "learning_rate": 0.0001830302664169637, + "loss": 2.9372, + "step": 2814 + }, + { + "epoch": 0.2550338610676995, + "grad_norm": 0.6765227317810059, + "learning_rate": 0.00018302422521597293, + "loss": 1.9595, + "step": 2815 + }, + { + "epoch": 0.2551244592421463, + "grad_norm": 0.7336142063140869, + "learning_rate": 0.0001830181840149822, + "loss": 2.8593, + "step": 2816 + }, + { + "epoch": 0.25521505741659306, + "grad_norm": 0.736006498336792, + "learning_rate": 0.00018301214281399143, + "loss": 2.5788, + "step": 2817 + }, + { + "epoch": 0.25530565559103985, + "grad_norm": 0.6978636384010315, + "learning_rate": 0.00018300610161300066, + "loss": 2.9139, + "step": 2818 + }, + { + "epoch": 0.25539625376548664, + "grad_norm": 0.7070520520210266, + "learning_rate": 0.00018300006041200992, + "loss": 2.6525, + "step": 2819 + }, + { + "epoch": 0.2554868519399334, + "grad_norm": 0.7722323536872864, + "learning_rate": 0.00018299401921101916, + "loss": 2.8753, + "step": 2820 + }, + { + "epoch": 0.2555774501143802, + "grad_norm": 0.7480814456939697, + "learning_rate": 0.0001829879780100284, + "loss": 2.897, + "step": 2821 + }, + { + "epoch": 0.255668048288827, + "grad_norm": 0.703403651714325, + "learning_rate": 0.00018298193680903763, + "loss": 2.0682, + "step": 2822 + }, + { + "epoch": 0.2557586464632738, + "grad_norm": 0.7193097472190857, + "learning_rate": 0.0001829758956080469, + "loss": 2.9319, + "step": 2823 + }, + { + "epoch": 0.2558492446377206, + "grad_norm": 0.7358028888702393, + "learning_rate": 0.00018296985440705615, + "loss": 2.8176, + "step": 2824 + }, + { + "epoch": 0.25593984281216736, + "grad_norm": 0.7319021224975586, + "learning_rate": 0.00018296381320606538, + "loss": 2.7098, + "step": 2825 + }, + { + "epoch": 0.2560304409866141, + "grad_norm": 0.7771098613739014, + "learning_rate": 0.00018295777200507462, + "loss": 2.9939, + "step": 2826 + }, + { + "epoch": 0.2561210391610609, + "grad_norm": 0.6641890406608582, + "learning_rate": 0.00018295173080408385, + "loss": 1.986, + "step": 2827 + }, + { + "epoch": 0.25621163733550767, + "grad_norm": 0.6998568177223206, + "learning_rate": 0.0001829456896030931, + "loss": 2.8049, + "step": 2828 + }, + { + "epoch": 0.25630223550995446, + "grad_norm": 0.7087244391441345, + "learning_rate": 0.00018293964840210235, + "loss": 2.7735, + "step": 2829 + }, + { + "epoch": 0.25639283368440124, + "grad_norm": 0.728973388671875, + "learning_rate": 0.00018293360720111158, + "loss": 2.8402, + "step": 2830 + }, + { + "epoch": 0.25648343185884803, + "grad_norm": 0.7306018471717834, + "learning_rate": 0.00018292756600012084, + "loss": 2.9283, + "step": 2831 + }, + { + "epoch": 0.2565740300332948, + "grad_norm": 0.6563524603843689, + "learning_rate": 0.00018292152479913008, + "loss": 2.3316, + "step": 2832 + }, + { + "epoch": 0.2566646282077416, + "grad_norm": 0.7284501791000366, + "learning_rate": 0.00018291548359813934, + "loss": 3.1454, + "step": 2833 + }, + { + "epoch": 0.2567552263821884, + "grad_norm": 0.8029441237449646, + "learning_rate": 0.00018290944239714854, + "loss": 2.0915, + "step": 2834 + }, + { + "epoch": 0.2568458245566352, + "grad_norm": 0.7150692343711853, + "learning_rate": 0.0001829034011961578, + "loss": 2.6647, + "step": 2835 + }, + { + "epoch": 0.25693642273108197, + "grad_norm": 0.7560036778450012, + "learning_rate": 0.00018289735999516704, + "loss": 2.723, + "step": 2836 + }, + { + "epoch": 0.25702702090552876, + "grad_norm": 0.7575172185897827, + "learning_rate": 0.0001828913187941763, + "loss": 2.8057, + "step": 2837 + }, + { + "epoch": 0.25711761907997555, + "grad_norm": 0.7880700826644897, + "learning_rate": 0.00018288527759318553, + "loss": 2.7545, + "step": 2838 + }, + { + "epoch": 0.25720821725442233, + "grad_norm": 0.9342020153999329, + "learning_rate": 0.00018287923639219477, + "loss": 3.0112, + "step": 2839 + }, + { + "epoch": 0.2572988154288691, + "grad_norm": 0.7365196943283081, + "learning_rate": 0.00018287319519120403, + "loss": 2.8134, + "step": 2840 + }, + { + "epoch": 0.2573894136033159, + "grad_norm": 0.7772647738456726, + "learning_rate": 0.00018286715399021326, + "loss": 3.1701, + "step": 2841 + }, + { + "epoch": 0.2574800117777627, + "grad_norm": 0.7109144330024719, + "learning_rate": 0.0001828611127892225, + "loss": 2.9663, + "step": 2842 + }, + { + "epoch": 0.2575706099522095, + "grad_norm": 0.7260454893112183, + "learning_rate": 0.00018285507158823173, + "loss": 3.1335, + "step": 2843 + }, + { + "epoch": 0.25766120812665627, + "grad_norm": 0.6220464706420898, + "learning_rate": 0.000182849030387241, + "loss": 2.092, + "step": 2844 + }, + { + "epoch": 0.25775180630110306, + "grad_norm": 0.7076106667518616, + "learning_rate": 0.00018284298918625023, + "loss": 2.7565, + "step": 2845 + }, + { + "epoch": 0.2578424044755498, + "grad_norm": 0.6979650855064392, + "learning_rate": 0.0001828369479852595, + "loss": 2.9117, + "step": 2846 + }, + { + "epoch": 0.2579330026499966, + "grad_norm": 0.7038710117340088, + "learning_rate": 0.00018283090678426872, + "loss": 2.824, + "step": 2847 + }, + { + "epoch": 0.25802360082444337, + "grad_norm": 0.6948887705802917, + "learning_rate": 0.00018282486558327796, + "loss": 2.7133, + "step": 2848 + }, + { + "epoch": 0.25811419899889015, + "grad_norm": 0.7798718214035034, + "learning_rate": 0.00018281882438228722, + "loss": 2.8333, + "step": 2849 + }, + { + "epoch": 0.25820479717333694, + "grad_norm": 0.6877263188362122, + "learning_rate": 0.00018281278318129645, + "loss": 2.5928, + "step": 2850 + }, + { + "epoch": 0.25829539534778373, + "grad_norm": 0.7285712361335754, + "learning_rate": 0.00018280674198030569, + "loss": 2.8183, + "step": 2851 + }, + { + "epoch": 0.2583859935222305, + "grad_norm": 0.7579609155654907, + "learning_rate": 0.00018280070077931492, + "loss": 2.875, + "step": 2852 + }, + { + "epoch": 0.2584765916966773, + "grad_norm": 0.7501285076141357, + "learning_rate": 0.00018279465957832418, + "loss": 2.7449, + "step": 2853 + }, + { + "epoch": 0.2585671898711241, + "grad_norm": 0.7441195845603943, + "learning_rate": 0.00018278861837733344, + "loss": 3.0746, + "step": 2854 + }, + { + "epoch": 0.2586577880455709, + "grad_norm": 0.7374454736709595, + "learning_rate": 0.00018278257717634265, + "loss": 3.0828, + "step": 2855 + }, + { + "epoch": 0.25874838622001767, + "grad_norm": 0.6033118963241577, + "learning_rate": 0.0001827765359753519, + "loss": 2.0752, + "step": 2856 + }, + { + "epoch": 0.25883898439446446, + "grad_norm": 0.8345385789871216, + "learning_rate": 0.00018277049477436114, + "loss": 3.0496, + "step": 2857 + }, + { + "epoch": 0.25892958256891124, + "grad_norm": 0.6673837900161743, + "learning_rate": 0.0001827644535733704, + "loss": 2.6179, + "step": 2858 + }, + { + "epoch": 0.25902018074335803, + "grad_norm": 0.6864690184593201, + "learning_rate": 0.00018275841237237964, + "loss": 2.9458, + "step": 2859 + }, + { + "epoch": 0.2591107789178048, + "grad_norm": 0.7191278338432312, + "learning_rate": 0.00018275237117138887, + "loss": 2.8834, + "step": 2860 + }, + { + "epoch": 0.2592013770922516, + "grad_norm": 0.7405197024345398, + "learning_rate": 0.00018274632997039813, + "loss": 2.646, + "step": 2861 + }, + { + "epoch": 0.2592919752666984, + "grad_norm": 0.7563450932502747, + "learning_rate": 0.00018274028876940737, + "loss": 2.85, + "step": 2862 + }, + { + "epoch": 0.2593825734411452, + "grad_norm": 0.8223953247070312, + "learning_rate": 0.00018273424756841663, + "loss": 3.1462, + "step": 2863 + }, + { + "epoch": 0.25947317161559197, + "grad_norm": 0.7165445685386658, + "learning_rate": 0.00018272820636742584, + "loss": 3.0371, + "step": 2864 + }, + { + "epoch": 0.25956376979003876, + "grad_norm": 0.6508682370185852, + "learning_rate": 0.0001827221651664351, + "loss": 2.024, + "step": 2865 + }, + { + "epoch": 0.2596543679644855, + "grad_norm": 0.7795512080192566, + "learning_rate": 0.00018271612396544433, + "loss": 2.9106, + "step": 2866 + }, + { + "epoch": 0.2597449661389323, + "grad_norm": 0.6810961365699768, + "learning_rate": 0.0001827100827644536, + "loss": 2.5599, + "step": 2867 + }, + { + "epoch": 0.25983556431337906, + "grad_norm": 0.6853575706481934, + "learning_rate": 0.00018270404156346283, + "loss": 2.741, + "step": 2868 + }, + { + "epoch": 0.25992616248782585, + "grad_norm": 0.6654266119003296, + "learning_rate": 0.00018269800036247206, + "loss": 2.615, + "step": 2869 + }, + { + "epoch": 0.26001676066227264, + "grad_norm": 0.7839505672454834, + "learning_rate": 0.00018269195916148132, + "loss": 3.0858, + "step": 2870 + }, + { + "epoch": 0.2601073588367194, + "grad_norm": 0.6976786255836487, + "learning_rate": 0.00018268591796049056, + "loss": 2.9016, + "step": 2871 + }, + { + "epoch": 0.2601979570111662, + "grad_norm": 0.7121533155441284, + "learning_rate": 0.0001826798767594998, + "loss": 2.9684, + "step": 2872 + }, + { + "epoch": 0.260288555185613, + "grad_norm": 0.6855299472808838, + "learning_rate": 0.00018267383555850902, + "loss": 2.8376, + "step": 2873 + }, + { + "epoch": 0.2603791533600598, + "grad_norm": 0.7679256200790405, + "learning_rate": 0.00018266779435751829, + "loss": 2.9426, + "step": 2874 + }, + { + "epoch": 0.2604697515345066, + "grad_norm": 0.7013801336288452, + "learning_rate": 0.00018266175315652755, + "loss": 2.7471, + "step": 2875 + }, + { + "epoch": 0.26056034970895336, + "grad_norm": 0.7464705109596252, + "learning_rate": 0.00018265571195553678, + "loss": 3.3315, + "step": 2876 + }, + { + "epoch": 0.26065094788340015, + "grad_norm": 0.7009716629981995, + "learning_rate": 0.00018264967075454601, + "loss": 2.6934, + "step": 2877 + }, + { + "epoch": 0.26074154605784694, + "grad_norm": 0.6971480250358582, + "learning_rate": 0.00018264362955355525, + "loss": 2.8323, + "step": 2878 + }, + { + "epoch": 0.26083214423229373, + "grad_norm": 0.7464861273765564, + "learning_rate": 0.0001826375883525645, + "loss": 3.0252, + "step": 2879 + }, + { + "epoch": 0.2609227424067405, + "grad_norm": 0.7142953276634216, + "learning_rate": 0.00018263154715157374, + "loss": 2.8877, + "step": 2880 + }, + { + "epoch": 0.2610133405811873, + "grad_norm": 0.7425421476364136, + "learning_rate": 0.00018262550595058298, + "loss": 3.1059, + "step": 2881 + }, + { + "epoch": 0.2611039387556341, + "grad_norm": 0.8174402117729187, + "learning_rate": 0.0001826194647495922, + "loss": 2.942, + "step": 2882 + }, + { + "epoch": 0.2611945369300809, + "grad_norm": 0.7180632948875427, + "learning_rate": 0.00018261342354860147, + "loss": 2.7458, + "step": 2883 + }, + { + "epoch": 0.26128513510452767, + "grad_norm": 0.7014853954315186, + "learning_rate": 0.00018260738234761073, + "loss": 2.7781, + "step": 2884 + }, + { + "epoch": 0.26137573327897445, + "grad_norm": 0.7589818835258484, + "learning_rate": 0.00018260134114661994, + "loss": 2.8255, + "step": 2885 + }, + { + "epoch": 0.26146633145342124, + "grad_norm": 0.6871463060379028, + "learning_rate": 0.0001825952999456292, + "loss": 2.7257, + "step": 2886 + }, + { + "epoch": 0.261556929627868, + "grad_norm": 0.6902428865432739, + "learning_rate": 0.00018258925874463844, + "loss": 2.2307, + "step": 2887 + }, + { + "epoch": 0.26164752780231476, + "grad_norm": 0.6174076795578003, + "learning_rate": 0.0001825832175436477, + "loss": 2.1088, + "step": 2888 + }, + { + "epoch": 0.26173812597676155, + "grad_norm": 0.7430701851844788, + "learning_rate": 0.00018257717634265693, + "loss": 2.9613, + "step": 2889 + }, + { + "epoch": 0.26182872415120834, + "grad_norm": 0.7794860601425171, + "learning_rate": 0.00018257113514166617, + "loss": 2.7797, + "step": 2890 + }, + { + "epoch": 0.2619193223256551, + "grad_norm": 0.5882165431976318, + "learning_rate": 0.00018256509394067543, + "loss": 1.9783, + "step": 2891 + }, + { + "epoch": 0.2620099205001019, + "grad_norm": 0.7479388117790222, + "learning_rate": 0.00018255905273968466, + "loss": 2.8407, + "step": 2892 + }, + { + "epoch": 0.2621005186745487, + "grad_norm": 0.7041124701499939, + "learning_rate": 0.0001825530115386939, + "loss": 2.8776, + "step": 2893 + }, + { + "epoch": 0.2621911168489955, + "grad_norm": 0.71164870262146, + "learning_rate": 0.00018254697033770313, + "loss": 2.8616, + "step": 2894 + }, + { + "epoch": 0.2622817150234423, + "grad_norm": 0.719892680644989, + "learning_rate": 0.0001825409291367124, + "loss": 2.9598, + "step": 2895 + }, + { + "epoch": 0.26237231319788906, + "grad_norm": 0.7181551456451416, + "learning_rate": 0.00018253488793572162, + "loss": 2.8786, + "step": 2896 + }, + { + "epoch": 0.26246291137233585, + "grad_norm": 0.7007085084915161, + "learning_rate": 0.00018252884673473089, + "loss": 2.952, + "step": 2897 + }, + { + "epoch": 0.26255350954678264, + "grad_norm": 0.7084453105926514, + "learning_rate": 0.00018252280553374012, + "loss": 3.1772, + "step": 2898 + }, + { + "epoch": 0.2626441077212294, + "grad_norm": 0.7086040377616882, + "learning_rate": 0.00018251676433274935, + "loss": 2.6809, + "step": 2899 + }, + { + "epoch": 0.2627347058956762, + "grad_norm": 0.7082032561302185, + "learning_rate": 0.00018251072313175861, + "loss": 2.7981, + "step": 2900 + }, + { + "epoch": 0.262825304070123, + "grad_norm": 0.7151533961296082, + "learning_rate": 0.00018250468193076785, + "loss": 2.9577, + "step": 2901 + }, + { + "epoch": 0.2629159022445698, + "grad_norm": 0.742743730545044, + "learning_rate": 0.00018249864072977708, + "loss": 2.7808, + "step": 2902 + }, + { + "epoch": 0.2630065004190166, + "grad_norm": 0.7038187980651855, + "learning_rate": 0.00018249259952878632, + "loss": 2.6772, + "step": 2903 + }, + { + "epoch": 0.26309709859346336, + "grad_norm": 0.7202609777450562, + "learning_rate": 0.00018248655832779558, + "loss": 2.7968, + "step": 2904 + }, + { + "epoch": 0.26318769676791015, + "grad_norm": 0.7999427318572998, + "learning_rate": 0.00018248051712680484, + "loss": 2.6933, + "step": 2905 + }, + { + "epoch": 0.26327829494235694, + "grad_norm": 0.6762344241142273, + "learning_rate": 0.00018247447592581405, + "loss": 2.6976, + "step": 2906 + }, + { + "epoch": 0.26336889311680367, + "grad_norm": 0.6862350702285767, + "learning_rate": 0.0001824684347248233, + "loss": 2.8426, + "step": 2907 + }, + { + "epoch": 0.26345949129125046, + "grad_norm": 0.6825127601623535, + "learning_rate": 0.00018246239352383254, + "loss": 2.2561, + "step": 2908 + }, + { + "epoch": 0.26355008946569725, + "grad_norm": 0.7423912286758423, + "learning_rate": 0.0001824563523228418, + "loss": 2.8102, + "step": 2909 + }, + { + "epoch": 0.26364068764014403, + "grad_norm": 0.623574435710907, + "learning_rate": 0.00018245031112185104, + "loss": 2.3284, + "step": 2910 + }, + { + "epoch": 0.2637312858145908, + "grad_norm": 0.6083977818489075, + "learning_rate": 0.00018244426992086027, + "loss": 2.0091, + "step": 2911 + }, + { + "epoch": 0.2638218839890376, + "grad_norm": 0.7221079468727112, + "learning_rate": 0.0001824382287198695, + "loss": 2.9633, + "step": 2912 + }, + { + "epoch": 0.2639124821634844, + "grad_norm": 0.7102380990982056, + "learning_rate": 0.00018243218751887877, + "loss": 2.8695, + "step": 2913 + }, + { + "epoch": 0.2640030803379312, + "grad_norm": 0.7226359248161316, + "learning_rate": 0.00018242614631788803, + "loss": 2.8571, + "step": 2914 + }, + { + "epoch": 0.26409367851237797, + "grad_norm": 0.7782860994338989, + "learning_rate": 0.00018242010511689723, + "loss": 2.9812, + "step": 2915 + }, + { + "epoch": 0.26418427668682476, + "grad_norm": 0.7029539942741394, + "learning_rate": 0.0001824140639159065, + "loss": 2.5843, + "step": 2916 + }, + { + "epoch": 0.26427487486127155, + "grad_norm": 0.6666748523712158, + "learning_rate": 0.00018240802271491573, + "loss": 2.2287, + "step": 2917 + }, + { + "epoch": 0.26436547303571833, + "grad_norm": 0.7542827129364014, + "learning_rate": 0.000182401981513925, + "loss": 3.0148, + "step": 2918 + }, + { + "epoch": 0.2644560712101651, + "grad_norm": 0.6028794050216675, + "learning_rate": 0.0001823959403129342, + "loss": 2.09, + "step": 2919 + }, + { + "epoch": 0.2645466693846119, + "grad_norm": 0.6462416648864746, + "learning_rate": 0.00018238989911194346, + "loss": 2.1365, + "step": 2920 + }, + { + "epoch": 0.2646372675590587, + "grad_norm": 0.7977909445762634, + "learning_rate": 0.00018238385791095272, + "loss": 2.6982, + "step": 2921 + }, + { + "epoch": 0.2647278657335055, + "grad_norm": 0.6858701705932617, + "learning_rate": 0.00018237781670996195, + "loss": 2.7409, + "step": 2922 + }, + { + "epoch": 0.2648184639079523, + "grad_norm": 0.6700907349586487, + "learning_rate": 0.0001823717755089712, + "loss": 2.7691, + "step": 2923 + }, + { + "epoch": 0.26490906208239906, + "grad_norm": 0.7441837787628174, + "learning_rate": 0.00018236573430798042, + "loss": 2.7854, + "step": 2924 + }, + { + "epoch": 0.26499966025684585, + "grad_norm": 0.6969471573829651, + "learning_rate": 0.00018235969310698968, + "loss": 2.6078, + "step": 2925 + }, + { + "epoch": 0.26509025843129264, + "grad_norm": 0.8154081106185913, + "learning_rate": 0.00018235365190599892, + "loss": 3.142, + "step": 2926 + }, + { + "epoch": 0.26518085660573937, + "grad_norm": 0.6740630269050598, + "learning_rate": 0.00018234761070500818, + "loss": 2.6127, + "step": 2927 + }, + { + "epoch": 0.26527145478018616, + "grad_norm": 0.6447991728782654, + "learning_rate": 0.0001823415695040174, + "loss": 1.9612, + "step": 2928 + }, + { + "epoch": 0.26536205295463294, + "grad_norm": 0.7369371652603149, + "learning_rate": 0.00018233552830302665, + "loss": 2.8507, + "step": 2929 + }, + { + "epoch": 0.26545265112907973, + "grad_norm": 0.7207309603691101, + "learning_rate": 0.0001823294871020359, + "loss": 2.7779, + "step": 2930 + }, + { + "epoch": 0.2655432493035265, + "grad_norm": 0.7019874453544617, + "learning_rate": 0.00018232344590104514, + "loss": 2.9259, + "step": 2931 + }, + { + "epoch": 0.2656338474779733, + "grad_norm": 0.7187686562538147, + "learning_rate": 0.00018231740470005438, + "loss": 2.7451, + "step": 2932 + }, + { + "epoch": 0.2657244456524201, + "grad_norm": 0.7337777018547058, + "learning_rate": 0.0001823113634990636, + "loss": 2.9289, + "step": 2933 + }, + { + "epoch": 0.2658150438268669, + "grad_norm": 0.8100382089614868, + "learning_rate": 0.00018230532229807287, + "loss": 2.8465, + "step": 2934 + }, + { + "epoch": 0.26590564200131367, + "grad_norm": 0.7357702851295471, + "learning_rate": 0.00018229928109708213, + "loss": 2.9563, + "step": 2935 + }, + { + "epoch": 0.26599624017576046, + "grad_norm": 0.7598755359649658, + "learning_rate": 0.00018229323989609134, + "loss": 2.7737, + "step": 2936 + }, + { + "epoch": 0.26608683835020724, + "grad_norm": 0.6745544075965881, + "learning_rate": 0.0001822871986951006, + "loss": 2.6383, + "step": 2937 + }, + { + "epoch": 0.26617743652465403, + "grad_norm": 0.723174512386322, + "learning_rate": 0.00018228115749410983, + "loss": 2.7942, + "step": 2938 + }, + { + "epoch": 0.2662680346991008, + "grad_norm": 0.7219181656837463, + "learning_rate": 0.0001822751162931191, + "loss": 2.8644, + "step": 2939 + }, + { + "epoch": 0.2663586328735476, + "grad_norm": 0.6981692314147949, + "learning_rate": 0.00018226907509212833, + "loss": 2.8714, + "step": 2940 + }, + { + "epoch": 0.2664492310479944, + "grad_norm": 0.6882147789001465, + "learning_rate": 0.00018226303389113756, + "loss": 2.8097, + "step": 2941 + }, + { + "epoch": 0.2665398292224412, + "grad_norm": 0.6953893303871155, + "learning_rate": 0.0001822569926901468, + "loss": 2.6953, + "step": 2942 + }, + { + "epoch": 0.26663042739688797, + "grad_norm": 0.7246333360671997, + "learning_rate": 0.00018225095148915606, + "loss": 3.1272, + "step": 2943 + }, + { + "epoch": 0.26672102557133476, + "grad_norm": 0.713904619216919, + "learning_rate": 0.0001822449102881653, + "loss": 2.6506, + "step": 2944 + }, + { + "epoch": 0.26681162374578155, + "grad_norm": 0.7299203872680664, + "learning_rate": 0.00018223886908717453, + "loss": 2.6634, + "step": 2945 + }, + { + "epoch": 0.26690222192022833, + "grad_norm": 0.7662751078605652, + "learning_rate": 0.0001822328278861838, + "loss": 3.1052, + "step": 2946 + }, + { + "epoch": 0.26699282009467507, + "grad_norm": 0.7692747712135315, + "learning_rate": 0.00018222678668519302, + "loss": 2.8168, + "step": 2947 + }, + { + "epoch": 0.26708341826912185, + "grad_norm": 0.7331792712211609, + "learning_rate": 0.00018222074548420228, + "loss": 2.6384, + "step": 2948 + }, + { + "epoch": 0.26717401644356864, + "grad_norm": 0.754351794719696, + "learning_rate": 0.0001822147042832115, + "loss": 2.9376, + "step": 2949 + }, + { + "epoch": 0.26726461461801543, + "grad_norm": 0.7680066227912903, + "learning_rate": 0.00018220866308222075, + "loss": 2.8656, + "step": 2950 + }, + { + "epoch": 0.2673552127924622, + "grad_norm": 0.7108008861541748, + "learning_rate": 0.00018220262188123, + "loss": 2.8766, + "step": 2951 + }, + { + "epoch": 0.267445810966909, + "grad_norm": 0.725007951259613, + "learning_rate": 0.00018219658068023925, + "loss": 2.7823, + "step": 2952 + }, + { + "epoch": 0.2675364091413558, + "grad_norm": 0.7503877878189087, + "learning_rate": 0.00018219053947924848, + "loss": 2.8502, + "step": 2953 + }, + { + "epoch": 0.2676270073158026, + "grad_norm": 0.7535635232925415, + "learning_rate": 0.00018218449827825771, + "loss": 2.921, + "step": 2954 + }, + { + "epoch": 0.26771760549024937, + "grad_norm": 0.7328338623046875, + "learning_rate": 0.00018217845707726698, + "loss": 2.8768, + "step": 2955 + }, + { + "epoch": 0.26780820366469615, + "grad_norm": 0.7604708671569824, + "learning_rate": 0.0001821724158762762, + "loss": 2.6952, + "step": 2956 + }, + { + "epoch": 0.26789880183914294, + "grad_norm": 0.6030751466751099, + "learning_rate": 0.00018216637467528544, + "loss": 2.1435, + "step": 2957 + }, + { + "epoch": 0.26798940001358973, + "grad_norm": 0.8132200241088867, + "learning_rate": 0.0001821603334742947, + "loss": 3.0381, + "step": 2958 + }, + { + "epoch": 0.2680799981880365, + "grad_norm": 0.645603597164154, + "learning_rate": 0.00018215429227330394, + "loss": 2.2093, + "step": 2959 + }, + { + "epoch": 0.2681705963624833, + "grad_norm": 0.707737922668457, + "learning_rate": 0.0001821482510723132, + "loss": 2.7159, + "step": 2960 + }, + { + "epoch": 0.2682611945369301, + "grad_norm": 0.7224463224411011, + "learning_rate": 0.00018214220987132243, + "loss": 2.7178, + "step": 2961 + }, + { + "epoch": 0.2683517927113769, + "grad_norm": 0.7824317216873169, + "learning_rate": 0.00018213616867033167, + "loss": 2.7418, + "step": 2962 + }, + { + "epoch": 0.26844239088582367, + "grad_norm": 0.6930343508720398, + "learning_rate": 0.0001821301274693409, + "loss": 2.8576, + "step": 2963 + }, + { + "epoch": 0.26853298906027046, + "grad_norm": 0.6872320771217346, + "learning_rate": 0.00018212408626835016, + "loss": 2.8192, + "step": 2964 + }, + { + "epoch": 0.26862358723471724, + "grad_norm": 0.7405365705490112, + "learning_rate": 0.0001821180450673594, + "loss": 2.9375, + "step": 2965 + }, + { + "epoch": 0.26871418540916403, + "grad_norm": 0.7275986075401306, + "learning_rate": 0.00018211200386636863, + "loss": 3.2841, + "step": 2966 + }, + { + "epoch": 0.2688047835836108, + "grad_norm": 0.7160912752151489, + "learning_rate": 0.0001821059626653779, + "loss": 2.9441, + "step": 2967 + }, + { + "epoch": 0.26889538175805755, + "grad_norm": 0.76194167137146, + "learning_rate": 0.00018209992146438713, + "loss": 2.8592, + "step": 2968 + }, + { + "epoch": 0.26898597993250434, + "grad_norm": 0.6779826283454895, + "learning_rate": 0.0001820938802633964, + "loss": 2.5927, + "step": 2969 + }, + { + "epoch": 0.2690765781069511, + "grad_norm": 0.7029718160629272, + "learning_rate": 0.0001820878390624056, + "loss": 3.0308, + "step": 2970 + }, + { + "epoch": 0.2691671762813979, + "grad_norm": 0.7373092770576477, + "learning_rate": 0.00018208179786141486, + "loss": 2.7389, + "step": 2971 + }, + { + "epoch": 0.2692577744558447, + "grad_norm": 0.7430140376091003, + "learning_rate": 0.0001820757566604241, + "loss": 2.9024, + "step": 2972 + }, + { + "epoch": 0.2693483726302915, + "grad_norm": 0.6965854167938232, + "learning_rate": 0.00018206971545943335, + "loss": 2.7328, + "step": 2973 + }, + { + "epoch": 0.2694389708047383, + "grad_norm": 0.7401251792907715, + "learning_rate": 0.00018206367425844259, + "loss": 2.911, + "step": 2974 + }, + { + "epoch": 0.26952956897918506, + "grad_norm": 0.7711108326911926, + "learning_rate": 0.00018205763305745182, + "loss": 3.0848, + "step": 2975 + }, + { + "epoch": 0.26962016715363185, + "grad_norm": 0.7467608451843262, + "learning_rate": 0.00018205159185646108, + "loss": 3.0081, + "step": 2976 + }, + { + "epoch": 0.26971076532807864, + "grad_norm": 0.7296841740608215, + "learning_rate": 0.00018204555065547031, + "loss": 2.9778, + "step": 2977 + }, + { + "epoch": 0.2698013635025254, + "grad_norm": 0.6906659603118896, + "learning_rate": 0.00018203950945447958, + "loss": 2.7092, + "step": 2978 + }, + { + "epoch": 0.2698919616769722, + "grad_norm": 0.7962230443954468, + "learning_rate": 0.00018203346825348878, + "loss": 2.8444, + "step": 2979 + }, + { + "epoch": 0.269982559851419, + "grad_norm": 0.7523003220558167, + "learning_rate": 0.00018202742705249804, + "loss": 3.1655, + "step": 2980 + }, + { + "epoch": 0.2700731580258658, + "grad_norm": 0.6658657789230347, + "learning_rate": 0.0001820213858515073, + "loss": 2.5208, + "step": 2981 + }, + { + "epoch": 0.2701637562003126, + "grad_norm": 0.729396402835846, + "learning_rate": 0.00018201534465051654, + "loss": 2.7451, + "step": 2982 + }, + { + "epoch": 0.27025435437475936, + "grad_norm": 0.7322000861167908, + "learning_rate": 0.00018200930344952577, + "loss": 2.7236, + "step": 2983 + }, + { + "epoch": 0.27034495254920615, + "grad_norm": 0.7925589084625244, + "learning_rate": 0.000182003262248535, + "loss": 3.0367, + "step": 2984 + }, + { + "epoch": 0.27043555072365294, + "grad_norm": 0.7620494365692139, + "learning_rate": 0.00018199722104754427, + "loss": 2.8326, + "step": 2985 + }, + { + "epoch": 0.2705261488980997, + "grad_norm": 0.7171187400817871, + "learning_rate": 0.0001819911798465535, + "loss": 2.9377, + "step": 2986 + }, + { + "epoch": 0.2706167470725465, + "grad_norm": 0.7459127902984619, + "learning_rate": 0.00018198513864556274, + "loss": 2.9731, + "step": 2987 + }, + { + "epoch": 0.27070734524699325, + "grad_norm": 0.8095015287399292, + "learning_rate": 0.000181979097444572, + "loss": 2.6782, + "step": 2988 + }, + { + "epoch": 0.27079794342144003, + "grad_norm": 0.6817353963851929, + "learning_rate": 0.00018197305624358123, + "loss": 2.2983, + "step": 2989 + }, + { + "epoch": 0.2708885415958868, + "grad_norm": 0.5844125747680664, + "learning_rate": 0.0001819670150425905, + "loss": 2.0897, + "step": 2990 + }, + { + "epoch": 0.2709791397703336, + "grad_norm": 0.7630641460418701, + "learning_rate": 0.00018196097384159973, + "loss": 2.9914, + "step": 2991 + }, + { + "epoch": 0.2710697379447804, + "grad_norm": 0.7335880994796753, + "learning_rate": 0.00018195493264060896, + "loss": 2.1599, + "step": 2992 + }, + { + "epoch": 0.2711603361192272, + "grad_norm": 0.7501295208930969, + "learning_rate": 0.0001819488914396182, + "loss": 3.2476, + "step": 2993 + }, + { + "epoch": 0.271250934293674, + "grad_norm": 0.754296600818634, + "learning_rate": 0.00018194285023862746, + "loss": 2.9642, + "step": 2994 + }, + { + "epoch": 0.27134153246812076, + "grad_norm": 0.7231494784355164, + "learning_rate": 0.0001819368090376367, + "loss": 3.003, + "step": 2995 + }, + { + "epoch": 0.27143213064256755, + "grad_norm": 0.6982430815696716, + "learning_rate": 0.00018193076783664592, + "loss": 2.6221, + "step": 2996 + }, + { + "epoch": 0.27152272881701434, + "grad_norm": 0.7284793853759766, + "learning_rate": 0.00018192472663565519, + "loss": 2.8611, + "step": 2997 + }, + { + "epoch": 0.2716133269914611, + "grad_norm": 0.7455865740776062, + "learning_rate": 0.00018191868543466442, + "loss": 2.8369, + "step": 2998 + }, + { + "epoch": 0.2717039251659079, + "grad_norm": 0.7201706171035767, + "learning_rate": 0.00018191264423367368, + "loss": 2.9244, + "step": 2999 + }, + { + "epoch": 0.2717945233403547, + "grad_norm": 0.7710677981376648, + "learning_rate": 0.0001819066030326829, + "loss": 3.0309, + "step": 3000 + }, + { + "epoch": 0.2718851215148015, + "grad_norm": 0.7819190621376038, + "learning_rate": 0.00018190056183169215, + "loss": 3.1014, + "step": 3001 + }, + { + "epoch": 0.2719757196892483, + "grad_norm": 0.7338045835494995, + "learning_rate": 0.00018189452063070138, + "loss": 2.6708, + "step": 3002 + }, + { + "epoch": 0.27206631786369506, + "grad_norm": 0.7020090222358704, + "learning_rate": 0.00018188847942971064, + "loss": 2.7117, + "step": 3003 + }, + { + "epoch": 0.27215691603814185, + "grad_norm": 0.7166123986244202, + "learning_rate": 0.00018188243822871988, + "loss": 2.7946, + "step": 3004 + }, + { + "epoch": 0.27224751421258864, + "grad_norm": 0.6685934066772461, + "learning_rate": 0.0001818763970277291, + "loss": 2.6896, + "step": 3005 + }, + { + "epoch": 0.2723381123870354, + "grad_norm": 0.7717686295509338, + "learning_rate": 0.00018187035582673837, + "loss": 2.7903, + "step": 3006 + }, + { + "epoch": 0.2724287105614822, + "grad_norm": 0.8021900653839111, + "learning_rate": 0.0001818643146257476, + "loss": 3.0654, + "step": 3007 + }, + { + "epoch": 0.27251930873592894, + "grad_norm": 0.750463604927063, + "learning_rate": 0.00018185827342475684, + "loss": 2.7974, + "step": 3008 + }, + { + "epoch": 0.27260990691037573, + "grad_norm": 0.7263280153274536, + "learning_rate": 0.00018185223222376608, + "loss": 2.7099, + "step": 3009 + }, + { + "epoch": 0.2727005050848225, + "grad_norm": 0.7124164700508118, + "learning_rate": 0.00018184619102277534, + "loss": 2.8956, + "step": 3010 + }, + { + "epoch": 0.2727911032592693, + "grad_norm": 0.7099664211273193, + "learning_rate": 0.0001818401498217846, + "loss": 2.6794, + "step": 3011 + }, + { + "epoch": 0.2728817014337161, + "grad_norm": 0.7382985353469849, + "learning_rate": 0.00018183410862079383, + "loss": 2.8054, + "step": 3012 + }, + { + "epoch": 0.2729722996081629, + "grad_norm": 0.75839763879776, + "learning_rate": 0.00018182806741980307, + "loss": 3.0649, + "step": 3013 + }, + { + "epoch": 0.27306289778260967, + "grad_norm": 0.7682290077209473, + "learning_rate": 0.0001818220262188123, + "loss": 2.8302, + "step": 3014 + }, + { + "epoch": 0.27315349595705646, + "grad_norm": 0.8281857967376709, + "learning_rate": 0.00018181598501782156, + "loss": 2.7391, + "step": 3015 + }, + { + "epoch": 0.27324409413150325, + "grad_norm": 0.7258399128913879, + "learning_rate": 0.0001818099438168308, + "loss": 2.776, + "step": 3016 + }, + { + "epoch": 0.27333469230595003, + "grad_norm": 0.62248295545578, + "learning_rate": 0.00018180390261584003, + "loss": 1.9994, + "step": 3017 + }, + { + "epoch": 0.2734252904803968, + "grad_norm": 0.7154824733734131, + "learning_rate": 0.0001817978614148493, + "loss": 2.5919, + "step": 3018 + }, + { + "epoch": 0.2735158886548436, + "grad_norm": 0.7446551322937012, + "learning_rate": 0.00018179182021385852, + "loss": 2.7874, + "step": 3019 + }, + { + "epoch": 0.2736064868292904, + "grad_norm": 0.751803457736969, + "learning_rate": 0.00018178577901286779, + "loss": 2.8608, + "step": 3020 + }, + { + "epoch": 0.2736970850037372, + "grad_norm": 0.7438027858734131, + "learning_rate": 0.000181779737811877, + "loss": 3.025, + "step": 3021 + }, + { + "epoch": 0.27378768317818397, + "grad_norm": 0.7862982153892517, + "learning_rate": 0.00018177369661088625, + "loss": 2.6695, + "step": 3022 + }, + { + "epoch": 0.27387828135263076, + "grad_norm": 0.6995636224746704, + "learning_rate": 0.0001817676554098955, + "loss": 2.7453, + "step": 3023 + }, + { + "epoch": 0.27396887952707755, + "grad_norm": 0.5730842351913452, + "learning_rate": 0.00018176161420890475, + "loss": 2.102, + "step": 3024 + }, + { + "epoch": 0.27405947770152433, + "grad_norm": 0.7632070183753967, + "learning_rate": 0.00018175557300791398, + "loss": 3.1034, + "step": 3025 + }, + { + "epoch": 0.2741500758759711, + "grad_norm": 0.7293329834938049, + "learning_rate": 0.00018174953180692322, + "loss": 2.7146, + "step": 3026 + }, + { + "epoch": 0.2742406740504179, + "grad_norm": 0.7399683594703674, + "learning_rate": 0.00018174349060593248, + "loss": 2.7072, + "step": 3027 + }, + { + "epoch": 0.27433127222486464, + "grad_norm": 0.747677206993103, + "learning_rate": 0.0001817374494049417, + "loss": 2.6777, + "step": 3028 + }, + { + "epoch": 0.27442187039931143, + "grad_norm": 0.7293082475662231, + "learning_rate": 0.00018173140820395095, + "loss": 2.8177, + "step": 3029 + }, + { + "epoch": 0.2745124685737582, + "grad_norm": 0.7791783809661865, + "learning_rate": 0.00018172536700296018, + "loss": 3.1049, + "step": 3030 + }, + { + "epoch": 0.274603066748205, + "grad_norm": 0.7542382478713989, + "learning_rate": 0.00018171932580196944, + "loss": 2.7896, + "step": 3031 + }, + { + "epoch": 0.2746936649226518, + "grad_norm": 0.6950421929359436, + "learning_rate": 0.00018171328460097868, + "loss": 2.7489, + "step": 3032 + }, + { + "epoch": 0.2747842630970986, + "grad_norm": 0.731023371219635, + "learning_rate": 0.00018170724339998794, + "loss": 2.1661, + "step": 3033 + }, + { + "epoch": 0.27487486127154537, + "grad_norm": 0.7374275326728821, + "learning_rate": 0.00018170120219899717, + "loss": 3.1011, + "step": 3034 + }, + { + "epoch": 0.27496545944599216, + "grad_norm": 0.9960789680480957, + "learning_rate": 0.0001816951609980064, + "loss": 2.7692, + "step": 3035 + }, + { + "epoch": 0.27505605762043894, + "grad_norm": 0.7570144534111023, + "learning_rate": 0.00018168911979701567, + "loss": 2.9707, + "step": 3036 + }, + { + "epoch": 0.27514665579488573, + "grad_norm": 0.7316480278968811, + "learning_rate": 0.0001816830785960249, + "loss": 2.8069, + "step": 3037 + }, + { + "epoch": 0.2752372539693325, + "grad_norm": 0.7441943883895874, + "learning_rate": 0.00018167703739503413, + "loss": 2.8017, + "step": 3038 + }, + { + "epoch": 0.2753278521437793, + "grad_norm": 0.6402938961982727, + "learning_rate": 0.00018167099619404337, + "loss": 2.3514, + "step": 3039 + }, + { + "epoch": 0.2754184503182261, + "grad_norm": 0.7192772030830383, + "learning_rate": 0.00018166495499305263, + "loss": 2.7316, + "step": 3040 + }, + { + "epoch": 0.2755090484926729, + "grad_norm": 0.7422043681144714, + "learning_rate": 0.0001816589137920619, + "loss": 2.7921, + "step": 3041 + }, + { + "epoch": 0.27559964666711967, + "grad_norm": 0.8015037178993225, + "learning_rate": 0.0001816528725910711, + "loss": 3.1428, + "step": 3042 + }, + { + "epoch": 0.27569024484156646, + "grad_norm": 0.7187919020652771, + "learning_rate": 0.00018164683139008036, + "loss": 2.7572, + "step": 3043 + }, + { + "epoch": 0.27578084301601324, + "grad_norm": 0.7797412276268005, + "learning_rate": 0.0001816407901890896, + "loss": 3.0746, + "step": 3044 + }, + { + "epoch": 0.27587144119046003, + "grad_norm": 0.7102354168891907, + "learning_rate": 0.00018163474898809885, + "loss": 2.707, + "step": 3045 + }, + { + "epoch": 0.2759620393649068, + "grad_norm": 0.6829782128334045, + "learning_rate": 0.0001816287077871081, + "loss": 2.6868, + "step": 3046 + }, + { + "epoch": 0.2760526375393536, + "grad_norm": 0.6916815638542175, + "learning_rate": 0.00018162266658611732, + "loss": 2.8698, + "step": 3047 + }, + { + "epoch": 0.2761432357138004, + "grad_norm": 0.7642716765403748, + "learning_rate": 0.00018161662538512658, + "loss": 2.8899, + "step": 3048 + }, + { + "epoch": 0.2762338338882471, + "grad_norm": 0.7210948467254639, + "learning_rate": 0.00018161058418413582, + "loss": 2.8134, + "step": 3049 + }, + { + "epoch": 0.2763244320626939, + "grad_norm": 0.7593864798545837, + "learning_rate": 0.00018160454298314508, + "loss": 2.7563, + "step": 3050 + }, + { + "epoch": 0.2764150302371407, + "grad_norm": 0.7896215319633484, + "learning_rate": 0.00018159850178215428, + "loss": 3.2604, + "step": 3051 + }, + { + "epoch": 0.2765056284115875, + "grad_norm": 0.717738926410675, + "learning_rate": 0.00018159246058116355, + "loss": 2.9298, + "step": 3052 + }, + { + "epoch": 0.2765962265860343, + "grad_norm": 0.8624618053436279, + "learning_rate": 0.00018158641938017278, + "loss": 2.6605, + "step": 3053 + }, + { + "epoch": 0.27668682476048106, + "grad_norm": 0.7453065514564514, + "learning_rate": 0.00018158037817918204, + "loss": 2.7957, + "step": 3054 + }, + { + "epoch": 0.27677742293492785, + "grad_norm": 0.7429419755935669, + "learning_rate": 0.00018157433697819128, + "loss": 2.7546, + "step": 3055 + }, + { + "epoch": 0.27686802110937464, + "grad_norm": 0.7836247682571411, + "learning_rate": 0.0001815682957772005, + "loss": 2.6923, + "step": 3056 + }, + { + "epoch": 0.27695861928382143, + "grad_norm": 0.7823985815048218, + "learning_rate": 0.00018156225457620977, + "loss": 2.9994, + "step": 3057 + }, + { + "epoch": 0.2770492174582682, + "grad_norm": 0.7275105714797974, + "learning_rate": 0.000181556213375219, + "loss": 2.8191, + "step": 3058 + }, + { + "epoch": 0.277139815632715, + "grad_norm": 0.6653509140014648, + "learning_rate": 0.00018155017217422824, + "loss": 2.2254, + "step": 3059 + }, + { + "epoch": 0.2772304138071618, + "grad_norm": 0.8693149089813232, + "learning_rate": 0.00018154413097323747, + "loss": 2.8956, + "step": 3060 + }, + { + "epoch": 0.2773210119816086, + "grad_norm": 0.768696129322052, + "learning_rate": 0.00018153808977224673, + "loss": 2.8738, + "step": 3061 + }, + { + "epoch": 0.27741161015605537, + "grad_norm": 0.7266079187393188, + "learning_rate": 0.00018153204857125597, + "loss": 2.8171, + "step": 3062 + }, + { + "epoch": 0.27750220833050215, + "grad_norm": 0.7134519815444946, + "learning_rate": 0.00018152600737026523, + "loss": 2.7471, + "step": 3063 + }, + { + "epoch": 0.27759280650494894, + "grad_norm": 0.7260289788246155, + "learning_rate": 0.00018151996616927446, + "loss": 2.6209, + "step": 3064 + }, + { + "epoch": 0.27768340467939573, + "grad_norm": 0.833746612071991, + "learning_rate": 0.0001815139249682837, + "loss": 3.199, + "step": 3065 + }, + { + "epoch": 0.2777740028538425, + "grad_norm": 0.7056692838668823, + "learning_rate": 0.00018150788376729296, + "loss": 2.5974, + "step": 3066 + }, + { + "epoch": 0.2778646010282893, + "grad_norm": 0.784049928188324, + "learning_rate": 0.0001815018425663022, + "loss": 2.8512, + "step": 3067 + }, + { + "epoch": 0.2779551992027361, + "grad_norm": 0.760741114616394, + "learning_rate": 0.00018149580136531143, + "loss": 2.9258, + "step": 3068 + }, + { + "epoch": 0.2780457973771828, + "grad_norm": 0.670570433139801, + "learning_rate": 0.00018148976016432066, + "loss": 2.7405, + "step": 3069 + }, + { + "epoch": 0.2781363955516296, + "grad_norm": 0.7778488993644714, + "learning_rate": 0.00018148371896332992, + "loss": 2.8145, + "step": 3070 + }, + { + "epoch": 0.2782269937260764, + "grad_norm": 0.6881260871887207, + "learning_rate": 0.00018147767776233918, + "loss": 2.1565, + "step": 3071 + }, + { + "epoch": 0.2783175919005232, + "grad_norm": 0.6997243165969849, + "learning_rate": 0.0001814716365613484, + "loss": 2.7618, + "step": 3072 + }, + { + "epoch": 0.27840819007497, + "grad_norm": 0.7782740592956543, + "learning_rate": 0.00018146559536035765, + "loss": 2.783, + "step": 3073 + }, + { + "epoch": 0.27849878824941676, + "grad_norm": 0.7266526818275452, + "learning_rate": 0.00018145955415936688, + "loss": 2.8275, + "step": 3074 + }, + { + "epoch": 0.27858938642386355, + "grad_norm": 0.7603001594543457, + "learning_rate": 0.00018145351295837615, + "loss": 2.8533, + "step": 3075 + }, + { + "epoch": 0.27867998459831034, + "grad_norm": 0.7409597039222717, + "learning_rate": 0.00018144747175738538, + "loss": 2.7608, + "step": 3076 + }, + { + "epoch": 0.2787705827727571, + "grad_norm": 0.7374862432479858, + "learning_rate": 0.00018144143055639461, + "loss": 3.0846, + "step": 3077 + }, + { + "epoch": 0.2788611809472039, + "grad_norm": 0.7403279542922974, + "learning_rate": 0.00018143538935540388, + "loss": 2.7641, + "step": 3078 + }, + { + "epoch": 0.2789517791216507, + "grad_norm": 0.797390878200531, + "learning_rate": 0.0001814293481544131, + "loss": 2.9288, + "step": 3079 + }, + { + "epoch": 0.2790423772960975, + "grad_norm": 0.7443195581436157, + "learning_rate": 0.00018142330695342234, + "loss": 2.7557, + "step": 3080 + }, + { + "epoch": 0.2791329754705443, + "grad_norm": 0.7164614200592041, + "learning_rate": 0.00018141726575243158, + "loss": 2.3243, + "step": 3081 + }, + { + "epoch": 0.27922357364499106, + "grad_norm": 0.7325836420059204, + "learning_rate": 0.00018141122455144084, + "loss": 2.828, + "step": 3082 + }, + { + "epoch": 0.27931417181943785, + "grad_norm": 0.7697221636772156, + "learning_rate": 0.00018140518335045007, + "loss": 2.7631, + "step": 3083 + }, + { + "epoch": 0.27940476999388464, + "grad_norm": 0.7209231853485107, + "learning_rate": 0.00018139914214945933, + "loss": 2.8201, + "step": 3084 + }, + { + "epoch": 0.2794953681683314, + "grad_norm": 0.7279824614524841, + "learning_rate": 0.00018139310094846857, + "loss": 2.8717, + "step": 3085 + }, + { + "epoch": 0.2795859663427782, + "grad_norm": 0.7509292364120483, + "learning_rate": 0.0001813870597474778, + "loss": 2.7241, + "step": 3086 + }, + { + "epoch": 0.279676564517225, + "grad_norm": 0.7393221855163574, + "learning_rate": 0.00018138101854648706, + "loss": 3.005, + "step": 3087 + }, + { + "epoch": 0.2797671626916718, + "grad_norm": 0.7180887460708618, + "learning_rate": 0.0001813749773454963, + "loss": 2.6705, + "step": 3088 + }, + { + "epoch": 0.2798577608661185, + "grad_norm": 0.6989145278930664, + "learning_rate": 0.00018136893614450553, + "loss": 3.0544, + "step": 3089 + }, + { + "epoch": 0.2799483590405653, + "grad_norm": 0.7758776545524597, + "learning_rate": 0.00018136289494351477, + "loss": 2.824, + "step": 3090 + }, + { + "epoch": 0.2800389572150121, + "grad_norm": 0.7422905564308167, + "learning_rate": 0.00018135685374252403, + "loss": 2.8695, + "step": 3091 + }, + { + "epoch": 0.2801295553894589, + "grad_norm": 0.758729100227356, + "learning_rate": 0.00018135081254153326, + "loss": 2.9191, + "step": 3092 + }, + { + "epoch": 0.28022015356390567, + "grad_norm": 0.7676730155944824, + "learning_rate": 0.0001813447713405425, + "loss": 2.9995, + "step": 3093 + }, + { + "epoch": 0.28031075173835246, + "grad_norm": 0.7112414240837097, + "learning_rate": 0.00018133873013955176, + "loss": 2.7427, + "step": 3094 + }, + { + "epoch": 0.28040134991279925, + "grad_norm": 0.7521921396255493, + "learning_rate": 0.000181332688938561, + "loss": 2.5048, + "step": 3095 + }, + { + "epoch": 0.28049194808724603, + "grad_norm": 0.7200266122817993, + "learning_rate": 0.00018132664773757025, + "loss": 2.6295, + "step": 3096 + }, + { + "epoch": 0.2805825462616928, + "grad_norm": 0.8036657571792603, + "learning_rate": 0.00018132060653657948, + "loss": 1.9679, + "step": 3097 + }, + { + "epoch": 0.2806731444361396, + "grad_norm": 0.6990570425987244, + "learning_rate": 0.00018131456533558872, + "loss": 2.9211, + "step": 3098 + }, + { + "epoch": 0.2807637426105864, + "grad_norm": 0.7446485161781311, + "learning_rate": 0.00018130852413459795, + "loss": 2.9233, + "step": 3099 + }, + { + "epoch": 0.2808543407850332, + "grad_norm": 0.7021057605743408, + "learning_rate": 0.00018130248293360721, + "loss": 2.7232, + "step": 3100 + }, + { + "epoch": 0.28094493895948, + "grad_norm": 0.8227986097335815, + "learning_rate": 0.00018129644173261648, + "loss": 2.8708, + "step": 3101 + }, + { + "epoch": 0.28103553713392676, + "grad_norm": 0.7778086066246033, + "learning_rate": 0.00018129040053162568, + "loss": 3.1559, + "step": 3102 + }, + { + "epoch": 0.28112613530837355, + "grad_norm": 0.7307665348052979, + "learning_rate": 0.00018128435933063494, + "loss": 2.7158, + "step": 3103 + }, + { + "epoch": 0.28121673348282034, + "grad_norm": 0.7015884518623352, + "learning_rate": 0.00018127831812964418, + "loss": 2.686, + "step": 3104 + }, + { + "epoch": 0.2813073316572671, + "grad_norm": 0.780589759349823, + "learning_rate": 0.00018127227692865344, + "loss": 2.8855, + "step": 3105 + }, + { + "epoch": 0.2813979298317139, + "grad_norm": 0.7567388415336609, + "learning_rate": 0.00018126623572766265, + "loss": 2.659, + "step": 3106 + }, + { + "epoch": 0.2814885280061607, + "grad_norm": 0.7309257984161377, + "learning_rate": 0.0001812601945266719, + "loss": 2.8599, + "step": 3107 + }, + { + "epoch": 0.2815791261806075, + "grad_norm": 0.6774013042449951, + "learning_rate": 0.00018125415332568117, + "loss": 2.1618, + "step": 3108 + }, + { + "epoch": 0.2816697243550542, + "grad_norm": 0.775709331035614, + "learning_rate": 0.0001812481121246904, + "loss": 3.0476, + "step": 3109 + }, + { + "epoch": 0.281760322529501, + "grad_norm": 0.7934529781341553, + "learning_rate": 0.00018124207092369964, + "loss": 2.2021, + "step": 3110 + }, + { + "epoch": 0.2818509207039478, + "grad_norm": 0.7356604933738708, + "learning_rate": 0.00018123602972270887, + "loss": 2.9995, + "step": 3111 + }, + { + "epoch": 0.2819415188783946, + "grad_norm": 0.7071176767349243, + "learning_rate": 0.00018122998852171813, + "loss": 2.6534, + "step": 3112 + }, + { + "epoch": 0.28203211705284137, + "grad_norm": 0.6805758476257324, + "learning_rate": 0.00018122394732072737, + "loss": 2.8283, + "step": 3113 + }, + { + "epoch": 0.28212271522728816, + "grad_norm": 0.7031473517417908, + "learning_rate": 0.00018121790611973663, + "loss": 2.3642, + "step": 3114 + }, + { + "epoch": 0.28221331340173494, + "grad_norm": 0.7141949534416199, + "learning_rate": 0.00018121186491874586, + "loss": 2.7565, + "step": 3115 + }, + { + "epoch": 0.28230391157618173, + "grad_norm": 0.825384259223938, + "learning_rate": 0.0001812058237177551, + "loss": 2.7306, + "step": 3116 + }, + { + "epoch": 0.2823945097506285, + "grad_norm": 0.7165150046348572, + "learning_rate": 0.00018119978251676436, + "loss": 3.0553, + "step": 3117 + }, + { + "epoch": 0.2824851079250753, + "grad_norm": 0.7420250177383423, + "learning_rate": 0.0001811937413157736, + "loss": 2.9036, + "step": 3118 + }, + { + "epoch": 0.2825757060995221, + "grad_norm": 0.7694061994552612, + "learning_rate": 0.00018118770011478282, + "loss": 3.0273, + "step": 3119 + }, + { + "epoch": 0.2826663042739689, + "grad_norm": 0.7001278400421143, + "learning_rate": 0.00018118165891379206, + "loss": 2.9215, + "step": 3120 + }, + { + "epoch": 0.28275690244841567, + "grad_norm": 0.764970064163208, + "learning_rate": 0.00018117561771280132, + "loss": 2.8502, + "step": 3121 + }, + { + "epoch": 0.28284750062286246, + "grad_norm": 0.6433736681938171, + "learning_rate": 0.00018116957651181055, + "loss": 2.1476, + "step": 3122 + }, + { + "epoch": 0.28293809879730925, + "grad_norm": 0.7830066084861755, + "learning_rate": 0.0001811635353108198, + "loss": 2.4991, + "step": 3123 + }, + { + "epoch": 0.28302869697175603, + "grad_norm": 0.7752590775489807, + "learning_rate": 0.00018115749410982905, + "loss": 2.7474, + "step": 3124 + }, + { + "epoch": 0.2831192951462028, + "grad_norm": 0.689484179019928, + "learning_rate": 0.00018115145290883828, + "loss": 2.8323, + "step": 3125 + }, + { + "epoch": 0.2832098933206496, + "grad_norm": 0.6897534132003784, + "learning_rate": 0.00018114541170784754, + "loss": 2.5873, + "step": 3126 + }, + { + "epoch": 0.2833004914950964, + "grad_norm": 0.7172724604606628, + "learning_rate": 0.00018113937050685678, + "loss": 2.7679, + "step": 3127 + }, + { + "epoch": 0.2833910896695432, + "grad_norm": 0.8312467932701111, + "learning_rate": 0.000181133329305866, + "loss": 2.8296, + "step": 3128 + }, + { + "epoch": 0.28348168784398997, + "grad_norm": 0.7345076203346252, + "learning_rate": 0.00018112728810487525, + "loss": 2.8249, + "step": 3129 + }, + { + "epoch": 0.2835722860184367, + "grad_norm": 0.7507028579711914, + "learning_rate": 0.0001811212469038845, + "loss": 2.533, + "step": 3130 + }, + { + "epoch": 0.2836628841928835, + "grad_norm": 0.7335196733474731, + "learning_rate": 0.00018111520570289374, + "loss": 3.0548, + "step": 3131 + }, + { + "epoch": 0.2837534823673303, + "grad_norm": 0.7313501238822937, + "learning_rate": 0.00018110916450190297, + "loss": 2.9532, + "step": 3132 + }, + { + "epoch": 0.28384408054177707, + "grad_norm": 0.7749820351600647, + "learning_rate": 0.00018110312330091224, + "loss": 2.997, + "step": 3133 + }, + { + "epoch": 0.28393467871622385, + "grad_norm": 0.7138544917106628, + "learning_rate": 0.00018109708209992147, + "loss": 2.6325, + "step": 3134 + }, + { + "epoch": 0.28402527689067064, + "grad_norm": 0.7044491767883301, + "learning_rate": 0.00018109104089893073, + "loss": 2.8438, + "step": 3135 + }, + { + "epoch": 0.28411587506511743, + "grad_norm": 0.7165636420249939, + "learning_rate": 0.00018108499969793994, + "loss": 2.5957, + "step": 3136 + }, + { + "epoch": 0.2842064732395642, + "grad_norm": 0.7173700928688049, + "learning_rate": 0.0001810789584969492, + "loss": 2.5335, + "step": 3137 + }, + { + "epoch": 0.284297071414011, + "grad_norm": 0.7386311292648315, + "learning_rate": 0.00018107291729595846, + "loss": 2.6979, + "step": 3138 + }, + { + "epoch": 0.2843876695884578, + "grad_norm": 0.7332969903945923, + "learning_rate": 0.0001810668760949677, + "loss": 2.874, + "step": 3139 + }, + { + "epoch": 0.2844782677629046, + "grad_norm": 0.7685732245445251, + "learning_rate": 0.00018106083489397693, + "loss": 2.8606, + "step": 3140 + }, + { + "epoch": 0.28456886593735137, + "grad_norm": 0.771245539188385, + "learning_rate": 0.00018105479369298616, + "loss": 2.7347, + "step": 3141 + }, + { + "epoch": 0.28465946411179816, + "grad_norm": 0.7215897440910339, + "learning_rate": 0.00018104875249199542, + "loss": 2.6113, + "step": 3142 + }, + { + "epoch": 0.28475006228624494, + "grad_norm": 0.725309431552887, + "learning_rate": 0.00018104271129100466, + "loss": 2.9874, + "step": 3143 + }, + { + "epoch": 0.28484066046069173, + "grad_norm": 0.6638213992118835, + "learning_rate": 0.0001810366700900139, + "loss": 2.16, + "step": 3144 + }, + { + "epoch": 0.2849312586351385, + "grad_norm": 0.7502017021179199, + "learning_rate": 0.00018103062888902315, + "loss": 2.9456, + "step": 3145 + }, + { + "epoch": 0.2850218568095853, + "grad_norm": 0.7254956364631653, + "learning_rate": 0.0001810245876880324, + "loss": 2.8326, + "step": 3146 + }, + { + "epoch": 0.2851124549840321, + "grad_norm": 0.7734969854354858, + "learning_rate": 0.00018101854648704165, + "loss": 2.7492, + "step": 3147 + }, + { + "epoch": 0.2852030531584789, + "grad_norm": 0.7188076972961426, + "learning_rate": 0.00018101250528605088, + "loss": 2.7951, + "step": 3148 + }, + { + "epoch": 0.28529365133292567, + "grad_norm": 0.7402408719062805, + "learning_rate": 0.00018100646408506012, + "loss": 2.7545, + "step": 3149 + }, + { + "epoch": 0.2853842495073724, + "grad_norm": 0.6569768190383911, + "learning_rate": 0.00018100042288406935, + "loss": 1.5936, + "step": 3150 + }, + { + "epoch": 0.2854748476818192, + "grad_norm": 0.785923182964325, + "learning_rate": 0.0001809943816830786, + "loss": 3.1588, + "step": 3151 + }, + { + "epoch": 0.285565445856266, + "grad_norm": 0.813335657119751, + "learning_rate": 0.00018098834048208785, + "loss": 2.8537, + "step": 3152 + }, + { + "epoch": 0.28565604403071276, + "grad_norm": 0.6977028250694275, + "learning_rate": 0.00018098229928109708, + "loss": 2.7375, + "step": 3153 + }, + { + "epoch": 0.28574664220515955, + "grad_norm": 0.6771676540374756, + "learning_rate": 0.00018097625808010634, + "loss": 2.2406, + "step": 3154 + }, + { + "epoch": 0.28583724037960634, + "grad_norm": 0.7094483971595764, + "learning_rate": 0.00018097021687911557, + "loss": 2.815, + "step": 3155 + }, + { + "epoch": 0.2859278385540531, + "grad_norm": 0.6710242033004761, + "learning_rate": 0.00018096417567812484, + "loss": 2.3108, + "step": 3156 + }, + { + "epoch": 0.2860184367284999, + "grad_norm": 0.6939855813980103, + "learning_rate": 0.00018095813447713404, + "loss": 2.2404, + "step": 3157 + }, + { + "epoch": 0.2861090349029467, + "grad_norm": 0.7518349289894104, + "learning_rate": 0.0001809520932761433, + "loss": 2.8248, + "step": 3158 + }, + { + "epoch": 0.2861996330773935, + "grad_norm": 0.7550462484359741, + "learning_rate": 0.00018094605207515254, + "loss": 2.6805, + "step": 3159 + }, + { + "epoch": 0.2862902312518403, + "grad_norm": 0.7272040247917175, + "learning_rate": 0.0001809400108741618, + "loss": 2.7465, + "step": 3160 + }, + { + "epoch": 0.28638082942628706, + "grad_norm": 0.6982013583183289, + "learning_rate": 0.00018093396967317103, + "loss": 2.1584, + "step": 3161 + }, + { + "epoch": 0.28647142760073385, + "grad_norm": 0.7784320116043091, + "learning_rate": 0.00018092792847218027, + "loss": 3.0866, + "step": 3162 + }, + { + "epoch": 0.28656202577518064, + "grad_norm": 0.7481244206428528, + "learning_rate": 0.00018092188727118953, + "loss": 2.8376, + "step": 3163 + }, + { + "epoch": 0.2866526239496274, + "grad_norm": 0.7580200433731079, + "learning_rate": 0.00018091584607019876, + "loss": 2.8952, + "step": 3164 + }, + { + "epoch": 0.2867432221240742, + "grad_norm": 0.7507246136665344, + "learning_rate": 0.00018090980486920802, + "loss": 2.8469, + "step": 3165 + }, + { + "epoch": 0.286833820298521, + "grad_norm": 0.7784203886985779, + "learning_rate": 0.00018090376366821723, + "loss": 2.7371, + "step": 3166 + }, + { + "epoch": 0.2869244184729678, + "grad_norm": 0.760612964630127, + "learning_rate": 0.0001808977224672265, + "loss": 2.7256, + "step": 3167 + }, + { + "epoch": 0.2870150166474146, + "grad_norm": 0.7045333385467529, + "learning_rate": 0.00018089168126623575, + "loss": 2.4602, + "step": 3168 + }, + { + "epoch": 0.28710561482186137, + "grad_norm": 0.7124137878417969, + "learning_rate": 0.000180885640065245, + "loss": 2.7041, + "step": 3169 + }, + { + "epoch": 0.2871962129963081, + "grad_norm": 0.7106683254241943, + "learning_rate": 0.00018087959886425422, + "loss": 2.925, + "step": 3170 + }, + { + "epoch": 0.2872868111707549, + "grad_norm": 0.7598718404769897, + "learning_rate": 0.00018087355766326346, + "loss": 2.8131, + "step": 3171 + }, + { + "epoch": 0.2873774093452017, + "grad_norm": 0.7356531620025635, + "learning_rate": 0.00018086751646227272, + "loss": 2.7491, + "step": 3172 + }, + { + "epoch": 0.28746800751964846, + "grad_norm": 0.7331535816192627, + "learning_rate": 0.00018086147526128195, + "loss": 3.0712, + "step": 3173 + }, + { + "epoch": 0.28755860569409525, + "grad_norm": 0.6879481077194214, + "learning_rate": 0.00018085543406029118, + "loss": 2.7633, + "step": 3174 + }, + { + "epoch": 0.28764920386854204, + "grad_norm": 0.8379855155944824, + "learning_rate": 0.00018084939285930045, + "loss": 3.1484, + "step": 3175 + }, + { + "epoch": 0.2877398020429888, + "grad_norm": 0.7503129839897156, + "learning_rate": 0.00018084335165830968, + "loss": 3.0714, + "step": 3176 + }, + { + "epoch": 0.2878304002174356, + "grad_norm": 0.7638570070266724, + "learning_rate": 0.00018083731045731894, + "loss": 2.9115, + "step": 3177 + }, + { + "epoch": 0.2879209983918824, + "grad_norm": 0.7393748760223389, + "learning_rate": 0.00018083126925632818, + "loss": 2.562, + "step": 3178 + }, + { + "epoch": 0.2880115965663292, + "grad_norm": 0.7965543270111084, + "learning_rate": 0.0001808252280553374, + "loss": 3.2749, + "step": 3179 + }, + { + "epoch": 0.288102194740776, + "grad_norm": 0.6999273896217346, + "learning_rate": 0.00018081918685434664, + "loss": 2.7898, + "step": 3180 + }, + { + "epoch": 0.28819279291522276, + "grad_norm": 0.7505266070365906, + "learning_rate": 0.0001808131456533559, + "loss": 2.5003, + "step": 3181 + }, + { + "epoch": 0.28828339108966955, + "grad_norm": 0.7300152778625488, + "learning_rate": 0.00018080710445236514, + "loss": 2.271, + "step": 3182 + }, + { + "epoch": 0.28837398926411634, + "grad_norm": 0.75677090883255, + "learning_rate": 0.00018080106325137437, + "loss": 2.682, + "step": 3183 + }, + { + "epoch": 0.2884645874385631, + "grad_norm": 0.8138642311096191, + "learning_rate": 0.00018079502205038363, + "loss": 2.7856, + "step": 3184 + }, + { + "epoch": 0.2885551856130099, + "grad_norm": 0.7322288751602173, + "learning_rate": 0.00018078898084939287, + "loss": 2.7504, + "step": 3185 + }, + { + "epoch": 0.2886457837874567, + "grad_norm": 0.6965956687927246, + "learning_rate": 0.00018078293964840213, + "loss": 2.8221, + "step": 3186 + }, + { + "epoch": 0.2887363819619035, + "grad_norm": 0.7423352599143982, + "learning_rate": 0.00018077689844741134, + "loss": 2.911, + "step": 3187 + }, + { + "epoch": 0.2888269801363503, + "grad_norm": 0.7403784394264221, + "learning_rate": 0.0001807708572464206, + "loss": 2.7261, + "step": 3188 + }, + { + "epoch": 0.28891757831079706, + "grad_norm": 0.7124126553535461, + "learning_rate": 0.00018076481604542983, + "loss": 2.6465, + "step": 3189 + }, + { + "epoch": 0.2890081764852438, + "grad_norm": 0.6965331435203552, + "learning_rate": 0.0001807587748444391, + "loss": 2.5874, + "step": 3190 + }, + { + "epoch": 0.2890987746596906, + "grad_norm": 0.7444828152656555, + "learning_rate": 0.00018075273364344833, + "loss": 2.8709, + "step": 3191 + }, + { + "epoch": 0.28918937283413737, + "grad_norm": 0.6894030570983887, + "learning_rate": 0.00018074669244245756, + "loss": 2.6244, + "step": 3192 + }, + { + "epoch": 0.28927997100858416, + "grad_norm": 0.6738059520721436, + "learning_rate": 0.00018074065124146682, + "loss": 3.095, + "step": 3193 + }, + { + "epoch": 0.28937056918303095, + "grad_norm": 0.6756229996681213, + "learning_rate": 0.00018073461004047606, + "loss": 1.9821, + "step": 3194 + }, + { + "epoch": 0.28946116735747773, + "grad_norm": 0.7063896059989929, + "learning_rate": 0.0001807285688394853, + "loss": 2.8119, + "step": 3195 + }, + { + "epoch": 0.2895517655319245, + "grad_norm": 0.7771455645561218, + "learning_rate": 0.00018072252763849452, + "loss": 2.8812, + "step": 3196 + }, + { + "epoch": 0.2896423637063713, + "grad_norm": 0.7464988231658936, + "learning_rate": 0.00018071648643750378, + "loss": 2.8867, + "step": 3197 + }, + { + "epoch": 0.2897329618808181, + "grad_norm": 0.7579567432403564, + "learning_rate": 0.00018071044523651305, + "loss": 2.7766, + "step": 3198 + }, + { + "epoch": 0.2898235600552649, + "grad_norm": 0.7006301283836365, + "learning_rate": 0.00018070440403552228, + "loss": 2.9626, + "step": 3199 + }, + { + "epoch": 0.28991415822971167, + "grad_norm": 0.6801049709320068, + "learning_rate": 0.00018069836283453151, + "loss": 2.1253, + "step": 3200 + }, + { + "epoch": 0.29000475640415846, + "grad_norm": 0.7207200527191162, + "learning_rate": 0.00018069232163354075, + "loss": 2.9394, + "step": 3201 + }, + { + "epoch": 0.29009535457860525, + "grad_norm": 0.7347763776779175, + "learning_rate": 0.00018068628043255, + "loss": 2.8301, + "step": 3202 + }, + { + "epoch": 0.29018595275305203, + "grad_norm": 0.7459425926208496, + "learning_rate": 0.00018068023923155924, + "loss": 2.8521, + "step": 3203 + }, + { + "epoch": 0.2902765509274988, + "grad_norm": 0.742396891117096, + "learning_rate": 0.00018067419803056848, + "loss": 3.098, + "step": 3204 + }, + { + "epoch": 0.2903671491019456, + "grad_norm": 0.7137303948402405, + "learning_rate": 0.00018066815682957774, + "loss": 2.9494, + "step": 3205 + }, + { + "epoch": 0.2904577472763924, + "grad_norm": 0.7113533616065979, + "learning_rate": 0.00018066211562858697, + "loss": 2.8887, + "step": 3206 + }, + { + "epoch": 0.2905483454508392, + "grad_norm": 0.7121194005012512, + "learning_rate": 0.00018065607442759623, + "loss": 2.8629, + "step": 3207 + }, + { + "epoch": 0.290638943625286, + "grad_norm": 0.7377181649208069, + "learning_rate": 0.00018065003322660544, + "loss": 2.7413, + "step": 3208 + }, + { + "epoch": 0.29072954179973276, + "grad_norm": 0.7336846590042114, + "learning_rate": 0.0001806439920256147, + "loss": 3.031, + "step": 3209 + }, + { + "epoch": 0.29082013997417955, + "grad_norm": 0.7256117463111877, + "learning_rate": 0.00018063795082462394, + "loss": 2.8178, + "step": 3210 + }, + { + "epoch": 0.2909107381486263, + "grad_norm": 0.731962263584137, + "learning_rate": 0.0001806319096236332, + "loss": 2.5961, + "step": 3211 + }, + { + "epoch": 0.29100133632307307, + "grad_norm": 0.8803234100341797, + "learning_rate": 0.00018062586842264243, + "loss": 2.9194, + "step": 3212 + }, + { + "epoch": 0.29109193449751986, + "grad_norm": 0.7180642485618591, + "learning_rate": 0.00018061982722165167, + "loss": 2.6727, + "step": 3213 + }, + { + "epoch": 0.29118253267196664, + "grad_norm": 0.732540488243103, + "learning_rate": 0.00018061378602066093, + "loss": 2.0356, + "step": 3214 + }, + { + "epoch": 0.29127313084641343, + "grad_norm": 0.6564884781837463, + "learning_rate": 0.00018060774481967016, + "loss": 2.0715, + "step": 3215 + }, + { + "epoch": 0.2913637290208602, + "grad_norm": 0.7341161370277405, + "learning_rate": 0.0001806017036186794, + "loss": 2.8581, + "step": 3216 + }, + { + "epoch": 0.291454327195307, + "grad_norm": 0.7510519623756409, + "learning_rate": 0.00018059566241768863, + "loss": 3.0354, + "step": 3217 + }, + { + "epoch": 0.2915449253697538, + "grad_norm": 0.8147776126861572, + "learning_rate": 0.0001805896212166979, + "loss": 2.9734, + "step": 3218 + }, + { + "epoch": 0.2916355235442006, + "grad_norm": 0.684376060962677, + "learning_rate": 0.00018058358001570712, + "loss": 2.6884, + "step": 3219 + }, + { + "epoch": 0.29172612171864737, + "grad_norm": 0.7656441330909729, + "learning_rate": 0.00018057753881471638, + "loss": 2.9855, + "step": 3220 + }, + { + "epoch": 0.29181671989309416, + "grad_norm": 0.7274540066719055, + "learning_rate": 0.00018057149761372562, + "loss": 2.8294, + "step": 3221 + }, + { + "epoch": 0.29190731806754094, + "grad_norm": 0.6616628766059875, + "learning_rate": 0.00018056545641273485, + "loss": 2.1325, + "step": 3222 + }, + { + "epoch": 0.29199791624198773, + "grad_norm": 0.7014650106430054, + "learning_rate": 0.00018055941521174411, + "loss": 2.6139, + "step": 3223 + }, + { + "epoch": 0.2920885144164345, + "grad_norm": 0.8746328949928284, + "learning_rate": 0.00018055337401075335, + "loss": 2.5776, + "step": 3224 + }, + { + "epoch": 0.2921791125908813, + "grad_norm": 0.7314022779464722, + "learning_rate": 0.00018054733280976258, + "loss": 2.742, + "step": 3225 + }, + { + "epoch": 0.2922697107653281, + "grad_norm": 0.7168747782707214, + "learning_rate": 0.00018054129160877182, + "loss": 2.7022, + "step": 3226 + }, + { + "epoch": 0.2923603089397749, + "grad_norm": 0.7439312934875488, + "learning_rate": 0.00018053525040778108, + "loss": 2.9575, + "step": 3227 + }, + { + "epoch": 0.29245090711422167, + "grad_norm": 0.781930685043335, + "learning_rate": 0.00018052920920679034, + "loss": 2.2489, + "step": 3228 + }, + { + "epoch": 0.29254150528866846, + "grad_norm": 0.6966174244880676, + "learning_rate": 0.00018052316800579955, + "loss": 2.7836, + "step": 3229 + }, + { + "epoch": 0.29263210346311525, + "grad_norm": 0.8428810834884644, + "learning_rate": 0.0001805171268048088, + "loss": 2.8898, + "step": 3230 + }, + { + "epoch": 0.292722701637562, + "grad_norm": 0.742038905620575, + "learning_rate": 0.00018051108560381804, + "loss": 2.8173, + "step": 3231 + }, + { + "epoch": 0.29281329981200876, + "grad_norm": 0.7580826878547668, + "learning_rate": 0.0001805050444028273, + "loss": 2.6476, + "step": 3232 + }, + { + "epoch": 0.29290389798645555, + "grad_norm": 0.8651383519172668, + "learning_rate": 0.00018049900320183654, + "loss": 3.224, + "step": 3233 + }, + { + "epoch": 0.29299449616090234, + "grad_norm": 0.7104471921920776, + "learning_rate": 0.00018049296200084577, + "loss": 2.6667, + "step": 3234 + }, + { + "epoch": 0.29308509433534913, + "grad_norm": 0.7260295152664185, + "learning_rate": 0.00018048692079985503, + "loss": 2.2421, + "step": 3235 + }, + { + "epoch": 0.2931756925097959, + "grad_norm": 0.7050344944000244, + "learning_rate": 0.00018048087959886427, + "loss": 2.9608, + "step": 3236 + }, + { + "epoch": 0.2932662906842427, + "grad_norm": 0.7539309859275818, + "learning_rate": 0.00018047483839787353, + "loss": 3.0354, + "step": 3237 + }, + { + "epoch": 0.2933568888586895, + "grad_norm": 0.6987184286117554, + "learning_rate": 0.00018046879719688273, + "loss": 2.8611, + "step": 3238 + }, + { + "epoch": 0.2934474870331363, + "grad_norm": 0.7437067031860352, + "learning_rate": 0.000180462755995892, + "loss": 3.0002, + "step": 3239 + }, + { + "epoch": 0.29353808520758307, + "grad_norm": 0.7501639127731323, + "learning_rate": 0.00018045671479490123, + "loss": 2.6817, + "step": 3240 + }, + { + "epoch": 0.29362868338202985, + "grad_norm": 0.7167664170265198, + "learning_rate": 0.0001804506735939105, + "loss": 2.7859, + "step": 3241 + }, + { + "epoch": 0.29371928155647664, + "grad_norm": 0.7147490382194519, + "learning_rate": 0.00018044463239291972, + "loss": 3.0216, + "step": 3242 + }, + { + "epoch": 0.29380987973092343, + "grad_norm": 0.6820878386497498, + "learning_rate": 0.00018043859119192896, + "loss": 2.8826, + "step": 3243 + }, + { + "epoch": 0.2939004779053702, + "grad_norm": 0.734776496887207, + "learning_rate": 0.00018043254999093822, + "loss": 2.8333, + "step": 3244 + }, + { + "epoch": 0.293991076079817, + "grad_norm": 0.792255699634552, + "learning_rate": 0.00018042650878994745, + "loss": 2.7125, + "step": 3245 + }, + { + "epoch": 0.2940816742542638, + "grad_norm": 0.7831583619117737, + "learning_rate": 0.0001804204675889567, + "loss": 2.6682, + "step": 3246 + }, + { + "epoch": 0.2941722724287106, + "grad_norm": 0.6631620526313782, + "learning_rate": 0.00018041442638796592, + "loss": 2.1487, + "step": 3247 + }, + { + "epoch": 0.29426287060315737, + "grad_norm": 0.7263641953468323, + "learning_rate": 0.00018040838518697518, + "loss": 2.7808, + "step": 3248 + }, + { + "epoch": 0.29435346877760415, + "grad_norm": 0.6195445656776428, + "learning_rate": 0.00018040234398598442, + "loss": 2.0676, + "step": 3249 + }, + { + "epoch": 0.29444406695205094, + "grad_norm": 0.7439830303192139, + "learning_rate": 0.00018039630278499368, + "loss": 2.8045, + "step": 3250 + }, + { + "epoch": 0.2945346651264977, + "grad_norm": 0.723051905632019, + "learning_rate": 0.0001803902615840029, + "loss": 2.7734, + "step": 3251 + }, + { + "epoch": 0.29462526330094446, + "grad_norm": 0.7320566773414612, + "learning_rate": 0.00018038422038301215, + "loss": 3.0122, + "step": 3252 + }, + { + "epoch": 0.29471586147539125, + "grad_norm": 0.770520031452179, + "learning_rate": 0.0001803781791820214, + "loss": 3.0972, + "step": 3253 + }, + { + "epoch": 0.29480645964983804, + "grad_norm": 0.6781139969825745, + "learning_rate": 0.00018037213798103064, + "loss": 2.0559, + "step": 3254 + }, + { + "epoch": 0.2948970578242848, + "grad_norm": 0.7520358562469482, + "learning_rate": 0.00018036609678003987, + "loss": 2.9131, + "step": 3255 + }, + { + "epoch": 0.2949876559987316, + "grad_norm": 0.8091472387313843, + "learning_rate": 0.0001803600555790491, + "loss": 2.8277, + "step": 3256 + }, + { + "epoch": 0.2950782541731784, + "grad_norm": 0.7189067006111145, + "learning_rate": 0.00018035401437805837, + "loss": 2.2217, + "step": 3257 + }, + { + "epoch": 0.2951688523476252, + "grad_norm": 0.6992072463035583, + "learning_rate": 0.00018034797317706763, + "loss": 2.7623, + "step": 3258 + }, + { + "epoch": 0.295259450522072, + "grad_norm": 0.7308423519134521, + "learning_rate": 0.00018034193197607684, + "loss": 2.888, + "step": 3259 + }, + { + "epoch": 0.29535004869651876, + "grad_norm": 0.6919962167739868, + "learning_rate": 0.0001803358907750861, + "loss": 2.7046, + "step": 3260 + }, + { + "epoch": 0.29544064687096555, + "grad_norm": 0.7460502982139587, + "learning_rate": 0.00018032984957409533, + "loss": 2.8475, + "step": 3261 + }, + { + "epoch": 0.29553124504541234, + "grad_norm": 0.7264595627784729, + "learning_rate": 0.0001803238083731046, + "loss": 2.6291, + "step": 3262 + }, + { + "epoch": 0.2956218432198591, + "grad_norm": 0.7593461871147156, + "learning_rate": 0.00018031776717211383, + "loss": 2.938, + "step": 3263 + }, + { + "epoch": 0.2957124413943059, + "grad_norm": 0.7146178483963013, + "learning_rate": 0.00018031172597112306, + "loss": 2.9818, + "step": 3264 + }, + { + "epoch": 0.2958030395687527, + "grad_norm": 0.8213822245597839, + "learning_rate": 0.00018030568477013232, + "loss": 2.7194, + "step": 3265 + }, + { + "epoch": 0.2958936377431995, + "grad_norm": 0.7235067486763, + "learning_rate": 0.00018029964356914156, + "loss": 2.881, + "step": 3266 + }, + { + "epoch": 0.2959842359176463, + "grad_norm": 0.7231917381286621, + "learning_rate": 0.0001802936023681508, + "loss": 3.0128, + "step": 3267 + }, + { + "epoch": 0.29607483409209306, + "grad_norm": 0.6256208419799805, + "learning_rate": 0.00018028756116716003, + "loss": 2.1317, + "step": 3268 + }, + { + "epoch": 0.29616543226653985, + "grad_norm": 0.6854420304298401, + "learning_rate": 0.0001802815199661693, + "loss": 2.687, + "step": 3269 + }, + { + "epoch": 0.29625603044098664, + "grad_norm": 0.750320315361023, + "learning_rate": 0.00018027547876517852, + "loss": 2.7939, + "step": 3270 + }, + { + "epoch": 0.29634662861543337, + "grad_norm": 0.7379075288772583, + "learning_rate": 0.00018026943756418778, + "loss": 2.8877, + "step": 3271 + }, + { + "epoch": 0.29643722678988016, + "grad_norm": 0.6987753510475159, + "learning_rate": 0.000180263396363197, + "loss": 2.8595, + "step": 3272 + }, + { + "epoch": 0.29652782496432695, + "grad_norm": 0.7178618907928467, + "learning_rate": 0.00018025735516220625, + "loss": 2.748, + "step": 3273 + }, + { + "epoch": 0.29661842313877373, + "grad_norm": 0.7306898832321167, + "learning_rate": 0.0001802513139612155, + "loss": 2.6297, + "step": 3274 + }, + { + "epoch": 0.2967090213132205, + "grad_norm": 0.7130541205406189, + "learning_rate": 0.00018024527276022475, + "loss": 2.9144, + "step": 3275 + }, + { + "epoch": 0.2967996194876673, + "grad_norm": 0.6110102534294128, + "learning_rate": 0.00018023923155923398, + "loss": 2.3417, + "step": 3276 + }, + { + "epoch": 0.2968902176621141, + "grad_norm": 0.6689790487289429, + "learning_rate": 0.00018023319035824321, + "loss": 2.1513, + "step": 3277 + }, + { + "epoch": 0.2969808158365609, + "grad_norm": 0.7300906181335449, + "learning_rate": 0.00018022714915725247, + "loss": 2.4869, + "step": 3278 + }, + { + "epoch": 0.2970714140110077, + "grad_norm": 0.7007147073745728, + "learning_rate": 0.0001802211079562617, + "loss": 2.8285, + "step": 3279 + }, + { + "epoch": 0.29716201218545446, + "grad_norm": 0.7822953462600708, + "learning_rate": 0.00018021506675527094, + "loss": 2.793, + "step": 3280 + }, + { + "epoch": 0.29725261035990125, + "grad_norm": 0.7704519033432007, + "learning_rate": 0.0001802090255542802, + "loss": 2.7805, + "step": 3281 + }, + { + "epoch": 0.29734320853434804, + "grad_norm": 0.8939983248710632, + "learning_rate": 0.00018020298435328944, + "loss": 2.4186, + "step": 3282 + }, + { + "epoch": 0.2974338067087948, + "grad_norm": 0.7478305697441101, + "learning_rate": 0.0001801969431522987, + "loss": 2.8357, + "step": 3283 + }, + { + "epoch": 0.2975244048832416, + "grad_norm": 0.7200047969818115, + "learning_rate": 0.00018019090195130793, + "loss": 2.8018, + "step": 3284 + }, + { + "epoch": 0.2976150030576884, + "grad_norm": 0.7027959227561951, + "learning_rate": 0.00018018486075031717, + "loss": 2.765, + "step": 3285 + }, + { + "epoch": 0.2977056012321352, + "grad_norm": 0.7487689256668091, + "learning_rate": 0.0001801788195493264, + "loss": 2.5617, + "step": 3286 + }, + { + "epoch": 0.297796199406582, + "grad_norm": 0.7320858836174011, + "learning_rate": 0.00018017277834833566, + "loss": 2.8957, + "step": 3287 + }, + { + "epoch": 0.29788679758102876, + "grad_norm": 0.7254377603530884, + "learning_rate": 0.00018016673714734492, + "loss": 2.888, + "step": 3288 + }, + { + "epoch": 0.29797739575547555, + "grad_norm": 0.7567802667617798, + "learning_rate": 0.00018016069594635413, + "loss": 2.9611, + "step": 3289 + }, + { + "epoch": 0.29806799392992234, + "grad_norm": 0.7313582897186279, + "learning_rate": 0.0001801546547453634, + "loss": 2.8371, + "step": 3290 + }, + { + "epoch": 0.2981585921043691, + "grad_norm": 0.7784324288368225, + "learning_rate": 0.00018014861354437263, + "loss": 2.946, + "step": 3291 + }, + { + "epoch": 0.29824919027881586, + "grad_norm": 0.6958277821540833, + "learning_rate": 0.0001801425723433819, + "loss": 2.6878, + "step": 3292 + }, + { + "epoch": 0.29833978845326264, + "grad_norm": 0.741497814655304, + "learning_rate": 0.0001801365311423911, + "loss": 2.9144, + "step": 3293 + }, + { + "epoch": 0.29843038662770943, + "grad_norm": 0.7461440563201904, + "learning_rate": 0.00018013048994140036, + "loss": 2.869, + "step": 3294 + }, + { + "epoch": 0.2985209848021562, + "grad_norm": 0.7840694785118103, + "learning_rate": 0.00018012444874040962, + "loss": 2.9079, + "step": 3295 + }, + { + "epoch": 0.298611582976603, + "grad_norm": 0.7013192176818848, + "learning_rate": 0.00018011840753941885, + "loss": 2.7313, + "step": 3296 + }, + { + "epoch": 0.2987021811510498, + "grad_norm": 0.7170293927192688, + "learning_rate": 0.00018011236633842808, + "loss": 2.7486, + "step": 3297 + }, + { + "epoch": 0.2987927793254966, + "grad_norm": 0.7372299432754517, + "learning_rate": 0.00018010632513743732, + "loss": 2.9893, + "step": 3298 + }, + { + "epoch": 0.29888337749994337, + "grad_norm": 0.7593883275985718, + "learning_rate": 0.00018010028393644658, + "loss": 2.9733, + "step": 3299 + }, + { + "epoch": 0.29897397567439016, + "grad_norm": 0.7155198454856873, + "learning_rate": 0.00018009424273545581, + "loss": 2.6883, + "step": 3300 + }, + { + "epoch": 0.29906457384883695, + "grad_norm": 0.7552247643470764, + "learning_rate": 0.00018008820153446507, + "loss": 2.7254, + "step": 3301 + }, + { + "epoch": 0.29915517202328373, + "grad_norm": 0.6073634624481201, + "learning_rate": 0.00018008216033347428, + "loss": 2.1581, + "step": 3302 + }, + { + "epoch": 0.2992457701977305, + "grad_norm": 0.9000061750411987, + "learning_rate": 0.00018007611913248354, + "loss": 3.0715, + "step": 3303 + }, + { + "epoch": 0.2993363683721773, + "grad_norm": 0.7769244909286499, + "learning_rate": 0.0001800700779314928, + "loss": 2.8152, + "step": 3304 + }, + { + "epoch": 0.2994269665466241, + "grad_norm": 0.7376443147659302, + "learning_rate": 0.00018006403673050204, + "loss": 2.8194, + "step": 3305 + }, + { + "epoch": 0.2995175647210709, + "grad_norm": 0.7208224534988403, + "learning_rate": 0.00018005799552951127, + "loss": 2.8211, + "step": 3306 + }, + { + "epoch": 0.29960816289551767, + "grad_norm": 0.7379881739616394, + "learning_rate": 0.0001800519543285205, + "loss": 3.1488, + "step": 3307 + }, + { + "epoch": 0.29969876106996446, + "grad_norm": 0.7263110280036926, + "learning_rate": 0.00018004591312752977, + "loss": 2.6352, + "step": 3308 + }, + { + "epoch": 0.29978935924441125, + "grad_norm": 0.7178776264190674, + "learning_rate": 0.000180039871926539, + "loss": 2.891, + "step": 3309 + }, + { + "epoch": 0.29987995741885803, + "grad_norm": 0.7888861894607544, + "learning_rate": 0.00018003383072554824, + "loss": 2.8783, + "step": 3310 + }, + { + "epoch": 0.2999705555933048, + "grad_norm": 0.7275456786155701, + "learning_rate": 0.0001800277895245575, + "loss": 2.7592, + "step": 3311 + }, + { + "epoch": 0.30006115376775155, + "grad_norm": 0.7323940992355347, + "learning_rate": 0.00018002174832356673, + "loss": 2.9817, + "step": 3312 + }, + { + "epoch": 0.30015175194219834, + "grad_norm": 0.7269805073738098, + "learning_rate": 0.000180015707122576, + "loss": 2.7237, + "step": 3313 + }, + { + "epoch": 0.30024235011664513, + "grad_norm": 0.7812291979789734, + "learning_rate": 0.00018000966592158523, + "loss": 2.8841, + "step": 3314 + }, + { + "epoch": 0.3003329482910919, + "grad_norm": 0.7352524399757385, + "learning_rate": 0.00018000362472059446, + "loss": 2.7934, + "step": 3315 + }, + { + "epoch": 0.3004235464655387, + "grad_norm": 0.7407872080802917, + "learning_rate": 0.0001799975835196037, + "loss": 2.8381, + "step": 3316 + }, + { + "epoch": 0.3005141446399855, + "grad_norm": 0.7749360203742981, + "learning_rate": 0.00017999154231861296, + "loss": 2.9663, + "step": 3317 + }, + { + "epoch": 0.3006047428144323, + "grad_norm": 0.7013764381408691, + "learning_rate": 0.0001799855011176222, + "loss": 2.7865, + "step": 3318 + }, + { + "epoch": 0.30069534098887907, + "grad_norm": 0.735328197479248, + "learning_rate": 0.00017997945991663142, + "loss": 2.5593, + "step": 3319 + }, + { + "epoch": 0.30078593916332586, + "grad_norm": 0.7196971774101257, + "learning_rate": 0.00017997341871564068, + "loss": 2.9054, + "step": 3320 + }, + { + "epoch": 0.30087653733777264, + "grad_norm": 0.6896129846572876, + "learning_rate": 0.00017996737751464992, + "loss": 2.5969, + "step": 3321 + }, + { + "epoch": 0.30096713551221943, + "grad_norm": 0.7481616139411926, + "learning_rate": 0.00017996133631365918, + "loss": 2.7993, + "step": 3322 + }, + { + "epoch": 0.3010577336866662, + "grad_norm": 0.720322847366333, + "learning_rate": 0.0001799552951126684, + "loss": 2.676, + "step": 3323 + }, + { + "epoch": 0.301148331861113, + "grad_norm": 0.74559086561203, + "learning_rate": 0.00017994925391167765, + "loss": 2.7783, + "step": 3324 + }, + { + "epoch": 0.3012389300355598, + "grad_norm": 0.7109976410865784, + "learning_rate": 0.0001799432127106869, + "loss": 2.5985, + "step": 3325 + }, + { + "epoch": 0.3013295282100066, + "grad_norm": 0.6958041191101074, + "learning_rate": 0.00017993717150969614, + "loss": 2.6859, + "step": 3326 + }, + { + "epoch": 0.30142012638445337, + "grad_norm": 0.8000491857528687, + "learning_rate": 0.00017993113030870538, + "loss": 2.939, + "step": 3327 + }, + { + "epoch": 0.30151072455890016, + "grad_norm": 0.7465682029724121, + "learning_rate": 0.0001799250891077146, + "loss": 2.7803, + "step": 3328 + }, + { + "epoch": 0.30160132273334694, + "grad_norm": 0.7521325349807739, + "learning_rate": 0.00017991904790672387, + "loss": 2.965, + "step": 3329 + }, + { + "epoch": 0.30169192090779373, + "grad_norm": 0.7286850214004517, + "learning_rate": 0.0001799130067057331, + "loss": 2.8745, + "step": 3330 + }, + { + "epoch": 0.3017825190822405, + "grad_norm": 0.6535417437553406, + "learning_rate": 0.00017990696550474234, + "loss": 2.2637, + "step": 3331 + }, + { + "epoch": 0.30187311725668725, + "grad_norm": 0.7695086598396301, + "learning_rate": 0.00017990092430375157, + "loss": 3.1138, + "step": 3332 + }, + { + "epoch": 0.30196371543113404, + "grad_norm": 0.676736056804657, + "learning_rate": 0.00017989488310276084, + "loss": 2.003, + "step": 3333 + }, + { + "epoch": 0.3020543136055808, + "grad_norm": 0.7007461190223694, + "learning_rate": 0.0001798888419017701, + "loss": 2.6799, + "step": 3334 + }, + { + "epoch": 0.3021449117800276, + "grad_norm": 0.7254328727722168, + "learning_rate": 0.00017988280070077933, + "loss": 2.7679, + "step": 3335 + }, + { + "epoch": 0.3022355099544744, + "grad_norm": 0.7362014055252075, + "learning_rate": 0.00017987675949978856, + "loss": 2.7474, + "step": 3336 + }, + { + "epoch": 0.3023261081289212, + "grad_norm": 0.7474640011787415, + "learning_rate": 0.0001798707182987978, + "loss": 2.993, + "step": 3337 + }, + { + "epoch": 0.302416706303368, + "grad_norm": 0.7163699865341187, + "learning_rate": 0.00017986467709780706, + "loss": 2.8221, + "step": 3338 + }, + { + "epoch": 0.30250730447781476, + "grad_norm": 0.7632259130477905, + "learning_rate": 0.0001798586358968163, + "loss": 2.8868, + "step": 3339 + }, + { + "epoch": 0.30259790265226155, + "grad_norm": 0.7154580354690552, + "learning_rate": 0.00017985259469582553, + "loss": 2.7747, + "step": 3340 + }, + { + "epoch": 0.30268850082670834, + "grad_norm": 0.7023862600326538, + "learning_rate": 0.0001798465534948348, + "loss": 2.7889, + "step": 3341 + }, + { + "epoch": 0.3027790990011551, + "grad_norm": 0.7438956499099731, + "learning_rate": 0.00017984051229384402, + "loss": 3.0838, + "step": 3342 + }, + { + "epoch": 0.3028696971756019, + "grad_norm": 0.7532603144645691, + "learning_rate": 0.00017983447109285328, + "loss": 3.0759, + "step": 3343 + }, + { + "epoch": 0.3029602953500487, + "grad_norm": 0.7115556597709656, + "learning_rate": 0.0001798284298918625, + "loss": 2.9173, + "step": 3344 + }, + { + "epoch": 0.3030508935244955, + "grad_norm": 0.7517229318618774, + "learning_rate": 0.00017982238869087175, + "loss": 2.6848, + "step": 3345 + }, + { + "epoch": 0.3031414916989423, + "grad_norm": 0.7817388772964478, + "learning_rate": 0.000179816347489881, + "loss": 3.0053, + "step": 3346 + }, + { + "epoch": 0.30323208987338907, + "grad_norm": 0.7293429970741272, + "learning_rate": 0.00017981030628889025, + "loss": 2.6537, + "step": 3347 + }, + { + "epoch": 0.30332268804783585, + "grad_norm": 0.7689222693443298, + "learning_rate": 0.00017980426508789948, + "loss": 2.8322, + "step": 3348 + }, + { + "epoch": 0.30341328622228264, + "grad_norm": 0.7637858390808105, + "learning_rate": 0.00017979822388690872, + "loss": 2.942, + "step": 3349 + }, + { + "epoch": 0.30350388439672943, + "grad_norm": 0.7343910336494446, + "learning_rate": 0.00017979218268591798, + "loss": 2.6888, + "step": 3350 + }, + { + "epoch": 0.3035944825711762, + "grad_norm": 0.76028972864151, + "learning_rate": 0.0001797861414849272, + "loss": 3.1892, + "step": 3351 + }, + { + "epoch": 0.30368508074562295, + "grad_norm": 0.8701298832893372, + "learning_rate": 0.00017978010028393647, + "loss": 2.8663, + "step": 3352 + }, + { + "epoch": 0.30377567892006974, + "grad_norm": 0.8851218819618225, + "learning_rate": 0.00017977405908294568, + "loss": 2.7413, + "step": 3353 + }, + { + "epoch": 0.3038662770945165, + "grad_norm": 0.7183082103729248, + "learning_rate": 0.00017976801788195494, + "loss": 2.7309, + "step": 3354 + }, + { + "epoch": 0.3039568752689633, + "grad_norm": 0.7571102976799011, + "learning_rate": 0.0001797619766809642, + "loss": 3.1097, + "step": 3355 + }, + { + "epoch": 0.3040474734434101, + "grad_norm": 0.7186988592147827, + "learning_rate": 0.00017975593547997344, + "loss": 2.867, + "step": 3356 + }, + { + "epoch": 0.3041380716178569, + "grad_norm": 0.7517744302749634, + "learning_rate": 0.00017974989427898267, + "loss": 2.9457, + "step": 3357 + }, + { + "epoch": 0.3042286697923037, + "grad_norm": 0.7674211263656616, + "learning_rate": 0.0001797438530779919, + "loss": 2.7463, + "step": 3358 + }, + { + "epoch": 0.30431926796675046, + "grad_norm": 0.7773520946502686, + "learning_rate": 0.00017973781187700116, + "loss": 2.7669, + "step": 3359 + }, + { + "epoch": 0.30440986614119725, + "grad_norm": 0.7850247621536255, + "learning_rate": 0.0001797317706760104, + "loss": 2.8321, + "step": 3360 + }, + { + "epoch": 0.30450046431564404, + "grad_norm": 0.7244626879692078, + "learning_rate": 0.00017972572947501963, + "loss": 2.7182, + "step": 3361 + }, + { + "epoch": 0.3045910624900908, + "grad_norm": 0.7752231359481812, + "learning_rate": 0.00017971968827402887, + "loss": 2.9067, + "step": 3362 + }, + { + "epoch": 0.3046816606645376, + "grad_norm": 0.7108297944068909, + "learning_rate": 0.00017971364707303813, + "loss": 2.7545, + "step": 3363 + }, + { + "epoch": 0.3047722588389844, + "grad_norm": 0.7004293203353882, + "learning_rate": 0.0001797076058720474, + "loss": 3.1062, + "step": 3364 + }, + { + "epoch": 0.3048628570134312, + "grad_norm": 0.7094611525535583, + "learning_rate": 0.00017970156467105662, + "loss": 2.5678, + "step": 3365 + }, + { + "epoch": 0.304953455187878, + "grad_norm": 0.7444274425506592, + "learning_rate": 0.00017969552347006586, + "loss": 2.7926, + "step": 3366 + }, + { + "epoch": 0.30504405336232476, + "grad_norm": 0.8160226941108704, + "learning_rate": 0.0001796894822690751, + "loss": 2.8968, + "step": 3367 + }, + { + "epoch": 0.30513465153677155, + "grad_norm": 0.7099579572677612, + "learning_rate": 0.00017968344106808435, + "loss": 2.8967, + "step": 3368 + }, + { + "epoch": 0.30522524971121834, + "grad_norm": 0.7193402647972107, + "learning_rate": 0.0001796773998670936, + "loss": 2.76, + "step": 3369 + }, + { + "epoch": 0.3053158478856651, + "grad_norm": 0.7024473547935486, + "learning_rate": 0.00017967135866610282, + "loss": 2.7504, + "step": 3370 + }, + { + "epoch": 0.3054064460601119, + "grad_norm": 0.7865564823150635, + "learning_rate": 0.00017966531746511208, + "loss": 2.9479, + "step": 3371 + }, + { + "epoch": 0.3054970442345587, + "grad_norm": 0.7100896835327148, + "learning_rate": 0.00017965927626412132, + "loss": 2.2588, + "step": 3372 + }, + { + "epoch": 0.30558764240900543, + "grad_norm": 0.7324273586273193, + "learning_rate": 0.00017965323506313058, + "loss": 2.7867, + "step": 3373 + }, + { + "epoch": 0.3056782405834522, + "grad_norm": 0.8647968173027039, + "learning_rate": 0.00017964719386213978, + "loss": 3.0352, + "step": 3374 + }, + { + "epoch": 0.305768838757899, + "grad_norm": 0.7714458107948303, + "learning_rate": 0.00017964115266114905, + "loss": 2.7553, + "step": 3375 + }, + { + "epoch": 0.3058594369323458, + "grad_norm": 0.7690159678459167, + "learning_rate": 0.00017963511146015828, + "loss": 2.7015, + "step": 3376 + }, + { + "epoch": 0.3059500351067926, + "grad_norm": 0.7977215051651001, + "learning_rate": 0.00017962907025916754, + "loss": 2.923, + "step": 3377 + }, + { + "epoch": 0.30604063328123937, + "grad_norm": 0.7763900756835938, + "learning_rate": 0.00017962302905817677, + "loss": 2.7584, + "step": 3378 + }, + { + "epoch": 0.30613123145568616, + "grad_norm": 0.7319939136505127, + "learning_rate": 0.000179616987857186, + "loss": 2.7807, + "step": 3379 + }, + { + "epoch": 0.30622182963013295, + "grad_norm": 0.7721364498138428, + "learning_rate": 0.00017961094665619527, + "loss": 2.7398, + "step": 3380 + }, + { + "epoch": 0.30631242780457973, + "grad_norm": 0.7231951951980591, + "learning_rate": 0.0001796049054552045, + "loss": 2.739, + "step": 3381 + }, + { + "epoch": 0.3064030259790265, + "grad_norm": 0.7427008152008057, + "learning_rate": 0.00017959886425421374, + "loss": 2.8129, + "step": 3382 + }, + { + "epoch": 0.3064936241534733, + "grad_norm": 0.7509527206420898, + "learning_rate": 0.00017959282305322297, + "loss": 2.6479, + "step": 3383 + }, + { + "epoch": 0.3065842223279201, + "grad_norm": 0.6544380784034729, + "learning_rate": 0.00017958678185223223, + "loss": 2.5473, + "step": 3384 + }, + { + "epoch": 0.3066748205023669, + "grad_norm": 0.7875467538833618, + "learning_rate": 0.0001795807406512415, + "loss": 3.0726, + "step": 3385 + }, + { + "epoch": 0.3067654186768137, + "grad_norm": 0.7460776567459106, + "learning_rate": 0.00017957469945025073, + "loss": 2.9399, + "step": 3386 + }, + { + "epoch": 0.30685601685126046, + "grad_norm": 0.7136831879615784, + "learning_rate": 0.00017956865824925996, + "loss": 2.6611, + "step": 3387 + }, + { + "epoch": 0.30694661502570725, + "grad_norm": 0.7900742292404175, + "learning_rate": 0.0001795626170482692, + "loss": 3.1374, + "step": 3388 + }, + { + "epoch": 0.30703721320015404, + "grad_norm": 0.7650036811828613, + "learning_rate": 0.00017955657584727846, + "loss": 2.9326, + "step": 3389 + }, + { + "epoch": 0.3071278113746008, + "grad_norm": 0.7189603447914124, + "learning_rate": 0.0001795505346462877, + "loss": 2.8166, + "step": 3390 + }, + { + "epoch": 0.3072184095490476, + "grad_norm": 0.7476832270622253, + "learning_rate": 0.00017954449344529693, + "loss": 2.9109, + "step": 3391 + }, + { + "epoch": 0.3073090077234944, + "grad_norm": 0.7492210268974304, + "learning_rate": 0.00017953845224430616, + "loss": 2.7921, + "step": 3392 + }, + { + "epoch": 0.30739960589794113, + "grad_norm": 0.6744470000267029, + "learning_rate": 0.00017953241104331542, + "loss": 2.1016, + "step": 3393 + }, + { + "epoch": 0.3074902040723879, + "grad_norm": 0.7094199657440186, + "learning_rate": 0.00017952636984232468, + "loss": 2.6779, + "step": 3394 + }, + { + "epoch": 0.3075808022468347, + "grad_norm": 0.7524405717849731, + "learning_rate": 0.0001795203286413339, + "loss": 2.7024, + "step": 3395 + }, + { + "epoch": 0.3076714004212815, + "grad_norm": 0.6925508379936218, + "learning_rate": 0.00017951428744034315, + "loss": 2.7105, + "step": 3396 + }, + { + "epoch": 0.3077619985957283, + "grad_norm": 0.7909723520278931, + "learning_rate": 0.00017950824623935238, + "loss": 2.6988, + "step": 3397 + }, + { + "epoch": 0.30785259677017507, + "grad_norm": 0.6820451617240906, + "learning_rate": 0.00017950220503836165, + "loss": 2.7604, + "step": 3398 + }, + { + "epoch": 0.30794319494462186, + "grad_norm": 0.6963186860084534, + "learning_rate": 0.00017949616383737088, + "loss": 2.8162, + "step": 3399 + }, + { + "epoch": 0.30803379311906864, + "grad_norm": 0.7100507020950317, + "learning_rate": 0.0001794901226363801, + "loss": 2.7065, + "step": 3400 + }, + { + "epoch": 0.30812439129351543, + "grad_norm": 0.7443572878837585, + "learning_rate": 0.00017948408143538937, + "loss": 2.9879, + "step": 3401 + }, + { + "epoch": 0.3082149894679622, + "grad_norm": 0.6996206045150757, + "learning_rate": 0.0001794780402343986, + "loss": 2.7472, + "step": 3402 + }, + { + "epoch": 0.308305587642409, + "grad_norm": 0.6987810134887695, + "learning_rate": 0.00017947199903340784, + "loss": 2.9622, + "step": 3403 + }, + { + "epoch": 0.3083961858168558, + "grad_norm": 0.7136873602867126, + "learning_rate": 0.00017946595783241708, + "loss": 2.7604, + "step": 3404 + }, + { + "epoch": 0.3084867839913026, + "grad_norm": 0.8003178834915161, + "learning_rate": 0.00017945991663142634, + "loss": 2.6715, + "step": 3405 + }, + { + "epoch": 0.30857738216574937, + "grad_norm": 0.7441039085388184, + "learning_rate": 0.00017945387543043557, + "loss": 2.7125, + "step": 3406 + }, + { + "epoch": 0.30866798034019616, + "grad_norm": 0.7363635897636414, + "learning_rate": 0.00017944783422944483, + "loss": 2.7956, + "step": 3407 + }, + { + "epoch": 0.30875857851464295, + "grad_norm": 0.7269060015678406, + "learning_rate": 0.00017944179302845407, + "loss": 2.8091, + "step": 3408 + }, + { + "epoch": 0.30884917668908973, + "grad_norm": 0.7077039480209351, + "learning_rate": 0.0001794357518274633, + "loss": 2.8174, + "step": 3409 + }, + { + "epoch": 0.3089397748635365, + "grad_norm": 0.7154725193977356, + "learning_rate": 0.00017942971062647256, + "loss": 2.8591, + "step": 3410 + }, + { + "epoch": 0.3090303730379833, + "grad_norm": 0.7634459733963013, + "learning_rate": 0.0001794236694254818, + "loss": 2.7281, + "step": 3411 + }, + { + "epoch": 0.3091209712124301, + "grad_norm": 0.7332730889320374, + "learning_rate": 0.00017941762822449103, + "loss": 2.6403, + "step": 3412 + }, + { + "epoch": 0.30921156938687683, + "grad_norm": 0.5934591293334961, + "learning_rate": 0.00017941158702350026, + "loss": 1.3884, + "step": 3413 + }, + { + "epoch": 0.3093021675613236, + "grad_norm": 0.7463510632514954, + "learning_rate": 0.00017940554582250953, + "loss": 2.9042, + "step": 3414 + }, + { + "epoch": 0.3093927657357704, + "grad_norm": 0.7565213441848755, + "learning_rate": 0.0001793995046215188, + "loss": 2.8534, + "step": 3415 + }, + { + "epoch": 0.3094833639102172, + "grad_norm": 0.7910574674606323, + "learning_rate": 0.000179393463420528, + "loss": 2.9494, + "step": 3416 + }, + { + "epoch": 0.309573962084664, + "grad_norm": 0.7874143719673157, + "learning_rate": 0.00017938742221953726, + "loss": 2.8399, + "step": 3417 + }, + { + "epoch": 0.30966456025911077, + "grad_norm": 0.6351077556610107, + "learning_rate": 0.0001793813810185465, + "loss": 1.7918, + "step": 3418 + }, + { + "epoch": 0.30975515843355755, + "grad_norm": 0.7308236956596375, + "learning_rate": 0.00017937533981755575, + "loss": 2.8516, + "step": 3419 + }, + { + "epoch": 0.30984575660800434, + "grad_norm": 0.7390824556350708, + "learning_rate": 0.00017936929861656498, + "loss": 2.9793, + "step": 3420 + }, + { + "epoch": 0.30993635478245113, + "grad_norm": 0.7858826518058777, + "learning_rate": 0.00017936325741557422, + "loss": 2.9271, + "step": 3421 + }, + { + "epoch": 0.3100269529568979, + "grad_norm": 0.7590442895889282, + "learning_rate": 0.00017935721621458345, + "loss": 2.2565, + "step": 3422 + }, + { + "epoch": 0.3101175511313447, + "grad_norm": 0.7369298338890076, + "learning_rate": 0.00017935117501359271, + "loss": 2.6622, + "step": 3423 + }, + { + "epoch": 0.3102081493057915, + "grad_norm": 0.7330176830291748, + "learning_rate": 0.00017934513381260197, + "loss": 2.7768, + "step": 3424 + }, + { + "epoch": 0.3102987474802383, + "grad_norm": 0.7867554426193237, + "learning_rate": 0.00017933909261161118, + "loss": 2.8649, + "step": 3425 + }, + { + "epoch": 0.31038934565468507, + "grad_norm": 0.7263171076774597, + "learning_rate": 0.00017933305141062044, + "loss": 2.7469, + "step": 3426 + }, + { + "epoch": 0.31047994382913185, + "grad_norm": 0.7899981141090393, + "learning_rate": 0.00017932701020962968, + "loss": 3.1114, + "step": 3427 + }, + { + "epoch": 0.31057054200357864, + "grad_norm": 0.7315657734870911, + "learning_rate": 0.00017932096900863894, + "loss": 2.9814, + "step": 3428 + }, + { + "epoch": 0.31066114017802543, + "grad_norm": 0.6238228678703308, + "learning_rate": 0.00017931492780764817, + "loss": 2.0896, + "step": 3429 + }, + { + "epoch": 0.3107517383524722, + "grad_norm": 0.774681806564331, + "learning_rate": 0.0001793088866066574, + "loss": 3.0864, + "step": 3430 + }, + { + "epoch": 0.310842336526919, + "grad_norm": 0.812057614326477, + "learning_rate": 0.00017930284540566667, + "loss": 2.8095, + "step": 3431 + }, + { + "epoch": 0.3109329347013658, + "grad_norm": 0.5024678707122803, + "learning_rate": 0.0001792968042046759, + "loss": 1.292, + "step": 3432 + }, + { + "epoch": 0.3110235328758125, + "grad_norm": 0.6189866065979004, + "learning_rate": 0.00017929076300368514, + "loss": 1.972, + "step": 3433 + }, + { + "epoch": 0.3111141310502593, + "grad_norm": 0.7656733393669128, + "learning_rate": 0.00017928472180269437, + "loss": 2.8995, + "step": 3434 + }, + { + "epoch": 0.3112047292247061, + "grad_norm": 0.7404372096061707, + "learning_rate": 0.00017927868060170363, + "loss": 2.6171, + "step": 3435 + }, + { + "epoch": 0.3112953273991529, + "grad_norm": 0.7495457530021667, + "learning_rate": 0.00017927263940071286, + "loss": 2.73, + "step": 3436 + }, + { + "epoch": 0.3113859255735997, + "grad_norm": 0.7313725352287292, + "learning_rate": 0.00017926659819972213, + "loss": 2.7357, + "step": 3437 + }, + { + "epoch": 0.31147652374804646, + "grad_norm": 0.6523899435997009, + "learning_rate": 0.00017926055699873136, + "loss": 2.2284, + "step": 3438 + }, + { + "epoch": 0.31156712192249325, + "grad_norm": 0.6748141050338745, + "learning_rate": 0.0001792545157977406, + "loss": 2.2017, + "step": 3439 + }, + { + "epoch": 0.31165772009694004, + "grad_norm": 0.7274019718170166, + "learning_rate": 0.00017924847459674986, + "loss": 2.6666, + "step": 3440 + }, + { + "epoch": 0.3117483182713868, + "grad_norm": 0.7361884117126465, + "learning_rate": 0.0001792424333957591, + "loss": 3.0309, + "step": 3441 + }, + { + "epoch": 0.3118389164458336, + "grad_norm": 0.7311177253723145, + "learning_rate": 0.00017923639219476832, + "loss": 2.9336, + "step": 3442 + }, + { + "epoch": 0.3119295146202804, + "grad_norm": 0.7474302649497986, + "learning_rate": 0.00017923035099377756, + "loss": 2.986, + "step": 3443 + }, + { + "epoch": 0.3120201127947272, + "grad_norm": 0.7181252241134644, + "learning_rate": 0.00017922430979278682, + "loss": 2.7881, + "step": 3444 + }, + { + "epoch": 0.312110710969174, + "grad_norm": 0.6972544193267822, + "learning_rate": 0.00017921826859179608, + "loss": 2.6557, + "step": 3445 + }, + { + "epoch": 0.31220130914362076, + "grad_norm": 0.8285616040229797, + "learning_rate": 0.0001792122273908053, + "loss": 2.6728, + "step": 3446 + }, + { + "epoch": 0.31229190731806755, + "grad_norm": 0.7641589641571045, + "learning_rate": 0.00017920618618981455, + "loss": 2.9952, + "step": 3447 + }, + { + "epoch": 0.31238250549251434, + "grad_norm": 0.6704499125480652, + "learning_rate": 0.00017920014498882378, + "loss": 2.6753, + "step": 3448 + }, + { + "epoch": 0.3124731036669611, + "grad_norm": 0.780992865562439, + "learning_rate": 0.00017919410378783304, + "loss": 2.8561, + "step": 3449 + }, + { + "epoch": 0.3125637018414079, + "grad_norm": 0.7111928462982178, + "learning_rate": 0.00017918806258684228, + "loss": 2.7751, + "step": 3450 + }, + { + "epoch": 0.3126543000158547, + "grad_norm": 0.7291894555091858, + "learning_rate": 0.0001791820213858515, + "loss": 2.9108, + "step": 3451 + }, + { + "epoch": 0.3127448981903015, + "grad_norm": 0.8701716065406799, + "learning_rate": 0.00017917598018486077, + "loss": 2.8446, + "step": 3452 + }, + { + "epoch": 0.3128354963647483, + "grad_norm": 0.7429229021072388, + "learning_rate": 0.00017916993898387, + "loss": 2.7499, + "step": 3453 + }, + { + "epoch": 0.312926094539195, + "grad_norm": 0.704437255859375, + "learning_rate": 0.00017916389778287924, + "loss": 2.7084, + "step": 3454 + }, + { + "epoch": 0.3130166927136418, + "grad_norm": 0.7548723220825195, + "learning_rate": 0.00017915785658188847, + "loss": 2.8871, + "step": 3455 + }, + { + "epoch": 0.3131072908880886, + "grad_norm": 0.6922876238822937, + "learning_rate": 0.00017915181538089774, + "loss": 2.9359, + "step": 3456 + }, + { + "epoch": 0.3131978890625354, + "grad_norm": 0.6944345235824585, + "learning_rate": 0.00017914577417990697, + "loss": 2.4442, + "step": 3457 + }, + { + "epoch": 0.31328848723698216, + "grad_norm": 0.7691295146942139, + "learning_rate": 0.00017913973297891623, + "loss": 2.9301, + "step": 3458 + }, + { + "epoch": 0.31337908541142895, + "grad_norm": 0.6912885904312134, + "learning_rate": 0.00017913369177792544, + "loss": 2.2935, + "step": 3459 + }, + { + "epoch": 0.31346968358587574, + "grad_norm": 0.7925861477851868, + "learning_rate": 0.0001791276505769347, + "loss": 3.0374, + "step": 3460 + }, + { + "epoch": 0.3135602817603225, + "grad_norm": 0.6674501299858093, + "learning_rate": 0.00017912160937594396, + "loss": 2.3124, + "step": 3461 + }, + { + "epoch": 0.3136508799347693, + "grad_norm": 0.773838222026825, + "learning_rate": 0.0001791155681749532, + "loss": 2.8385, + "step": 3462 + }, + { + "epoch": 0.3137414781092161, + "grad_norm": 0.8120371103286743, + "learning_rate": 0.00017910952697396243, + "loss": 2.7108, + "step": 3463 + }, + { + "epoch": 0.3138320762836629, + "grad_norm": 0.7528212070465088, + "learning_rate": 0.00017910348577297166, + "loss": 2.8473, + "step": 3464 + }, + { + "epoch": 0.3139226744581097, + "grad_norm": 0.7396228909492493, + "learning_rate": 0.00017909744457198092, + "loss": 2.8968, + "step": 3465 + }, + { + "epoch": 0.31401327263255646, + "grad_norm": 0.6944429278373718, + "learning_rate": 0.00017909140337099016, + "loss": 2.6998, + "step": 3466 + }, + { + "epoch": 0.31410387080700325, + "grad_norm": 0.772556483745575, + "learning_rate": 0.0001790853621699994, + "loss": 2.7755, + "step": 3467 + }, + { + "epoch": 0.31419446898145004, + "grad_norm": 0.7866241335868835, + "learning_rate": 0.00017907932096900865, + "loss": 3.1186, + "step": 3468 + }, + { + "epoch": 0.3142850671558968, + "grad_norm": 0.754503607749939, + "learning_rate": 0.0001790732797680179, + "loss": 2.7217, + "step": 3469 + }, + { + "epoch": 0.3143756653303436, + "grad_norm": 0.7941619753837585, + "learning_rate": 0.00017906723856702715, + "loss": 2.6835, + "step": 3470 + }, + { + "epoch": 0.3144662635047904, + "grad_norm": 0.7705021500587463, + "learning_rate": 0.00017906119736603638, + "loss": 2.9204, + "step": 3471 + }, + { + "epoch": 0.3145568616792372, + "grad_norm": 0.7765436768531799, + "learning_rate": 0.00017905515616504562, + "loss": 2.8233, + "step": 3472 + }, + { + "epoch": 0.314647459853684, + "grad_norm": 0.7954347729682922, + "learning_rate": 0.00017904911496405485, + "loss": 2.9434, + "step": 3473 + }, + { + "epoch": 0.3147380580281307, + "grad_norm": 0.7572998404502869, + "learning_rate": 0.0001790430737630641, + "loss": 2.9255, + "step": 3474 + }, + { + "epoch": 0.3148286562025775, + "grad_norm": 0.7631779313087463, + "learning_rate": 0.00017903703256207337, + "loss": 2.7502, + "step": 3475 + }, + { + "epoch": 0.3149192543770243, + "grad_norm": 0.7358195185661316, + "learning_rate": 0.00017903099136108258, + "loss": 2.8825, + "step": 3476 + }, + { + "epoch": 0.31500985255147107, + "grad_norm": 0.7202133536338806, + "learning_rate": 0.00017902495016009184, + "loss": 2.7106, + "step": 3477 + }, + { + "epoch": 0.31510045072591786, + "grad_norm": 0.7551692128181458, + "learning_rate": 0.00017901890895910107, + "loss": 2.8462, + "step": 3478 + }, + { + "epoch": 0.31519104890036465, + "grad_norm": 0.7614291906356812, + "learning_rate": 0.00017901286775811034, + "loss": 2.9188, + "step": 3479 + }, + { + "epoch": 0.31528164707481143, + "grad_norm": 0.7289559245109558, + "learning_rate": 0.00017900682655711954, + "loss": 2.6994, + "step": 3480 + }, + { + "epoch": 0.3153722452492582, + "grad_norm": 0.8224796056747437, + "learning_rate": 0.0001790007853561288, + "loss": 2.9691, + "step": 3481 + }, + { + "epoch": 0.315462843423705, + "grad_norm": 0.7485681772232056, + "learning_rate": 0.00017899474415513806, + "loss": 2.8233, + "step": 3482 + }, + { + "epoch": 0.3155534415981518, + "grad_norm": 0.6199415922164917, + "learning_rate": 0.0001789887029541473, + "loss": 1.9651, + "step": 3483 + }, + { + "epoch": 0.3156440397725986, + "grad_norm": 0.7131426930427551, + "learning_rate": 0.00017898266175315653, + "loss": 2.755, + "step": 3484 + }, + { + "epoch": 0.31573463794704537, + "grad_norm": 0.7203279137611389, + "learning_rate": 0.00017897662055216577, + "loss": 1.934, + "step": 3485 + }, + { + "epoch": 0.31582523612149216, + "grad_norm": 0.6063336730003357, + "learning_rate": 0.00017897057935117503, + "loss": 2.2061, + "step": 3486 + }, + { + "epoch": 0.31591583429593895, + "grad_norm": 0.841606080532074, + "learning_rate": 0.00017896453815018426, + "loss": 2.7739, + "step": 3487 + }, + { + "epoch": 0.31600643247038573, + "grad_norm": 0.7554329037666321, + "learning_rate": 0.00017895849694919352, + "loss": 2.873, + "step": 3488 + }, + { + "epoch": 0.3160970306448325, + "grad_norm": 0.728856086730957, + "learning_rate": 0.00017895245574820273, + "loss": 3.0836, + "step": 3489 + }, + { + "epoch": 0.3161876288192793, + "grad_norm": 0.7838718295097351, + "learning_rate": 0.000178946414547212, + "loss": 2.7747, + "step": 3490 + }, + { + "epoch": 0.3162782269937261, + "grad_norm": 0.7115355730056763, + "learning_rate": 0.00017894037334622125, + "loss": 2.8197, + "step": 3491 + }, + { + "epoch": 0.3163688251681729, + "grad_norm": 0.7182338833808899, + "learning_rate": 0.0001789343321452305, + "loss": 2.7149, + "step": 3492 + }, + { + "epoch": 0.3164594233426197, + "grad_norm": 0.8152385354042053, + "learning_rate": 0.00017892829094423972, + "loss": 3.2746, + "step": 3493 + }, + { + "epoch": 0.3165500215170664, + "grad_norm": 0.7216611504554749, + "learning_rate": 0.00017892224974324895, + "loss": 3.1036, + "step": 3494 + }, + { + "epoch": 0.3166406196915132, + "grad_norm": 0.7679011225700378, + "learning_rate": 0.00017891620854225822, + "loss": 2.8053, + "step": 3495 + }, + { + "epoch": 0.31673121786596, + "grad_norm": 0.7257910966873169, + "learning_rate": 0.00017891016734126745, + "loss": 2.8599, + "step": 3496 + }, + { + "epoch": 0.31682181604040677, + "grad_norm": 0.7364019751548767, + "learning_rate": 0.00017890412614027668, + "loss": 2.7889, + "step": 3497 + }, + { + "epoch": 0.31691241421485356, + "grad_norm": 0.7230485677719116, + "learning_rate": 0.00017889808493928595, + "loss": 2.5413, + "step": 3498 + }, + { + "epoch": 0.31700301238930034, + "grad_norm": 0.7544516324996948, + "learning_rate": 0.00017889204373829518, + "loss": 2.8278, + "step": 3499 + }, + { + "epoch": 0.31709361056374713, + "grad_norm": 0.7799798846244812, + "learning_rate": 0.00017888600253730444, + "loss": 2.7022, + "step": 3500 + }, + { + "epoch": 0.3171842087381939, + "grad_norm": 0.7300951480865479, + "learning_rate": 0.00017887996133631367, + "loss": 3.1025, + "step": 3501 + }, + { + "epoch": 0.3172748069126407, + "grad_norm": 0.7111852169036865, + "learning_rate": 0.0001788739201353229, + "loss": 2.7266, + "step": 3502 + }, + { + "epoch": 0.3173654050870875, + "grad_norm": 0.7561782002449036, + "learning_rate": 0.00017886787893433214, + "loss": 2.8914, + "step": 3503 + }, + { + "epoch": 0.3174560032615343, + "grad_norm": 0.741917610168457, + "learning_rate": 0.0001788618377333414, + "loss": 2.9507, + "step": 3504 + }, + { + "epoch": 0.31754660143598107, + "grad_norm": 0.7589879035949707, + "learning_rate": 0.00017885579653235064, + "loss": 3.0232, + "step": 3505 + }, + { + "epoch": 0.31763719961042786, + "grad_norm": 0.7391201853752136, + "learning_rate": 0.00017884975533135987, + "loss": 3.0708, + "step": 3506 + }, + { + "epoch": 0.31772779778487464, + "grad_norm": 0.728008508682251, + "learning_rate": 0.00017884371413036913, + "loss": 2.6485, + "step": 3507 + }, + { + "epoch": 0.31781839595932143, + "grad_norm": 1.0357955694198608, + "learning_rate": 0.00017883767292937837, + "loss": 2.8578, + "step": 3508 + }, + { + "epoch": 0.3179089941337682, + "grad_norm": 0.7603660821914673, + "learning_rate": 0.00017883163172838763, + "loss": 2.9111, + "step": 3509 + }, + { + "epoch": 0.317999592308215, + "grad_norm": 0.713543176651001, + "learning_rate": 0.00017882559052739684, + "loss": 2.0067, + "step": 3510 + }, + { + "epoch": 0.3180901904826618, + "grad_norm": 0.6768275499343872, + "learning_rate": 0.0001788195493264061, + "loss": 2.3966, + "step": 3511 + }, + { + "epoch": 0.3181807886571086, + "grad_norm": 0.7395949363708496, + "learning_rate": 0.00017881350812541536, + "loss": 2.7515, + "step": 3512 + }, + { + "epoch": 0.31827138683155537, + "grad_norm": 0.7516025900840759, + "learning_rate": 0.0001788074669244246, + "loss": 3.088, + "step": 3513 + }, + { + "epoch": 0.3183619850060021, + "grad_norm": 0.7868137359619141, + "learning_rate": 0.00017880142572343383, + "loss": 2.9896, + "step": 3514 + }, + { + "epoch": 0.3184525831804489, + "grad_norm": 0.7469034790992737, + "learning_rate": 0.00017879538452244306, + "loss": 3.0458, + "step": 3515 + }, + { + "epoch": 0.3185431813548957, + "grad_norm": 0.7399596571922302, + "learning_rate": 0.00017878934332145232, + "loss": 2.8131, + "step": 3516 + }, + { + "epoch": 0.31863377952934246, + "grad_norm": 0.8548616766929626, + "learning_rate": 0.00017878330212046155, + "loss": 2.7003, + "step": 3517 + }, + { + "epoch": 0.31872437770378925, + "grad_norm": 0.800777018070221, + "learning_rate": 0.0001787772609194708, + "loss": 2.6607, + "step": 3518 + }, + { + "epoch": 0.31881497587823604, + "grad_norm": 0.7310529947280884, + "learning_rate": 0.00017877121971848002, + "loss": 2.5766, + "step": 3519 + }, + { + "epoch": 0.3189055740526828, + "grad_norm": 0.7309787273406982, + "learning_rate": 0.00017876517851748928, + "loss": 2.9037, + "step": 3520 + }, + { + "epoch": 0.3189961722271296, + "grad_norm": 0.7174170017242432, + "learning_rate": 0.00017875913731649855, + "loss": 2.494, + "step": 3521 + }, + { + "epoch": 0.3190867704015764, + "grad_norm": 0.7499672770500183, + "learning_rate": 0.00017875309611550778, + "loss": 2.6749, + "step": 3522 + }, + { + "epoch": 0.3191773685760232, + "grad_norm": 0.7501062154769897, + "learning_rate": 0.000178747054914517, + "loss": 2.8057, + "step": 3523 + }, + { + "epoch": 0.31926796675047, + "grad_norm": 0.7426259517669678, + "learning_rate": 0.00017874101371352625, + "loss": 2.8592, + "step": 3524 + }, + { + "epoch": 0.31935856492491677, + "grad_norm": 0.8345943093299866, + "learning_rate": 0.0001787349725125355, + "loss": 3.1133, + "step": 3525 + }, + { + "epoch": 0.31944916309936355, + "grad_norm": 0.6788738369941711, + "learning_rate": 0.00017872893131154474, + "loss": 2.6325, + "step": 3526 + }, + { + "epoch": 0.31953976127381034, + "grad_norm": 0.7818332314491272, + "learning_rate": 0.00017872289011055398, + "loss": 3.0105, + "step": 3527 + }, + { + "epoch": 0.31963035944825713, + "grad_norm": 0.7509610056877136, + "learning_rate": 0.00017871684890956324, + "loss": 2.7677, + "step": 3528 + }, + { + "epoch": 0.3197209576227039, + "grad_norm": 0.7766594290733337, + "learning_rate": 0.00017871080770857247, + "loss": 3.1775, + "step": 3529 + }, + { + "epoch": 0.3198115557971507, + "grad_norm": 0.7702529430389404, + "learning_rate": 0.00017870476650758173, + "loss": 2.7331, + "step": 3530 + }, + { + "epoch": 0.3199021539715975, + "grad_norm": 0.7515185475349426, + "learning_rate": 0.00017869872530659094, + "loss": 2.8599, + "step": 3531 + }, + { + "epoch": 0.3199927521460443, + "grad_norm": 0.7066697478294373, + "learning_rate": 0.0001786926841056002, + "loss": 2.897, + "step": 3532 + }, + { + "epoch": 0.32008335032049107, + "grad_norm": 0.727959394454956, + "learning_rate": 0.00017868664290460944, + "loss": 2.8869, + "step": 3533 + }, + { + "epoch": 0.32017394849493785, + "grad_norm": 0.7934474349021912, + "learning_rate": 0.0001786806017036187, + "loss": 2.816, + "step": 3534 + }, + { + "epoch": 0.3202645466693846, + "grad_norm": 0.7659173607826233, + "learning_rate": 0.00017867456050262793, + "loss": 2.9709, + "step": 3535 + }, + { + "epoch": 0.3203551448438314, + "grad_norm": 0.6798691153526306, + "learning_rate": 0.00017866851930163716, + "loss": 2.3398, + "step": 3536 + }, + { + "epoch": 0.32044574301827816, + "grad_norm": 0.7216501235961914, + "learning_rate": 0.00017866247810064643, + "loss": 2.6448, + "step": 3537 + }, + { + "epoch": 0.32053634119272495, + "grad_norm": 0.718620777130127, + "learning_rate": 0.00017865643689965566, + "loss": 2.9068, + "step": 3538 + }, + { + "epoch": 0.32062693936717174, + "grad_norm": 0.709141731262207, + "learning_rate": 0.00017865039569866492, + "loss": 3.0922, + "step": 3539 + }, + { + "epoch": 0.3207175375416185, + "grad_norm": 0.7789674401283264, + "learning_rate": 0.00017864435449767413, + "loss": 2.6619, + "step": 3540 + }, + { + "epoch": 0.3208081357160653, + "grad_norm": 0.7330108880996704, + "learning_rate": 0.0001786383132966834, + "loss": 2.8789, + "step": 3541 + }, + { + "epoch": 0.3208987338905121, + "grad_norm": 0.7923840284347534, + "learning_rate": 0.00017863227209569265, + "loss": 3.0394, + "step": 3542 + }, + { + "epoch": 0.3209893320649589, + "grad_norm": 0.7219684720039368, + "learning_rate": 0.00017862623089470188, + "loss": 2.7573, + "step": 3543 + }, + { + "epoch": 0.3210799302394057, + "grad_norm": 0.7328842878341675, + "learning_rate": 0.00017862018969371112, + "loss": 2.8781, + "step": 3544 + }, + { + "epoch": 0.32117052841385246, + "grad_norm": 0.7416486144065857, + "learning_rate": 0.00017861414849272035, + "loss": 2.8151, + "step": 3545 + }, + { + "epoch": 0.32126112658829925, + "grad_norm": 0.7624920010566711, + "learning_rate": 0.0001786081072917296, + "loss": 2.8389, + "step": 3546 + }, + { + "epoch": 0.32135172476274604, + "grad_norm": 0.719885528087616, + "learning_rate": 0.00017860206609073885, + "loss": 3.2739, + "step": 3547 + }, + { + "epoch": 0.3214423229371928, + "grad_norm": 0.7703696489334106, + "learning_rate": 0.00017859602488974808, + "loss": 2.8414, + "step": 3548 + }, + { + "epoch": 0.3215329211116396, + "grad_norm": 0.6952931880950928, + "learning_rate": 0.00017858998368875732, + "loss": 2.6759, + "step": 3549 + }, + { + "epoch": 0.3216235192860864, + "grad_norm": 0.7754200100898743, + "learning_rate": 0.00017858394248776658, + "loss": 3.0096, + "step": 3550 + }, + { + "epoch": 0.3217141174605332, + "grad_norm": 0.7698482275009155, + "learning_rate": 0.00017857790128677584, + "loss": 2.9063, + "step": 3551 + }, + { + "epoch": 0.32180471563498, + "grad_norm": 0.7285077571868896, + "learning_rate": 0.00017857186008578507, + "loss": 3.0881, + "step": 3552 + }, + { + "epoch": 0.32189531380942676, + "grad_norm": 0.7826460599899292, + "learning_rate": 0.0001785658188847943, + "loss": 2.9469, + "step": 3553 + }, + { + "epoch": 0.32198591198387355, + "grad_norm": 0.7828326225280762, + "learning_rate": 0.00017855977768380354, + "loss": 3.0101, + "step": 3554 + }, + { + "epoch": 0.3220765101583203, + "grad_norm": 0.768192708492279, + "learning_rate": 0.0001785537364828128, + "loss": 2.9138, + "step": 3555 + }, + { + "epoch": 0.32216710833276707, + "grad_norm": 0.7340543866157532, + "learning_rate": 0.00017854769528182204, + "loss": 2.7413, + "step": 3556 + }, + { + "epoch": 0.32225770650721386, + "grad_norm": 0.7190107107162476, + "learning_rate": 0.00017854165408083127, + "loss": 2.6899, + "step": 3557 + }, + { + "epoch": 0.32234830468166065, + "grad_norm": 0.7472839951515198, + "learning_rate": 0.00017853561287984053, + "loss": 2.8356, + "step": 3558 + }, + { + "epoch": 0.32243890285610743, + "grad_norm": 0.80522620677948, + "learning_rate": 0.00017852957167884976, + "loss": 2.7818, + "step": 3559 + }, + { + "epoch": 0.3225295010305542, + "grad_norm": 0.7436122894287109, + "learning_rate": 0.00017852353047785903, + "loss": 3.0724, + "step": 3560 + }, + { + "epoch": 0.322620099205001, + "grad_norm": 0.7151049971580505, + "learning_rate": 0.00017851748927686823, + "loss": 3.0741, + "step": 3561 + }, + { + "epoch": 0.3227106973794478, + "grad_norm": 0.7195466160774231, + "learning_rate": 0.0001785114480758775, + "loss": 2.7586, + "step": 3562 + }, + { + "epoch": 0.3228012955538946, + "grad_norm": 0.7244448661804199, + "learning_rate": 0.00017850540687488673, + "loss": 2.6903, + "step": 3563 + }, + { + "epoch": 0.3228918937283414, + "grad_norm": 0.7439331412315369, + "learning_rate": 0.000178499365673896, + "loss": 2.6504, + "step": 3564 + }, + { + "epoch": 0.32298249190278816, + "grad_norm": 0.7516360878944397, + "learning_rate": 0.00017849332447290522, + "loss": 2.8026, + "step": 3565 + }, + { + "epoch": 0.32307309007723495, + "grad_norm": 0.6791348457336426, + "learning_rate": 0.00017848728327191446, + "loss": 2.3769, + "step": 3566 + }, + { + "epoch": 0.32316368825168174, + "grad_norm": 0.751496434211731, + "learning_rate": 0.00017848124207092372, + "loss": 3.0293, + "step": 3567 + }, + { + "epoch": 0.3232542864261285, + "grad_norm": 0.7365366816520691, + "learning_rate": 0.00017847520086993295, + "loss": 2.8056, + "step": 3568 + }, + { + "epoch": 0.3233448846005753, + "grad_norm": 0.6907637715339661, + "learning_rate": 0.00017846915966894219, + "loss": 2.8476, + "step": 3569 + }, + { + "epoch": 0.3234354827750221, + "grad_norm": 0.7334935069084167, + "learning_rate": 0.00017846311846795142, + "loss": 2.6336, + "step": 3570 + }, + { + "epoch": 0.3235260809494689, + "grad_norm": 0.7706235647201538, + "learning_rate": 0.00017845707726696068, + "loss": 2.7189, + "step": 3571 + }, + { + "epoch": 0.3236166791239157, + "grad_norm": 0.7943733334541321, + "learning_rate": 0.00017845103606596994, + "loss": 2.8518, + "step": 3572 + }, + { + "epoch": 0.32370727729836246, + "grad_norm": 0.7311702370643616, + "learning_rate": 0.00017844499486497918, + "loss": 2.8586, + "step": 3573 + }, + { + "epoch": 0.32379787547280925, + "grad_norm": 0.7442180514335632, + "learning_rate": 0.0001784389536639884, + "loss": 2.7783, + "step": 3574 + }, + { + "epoch": 0.323888473647256, + "grad_norm": 0.7163938879966736, + "learning_rate": 0.00017843291246299764, + "loss": 2.7166, + "step": 3575 + }, + { + "epoch": 0.32397907182170277, + "grad_norm": 0.7741850018501282, + "learning_rate": 0.0001784268712620069, + "loss": 2.7777, + "step": 3576 + }, + { + "epoch": 0.32406966999614956, + "grad_norm": 0.7874680757522583, + "learning_rate": 0.00017842083006101614, + "loss": 3.1732, + "step": 3577 + }, + { + "epoch": 0.32416026817059634, + "grad_norm": 0.7847275137901306, + "learning_rate": 0.00017841478886002537, + "loss": 2.9436, + "step": 3578 + }, + { + "epoch": 0.32425086634504313, + "grad_norm": 0.752437949180603, + "learning_rate": 0.0001784087476590346, + "loss": 2.9583, + "step": 3579 + }, + { + "epoch": 0.3243414645194899, + "grad_norm": 0.7289396524429321, + "learning_rate": 0.00017840270645804387, + "loss": 2.7623, + "step": 3580 + }, + { + "epoch": 0.3244320626939367, + "grad_norm": 0.7855364084243774, + "learning_rate": 0.00017839666525705313, + "loss": 2.7217, + "step": 3581 + }, + { + "epoch": 0.3245226608683835, + "grad_norm": 0.6803807020187378, + "learning_rate": 0.00017839062405606234, + "loss": 2.1268, + "step": 3582 + }, + { + "epoch": 0.3246132590428303, + "grad_norm": 0.7590234875679016, + "learning_rate": 0.0001783845828550716, + "loss": 2.915, + "step": 3583 + }, + { + "epoch": 0.32470385721727707, + "grad_norm": 0.7596529722213745, + "learning_rate": 0.00017837854165408083, + "loss": 2.9248, + "step": 3584 + }, + { + "epoch": 0.32479445539172386, + "grad_norm": 0.7744929194450378, + "learning_rate": 0.0001783725004530901, + "loss": 2.7064, + "step": 3585 + }, + { + "epoch": 0.32488505356617065, + "grad_norm": 0.7866945266723633, + "learning_rate": 0.00017836645925209933, + "loss": 2.8415, + "step": 3586 + }, + { + "epoch": 0.32497565174061743, + "grad_norm": 0.7688101530075073, + "learning_rate": 0.00017836041805110856, + "loss": 3.0182, + "step": 3587 + }, + { + "epoch": 0.3250662499150642, + "grad_norm": 0.7039904594421387, + "learning_rate": 0.00017835437685011782, + "loss": 2.7544, + "step": 3588 + }, + { + "epoch": 0.325156848089511, + "grad_norm": 0.7530128359794617, + "learning_rate": 0.00017834833564912706, + "loss": 2.8724, + "step": 3589 + }, + { + "epoch": 0.3252474462639578, + "grad_norm": 0.7420058250427246, + "learning_rate": 0.0001783422944481363, + "loss": 2.789, + "step": 3590 + }, + { + "epoch": 0.3253380444384046, + "grad_norm": 0.7033644318580627, + "learning_rate": 0.00017833625324714553, + "loss": 2.7632, + "step": 3591 + }, + { + "epoch": 0.32542864261285137, + "grad_norm": 0.7227218151092529, + "learning_rate": 0.0001783302120461548, + "loss": 3.0621, + "step": 3592 + }, + { + "epoch": 0.32551924078729816, + "grad_norm": 0.7217003703117371, + "learning_rate": 0.00017832417084516402, + "loss": 2.8458, + "step": 3593 + }, + { + "epoch": 0.32560983896174495, + "grad_norm": 0.7665847539901733, + "learning_rate": 0.00017831812964417328, + "loss": 2.7157, + "step": 3594 + }, + { + "epoch": 0.3257004371361917, + "grad_norm": 0.7578392624855042, + "learning_rate": 0.00017831208844318252, + "loss": 3.0559, + "step": 3595 + }, + { + "epoch": 0.32579103531063847, + "grad_norm": 0.7458443641662598, + "learning_rate": 0.00017830604724219175, + "loss": 3.0696, + "step": 3596 + }, + { + "epoch": 0.32588163348508525, + "grad_norm": 0.7901167273521423, + "learning_rate": 0.000178300006041201, + "loss": 2.9169, + "step": 3597 + }, + { + "epoch": 0.32597223165953204, + "grad_norm": 0.6981526613235474, + "learning_rate": 0.00017829396484021025, + "loss": 2.7576, + "step": 3598 + }, + { + "epoch": 0.32606282983397883, + "grad_norm": 0.8248461484909058, + "learning_rate": 0.00017828792363921948, + "loss": 2.7651, + "step": 3599 + }, + { + "epoch": 0.3261534280084256, + "grad_norm": 0.7591219544410706, + "learning_rate": 0.0001782818824382287, + "loss": 2.72, + "step": 3600 + }, + { + "epoch": 0.3262440261828724, + "grad_norm": 0.7101055979728699, + "learning_rate": 0.00017827584123723797, + "loss": 2.4258, + "step": 3601 + }, + { + "epoch": 0.3263346243573192, + "grad_norm": 0.7691241502761841, + "learning_rate": 0.00017826980003624724, + "loss": 2.9165, + "step": 3602 + }, + { + "epoch": 0.326425222531766, + "grad_norm": 0.7952893376350403, + "learning_rate": 0.00017826375883525644, + "loss": 3.0081, + "step": 3603 + }, + { + "epoch": 0.32651582070621277, + "grad_norm": 0.6598417162895203, + "learning_rate": 0.0001782577176342657, + "loss": 2.0131, + "step": 3604 + }, + { + "epoch": 0.32660641888065955, + "grad_norm": 0.7382063269615173, + "learning_rate": 0.00017825167643327494, + "loss": 2.9677, + "step": 3605 + }, + { + "epoch": 0.32669701705510634, + "grad_norm": 0.7246171832084656, + "learning_rate": 0.0001782456352322842, + "loss": 2.667, + "step": 3606 + }, + { + "epoch": 0.32678761522955313, + "grad_norm": 0.7138850092887878, + "learning_rate": 0.00017823959403129343, + "loss": 2.7706, + "step": 3607 + }, + { + "epoch": 0.3268782134039999, + "grad_norm": 0.6950585246086121, + "learning_rate": 0.00017823355283030267, + "loss": 2.6522, + "step": 3608 + }, + { + "epoch": 0.3269688115784467, + "grad_norm": 0.6591595411300659, + "learning_rate": 0.0001782275116293119, + "loss": 2.0327, + "step": 3609 + }, + { + "epoch": 0.3270594097528935, + "grad_norm": 0.7099041938781738, + "learning_rate": 0.00017822147042832116, + "loss": 2.39, + "step": 3610 + }, + { + "epoch": 0.3271500079273403, + "grad_norm": 0.7646951079368591, + "learning_rate": 0.00017821542922733042, + "loss": 3.0339, + "step": 3611 + }, + { + "epoch": 0.32724060610178707, + "grad_norm": 0.7363020777702332, + "learning_rate": 0.00017820938802633963, + "loss": 3.0449, + "step": 3612 + }, + { + "epoch": 0.32733120427623386, + "grad_norm": 0.8782681226730347, + "learning_rate": 0.0001782033468253489, + "loss": 3.0185, + "step": 3613 + }, + { + "epoch": 0.32742180245068064, + "grad_norm": 0.7836095690727234, + "learning_rate": 0.00017819730562435813, + "loss": 2.8869, + "step": 3614 + }, + { + "epoch": 0.32751240062512743, + "grad_norm": 0.7731446623802185, + "learning_rate": 0.0001781912644233674, + "loss": 2.727, + "step": 3615 + }, + { + "epoch": 0.32760299879957416, + "grad_norm": 0.7138470411300659, + "learning_rate": 0.00017818522322237662, + "loss": 2.7614, + "step": 3616 + }, + { + "epoch": 0.32769359697402095, + "grad_norm": 0.7629331350326538, + "learning_rate": 0.00017817918202138585, + "loss": 2.8739, + "step": 3617 + }, + { + "epoch": 0.32778419514846774, + "grad_norm": 0.6936425566673279, + "learning_rate": 0.00017817314082039512, + "loss": 2.607, + "step": 3618 + }, + { + "epoch": 0.3278747933229145, + "grad_norm": 0.7633016109466553, + "learning_rate": 0.00017816709961940435, + "loss": 3.1677, + "step": 3619 + }, + { + "epoch": 0.3279653914973613, + "grad_norm": 0.7172055840492249, + "learning_rate": 0.00017816105841841358, + "loss": 3.0414, + "step": 3620 + }, + { + "epoch": 0.3280559896718081, + "grad_norm": 0.7059064507484436, + "learning_rate": 0.00017815501721742282, + "loss": 1.9613, + "step": 3621 + }, + { + "epoch": 0.3281465878462549, + "grad_norm": 0.747907817363739, + "learning_rate": 0.00017814897601643208, + "loss": 2.9081, + "step": 3622 + }, + { + "epoch": 0.3282371860207017, + "grad_norm": 0.7443763017654419, + "learning_rate": 0.0001781429348154413, + "loss": 2.9911, + "step": 3623 + }, + { + "epoch": 0.32832778419514846, + "grad_norm": 0.7651762962341309, + "learning_rate": 0.00017813689361445057, + "loss": 2.82, + "step": 3624 + }, + { + "epoch": 0.32841838236959525, + "grad_norm": 0.7092989683151245, + "learning_rate": 0.0001781308524134598, + "loss": 2.0973, + "step": 3625 + }, + { + "epoch": 0.32850898054404204, + "grad_norm": 0.7436683773994446, + "learning_rate": 0.00017812481121246904, + "loss": 2.948, + "step": 3626 + }, + { + "epoch": 0.3285995787184888, + "grad_norm": 0.7071669697761536, + "learning_rate": 0.0001781187700114783, + "loss": 2.8444, + "step": 3627 + }, + { + "epoch": 0.3286901768929356, + "grad_norm": 0.7583902478218079, + "learning_rate": 0.00017811272881048754, + "loss": 2.7643, + "step": 3628 + }, + { + "epoch": 0.3287807750673824, + "grad_norm": 0.7342811822891235, + "learning_rate": 0.00017810668760949677, + "loss": 2.625, + "step": 3629 + }, + { + "epoch": 0.3288713732418292, + "grad_norm": 0.7530274987220764, + "learning_rate": 0.000178100646408506, + "loss": 2.7294, + "step": 3630 + }, + { + "epoch": 0.328961971416276, + "grad_norm": 0.7940881848335266, + "learning_rate": 0.00017809460520751527, + "loss": 2.6896, + "step": 3631 + }, + { + "epoch": 0.32905256959072277, + "grad_norm": 0.7675580978393555, + "learning_rate": 0.00017808856400652453, + "loss": 2.6502, + "step": 3632 + }, + { + "epoch": 0.32914316776516955, + "grad_norm": 0.7043102383613586, + "learning_rate": 0.00017808252280553374, + "loss": 2.6852, + "step": 3633 + }, + { + "epoch": 0.32923376593961634, + "grad_norm": 0.7844038605690002, + "learning_rate": 0.000178076481604543, + "loss": 2.8391, + "step": 3634 + }, + { + "epoch": 0.32932436411406313, + "grad_norm": 0.7308193445205688, + "learning_rate": 0.00017807044040355223, + "loss": 2.8856, + "step": 3635 + }, + { + "epoch": 0.32941496228850986, + "grad_norm": 0.8539618849754333, + "learning_rate": 0.0001780643992025615, + "loss": 2.9275, + "step": 3636 + }, + { + "epoch": 0.32950556046295665, + "grad_norm": 0.6483579277992249, + "learning_rate": 0.00017805835800157073, + "loss": 1.5479, + "step": 3637 + }, + { + "epoch": 0.32959615863740344, + "grad_norm": 0.7473387718200684, + "learning_rate": 0.00017805231680057996, + "loss": 2.9806, + "step": 3638 + }, + { + "epoch": 0.3296867568118502, + "grad_norm": 0.7509278059005737, + "learning_rate": 0.0001780462755995892, + "loss": 3.0076, + "step": 3639 + }, + { + "epoch": 0.329777354986297, + "grad_norm": 0.7465243935585022, + "learning_rate": 0.00017804023439859845, + "loss": 2.7784, + "step": 3640 + }, + { + "epoch": 0.3298679531607438, + "grad_norm": 0.6116569638252258, + "learning_rate": 0.0001780341931976077, + "loss": 1.9605, + "step": 3641 + }, + { + "epoch": 0.3299585513351906, + "grad_norm": 0.7625999450683594, + "learning_rate": 0.00017802815199661692, + "loss": 2.828, + "step": 3642 + }, + { + "epoch": 0.3300491495096374, + "grad_norm": 0.7271744012832642, + "learning_rate": 0.00017802211079562618, + "loss": 2.2663, + "step": 3643 + }, + { + "epoch": 0.33013974768408416, + "grad_norm": 0.6595269441604614, + "learning_rate": 0.00017801606959463542, + "loss": 2.0695, + "step": 3644 + }, + { + "epoch": 0.33023034585853095, + "grad_norm": 0.6982164978981018, + "learning_rate": 0.00017801002839364468, + "loss": 2.8142, + "step": 3645 + }, + { + "epoch": 0.33032094403297774, + "grad_norm": 0.8033419847488403, + "learning_rate": 0.00017800398719265389, + "loss": 2.9226, + "step": 3646 + }, + { + "epoch": 0.3304115422074245, + "grad_norm": 0.7348435521125793, + "learning_rate": 0.00017799794599166315, + "loss": 2.8473, + "step": 3647 + }, + { + "epoch": 0.3305021403818713, + "grad_norm": 0.7039541602134705, + "learning_rate": 0.0001779919047906724, + "loss": 2.8987, + "step": 3648 + }, + { + "epoch": 0.3305927385563181, + "grad_norm": 0.7235540151596069, + "learning_rate": 0.00017798586358968164, + "loss": 2.8382, + "step": 3649 + }, + { + "epoch": 0.3306833367307649, + "grad_norm": 0.7725857496261597, + "learning_rate": 0.00017797982238869088, + "loss": 3.0146, + "step": 3650 + }, + { + "epoch": 0.3307739349052117, + "grad_norm": 0.7338874936103821, + "learning_rate": 0.0001779737811877001, + "loss": 2.8074, + "step": 3651 + }, + { + "epoch": 0.33086453307965846, + "grad_norm": 0.7613027691841125, + "learning_rate": 0.00017796773998670937, + "loss": 2.8961, + "step": 3652 + }, + { + "epoch": 0.33095513125410525, + "grad_norm": 0.6302671432495117, + "learning_rate": 0.0001779616987857186, + "loss": 2.2341, + "step": 3653 + }, + { + "epoch": 0.33104572942855204, + "grad_norm": 0.7972192168235779, + "learning_rate": 0.00017795565758472784, + "loss": 2.9067, + "step": 3654 + }, + { + "epoch": 0.3311363276029988, + "grad_norm": 0.7690390348434448, + "learning_rate": 0.0001779496163837371, + "loss": 2.7766, + "step": 3655 + }, + { + "epoch": 0.33122692577744556, + "grad_norm": 0.7122611999511719, + "learning_rate": 0.00017794357518274634, + "loss": 2.9441, + "step": 3656 + }, + { + "epoch": 0.33131752395189235, + "grad_norm": 0.7744887471199036, + "learning_rate": 0.0001779375339817556, + "loss": 3.2714, + "step": 3657 + }, + { + "epoch": 0.33140812212633913, + "grad_norm": 0.6793386340141296, + "learning_rate": 0.00017793149278076483, + "loss": 2.2011, + "step": 3658 + }, + { + "epoch": 0.3314987203007859, + "grad_norm": 0.744414210319519, + "learning_rate": 0.00017792545157977406, + "loss": 2.6585, + "step": 3659 + }, + { + "epoch": 0.3315893184752327, + "grad_norm": 0.6625568270683289, + "learning_rate": 0.0001779194103787833, + "loss": 2.0813, + "step": 3660 + }, + { + "epoch": 0.3316799166496795, + "grad_norm": 0.7182695865631104, + "learning_rate": 0.00017791336917779256, + "loss": 2.6899, + "step": 3661 + }, + { + "epoch": 0.3317705148241263, + "grad_norm": 0.723261296749115, + "learning_rate": 0.00017790732797680182, + "loss": 2.7508, + "step": 3662 + }, + { + "epoch": 0.33186111299857307, + "grad_norm": 0.7101771831512451, + "learning_rate": 0.00017790128677581103, + "loss": 2.0446, + "step": 3663 + }, + { + "epoch": 0.33195171117301986, + "grad_norm": 0.7850525975227356, + "learning_rate": 0.0001778952455748203, + "loss": 2.8071, + "step": 3664 + }, + { + "epoch": 0.33204230934746665, + "grad_norm": 0.7906605005264282, + "learning_rate": 0.00017788920437382952, + "loss": 3.0445, + "step": 3665 + }, + { + "epoch": 0.33213290752191343, + "grad_norm": 0.6541787981987, + "learning_rate": 0.00017788316317283878, + "loss": 2.1557, + "step": 3666 + }, + { + "epoch": 0.3322235056963602, + "grad_norm": 0.7297481298446655, + "learning_rate": 0.000177877121971848, + "loss": 3.019, + "step": 3667 + }, + { + "epoch": 0.332314103870807, + "grad_norm": 0.7612841129302979, + "learning_rate": 0.00017787108077085725, + "loss": 2.5498, + "step": 3668 + }, + { + "epoch": 0.3324047020452538, + "grad_norm": 0.8032554984092712, + "learning_rate": 0.00017786503956986649, + "loss": 3.0952, + "step": 3669 + }, + { + "epoch": 0.3324953002197006, + "grad_norm": 0.6935644149780273, + "learning_rate": 0.00017785899836887575, + "loss": 2.6393, + "step": 3670 + }, + { + "epoch": 0.3325858983941474, + "grad_norm": 0.7852073907852173, + "learning_rate": 0.00017785295716788498, + "loss": 2.929, + "step": 3671 + }, + { + "epoch": 0.33267649656859416, + "grad_norm": 0.7398603558540344, + "learning_rate": 0.00017784691596689422, + "loss": 2.6169, + "step": 3672 + }, + { + "epoch": 0.33276709474304095, + "grad_norm": 0.6979238390922546, + "learning_rate": 0.00017784087476590348, + "loss": 2.7485, + "step": 3673 + }, + { + "epoch": 0.33285769291748774, + "grad_norm": 0.7567631006240845, + "learning_rate": 0.0001778348335649127, + "loss": 2.9281, + "step": 3674 + }, + { + "epoch": 0.3329482910919345, + "grad_norm": 0.72941654920578, + "learning_rate": 0.00017782879236392197, + "loss": 2.7899, + "step": 3675 + }, + { + "epoch": 0.33303888926638126, + "grad_norm": 0.7613589763641357, + "learning_rate": 0.00017782275116293118, + "loss": 2.951, + "step": 3676 + }, + { + "epoch": 0.33312948744082804, + "grad_norm": 0.8347739577293396, + "learning_rate": 0.00017781670996194044, + "loss": 2.9824, + "step": 3677 + }, + { + "epoch": 0.33322008561527483, + "grad_norm": 0.7463276386260986, + "learning_rate": 0.0001778106687609497, + "loss": 3.0182, + "step": 3678 + }, + { + "epoch": 0.3333106837897216, + "grad_norm": 0.6586583256721497, + "learning_rate": 0.00017780462755995894, + "loss": 2.6087, + "step": 3679 + }, + { + "epoch": 0.3334012819641684, + "grad_norm": 0.7396881580352783, + "learning_rate": 0.00017779858635896817, + "loss": 2.8674, + "step": 3680 + }, + { + "epoch": 0.3334918801386152, + "grad_norm": 0.7840572595596313, + "learning_rate": 0.0001777925451579774, + "loss": 2.8526, + "step": 3681 + }, + { + "epoch": 0.333582478313062, + "grad_norm": 0.6303775906562805, + "learning_rate": 0.00017778650395698666, + "loss": 2.061, + "step": 3682 + }, + { + "epoch": 0.33367307648750877, + "grad_norm": 0.731620728969574, + "learning_rate": 0.0001777804627559959, + "loss": 2.8688, + "step": 3683 + }, + { + "epoch": 0.33376367466195556, + "grad_norm": 0.739101767539978, + "learning_rate": 0.00017777442155500513, + "loss": 3.0437, + "step": 3684 + }, + { + "epoch": 0.33385427283640234, + "grad_norm": 0.7455401420593262, + "learning_rate": 0.0001777683803540144, + "loss": 2.8999, + "step": 3685 + }, + { + "epoch": 0.33394487101084913, + "grad_norm": 0.726852297782898, + "learning_rate": 0.00017776233915302363, + "loss": 2.8708, + "step": 3686 + }, + { + "epoch": 0.3340354691852959, + "grad_norm": 0.6322923898696899, + "learning_rate": 0.0001777562979520329, + "loss": 2.1139, + "step": 3687 + }, + { + "epoch": 0.3341260673597427, + "grad_norm": 0.7443378567695618, + "learning_rate": 0.00017775025675104212, + "loss": 2.702, + "step": 3688 + }, + { + "epoch": 0.3342166655341895, + "grad_norm": 0.6599064469337463, + "learning_rate": 0.00017774421555005136, + "loss": 1.9572, + "step": 3689 + }, + { + "epoch": 0.3343072637086363, + "grad_norm": 0.7610746622085571, + "learning_rate": 0.0001777381743490606, + "loss": 2.922, + "step": 3690 + }, + { + "epoch": 0.33439786188308307, + "grad_norm": 0.7492435574531555, + "learning_rate": 0.00017773213314806985, + "loss": 2.8766, + "step": 3691 + }, + { + "epoch": 0.33448846005752986, + "grad_norm": 0.7755866050720215, + "learning_rate": 0.00017772609194707909, + "loss": 2.7956, + "step": 3692 + }, + { + "epoch": 0.33457905823197664, + "grad_norm": 0.7313345670700073, + "learning_rate": 0.00017772005074608832, + "loss": 2.765, + "step": 3693 + }, + { + "epoch": 0.33466965640642343, + "grad_norm": 0.637798547744751, + "learning_rate": 0.00017771400954509758, + "loss": 1.9207, + "step": 3694 + }, + { + "epoch": 0.3347602545808702, + "grad_norm": 0.6390852332115173, + "learning_rate": 0.00017770796834410682, + "loss": 2.0197, + "step": 3695 + }, + { + "epoch": 0.334850852755317, + "grad_norm": 0.7027332782745361, + "learning_rate": 0.00017770192714311608, + "loss": 3.2357, + "step": 3696 + }, + { + "epoch": 0.33494145092976374, + "grad_norm": 0.7533714771270752, + "learning_rate": 0.00017769588594212528, + "loss": 2.8303, + "step": 3697 + }, + { + "epoch": 0.3350320491042105, + "grad_norm": 0.7616591453552246, + "learning_rate": 0.00017768984474113454, + "loss": 2.9292, + "step": 3698 + }, + { + "epoch": 0.3351226472786573, + "grad_norm": 0.6137903928756714, + "learning_rate": 0.00017768380354014378, + "loss": 1.5952, + "step": 3699 + }, + { + "epoch": 0.3352132454531041, + "grad_norm": 0.7473433017730713, + "learning_rate": 0.00017767776233915304, + "loss": 3.0109, + "step": 3700 + }, + { + "epoch": 0.3353038436275509, + "grad_norm": 0.76595538854599, + "learning_rate": 0.00017767172113816227, + "loss": 2.8445, + "step": 3701 + }, + { + "epoch": 0.3353944418019977, + "grad_norm": 0.7888397574424744, + "learning_rate": 0.0001776656799371715, + "loss": 2.7694, + "step": 3702 + }, + { + "epoch": 0.33548503997644447, + "grad_norm": 0.7386366128921509, + "learning_rate": 0.00017765963873618077, + "loss": 2.7264, + "step": 3703 + }, + { + "epoch": 0.33557563815089125, + "grad_norm": 0.7529635429382324, + "learning_rate": 0.00017765359753519, + "loss": 2.7659, + "step": 3704 + }, + { + "epoch": 0.33566623632533804, + "grad_norm": 0.7950295209884644, + "learning_rate": 0.00017764755633419924, + "loss": 2.7007, + "step": 3705 + }, + { + "epoch": 0.33575683449978483, + "grad_norm": 0.8013949394226074, + "learning_rate": 0.00017764151513320847, + "loss": 2.924, + "step": 3706 + }, + { + "epoch": 0.3358474326742316, + "grad_norm": 0.7503864765167236, + "learning_rate": 0.00017763547393221773, + "loss": 2.9126, + "step": 3707 + }, + { + "epoch": 0.3359380308486784, + "grad_norm": 0.7666956782341003, + "learning_rate": 0.000177629432731227, + "loss": 2.6936, + "step": 3708 + }, + { + "epoch": 0.3360286290231252, + "grad_norm": 0.7805230617523193, + "learning_rate": 0.00017762339153023623, + "loss": 2.7351, + "step": 3709 + }, + { + "epoch": 0.336119227197572, + "grad_norm": 0.7481129169464111, + "learning_rate": 0.00017761735032924546, + "loss": 3.0461, + "step": 3710 + }, + { + "epoch": 0.33620982537201877, + "grad_norm": 0.8184265494346619, + "learning_rate": 0.0001776113091282547, + "loss": 2.8097, + "step": 3711 + }, + { + "epoch": 0.33630042354646555, + "grad_norm": 0.7538934350013733, + "learning_rate": 0.00017760526792726396, + "loss": 2.6889, + "step": 3712 + }, + { + "epoch": 0.33639102172091234, + "grad_norm": 0.7514844536781311, + "learning_rate": 0.0001775992267262732, + "loss": 3.0593, + "step": 3713 + }, + { + "epoch": 0.33648161989535913, + "grad_norm": 0.7322183847427368, + "learning_rate": 0.00017759318552528243, + "loss": 2.5679, + "step": 3714 + }, + { + "epoch": 0.3365722180698059, + "grad_norm": 0.7260880470275879, + "learning_rate": 0.00017758714432429169, + "loss": 2.9841, + "step": 3715 + }, + { + "epoch": 0.3366628162442527, + "grad_norm": 0.6807477474212646, + "learning_rate": 0.00017758110312330092, + "loss": 2.8326, + "step": 3716 + }, + { + "epoch": 0.33675341441869944, + "grad_norm": 0.7360072135925293, + "learning_rate": 0.00017757506192231018, + "loss": 2.8595, + "step": 3717 + }, + { + "epoch": 0.3368440125931462, + "grad_norm": 0.7403228878974915, + "learning_rate": 0.0001775690207213194, + "loss": 2.9176, + "step": 3718 + }, + { + "epoch": 0.336934610767593, + "grad_norm": 0.6754907369613647, + "learning_rate": 0.00017756297952032865, + "loss": 2.4339, + "step": 3719 + }, + { + "epoch": 0.3370252089420398, + "grad_norm": 0.7943882942199707, + "learning_rate": 0.00017755693831933788, + "loss": 2.7182, + "step": 3720 + }, + { + "epoch": 0.3371158071164866, + "grad_norm": 0.7638561129570007, + "learning_rate": 0.00017755089711834714, + "loss": 2.7961, + "step": 3721 + }, + { + "epoch": 0.3372064052909334, + "grad_norm": 0.7811289429664612, + "learning_rate": 0.00017754485591735638, + "loss": 2.7457, + "step": 3722 + }, + { + "epoch": 0.33729700346538016, + "grad_norm": 0.8741047978401184, + "learning_rate": 0.0001775388147163656, + "loss": 3.097, + "step": 3723 + }, + { + "epoch": 0.33738760163982695, + "grad_norm": 0.7587780356407166, + "learning_rate": 0.00017753277351537487, + "loss": 3.0623, + "step": 3724 + }, + { + "epoch": 0.33747819981427374, + "grad_norm": 0.7199782133102417, + "learning_rate": 0.0001775267323143841, + "loss": 2.6416, + "step": 3725 + }, + { + "epoch": 0.3375687979887205, + "grad_norm": 0.778109610080719, + "learning_rate": 0.00017752069111339337, + "loss": 2.8829, + "step": 3726 + }, + { + "epoch": 0.3376593961631673, + "grad_norm": 0.7051375508308411, + "learning_rate": 0.00017751464991240258, + "loss": 2.7325, + "step": 3727 + }, + { + "epoch": 0.3377499943376141, + "grad_norm": 0.7841918468475342, + "learning_rate": 0.00017750860871141184, + "loss": 3.1776, + "step": 3728 + }, + { + "epoch": 0.3378405925120609, + "grad_norm": 0.7191961407661438, + "learning_rate": 0.00017750256751042107, + "loss": 2.6187, + "step": 3729 + }, + { + "epoch": 0.3379311906865077, + "grad_norm": 0.7238259315490723, + "learning_rate": 0.00017749652630943033, + "loss": 2.6995, + "step": 3730 + }, + { + "epoch": 0.33802178886095446, + "grad_norm": 0.7447220683097839, + "learning_rate": 0.00017749048510843957, + "loss": 2.7613, + "step": 3731 + }, + { + "epoch": 0.33811238703540125, + "grad_norm": 0.7058773636817932, + "learning_rate": 0.0001774844439074488, + "loss": 2.7382, + "step": 3732 + }, + { + "epoch": 0.33820298520984804, + "grad_norm": 0.7719606757164001, + "learning_rate": 0.00017747840270645806, + "loss": 2.818, + "step": 3733 + }, + { + "epoch": 0.3382935833842948, + "grad_norm": 0.6643455624580383, + "learning_rate": 0.0001774723615054673, + "loss": 2.4485, + "step": 3734 + }, + { + "epoch": 0.3383841815587416, + "grad_norm": 0.8791640996932983, + "learning_rate": 0.00017746632030447653, + "loss": 2.9726, + "step": 3735 + }, + { + "epoch": 0.3384747797331884, + "grad_norm": 0.839031994342804, + "learning_rate": 0.00017746027910348576, + "loss": 2.6439, + "step": 3736 + }, + { + "epoch": 0.33856537790763513, + "grad_norm": 0.7251480221748352, + "learning_rate": 0.00017745423790249503, + "loss": 2.8618, + "step": 3737 + }, + { + "epoch": 0.3386559760820819, + "grad_norm": 0.7009608745574951, + "learning_rate": 0.00017744819670150429, + "loss": 2.738, + "step": 3738 + }, + { + "epoch": 0.3387465742565287, + "grad_norm": 0.7322866916656494, + "learning_rate": 0.00017744215550051352, + "loss": 2.7816, + "step": 3739 + }, + { + "epoch": 0.3388371724309755, + "grad_norm": 0.8528774380683899, + "learning_rate": 0.00017743611429952275, + "loss": 2.6862, + "step": 3740 + }, + { + "epoch": 0.3389277706054223, + "grad_norm": 0.7665627002716064, + "learning_rate": 0.000177430073098532, + "loss": 2.9464, + "step": 3741 + }, + { + "epoch": 0.3390183687798691, + "grad_norm": 0.7251883745193481, + "learning_rate": 0.00017742403189754125, + "loss": 3.0349, + "step": 3742 + }, + { + "epoch": 0.33910896695431586, + "grad_norm": 0.7164598703384399, + "learning_rate": 0.00017741799069655048, + "loss": 2.5879, + "step": 3743 + }, + { + "epoch": 0.33919956512876265, + "grad_norm": 0.7655898332595825, + "learning_rate": 0.00017741194949555972, + "loss": 2.9684, + "step": 3744 + }, + { + "epoch": 0.33929016330320944, + "grad_norm": 0.8712344169616699, + "learning_rate": 0.00017740590829456898, + "loss": 2.7814, + "step": 3745 + }, + { + "epoch": 0.3393807614776562, + "grad_norm": 0.7816420793533325, + "learning_rate": 0.0001773998670935782, + "loss": 2.9052, + "step": 3746 + }, + { + "epoch": 0.339471359652103, + "grad_norm": 0.7670121192932129, + "learning_rate": 0.00017739382589258747, + "loss": 2.8959, + "step": 3747 + }, + { + "epoch": 0.3395619578265498, + "grad_norm": 0.6445460915565491, + "learning_rate": 0.00017738778469159668, + "loss": 2.3083, + "step": 3748 + }, + { + "epoch": 0.3396525560009966, + "grad_norm": 0.6170081496238708, + "learning_rate": 0.00017738174349060594, + "loss": 1.4324, + "step": 3749 + }, + { + "epoch": 0.3397431541754434, + "grad_norm": 0.7793049812316895, + "learning_rate": 0.00017737570228961518, + "loss": 2.9246, + "step": 3750 + }, + { + "epoch": 0.33983375234989016, + "grad_norm": 0.7432884573936462, + "learning_rate": 0.00017736966108862444, + "loss": 2.9713, + "step": 3751 + }, + { + "epoch": 0.33992435052433695, + "grad_norm": 0.7024998664855957, + "learning_rate": 0.00017736361988763367, + "loss": 2.8565, + "step": 3752 + }, + { + "epoch": 0.34001494869878374, + "grad_norm": 0.6846765875816345, + "learning_rate": 0.0001773575786866429, + "loss": 2.6183, + "step": 3753 + }, + { + "epoch": 0.3401055468732305, + "grad_norm": 0.7078089118003845, + "learning_rate": 0.00017735153748565217, + "loss": 2.975, + "step": 3754 + }, + { + "epoch": 0.3401961450476773, + "grad_norm": 0.7663963437080383, + "learning_rate": 0.0001773454962846614, + "loss": 2.8552, + "step": 3755 + }, + { + "epoch": 0.3402867432221241, + "grad_norm": 0.7032278776168823, + "learning_rate": 0.00017733945508367063, + "loss": 2.5854, + "step": 3756 + }, + { + "epoch": 0.34037734139657083, + "grad_norm": 0.7337939739227295, + "learning_rate": 0.00017733341388267987, + "loss": 2.9566, + "step": 3757 + }, + { + "epoch": 0.3404679395710176, + "grad_norm": 0.7149320244789124, + "learning_rate": 0.00017732737268168913, + "loss": 2.773, + "step": 3758 + }, + { + "epoch": 0.3405585377454644, + "grad_norm": 0.5849186182022095, + "learning_rate": 0.00017732133148069836, + "loss": 2.078, + "step": 3759 + }, + { + "epoch": 0.3406491359199112, + "grad_norm": 0.7224475145339966, + "learning_rate": 0.00017731529027970763, + "loss": 1.9808, + "step": 3760 + }, + { + "epoch": 0.340739734094358, + "grad_norm": 0.7227410078048706, + "learning_rate": 0.00017730924907871686, + "loss": 2.6637, + "step": 3761 + }, + { + "epoch": 0.34083033226880477, + "grad_norm": 0.6337008476257324, + "learning_rate": 0.0001773032078777261, + "loss": 2.0339, + "step": 3762 + }, + { + "epoch": 0.34092093044325156, + "grad_norm": 0.7484412789344788, + "learning_rate": 0.00017729716667673535, + "loss": 2.7122, + "step": 3763 + }, + { + "epoch": 0.34101152861769835, + "grad_norm": 0.6859702467918396, + "learning_rate": 0.0001772911254757446, + "loss": 2.8391, + "step": 3764 + }, + { + "epoch": 0.34110212679214513, + "grad_norm": 0.8264082670211792, + "learning_rate": 0.00017728508427475382, + "loss": 2.9789, + "step": 3765 + }, + { + "epoch": 0.3411927249665919, + "grad_norm": 0.8020330667495728, + "learning_rate": 0.00017727904307376306, + "loss": 2.7775, + "step": 3766 + }, + { + "epoch": 0.3412833231410387, + "grad_norm": 0.7671709060668945, + "learning_rate": 0.00017727300187277232, + "loss": 2.7431, + "step": 3767 + }, + { + "epoch": 0.3413739213154855, + "grad_norm": 0.7245631217956543, + "learning_rate": 0.00017726696067178158, + "loss": 2.8239, + "step": 3768 + }, + { + "epoch": 0.3414645194899323, + "grad_norm": 0.7123704552650452, + "learning_rate": 0.00017726091947079079, + "loss": 2.895, + "step": 3769 + }, + { + "epoch": 0.34155511766437907, + "grad_norm": 0.7205565571784973, + "learning_rate": 0.00017725487826980005, + "loss": 2.8443, + "step": 3770 + }, + { + "epoch": 0.34164571583882586, + "grad_norm": 0.7330286502838135, + "learning_rate": 0.00017724883706880928, + "loss": 2.7685, + "step": 3771 + }, + { + "epoch": 0.34173631401327265, + "grad_norm": 0.7515106201171875, + "learning_rate": 0.00017724279586781854, + "loss": 2.9427, + "step": 3772 + }, + { + "epoch": 0.34182691218771943, + "grad_norm": 0.7630383968353271, + "learning_rate": 0.00017723675466682778, + "loss": 2.8548, + "step": 3773 + }, + { + "epoch": 0.3419175103621662, + "grad_norm": 0.7859088778495789, + "learning_rate": 0.000177230713465837, + "loss": 3.114, + "step": 3774 + }, + { + "epoch": 0.342008108536613, + "grad_norm": 0.7932300567626953, + "learning_rate": 0.00017722467226484627, + "loss": 2.8267, + "step": 3775 + }, + { + "epoch": 0.3420987067110598, + "grad_norm": 0.7651495933532715, + "learning_rate": 0.0001772186310638555, + "loss": 2.8156, + "step": 3776 + }, + { + "epoch": 0.3421893048855066, + "grad_norm": 0.7870830297470093, + "learning_rate": 0.00017721258986286474, + "loss": 3.1259, + "step": 3777 + }, + { + "epoch": 0.3422799030599533, + "grad_norm": 0.6995032429695129, + "learning_rate": 0.00017720654866187397, + "loss": 2.8013, + "step": 3778 + }, + { + "epoch": 0.3423705012344001, + "grad_norm": 0.7673895955085754, + "learning_rate": 0.00017720050746088323, + "loss": 2.9236, + "step": 3779 + }, + { + "epoch": 0.3424610994088469, + "grad_norm": 0.7544505596160889, + "learning_rate": 0.00017719446625989247, + "loss": 2.9384, + "step": 3780 + }, + { + "epoch": 0.3425516975832937, + "grad_norm": 0.7582785487174988, + "learning_rate": 0.00017718842505890173, + "loss": 3.1391, + "step": 3781 + }, + { + "epoch": 0.34264229575774047, + "grad_norm": 0.7619355320930481, + "learning_rate": 0.00017718238385791096, + "loss": 2.9441, + "step": 3782 + }, + { + "epoch": 0.34273289393218725, + "grad_norm": 0.7313293218612671, + "learning_rate": 0.0001771763426569202, + "loss": 2.6906, + "step": 3783 + }, + { + "epoch": 0.34282349210663404, + "grad_norm": 0.6654253005981445, + "learning_rate": 0.00017717030145592946, + "loss": 2.2541, + "step": 3784 + }, + { + "epoch": 0.34291409028108083, + "grad_norm": 0.6965106129646301, + "learning_rate": 0.0001771642602549387, + "loss": 2.5425, + "step": 3785 + }, + { + "epoch": 0.3430046884555276, + "grad_norm": 0.7260540127754211, + "learning_rate": 0.00017715821905394793, + "loss": 3.0461, + "step": 3786 + }, + { + "epoch": 0.3430952866299744, + "grad_norm": 0.7416847944259644, + "learning_rate": 0.00017715217785295716, + "loss": 2.9165, + "step": 3787 + }, + { + "epoch": 0.3431858848044212, + "grad_norm": 0.7227126955986023, + "learning_rate": 0.00017714613665196642, + "loss": 2.7965, + "step": 3788 + }, + { + "epoch": 0.343276482978868, + "grad_norm": 0.7702444195747375, + "learning_rate": 0.00017714009545097566, + "loss": 2.867, + "step": 3789 + }, + { + "epoch": 0.34336708115331477, + "grad_norm": 0.7329858541488647, + "learning_rate": 0.0001771340542499849, + "loss": 2.8645, + "step": 3790 + }, + { + "epoch": 0.34345767932776156, + "grad_norm": 0.7423519492149353, + "learning_rate": 0.00017712801304899415, + "loss": 2.7899, + "step": 3791 + }, + { + "epoch": 0.34354827750220834, + "grad_norm": 0.7094180583953857, + "learning_rate": 0.00017712197184800339, + "loss": 2.7907, + "step": 3792 + }, + { + "epoch": 0.34363887567665513, + "grad_norm": 0.7623221278190613, + "learning_rate": 0.00017711593064701265, + "loss": 2.74, + "step": 3793 + }, + { + "epoch": 0.3437294738511019, + "grad_norm": 0.7716984152793884, + "learning_rate": 0.00017710988944602188, + "loss": 2.5189, + "step": 3794 + }, + { + "epoch": 0.3438200720255487, + "grad_norm": 0.6492441296577454, + "learning_rate": 0.00017710384824503112, + "loss": 2.0504, + "step": 3795 + }, + { + "epoch": 0.3439106701999955, + "grad_norm": 0.7465148568153381, + "learning_rate": 0.00017709780704404035, + "loss": 2.7303, + "step": 3796 + }, + { + "epoch": 0.3440012683744423, + "grad_norm": 0.6852020025253296, + "learning_rate": 0.0001770917658430496, + "loss": 2.1007, + "step": 3797 + }, + { + "epoch": 0.344091866548889, + "grad_norm": 0.757495641708374, + "learning_rate": 0.00017708572464205887, + "loss": 2.7012, + "step": 3798 + }, + { + "epoch": 0.3441824647233358, + "grad_norm": 0.7505974173545837, + "learning_rate": 0.00017707968344106808, + "loss": 2.8608, + "step": 3799 + }, + { + "epoch": 0.3442730628977826, + "grad_norm": 0.763024091720581, + "learning_rate": 0.00017707364224007734, + "loss": 2.621, + "step": 3800 + }, + { + "epoch": 0.3443636610722294, + "grad_norm": 0.6953292489051819, + "learning_rate": 0.00017706760103908657, + "loss": 2.3012, + "step": 3801 + }, + { + "epoch": 0.34445425924667616, + "grad_norm": 0.716913640499115, + "learning_rate": 0.00017706155983809584, + "loss": 2.4526, + "step": 3802 + }, + { + "epoch": 0.34454485742112295, + "grad_norm": 0.740067183971405, + "learning_rate": 0.00017705551863710507, + "loss": 3.0369, + "step": 3803 + }, + { + "epoch": 0.34463545559556974, + "grad_norm": 0.8043789267539978, + "learning_rate": 0.0001770494774361143, + "loss": 2.7778, + "step": 3804 + }, + { + "epoch": 0.3447260537700165, + "grad_norm": 0.764805793762207, + "learning_rate": 0.00017704343623512356, + "loss": 3.0492, + "step": 3805 + }, + { + "epoch": 0.3448166519444633, + "grad_norm": 0.8118292689323425, + "learning_rate": 0.0001770373950341328, + "loss": 2.9875, + "step": 3806 + }, + { + "epoch": 0.3449072501189101, + "grad_norm": 0.7649564146995544, + "learning_rate": 0.00017703135383314203, + "loss": 3.1757, + "step": 3807 + }, + { + "epoch": 0.3449978482933569, + "grad_norm": 0.7910007834434509, + "learning_rate": 0.00017702531263215127, + "loss": 2.8538, + "step": 3808 + }, + { + "epoch": 0.3450884464678037, + "grad_norm": 0.63249671459198, + "learning_rate": 0.00017701927143116053, + "loss": 2.1422, + "step": 3809 + }, + { + "epoch": 0.34517904464225047, + "grad_norm": 0.8551173210144043, + "learning_rate": 0.00017701323023016976, + "loss": 2.8474, + "step": 3810 + }, + { + "epoch": 0.34526964281669725, + "grad_norm": 0.740030825138092, + "learning_rate": 0.00017700718902917902, + "loss": 2.9469, + "step": 3811 + }, + { + "epoch": 0.34536024099114404, + "grad_norm": 0.7394329905509949, + "learning_rate": 0.00017700114782818826, + "loss": 2.9081, + "step": 3812 + }, + { + "epoch": 0.34545083916559083, + "grad_norm": 0.8390881419181824, + "learning_rate": 0.0001769951066271975, + "loss": 2.6629, + "step": 3813 + }, + { + "epoch": 0.3455414373400376, + "grad_norm": 0.7793710827827454, + "learning_rate": 0.00017698906542620675, + "loss": 2.7185, + "step": 3814 + }, + { + "epoch": 0.3456320355144844, + "grad_norm": 0.7587452530860901, + "learning_rate": 0.00017698302422521599, + "loss": 2.8257, + "step": 3815 + }, + { + "epoch": 0.3457226336889312, + "grad_norm": 0.7694476842880249, + "learning_rate": 0.00017697698302422522, + "loss": 2.8117, + "step": 3816 + }, + { + "epoch": 0.345813231863378, + "grad_norm": 0.7873779535293579, + "learning_rate": 0.00017697094182323445, + "loss": 2.9203, + "step": 3817 + }, + { + "epoch": 0.3459038300378247, + "grad_norm": 0.7258294224739075, + "learning_rate": 0.00017696490062224372, + "loss": 2.7553, + "step": 3818 + }, + { + "epoch": 0.3459944282122715, + "grad_norm": 0.7850189805030823, + "learning_rate": 0.00017695885942125295, + "loss": 2.9318, + "step": 3819 + }, + { + "epoch": 0.3460850263867183, + "grad_norm": 0.7119935750961304, + "learning_rate": 0.00017695281822026218, + "loss": 2.8159, + "step": 3820 + }, + { + "epoch": 0.3461756245611651, + "grad_norm": 0.7939804196357727, + "learning_rate": 0.00017694677701927144, + "loss": 3.0523, + "step": 3821 + }, + { + "epoch": 0.34626622273561186, + "grad_norm": 0.6385887861251831, + "learning_rate": 0.00017694073581828068, + "loss": 2.2515, + "step": 3822 + }, + { + "epoch": 0.34635682091005865, + "grad_norm": 0.738706111907959, + "learning_rate": 0.00017693469461728994, + "loss": 2.8693, + "step": 3823 + }, + { + "epoch": 0.34644741908450544, + "grad_norm": 0.6510443687438965, + "learning_rate": 0.00017692865341629917, + "loss": 2.1133, + "step": 3824 + }, + { + "epoch": 0.3465380172589522, + "grad_norm": 0.7291322350502014, + "learning_rate": 0.0001769226122153084, + "loss": 2.6439, + "step": 3825 + }, + { + "epoch": 0.346628615433399, + "grad_norm": 0.7645648121833801, + "learning_rate": 0.00017691657101431764, + "loss": 2.7959, + "step": 3826 + }, + { + "epoch": 0.3467192136078458, + "grad_norm": 0.6410036087036133, + "learning_rate": 0.0001769105298133269, + "loss": 2.2062, + "step": 3827 + }, + { + "epoch": 0.3468098117822926, + "grad_norm": 0.7241801023483276, + "learning_rate": 0.00017690448861233614, + "loss": 2.9853, + "step": 3828 + }, + { + "epoch": 0.3469004099567394, + "grad_norm": 0.6996656060218811, + "learning_rate": 0.00017689844741134537, + "loss": 2.6516, + "step": 3829 + }, + { + "epoch": 0.34699100813118616, + "grad_norm": 0.7517542243003845, + "learning_rate": 0.00017689240621035463, + "loss": 2.908, + "step": 3830 + }, + { + "epoch": 0.34708160630563295, + "grad_norm": 0.7108377814292908, + "learning_rate": 0.00017688636500936387, + "loss": 2.7924, + "step": 3831 + }, + { + "epoch": 0.34717220448007974, + "grad_norm": 0.7986987829208374, + "learning_rate": 0.00017688032380837313, + "loss": 2.8599, + "step": 3832 + }, + { + "epoch": 0.3472628026545265, + "grad_norm": 0.7184118032455444, + "learning_rate": 0.00017687428260738233, + "loss": 2.6708, + "step": 3833 + }, + { + "epoch": 0.3473534008289733, + "grad_norm": 0.7376216053962708, + "learning_rate": 0.0001768682414063916, + "loss": 2.8582, + "step": 3834 + }, + { + "epoch": 0.3474439990034201, + "grad_norm": 0.7345595955848694, + "learning_rate": 0.00017686220020540086, + "loss": 3.0893, + "step": 3835 + }, + { + "epoch": 0.3475345971778669, + "grad_norm": 0.7802309393882751, + "learning_rate": 0.0001768561590044101, + "loss": 2.9369, + "step": 3836 + }, + { + "epoch": 0.3476251953523137, + "grad_norm": 0.727522075176239, + "learning_rate": 0.00017685011780341933, + "loss": 2.9124, + "step": 3837 + }, + { + "epoch": 0.3477157935267604, + "grad_norm": 0.7612119317054749, + "learning_rate": 0.00017684407660242856, + "loss": 2.9273, + "step": 3838 + }, + { + "epoch": 0.3478063917012072, + "grad_norm": 0.7205650806427002, + "learning_rate": 0.00017683803540143782, + "loss": 2.8966, + "step": 3839 + }, + { + "epoch": 0.347896989875654, + "grad_norm": 0.758416473865509, + "learning_rate": 0.00017683199420044705, + "loss": 2.8883, + "step": 3840 + }, + { + "epoch": 0.34798758805010077, + "grad_norm": 0.756216287612915, + "learning_rate": 0.0001768259529994563, + "loss": 2.8529, + "step": 3841 + }, + { + "epoch": 0.34807818622454756, + "grad_norm": 0.7180405855178833, + "learning_rate": 0.00017681991179846555, + "loss": 2.6277, + "step": 3842 + }, + { + "epoch": 0.34816878439899435, + "grad_norm": 0.8089472651481628, + "learning_rate": 0.00017681387059747478, + "loss": 2.2943, + "step": 3843 + }, + { + "epoch": 0.34825938257344113, + "grad_norm": 0.7170594930648804, + "learning_rate": 0.00017680782939648404, + "loss": 2.801, + "step": 3844 + }, + { + "epoch": 0.3483499807478879, + "grad_norm": 0.7774484753608704, + "learning_rate": 0.00017680178819549328, + "loss": 2.8248, + "step": 3845 + }, + { + "epoch": 0.3484405789223347, + "grad_norm": 0.7157157063484192, + "learning_rate": 0.0001767957469945025, + "loss": 2.8772, + "step": 3846 + }, + { + "epoch": 0.3485311770967815, + "grad_norm": 0.7472075819969177, + "learning_rate": 0.00017678970579351175, + "loss": 2.8377, + "step": 3847 + }, + { + "epoch": 0.3486217752712283, + "grad_norm": 0.6964568495750427, + "learning_rate": 0.000176783664592521, + "loss": 2.6895, + "step": 3848 + }, + { + "epoch": 0.3487123734456751, + "grad_norm": 0.7829379439353943, + "learning_rate": 0.00017677762339153024, + "loss": 2.6866, + "step": 3849 + }, + { + "epoch": 0.34880297162012186, + "grad_norm": 0.742275059223175, + "learning_rate": 0.00017677158219053948, + "loss": 2.617, + "step": 3850 + }, + { + "epoch": 0.34889356979456865, + "grad_norm": 0.7688239216804504, + "learning_rate": 0.00017676554098954874, + "loss": 2.7507, + "step": 3851 + }, + { + "epoch": 0.34898416796901544, + "grad_norm": 0.7199180126190186, + "learning_rate": 0.00017675949978855797, + "loss": 2.9053, + "step": 3852 + }, + { + "epoch": 0.3490747661434622, + "grad_norm": 0.8041924238204956, + "learning_rate": 0.00017675345858756723, + "loss": 3.0321, + "step": 3853 + }, + { + "epoch": 0.349165364317909, + "grad_norm": 0.7541585564613342, + "learning_rate": 0.00017674741738657644, + "loss": 3.021, + "step": 3854 + }, + { + "epoch": 0.3492559624923558, + "grad_norm": 0.8400750756263733, + "learning_rate": 0.0001767413761855857, + "loss": 2.8949, + "step": 3855 + }, + { + "epoch": 0.3493465606668026, + "grad_norm": 0.7679420113563538, + "learning_rate": 0.00017673533498459493, + "loss": 2.7189, + "step": 3856 + }, + { + "epoch": 0.3494371588412494, + "grad_norm": 0.6609108448028564, + "learning_rate": 0.0001767292937836042, + "loss": 2.0064, + "step": 3857 + }, + { + "epoch": 0.34952775701569616, + "grad_norm": 0.8025717735290527, + "learning_rate": 0.00017672325258261343, + "loss": 2.0062, + "step": 3858 + }, + { + "epoch": 0.3496183551901429, + "grad_norm": 0.719890296459198, + "learning_rate": 0.00017671721138162266, + "loss": 2.7051, + "step": 3859 + }, + { + "epoch": 0.3497089533645897, + "grad_norm": 0.7142035961151123, + "learning_rate": 0.00017671117018063193, + "loss": 2.7822, + "step": 3860 + }, + { + "epoch": 0.34979955153903647, + "grad_norm": 0.7605220079421997, + "learning_rate": 0.00017670512897964116, + "loss": 2.8226, + "step": 3861 + }, + { + "epoch": 0.34989014971348326, + "grad_norm": 0.7424812316894531, + "learning_rate": 0.00017669908777865042, + "loss": 2.832, + "step": 3862 + }, + { + "epoch": 0.34998074788793004, + "grad_norm": 0.7892100214958191, + "learning_rate": 0.00017669304657765963, + "loss": 2.7737, + "step": 3863 + }, + { + "epoch": 0.35007134606237683, + "grad_norm": 0.7124531865119934, + "learning_rate": 0.0001766870053766689, + "loss": 2.3412, + "step": 3864 + }, + { + "epoch": 0.3501619442368236, + "grad_norm": 0.7867082953453064, + "learning_rate": 0.00017668096417567815, + "loss": 2.7893, + "step": 3865 + }, + { + "epoch": 0.3502525424112704, + "grad_norm": 0.6884182095527649, + "learning_rate": 0.00017667492297468738, + "loss": 2.2079, + "step": 3866 + }, + { + "epoch": 0.3503431405857172, + "grad_norm": 0.7041537165641785, + "learning_rate": 0.00017666888177369662, + "loss": 2.7587, + "step": 3867 + }, + { + "epoch": 0.350433738760164, + "grad_norm": 0.6839448809623718, + "learning_rate": 0.00017666284057270585, + "loss": 2.0121, + "step": 3868 + }, + { + "epoch": 0.35052433693461077, + "grad_norm": 0.7957323789596558, + "learning_rate": 0.0001766567993717151, + "loss": 2.6649, + "step": 3869 + }, + { + "epoch": 0.35061493510905756, + "grad_norm": 0.783038854598999, + "learning_rate": 0.00017665075817072435, + "loss": 2.8, + "step": 3870 + }, + { + "epoch": 0.35070553328350434, + "grad_norm": 0.7169252634048462, + "learning_rate": 0.00017664471696973358, + "loss": 2.751, + "step": 3871 + }, + { + "epoch": 0.35079613145795113, + "grad_norm": 0.750702440738678, + "learning_rate": 0.00017663867576874284, + "loss": 2.762, + "step": 3872 + }, + { + "epoch": 0.3508867296323979, + "grad_norm": 0.7794514894485474, + "learning_rate": 0.00017663263456775208, + "loss": 2.8967, + "step": 3873 + }, + { + "epoch": 0.3509773278068447, + "grad_norm": 0.7845126986503601, + "learning_rate": 0.00017662659336676134, + "loss": 2.8057, + "step": 3874 + }, + { + "epoch": 0.3510679259812915, + "grad_norm": 0.7565348744392395, + "learning_rate": 0.00017662055216577057, + "loss": 2.9095, + "step": 3875 + }, + { + "epoch": 0.3511585241557383, + "grad_norm": 0.7624666690826416, + "learning_rate": 0.0001766145109647798, + "loss": 2.8547, + "step": 3876 + }, + { + "epoch": 0.35124912233018507, + "grad_norm": 0.7318120002746582, + "learning_rate": 0.00017660846976378904, + "loss": 2.9262, + "step": 3877 + }, + { + "epoch": 0.35133972050463186, + "grad_norm": 0.6959015727043152, + "learning_rate": 0.0001766024285627983, + "loss": 2.6447, + "step": 3878 + }, + { + "epoch": 0.3514303186790786, + "grad_norm": 0.7321987748146057, + "learning_rate": 0.00017659638736180753, + "loss": 2.7116, + "step": 3879 + }, + { + "epoch": 0.3515209168535254, + "grad_norm": 0.7123871445655823, + "learning_rate": 0.00017659034616081677, + "loss": 2.6773, + "step": 3880 + }, + { + "epoch": 0.35161151502797217, + "grad_norm": 0.7583828568458557, + "learning_rate": 0.00017658430495982603, + "loss": 2.8542, + "step": 3881 + }, + { + "epoch": 0.35170211320241895, + "grad_norm": 0.697283923625946, + "learning_rate": 0.00017657826375883526, + "loss": 2.616, + "step": 3882 + }, + { + "epoch": 0.35179271137686574, + "grad_norm": 0.8319084644317627, + "learning_rate": 0.00017657222255784453, + "loss": 2.7873, + "step": 3883 + }, + { + "epoch": 0.35188330955131253, + "grad_norm": 0.736219584941864, + "learning_rate": 0.00017656618135685373, + "loss": 2.9863, + "step": 3884 + }, + { + "epoch": 0.3519739077257593, + "grad_norm": 0.7634362578392029, + "learning_rate": 0.000176560140155863, + "loss": 2.8276, + "step": 3885 + }, + { + "epoch": 0.3520645059002061, + "grad_norm": 0.7401694059371948, + "learning_rate": 0.00017655409895487223, + "loss": 2.7418, + "step": 3886 + }, + { + "epoch": 0.3521551040746529, + "grad_norm": 0.7136375904083252, + "learning_rate": 0.0001765480577538815, + "loss": 2.6852, + "step": 3887 + }, + { + "epoch": 0.3522457022490997, + "grad_norm": 0.7470219135284424, + "learning_rate": 0.00017654201655289072, + "loss": 3.0099, + "step": 3888 + }, + { + "epoch": 0.35233630042354647, + "grad_norm": 0.7690262794494629, + "learning_rate": 0.00017653597535189996, + "loss": 2.9042, + "step": 3889 + }, + { + "epoch": 0.35242689859799325, + "grad_norm": 0.8976293206214905, + "learning_rate": 0.00017652993415090922, + "loss": 3.0275, + "step": 3890 + }, + { + "epoch": 0.35251749677244004, + "grad_norm": 0.8013485074043274, + "learning_rate": 0.00017652389294991845, + "loss": 2.7293, + "step": 3891 + }, + { + "epoch": 0.35260809494688683, + "grad_norm": 0.8352347016334534, + "learning_rate": 0.00017651785174892769, + "loss": 3.315, + "step": 3892 + }, + { + "epoch": 0.3526986931213336, + "grad_norm": 0.7350513339042664, + "learning_rate": 0.00017651181054793692, + "loss": 3.2064, + "step": 3893 + }, + { + "epoch": 0.3527892912957804, + "grad_norm": 0.7904208898544312, + "learning_rate": 0.00017650576934694618, + "loss": 2.968, + "step": 3894 + }, + { + "epoch": 0.3528798894702272, + "grad_norm": 0.7127887606620789, + "learning_rate": 0.00017649972814595544, + "loss": 2.7631, + "step": 3895 + }, + { + "epoch": 0.352970487644674, + "grad_norm": 0.7595031261444092, + "learning_rate": 0.00017649368694496468, + "loss": 2.8765, + "step": 3896 + }, + { + "epoch": 0.35306108581912077, + "grad_norm": 0.7167959213256836, + "learning_rate": 0.0001764876457439739, + "loss": 3.064, + "step": 3897 + }, + { + "epoch": 0.35315168399356756, + "grad_norm": 0.810724139213562, + "learning_rate": 0.00017648160454298314, + "loss": 2.9424, + "step": 3898 + }, + { + "epoch": 0.3532422821680143, + "grad_norm": 0.8095436096191406, + "learning_rate": 0.0001764755633419924, + "loss": 2.6523, + "step": 3899 + }, + { + "epoch": 0.3533328803424611, + "grad_norm": 0.7676697373390198, + "learning_rate": 0.00017646952214100164, + "loss": 2.7539, + "step": 3900 + }, + { + "epoch": 0.35342347851690786, + "grad_norm": 0.66880863904953, + "learning_rate": 0.00017646348094001087, + "loss": 2.0744, + "step": 3901 + }, + { + "epoch": 0.35351407669135465, + "grad_norm": 0.7094044089317322, + "learning_rate": 0.00017645743973902013, + "loss": 2.8093, + "step": 3902 + }, + { + "epoch": 0.35360467486580144, + "grad_norm": 0.770904541015625, + "learning_rate": 0.00017645139853802937, + "loss": 2.8956, + "step": 3903 + }, + { + "epoch": 0.3536952730402482, + "grad_norm": 0.7559304237365723, + "learning_rate": 0.00017644535733703863, + "loss": 2.688, + "step": 3904 + }, + { + "epoch": 0.353785871214695, + "grad_norm": 0.7482321858406067, + "learning_rate": 0.00017643931613604784, + "loss": 2.7931, + "step": 3905 + }, + { + "epoch": 0.3538764693891418, + "grad_norm": 0.7090814113616943, + "learning_rate": 0.0001764332749350571, + "loss": 2.8019, + "step": 3906 + }, + { + "epoch": 0.3539670675635886, + "grad_norm": 0.7127399444580078, + "learning_rate": 0.00017642723373406633, + "loss": 2.6283, + "step": 3907 + }, + { + "epoch": 0.3540576657380354, + "grad_norm": 0.7673052549362183, + "learning_rate": 0.0001764211925330756, + "loss": 2.7859, + "step": 3908 + }, + { + "epoch": 0.35414826391248216, + "grad_norm": 0.7636913657188416, + "learning_rate": 0.00017641515133208483, + "loss": 2.7596, + "step": 3909 + }, + { + "epoch": 0.35423886208692895, + "grad_norm": 0.8031988143920898, + "learning_rate": 0.00017640911013109406, + "loss": 2.7353, + "step": 3910 + }, + { + "epoch": 0.35432946026137574, + "grad_norm": 0.7248566150665283, + "learning_rate": 0.00017640306893010332, + "loss": 2.7868, + "step": 3911 + }, + { + "epoch": 0.3544200584358225, + "grad_norm": 0.7746075391769409, + "learning_rate": 0.00017639702772911256, + "loss": 2.8165, + "step": 3912 + }, + { + "epoch": 0.3545106566102693, + "grad_norm": 0.7812379002571106, + "learning_rate": 0.00017639098652812182, + "loss": 2.9736, + "step": 3913 + }, + { + "epoch": 0.3546012547847161, + "grad_norm": 0.8135812878608704, + "learning_rate": 0.00017638494532713102, + "loss": 2.7127, + "step": 3914 + }, + { + "epoch": 0.3546918529591629, + "grad_norm": 0.7655229568481445, + "learning_rate": 0.00017637890412614029, + "loss": 3.0635, + "step": 3915 + }, + { + "epoch": 0.3547824511336097, + "grad_norm": 0.7350382208824158, + "learning_rate": 0.00017637286292514952, + "loss": 2.9443, + "step": 3916 + }, + { + "epoch": 0.35487304930805647, + "grad_norm": 0.7956351637840271, + "learning_rate": 0.00017636682172415878, + "loss": 2.9462, + "step": 3917 + }, + { + "epoch": 0.35496364748250325, + "grad_norm": 0.7900421023368835, + "learning_rate": 0.00017636078052316802, + "loss": 2.6986, + "step": 3918 + }, + { + "epoch": 0.35505424565695, + "grad_norm": 0.7415586709976196, + "learning_rate": 0.00017635473932217725, + "loss": 2.6682, + "step": 3919 + }, + { + "epoch": 0.3551448438313968, + "grad_norm": 0.7500978708267212, + "learning_rate": 0.0001763486981211865, + "loss": 2.8077, + "step": 3920 + }, + { + "epoch": 0.35523544200584356, + "grad_norm": 0.8015119433403015, + "learning_rate": 0.00017634265692019574, + "loss": 2.8559, + "step": 3921 + }, + { + "epoch": 0.35532604018029035, + "grad_norm": 0.7866809368133545, + "learning_rate": 0.00017633661571920498, + "loss": 2.9222, + "step": 3922 + }, + { + "epoch": 0.35541663835473714, + "grad_norm": 0.8111952543258667, + "learning_rate": 0.0001763305745182142, + "loss": 2.7714, + "step": 3923 + }, + { + "epoch": 0.3555072365291839, + "grad_norm": 0.7930363416671753, + "learning_rate": 0.00017632453331722347, + "loss": 3.0165, + "step": 3924 + }, + { + "epoch": 0.3555978347036307, + "grad_norm": 0.7966210246086121, + "learning_rate": 0.00017631849211623273, + "loss": 2.8485, + "step": 3925 + }, + { + "epoch": 0.3556884328780775, + "grad_norm": 0.6752117276191711, + "learning_rate": 0.00017631245091524197, + "loss": 2.1132, + "step": 3926 + }, + { + "epoch": 0.3557790310525243, + "grad_norm": 0.7312090992927551, + "learning_rate": 0.0001763064097142512, + "loss": 2.8436, + "step": 3927 + }, + { + "epoch": 0.3558696292269711, + "grad_norm": 0.7096377015113831, + "learning_rate": 0.00017630036851326044, + "loss": 2.8329, + "step": 3928 + }, + { + "epoch": 0.35596022740141786, + "grad_norm": 0.7820291519165039, + "learning_rate": 0.0001762943273122697, + "loss": 2.9116, + "step": 3929 + }, + { + "epoch": 0.35605082557586465, + "grad_norm": 0.7527021169662476, + "learning_rate": 0.00017628828611127893, + "loss": 2.6697, + "step": 3930 + }, + { + "epoch": 0.35614142375031144, + "grad_norm": 0.7068663835525513, + "learning_rate": 0.00017628224491028817, + "loss": 2.6083, + "step": 3931 + }, + { + "epoch": 0.3562320219247582, + "grad_norm": 0.7447174191474915, + "learning_rate": 0.00017627620370929743, + "loss": 2.7167, + "step": 3932 + }, + { + "epoch": 0.356322620099205, + "grad_norm": 0.7950233817100525, + "learning_rate": 0.00017627016250830666, + "loss": 2.7755, + "step": 3933 + }, + { + "epoch": 0.3564132182736518, + "grad_norm": 0.7016203999519348, + "learning_rate": 0.00017626412130731592, + "loss": 2.6873, + "step": 3934 + }, + { + "epoch": 0.3565038164480986, + "grad_norm": 0.8354222774505615, + "learning_rate": 0.00017625808010632513, + "loss": 2.9742, + "step": 3935 + }, + { + "epoch": 0.3565944146225454, + "grad_norm": 0.8018917441368103, + "learning_rate": 0.0001762520389053344, + "loss": 3.1507, + "step": 3936 + }, + { + "epoch": 0.35668501279699216, + "grad_norm": 0.7198756337165833, + "learning_rate": 0.00017624599770434362, + "loss": 2.8854, + "step": 3937 + }, + { + "epoch": 0.35677561097143895, + "grad_norm": 0.772811233997345, + "learning_rate": 0.00017623995650335289, + "loss": 3.0525, + "step": 3938 + }, + { + "epoch": 0.35686620914588574, + "grad_norm": 0.7559927105903625, + "learning_rate": 0.00017623391530236212, + "loss": 2.7165, + "step": 3939 + }, + { + "epoch": 0.35695680732033247, + "grad_norm": 0.7346850633621216, + "learning_rate": 0.00017622787410137135, + "loss": 3.037, + "step": 3940 + }, + { + "epoch": 0.35704740549477926, + "grad_norm": 0.7090020775794983, + "learning_rate": 0.00017622183290038062, + "loss": 2.8059, + "step": 3941 + }, + { + "epoch": 0.35713800366922605, + "grad_norm": 0.7064149379730225, + "learning_rate": 0.00017621579169938985, + "loss": 2.778, + "step": 3942 + }, + { + "epoch": 0.35722860184367283, + "grad_norm": 0.7299533486366272, + "learning_rate": 0.00017620975049839908, + "loss": 2.9005, + "step": 3943 + }, + { + "epoch": 0.3573192000181196, + "grad_norm": 2.26491379737854, + "learning_rate": 0.00017620370929740832, + "loss": 3.0245, + "step": 3944 + }, + { + "epoch": 0.3574097981925664, + "grad_norm": 0.7544460296630859, + "learning_rate": 0.00017619766809641758, + "loss": 2.8943, + "step": 3945 + }, + { + "epoch": 0.3575003963670132, + "grad_norm": 0.7907846570014954, + "learning_rate": 0.0001761916268954268, + "loss": 2.8323, + "step": 3946 + }, + { + "epoch": 0.35759099454146, + "grad_norm": 0.6809419989585876, + "learning_rate": 0.00017618558569443607, + "loss": 2.5695, + "step": 3947 + }, + { + "epoch": 0.35768159271590677, + "grad_norm": 0.7199592590332031, + "learning_rate": 0.0001761795444934453, + "loss": 3.067, + "step": 3948 + }, + { + "epoch": 0.35777219089035356, + "grad_norm": 0.7539077401161194, + "learning_rate": 0.00017617350329245454, + "loss": 3.0124, + "step": 3949 + }, + { + "epoch": 0.35786278906480035, + "grad_norm": 0.7326184511184692, + "learning_rate": 0.0001761674620914638, + "loss": 2.6347, + "step": 3950 + }, + { + "epoch": 0.35795338723924713, + "grad_norm": 0.7839102745056152, + "learning_rate": 0.00017616142089047304, + "loss": 2.9522, + "step": 3951 + }, + { + "epoch": 0.3580439854136939, + "grad_norm": 0.787121057510376, + "learning_rate": 0.00017615537968948227, + "loss": 2.7612, + "step": 3952 + }, + { + "epoch": 0.3581345835881407, + "grad_norm": 0.7598521709442139, + "learning_rate": 0.0001761493384884915, + "loss": 2.8731, + "step": 3953 + }, + { + "epoch": 0.3582251817625875, + "grad_norm": 0.7327462434768677, + "learning_rate": 0.00017614329728750077, + "loss": 3.0409, + "step": 3954 + }, + { + "epoch": 0.3583157799370343, + "grad_norm": 0.7675254940986633, + "learning_rate": 0.00017613725608651003, + "loss": 2.8576, + "step": 3955 + }, + { + "epoch": 0.35840637811148107, + "grad_norm": 0.7359346747398376, + "learning_rate": 0.00017613121488551923, + "loss": 2.6936, + "step": 3956 + }, + { + "epoch": 0.35849697628592786, + "grad_norm": 0.7690472602844238, + "learning_rate": 0.0001761251736845285, + "loss": 2.8816, + "step": 3957 + }, + { + "epoch": 0.35858757446037465, + "grad_norm": 0.7818939089775085, + "learning_rate": 0.00017611913248353773, + "loss": 2.866, + "step": 3958 + }, + { + "epoch": 0.35867817263482143, + "grad_norm": 0.7493080496788025, + "learning_rate": 0.000176113091282547, + "loss": 2.7976, + "step": 3959 + }, + { + "epoch": 0.35876877080926817, + "grad_norm": 0.8042757511138916, + "learning_rate": 0.00017610705008155622, + "loss": 2.9755, + "step": 3960 + }, + { + "epoch": 0.35885936898371495, + "grad_norm": 0.7866727113723755, + "learning_rate": 0.00017610100888056546, + "loss": 2.4019, + "step": 3961 + }, + { + "epoch": 0.35894996715816174, + "grad_norm": 0.7148867845535278, + "learning_rate": 0.00017609496767957472, + "loss": 3.0044, + "step": 3962 + }, + { + "epoch": 0.35904056533260853, + "grad_norm": 0.7097441554069519, + "learning_rate": 0.00017608892647858395, + "loss": 2.9149, + "step": 3963 + }, + { + "epoch": 0.3591311635070553, + "grad_norm": 0.6834418177604675, + "learning_rate": 0.0001760828852775932, + "loss": 2.6521, + "step": 3964 + }, + { + "epoch": 0.3592217616815021, + "grad_norm": 0.728523850440979, + "learning_rate": 0.00017607684407660242, + "loss": 2.9216, + "step": 3965 + }, + { + "epoch": 0.3593123598559489, + "grad_norm": 0.7132214903831482, + "learning_rate": 0.00017607080287561168, + "loss": 2.6617, + "step": 3966 + }, + { + "epoch": 0.3594029580303957, + "grad_norm": 0.7717399597167969, + "learning_rate": 0.00017606476167462092, + "loss": 2.9521, + "step": 3967 + }, + { + "epoch": 0.35949355620484247, + "grad_norm": 0.7030473947525024, + "learning_rate": 0.00017605872047363018, + "loss": 2.707, + "step": 3968 + }, + { + "epoch": 0.35958415437928926, + "grad_norm": 0.8162513971328735, + "learning_rate": 0.00017605267927263939, + "loss": 2.9127, + "step": 3969 + }, + { + "epoch": 0.35967475255373604, + "grad_norm": 0.7288877367973328, + "learning_rate": 0.00017604663807164865, + "loss": 2.8708, + "step": 3970 + }, + { + "epoch": 0.35976535072818283, + "grad_norm": 0.7464533448219299, + "learning_rate": 0.0001760405968706579, + "loss": 2.9095, + "step": 3971 + }, + { + "epoch": 0.3598559489026296, + "grad_norm": 0.8026579022407532, + "learning_rate": 0.00017603455566966714, + "loss": 2.5555, + "step": 3972 + }, + { + "epoch": 0.3599465470770764, + "grad_norm": 0.7164578437805176, + "learning_rate": 0.00017602851446867638, + "loss": 2.8996, + "step": 3973 + }, + { + "epoch": 0.3600371452515232, + "grad_norm": 0.7692656517028809, + "learning_rate": 0.0001760224732676856, + "loss": 2.8773, + "step": 3974 + }, + { + "epoch": 0.36012774342597, + "grad_norm": 0.7646166086196899, + "learning_rate": 0.00017601643206669487, + "loss": 2.8122, + "step": 3975 + }, + { + "epoch": 0.36021834160041677, + "grad_norm": 0.7635319828987122, + "learning_rate": 0.0001760103908657041, + "loss": 2.7748, + "step": 3976 + }, + { + "epoch": 0.36030893977486356, + "grad_norm": 0.7433458566665649, + "learning_rate": 0.00017600434966471334, + "loss": 2.6448, + "step": 3977 + }, + { + "epoch": 0.36039953794931034, + "grad_norm": 0.780421257019043, + "learning_rate": 0.0001759983084637226, + "loss": 2.9692, + "step": 3978 + }, + { + "epoch": 0.36049013612375713, + "grad_norm": 0.7635530233383179, + "learning_rate": 0.00017599226726273183, + "loss": 3.011, + "step": 3979 + }, + { + "epoch": 0.36058073429820386, + "grad_norm": 0.6831689476966858, + "learning_rate": 0.0001759862260617411, + "loss": 2.8686, + "step": 3980 + }, + { + "epoch": 0.36067133247265065, + "grad_norm": 0.7502506375312805, + "learning_rate": 0.00017598018486075033, + "loss": 2.8488, + "step": 3981 + }, + { + "epoch": 0.36076193064709744, + "grad_norm": 0.734369695186615, + "learning_rate": 0.00017597414365975956, + "loss": 2.9975, + "step": 3982 + }, + { + "epoch": 0.3608525288215442, + "grad_norm": 0.7395163774490356, + "learning_rate": 0.0001759681024587688, + "loss": 2.7804, + "step": 3983 + }, + { + "epoch": 0.360943126995991, + "grad_norm": 0.7287874221801758, + "learning_rate": 0.00017596206125777806, + "loss": 2.9314, + "step": 3984 + }, + { + "epoch": 0.3610337251704378, + "grad_norm": 0.6867093443870544, + "learning_rate": 0.00017595602005678732, + "loss": 2.7603, + "step": 3985 + }, + { + "epoch": 0.3611243233448846, + "grad_norm": 0.7911293506622314, + "learning_rate": 0.00017594997885579653, + "loss": 2.9603, + "step": 3986 + }, + { + "epoch": 0.3612149215193314, + "grad_norm": 0.8380505442619324, + "learning_rate": 0.0001759439376548058, + "loss": 2.7919, + "step": 3987 + }, + { + "epoch": 0.36130551969377817, + "grad_norm": 0.7546809911727905, + "learning_rate": 0.00017593789645381502, + "loss": 2.8001, + "step": 3988 + }, + { + "epoch": 0.36139611786822495, + "grad_norm": 0.670966386795044, + "learning_rate": 0.00017593185525282428, + "loss": 1.9791, + "step": 3989 + }, + { + "epoch": 0.36148671604267174, + "grad_norm": 0.7214210033416748, + "learning_rate": 0.00017592581405183352, + "loss": 2.7506, + "step": 3990 + }, + { + "epoch": 0.36157731421711853, + "grad_norm": 0.7929299473762512, + "learning_rate": 0.00017591977285084275, + "loss": 2.8045, + "step": 3991 + }, + { + "epoch": 0.3616679123915653, + "grad_norm": 0.7308299541473389, + "learning_rate": 0.000175913731649852, + "loss": 2.7967, + "step": 3992 + }, + { + "epoch": 0.3617585105660121, + "grad_norm": 0.7308790683746338, + "learning_rate": 0.00017590769044886125, + "loss": 2.9481, + "step": 3993 + }, + { + "epoch": 0.3618491087404589, + "grad_norm": 0.7107953429222107, + "learning_rate": 0.00017590164924787048, + "loss": 2.528, + "step": 3994 + }, + { + "epoch": 0.3619397069149057, + "grad_norm": 0.7067890763282776, + "learning_rate": 0.00017589560804687971, + "loss": 2.6625, + "step": 3995 + }, + { + "epoch": 0.36203030508935247, + "grad_norm": 0.7772632837295532, + "learning_rate": 0.00017588956684588898, + "loss": 2.8255, + "step": 3996 + }, + { + "epoch": 0.36212090326379925, + "grad_norm": 0.7736832499504089, + "learning_rate": 0.0001758835256448982, + "loss": 2.8931, + "step": 3997 + }, + { + "epoch": 0.36221150143824604, + "grad_norm": 0.7521626949310303, + "learning_rate": 0.00017587748444390747, + "loss": 2.3014, + "step": 3998 + }, + { + "epoch": 0.36230209961269283, + "grad_norm": 0.7270371913909912, + "learning_rate": 0.00017587144324291668, + "loss": 2.082, + "step": 3999 + }, + { + "epoch": 0.36239269778713956, + "grad_norm": 0.7079955339431763, + "learning_rate": 0.00017586540204192594, + "loss": 2.5788, + "step": 4000 + }, + { + "epoch": 0.36248329596158635, + "grad_norm": 0.764848530292511, + "learning_rate": 0.0001758593608409352, + "loss": 3.0002, + "step": 4001 + }, + { + "epoch": 0.36257389413603314, + "grad_norm": 0.7482439279556274, + "learning_rate": 0.00017585331963994443, + "loss": 2.6215, + "step": 4002 + }, + { + "epoch": 0.3626644923104799, + "grad_norm": 0.727623462677002, + "learning_rate": 0.00017584727843895367, + "loss": 2.7549, + "step": 4003 + }, + { + "epoch": 0.3627550904849267, + "grad_norm": 0.7651418447494507, + "learning_rate": 0.0001758412372379629, + "loss": 2.8469, + "step": 4004 + }, + { + "epoch": 0.3628456886593735, + "grad_norm": 0.7550287842750549, + "learning_rate": 0.00017583519603697216, + "loss": 2.7927, + "step": 4005 + }, + { + "epoch": 0.3629362868338203, + "grad_norm": 0.8219336271286011, + "learning_rate": 0.0001758291548359814, + "loss": 2.7903, + "step": 4006 + }, + { + "epoch": 0.3630268850082671, + "grad_norm": 0.7056002020835876, + "learning_rate": 0.00017582311363499063, + "loss": 2.6483, + "step": 4007 + }, + { + "epoch": 0.36311748318271386, + "grad_norm": 0.7250471711158752, + "learning_rate": 0.0001758170724339999, + "loss": 2.8961, + "step": 4008 + }, + { + "epoch": 0.36320808135716065, + "grad_norm": 0.7881485223770142, + "learning_rate": 0.00017581103123300913, + "loss": 2.9339, + "step": 4009 + }, + { + "epoch": 0.36329867953160744, + "grad_norm": 0.7329698204994202, + "learning_rate": 0.0001758049900320184, + "loss": 2.7993, + "step": 4010 + }, + { + "epoch": 0.3633892777060542, + "grad_norm": 0.7692073583602905, + "learning_rate": 0.00017579894883102762, + "loss": 2.8357, + "step": 4011 + }, + { + "epoch": 0.363479875880501, + "grad_norm": 0.7265939116477966, + "learning_rate": 0.00017579290763003686, + "loss": 2.9515, + "step": 4012 + }, + { + "epoch": 0.3635704740549478, + "grad_norm": 0.689671516418457, + "learning_rate": 0.0001757868664290461, + "loss": 2.323, + "step": 4013 + }, + { + "epoch": 0.3636610722293946, + "grad_norm": 0.7437949776649475, + "learning_rate": 0.00017578082522805535, + "loss": 2.6707, + "step": 4014 + }, + { + "epoch": 0.3637516704038414, + "grad_norm": 0.7640599012374878, + "learning_rate": 0.00017577478402706459, + "loss": 2.8471, + "step": 4015 + }, + { + "epoch": 0.36384226857828816, + "grad_norm": 0.719760000705719, + "learning_rate": 0.00017576874282607382, + "loss": 2.974, + "step": 4016 + }, + { + "epoch": 0.36393286675273495, + "grad_norm": 0.7355942130088806, + "learning_rate": 0.00017576270162508308, + "loss": 2.8674, + "step": 4017 + }, + { + "epoch": 0.36402346492718174, + "grad_norm": 0.755924642086029, + "learning_rate": 0.00017575666042409231, + "loss": 2.9568, + "step": 4018 + }, + { + "epoch": 0.3641140631016285, + "grad_norm": 0.733121395111084, + "learning_rate": 0.00017575061922310158, + "loss": 3.0123, + "step": 4019 + }, + { + "epoch": 0.3642046612760753, + "grad_norm": 0.7243371605873108, + "learning_rate": 0.00017574457802211078, + "loss": 2.8389, + "step": 4020 + }, + { + "epoch": 0.36429525945052205, + "grad_norm": 0.7452646493911743, + "learning_rate": 0.00017573853682112004, + "loss": 2.8798, + "step": 4021 + }, + { + "epoch": 0.36438585762496883, + "grad_norm": 0.7783543467521667, + "learning_rate": 0.0001757324956201293, + "loss": 2.9264, + "step": 4022 + }, + { + "epoch": 0.3644764557994156, + "grad_norm": 0.7395209074020386, + "learning_rate": 0.00017572645441913854, + "loss": 2.8406, + "step": 4023 + }, + { + "epoch": 0.3645670539738624, + "grad_norm": 0.7639445066452026, + "learning_rate": 0.00017572041321814777, + "loss": 2.732, + "step": 4024 + }, + { + "epoch": 0.3646576521483092, + "grad_norm": 0.7298175692558289, + "learning_rate": 0.000175714372017157, + "loss": 2.9722, + "step": 4025 + }, + { + "epoch": 0.364748250322756, + "grad_norm": 0.7737180590629578, + "learning_rate": 0.00017570833081616627, + "loss": 3.1661, + "step": 4026 + }, + { + "epoch": 0.3648388484972028, + "grad_norm": 0.7293865084648132, + "learning_rate": 0.0001757022896151755, + "loss": 2.5613, + "step": 4027 + }, + { + "epoch": 0.36492944667164956, + "grad_norm": 0.8181026577949524, + "learning_rate": 0.00017569624841418474, + "loss": 2.7953, + "step": 4028 + }, + { + "epoch": 0.36502004484609635, + "grad_norm": 0.7783385515213013, + "learning_rate": 0.000175690207213194, + "loss": 2.5451, + "step": 4029 + }, + { + "epoch": 0.36511064302054314, + "grad_norm": 0.6288084983825684, + "learning_rate": 0.00017568416601220323, + "loss": 2.1826, + "step": 4030 + }, + { + "epoch": 0.3652012411949899, + "grad_norm": 0.7205748558044434, + "learning_rate": 0.0001756781248112125, + "loss": 2.7172, + "step": 4031 + }, + { + "epoch": 0.3652918393694367, + "grad_norm": 0.8074316382408142, + "learning_rate": 0.00017567208361022173, + "loss": 2.9359, + "step": 4032 + }, + { + "epoch": 0.3653824375438835, + "grad_norm": 0.7316029667854309, + "learning_rate": 0.00017566604240923096, + "loss": 2.7978, + "step": 4033 + }, + { + "epoch": 0.3654730357183303, + "grad_norm": 0.7568401098251343, + "learning_rate": 0.0001756600012082402, + "loss": 2.9683, + "step": 4034 + }, + { + "epoch": 0.3655636338927771, + "grad_norm": 0.6761384606361389, + "learning_rate": 0.00017565396000724946, + "loss": 2.2535, + "step": 4035 + }, + { + "epoch": 0.36565423206722386, + "grad_norm": 0.7735591530799866, + "learning_rate": 0.0001756479188062587, + "loss": 2.8964, + "step": 4036 + }, + { + "epoch": 0.36574483024167065, + "grad_norm": 0.5253097414970398, + "learning_rate": 0.00017564187760526792, + "loss": 1.3242, + "step": 4037 + }, + { + "epoch": 0.36583542841611744, + "grad_norm": 0.809453010559082, + "learning_rate": 0.00017563583640427719, + "loss": 2.995, + "step": 4038 + }, + { + "epoch": 0.3659260265905642, + "grad_norm": 0.7724971175193787, + "learning_rate": 0.00017562979520328642, + "loss": 2.7873, + "step": 4039 + }, + { + "epoch": 0.366016624765011, + "grad_norm": 0.6187083721160889, + "learning_rate": 0.00017562375400229568, + "loss": 2.0355, + "step": 4040 + }, + { + "epoch": 0.36610722293945774, + "grad_norm": 0.7292988300323486, + "learning_rate": 0.0001756177128013049, + "loss": 2.9108, + "step": 4041 + }, + { + "epoch": 0.36619782111390453, + "grad_norm": 0.8201996088027954, + "learning_rate": 0.00017561167160031415, + "loss": 2.6461, + "step": 4042 + }, + { + "epoch": 0.3662884192883513, + "grad_norm": 0.7520164847373962, + "learning_rate": 0.00017560563039932338, + "loss": 2.7857, + "step": 4043 + }, + { + "epoch": 0.3663790174627981, + "grad_norm": 0.7467857599258423, + "learning_rate": 0.00017559958919833264, + "loss": 3.0639, + "step": 4044 + }, + { + "epoch": 0.3664696156372449, + "grad_norm": 0.7650745511054993, + "learning_rate": 0.00017559354799734188, + "loss": 2.7346, + "step": 4045 + }, + { + "epoch": 0.3665602138116917, + "grad_norm": 0.8655344247817993, + "learning_rate": 0.0001755875067963511, + "loss": 2.8138, + "step": 4046 + }, + { + "epoch": 0.36665081198613847, + "grad_norm": 0.7456082105636597, + "learning_rate": 0.00017558146559536037, + "loss": 2.7052, + "step": 4047 + }, + { + "epoch": 0.36674141016058526, + "grad_norm": 0.7243279814720154, + "learning_rate": 0.0001755754243943696, + "loss": 2.7381, + "step": 4048 + }, + { + "epoch": 0.36683200833503204, + "grad_norm": 0.76241534948349, + "learning_rate": 0.00017556938319337887, + "loss": 2.7394, + "step": 4049 + }, + { + "epoch": 0.36692260650947883, + "grad_norm": 0.7689653038978577, + "learning_rate": 0.00017556334199238808, + "loss": 2.7992, + "step": 4050 + }, + { + "epoch": 0.3670132046839256, + "grad_norm": 0.7811753153800964, + "learning_rate": 0.00017555730079139734, + "loss": 2.9018, + "step": 4051 + }, + { + "epoch": 0.3671038028583724, + "grad_norm": 0.817703902721405, + "learning_rate": 0.0001755512595904066, + "loss": 3.259, + "step": 4052 + }, + { + "epoch": 0.3671944010328192, + "grad_norm": 0.7419345378875732, + "learning_rate": 0.00017554521838941583, + "loss": 2.838, + "step": 4053 + }, + { + "epoch": 0.367284999207266, + "grad_norm": 0.7469000220298767, + "learning_rate": 0.00017553917718842507, + "loss": 2.9016, + "step": 4054 + }, + { + "epoch": 0.36737559738171277, + "grad_norm": 0.731944739818573, + "learning_rate": 0.0001755331359874343, + "loss": 2.9275, + "step": 4055 + }, + { + "epoch": 0.36746619555615956, + "grad_norm": 0.691196084022522, + "learning_rate": 0.00017552709478644356, + "loss": 2.7705, + "step": 4056 + }, + { + "epoch": 0.36755679373060635, + "grad_norm": 0.6391705274581909, + "learning_rate": 0.0001755210535854528, + "loss": 2.1263, + "step": 4057 + }, + { + "epoch": 0.36764739190505313, + "grad_norm": 0.7323436141014099, + "learning_rate": 0.00017551501238446203, + "loss": 2.954, + "step": 4058 + }, + { + "epoch": 0.3677379900794999, + "grad_norm": 0.7952719330787659, + "learning_rate": 0.0001755089711834713, + "loss": 2.6323, + "step": 4059 + }, + { + "epoch": 0.3678285882539467, + "grad_norm": 0.7252512574195862, + "learning_rate": 0.00017550292998248052, + "loss": 2.7846, + "step": 4060 + }, + { + "epoch": 0.36791918642839344, + "grad_norm": 0.7782475352287292, + "learning_rate": 0.00017549688878148979, + "loss": 3.0865, + "step": 4061 + }, + { + "epoch": 0.36800978460284023, + "grad_norm": 0.7452778816223145, + "learning_rate": 0.00017549084758049902, + "loss": 2.8641, + "step": 4062 + }, + { + "epoch": 0.368100382777287, + "grad_norm": 0.6840134263038635, + "learning_rate": 0.00017548480637950825, + "loss": 2.6792, + "step": 4063 + }, + { + "epoch": 0.3681909809517338, + "grad_norm": 0.7867242693901062, + "learning_rate": 0.0001754787651785175, + "loss": 2.9656, + "step": 4064 + }, + { + "epoch": 0.3682815791261806, + "grad_norm": 0.6875649094581604, + "learning_rate": 0.00017547272397752675, + "loss": 2.1305, + "step": 4065 + }, + { + "epoch": 0.3683721773006274, + "grad_norm": 0.7344654202461243, + "learning_rate": 0.00017546668277653598, + "loss": 2.8666, + "step": 4066 + }, + { + "epoch": 0.36846277547507417, + "grad_norm": 0.8315809369087219, + "learning_rate": 0.00017546064157554522, + "loss": 2.9667, + "step": 4067 + }, + { + "epoch": 0.36855337364952095, + "grad_norm": 0.72691810131073, + "learning_rate": 0.00017545460037455448, + "loss": 2.8847, + "step": 4068 + }, + { + "epoch": 0.36864397182396774, + "grad_norm": 0.7477741241455078, + "learning_rate": 0.0001754485591735637, + "loss": 3.1143, + "step": 4069 + }, + { + "epoch": 0.36873456999841453, + "grad_norm": 0.7759198546409607, + "learning_rate": 0.00017544251797257297, + "loss": 2.8187, + "step": 4070 + }, + { + "epoch": 0.3688251681728613, + "grad_norm": 0.6455305814743042, + "learning_rate": 0.00017543647677158218, + "loss": 2.1363, + "step": 4071 + }, + { + "epoch": 0.3689157663473081, + "grad_norm": 0.7190482020378113, + "learning_rate": 0.00017543043557059144, + "loss": 2.6754, + "step": 4072 + }, + { + "epoch": 0.3690063645217549, + "grad_norm": 0.7574522495269775, + "learning_rate": 0.00017542439436960068, + "loss": 3.0434, + "step": 4073 + }, + { + "epoch": 0.3690969626962017, + "grad_norm": 0.7435604929924011, + "learning_rate": 0.00017541835316860994, + "loss": 2.8512, + "step": 4074 + }, + { + "epoch": 0.36918756087064847, + "grad_norm": 0.715989887714386, + "learning_rate": 0.00017541231196761917, + "loss": 2.7692, + "step": 4075 + }, + { + "epoch": 0.36927815904509526, + "grad_norm": 0.6908864974975586, + "learning_rate": 0.0001754062707666284, + "loss": 2.1947, + "step": 4076 + }, + { + "epoch": 0.36936875721954204, + "grad_norm": 0.8095675110816956, + "learning_rate": 0.00017540022956563767, + "loss": 3.1872, + "step": 4077 + }, + { + "epoch": 0.36945935539398883, + "grad_norm": 0.7694077491760254, + "learning_rate": 0.0001753941883646469, + "loss": 3.2107, + "step": 4078 + }, + { + "epoch": 0.3695499535684356, + "grad_norm": 0.8463581800460815, + "learning_rate": 0.00017538814716365613, + "loss": 2.7856, + "step": 4079 + }, + { + "epoch": 0.3696405517428824, + "grad_norm": 0.7511273622512817, + "learning_rate": 0.00017538210596266537, + "loss": 2.7258, + "step": 4080 + }, + { + "epoch": 0.36973114991732914, + "grad_norm": 0.7372514009475708, + "learning_rate": 0.00017537606476167463, + "loss": 2.7382, + "step": 4081 + }, + { + "epoch": 0.3698217480917759, + "grad_norm": 0.7920445203781128, + "learning_rate": 0.0001753700235606839, + "loss": 2.9121, + "step": 4082 + }, + { + "epoch": 0.3699123462662227, + "grad_norm": 0.7613356113433838, + "learning_rate": 0.00017536398235969312, + "loss": 2.7786, + "step": 4083 + }, + { + "epoch": 0.3700029444406695, + "grad_norm": 0.7621583342552185, + "learning_rate": 0.00017535794115870236, + "loss": 2.9559, + "step": 4084 + }, + { + "epoch": 0.3700935426151163, + "grad_norm": 0.7281734943389893, + "learning_rate": 0.0001753518999577116, + "loss": 3.2083, + "step": 4085 + }, + { + "epoch": 0.3701841407895631, + "grad_norm": 0.6603618264198303, + "learning_rate": 0.00017534585875672085, + "loss": 2.2578, + "step": 4086 + }, + { + "epoch": 0.37027473896400986, + "grad_norm": 0.8111781477928162, + "learning_rate": 0.0001753398175557301, + "loss": 2.8968, + "step": 4087 + }, + { + "epoch": 0.37036533713845665, + "grad_norm": 0.7686830759048462, + "learning_rate": 0.00017533377635473932, + "loss": 2.8164, + "step": 4088 + }, + { + "epoch": 0.37045593531290344, + "grad_norm": 0.7562522292137146, + "learning_rate": 0.00017532773515374858, + "loss": 2.6618, + "step": 4089 + }, + { + "epoch": 0.3705465334873502, + "grad_norm": 0.7601833343505859, + "learning_rate": 0.00017532169395275782, + "loss": 2.9891, + "step": 4090 + }, + { + "epoch": 0.370637131661797, + "grad_norm": 0.7963953018188477, + "learning_rate": 0.00017531565275176708, + "loss": 2.7735, + "step": 4091 + }, + { + "epoch": 0.3707277298362438, + "grad_norm": 0.7334517240524292, + "learning_rate": 0.00017530961155077629, + "loss": 2.44, + "step": 4092 + }, + { + "epoch": 0.3708183280106906, + "grad_norm": 0.7551321387290955, + "learning_rate": 0.00017530357034978555, + "loss": 2.7719, + "step": 4093 + }, + { + "epoch": 0.3709089261851374, + "grad_norm": 0.75123131275177, + "learning_rate": 0.00017529752914879478, + "loss": 2.8239, + "step": 4094 + }, + { + "epoch": 0.37099952435958417, + "grad_norm": 0.6882799863815308, + "learning_rate": 0.00017529148794780404, + "loss": 2.7849, + "step": 4095 + }, + { + "epoch": 0.37109012253403095, + "grad_norm": 0.7355629801750183, + "learning_rate": 0.00017528544674681328, + "loss": 2.6963, + "step": 4096 + }, + { + "epoch": 0.37118072070847774, + "grad_norm": 0.7401911020278931, + "learning_rate": 0.0001752794055458225, + "loss": 2.8266, + "step": 4097 + }, + { + "epoch": 0.37127131888292453, + "grad_norm": 0.7503019571304321, + "learning_rate": 0.00017527336434483177, + "loss": 3.0111, + "step": 4098 + }, + { + "epoch": 0.3713619170573713, + "grad_norm": 0.734286367893219, + "learning_rate": 0.000175267323143841, + "loss": 2.849, + "step": 4099 + }, + { + "epoch": 0.3714525152318181, + "grad_norm": 0.7760733962059021, + "learning_rate": 0.00017526128194285027, + "loss": 2.777, + "step": 4100 + }, + { + "epoch": 0.3715431134062649, + "grad_norm": 0.7300983667373657, + "learning_rate": 0.00017525524074185947, + "loss": 2.7262, + "step": 4101 + }, + { + "epoch": 0.3716337115807116, + "grad_norm": 0.7604205012321472, + "learning_rate": 0.00017524919954086873, + "loss": 2.7011, + "step": 4102 + }, + { + "epoch": 0.3717243097551584, + "grad_norm": 0.8006844520568848, + "learning_rate": 0.00017524315833987797, + "loss": 3.0283, + "step": 4103 + }, + { + "epoch": 0.3718149079296052, + "grad_norm": 0.7367100119590759, + "learning_rate": 0.00017523711713888723, + "loss": 2.5528, + "step": 4104 + }, + { + "epoch": 0.371905506104052, + "grad_norm": 0.7936495542526245, + "learning_rate": 0.00017523107593789646, + "loss": 2.7506, + "step": 4105 + }, + { + "epoch": 0.3719961042784988, + "grad_norm": 0.7408265471458435, + "learning_rate": 0.0001752250347369057, + "loss": 2.9315, + "step": 4106 + }, + { + "epoch": 0.37208670245294556, + "grad_norm": 0.7994784712791443, + "learning_rate": 0.00017521899353591496, + "loss": 3.1186, + "step": 4107 + }, + { + "epoch": 0.37217730062739235, + "grad_norm": 0.7264071702957153, + "learning_rate": 0.0001752129523349242, + "loss": 2.8186, + "step": 4108 + }, + { + "epoch": 0.37226789880183914, + "grad_norm": 0.7540162801742554, + "learning_rate": 0.00017520691113393343, + "loss": 2.6381, + "step": 4109 + }, + { + "epoch": 0.3723584969762859, + "grad_norm": 0.7421356439590454, + "learning_rate": 0.00017520086993294266, + "loss": 2.7212, + "step": 4110 + }, + { + "epoch": 0.3724490951507327, + "grad_norm": 0.6770026683807373, + "learning_rate": 0.00017519482873195192, + "loss": 2.1697, + "step": 4111 + }, + { + "epoch": 0.3725396933251795, + "grad_norm": 0.7147262692451477, + "learning_rate": 0.00017518878753096118, + "loss": 2.0491, + "step": 4112 + }, + { + "epoch": 0.3726302914996263, + "grad_norm": 0.7279869914054871, + "learning_rate": 0.00017518274632997042, + "loss": 2.9164, + "step": 4113 + }, + { + "epoch": 0.3727208896740731, + "grad_norm": 0.7053819894790649, + "learning_rate": 0.00017517670512897965, + "loss": 2.6691, + "step": 4114 + }, + { + "epoch": 0.37281148784851986, + "grad_norm": 0.7532466053962708, + "learning_rate": 0.00017517066392798889, + "loss": 2.8265, + "step": 4115 + }, + { + "epoch": 0.37290208602296665, + "grad_norm": 0.7023757100105286, + "learning_rate": 0.00017516462272699815, + "loss": 2.6124, + "step": 4116 + }, + { + "epoch": 0.37299268419741344, + "grad_norm": 0.6121535301208496, + "learning_rate": 0.00017515858152600738, + "loss": 2.023, + "step": 4117 + }, + { + "epoch": 0.3730832823718602, + "grad_norm": 0.7821834683418274, + "learning_rate": 0.00017515254032501661, + "loss": 2.7383, + "step": 4118 + }, + { + "epoch": 0.373173880546307, + "grad_norm": 0.7287090420722961, + "learning_rate": 0.00017514649912402588, + "loss": 2.7388, + "step": 4119 + }, + { + "epoch": 0.3732644787207538, + "grad_norm": 0.7646805644035339, + "learning_rate": 0.0001751404579230351, + "loss": 2.8258, + "step": 4120 + }, + { + "epoch": 0.3733550768952006, + "grad_norm": 0.7122454643249512, + "learning_rate": 0.00017513441672204437, + "loss": 2.2518, + "step": 4121 + }, + { + "epoch": 0.3734456750696473, + "grad_norm": 0.7568143606185913, + "learning_rate": 0.00017512837552105358, + "loss": 2.867, + "step": 4122 + }, + { + "epoch": 0.3735362732440941, + "grad_norm": 0.7371345162391663, + "learning_rate": 0.00017512233432006284, + "loss": 3.1848, + "step": 4123 + }, + { + "epoch": 0.3736268714185409, + "grad_norm": 0.7667779922485352, + "learning_rate": 0.00017511629311907207, + "loss": 2.7328, + "step": 4124 + }, + { + "epoch": 0.3737174695929877, + "grad_norm": 0.7850649952888489, + "learning_rate": 0.00017511025191808133, + "loss": 1.9089, + "step": 4125 + }, + { + "epoch": 0.37380806776743447, + "grad_norm": 0.7186249494552612, + "learning_rate": 0.00017510421071709057, + "loss": 2.699, + "step": 4126 + }, + { + "epoch": 0.37389866594188126, + "grad_norm": 0.7154771089553833, + "learning_rate": 0.0001750981695160998, + "loss": 2.753, + "step": 4127 + }, + { + "epoch": 0.37398926411632805, + "grad_norm": 0.773963451385498, + "learning_rate": 0.00017509212831510906, + "loss": 2.8287, + "step": 4128 + }, + { + "epoch": 0.37407986229077483, + "grad_norm": 0.6940677165985107, + "learning_rate": 0.0001750860871141183, + "loss": 2.8102, + "step": 4129 + }, + { + "epoch": 0.3741704604652216, + "grad_norm": 0.712415337562561, + "learning_rate": 0.00017508004591312753, + "loss": 2.8111, + "step": 4130 + }, + { + "epoch": 0.3742610586396684, + "grad_norm": 0.7166954874992371, + "learning_rate": 0.00017507400471213677, + "loss": 2.6655, + "step": 4131 + }, + { + "epoch": 0.3743516568141152, + "grad_norm": 0.7499769330024719, + "learning_rate": 0.00017506796351114603, + "loss": 2.927, + "step": 4132 + }, + { + "epoch": 0.374442254988562, + "grad_norm": 0.7615142464637756, + "learning_rate": 0.00017506192231015526, + "loss": 2.9705, + "step": 4133 + }, + { + "epoch": 0.37453285316300877, + "grad_norm": 0.7442064881324768, + "learning_rate": 0.00017505588110916452, + "loss": 2.8463, + "step": 4134 + }, + { + "epoch": 0.37462345133745556, + "grad_norm": 0.7361534237861633, + "learning_rate": 0.00017504983990817376, + "loss": 2.657, + "step": 4135 + }, + { + "epoch": 0.37471404951190235, + "grad_norm": 0.711447536945343, + "learning_rate": 0.000175043798707183, + "loss": 2.536, + "step": 4136 + }, + { + "epoch": 0.37480464768634913, + "grad_norm": 0.8166245222091675, + "learning_rate": 0.00017503775750619225, + "loss": 2.8192, + "step": 4137 + }, + { + "epoch": 0.3748952458607959, + "grad_norm": 0.7344986200332642, + "learning_rate": 0.00017503171630520149, + "loss": 2.826, + "step": 4138 + }, + { + "epoch": 0.3749858440352427, + "grad_norm": 0.6553145051002502, + "learning_rate": 0.00017502567510421072, + "loss": 2.3278, + "step": 4139 + }, + { + "epoch": 0.3750764422096895, + "grad_norm": 0.7713594436645508, + "learning_rate": 0.00017501963390321995, + "loss": 3.074, + "step": 4140 + }, + { + "epoch": 0.3751670403841363, + "grad_norm": 0.7400265336036682, + "learning_rate": 0.00017501359270222921, + "loss": 2.1444, + "step": 4141 + }, + { + "epoch": 0.375257638558583, + "grad_norm": 0.7192687392234802, + "learning_rate": 0.00017500755150123848, + "loss": 2.6706, + "step": 4142 + }, + { + "epoch": 0.3753482367330298, + "grad_norm": 0.8039569854736328, + "learning_rate": 0.00017500151030024768, + "loss": 2.8461, + "step": 4143 + }, + { + "epoch": 0.3754388349074766, + "grad_norm": 0.7381863594055176, + "learning_rate": 0.00017499546909925694, + "loss": 2.7842, + "step": 4144 + }, + { + "epoch": 0.3755294330819234, + "grad_norm": 0.7305881381034851, + "learning_rate": 0.00017498942789826618, + "loss": 2.8339, + "step": 4145 + }, + { + "epoch": 0.37562003125637017, + "grad_norm": 0.7094471454620361, + "learning_rate": 0.00017498338669727544, + "loss": 2.3119, + "step": 4146 + }, + { + "epoch": 0.37571062943081696, + "grad_norm": 0.7511947751045227, + "learning_rate": 0.00017497734549628467, + "loss": 2.8425, + "step": 4147 + }, + { + "epoch": 0.37580122760526374, + "grad_norm": 0.7250003814697266, + "learning_rate": 0.0001749713042952939, + "loss": 2.8482, + "step": 4148 + }, + { + "epoch": 0.37589182577971053, + "grad_norm": 0.5994963049888611, + "learning_rate": 0.00017496526309430317, + "loss": 2.0209, + "step": 4149 + }, + { + "epoch": 0.3759824239541573, + "grad_norm": 0.7344649434089661, + "learning_rate": 0.0001749592218933124, + "loss": 2.7854, + "step": 4150 + }, + { + "epoch": 0.3760730221286041, + "grad_norm": 0.7179079651832581, + "learning_rate": 0.00017495318069232164, + "loss": 2.8081, + "step": 4151 + }, + { + "epoch": 0.3761636203030509, + "grad_norm": 0.7919834852218628, + "learning_rate": 0.00017494713949133087, + "loss": 3.0841, + "step": 4152 + }, + { + "epoch": 0.3762542184774977, + "grad_norm": 0.7530466318130493, + "learning_rate": 0.00017494109829034013, + "loss": 2.8909, + "step": 4153 + }, + { + "epoch": 0.37634481665194447, + "grad_norm": 0.7437158823013306, + "learning_rate": 0.00017493505708934937, + "loss": 2.8731, + "step": 4154 + }, + { + "epoch": 0.37643541482639126, + "grad_norm": 0.6999508142471313, + "learning_rate": 0.00017492901588835863, + "loss": 2.1035, + "step": 4155 + }, + { + "epoch": 0.37652601300083804, + "grad_norm": 0.7718594670295715, + "learning_rate": 0.00017492297468736783, + "loss": 2.7521, + "step": 4156 + }, + { + "epoch": 0.37661661117528483, + "grad_norm": 0.7162525653839111, + "learning_rate": 0.0001749169334863771, + "loss": 2.5955, + "step": 4157 + }, + { + "epoch": 0.3767072093497316, + "grad_norm": 0.814902663230896, + "learning_rate": 0.00017491089228538636, + "loss": 3.0549, + "step": 4158 + }, + { + "epoch": 0.3767978075241784, + "grad_norm": 0.7835222482681274, + "learning_rate": 0.0001749048510843956, + "loss": 2.6563, + "step": 4159 + }, + { + "epoch": 0.3768884056986252, + "grad_norm": 0.6614590287208557, + "learning_rate": 0.00017489880988340482, + "loss": 2.2258, + "step": 4160 + }, + { + "epoch": 0.376979003873072, + "grad_norm": 0.7668838500976562, + "learning_rate": 0.00017489276868241406, + "loss": 2.7676, + "step": 4161 + }, + { + "epoch": 0.3770696020475187, + "grad_norm": 0.7761746048927307, + "learning_rate": 0.00017488672748142332, + "loss": 2.6959, + "step": 4162 + }, + { + "epoch": 0.3771602002219655, + "grad_norm": 0.7514036893844604, + "learning_rate": 0.00017488068628043255, + "loss": 2.9828, + "step": 4163 + }, + { + "epoch": 0.3772507983964123, + "grad_norm": 0.7799433469772339, + "learning_rate": 0.0001748746450794418, + "loss": 2.901, + "step": 4164 + }, + { + "epoch": 0.3773413965708591, + "grad_norm": 0.7243828773498535, + "learning_rate": 0.00017486860387845105, + "loss": 2.9953, + "step": 4165 + }, + { + "epoch": 0.37743199474530587, + "grad_norm": 0.7307072281837463, + "learning_rate": 0.00017486256267746028, + "loss": 2.8664, + "step": 4166 + }, + { + "epoch": 0.37752259291975265, + "grad_norm": 0.7449647784233093, + "learning_rate": 0.00017485652147646954, + "loss": 2.835, + "step": 4167 + }, + { + "epoch": 0.37761319109419944, + "grad_norm": 0.7872416377067566, + "learning_rate": 0.00017485048027547878, + "loss": 2.8976, + "step": 4168 + }, + { + "epoch": 0.37770378926864623, + "grad_norm": 0.7512543201446533, + "learning_rate": 0.000174844439074488, + "loss": 2.8926, + "step": 4169 + }, + { + "epoch": 0.377794387443093, + "grad_norm": 0.7454989552497864, + "learning_rate": 0.00017483839787349725, + "loss": 2.2186, + "step": 4170 + }, + { + "epoch": 0.3778849856175398, + "grad_norm": 0.8437703847885132, + "learning_rate": 0.0001748323566725065, + "loss": 3.1235, + "step": 4171 + }, + { + "epoch": 0.3779755837919866, + "grad_norm": 0.7718930244445801, + "learning_rate": 0.00017482631547151577, + "loss": 2.7829, + "step": 4172 + }, + { + "epoch": 0.3780661819664334, + "grad_norm": 0.7534980773925781, + "learning_rate": 0.00017482027427052498, + "loss": 2.7295, + "step": 4173 + }, + { + "epoch": 0.37815678014088017, + "grad_norm": 0.7675744295120239, + "learning_rate": 0.00017481423306953424, + "loss": 2.9655, + "step": 4174 + }, + { + "epoch": 0.37824737831532695, + "grad_norm": 0.7478449940681458, + "learning_rate": 0.00017480819186854347, + "loss": 2.8162, + "step": 4175 + }, + { + "epoch": 0.37833797648977374, + "grad_norm": 0.7421260476112366, + "learning_rate": 0.00017480215066755273, + "loss": 2.7262, + "step": 4176 + }, + { + "epoch": 0.37842857466422053, + "grad_norm": 0.7399988174438477, + "learning_rate": 0.00017479610946656197, + "loss": 2.8612, + "step": 4177 + }, + { + "epoch": 0.3785191728386673, + "grad_norm": 0.7938649654388428, + "learning_rate": 0.0001747900682655712, + "loss": 2.5033, + "step": 4178 + }, + { + "epoch": 0.3786097710131141, + "grad_norm": 0.7064005136489868, + "learning_rate": 0.00017478402706458046, + "loss": 2.0314, + "step": 4179 + }, + { + "epoch": 0.3787003691875609, + "grad_norm": 0.7500338554382324, + "learning_rate": 0.0001747779858635897, + "loss": 2.8446, + "step": 4180 + }, + { + "epoch": 0.3787909673620077, + "grad_norm": 0.7260699272155762, + "learning_rate": 0.00017477194466259893, + "loss": 2.7209, + "step": 4181 + }, + { + "epoch": 0.37888156553645447, + "grad_norm": 0.6608060002326965, + "learning_rate": 0.00017476590346160816, + "loss": 2.0711, + "step": 4182 + }, + { + "epoch": 0.3789721637109012, + "grad_norm": 0.7037761807441711, + "learning_rate": 0.00017475986226061742, + "loss": 2.2146, + "step": 4183 + }, + { + "epoch": 0.379062761885348, + "grad_norm": 0.6929085850715637, + "learning_rate": 0.00017475382105962666, + "loss": 2.7184, + "step": 4184 + }, + { + "epoch": 0.3791533600597948, + "grad_norm": 0.7656505107879639, + "learning_rate": 0.00017474777985863592, + "loss": 2.9781, + "step": 4185 + }, + { + "epoch": 0.37924395823424156, + "grad_norm": 0.7523120045661926, + "learning_rate": 0.00017474173865764513, + "loss": 2.8528, + "step": 4186 + }, + { + "epoch": 0.37933455640868835, + "grad_norm": 0.709949254989624, + "learning_rate": 0.0001747356974566544, + "loss": 2.7946, + "step": 4187 + }, + { + "epoch": 0.37942515458313514, + "grad_norm": 0.7412585616111755, + "learning_rate": 0.00017472965625566365, + "loss": 2.4339, + "step": 4188 + }, + { + "epoch": 0.3795157527575819, + "grad_norm": 0.7623683214187622, + "learning_rate": 0.00017472361505467288, + "loss": 2.7946, + "step": 4189 + }, + { + "epoch": 0.3796063509320287, + "grad_norm": 0.7916049361228943, + "learning_rate": 0.00017471757385368212, + "loss": 2.8188, + "step": 4190 + }, + { + "epoch": 0.3796969491064755, + "grad_norm": 0.6954820156097412, + "learning_rate": 0.00017471153265269135, + "loss": 2.6921, + "step": 4191 + }, + { + "epoch": 0.3797875472809223, + "grad_norm": 0.7557576298713684, + "learning_rate": 0.0001747054914517006, + "loss": 2.8957, + "step": 4192 + }, + { + "epoch": 0.3798781454553691, + "grad_norm": 0.7229032516479492, + "learning_rate": 0.00017469945025070985, + "loss": 2.7387, + "step": 4193 + }, + { + "epoch": 0.37996874362981586, + "grad_norm": 0.7771234512329102, + "learning_rate": 0.00017469340904971908, + "loss": 2.7197, + "step": 4194 + }, + { + "epoch": 0.38005934180426265, + "grad_norm": 0.7708628177642822, + "learning_rate": 0.00017468736784872834, + "loss": 2.8764, + "step": 4195 + }, + { + "epoch": 0.38014993997870944, + "grad_norm": 0.7381467819213867, + "learning_rate": 0.00017468132664773758, + "loss": 2.6491, + "step": 4196 + }, + { + "epoch": 0.3802405381531562, + "grad_norm": 0.7329146862030029, + "learning_rate": 0.00017467528544674684, + "loss": 2.4948, + "step": 4197 + }, + { + "epoch": 0.380331136327603, + "grad_norm": 0.7758803367614746, + "learning_rate": 0.00017466924424575607, + "loss": 2.7121, + "step": 4198 + }, + { + "epoch": 0.3804217345020498, + "grad_norm": 0.6914672255516052, + "learning_rate": 0.0001746632030447653, + "loss": 2.2599, + "step": 4199 + }, + { + "epoch": 0.3805123326764966, + "grad_norm": 0.7973684072494507, + "learning_rate": 0.00017465716184377454, + "loss": 2.8313, + "step": 4200 + }, + { + "epoch": 0.3806029308509434, + "grad_norm": 0.6916368007659912, + "learning_rate": 0.0001746511206427838, + "loss": 2.6288, + "step": 4201 + }, + { + "epoch": 0.38069352902539016, + "grad_norm": 0.7731599807739258, + "learning_rate": 0.00017464507944179303, + "loss": 2.8132, + "step": 4202 + }, + { + "epoch": 0.3807841271998369, + "grad_norm": 0.6913368701934814, + "learning_rate": 0.00017463903824080227, + "loss": 2.0751, + "step": 4203 + }, + { + "epoch": 0.3808747253742837, + "grad_norm": 0.7228068113327026, + "learning_rate": 0.00017463299703981153, + "loss": 2.7657, + "step": 4204 + }, + { + "epoch": 0.3809653235487305, + "grad_norm": 0.7337124943733215, + "learning_rate": 0.00017462695583882076, + "loss": 2.7212, + "step": 4205 + }, + { + "epoch": 0.38105592172317726, + "grad_norm": 0.6536605954170227, + "learning_rate": 0.00017462091463783002, + "loss": 2.1701, + "step": 4206 + }, + { + "epoch": 0.38114651989762405, + "grad_norm": 0.7938117384910583, + "learning_rate": 0.00017461487343683923, + "loss": 2.8506, + "step": 4207 + }, + { + "epoch": 0.38123711807207084, + "grad_norm": 0.7502299547195435, + "learning_rate": 0.0001746088322358485, + "loss": 2.646, + "step": 4208 + }, + { + "epoch": 0.3813277162465176, + "grad_norm": 0.8123915195465088, + "learning_rate": 0.00017460279103485775, + "loss": 3.025, + "step": 4209 + }, + { + "epoch": 0.3814183144209644, + "grad_norm": 0.6950416564941406, + "learning_rate": 0.000174596749833867, + "loss": 2.9996, + "step": 4210 + }, + { + "epoch": 0.3815089125954112, + "grad_norm": 0.7749965190887451, + "learning_rate": 0.00017459070863287622, + "loss": 2.8279, + "step": 4211 + }, + { + "epoch": 0.381599510769858, + "grad_norm": 0.7448924779891968, + "learning_rate": 0.00017458466743188546, + "loss": 2.6863, + "step": 4212 + }, + { + "epoch": 0.3816901089443048, + "grad_norm": 0.6502100825309753, + "learning_rate": 0.00017457862623089472, + "loss": 2.118, + "step": 4213 + }, + { + "epoch": 0.38178070711875156, + "grad_norm": 0.7998712062835693, + "learning_rate": 0.00017457258502990395, + "loss": 2.9149, + "step": 4214 + }, + { + "epoch": 0.38187130529319835, + "grad_norm": 0.8016895055770874, + "learning_rate": 0.00017456654382891319, + "loss": 2.8887, + "step": 4215 + }, + { + "epoch": 0.38196190346764514, + "grad_norm": 0.7281237840652466, + "learning_rate": 0.00017456050262792242, + "loss": 2.7562, + "step": 4216 + }, + { + "epoch": 0.3820525016420919, + "grad_norm": 0.776483416557312, + "learning_rate": 0.00017455446142693168, + "loss": 3.0492, + "step": 4217 + }, + { + "epoch": 0.3821430998165387, + "grad_norm": 0.7512903809547424, + "learning_rate": 0.00017454842022594094, + "loss": 2.8886, + "step": 4218 + }, + { + "epoch": 0.3822336979909855, + "grad_norm": 0.7717723250389099, + "learning_rate": 0.00017454237902495018, + "loss": 2.5692, + "step": 4219 + }, + { + "epoch": 0.3823242961654323, + "grad_norm": 0.772614061832428, + "learning_rate": 0.0001745363378239594, + "loss": 3.3057, + "step": 4220 + }, + { + "epoch": 0.3824148943398791, + "grad_norm": 0.7509413361549377, + "learning_rate": 0.00017453029662296864, + "loss": 2.8148, + "step": 4221 + }, + { + "epoch": 0.38250549251432586, + "grad_norm": 0.7778221368789673, + "learning_rate": 0.0001745242554219779, + "loss": 2.7343, + "step": 4222 + }, + { + "epoch": 0.3825960906887726, + "grad_norm": 0.752079963684082, + "learning_rate": 0.00017451821422098714, + "loss": 2.8196, + "step": 4223 + }, + { + "epoch": 0.3826866888632194, + "grad_norm": 0.6881147623062134, + "learning_rate": 0.00017451217301999637, + "loss": 2.2357, + "step": 4224 + }, + { + "epoch": 0.38277728703766617, + "grad_norm": 0.7286326289176941, + "learning_rate": 0.00017450613181900563, + "loss": 3.0186, + "step": 4225 + }, + { + "epoch": 0.38286788521211296, + "grad_norm": 0.758388340473175, + "learning_rate": 0.00017450009061801487, + "loss": 2.8866, + "step": 4226 + }, + { + "epoch": 0.38295848338655974, + "grad_norm": 0.7343044281005859, + "learning_rate": 0.00017449404941702413, + "loss": 2.9478, + "step": 4227 + }, + { + "epoch": 0.38304908156100653, + "grad_norm": 0.7292922139167786, + "learning_rate": 0.00017448800821603334, + "loss": 2.9134, + "step": 4228 + }, + { + "epoch": 0.3831396797354533, + "grad_norm": 0.7426475882530212, + "learning_rate": 0.0001744819670150426, + "loss": 2.8512, + "step": 4229 + }, + { + "epoch": 0.3832302779099001, + "grad_norm": 0.7882463932037354, + "learning_rate": 0.00017447592581405183, + "loss": 3.1137, + "step": 4230 + }, + { + "epoch": 0.3833208760843469, + "grad_norm": 0.7351574301719666, + "learning_rate": 0.0001744698846130611, + "loss": 2.8484, + "step": 4231 + }, + { + "epoch": 0.3834114742587937, + "grad_norm": 0.7687286138534546, + "learning_rate": 0.00017446384341207033, + "loss": 2.9631, + "step": 4232 + }, + { + "epoch": 0.38350207243324047, + "grad_norm": 0.7723210453987122, + "learning_rate": 0.00017445780221107956, + "loss": 3.0617, + "step": 4233 + }, + { + "epoch": 0.38359267060768726, + "grad_norm": 0.7779502868652344, + "learning_rate": 0.00017445176101008882, + "loss": 2.9278, + "step": 4234 + }, + { + "epoch": 0.38368326878213405, + "grad_norm": 0.8149654865264893, + "learning_rate": 0.00017444571980909806, + "loss": 2.8046, + "step": 4235 + }, + { + "epoch": 0.38377386695658083, + "grad_norm": 0.791018009185791, + "learning_rate": 0.00017443967860810732, + "loss": 3.0811, + "step": 4236 + }, + { + "epoch": 0.3838644651310276, + "grad_norm": 0.732795774936676, + "learning_rate": 0.00017443363740711652, + "loss": 2.6355, + "step": 4237 + }, + { + "epoch": 0.3839550633054744, + "grad_norm": 0.7110222578048706, + "learning_rate": 0.00017442759620612579, + "loss": 2.7642, + "step": 4238 + }, + { + "epoch": 0.3840456614799212, + "grad_norm": 0.7691553235054016, + "learning_rate": 0.00017442155500513505, + "loss": 2.8582, + "step": 4239 + }, + { + "epoch": 0.384136259654368, + "grad_norm": 0.744601309299469, + "learning_rate": 0.00017441551380414428, + "loss": 2.9117, + "step": 4240 + }, + { + "epoch": 0.38422685782881477, + "grad_norm": 0.8124483823776245, + "learning_rate": 0.00017440947260315351, + "loss": 2.79, + "step": 4241 + }, + { + "epoch": 0.38431745600326156, + "grad_norm": 0.7332538962364197, + "learning_rate": 0.00017440343140216275, + "loss": 2.8157, + "step": 4242 + }, + { + "epoch": 0.3844080541777083, + "grad_norm": 0.619032621383667, + "learning_rate": 0.000174397390201172, + "loss": 2.0226, + "step": 4243 + }, + { + "epoch": 0.3844986523521551, + "grad_norm": 0.6591432094573975, + "learning_rate": 0.00017439134900018124, + "loss": 2.0805, + "step": 4244 + }, + { + "epoch": 0.38458925052660187, + "grad_norm": 0.7800726890563965, + "learning_rate": 0.00017438530779919048, + "loss": 2.7226, + "step": 4245 + }, + { + "epoch": 0.38467984870104865, + "grad_norm": 0.8017358779907227, + "learning_rate": 0.0001743792665981997, + "loss": 2.8784, + "step": 4246 + }, + { + "epoch": 0.38477044687549544, + "grad_norm": 0.8286831378936768, + "learning_rate": 0.00017437322539720897, + "loss": 3.1205, + "step": 4247 + }, + { + "epoch": 0.38486104504994223, + "grad_norm": 0.7647112607955933, + "learning_rate": 0.00017436718419621823, + "loss": 2.8787, + "step": 4248 + }, + { + "epoch": 0.384951643224389, + "grad_norm": 0.8046972155570984, + "learning_rate": 0.00017436114299522747, + "loss": 2.7132, + "step": 4249 + }, + { + "epoch": 0.3850422413988358, + "grad_norm": 0.7861848473548889, + "learning_rate": 0.0001743551017942367, + "loss": 2.7838, + "step": 4250 + }, + { + "epoch": 0.3851328395732826, + "grad_norm": 0.736650288105011, + "learning_rate": 0.00017434906059324594, + "loss": 2.9212, + "step": 4251 + }, + { + "epoch": 0.3852234377477294, + "grad_norm": 0.7856557369232178, + "learning_rate": 0.0001743430193922552, + "loss": 2.8522, + "step": 4252 + }, + { + "epoch": 0.38531403592217617, + "grad_norm": 0.8810071349143982, + "learning_rate": 0.00017433697819126443, + "loss": 2.621, + "step": 4253 + }, + { + "epoch": 0.38540463409662296, + "grad_norm": 0.707516074180603, + "learning_rate": 0.00017433093699027367, + "loss": 2.8189, + "step": 4254 + }, + { + "epoch": 0.38549523227106974, + "grad_norm": 0.7235802412033081, + "learning_rate": 0.00017432489578928293, + "loss": 2.8297, + "step": 4255 + }, + { + "epoch": 0.38558583044551653, + "grad_norm": 0.7250128388404846, + "learning_rate": 0.00017431885458829216, + "loss": 2.6922, + "step": 4256 + }, + { + "epoch": 0.3856764286199633, + "grad_norm": 0.6823474764823914, + "learning_rate": 0.00017431281338730142, + "loss": 2.1071, + "step": 4257 + }, + { + "epoch": 0.3857670267944101, + "grad_norm": 0.8414506316184998, + "learning_rate": 0.00017430677218631063, + "loss": 2.7842, + "step": 4258 + }, + { + "epoch": 0.3858576249688569, + "grad_norm": 0.7334285378456116, + "learning_rate": 0.0001743007309853199, + "loss": 2.8115, + "step": 4259 + }, + { + "epoch": 0.3859482231433037, + "grad_norm": 0.7894465327262878, + "learning_rate": 0.00017429468978432912, + "loss": 2.9437, + "step": 4260 + }, + { + "epoch": 0.38603882131775047, + "grad_norm": 0.7799162864685059, + "learning_rate": 0.00017428864858333839, + "loss": 2.8864, + "step": 4261 + }, + { + "epoch": 0.38612941949219726, + "grad_norm": 0.7235219478607178, + "learning_rate": 0.00017428260738234762, + "loss": 2.5848, + "step": 4262 + }, + { + "epoch": 0.38622001766664404, + "grad_norm": 0.6476185321807861, + "learning_rate": 0.00017427656618135685, + "loss": 2.2331, + "step": 4263 + }, + { + "epoch": 0.3863106158410908, + "grad_norm": 0.7526753544807434, + "learning_rate": 0.00017427052498036611, + "loss": 2.7498, + "step": 4264 + }, + { + "epoch": 0.38640121401553756, + "grad_norm": 0.7366155385971069, + "learning_rate": 0.00017426448377937535, + "loss": 2.8061, + "step": 4265 + }, + { + "epoch": 0.38649181218998435, + "grad_norm": 0.7311974763870239, + "learning_rate": 0.00017425844257838458, + "loss": 2.497, + "step": 4266 + }, + { + "epoch": 0.38658241036443114, + "grad_norm": 0.7892488837242126, + "learning_rate": 0.00017425240137739382, + "loss": 3.0476, + "step": 4267 + }, + { + "epoch": 0.3866730085388779, + "grad_norm": 0.7996923923492432, + "learning_rate": 0.00017424636017640308, + "loss": 2.9055, + "step": 4268 + }, + { + "epoch": 0.3867636067133247, + "grad_norm": 0.8820354342460632, + "learning_rate": 0.00017424031897541234, + "loss": 2.8204, + "step": 4269 + }, + { + "epoch": 0.3868542048877715, + "grad_norm": 0.7783165574073792, + "learning_rate": 0.00017423427777442157, + "loss": 2.7908, + "step": 4270 + }, + { + "epoch": 0.3869448030622183, + "grad_norm": 0.6948442459106445, + "learning_rate": 0.0001742282365734308, + "loss": 2.2131, + "step": 4271 + }, + { + "epoch": 0.3870354012366651, + "grad_norm": 0.7923200130462646, + "learning_rate": 0.00017422219537244004, + "loss": 2.9154, + "step": 4272 + }, + { + "epoch": 0.38712599941111187, + "grad_norm": 0.801629364490509, + "learning_rate": 0.0001742161541714493, + "loss": 2.9088, + "step": 4273 + }, + { + "epoch": 0.38721659758555865, + "grad_norm": 0.7422121167182922, + "learning_rate": 0.00017421011297045854, + "loss": 2.6598, + "step": 4274 + }, + { + "epoch": 0.38730719576000544, + "grad_norm": 0.749686062335968, + "learning_rate": 0.00017420407176946777, + "loss": 2.9684, + "step": 4275 + }, + { + "epoch": 0.38739779393445223, + "grad_norm": 0.7714184522628784, + "learning_rate": 0.000174198030568477, + "loss": 2.8132, + "step": 4276 + }, + { + "epoch": 0.387488392108899, + "grad_norm": 0.6327258944511414, + "learning_rate": 0.00017419198936748627, + "loss": 2.2041, + "step": 4277 + }, + { + "epoch": 0.3875789902833458, + "grad_norm": 0.7448431253433228, + "learning_rate": 0.00017418594816649553, + "loss": 2.8344, + "step": 4278 + }, + { + "epoch": 0.3876695884577926, + "grad_norm": 0.7426571846008301, + "learning_rate": 0.00017417990696550473, + "loss": 2.9877, + "step": 4279 + }, + { + "epoch": 0.3877601866322394, + "grad_norm": 0.8159778714179993, + "learning_rate": 0.000174173865764514, + "loss": 2.768, + "step": 4280 + }, + { + "epoch": 0.38785078480668617, + "grad_norm": 0.7125199437141418, + "learning_rate": 0.00017416782456352323, + "loss": 2.7142, + "step": 4281 + }, + { + "epoch": 0.38794138298113295, + "grad_norm": 0.7881189584732056, + "learning_rate": 0.0001741617833625325, + "loss": 2.7677, + "step": 4282 + }, + { + "epoch": 0.38803198115557974, + "grad_norm": 0.6028584837913513, + "learning_rate": 0.00017415574216154172, + "loss": 2.1137, + "step": 4283 + }, + { + "epoch": 0.3881225793300265, + "grad_norm": 0.7392725944519043, + "learning_rate": 0.00017414970096055096, + "loss": 2.8317, + "step": 4284 + }, + { + "epoch": 0.38821317750447326, + "grad_norm": 0.8136605620384216, + "learning_rate": 0.00017414365975956022, + "loss": 2.9281, + "step": 4285 + }, + { + "epoch": 0.38830377567892005, + "grad_norm": 0.7831084132194519, + "learning_rate": 0.00017413761855856945, + "loss": 3.0472, + "step": 4286 + }, + { + "epoch": 0.38839437385336684, + "grad_norm": 0.7951047420501709, + "learning_rate": 0.00017413157735757871, + "loss": 2.8488, + "step": 4287 + }, + { + "epoch": 0.3884849720278136, + "grad_norm": 0.7396490573883057, + "learning_rate": 0.00017412553615658792, + "loss": 2.7106, + "step": 4288 + }, + { + "epoch": 0.3885755702022604, + "grad_norm": 0.7563432455062866, + "learning_rate": 0.00017411949495559718, + "loss": 2.7707, + "step": 4289 + }, + { + "epoch": 0.3886661683767072, + "grad_norm": 0.7531410455703735, + "learning_rate": 0.00017411345375460642, + "loss": 2.9061, + "step": 4290 + }, + { + "epoch": 0.388756766551154, + "grad_norm": 0.7123587727546692, + "learning_rate": 0.00017410741255361568, + "loss": 2.6942, + "step": 4291 + }, + { + "epoch": 0.3888473647256008, + "grad_norm": 0.7354791164398193, + "learning_rate": 0.0001741013713526249, + "loss": 2.0765, + "step": 4292 + }, + { + "epoch": 0.38893796290004756, + "grad_norm": 0.7459060549736023, + "learning_rate": 0.00017409533015163415, + "loss": 3.097, + "step": 4293 + }, + { + "epoch": 0.38902856107449435, + "grad_norm": 0.7241764664649963, + "learning_rate": 0.0001740892889506434, + "loss": 2.6018, + "step": 4294 + }, + { + "epoch": 0.38911915924894114, + "grad_norm": 0.8014176487922668, + "learning_rate": 0.00017408324774965264, + "loss": 3.0641, + "step": 4295 + }, + { + "epoch": 0.3892097574233879, + "grad_norm": 0.7261688709259033, + "learning_rate": 0.00017407720654866188, + "loss": 2.7861, + "step": 4296 + }, + { + "epoch": 0.3893003555978347, + "grad_norm": 0.6592805981636047, + "learning_rate": 0.0001740711653476711, + "loss": 2.1015, + "step": 4297 + }, + { + "epoch": 0.3893909537722815, + "grad_norm": 0.7467442154884338, + "learning_rate": 0.00017406512414668037, + "loss": 2.8333, + "step": 4298 + }, + { + "epoch": 0.3894815519467283, + "grad_norm": 0.801140308380127, + "learning_rate": 0.00017405908294568963, + "loss": 2.6805, + "step": 4299 + }, + { + "epoch": 0.3895721501211751, + "grad_norm": 0.7800965905189514, + "learning_rate": 0.00017405304174469887, + "loss": 2.8471, + "step": 4300 + }, + { + "epoch": 0.38966274829562186, + "grad_norm": 0.7633541226387024, + "learning_rate": 0.0001740470005437081, + "loss": 2.7044, + "step": 4301 + }, + { + "epoch": 0.38975334647006865, + "grad_norm": 0.7473316192626953, + "learning_rate": 0.00017404095934271733, + "loss": 2.8746, + "step": 4302 + }, + { + "epoch": 0.38984394464451544, + "grad_norm": 0.7730047106742859, + "learning_rate": 0.0001740349181417266, + "loss": 2.8375, + "step": 4303 + }, + { + "epoch": 0.38993454281896217, + "grad_norm": 0.809088945388794, + "learning_rate": 0.00017402887694073583, + "loss": 2.8021, + "step": 4304 + }, + { + "epoch": 0.39002514099340896, + "grad_norm": 0.7493605017662048, + "learning_rate": 0.00017402283573974506, + "loss": 2.618, + "step": 4305 + }, + { + "epoch": 0.39011573916785575, + "grad_norm": 0.8125585913658142, + "learning_rate": 0.0001740167945387543, + "loss": 2.9503, + "step": 4306 + }, + { + "epoch": 0.39020633734230253, + "grad_norm": 0.7619946002960205, + "learning_rate": 0.00017401075333776356, + "loss": 2.9175, + "step": 4307 + }, + { + "epoch": 0.3902969355167493, + "grad_norm": 0.7147579193115234, + "learning_rate": 0.00017400471213677282, + "loss": 2.5474, + "step": 4308 + }, + { + "epoch": 0.3903875336911961, + "grad_norm": 0.7951923608779907, + "learning_rate": 0.00017399867093578203, + "loss": 2.7839, + "step": 4309 + }, + { + "epoch": 0.3904781318656429, + "grad_norm": 0.7001587748527527, + "learning_rate": 0.0001739926297347913, + "loss": 2.5972, + "step": 4310 + }, + { + "epoch": 0.3905687300400897, + "grad_norm": 0.7518627047538757, + "learning_rate": 0.00017398658853380052, + "loss": 3.0088, + "step": 4311 + }, + { + "epoch": 0.39065932821453647, + "grad_norm": 0.7550466060638428, + "learning_rate": 0.00017398054733280978, + "loss": 2.9033, + "step": 4312 + }, + { + "epoch": 0.39074992638898326, + "grad_norm": 0.733128547668457, + "learning_rate": 0.00017397450613181902, + "loss": 2.2879, + "step": 4313 + }, + { + "epoch": 0.39084052456343005, + "grad_norm": 0.7493205666542053, + "learning_rate": 0.00017396846493082825, + "loss": 2.8253, + "step": 4314 + }, + { + "epoch": 0.39093112273787683, + "grad_norm": 0.7741645574569702, + "learning_rate": 0.0001739624237298375, + "loss": 2.8364, + "step": 4315 + }, + { + "epoch": 0.3910217209123236, + "grad_norm": 0.7293811440467834, + "learning_rate": 0.00017395638252884675, + "loss": 2.5011, + "step": 4316 + }, + { + "epoch": 0.3911123190867704, + "grad_norm": 0.8010798096656799, + "learning_rate": 0.00017395034132785598, + "loss": 2.9562, + "step": 4317 + }, + { + "epoch": 0.3912029172612172, + "grad_norm": 0.7534835338592529, + "learning_rate": 0.00017394430012686521, + "loss": 3.0859, + "step": 4318 + }, + { + "epoch": 0.391293515435664, + "grad_norm": 0.7067607045173645, + "learning_rate": 0.00017393825892587448, + "loss": 2.7267, + "step": 4319 + }, + { + "epoch": 0.3913841136101108, + "grad_norm": 0.7712793946266174, + "learning_rate": 0.0001739322177248837, + "loss": 2.8956, + "step": 4320 + }, + { + "epoch": 0.39147471178455756, + "grad_norm": 0.750716507434845, + "learning_rate": 0.00017392617652389297, + "loss": 2.7805, + "step": 4321 + }, + { + "epoch": 0.39156530995900435, + "grad_norm": 0.629844605922699, + "learning_rate": 0.0001739201353229022, + "loss": 2.4646, + "step": 4322 + }, + { + "epoch": 0.39165590813345114, + "grad_norm": 0.7485806345939636, + "learning_rate": 0.00017391409412191144, + "loss": 2.7508, + "step": 4323 + }, + { + "epoch": 0.39174650630789787, + "grad_norm": 0.7598576545715332, + "learning_rate": 0.0001739080529209207, + "loss": 2.6806, + "step": 4324 + }, + { + "epoch": 0.39183710448234466, + "grad_norm": 0.7552099227905273, + "learning_rate": 0.00017390201171992993, + "loss": 2.8619, + "step": 4325 + }, + { + "epoch": 0.39192770265679144, + "grad_norm": 0.6903321146965027, + "learning_rate": 0.00017389597051893917, + "loss": 2.0462, + "step": 4326 + }, + { + "epoch": 0.39201830083123823, + "grad_norm": 0.786170244216919, + "learning_rate": 0.0001738899293179484, + "loss": 2.8164, + "step": 4327 + }, + { + "epoch": 0.392108899005685, + "grad_norm": 0.7231353521347046, + "learning_rate": 0.00017388388811695766, + "loss": 2.8058, + "step": 4328 + }, + { + "epoch": 0.3921994971801318, + "grad_norm": 0.7334374189376831, + "learning_rate": 0.00017387784691596692, + "loss": 2.8811, + "step": 4329 + }, + { + "epoch": 0.3922900953545786, + "grad_norm": 0.6344853043556213, + "learning_rate": 0.00017387180571497613, + "loss": 2.1535, + "step": 4330 + }, + { + "epoch": 0.3923806935290254, + "grad_norm": 0.7570933699607849, + "learning_rate": 0.0001738657645139854, + "loss": 2.8706, + "step": 4331 + }, + { + "epoch": 0.39247129170347217, + "grad_norm": 0.714148223400116, + "learning_rate": 0.00017385972331299463, + "loss": 2.7225, + "step": 4332 + }, + { + "epoch": 0.39256188987791896, + "grad_norm": 0.6096174120903015, + "learning_rate": 0.0001738536821120039, + "loss": 2.0221, + "step": 4333 + }, + { + "epoch": 0.39265248805236574, + "grad_norm": 0.7304897904396057, + "learning_rate": 0.00017384764091101312, + "loss": 2.6847, + "step": 4334 + }, + { + "epoch": 0.39274308622681253, + "grad_norm": 0.7185335159301758, + "learning_rate": 0.00017384159971002236, + "loss": 2.6142, + "step": 4335 + }, + { + "epoch": 0.3928336844012593, + "grad_norm": 0.7048711776733398, + "learning_rate": 0.0001738355585090316, + "loss": 2.6766, + "step": 4336 + }, + { + "epoch": 0.3929242825757061, + "grad_norm": 0.7887117862701416, + "learning_rate": 0.00017382951730804085, + "loss": 2.7719, + "step": 4337 + }, + { + "epoch": 0.3930148807501529, + "grad_norm": 0.7651515007019043, + "learning_rate": 0.00017382347610705009, + "loss": 2.8972, + "step": 4338 + }, + { + "epoch": 0.3931054789245997, + "grad_norm": 0.7395390868186951, + "learning_rate": 0.00017381743490605932, + "loss": 2.5801, + "step": 4339 + }, + { + "epoch": 0.39319607709904647, + "grad_norm": 0.8029763102531433, + "learning_rate": 0.00017381139370506858, + "loss": 2.7164, + "step": 4340 + }, + { + "epoch": 0.39328667527349326, + "grad_norm": 0.8170705437660217, + "learning_rate": 0.00017380535250407781, + "loss": 2.8545, + "step": 4341 + }, + { + "epoch": 0.39337727344794005, + "grad_norm": 0.7399149537086487, + "learning_rate": 0.00017379931130308708, + "loss": 2.7298, + "step": 4342 + }, + { + "epoch": 0.39346787162238683, + "grad_norm": 0.7997040152549744, + "learning_rate": 0.00017379327010209628, + "loss": 2.799, + "step": 4343 + }, + { + "epoch": 0.3935584697968336, + "grad_norm": 0.755405843257904, + "learning_rate": 0.00017378722890110554, + "loss": 2.2497, + "step": 4344 + }, + { + "epoch": 0.39364906797128035, + "grad_norm": 0.7469013929367065, + "learning_rate": 0.0001737811877001148, + "loss": 2.9717, + "step": 4345 + }, + { + "epoch": 0.39373966614572714, + "grad_norm": 0.8388347029685974, + "learning_rate": 0.00017377514649912404, + "loss": 2.8362, + "step": 4346 + }, + { + "epoch": 0.39383026432017393, + "grad_norm": 0.8081896305084229, + "learning_rate": 0.00017376910529813327, + "loss": 2.9987, + "step": 4347 + }, + { + "epoch": 0.3939208624946207, + "grad_norm": 0.7701404094696045, + "learning_rate": 0.0001737630640971425, + "loss": 2.8647, + "step": 4348 + }, + { + "epoch": 0.3940114606690675, + "grad_norm": 0.8094726204872131, + "learning_rate": 0.00017375702289615177, + "loss": 3.2603, + "step": 4349 + }, + { + "epoch": 0.3941020588435143, + "grad_norm": 0.7745580077171326, + "learning_rate": 0.000173750981695161, + "loss": 2.9348, + "step": 4350 + }, + { + "epoch": 0.3941926570179611, + "grad_norm": 0.7659816145896912, + "learning_rate": 0.00017374494049417024, + "loss": 2.9627, + "step": 4351 + }, + { + "epoch": 0.39428325519240787, + "grad_norm": 0.743194043636322, + "learning_rate": 0.0001737388992931795, + "loss": 2.6637, + "step": 4352 + }, + { + "epoch": 0.39437385336685465, + "grad_norm": 0.6655996441841125, + "learning_rate": 0.00017373285809218873, + "loss": 2.1551, + "step": 4353 + }, + { + "epoch": 0.39446445154130144, + "grad_norm": 0.7464808821678162, + "learning_rate": 0.000173726816891198, + "loss": 2.9202, + "step": 4354 + }, + { + "epoch": 0.39455504971574823, + "grad_norm": 0.770313560962677, + "learning_rate": 0.00017372077569020723, + "loss": 2.6746, + "step": 4355 + }, + { + "epoch": 0.394645647890195, + "grad_norm": 0.7515906095504761, + "learning_rate": 0.00017371473448921646, + "loss": 2.9232, + "step": 4356 + }, + { + "epoch": 0.3947362460646418, + "grad_norm": 0.7535241842269897, + "learning_rate": 0.0001737086932882257, + "loss": 3.1491, + "step": 4357 + }, + { + "epoch": 0.3948268442390886, + "grad_norm": 0.8117870688438416, + "learning_rate": 0.00017370265208723496, + "loss": 2.8526, + "step": 4358 + }, + { + "epoch": 0.3949174424135354, + "grad_norm": 0.7611162662506104, + "learning_rate": 0.00017369661088624422, + "loss": 2.6762, + "step": 4359 + }, + { + "epoch": 0.39500804058798217, + "grad_norm": 0.7291202545166016, + "learning_rate": 0.00017369056968525342, + "loss": 2.8048, + "step": 4360 + }, + { + "epoch": 0.39509863876242896, + "grad_norm": 0.7518001198768616, + "learning_rate": 0.00017368452848426269, + "loss": 2.8161, + "step": 4361 + }, + { + "epoch": 0.39518923693687574, + "grad_norm": 0.7737336754798889, + "learning_rate": 0.00017367848728327192, + "loss": 2.7784, + "step": 4362 + }, + { + "epoch": 0.39527983511132253, + "grad_norm": 0.7587992548942566, + "learning_rate": 0.00017367244608228118, + "loss": 2.8236, + "step": 4363 + }, + { + "epoch": 0.3953704332857693, + "grad_norm": 0.7678070664405823, + "learning_rate": 0.00017366640488129041, + "loss": 2.7079, + "step": 4364 + }, + { + "epoch": 0.39546103146021605, + "grad_norm": 0.7235190272331238, + "learning_rate": 0.00017366036368029965, + "loss": 2.7946, + "step": 4365 + }, + { + "epoch": 0.39555162963466284, + "grad_norm": 0.6303310394287109, + "learning_rate": 0.00017365432247930888, + "loss": 2.0522, + "step": 4366 + }, + { + "epoch": 0.3956422278091096, + "grad_norm": 0.7753424048423767, + "learning_rate": 0.00017364828127831814, + "loss": 2.8607, + "step": 4367 + }, + { + "epoch": 0.3957328259835564, + "grad_norm": 0.7765650153160095, + "learning_rate": 0.00017364224007732738, + "loss": 2.8838, + "step": 4368 + }, + { + "epoch": 0.3958234241580032, + "grad_norm": 0.7187883853912354, + "learning_rate": 0.0001736361988763366, + "loss": 2.7719, + "step": 4369 + }, + { + "epoch": 0.39591402233245, + "grad_norm": 0.7306928634643555, + "learning_rate": 0.00017363015767534587, + "loss": 2.735, + "step": 4370 + }, + { + "epoch": 0.3960046205068968, + "grad_norm": 0.7104478478431702, + "learning_rate": 0.0001736241164743551, + "loss": 2.6897, + "step": 4371 + }, + { + "epoch": 0.39609521868134356, + "grad_norm": 0.7967714071273804, + "learning_rate": 0.00017361807527336437, + "loss": 3.0109, + "step": 4372 + }, + { + "epoch": 0.39618581685579035, + "grad_norm": 0.7418895959854126, + "learning_rate": 0.00017361203407237358, + "loss": 2.9139, + "step": 4373 + }, + { + "epoch": 0.39627641503023714, + "grad_norm": 0.823503851890564, + "learning_rate": 0.00017360599287138284, + "loss": 2.8606, + "step": 4374 + }, + { + "epoch": 0.3963670132046839, + "grad_norm": 0.7282558679580688, + "learning_rate": 0.0001735999516703921, + "loss": 2.7805, + "step": 4375 + }, + { + "epoch": 0.3964576113791307, + "grad_norm": 0.7357491254806519, + "learning_rate": 0.00017359391046940133, + "loss": 2.7172, + "step": 4376 + }, + { + "epoch": 0.3965482095535775, + "grad_norm": 0.6847666501998901, + "learning_rate": 0.00017358786926841057, + "loss": 2.5057, + "step": 4377 + }, + { + "epoch": 0.3966388077280243, + "grad_norm": 0.7499711513519287, + "learning_rate": 0.0001735818280674198, + "loss": 2.8679, + "step": 4378 + }, + { + "epoch": 0.3967294059024711, + "grad_norm": 0.6882975697517395, + "learning_rate": 0.00017357578686642906, + "loss": 2.4874, + "step": 4379 + }, + { + "epoch": 0.39682000407691786, + "grad_norm": 0.7775079607963562, + "learning_rate": 0.0001735697456654383, + "loss": 2.9246, + "step": 4380 + }, + { + "epoch": 0.39691060225136465, + "grad_norm": 0.7134380340576172, + "learning_rate": 0.00017356370446444753, + "loss": 2.871, + "step": 4381 + }, + { + "epoch": 0.39700120042581144, + "grad_norm": 0.6303234100341797, + "learning_rate": 0.0001735576632634568, + "loss": 2.0953, + "step": 4382 + }, + { + "epoch": 0.39709179860025823, + "grad_norm": 0.7323340773582458, + "learning_rate": 0.00017355162206246602, + "loss": 2.8181, + "step": 4383 + }, + { + "epoch": 0.397182396774705, + "grad_norm": 0.7209478616714478, + "learning_rate": 0.00017354558086147529, + "loss": 2.7599, + "step": 4384 + }, + { + "epoch": 0.39727299494915175, + "grad_norm": 0.7655490040779114, + "learning_rate": 0.00017353953966048452, + "loss": 3.0185, + "step": 4385 + }, + { + "epoch": 0.39736359312359854, + "grad_norm": 1.0018099546432495, + "learning_rate": 0.00017353349845949375, + "loss": 2.9958, + "step": 4386 + }, + { + "epoch": 0.3974541912980453, + "grad_norm": 0.7850015759468079, + "learning_rate": 0.000173527457258503, + "loss": 2.8916, + "step": 4387 + }, + { + "epoch": 0.3975447894724921, + "grad_norm": 0.7692961096763611, + "learning_rate": 0.00017352141605751225, + "loss": 2.8787, + "step": 4388 + }, + { + "epoch": 0.3976353876469389, + "grad_norm": 0.8033266663551331, + "learning_rate": 0.00017351537485652148, + "loss": 2.0607, + "step": 4389 + }, + { + "epoch": 0.3977259858213857, + "grad_norm": 0.7375762462615967, + "learning_rate": 0.00017350933365553072, + "loss": 2.9806, + "step": 4390 + }, + { + "epoch": 0.3978165839958325, + "grad_norm": 0.7928676605224609, + "learning_rate": 0.00017350329245453998, + "loss": 3.1035, + "step": 4391 + }, + { + "epoch": 0.39790718217027926, + "grad_norm": 0.756657600402832, + "learning_rate": 0.0001734972512535492, + "loss": 2.93, + "step": 4392 + }, + { + "epoch": 0.39799778034472605, + "grad_norm": 0.7554859519004822, + "learning_rate": 0.00017349121005255847, + "loss": 2.8198, + "step": 4393 + }, + { + "epoch": 0.39808837851917284, + "grad_norm": 0.7918524742126465, + "learning_rate": 0.00017348516885156768, + "loss": 3.1106, + "step": 4394 + }, + { + "epoch": 0.3981789766936196, + "grad_norm": 0.7547590136528015, + "learning_rate": 0.00017347912765057694, + "loss": 2.9472, + "step": 4395 + }, + { + "epoch": 0.3982695748680664, + "grad_norm": 0.7651265263557434, + "learning_rate": 0.00017347308644958618, + "loss": 2.6329, + "step": 4396 + }, + { + "epoch": 0.3983601730425132, + "grad_norm": 0.725236177444458, + "learning_rate": 0.00017346704524859544, + "loss": 2.7196, + "step": 4397 + }, + { + "epoch": 0.39845077121696, + "grad_norm": 0.719731867313385, + "learning_rate": 0.00017346100404760467, + "loss": 2.8484, + "step": 4398 + }, + { + "epoch": 0.3985413693914068, + "grad_norm": 0.654388427734375, + "learning_rate": 0.0001734549628466139, + "loss": 2.1806, + "step": 4399 + }, + { + "epoch": 0.39863196756585356, + "grad_norm": 0.7472866773605347, + "learning_rate": 0.00017344892164562317, + "loss": 2.7032, + "step": 4400 + }, + { + "epoch": 0.39872256574030035, + "grad_norm": 0.7242310047149658, + "learning_rate": 0.0001734428804446324, + "loss": 2.8768, + "step": 4401 + }, + { + "epoch": 0.39881316391474714, + "grad_norm": 0.8583732843399048, + "learning_rate": 0.00017343683924364163, + "loss": 2.7516, + "step": 4402 + }, + { + "epoch": 0.3989037620891939, + "grad_norm": 0.7258987426757812, + "learning_rate": 0.00017343079804265087, + "loss": 2.5478, + "step": 4403 + }, + { + "epoch": 0.3989943602636407, + "grad_norm": 0.7207145094871521, + "learning_rate": 0.00017342475684166013, + "loss": 2.5481, + "step": 4404 + }, + { + "epoch": 0.39908495843808744, + "grad_norm": 0.7025651335716248, + "learning_rate": 0.0001734187156406694, + "loss": 2.6301, + "step": 4405 + }, + { + "epoch": 0.39917555661253423, + "grad_norm": 0.7986682057380676, + "learning_rate": 0.00017341267443967862, + "loss": 2.8263, + "step": 4406 + }, + { + "epoch": 0.399266154786981, + "grad_norm": 0.7595162391662598, + "learning_rate": 0.00017340663323868786, + "loss": 2.6928, + "step": 4407 + }, + { + "epoch": 0.3993567529614278, + "grad_norm": 0.7884128093719482, + "learning_rate": 0.0001734005920376971, + "loss": 2.9296, + "step": 4408 + }, + { + "epoch": 0.3994473511358746, + "grad_norm": 0.9103771448135376, + "learning_rate": 0.00017339455083670635, + "loss": 3.0512, + "step": 4409 + }, + { + "epoch": 0.3995379493103214, + "grad_norm": 0.7471110820770264, + "learning_rate": 0.0001733885096357156, + "loss": 2.8842, + "step": 4410 + }, + { + "epoch": 0.39962854748476817, + "grad_norm": 0.7257130742073059, + "learning_rate": 0.00017338246843472482, + "loss": 2.8494, + "step": 4411 + }, + { + "epoch": 0.39971914565921496, + "grad_norm": 0.7397736310958862, + "learning_rate": 0.00017337642723373408, + "loss": 2.8253, + "step": 4412 + }, + { + "epoch": 0.39980974383366175, + "grad_norm": 0.7062044143676758, + "learning_rate": 0.00017337038603274332, + "loss": 2.1596, + "step": 4413 + }, + { + "epoch": 0.39990034200810853, + "grad_norm": 0.7418607473373413, + "learning_rate": 0.00017336434483175258, + "loss": 2.7706, + "step": 4414 + }, + { + "epoch": 0.3999909401825553, + "grad_norm": 0.6640499234199524, + "learning_rate": 0.00017335830363076178, + "loss": 2.1606, + "step": 4415 + }, + { + "epoch": 0.4000815383570021, + "grad_norm": 0.712654173374176, + "learning_rate": 0.00017335226242977105, + "loss": 1.9096, + "step": 4416 + }, + { + "epoch": 0.4001721365314489, + "grad_norm": 0.7753866910934448, + "learning_rate": 0.00017334622122878028, + "loss": 2.6862, + "step": 4417 + }, + { + "epoch": 0.4002627347058957, + "grad_norm": 0.7921904921531677, + "learning_rate": 0.00017334018002778954, + "loss": 3.0726, + "step": 4418 + }, + { + "epoch": 0.40035333288034247, + "grad_norm": 0.7282711267471313, + "learning_rate": 0.00017333413882679878, + "loss": 2.5322, + "step": 4419 + }, + { + "epoch": 0.40044393105478926, + "grad_norm": 0.8141428232192993, + "learning_rate": 0.000173328097625808, + "loss": 2.768, + "step": 4420 + }, + { + "epoch": 0.40053452922923605, + "grad_norm": 0.7839258313179016, + "learning_rate": 0.00017332205642481727, + "loss": 2.963, + "step": 4421 + }, + { + "epoch": 0.40062512740368283, + "grad_norm": 0.7192074060440063, + "learning_rate": 0.0001733160152238265, + "loss": 3.0119, + "step": 4422 + }, + { + "epoch": 0.4007157255781296, + "grad_norm": 0.7344297766685486, + "learning_rate": 0.00017330997402283577, + "loss": 2.7137, + "step": 4423 + }, + { + "epoch": 0.4008063237525764, + "grad_norm": 0.7582270503044128, + "learning_rate": 0.00017330393282184497, + "loss": 2.996, + "step": 4424 + }, + { + "epoch": 0.4008969219270232, + "grad_norm": 0.7674827575683594, + "learning_rate": 0.00017329789162085423, + "loss": 3.2553, + "step": 4425 + }, + { + "epoch": 0.40098752010146993, + "grad_norm": 0.7426122426986694, + "learning_rate": 0.00017329185041986347, + "loss": 2.7762, + "step": 4426 + }, + { + "epoch": 0.4010781182759167, + "grad_norm": 0.7410914301872253, + "learning_rate": 0.00017328580921887273, + "loss": 2.6086, + "step": 4427 + }, + { + "epoch": 0.4011687164503635, + "grad_norm": 0.8061581254005432, + "learning_rate": 0.00017327976801788196, + "loss": 3.0641, + "step": 4428 + }, + { + "epoch": 0.4012593146248103, + "grad_norm": 0.7655908465385437, + "learning_rate": 0.0001732737268168912, + "loss": 2.7373, + "step": 4429 + }, + { + "epoch": 0.4013499127992571, + "grad_norm": 0.7331250905990601, + "learning_rate": 0.00017326768561590046, + "loss": 2.8091, + "step": 4430 + }, + { + "epoch": 0.40144051097370387, + "grad_norm": 0.7549276351928711, + "learning_rate": 0.0001732616444149097, + "loss": 3.0626, + "step": 4431 + }, + { + "epoch": 0.40153110914815066, + "grad_norm": 0.812805712223053, + "learning_rate": 0.00017325560321391893, + "loss": 2.8448, + "step": 4432 + }, + { + "epoch": 0.40162170732259744, + "grad_norm": 0.7213829755783081, + "learning_rate": 0.00017324956201292816, + "loss": 2.7224, + "step": 4433 + }, + { + "epoch": 0.40171230549704423, + "grad_norm": 0.745358943939209, + "learning_rate": 0.00017324352081193742, + "loss": 2.7982, + "step": 4434 + }, + { + "epoch": 0.401802903671491, + "grad_norm": 0.745952308177948, + "learning_rate": 0.00017323747961094668, + "loss": 2.806, + "step": 4435 + }, + { + "epoch": 0.4018935018459378, + "grad_norm": 0.7522603273391724, + "learning_rate": 0.00017323143840995592, + "loss": 2.733, + "step": 4436 + }, + { + "epoch": 0.4019841000203846, + "grad_norm": 0.7263036966323853, + "learning_rate": 0.00017322539720896515, + "loss": 2.6188, + "step": 4437 + }, + { + "epoch": 0.4020746981948314, + "grad_norm": 0.7342725396156311, + "learning_rate": 0.00017321935600797438, + "loss": 2.8686, + "step": 4438 + }, + { + "epoch": 0.40216529636927817, + "grad_norm": 0.7291907668113708, + "learning_rate": 0.00017321331480698365, + "loss": 2.6303, + "step": 4439 + }, + { + "epoch": 0.40225589454372496, + "grad_norm": 0.7354617714881897, + "learning_rate": 0.00017320727360599288, + "loss": 2.9263, + "step": 4440 + }, + { + "epoch": 0.40234649271817174, + "grad_norm": 0.8349466323852539, + "learning_rate": 0.00017320123240500211, + "loss": 2.8749, + "step": 4441 + }, + { + "epoch": 0.40243709089261853, + "grad_norm": 0.7868831753730774, + "learning_rate": 0.00017319519120401138, + "loss": 2.7517, + "step": 4442 + }, + { + "epoch": 0.4025276890670653, + "grad_norm": 0.8624632358551025, + "learning_rate": 0.0001731891500030206, + "loss": 2.9729, + "step": 4443 + }, + { + "epoch": 0.4026182872415121, + "grad_norm": 0.8767156600952148, + "learning_rate": 0.00017318310880202987, + "loss": 2.843, + "step": 4444 + }, + { + "epoch": 0.4027088854159589, + "grad_norm": 0.7871874570846558, + "learning_rate": 0.00017317706760103908, + "loss": 2.817, + "step": 4445 + }, + { + "epoch": 0.4027994835904056, + "grad_norm": 0.7753174901008606, + "learning_rate": 0.00017317102640004834, + "loss": 3.0807, + "step": 4446 + }, + { + "epoch": 0.4028900817648524, + "grad_norm": 0.8568944931030273, + "learning_rate": 0.00017316498519905757, + "loss": 2.7204, + "step": 4447 + }, + { + "epoch": 0.4029806799392992, + "grad_norm": 0.7781665325164795, + "learning_rate": 0.00017315894399806683, + "loss": 2.9491, + "step": 4448 + }, + { + "epoch": 0.403071278113746, + "grad_norm": 0.7085457444190979, + "learning_rate": 0.00017315290279707607, + "loss": 2.252, + "step": 4449 + }, + { + "epoch": 0.4031618762881928, + "grad_norm": 0.7560263872146606, + "learning_rate": 0.0001731468615960853, + "loss": 2.8462, + "step": 4450 + }, + { + "epoch": 0.40325247446263957, + "grad_norm": 0.7917683124542236, + "learning_rate": 0.00017314082039509456, + "loss": 2.7866, + "step": 4451 + }, + { + "epoch": 0.40334307263708635, + "grad_norm": 0.7722923755645752, + "learning_rate": 0.0001731347791941038, + "loss": 2.8596, + "step": 4452 + }, + { + "epoch": 0.40343367081153314, + "grad_norm": 0.7835889458656311, + "learning_rate": 0.00017312873799311303, + "loss": 3.2719, + "step": 4453 + }, + { + "epoch": 0.40352426898597993, + "grad_norm": 0.733082115650177, + "learning_rate": 0.00017312269679212227, + "loss": 2.8887, + "step": 4454 + }, + { + "epoch": 0.4036148671604267, + "grad_norm": 0.7427105903625488, + "learning_rate": 0.00017311665559113153, + "loss": 2.6362, + "step": 4455 + }, + { + "epoch": 0.4037054653348735, + "grad_norm": 0.7053208947181702, + "learning_rate": 0.00017311061439014076, + "loss": 2.1922, + "step": 4456 + }, + { + "epoch": 0.4037960635093203, + "grad_norm": 0.8060571551322937, + "learning_rate": 0.00017310457318915002, + "loss": 2.8614, + "step": 4457 + }, + { + "epoch": 0.4038866616837671, + "grad_norm": 0.8613244891166687, + "learning_rate": 0.00017309853198815926, + "loss": 2.8081, + "step": 4458 + }, + { + "epoch": 0.40397725985821387, + "grad_norm": 0.7868354320526123, + "learning_rate": 0.0001730924907871685, + "loss": 2.7914, + "step": 4459 + }, + { + "epoch": 0.40406785803266065, + "grad_norm": 0.7410007119178772, + "learning_rate": 0.00017308644958617775, + "loss": 2.5055, + "step": 4460 + }, + { + "epoch": 0.40415845620710744, + "grad_norm": 0.8046941757202148, + "learning_rate": 0.00017308040838518698, + "loss": 2.7744, + "step": 4461 + }, + { + "epoch": 0.40424905438155423, + "grad_norm": 0.7448469996452332, + "learning_rate": 0.00017307436718419622, + "loss": 2.7959, + "step": 4462 + }, + { + "epoch": 0.404339652556001, + "grad_norm": 0.7569942474365234, + "learning_rate": 0.00017306832598320545, + "loss": 2.6933, + "step": 4463 + }, + { + "epoch": 0.4044302507304478, + "grad_norm": 0.7310815453529358, + "learning_rate": 0.00017306228478221471, + "loss": 2.7551, + "step": 4464 + }, + { + "epoch": 0.4045208489048946, + "grad_norm": 0.6993508338928223, + "learning_rate": 0.00017305624358122398, + "loss": 2.6208, + "step": 4465 + }, + { + "epoch": 0.4046114470793413, + "grad_norm": 0.7568324208259583, + "learning_rate": 0.00017305020238023318, + "loss": 2.6087, + "step": 4466 + }, + { + "epoch": 0.4047020452537881, + "grad_norm": 0.7106309533119202, + "learning_rate": 0.00017304416117924244, + "loss": 2.6014, + "step": 4467 + }, + { + "epoch": 0.4047926434282349, + "grad_norm": 0.8779883980751038, + "learning_rate": 0.00017303811997825168, + "loss": 2.9194, + "step": 4468 + }, + { + "epoch": 0.4048832416026817, + "grad_norm": 0.6683615446090698, + "learning_rate": 0.00017303207877726094, + "loss": 2.1422, + "step": 4469 + }, + { + "epoch": 0.4049738397771285, + "grad_norm": 0.7306983470916748, + "learning_rate": 0.00017302603757627017, + "loss": 2.6722, + "step": 4470 + }, + { + "epoch": 0.40506443795157526, + "grad_norm": 0.7694866061210632, + "learning_rate": 0.0001730199963752794, + "loss": 2.7986, + "step": 4471 + }, + { + "epoch": 0.40515503612602205, + "grad_norm": 0.8047335743904114, + "learning_rate": 0.00017301395517428867, + "loss": 2.7556, + "step": 4472 + }, + { + "epoch": 0.40524563430046884, + "grad_norm": 0.8332940936088562, + "learning_rate": 0.0001730079139732979, + "loss": 2.8015, + "step": 4473 + }, + { + "epoch": 0.4053362324749156, + "grad_norm": 0.6180290579795837, + "learning_rate": 0.00017300187277230716, + "loss": 2.0878, + "step": 4474 + }, + { + "epoch": 0.4054268306493624, + "grad_norm": 0.7667070031166077, + "learning_rate": 0.00017299583157131637, + "loss": 3.032, + "step": 4475 + }, + { + "epoch": 0.4055174288238092, + "grad_norm": 0.7339078783988953, + "learning_rate": 0.00017298979037032563, + "loss": 2.9146, + "step": 4476 + }, + { + "epoch": 0.405608026998256, + "grad_norm": 0.7342396974563599, + "learning_rate": 0.00017298374916933487, + "loss": 2.7499, + "step": 4477 + }, + { + "epoch": 0.4056986251727028, + "grad_norm": 0.7601968050003052, + "learning_rate": 0.00017297770796834413, + "loss": 2.765, + "step": 4478 + }, + { + "epoch": 0.40578922334714956, + "grad_norm": 0.6769884824752808, + "learning_rate": 0.00017297166676735336, + "loss": 2.5625, + "step": 4479 + }, + { + "epoch": 0.40587982152159635, + "grad_norm": 0.74338698387146, + "learning_rate": 0.0001729656255663626, + "loss": 2.8248, + "step": 4480 + }, + { + "epoch": 0.40597041969604314, + "grad_norm": 0.7557222843170166, + "learning_rate": 0.00017295958436537186, + "loss": 3.1646, + "step": 4481 + }, + { + "epoch": 0.4060610178704899, + "grad_norm": 0.7029734253883362, + "learning_rate": 0.0001729535431643811, + "loss": 2.5676, + "step": 4482 + }, + { + "epoch": 0.4061516160449367, + "grad_norm": 0.7118141651153564, + "learning_rate": 0.00017294750196339032, + "loss": 2.8398, + "step": 4483 + }, + { + "epoch": 0.4062422142193835, + "grad_norm": 0.7401372790336609, + "learning_rate": 0.00017294146076239956, + "loss": 2.2047, + "step": 4484 + }, + { + "epoch": 0.4063328123938303, + "grad_norm": 0.7452130913734436, + "learning_rate": 0.00017293541956140882, + "loss": 2.7653, + "step": 4485 + }, + { + "epoch": 0.406423410568277, + "grad_norm": 0.7309182286262512, + "learning_rate": 0.00017292937836041805, + "loss": 2.95, + "step": 4486 + }, + { + "epoch": 0.4065140087427238, + "grad_norm": 0.7174546718597412, + "learning_rate": 0.00017292333715942731, + "loss": 2.8512, + "step": 4487 + }, + { + "epoch": 0.4066046069171706, + "grad_norm": 0.8134185075759888, + "learning_rate": 0.00017291729595843655, + "loss": 2.7027, + "step": 4488 + }, + { + "epoch": 0.4066952050916174, + "grad_norm": 0.7285604476928711, + "learning_rate": 0.00017291125475744578, + "loss": 2.8752, + "step": 4489 + }, + { + "epoch": 0.40678580326606417, + "grad_norm": 0.6989976167678833, + "learning_rate": 0.00017290521355645504, + "loss": 2.4854, + "step": 4490 + }, + { + "epoch": 0.40687640144051096, + "grad_norm": 0.9270228147506714, + "learning_rate": 0.00017289917235546428, + "loss": 3.0187, + "step": 4491 + }, + { + "epoch": 0.40696699961495775, + "grad_norm": 0.7277869582176208, + "learning_rate": 0.0001728931311544735, + "loss": 3.0707, + "step": 4492 + }, + { + "epoch": 0.40705759778940453, + "grad_norm": 0.7560047507286072, + "learning_rate": 0.00017288708995348275, + "loss": 2.9305, + "step": 4493 + }, + { + "epoch": 0.4071481959638513, + "grad_norm": 0.7718452215194702, + "learning_rate": 0.000172881048752492, + "loss": 2.7474, + "step": 4494 + }, + { + "epoch": 0.4072387941382981, + "grad_norm": 0.7625097632408142, + "learning_rate": 0.00017287500755150127, + "loss": 2.6739, + "step": 4495 + }, + { + "epoch": 0.4073293923127449, + "grad_norm": 0.8155550956726074, + "learning_rate": 0.00017286896635051048, + "loss": 2.8975, + "step": 4496 + }, + { + "epoch": 0.4074199904871917, + "grad_norm": 0.7571144700050354, + "learning_rate": 0.00017286292514951974, + "loss": 2.7209, + "step": 4497 + }, + { + "epoch": 0.4075105886616385, + "grad_norm": 0.757973313331604, + "learning_rate": 0.00017285688394852897, + "loss": 2.9655, + "step": 4498 + }, + { + "epoch": 0.40760118683608526, + "grad_norm": 0.7546719908714294, + "learning_rate": 0.00017285084274753823, + "loss": 2.6679, + "step": 4499 + }, + { + "epoch": 0.40769178501053205, + "grad_norm": 0.7326505184173584, + "learning_rate": 0.00017284480154654747, + "loss": 2.5809, + "step": 4500 + }, + { + "epoch": 0.40778238318497884, + "grad_norm": 0.8521066308021545, + "learning_rate": 0.0001728387603455567, + "loss": 2.7012, + "step": 4501 + }, + { + "epoch": 0.4078729813594256, + "grad_norm": 0.7517671585083008, + "learning_rate": 0.00017283271914456596, + "loss": 2.8472, + "step": 4502 + }, + { + "epoch": 0.4079635795338724, + "grad_norm": 0.7628030776977539, + "learning_rate": 0.0001728266779435752, + "loss": 2.9785, + "step": 4503 + }, + { + "epoch": 0.4080541777083192, + "grad_norm": 0.7444595694541931, + "learning_rate": 0.00017282063674258443, + "loss": 2.6748, + "step": 4504 + }, + { + "epoch": 0.408144775882766, + "grad_norm": 0.827632486820221, + "learning_rate": 0.00017281459554159366, + "loss": 2.9268, + "step": 4505 + }, + { + "epoch": 0.4082353740572128, + "grad_norm": 0.7377209663391113, + "learning_rate": 0.00017280855434060292, + "loss": 3.2066, + "step": 4506 + }, + { + "epoch": 0.4083259722316595, + "grad_norm": 0.777258038520813, + "learning_rate": 0.00017280251313961216, + "loss": 2.9411, + "step": 4507 + }, + { + "epoch": 0.4084165704061063, + "grad_norm": 0.7453844547271729, + "learning_rate": 0.00017279647193862142, + "loss": 2.755, + "step": 4508 + }, + { + "epoch": 0.4085071685805531, + "grad_norm": 0.8571553826332092, + "learning_rate": 0.00017279043073763065, + "loss": 2.7817, + "step": 4509 + }, + { + "epoch": 0.40859776675499987, + "grad_norm": 0.7734798192977905, + "learning_rate": 0.0001727843895366399, + "loss": 2.7437, + "step": 4510 + }, + { + "epoch": 0.40868836492944666, + "grad_norm": 0.8008256554603577, + "learning_rate": 0.00017277834833564915, + "loss": 2.9956, + "step": 4511 + }, + { + "epoch": 0.40877896310389344, + "grad_norm": 0.7838256359100342, + "learning_rate": 0.00017277230713465838, + "loss": 3.0931, + "step": 4512 + }, + { + "epoch": 0.40886956127834023, + "grad_norm": 0.7831577658653259, + "learning_rate": 0.00017276626593366762, + "loss": 2.9142, + "step": 4513 + }, + { + "epoch": 0.408960159452787, + "grad_norm": 0.7342298030853271, + "learning_rate": 0.00017276022473267685, + "loss": 2.6858, + "step": 4514 + }, + { + "epoch": 0.4090507576272338, + "grad_norm": 0.8063110709190369, + "learning_rate": 0.0001727541835316861, + "loss": 2.7517, + "step": 4515 + }, + { + "epoch": 0.4091413558016806, + "grad_norm": 0.7653720378875732, + "learning_rate": 0.00017274814233069535, + "loss": 2.9079, + "step": 4516 + }, + { + "epoch": 0.4092319539761274, + "grad_norm": 0.7201693058013916, + "learning_rate": 0.00017274210112970458, + "loss": 2.6376, + "step": 4517 + }, + { + "epoch": 0.40932255215057417, + "grad_norm": 0.7993066906929016, + "learning_rate": 0.00017273605992871384, + "loss": 2.5812, + "step": 4518 + }, + { + "epoch": 0.40941315032502096, + "grad_norm": 0.7340923547744751, + "learning_rate": 0.00017273001872772308, + "loss": 2.7691, + "step": 4519 + }, + { + "epoch": 0.40950374849946775, + "grad_norm": 0.7376404404640198, + "learning_rate": 0.00017272397752673234, + "loss": 3.0421, + "step": 4520 + }, + { + "epoch": 0.40959434667391453, + "grad_norm": 1.3141272068023682, + "learning_rate": 0.00017271793632574157, + "loss": 3.096, + "step": 4521 + }, + { + "epoch": 0.4096849448483613, + "grad_norm": 0.7637984156608582, + "learning_rate": 0.0001727118951247508, + "loss": 2.8642, + "step": 4522 + }, + { + "epoch": 0.4097755430228081, + "grad_norm": 0.7456836700439453, + "learning_rate": 0.00017270585392376004, + "loss": 2.7841, + "step": 4523 + }, + { + "epoch": 0.4098661411972549, + "grad_norm": 0.7905747890472412, + "learning_rate": 0.0001726998127227693, + "loss": 2.6827, + "step": 4524 + }, + { + "epoch": 0.4099567393717017, + "grad_norm": 0.7828412652015686, + "learning_rate": 0.00017269377152177853, + "loss": 2.694, + "step": 4525 + }, + { + "epoch": 0.41004733754614847, + "grad_norm": 0.7919395565986633, + "learning_rate": 0.00017268773032078777, + "loss": 3.1006, + "step": 4526 + }, + { + "epoch": 0.4101379357205952, + "grad_norm": 0.8507271409034729, + "learning_rate": 0.00017268168911979703, + "loss": 2.8968, + "step": 4527 + }, + { + "epoch": 0.410228533895042, + "grad_norm": 1.0370235443115234, + "learning_rate": 0.00017267564791880626, + "loss": 2.5681, + "step": 4528 + }, + { + "epoch": 0.4103191320694888, + "grad_norm": 0.7393503785133362, + "learning_rate": 0.00017266960671781552, + "loss": 2.8612, + "step": 4529 + }, + { + "epoch": 0.41040973024393557, + "grad_norm": 0.7815709114074707, + "learning_rate": 0.00017266356551682473, + "loss": 2.8194, + "step": 4530 + }, + { + "epoch": 0.41050032841838235, + "grad_norm": 0.8723633885383606, + "learning_rate": 0.000172657524315834, + "loss": 2.7333, + "step": 4531 + }, + { + "epoch": 0.41059092659282914, + "grad_norm": 0.7602224349975586, + "learning_rate": 0.00017265148311484325, + "loss": 2.7627, + "step": 4532 + }, + { + "epoch": 0.41068152476727593, + "grad_norm": 0.781862199306488, + "learning_rate": 0.0001726454419138525, + "loss": 2.9224, + "step": 4533 + }, + { + "epoch": 0.4107721229417227, + "grad_norm": 0.667366087436676, + "learning_rate": 0.00017263940071286172, + "loss": 2.2409, + "step": 4534 + }, + { + "epoch": 0.4108627211161695, + "grad_norm": 0.7355814576148987, + "learning_rate": 0.00017263335951187096, + "loss": 2.9883, + "step": 4535 + }, + { + "epoch": 0.4109533192906163, + "grad_norm": 0.854322075843811, + "learning_rate": 0.00017262731831088022, + "loss": 2.7811, + "step": 4536 + }, + { + "epoch": 0.4110439174650631, + "grad_norm": 0.5581184029579163, + "learning_rate": 0.00017262127710988945, + "loss": 1.408, + "step": 4537 + }, + { + "epoch": 0.41113451563950987, + "grad_norm": 0.7792844772338867, + "learning_rate": 0.00017261523590889868, + "loss": 3.0891, + "step": 4538 + }, + { + "epoch": 0.41122511381395666, + "grad_norm": 0.6448352336883545, + "learning_rate": 0.00017260919470790795, + "loss": 2.1634, + "step": 4539 + }, + { + "epoch": 0.41131571198840344, + "grad_norm": 0.7635291218757629, + "learning_rate": 0.00017260315350691718, + "loss": 2.9204, + "step": 4540 + }, + { + "epoch": 0.41140631016285023, + "grad_norm": 0.6104177832603455, + "learning_rate": 0.00017259711230592644, + "loss": 1.923, + "step": 4541 + }, + { + "epoch": 0.411496908337297, + "grad_norm": 0.7098910212516785, + "learning_rate": 0.00017259107110493568, + "loss": 2.6885, + "step": 4542 + }, + { + "epoch": 0.4115875065117438, + "grad_norm": 0.7385285496711731, + "learning_rate": 0.0001725850299039449, + "loss": 2.8348, + "step": 4543 + }, + { + "epoch": 0.4116781046861906, + "grad_norm": 0.6702491044998169, + "learning_rate": 0.00017257898870295414, + "loss": 2.2023, + "step": 4544 + }, + { + "epoch": 0.4117687028606374, + "grad_norm": 0.7437846064567566, + "learning_rate": 0.0001725729475019634, + "loss": 2.8884, + "step": 4545 + }, + { + "epoch": 0.41185930103508417, + "grad_norm": 0.7065202593803406, + "learning_rate": 0.00017256690630097264, + "loss": 2.8132, + "step": 4546 + }, + { + "epoch": 0.4119498992095309, + "grad_norm": 0.7119550108909607, + "learning_rate": 0.00017256086509998187, + "loss": 2.9869, + "step": 4547 + }, + { + "epoch": 0.4120404973839777, + "grad_norm": 0.7592363357543945, + "learning_rate": 0.00017255482389899113, + "loss": 3.2412, + "step": 4548 + }, + { + "epoch": 0.4121310955584245, + "grad_norm": 0.7489522099494934, + "learning_rate": 0.00017254878269800037, + "loss": 2.7871, + "step": 4549 + }, + { + "epoch": 0.41222169373287126, + "grad_norm": 0.8171847462654114, + "learning_rate": 0.00017254274149700963, + "loss": 2.6979, + "step": 4550 + }, + { + "epoch": 0.41231229190731805, + "grad_norm": 0.6876449584960938, + "learning_rate": 0.00017253670029601886, + "loss": 2.7052, + "step": 4551 + }, + { + "epoch": 0.41240289008176484, + "grad_norm": 0.6834642887115479, + "learning_rate": 0.0001725306590950281, + "loss": 2.5394, + "step": 4552 + }, + { + "epoch": 0.4124934882562116, + "grad_norm": 0.7197080850601196, + "learning_rate": 0.00017252461789403733, + "loss": 2.2257, + "step": 4553 + }, + { + "epoch": 0.4125840864306584, + "grad_norm": 0.7681770920753479, + "learning_rate": 0.0001725185766930466, + "loss": 2.8652, + "step": 4554 + }, + { + "epoch": 0.4126746846051052, + "grad_norm": 0.7567527890205383, + "learning_rate": 0.00017251253549205583, + "loss": 2.6503, + "step": 4555 + }, + { + "epoch": 0.412765282779552, + "grad_norm": 0.7815245985984802, + "learning_rate": 0.00017250649429106506, + "loss": 2.9126, + "step": 4556 + }, + { + "epoch": 0.4128558809539988, + "grad_norm": 0.7507094144821167, + "learning_rate": 0.00017250045309007432, + "loss": 2.918, + "step": 4557 + }, + { + "epoch": 0.41294647912844556, + "grad_norm": 0.8496657013893127, + "learning_rate": 0.00017249441188908356, + "loss": 2.7448, + "step": 4558 + }, + { + "epoch": 0.41303707730289235, + "grad_norm": 0.7581692337989807, + "learning_rate": 0.00017248837068809282, + "loss": 2.7966, + "step": 4559 + }, + { + "epoch": 0.41312767547733914, + "grad_norm": 0.8272373676300049, + "learning_rate": 0.00017248232948710202, + "loss": 2.982, + "step": 4560 + }, + { + "epoch": 0.41321827365178593, + "grad_norm": 0.7859890460968018, + "learning_rate": 0.00017247628828611128, + "loss": 3.0427, + "step": 4561 + }, + { + "epoch": 0.4133088718262327, + "grad_norm": 0.8757119178771973, + "learning_rate": 0.00017247024708512055, + "loss": 2.9888, + "step": 4562 + }, + { + "epoch": 0.4133994700006795, + "grad_norm": 0.7502010464668274, + "learning_rate": 0.00017246420588412978, + "loss": 2.8538, + "step": 4563 + }, + { + "epoch": 0.4134900681751263, + "grad_norm": 0.7777090072631836, + "learning_rate": 0.00017245816468313901, + "loss": 2.7264, + "step": 4564 + }, + { + "epoch": 0.4135806663495731, + "grad_norm": 0.7438626289367676, + "learning_rate": 0.00017245212348214825, + "loss": 2.8841, + "step": 4565 + }, + { + "epoch": 0.41367126452401987, + "grad_norm": 0.796726644039154, + "learning_rate": 0.0001724460822811575, + "loss": 2.7588, + "step": 4566 + }, + { + "epoch": 0.4137618626984666, + "grad_norm": 0.7754446864128113, + "learning_rate": 0.00017244004108016674, + "loss": 2.9022, + "step": 4567 + }, + { + "epoch": 0.4138524608729134, + "grad_norm": 0.7556686401367188, + "learning_rate": 0.00017243399987917598, + "loss": 2.783, + "step": 4568 + }, + { + "epoch": 0.4139430590473602, + "grad_norm": 0.7288707494735718, + "learning_rate": 0.00017242795867818524, + "loss": 2.7087, + "step": 4569 + }, + { + "epoch": 0.41403365722180696, + "grad_norm": 0.7421033382415771, + "learning_rate": 0.00017242191747719447, + "loss": 2.912, + "step": 4570 + }, + { + "epoch": 0.41412425539625375, + "grad_norm": 0.6807780265808105, + "learning_rate": 0.00017241587627620373, + "loss": 2.0121, + "step": 4571 + }, + { + "epoch": 0.41421485357070054, + "grad_norm": 0.7200636267662048, + "learning_rate": 0.00017240983507521297, + "loss": 2.86, + "step": 4572 + }, + { + "epoch": 0.4143054517451473, + "grad_norm": 0.8438485860824585, + "learning_rate": 0.0001724037938742222, + "loss": 2.8078, + "step": 4573 + }, + { + "epoch": 0.4143960499195941, + "grad_norm": 0.6747097969055176, + "learning_rate": 0.00017239775267323144, + "loss": 2.0667, + "step": 4574 + }, + { + "epoch": 0.4144866480940409, + "grad_norm": 0.7357356548309326, + "learning_rate": 0.0001723917114722407, + "loss": 2.7956, + "step": 4575 + }, + { + "epoch": 0.4145772462684877, + "grad_norm": 0.7485372424125671, + "learning_rate": 0.00017238567027124993, + "loss": 2.8162, + "step": 4576 + }, + { + "epoch": 0.4146678444429345, + "grad_norm": 0.7314936518669128, + "learning_rate": 0.00017237962907025917, + "loss": 2.7485, + "step": 4577 + }, + { + "epoch": 0.41475844261738126, + "grad_norm": 0.8312885165214539, + "learning_rate": 0.00017237358786926843, + "loss": 2.7863, + "step": 4578 + }, + { + "epoch": 0.41484904079182805, + "grad_norm": 0.6692301630973816, + "learning_rate": 0.00017236754666827766, + "loss": 2.2158, + "step": 4579 + }, + { + "epoch": 0.41493963896627484, + "grad_norm": 0.7798373699188232, + "learning_rate": 0.00017236150546728692, + "loss": 2.1977, + "step": 4580 + }, + { + "epoch": 0.4150302371407216, + "grad_norm": 0.8151112198829651, + "learning_rate": 0.00017235546426629613, + "loss": 3.3212, + "step": 4581 + }, + { + "epoch": 0.4151208353151684, + "grad_norm": 0.7858757376670837, + "learning_rate": 0.0001723494230653054, + "loss": 2.8599, + "step": 4582 + }, + { + "epoch": 0.4152114334896152, + "grad_norm": 0.7620429992675781, + "learning_rate": 0.00017234338186431462, + "loss": 2.7386, + "step": 4583 + }, + { + "epoch": 0.415302031664062, + "grad_norm": 0.7610493898391724, + "learning_rate": 0.00017233734066332388, + "loss": 3.0471, + "step": 4584 + }, + { + "epoch": 0.4153926298385088, + "grad_norm": 0.7834547758102417, + "learning_rate": 0.00017233129946233312, + "loss": 2.8848, + "step": 4585 + }, + { + "epoch": 0.41548322801295556, + "grad_norm": 0.803360641002655, + "learning_rate": 0.00017232525826134235, + "loss": 2.7377, + "step": 4586 + }, + { + "epoch": 0.41557382618740235, + "grad_norm": 0.6961442232131958, + "learning_rate": 0.00017231921706035161, + "loss": 1.9884, + "step": 4587 + }, + { + "epoch": 0.4156644243618491, + "grad_norm": 0.7384611964225769, + "learning_rate": 0.00017231317585936085, + "loss": 3.0378, + "step": 4588 + }, + { + "epoch": 0.41575502253629587, + "grad_norm": 0.7555893659591675, + "learning_rate": 0.00017230713465837008, + "loss": 2.7671, + "step": 4589 + }, + { + "epoch": 0.41584562071074266, + "grad_norm": 0.7333597540855408, + "learning_rate": 0.00017230109345737932, + "loss": 2.6795, + "step": 4590 + }, + { + "epoch": 0.41593621888518945, + "grad_norm": 0.6450008153915405, + "learning_rate": 0.00017229505225638858, + "loss": 1.9299, + "step": 4591 + }, + { + "epoch": 0.41602681705963623, + "grad_norm": 0.7242477536201477, + "learning_rate": 0.00017228901105539784, + "loss": 2.7235, + "step": 4592 + }, + { + "epoch": 0.416117415234083, + "grad_norm": 0.7264975905418396, + "learning_rate": 0.00017228296985440707, + "loss": 3.008, + "step": 4593 + }, + { + "epoch": 0.4162080134085298, + "grad_norm": 0.7963326573371887, + "learning_rate": 0.0001722769286534163, + "loss": 2.9063, + "step": 4594 + }, + { + "epoch": 0.4162986115829766, + "grad_norm": 0.7490614652633667, + "learning_rate": 0.00017227088745242554, + "loss": 2.6869, + "step": 4595 + }, + { + "epoch": 0.4163892097574234, + "grad_norm": 0.7564235925674438, + "learning_rate": 0.0001722648462514348, + "loss": 2.8002, + "step": 4596 + }, + { + "epoch": 0.41647980793187017, + "grad_norm": 0.7606903314590454, + "learning_rate": 0.00017225880505044404, + "loss": 2.7107, + "step": 4597 + }, + { + "epoch": 0.41657040610631696, + "grad_norm": 0.7740763425827026, + "learning_rate": 0.00017225276384945327, + "loss": 1.9995, + "step": 4598 + }, + { + "epoch": 0.41666100428076375, + "grad_norm": 0.7559170126914978, + "learning_rate": 0.00017224672264846253, + "loss": 2.7706, + "step": 4599 + }, + { + "epoch": 0.41675160245521053, + "grad_norm": 0.7672294974327087, + "learning_rate": 0.00017224068144747177, + "loss": 2.9496, + "step": 4600 + }, + { + "epoch": 0.4168422006296573, + "grad_norm": 0.7743618488311768, + "learning_rate": 0.00017223464024648103, + "loss": 3.0989, + "step": 4601 + }, + { + "epoch": 0.4169327988041041, + "grad_norm": 0.7586788535118103, + "learning_rate": 0.00017222859904549023, + "loss": 2.796, + "step": 4602 + }, + { + "epoch": 0.4170233969785509, + "grad_norm": 0.780360221862793, + "learning_rate": 0.0001722225578444995, + "loss": 2.8336, + "step": 4603 + }, + { + "epoch": 0.4171139951529977, + "grad_norm": 0.7534027099609375, + "learning_rate": 0.00017221651664350873, + "loss": 2.8337, + "step": 4604 + }, + { + "epoch": 0.4172045933274445, + "grad_norm": 0.7788129448890686, + "learning_rate": 0.000172210475442518, + "loss": 3.0251, + "step": 4605 + }, + { + "epoch": 0.41729519150189126, + "grad_norm": 0.7324131727218628, + "learning_rate": 0.00017220443424152722, + "loss": 2.7563, + "step": 4606 + }, + { + "epoch": 0.41738578967633805, + "grad_norm": 0.7667083740234375, + "learning_rate": 0.00017219839304053646, + "loss": 2.7907, + "step": 4607 + }, + { + "epoch": 0.4174763878507848, + "grad_norm": 0.7509970664978027, + "learning_rate": 0.00017219235183954572, + "loss": 2.6267, + "step": 4608 + }, + { + "epoch": 0.41756698602523157, + "grad_norm": 0.8070818185806274, + "learning_rate": 0.00017218631063855495, + "loss": 2.7065, + "step": 4609 + }, + { + "epoch": 0.41765758419967836, + "grad_norm": 0.7420274019241333, + "learning_rate": 0.00017218026943756421, + "loss": 2.7497, + "step": 4610 + }, + { + "epoch": 0.41774818237412514, + "grad_norm": 0.8071148991584778, + "learning_rate": 0.00017217422823657342, + "loss": 2.9832, + "step": 4611 + }, + { + "epoch": 0.41783878054857193, + "grad_norm": 0.8442020416259766, + "learning_rate": 0.00017216818703558268, + "loss": 2.8658, + "step": 4612 + }, + { + "epoch": 0.4179293787230187, + "grad_norm": 0.7366212010383606, + "learning_rate": 0.00017216214583459192, + "loss": 2.7197, + "step": 4613 + }, + { + "epoch": 0.4180199768974655, + "grad_norm": 0.8322017788887024, + "learning_rate": 0.00017215610463360118, + "loss": 2.7876, + "step": 4614 + }, + { + "epoch": 0.4181105750719123, + "grad_norm": 0.7562809586524963, + "learning_rate": 0.0001721500634326104, + "loss": 2.5813, + "step": 4615 + }, + { + "epoch": 0.4182011732463591, + "grad_norm": 0.6178218126296997, + "learning_rate": 0.00017214402223161965, + "loss": 2.1994, + "step": 4616 + }, + { + "epoch": 0.41829177142080587, + "grad_norm": 0.7357496023178101, + "learning_rate": 0.0001721379810306289, + "loss": 2.1447, + "step": 4617 + }, + { + "epoch": 0.41838236959525266, + "grad_norm": 0.7253264784812927, + "learning_rate": 0.00017213193982963814, + "loss": 2.7231, + "step": 4618 + }, + { + "epoch": 0.41847296776969944, + "grad_norm": 0.7458580136299133, + "learning_rate": 0.00017212589862864737, + "loss": 2.9814, + "step": 4619 + }, + { + "epoch": 0.41856356594414623, + "grad_norm": 0.7652587890625, + "learning_rate": 0.0001721198574276566, + "loss": 2.8059, + "step": 4620 + }, + { + "epoch": 0.418654164118593, + "grad_norm": 0.6478111743927002, + "learning_rate": 0.00017211381622666587, + "loss": 1.97, + "step": 4621 + }, + { + "epoch": 0.4187447622930398, + "grad_norm": 0.7677211761474609, + "learning_rate": 0.00017210777502567513, + "loss": 2.9603, + "step": 4622 + }, + { + "epoch": 0.4188353604674866, + "grad_norm": 0.7630226612091064, + "learning_rate": 0.00017210173382468437, + "loss": 2.9334, + "step": 4623 + }, + { + "epoch": 0.4189259586419334, + "grad_norm": 0.7631981372833252, + "learning_rate": 0.0001720956926236936, + "loss": 2.584, + "step": 4624 + }, + { + "epoch": 0.41901655681638017, + "grad_norm": 0.6707683801651001, + "learning_rate": 0.00017208965142270283, + "loss": 2.0131, + "step": 4625 + }, + { + "epoch": 0.41910715499082696, + "grad_norm": 0.7469755411148071, + "learning_rate": 0.0001720836102217121, + "loss": 2.7327, + "step": 4626 + }, + { + "epoch": 0.41919775316527375, + "grad_norm": 0.8283634781837463, + "learning_rate": 0.00017207756902072133, + "loss": 2.7836, + "step": 4627 + }, + { + "epoch": 0.4192883513397205, + "grad_norm": 0.7640558481216431, + "learning_rate": 0.00017207152781973056, + "loss": 2.8093, + "step": 4628 + }, + { + "epoch": 0.41937894951416727, + "grad_norm": 0.715083658695221, + "learning_rate": 0.00017206548661873982, + "loss": 2.8669, + "step": 4629 + }, + { + "epoch": 0.41946954768861405, + "grad_norm": 0.6934821605682373, + "learning_rate": 0.00017205944541774906, + "loss": 2.1417, + "step": 4630 + }, + { + "epoch": 0.41956014586306084, + "grad_norm": 0.7744913101196289, + "learning_rate": 0.00017205340421675832, + "loss": 2.91, + "step": 4631 + }, + { + "epoch": 0.41965074403750763, + "grad_norm": 0.7208598852157593, + "learning_rate": 0.00017204736301576753, + "loss": 3.0625, + "step": 4632 + }, + { + "epoch": 0.4197413422119544, + "grad_norm": 0.780890703201294, + "learning_rate": 0.0001720413218147768, + "loss": 3.0689, + "step": 4633 + }, + { + "epoch": 0.4198319403864012, + "grad_norm": 0.7583006024360657, + "learning_rate": 0.00017203528061378602, + "loss": 2.6738, + "step": 4634 + }, + { + "epoch": 0.419922538560848, + "grad_norm": 0.7778444290161133, + "learning_rate": 0.00017202923941279528, + "loss": 2.6721, + "step": 4635 + }, + { + "epoch": 0.4200131367352948, + "grad_norm": 0.7263758778572083, + "learning_rate": 0.00017202319821180452, + "loss": 2.9123, + "step": 4636 + }, + { + "epoch": 0.42010373490974157, + "grad_norm": 0.7879968881607056, + "learning_rate": 0.00017201715701081375, + "loss": 2.5839, + "step": 4637 + }, + { + "epoch": 0.42019433308418835, + "grad_norm": 0.755537748336792, + "learning_rate": 0.000172011115809823, + "loss": 2.73, + "step": 4638 + }, + { + "epoch": 0.42028493125863514, + "grad_norm": 0.8064736723899841, + "learning_rate": 0.00017200507460883225, + "loss": 2.962, + "step": 4639 + }, + { + "epoch": 0.42037552943308193, + "grad_norm": 0.6867890357971191, + "learning_rate": 0.00017199903340784148, + "loss": 1.9899, + "step": 4640 + }, + { + "epoch": 0.4204661276075287, + "grad_norm": 0.6784562468528748, + "learning_rate": 0.00017199299220685071, + "loss": 2.2852, + "step": 4641 + }, + { + "epoch": 0.4205567257819755, + "grad_norm": 0.7415741086006165, + "learning_rate": 0.00017198695100585997, + "loss": 2.9029, + "step": 4642 + }, + { + "epoch": 0.4206473239564223, + "grad_norm": 0.7345883250236511, + "learning_rate": 0.0001719809098048692, + "loss": 2.7468, + "step": 4643 + }, + { + "epoch": 0.4207379221308691, + "grad_norm": 0.7599415183067322, + "learning_rate": 0.00017197486860387847, + "loss": 2.6899, + "step": 4644 + }, + { + "epoch": 0.42082852030531587, + "grad_norm": 0.8387414216995239, + "learning_rate": 0.0001719688274028877, + "loss": 2.3801, + "step": 4645 + }, + { + "epoch": 0.42091911847976266, + "grad_norm": 0.8220201134681702, + "learning_rate": 0.00017196278620189694, + "loss": 3.3113, + "step": 4646 + }, + { + "epoch": 0.42100971665420944, + "grad_norm": 0.6789528727531433, + "learning_rate": 0.0001719567450009062, + "loss": 2.1139, + "step": 4647 + }, + { + "epoch": 0.42110031482865623, + "grad_norm": 0.8097148537635803, + "learning_rate": 0.00017195070379991543, + "loss": 2.8806, + "step": 4648 + }, + { + "epoch": 0.42119091300310296, + "grad_norm": 0.7309190034866333, + "learning_rate": 0.00017194466259892467, + "loss": 2.6468, + "step": 4649 + }, + { + "epoch": 0.42128151117754975, + "grad_norm": 0.8013827204704285, + "learning_rate": 0.0001719386213979339, + "loss": 2.7793, + "step": 4650 + }, + { + "epoch": 0.42137210935199654, + "grad_norm": 0.7739374041557312, + "learning_rate": 0.00017193258019694316, + "loss": 2.8957, + "step": 4651 + }, + { + "epoch": 0.4214627075264433, + "grad_norm": 0.7721558809280396, + "learning_rate": 0.00017192653899595242, + "loss": 2.9253, + "step": 4652 + }, + { + "epoch": 0.4215533057008901, + "grad_norm": 0.7686715722084045, + "learning_rate": 0.00017192049779496163, + "loss": 2.6262, + "step": 4653 + }, + { + "epoch": 0.4216439038753369, + "grad_norm": 0.7865695953369141, + "learning_rate": 0.0001719144565939709, + "loss": 2.6759, + "step": 4654 + }, + { + "epoch": 0.4217345020497837, + "grad_norm": 0.7142041921615601, + "learning_rate": 0.00017190841539298013, + "loss": 2.6194, + "step": 4655 + }, + { + "epoch": 0.4218251002242305, + "grad_norm": 0.7939291596412659, + "learning_rate": 0.0001719023741919894, + "loss": 2.7639, + "step": 4656 + }, + { + "epoch": 0.42191569839867726, + "grad_norm": 0.7127162218093872, + "learning_rate": 0.00017189633299099862, + "loss": 2.7679, + "step": 4657 + }, + { + "epoch": 0.42200629657312405, + "grad_norm": 0.9165099263191223, + "learning_rate": 0.00017189029179000786, + "loss": 2.6656, + "step": 4658 + }, + { + "epoch": 0.42209689474757084, + "grad_norm": 0.7185727953910828, + "learning_rate": 0.00017188425058901712, + "loss": 2.7684, + "step": 4659 + }, + { + "epoch": 0.4221874929220176, + "grad_norm": 0.7989501953125, + "learning_rate": 0.00017187820938802635, + "loss": 3.1298, + "step": 4660 + }, + { + "epoch": 0.4222780910964644, + "grad_norm": 0.7134194374084473, + "learning_rate": 0.0001718721681870356, + "loss": 2.8665, + "step": 4661 + }, + { + "epoch": 0.4223686892709112, + "grad_norm": 0.7309550642967224, + "learning_rate": 0.00017186612698604482, + "loss": 2.6946, + "step": 4662 + }, + { + "epoch": 0.422459287445358, + "grad_norm": 0.7561303377151489, + "learning_rate": 0.00017186008578505408, + "loss": 2.8645, + "step": 4663 + }, + { + "epoch": 0.4225498856198048, + "grad_norm": 0.7065045833587646, + "learning_rate": 0.00017185404458406331, + "loss": 2.4972, + "step": 4664 + }, + { + "epoch": 0.42264048379425156, + "grad_norm": 0.8059767484664917, + "learning_rate": 0.00017184800338307258, + "loss": 2.8864, + "step": 4665 + }, + { + "epoch": 0.42273108196869835, + "grad_norm": 0.7170342206954956, + "learning_rate": 0.0001718419621820818, + "loss": 2.9174, + "step": 4666 + }, + { + "epoch": 0.42282168014314514, + "grad_norm": 0.7831270694732666, + "learning_rate": 0.00017183592098109104, + "loss": 3.1299, + "step": 4667 + }, + { + "epoch": 0.4229122783175919, + "grad_norm": 0.7887386083602905, + "learning_rate": 0.0001718298797801003, + "loss": 2.8487, + "step": 4668 + }, + { + "epoch": 0.42300287649203866, + "grad_norm": 0.776387631893158, + "learning_rate": 0.00017182383857910954, + "loss": 2.7565, + "step": 4669 + }, + { + "epoch": 0.42309347466648545, + "grad_norm": 0.7270143628120422, + "learning_rate": 0.00017181779737811877, + "loss": 3.1244, + "step": 4670 + }, + { + "epoch": 0.42318407284093223, + "grad_norm": 0.7322832942008972, + "learning_rate": 0.000171811756177128, + "loss": 2.6569, + "step": 4671 + }, + { + "epoch": 0.423274671015379, + "grad_norm": 0.7482110857963562, + "learning_rate": 0.00017180571497613727, + "loss": 2.9478, + "step": 4672 + }, + { + "epoch": 0.4233652691898258, + "grad_norm": 0.738477885723114, + "learning_rate": 0.0001717996737751465, + "loss": 2.8306, + "step": 4673 + }, + { + "epoch": 0.4234558673642726, + "grad_norm": 0.7360974550247192, + "learning_rate": 0.00017179363257415576, + "loss": 2.5448, + "step": 4674 + }, + { + "epoch": 0.4235464655387194, + "grad_norm": 0.8099694848060608, + "learning_rate": 0.000171787591373165, + "loss": 2.5128, + "step": 4675 + }, + { + "epoch": 0.4236370637131662, + "grad_norm": 0.7448831796646118, + "learning_rate": 0.00017178155017217423, + "loss": 2.6022, + "step": 4676 + }, + { + "epoch": 0.42372766188761296, + "grad_norm": 0.7810489535331726, + "learning_rate": 0.0001717755089711835, + "loss": 3.0585, + "step": 4677 + }, + { + "epoch": 0.42381826006205975, + "grad_norm": 0.6510171294212341, + "learning_rate": 0.00017176946777019273, + "loss": 2.2928, + "step": 4678 + }, + { + "epoch": 0.42390885823650654, + "grad_norm": 0.7682634592056274, + "learning_rate": 0.00017176342656920196, + "loss": 3.0017, + "step": 4679 + }, + { + "epoch": 0.4239994564109533, + "grad_norm": 0.7768130898475647, + "learning_rate": 0.0001717573853682112, + "loss": 2.9368, + "step": 4680 + }, + { + "epoch": 0.4240900545854001, + "grad_norm": 0.7523239254951477, + "learning_rate": 0.00017175134416722046, + "loss": 3.0051, + "step": 4681 + }, + { + "epoch": 0.4241806527598469, + "grad_norm": 0.7734317779541016, + "learning_rate": 0.00017174530296622972, + "loss": 2.797, + "step": 4682 + }, + { + "epoch": 0.4242712509342937, + "grad_norm": 0.7688535451889038, + "learning_rate": 0.00017173926176523892, + "loss": 2.9599, + "step": 4683 + }, + { + "epoch": 0.4243618491087405, + "grad_norm": 0.828260600566864, + "learning_rate": 0.00017173322056424818, + "loss": 2.9271, + "step": 4684 + }, + { + "epoch": 0.42445244728318726, + "grad_norm": 0.7822055816650391, + "learning_rate": 0.00017172717936325742, + "loss": 2.7328, + "step": 4685 + }, + { + "epoch": 0.42454304545763405, + "grad_norm": 0.7578351497650146, + "learning_rate": 0.00017172113816226668, + "loss": 2.7905, + "step": 4686 + }, + { + "epoch": 0.42463364363208084, + "grad_norm": 0.7565810680389404, + "learning_rate": 0.00017171509696127591, + "loss": 2.6961, + "step": 4687 + }, + { + "epoch": 0.4247242418065276, + "grad_norm": 0.8273733854293823, + "learning_rate": 0.00017170905576028515, + "loss": 3.1368, + "step": 4688 + }, + { + "epoch": 0.42481483998097436, + "grad_norm": 0.6999607682228088, + "learning_rate": 0.0001717030145592944, + "loss": 1.9859, + "step": 4689 + }, + { + "epoch": 0.42490543815542114, + "grad_norm": 0.8322455883026123, + "learning_rate": 0.00017169697335830364, + "loss": 2.7235, + "step": 4690 + }, + { + "epoch": 0.42499603632986793, + "grad_norm": 0.7716969847679138, + "learning_rate": 0.00017169093215731288, + "loss": 3.0209, + "step": 4691 + }, + { + "epoch": 0.4250866345043147, + "grad_norm": 0.8271007537841797, + "learning_rate": 0.0001716848909563221, + "loss": 2.963, + "step": 4692 + }, + { + "epoch": 0.4251772326787615, + "grad_norm": 0.7500782608985901, + "learning_rate": 0.00017167884975533137, + "loss": 2.7968, + "step": 4693 + }, + { + "epoch": 0.4252678308532083, + "grad_norm": 0.7960312366485596, + "learning_rate": 0.0001716728085543406, + "loss": 2.8427, + "step": 4694 + }, + { + "epoch": 0.4253584290276551, + "grad_norm": 0.7106608152389526, + "learning_rate": 0.00017166676735334987, + "loss": 2.6495, + "step": 4695 + }, + { + "epoch": 0.42544902720210187, + "grad_norm": 0.7220978140830994, + "learning_rate": 0.0001716607261523591, + "loss": 2.7777, + "step": 4696 + }, + { + "epoch": 0.42553962537654866, + "grad_norm": 0.7283453345298767, + "learning_rate": 0.00017165468495136834, + "loss": 2.8795, + "step": 4697 + }, + { + "epoch": 0.42563022355099545, + "grad_norm": 0.7564776539802551, + "learning_rate": 0.0001716486437503776, + "loss": 2.8284, + "step": 4698 + }, + { + "epoch": 0.42572082172544223, + "grad_norm": 0.7770514488220215, + "learning_rate": 0.00017164260254938683, + "loss": 2.9058, + "step": 4699 + }, + { + "epoch": 0.425811419899889, + "grad_norm": 0.7403556108474731, + "learning_rate": 0.00017163656134839607, + "loss": 2.6876, + "step": 4700 + }, + { + "epoch": 0.4259020180743358, + "grad_norm": 0.7402160167694092, + "learning_rate": 0.0001716305201474053, + "loss": 3.095, + "step": 4701 + }, + { + "epoch": 0.4259926162487826, + "grad_norm": 0.7691416144371033, + "learning_rate": 0.00017162447894641456, + "loss": 2.8618, + "step": 4702 + }, + { + "epoch": 0.4260832144232294, + "grad_norm": 0.7922854423522949, + "learning_rate": 0.0001716184377454238, + "loss": 2.8293, + "step": 4703 + }, + { + "epoch": 0.42617381259767617, + "grad_norm": 0.790333092212677, + "learning_rate": 0.00017161239654443303, + "loss": 2.8446, + "step": 4704 + }, + { + "epoch": 0.42626441077212296, + "grad_norm": 0.8005086779594421, + "learning_rate": 0.0001716063553434423, + "loss": 2.9028, + "step": 4705 + }, + { + "epoch": 0.42635500894656975, + "grad_norm": 0.7452769875526428, + "learning_rate": 0.00017160031414245152, + "loss": 2.5593, + "step": 4706 + }, + { + "epoch": 0.42644560712101653, + "grad_norm": 0.8191087245941162, + "learning_rate": 0.00017159427294146078, + "loss": 2.7069, + "step": 4707 + }, + { + "epoch": 0.4265362052954633, + "grad_norm": 0.8793687224388123, + "learning_rate": 0.00017158823174047002, + "loss": 2.7319, + "step": 4708 + }, + { + "epoch": 0.42662680346991005, + "grad_norm": 0.7350861430168152, + "learning_rate": 0.00017158219053947925, + "loss": 2.5603, + "step": 4709 + }, + { + "epoch": 0.42671740164435684, + "grad_norm": 0.8061259388923645, + "learning_rate": 0.0001715761493384885, + "loss": 2.815, + "step": 4710 + }, + { + "epoch": 0.42680799981880363, + "grad_norm": 0.7484001517295837, + "learning_rate": 0.00017157010813749775, + "loss": 2.8876, + "step": 4711 + }, + { + "epoch": 0.4268985979932504, + "grad_norm": 0.7869861125946045, + "learning_rate": 0.00017156406693650698, + "loss": 2.7511, + "step": 4712 + }, + { + "epoch": 0.4269891961676972, + "grad_norm": 0.7318457365036011, + "learning_rate": 0.00017155802573551622, + "loss": 2.779, + "step": 4713 + }, + { + "epoch": 0.427079794342144, + "grad_norm": 0.8621801137924194, + "learning_rate": 0.00017155198453452548, + "loss": 2.2253, + "step": 4714 + }, + { + "epoch": 0.4271703925165908, + "grad_norm": 0.7150713205337524, + "learning_rate": 0.0001715459433335347, + "loss": 2.6636, + "step": 4715 + }, + { + "epoch": 0.42726099069103757, + "grad_norm": 0.8060650825500488, + "learning_rate": 0.00017153990213254397, + "loss": 2.9466, + "step": 4716 + }, + { + "epoch": 0.42735158886548436, + "grad_norm": 0.8132819533348083, + "learning_rate": 0.00017153386093155318, + "loss": 2.7434, + "step": 4717 + }, + { + "epoch": 0.42744218703993114, + "grad_norm": 0.6976996660232544, + "learning_rate": 0.00017152781973056244, + "loss": 2.5103, + "step": 4718 + }, + { + "epoch": 0.42753278521437793, + "grad_norm": 0.734905481338501, + "learning_rate": 0.0001715217785295717, + "loss": 2.5656, + "step": 4719 + }, + { + "epoch": 0.4276233833888247, + "grad_norm": 0.7689043283462524, + "learning_rate": 0.00017151573732858094, + "loss": 2.9034, + "step": 4720 + }, + { + "epoch": 0.4277139815632715, + "grad_norm": 0.7429293990135193, + "learning_rate": 0.00017150969612759017, + "loss": 2.8268, + "step": 4721 + }, + { + "epoch": 0.4278045797377183, + "grad_norm": 0.8348180651664734, + "learning_rate": 0.0001715036549265994, + "loss": 2.7643, + "step": 4722 + }, + { + "epoch": 0.4278951779121651, + "grad_norm": 0.9614596366882324, + "learning_rate": 0.00017149761372560867, + "loss": 3.1491, + "step": 4723 + }, + { + "epoch": 0.42798577608661187, + "grad_norm": 0.8264265656471252, + "learning_rate": 0.0001714915725246179, + "loss": 2.7563, + "step": 4724 + }, + { + "epoch": 0.42807637426105866, + "grad_norm": 0.8091937303543091, + "learning_rate": 0.00017148553132362713, + "loss": 3.0186, + "step": 4725 + }, + { + "epoch": 0.42816697243550544, + "grad_norm": 0.8153073787689209, + "learning_rate": 0.0001714794901226364, + "loss": 2.9435, + "step": 4726 + }, + { + "epoch": 0.42825757060995223, + "grad_norm": 0.8129255175590515, + "learning_rate": 0.00017147344892164563, + "loss": 2.892, + "step": 4727 + }, + { + "epoch": 0.428348168784399, + "grad_norm": 0.7636895179748535, + "learning_rate": 0.0001714674077206549, + "loss": 2.9517, + "step": 4728 + }, + { + "epoch": 0.4284387669588458, + "grad_norm": 0.8180027604103088, + "learning_rate": 0.00017146136651966412, + "loss": 2.9488, + "step": 4729 + }, + { + "epoch": 0.42852936513329254, + "grad_norm": 0.7590973377227783, + "learning_rate": 0.00017145532531867336, + "loss": 3.0407, + "step": 4730 + }, + { + "epoch": 0.4286199633077393, + "grad_norm": 0.7910534739494324, + "learning_rate": 0.0001714492841176826, + "loss": 3.0654, + "step": 4731 + }, + { + "epoch": 0.4287105614821861, + "grad_norm": 0.7301160097122192, + "learning_rate": 0.00017144324291669185, + "loss": 2.8019, + "step": 4732 + }, + { + "epoch": 0.4288011596566329, + "grad_norm": 0.7581092119216919, + "learning_rate": 0.0001714372017157011, + "loss": 2.9239, + "step": 4733 + }, + { + "epoch": 0.4288917578310797, + "grad_norm": 0.6761390566825867, + "learning_rate": 0.00017143116051471032, + "loss": 2.1877, + "step": 4734 + }, + { + "epoch": 0.4289823560055265, + "grad_norm": 0.7866116762161255, + "learning_rate": 0.00017142511931371958, + "loss": 2.7232, + "step": 4735 + }, + { + "epoch": 0.42907295417997326, + "grad_norm": 0.7933005690574646, + "learning_rate": 0.00017141907811272882, + "loss": 3.0191, + "step": 4736 + }, + { + "epoch": 0.42916355235442005, + "grad_norm": 0.7575610280036926, + "learning_rate": 0.00017141303691173808, + "loss": 2.9004, + "step": 4737 + }, + { + "epoch": 0.42925415052886684, + "grad_norm": 0.7642340064048767, + "learning_rate": 0.0001714069957107473, + "loss": 2.7872, + "step": 4738 + }, + { + "epoch": 0.42934474870331363, + "grad_norm": 0.7163413763046265, + "learning_rate": 0.00017140095450975655, + "loss": 2.7305, + "step": 4739 + }, + { + "epoch": 0.4294353468777604, + "grad_norm": 0.7748571038246155, + "learning_rate": 0.00017139491330876578, + "loss": 2.9362, + "step": 4740 + }, + { + "epoch": 0.4295259450522072, + "grad_norm": 0.7462965846061707, + "learning_rate": 0.00017138887210777504, + "loss": 2.8656, + "step": 4741 + }, + { + "epoch": 0.429616543226654, + "grad_norm": 0.706968367099762, + "learning_rate": 0.00017138283090678427, + "loss": 2.2298, + "step": 4742 + }, + { + "epoch": 0.4297071414011008, + "grad_norm": 0.862939715385437, + "learning_rate": 0.0001713767897057935, + "loss": 2.88, + "step": 4743 + }, + { + "epoch": 0.42979773957554757, + "grad_norm": 0.8417370915412903, + "learning_rate": 0.00017137074850480277, + "loss": 2.6952, + "step": 4744 + }, + { + "epoch": 0.42988833774999435, + "grad_norm": 0.727298378944397, + "learning_rate": 0.000171364707303812, + "loss": 2.7798, + "step": 4745 + }, + { + "epoch": 0.42997893592444114, + "grad_norm": 0.7214998006820679, + "learning_rate": 0.00017135866610282127, + "loss": 2.7638, + "step": 4746 + }, + { + "epoch": 0.43006953409888793, + "grad_norm": 0.7828437685966492, + "learning_rate": 0.00017135262490183047, + "loss": 2.7896, + "step": 4747 + }, + { + "epoch": 0.4301601322733347, + "grad_norm": 0.7309760451316833, + "learning_rate": 0.00017134658370083973, + "loss": 2.9395, + "step": 4748 + }, + { + "epoch": 0.4302507304477815, + "grad_norm": 0.789032518863678, + "learning_rate": 0.000171340542499849, + "loss": 2.8823, + "step": 4749 + }, + { + "epoch": 0.43034132862222824, + "grad_norm": 0.7588320970535278, + "learning_rate": 0.00017133450129885823, + "loss": 2.7642, + "step": 4750 + }, + { + "epoch": 0.430431926796675, + "grad_norm": 0.7125871181488037, + "learning_rate": 0.00017132846009786746, + "loss": 2.5935, + "step": 4751 + }, + { + "epoch": 0.4305225249711218, + "grad_norm": 0.8484110236167908, + "learning_rate": 0.0001713224188968767, + "loss": 2.8452, + "step": 4752 + }, + { + "epoch": 0.4306131231455686, + "grad_norm": 0.7863250970840454, + "learning_rate": 0.00017131637769588596, + "loss": 2.8985, + "step": 4753 + }, + { + "epoch": 0.4307037213200154, + "grad_norm": 0.818659245967865, + "learning_rate": 0.0001713103364948952, + "loss": 2.8539, + "step": 4754 + }, + { + "epoch": 0.4307943194944622, + "grad_norm": 0.6817161440849304, + "learning_rate": 0.00017130429529390443, + "loss": 2.237, + "step": 4755 + }, + { + "epoch": 0.43088491766890896, + "grad_norm": 0.7687156796455383, + "learning_rate": 0.0001712982540929137, + "loss": 2.7173, + "step": 4756 + }, + { + "epoch": 0.43097551584335575, + "grad_norm": 0.766566812992096, + "learning_rate": 0.00017129221289192292, + "loss": 2.651, + "step": 4757 + }, + { + "epoch": 0.43106611401780254, + "grad_norm": 0.7782437801361084, + "learning_rate": 0.00017128617169093218, + "loss": 2.7102, + "step": 4758 + }, + { + "epoch": 0.4311567121922493, + "grad_norm": 0.831939697265625, + "learning_rate": 0.00017128013048994142, + "loss": 2.8441, + "step": 4759 + }, + { + "epoch": 0.4312473103666961, + "grad_norm": 0.7394784688949585, + "learning_rate": 0.00017127408928895065, + "loss": 2.7188, + "step": 4760 + }, + { + "epoch": 0.4313379085411429, + "grad_norm": 0.757503867149353, + "learning_rate": 0.00017126804808795988, + "loss": 2.9036, + "step": 4761 + }, + { + "epoch": 0.4314285067155897, + "grad_norm": 0.7783207893371582, + "learning_rate": 0.00017126200688696915, + "loss": 2.7649, + "step": 4762 + }, + { + "epoch": 0.4315191048900365, + "grad_norm": 0.8568430542945862, + "learning_rate": 0.00017125596568597838, + "loss": 3.058, + "step": 4763 + }, + { + "epoch": 0.43160970306448326, + "grad_norm": 0.7484332323074341, + "learning_rate": 0.00017124992448498761, + "loss": 2.762, + "step": 4764 + }, + { + "epoch": 0.43170030123893005, + "grad_norm": 0.7671613097190857, + "learning_rate": 0.00017124388328399687, + "loss": 2.915, + "step": 4765 + }, + { + "epoch": 0.43179089941337684, + "grad_norm": 0.7614240646362305, + "learning_rate": 0.0001712378420830061, + "loss": 2.6784, + "step": 4766 + }, + { + "epoch": 0.4318814975878236, + "grad_norm": 0.8069921135902405, + "learning_rate": 0.00017123180088201537, + "loss": 2.9643, + "step": 4767 + }, + { + "epoch": 0.4319720957622704, + "grad_norm": 0.7544201016426086, + "learning_rate": 0.00017122575968102458, + "loss": 2.8304, + "step": 4768 + }, + { + "epoch": 0.4320626939367172, + "grad_norm": 0.8199490308761597, + "learning_rate": 0.00017121971848003384, + "loss": 3.0843, + "step": 4769 + }, + { + "epoch": 0.43215329211116393, + "grad_norm": 0.8236953020095825, + "learning_rate": 0.00017121367727904307, + "loss": 2.6876, + "step": 4770 + }, + { + "epoch": 0.4322438902856107, + "grad_norm": 0.7942916750907898, + "learning_rate": 0.00017120763607805233, + "loss": 3.123, + "step": 4771 + }, + { + "epoch": 0.4323344884600575, + "grad_norm": 0.8124531507492065, + "learning_rate": 0.00017120159487706157, + "loss": 2.8622, + "step": 4772 + }, + { + "epoch": 0.4324250866345043, + "grad_norm": 0.6974303722381592, + "learning_rate": 0.0001711955536760708, + "loss": 2.068, + "step": 4773 + }, + { + "epoch": 0.4325156848089511, + "grad_norm": 0.766643226146698, + "learning_rate": 0.00017118951247508006, + "loss": 3.0009, + "step": 4774 + }, + { + "epoch": 0.43260628298339787, + "grad_norm": 0.7312448024749756, + "learning_rate": 0.0001711834712740893, + "loss": 2.8249, + "step": 4775 + }, + { + "epoch": 0.43269688115784466, + "grad_norm": 0.795558512210846, + "learning_rate": 0.00017117743007309853, + "loss": 3.102, + "step": 4776 + }, + { + "epoch": 0.43278747933229145, + "grad_norm": 0.7897014617919922, + "learning_rate": 0.00017117138887210776, + "loss": 2.8599, + "step": 4777 + }, + { + "epoch": 0.43287807750673823, + "grad_norm": 0.7337587475776672, + "learning_rate": 0.00017116534767111703, + "loss": 2.7453, + "step": 4778 + }, + { + "epoch": 0.432968675681185, + "grad_norm": 0.7728537321090698, + "learning_rate": 0.0001711593064701263, + "loss": 3.0852, + "step": 4779 + }, + { + "epoch": 0.4330592738556318, + "grad_norm": 0.6993781328201294, + "learning_rate": 0.00017115326526913552, + "loss": 2.0208, + "step": 4780 + }, + { + "epoch": 0.4331498720300786, + "grad_norm": 0.7518386244773865, + "learning_rate": 0.00017114722406814476, + "loss": 2.8484, + "step": 4781 + }, + { + "epoch": 0.4332404702045254, + "grad_norm": 0.7345287799835205, + "learning_rate": 0.000171141182867154, + "loss": 2.8608, + "step": 4782 + }, + { + "epoch": 0.4333310683789722, + "grad_norm": 0.8248275518417358, + "learning_rate": 0.00017113514166616325, + "loss": 2.8547, + "step": 4783 + }, + { + "epoch": 0.43342166655341896, + "grad_norm": 0.7492628693580627, + "learning_rate": 0.00017112910046517248, + "loss": 2.7332, + "step": 4784 + }, + { + "epoch": 0.43351226472786575, + "grad_norm": 0.6956894993782043, + "learning_rate": 0.00017112305926418172, + "loss": 1.936, + "step": 4785 + }, + { + "epoch": 0.43360286290231254, + "grad_norm": 0.7508549690246582, + "learning_rate": 0.00017111701806319098, + "loss": 3.006, + "step": 4786 + }, + { + "epoch": 0.4336934610767593, + "grad_norm": 0.8068225383758545, + "learning_rate": 0.00017111097686220021, + "loss": 2.7765, + "step": 4787 + }, + { + "epoch": 0.4337840592512061, + "grad_norm": 0.735129714012146, + "learning_rate": 0.00017110493566120947, + "loss": 2.9041, + "step": 4788 + }, + { + "epoch": 0.4338746574256529, + "grad_norm": 0.8047580718994141, + "learning_rate": 0.00017109889446021868, + "loss": 2.9158, + "step": 4789 + }, + { + "epoch": 0.43396525560009963, + "grad_norm": 0.826094388961792, + "learning_rate": 0.00017109285325922794, + "loss": 3.0092, + "step": 4790 + }, + { + "epoch": 0.4340558537745464, + "grad_norm": 0.7797979712486267, + "learning_rate": 0.00017108681205823718, + "loss": 2.9686, + "step": 4791 + }, + { + "epoch": 0.4341464519489932, + "grad_norm": 0.7933158874511719, + "learning_rate": 0.00017108077085724644, + "loss": 2.6979, + "step": 4792 + }, + { + "epoch": 0.43423705012344, + "grad_norm": 0.7627562880516052, + "learning_rate": 0.00017107472965625567, + "loss": 2.8492, + "step": 4793 + }, + { + "epoch": 0.4343276482978868, + "grad_norm": 0.7352740168571472, + "learning_rate": 0.0001710686884552649, + "loss": 2.6953, + "step": 4794 + }, + { + "epoch": 0.43441824647233357, + "grad_norm": 0.7476334571838379, + "learning_rate": 0.00017106264725427417, + "loss": 2.8047, + "step": 4795 + }, + { + "epoch": 0.43450884464678036, + "grad_norm": 0.7859455943107605, + "learning_rate": 0.0001710566060532834, + "loss": 2.7338, + "step": 4796 + }, + { + "epoch": 0.43459944282122714, + "grad_norm": 0.7308264970779419, + "learning_rate": 0.00017105056485229266, + "loss": 2.7226, + "step": 4797 + }, + { + "epoch": 0.43469004099567393, + "grad_norm": 0.751899242401123, + "learning_rate": 0.00017104452365130187, + "loss": 2.6635, + "step": 4798 + }, + { + "epoch": 0.4347806391701207, + "grad_norm": 0.7743048667907715, + "learning_rate": 0.00017103848245031113, + "loss": 2.8423, + "step": 4799 + }, + { + "epoch": 0.4348712373445675, + "grad_norm": 0.8879299163818359, + "learning_rate": 0.00017103244124932036, + "loss": 2.6949, + "step": 4800 + }, + { + "epoch": 0.4349618355190143, + "grad_norm": 0.7459008097648621, + "learning_rate": 0.00017102640004832963, + "loss": 2.9577, + "step": 4801 + }, + { + "epoch": 0.4350524336934611, + "grad_norm": 0.7693278193473816, + "learning_rate": 0.00017102035884733886, + "loss": 2.7078, + "step": 4802 + }, + { + "epoch": 0.43514303186790787, + "grad_norm": 0.8001362085342407, + "learning_rate": 0.0001710143176463481, + "loss": 2.7498, + "step": 4803 + }, + { + "epoch": 0.43523363004235466, + "grad_norm": 0.769421398639679, + "learning_rate": 0.00017100827644535736, + "loss": 2.8673, + "step": 4804 + }, + { + "epoch": 0.43532422821680145, + "grad_norm": 0.7073420286178589, + "learning_rate": 0.0001710022352443666, + "loss": 2.442, + "step": 4805 + }, + { + "epoch": 0.43541482639124823, + "grad_norm": 0.767266571521759, + "learning_rate": 0.00017099619404337582, + "loss": 2.9357, + "step": 4806 + }, + { + "epoch": 0.435505424565695, + "grad_norm": 0.8659315705299377, + "learning_rate": 0.00017099015284238506, + "loss": 2.8676, + "step": 4807 + }, + { + "epoch": 0.4355960227401418, + "grad_norm": 0.7472079992294312, + "learning_rate": 0.00017098411164139432, + "loss": 2.8514, + "step": 4808 + }, + { + "epoch": 0.4356866209145886, + "grad_norm": 0.6482210159301758, + "learning_rate": 0.00017097807044040358, + "loss": 2.2121, + "step": 4809 + }, + { + "epoch": 0.4357772190890354, + "grad_norm": 0.9252759218215942, + "learning_rate": 0.00017097202923941281, + "loss": 2.5064, + "step": 4810 + }, + { + "epoch": 0.4358678172634821, + "grad_norm": 0.7910621166229248, + "learning_rate": 0.00017096598803842205, + "loss": 2.8219, + "step": 4811 + }, + { + "epoch": 0.4359584154379289, + "grad_norm": 0.7257474064826965, + "learning_rate": 0.00017095994683743128, + "loss": 2.7461, + "step": 4812 + }, + { + "epoch": 0.4360490136123757, + "grad_norm": 0.8977155089378357, + "learning_rate": 0.00017095390563644054, + "loss": 2.8495, + "step": 4813 + }, + { + "epoch": 0.4361396117868225, + "grad_norm": 0.8446081280708313, + "learning_rate": 0.00017094786443544978, + "loss": 2.8799, + "step": 4814 + }, + { + "epoch": 0.43623020996126927, + "grad_norm": 0.7328481674194336, + "learning_rate": 0.000170941823234459, + "loss": 2.5983, + "step": 4815 + }, + { + "epoch": 0.43632080813571605, + "grad_norm": 0.7458370923995972, + "learning_rate": 0.00017093578203346827, + "loss": 2.6038, + "step": 4816 + }, + { + "epoch": 0.43641140631016284, + "grad_norm": 0.7765607237815857, + "learning_rate": 0.0001709297408324775, + "loss": 2.8315, + "step": 4817 + }, + { + "epoch": 0.43650200448460963, + "grad_norm": 0.7309267520904541, + "learning_rate": 0.00017092369963148677, + "loss": 2.5256, + "step": 4818 + }, + { + "epoch": 0.4365926026590564, + "grad_norm": 0.707560658454895, + "learning_rate": 0.00017091765843049597, + "loss": 2.8178, + "step": 4819 + }, + { + "epoch": 0.4366832008335032, + "grad_norm": 0.7885999083518982, + "learning_rate": 0.00017091161722950524, + "loss": 2.836, + "step": 4820 + }, + { + "epoch": 0.43677379900795, + "grad_norm": 0.7744390964508057, + "learning_rate": 0.00017090557602851447, + "loss": 2.8934, + "step": 4821 + }, + { + "epoch": 0.4368643971823968, + "grad_norm": 0.678325891494751, + "learning_rate": 0.00017089953482752373, + "loss": 2.2197, + "step": 4822 + }, + { + "epoch": 0.43695499535684357, + "grad_norm": 0.7781746983528137, + "learning_rate": 0.00017089349362653296, + "loss": 2.835, + "step": 4823 + }, + { + "epoch": 0.43704559353129036, + "grad_norm": 0.763587474822998, + "learning_rate": 0.0001708874524255422, + "loss": 2.2217, + "step": 4824 + }, + { + "epoch": 0.43713619170573714, + "grad_norm": 0.722429096698761, + "learning_rate": 0.00017088141122455146, + "loss": 2.8983, + "step": 4825 + }, + { + "epoch": 0.43722678988018393, + "grad_norm": 0.7389289140701294, + "learning_rate": 0.0001708753700235607, + "loss": 2.9326, + "step": 4826 + }, + { + "epoch": 0.4373173880546307, + "grad_norm": 0.7470394372940063, + "learning_rate": 0.00017086932882256993, + "loss": 2.0334, + "step": 4827 + }, + { + "epoch": 0.4374079862290775, + "grad_norm": 0.7452974915504456, + "learning_rate": 0.00017086328762157916, + "loss": 3.0828, + "step": 4828 + }, + { + "epoch": 0.4374985844035243, + "grad_norm": 0.7697020769119263, + "learning_rate": 0.00017085724642058842, + "loss": 2.7111, + "step": 4829 + }, + { + "epoch": 0.4375891825779711, + "grad_norm": 0.7301568984985352, + "learning_rate": 0.00017085120521959766, + "loss": 2.6841, + "step": 4830 + }, + { + "epoch": 0.4376797807524178, + "grad_norm": 0.7432863116264343, + "learning_rate": 0.00017084516401860692, + "loss": 2.7662, + "step": 4831 + }, + { + "epoch": 0.4377703789268646, + "grad_norm": 0.6694855690002441, + "learning_rate": 0.00017083912281761615, + "loss": 2.1851, + "step": 4832 + }, + { + "epoch": 0.4378609771013114, + "grad_norm": 0.8057236075401306, + "learning_rate": 0.0001708330816166254, + "loss": 2.9957, + "step": 4833 + }, + { + "epoch": 0.4379515752757582, + "grad_norm": 0.7012770175933838, + "learning_rate": 0.00017082704041563465, + "loss": 2.8268, + "step": 4834 + }, + { + "epoch": 0.43804217345020496, + "grad_norm": 0.7789185047149658, + "learning_rate": 0.00017082099921464388, + "loss": 2.8491, + "step": 4835 + }, + { + "epoch": 0.43813277162465175, + "grad_norm": 0.7158858180046082, + "learning_rate": 0.00017081495801365312, + "loss": 2.5701, + "step": 4836 + }, + { + "epoch": 0.43822336979909854, + "grad_norm": 0.7760608196258545, + "learning_rate": 0.00017080891681266235, + "loss": 2.8344, + "step": 4837 + }, + { + "epoch": 0.4383139679735453, + "grad_norm": 0.8601799607276917, + "learning_rate": 0.0001708028756116716, + "loss": 2.7392, + "step": 4838 + }, + { + "epoch": 0.4384045661479921, + "grad_norm": 0.7662372589111328, + "learning_rate": 0.00017079683441068087, + "loss": 2.9991, + "step": 4839 + }, + { + "epoch": 0.4384951643224389, + "grad_norm": 0.7370370626449585, + "learning_rate": 0.00017079079320969008, + "loss": 2.9104, + "step": 4840 + }, + { + "epoch": 0.4385857624968857, + "grad_norm": 0.7807759046554565, + "learning_rate": 0.00017078475200869934, + "loss": 3.0504, + "step": 4841 + }, + { + "epoch": 0.4386763606713325, + "grad_norm": 0.7975899577140808, + "learning_rate": 0.00017077871080770857, + "loss": 3.1656, + "step": 4842 + }, + { + "epoch": 0.43876695884577926, + "grad_norm": 0.7606247067451477, + "learning_rate": 0.00017077266960671784, + "loss": 2.7025, + "step": 4843 + }, + { + "epoch": 0.43885755702022605, + "grad_norm": 0.804459273815155, + "learning_rate": 0.00017076662840572707, + "loss": 2.7791, + "step": 4844 + }, + { + "epoch": 0.43894815519467284, + "grad_norm": 0.7344349026679993, + "learning_rate": 0.0001707605872047363, + "loss": 2.9638, + "step": 4845 + }, + { + "epoch": 0.4390387533691196, + "grad_norm": 0.8031205534934998, + "learning_rate": 0.00017075454600374556, + "loss": 2.9363, + "step": 4846 + }, + { + "epoch": 0.4391293515435664, + "grad_norm": 0.7458187937736511, + "learning_rate": 0.0001707485048027548, + "loss": 2.8358, + "step": 4847 + }, + { + "epoch": 0.4392199497180132, + "grad_norm": 0.7831902503967285, + "learning_rate": 0.00017074246360176406, + "loss": 3.0049, + "step": 4848 + }, + { + "epoch": 0.43931054789246, + "grad_norm": 0.7126840353012085, + "learning_rate": 0.00017073642240077327, + "loss": 2.7283, + "step": 4849 + }, + { + "epoch": 0.4394011460669068, + "grad_norm": 0.8689375519752502, + "learning_rate": 0.00017073038119978253, + "loss": 2.9305, + "step": 4850 + }, + { + "epoch": 0.4394917442413535, + "grad_norm": 0.754180908203125, + "learning_rate": 0.00017072433999879176, + "loss": 2.1258, + "step": 4851 + }, + { + "epoch": 0.4395823424158003, + "grad_norm": 0.8280024528503418, + "learning_rate": 0.00017071829879780102, + "loss": 2.7894, + "step": 4852 + }, + { + "epoch": 0.4396729405902471, + "grad_norm": 0.8077889680862427, + "learning_rate": 0.00017071225759681023, + "loss": 2.9559, + "step": 4853 + }, + { + "epoch": 0.4397635387646939, + "grad_norm": 0.7209494113922119, + "learning_rate": 0.0001707062163958195, + "loss": 2.7904, + "step": 4854 + }, + { + "epoch": 0.43985413693914066, + "grad_norm": 0.7559609413146973, + "learning_rate": 0.00017070017519482875, + "loss": 2.8267, + "step": 4855 + }, + { + "epoch": 0.43994473511358745, + "grad_norm": 0.6217970848083496, + "learning_rate": 0.000170694133993838, + "loss": 2.1442, + "step": 4856 + }, + { + "epoch": 0.44003533328803424, + "grad_norm": 0.7741178870201111, + "learning_rate": 0.00017068809279284722, + "loss": 2.8386, + "step": 4857 + }, + { + "epoch": 0.440125931462481, + "grad_norm": 0.7511405348777771, + "learning_rate": 0.00017068205159185645, + "loss": 2.8425, + "step": 4858 + }, + { + "epoch": 0.4402165296369278, + "grad_norm": 0.6740238070487976, + "learning_rate": 0.00017067601039086572, + "loss": 2.1669, + "step": 4859 + }, + { + "epoch": 0.4403071278113746, + "grad_norm": 0.7573873996734619, + "learning_rate": 0.00017066996918987495, + "loss": 2.7912, + "step": 4860 + }, + { + "epoch": 0.4403977259858214, + "grad_norm": 0.7306456565856934, + "learning_rate": 0.0001706639279888842, + "loss": 2.725, + "step": 4861 + }, + { + "epoch": 0.4404883241602682, + "grad_norm": 0.8429266810417175, + "learning_rate": 0.00017065788678789345, + "loss": 2.7641, + "step": 4862 + }, + { + "epoch": 0.44057892233471496, + "grad_norm": 0.7461340427398682, + "learning_rate": 0.00017065184558690268, + "loss": 2.7507, + "step": 4863 + }, + { + "epoch": 0.44066952050916175, + "grad_norm": 0.8057570457458496, + "learning_rate": 0.00017064580438591194, + "loss": 2.6549, + "step": 4864 + }, + { + "epoch": 0.44076011868360854, + "grad_norm": 0.7742655873298645, + "learning_rate": 0.00017063976318492117, + "loss": 2.9725, + "step": 4865 + }, + { + "epoch": 0.4408507168580553, + "grad_norm": 0.7956271171569824, + "learning_rate": 0.0001706337219839304, + "loss": 2.8442, + "step": 4866 + }, + { + "epoch": 0.4409413150325021, + "grad_norm": 0.7450558543205261, + "learning_rate": 0.00017062768078293964, + "loss": 2.5486, + "step": 4867 + }, + { + "epoch": 0.4410319132069489, + "grad_norm": 0.7541553974151611, + "learning_rate": 0.0001706216395819489, + "loss": 2.6782, + "step": 4868 + }, + { + "epoch": 0.4411225113813957, + "grad_norm": 0.7751592993736267, + "learning_rate": 0.00017061559838095817, + "loss": 3.0954, + "step": 4869 + }, + { + "epoch": 0.4412131095558425, + "grad_norm": 0.7323487997055054, + "learning_rate": 0.00017060955717996737, + "loss": 2.1917, + "step": 4870 + }, + { + "epoch": 0.4413037077302892, + "grad_norm": 0.7499911189079285, + "learning_rate": 0.00017060351597897663, + "loss": 2.7505, + "step": 4871 + }, + { + "epoch": 0.441394305904736, + "grad_norm": 0.7327212691307068, + "learning_rate": 0.00017059747477798587, + "loss": 2.8451, + "step": 4872 + }, + { + "epoch": 0.4414849040791828, + "grad_norm": 0.739216685295105, + "learning_rate": 0.00017059143357699513, + "loss": 2.6028, + "step": 4873 + }, + { + "epoch": 0.44157550225362957, + "grad_norm": 0.7382607460021973, + "learning_rate": 0.00017058539237600436, + "loss": 2.8562, + "step": 4874 + }, + { + "epoch": 0.44166610042807636, + "grad_norm": 0.7936729788780212, + "learning_rate": 0.0001705793511750136, + "loss": 2.6488, + "step": 4875 + }, + { + "epoch": 0.44175669860252315, + "grad_norm": 0.747761070728302, + "learning_rate": 0.00017057330997402286, + "loss": 2.8369, + "step": 4876 + }, + { + "epoch": 0.44184729677696993, + "grad_norm": 0.7199625372886658, + "learning_rate": 0.0001705672687730321, + "loss": 2.7297, + "step": 4877 + }, + { + "epoch": 0.4419378949514167, + "grad_norm": 0.8310785889625549, + "learning_rate": 0.00017056122757204133, + "loss": 2.857, + "step": 4878 + }, + { + "epoch": 0.4420284931258635, + "grad_norm": 0.7814924716949463, + "learning_rate": 0.00017055518637105056, + "loss": 2.8926, + "step": 4879 + }, + { + "epoch": 0.4421190913003103, + "grad_norm": 0.7683720588684082, + "learning_rate": 0.00017054914517005982, + "loss": 2.7377, + "step": 4880 + }, + { + "epoch": 0.4422096894747571, + "grad_norm": 0.8091594576835632, + "learning_rate": 0.00017054310396906905, + "loss": 2.9781, + "step": 4881 + }, + { + "epoch": 0.44230028764920387, + "grad_norm": 0.6910940408706665, + "learning_rate": 0.00017053706276807832, + "loss": 2.0578, + "step": 4882 + }, + { + "epoch": 0.44239088582365066, + "grad_norm": 0.7823684215545654, + "learning_rate": 0.00017053102156708752, + "loss": 3.0302, + "step": 4883 + }, + { + "epoch": 0.44248148399809745, + "grad_norm": 0.7447478175163269, + "learning_rate": 0.00017052498036609678, + "loss": 2.5923, + "step": 4884 + }, + { + "epoch": 0.44257208217254423, + "grad_norm": 0.7621851563453674, + "learning_rate": 0.00017051893916510605, + "loss": 2.9768, + "step": 4885 + }, + { + "epoch": 0.442662680346991, + "grad_norm": 0.7199058532714844, + "learning_rate": 0.00017051289796411528, + "loss": 2.7347, + "step": 4886 + }, + { + "epoch": 0.4427532785214378, + "grad_norm": 0.8118021488189697, + "learning_rate": 0.0001705068567631245, + "loss": 2.8034, + "step": 4887 + }, + { + "epoch": 0.4428438766958846, + "grad_norm": 0.7777242064476013, + "learning_rate": 0.00017050081556213375, + "loss": 2.8221, + "step": 4888 + }, + { + "epoch": 0.4429344748703314, + "grad_norm": 0.7774063348770142, + "learning_rate": 0.000170494774361143, + "loss": 3.0822, + "step": 4889 + }, + { + "epoch": 0.4430250730447782, + "grad_norm": 0.7995514869689941, + "learning_rate": 0.00017048873316015224, + "loss": 3.0421, + "step": 4890 + }, + { + "epoch": 0.44311567121922496, + "grad_norm": 0.7767555713653564, + "learning_rate": 0.00017048269195916148, + "loss": 3.1481, + "step": 4891 + }, + { + "epoch": 0.4432062693936717, + "grad_norm": 0.75141841173172, + "learning_rate": 0.00017047665075817074, + "loss": 2.787, + "step": 4892 + }, + { + "epoch": 0.4432968675681185, + "grad_norm": 0.8177909255027771, + "learning_rate": 0.00017047060955717997, + "loss": 3.1352, + "step": 4893 + }, + { + "epoch": 0.44338746574256527, + "grad_norm": 0.7022941708564758, + "learning_rate": 0.00017046456835618923, + "loss": 2.1824, + "step": 4894 + }, + { + "epoch": 0.44347806391701206, + "grad_norm": 0.7454754114151001, + "learning_rate": 0.00017045852715519847, + "loss": 2.6729, + "step": 4895 + }, + { + "epoch": 0.44356866209145884, + "grad_norm": 0.7500575184822083, + "learning_rate": 0.0001704524859542077, + "loss": 2.5696, + "step": 4896 + }, + { + "epoch": 0.44365926026590563, + "grad_norm": 0.7589263319969177, + "learning_rate": 0.00017044644475321694, + "loss": 2.8293, + "step": 4897 + }, + { + "epoch": 0.4437498584403524, + "grad_norm": 0.7149617671966553, + "learning_rate": 0.0001704404035522262, + "loss": 2.7317, + "step": 4898 + }, + { + "epoch": 0.4438404566147992, + "grad_norm": 0.7568726539611816, + "learning_rate": 0.00017043436235123543, + "loss": 3.0697, + "step": 4899 + }, + { + "epoch": 0.443931054789246, + "grad_norm": 0.761831521987915, + "learning_rate": 0.00017042832115024466, + "loss": 2.985, + "step": 4900 + }, + { + "epoch": 0.4440216529636928, + "grad_norm": 0.7156651616096497, + "learning_rate": 0.00017042227994925393, + "loss": 2.9942, + "step": 4901 + }, + { + "epoch": 0.44411225113813957, + "grad_norm": 0.7414016723632812, + "learning_rate": 0.00017041623874826316, + "loss": 3.0376, + "step": 4902 + }, + { + "epoch": 0.44420284931258636, + "grad_norm": 0.656190037727356, + "learning_rate": 0.00017041019754727242, + "loss": 2.0675, + "step": 4903 + }, + { + "epoch": 0.44429344748703314, + "grad_norm": 0.7650772333145142, + "learning_rate": 0.00017040415634628163, + "loss": 2.8678, + "step": 4904 + }, + { + "epoch": 0.44438404566147993, + "grad_norm": 0.7648686170578003, + "learning_rate": 0.0001703981151452909, + "loss": 2.792, + "step": 4905 + }, + { + "epoch": 0.4444746438359267, + "grad_norm": 0.7647050023078918, + "learning_rate": 0.00017039207394430015, + "loss": 2.9255, + "step": 4906 + }, + { + "epoch": 0.4445652420103735, + "grad_norm": 0.7699881792068481, + "learning_rate": 0.00017038603274330938, + "loss": 2.7428, + "step": 4907 + }, + { + "epoch": 0.4446558401848203, + "grad_norm": 0.7383936047554016, + "learning_rate": 0.00017037999154231862, + "loss": 2.9303, + "step": 4908 + }, + { + "epoch": 0.4447464383592671, + "grad_norm": 0.8250946998596191, + "learning_rate": 0.00017037395034132785, + "loss": 3.1472, + "step": 4909 + }, + { + "epoch": 0.44483703653371387, + "grad_norm": 0.7587382197380066, + "learning_rate": 0.00017036790914033711, + "loss": 2.7043, + "step": 4910 + }, + { + "epoch": 0.44492763470816066, + "grad_norm": 0.7634626626968384, + "learning_rate": 0.00017036186793934635, + "loss": 3.0498, + "step": 4911 + }, + { + "epoch": 0.4450182328826074, + "grad_norm": 0.7825750112533569, + "learning_rate": 0.00017035582673835558, + "loss": 2.7172, + "step": 4912 + }, + { + "epoch": 0.4451088310570542, + "grad_norm": 0.786526083946228, + "learning_rate": 0.00017034978553736482, + "loss": 2.8014, + "step": 4913 + }, + { + "epoch": 0.44519942923150096, + "grad_norm": 0.7704442739486694, + "learning_rate": 0.00017034374433637408, + "loss": 2.6252, + "step": 4914 + }, + { + "epoch": 0.44529002740594775, + "grad_norm": 0.7206666469573975, + "learning_rate": 0.00017033770313538334, + "loss": 2.3409, + "step": 4915 + }, + { + "epoch": 0.44538062558039454, + "grad_norm": 0.7429029941558838, + "learning_rate": 0.00017033166193439257, + "loss": 2.7878, + "step": 4916 + }, + { + "epoch": 0.44547122375484133, + "grad_norm": 0.8384262323379517, + "learning_rate": 0.0001703256207334018, + "loss": 2.7842, + "step": 4917 + }, + { + "epoch": 0.4455618219292881, + "grad_norm": 0.7259093523025513, + "learning_rate": 0.00017031957953241104, + "loss": 2.771, + "step": 4918 + }, + { + "epoch": 0.4456524201037349, + "grad_norm": 0.762956976890564, + "learning_rate": 0.0001703135383314203, + "loss": 3.016, + "step": 4919 + }, + { + "epoch": 0.4457430182781817, + "grad_norm": 0.7444080114364624, + "learning_rate": 0.00017030749713042954, + "loss": 2.8865, + "step": 4920 + }, + { + "epoch": 0.4458336164526285, + "grad_norm": 0.7389550805091858, + "learning_rate": 0.00017030145592943877, + "loss": 2.9479, + "step": 4921 + }, + { + "epoch": 0.44592421462707527, + "grad_norm": 0.7287492752075195, + "learning_rate": 0.00017029541472844803, + "loss": 2.8537, + "step": 4922 + }, + { + "epoch": 0.44601481280152205, + "grad_norm": 0.745880663394928, + "learning_rate": 0.00017028937352745726, + "loss": 2.9203, + "step": 4923 + }, + { + "epoch": 0.44610541097596884, + "grad_norm": 0.6690673232078552, + "learning_rate": 0.00017028333232646653, + "loss": 2.1049, + "step": 4924 + }, + { + "epoch": 0.44619600915041563, + "grad_norm": 0.7917125225067139, + "learning_rate": 0.00017027729112547576, + "loss": 2.8474, + "step": 4925 + }, + { + "epoch": 0.4462866073248624, + "grad_norm": 0.7803084850311279, + "learning_rate": 0.000170271249924485, + "loss": 2.746, + "step": 4926 + }, + { + "epoch": 0.4463772054993092, + "grad_norm": 0.6956706047058105, + "learning_rate": 0.00017026520872349423, + "loss": 2.8833, + "step": 4927 + }, + { + "epoch": 0.446467803673756, + "grad_norm": 0.703215479850769, + "learning_rate": 0.0001702591675225035, + "loss": 2.8339, + "step": 4928 + }, + { + "epoch": 0.4465584018482028, + "grad_norm": 0.6572996377944946, + "learning_rate": 0.00017025312632151272, + "loss": 2.0505, + "step": 4929 + }, + { + "epoch": 0.44664900002264957, + "grad_norm": 0.7531684637069702, + "learning_rate": 0.00017024708512052196, + "loss": 2.81, + "step": 4930 + }, + { + "epoch": 0.44673959819709635, + "grad_norm": 0.7497778534889221, + "learning_rate": 0.00017024104391953122, + "loss": 2.7923, + "step": 4931 + }, + { + "epoch": 0.4468301963715431, + "grad_norm": 0.7898332476615906, + "learning_rate": 0.00017023500271854045, + "loss": 2.7544, + "step": 4932 + }, + { + "epoch": 0.4469207945459899, + "grad_norm": 0.8009757995605469, + "learning_rate": 0.00017022896151754971, + "loss": 3.1096, + "step": 4933 + }, + { + "epoch": 0.44701139272043666, + "grad_norm": 0.6976144909858704, + "learning_rate": 0.00017022292031655892, + "loss": 2.9944, + "step": 4934 + }, + { + "epoch": 0.44710199089488345, + "grad_norm": 0.8639057874679565, + "learning_rate": 0.00017021687911556818, + "loss": 2.9859, + "step": 4935 + }, + { + "epoch": 0.44719258906933024, + "grad_norm": 0.7984032034873962, + "learning_rate": 0.00017021083791457744, + "loss": 3.0063, + "step": 4936 + }, + { + "epoch": 0.447283187243777, + "grad_norm": 0.7968500852584839, + "learning_rate": 0.00017020479671358668, + "loss": 2.6074, + "step": 4937 + }, + { + "epoch": 0.4473737854182238, + "grad_norm": 0.8241811394691467, + "learning_rate": 0.0001701987555125959, + "loss": 2.8278, + "step": 4938 + }, + { + "epoch": 0.4474643835926706, + "grad_norm": 0.6137234568595886, + "learning_rate": 0.00017019271431160515, + "loss": 2.1824, + "step": 4939 + }, + { + "epoch": 0.4475549817671174, + "grad_norm": 0.7509317398071289, + "learning_rate": 0.0001701866731106144, + "loss": 2.9035, + "step": 4940 + }, + { + "epoch": 0.4476455799415642, + "grad_norm": 0.7735652923583984, + "learning_rate": 0.00017018063190962364, + "loss": 2.9511, + "step": 4941 + }, + { + "epoch": 0.44773617811601096, + "grad_norm": 0.8525700569152832, + "learning_rate": 0.00017017459070863287, + "loss": 2.9619, + "step": 4942 + }, + { + "epoch": 0.44782677629045775, + "grad_norm": 0.7301997542381287, + "learning_rate": 0.0001701685495076421, + "loss": 2.7282, + "step": 4943 + }, + { + "epoch": 0.44791737446490454, + "grad_norm": 0.7197231650352478, + "learning_rate": 0.00017016250830665137, + "loss": 2.6355, + "step": 4944 + }, + { + "epoch": 0.4480079726393513, + "grad_norm": 0.6863359808921814, + "learning_rate": 0.00017015646710566063, + "loss": 2.1246, + "step": 4945 + }, + { + "epoch": 0.4480985708137981, + "grad_norm": 0.7939589023590088, + "learning_rate": 0.00017015042590466986, + "loss": 2.8142, + "step": 4946 + }, + { + "epoch": 0.4481891689882449, + "grad_norm": 0.7326997518539429, + "learning_rate": 0.0001701443847036791, + "loss": 2.8373, + "step": 4947 + }, + { + "epoch": 0.4482797671626917, + "grad_norm": 0.7509124279022217, + "learning_rate": 0.00017013834350268833, + "loss": 2.6287, + "step": 4948 + }, + { + "epoch": 0.4483703653371385, + "grad_norm": 0.8168075680732727, + "learning_rate": 0.0001701323023016976, + "loss": 2.6051, + "step": 4949 + }, + { + "epoch": 0.44846096351158526, + "grad_norm": 0.7409854531288147, + "learning_rate": 0.00017012626110070683, + "loss": 2.8969, + "step": 4950 + }, + { + "epoch": 0.44855156168603205, + "grad_norm": 0.8229681253433228, + "learning_rate": 0.00017012021989971606, + "loss": 2.7896, + "step": 4951 + }, + { + "epoch": 0.4486421598604788, + "grad_norm": 0.7669466137886047, + "learning_rate": 0.00017011417869872532, + "loss": 2.8277, + "step": 4952 + }, + { + "epoch": 0.44873275803492557, + "grad_norm": 0.7471604347229004, + "learning_rate": 0.00017010813749773456, + "loss": 2.6269, + "step": 4953 + }, + { + "epoch": 0.44882335620937236, + "grad_norm": 0.7295309901237488, + "learning_rate": 0.00017010209629674382, + "loss": 2.7533, + "step": 4954 + }, + { + "epoch": 0.44891395438381915, + "grad_norm": 0.8291679620742798, + "learning_rate": 0.00017009605509575303, + "loss": 3.1535, + "step": 4955 + }, + { + "epoch": 0.44900455255826593, + "grad_norm": 0.7813898324966431, + "learning_rate": 0.0001700900138947623, + "loss": 2.8461, + "step": 4956 + }, + { + "epoch": 0.4490951507327127, + "grad_norm": 0.7633702158927917, + "learning_rate": 0.00017008397269377152, + "loss": 2.7386, + "step": 4957 + }, + { + "epoch": 0.4491857489071595, + "grad_norm": 0.7813045382499695, + "learning_rate": 0.00017007793149278078, + "loss": 3.0159, + "step": 4958 + }, + { + "epoch": 0.4492763470816063, + "grad_norm": 0.9512060880661011, + "learning_rate": 0.00017007189029179002, + "loss": 3.1795, + "step": 4959 + }, + { + "epoch": 0.4493669452560531, + "grad_norm": 0.7890324592590332, + "learning_rate": 0.00017006584909079925, + "loss": 2.7289, + "step": 4960 + }, + { + "epoch": 0.4494575434304999, + "grad_norm": 0.8001576066017151, + "learning_rate": 0.0001700598078898085, + "loss": 2.8533, + "step": 4961 + }, + { + "epoch": 0.44954814160494666, + "grad_norm": 0.7176802754402161, + "learning_rate": 0.00017005376668881775, + "loss": 2.7721, + "step": 4962 + }, + { + "epoch": 0.44963873977939345, + "grad_norm": 0.7527653574943542, + "learning_rate": 0.00017004772548782698, + "loss": 2.6894, + "step": 4963 + }, + { + "epoch": 0.44972933795384024, + "grad_norm": 0.7665597200393677, + "learning_rate": 0.0001700416842868362, + "loss": 2.7666, + "step": 4964 + }, + { + "epoch": 0.449819936128287, + "grad_norm": 0.7887178063392639, + "learning_rate": 0.00017003564308584547, + "loss": 2.9283, + "step": 4965 + }, + { + "epoch": 0.4499105343027338, + "grad_norm": 0.7343124747276306, + "learning_rate": 0.00017002960188485474, + "loss": 2.9147, + "step": 4966 + }, + { + "epoch": 0.4500011324771806, + "grad_norm": 0.7104665040969849, + "learning_rate": 0.00017002356068386397, + "loss": 2.5639, + "step": 4967 + }, + { + "epoch": 0.4500917306516274, + "grad_norm": 0.7234047055244446, + "learning_rate": 0.0001700175194828732, + "loss": 2.6445, + "step": 4968 + }, + { + "epoch": 0.4501823288260742, + "grad_norm": 0.7342525720596313, + "learning_rate": 0.00017001147828188244, + "loss": 2.7332, + "step": 4969 + }, + { + "epoch": 0.45027292700052096, + "grad_norm": 0.7503886818885803, + "learning_rate": 0.0001700054370808917, + "loss": 3.0253, + "step": 4970 + }, + { + "epoch": 0.45036352517496775, + "grad_norm": 0.7554607391357422, + "learning_rate": 0.00016999939587990093, + "loss": 2.7018, + "step": 4971 + }, + { + "epoch": 0.45045412334941454, + "grad_norm": 0.8218497037887573, + "learning_rate": 0.00016999335467891017, + "loss": 2.9508, + "step": 4972 + }, + { + "epoch": 0.45054472152386127, + "grad_norm": 0.8021644353866577, + "learning_rate": 0.0001699873134779194, + "loss": 2.9528, + "step": 4973 + }, + { + "epoch": 0.45063531969830806, + "grad_norm": 0.8219462037086487, + "learning_rate": 0.00016998127227692866, + "loss": 2.8836, + "step": 4974 + }, + { + "epoch": 0.45072591787275484, + "grad_norm": 0.7673479914665222, + "learning_rate": 0.00016997523107593792, + "loss": 2.8343, + "step": 4975 + }, + { + "epoch": 0.45081651604720163, + "grad_norm": 0.7675079107284546, + "learning_rate": 0.00016996918987494713, + "loss": 3.1088, + "step": 4976 + }, + { + "epoch": 0.4509071142216484, + "grad_norm": 0.7952225804328918, + "learning_rate": 0.0001699631486739564, + "loss": 3.0134, + "step": 4977 + }, + { + "epoch": 0.4509977123960952, + "grad_norm": 0.7915027737617493, + "learning_rate": 0.00016995710747296563, + "loss": 2.8181, + "step": 4978 + }, + { + "epoch": 0.451088310570542, + "grad_norm": 0.6296777129173279, + "learning_rate": 0.0001699510662719749, + "loss": 2.3476, + "step": 4979 + }, + { + "epoch": 0.4511789087449888, + "grad_norm": 0.7885633111000061, + "learning_rate": 0.00016994502507098412, + "loss": 2.8133, + "step": 4980 + }, + { + "epoch": 0.45126950691943557, + "grad_norm": 0.7234942317008972, + "learning_rate": 0.00016993898386999335, + "loss": 2.712, + "step": 4981 + }, + { + "epoch": 0.45136010509388236, + "grad_norm": 0.7429606914520264, + "learning_rate": 0.00016993294266900262, + "loss": 2.8548, + "step": 4982 + }, + { + "epoch": 0.45145070326832915, + "grad_norm": 0.7693937420845032, + "learning_rate": 0.00016992690146801185, + "loss": 2.2209, + "step": 4983 + }, + { + "epoch": 0.45154130144277593, + "grad_norm": 0.6731225848197937, + "learning_rate": 0.0001699208602670211, + "loss": 1.9788, + "step": 4984 + }, + { + "epoch": 0.4516318996172227, + "grad_norm": 0.7898438572883606, + "learning_rate": 0.00016991481906603032, + "loss": 2.919, + "step": 4985 + }, + { + "epoch": 0.4517224977916695, + "grad_norm": 0.6824893355369568, + "learning_rate": 0.00016990877786503958, + "loss": 1.9892, + "step": 4986 + }, + { + "epoch": 0.4518130959661163, + "grad_norm": 0.8858458399772644, + "learning_rate": 0.0001699027366640488, + "loss": 2.5325, + "step": 4987 + }, + { + "epoch": 0.4519036941405631, + "grad_norm": 0.6959949135780334, + "learning_rate": 0.00016989669546305807, + "loss": 2.446, + "step": 4988 + }, + { + "epoch": 0.45199429231500987, + "grad_norm": 0.7135378122329712, + "learning_rate": 0.0001698906542620673, + "loss": 2.8758, + "step": 4989 + }, + { + "epoch": 0.45208489048945666, + "grad_norm": 0.6735321283340454, + "learning_rate": 0.00016988461306107654, + "loss": 2.1384, + "step": 4990 + }, + { + "epoch": 0.45217548866390345, + "grad_norm": 0.7857680916786194, + "learning_rate": 0.0001698785718600858, + "loss": 2.9736, + "step": 4991 + }, + { + "epoch": 0.45226608683835023, + "grad_norm": 0.7543774843215942, + "learning_rate": 0.00016987253065909504, + "loss": 3.0311, + "step": 4992 + }, + { + "epoch": 0.45235668501279697, + "grad_norm": 0.7946193218231201, + "learning_rate": 0.00016986648945810427, + "loss": 2.7853, + "step": 4993 + }, + { + "epoch": 0.45244728318724375, + "grad_norm": 0.739597737789154, + "learning_rate": 0.0001698604482571135, + "loss": 2.7606, + "step": 4994 + }, + { + "epoch": 0.45253788136169054, + "grad_norm": 0.7672573328018188, + "learning_rate": 0.00016985440705612277, + "loss": 2.7391, + "step": 4995 + }, + { + "epoch": 0.45262847953613733, + "grad_norm": 0.7877441644668579, + "learning_rate": 0.00016984836585513203, + "loss": 2.8802, + "step": 4996 + }, + { + "epoch": 0.4527190777105841, + "grad_norm": 0.8132261633872986, + "learning_rate": 0.00016984232465414126, + "loss": 2.9976, + "step": 4997 + }, + { + "epoch": 0.4528096758850309, + "grad_norm": 0.8082935810089111, + "learning_rate": 0.0001698362834531505, + "loss": 2.9655, + "step": 4998 + }, + { + "epoch": 0.4529002740594777, + "grad_norm": 0.777092695236206, + "learning_rate": 0.00016983024225215973, + "loss": 2.7697, + "step": 4999 + }, + { + "epoch": 0.4529908722339245, + "grad_norm": 0.7289410829544067, + "learning_rate": 0.000169824201051169, + "loss": 2.7333, + "step": 5000 + }, + { + "epoch": 0.45308147040837127, + "grad_norm": 0.7790085673332214, + "learning_rate": 0.00016981815985017823, + "loss": 2.7798, + "step": 5001 + }, + { + "epoch": 0.45317206858281806, + "grad_norm": 0.7375802397727966, + "learning_rate": 0.00016981211864918746, + "loss": 2.778, + "step": 5002 + }, + { + "epoch": 0.45326266675726484, + "grad_norm": 0.7860432267189026, + "learning_rate": 0.0001698060774481967, + "loss": 2.9943, + "step": 5003 + }, + { + "epoch": 0.45335326493171163, + "grad_norm": 0.7327303886413574, + "learning_rate": 0.00016980003624720595, + "loss": 2.6643, + "step": 5004 + }, + { + "epoch": 0.4534438631061584, + "grad_norm": 0.7646451592445374, + "learning_rate": 0.00016979399504621522, + "loss": 2.8642, + "step": 5005 + }, + { + "epoch": 0.4535344612806052, + "grad_norm": 0.752808690071106, + "learning_rate": 0.00016978795384522442, + "loss": 3.1321, + "step": 5006 + }, + { + "epoch": 0.453625059455052, + "grad_norm": 0.7898646593093872, + "learning_rate": 0.00016978191264423368, + "loss": 2.5466, + "step": 5007 + }, + { + "epoch": 0.4537156576294988, + "grad_norm": 0.6429208517074585, + "learning_rate": 0.00016977587144324292, + "loss": 2.1691, + "step": 5008 + }, + { + "epoch": 0.45380625580394557, + "grad_norm": 0.7328996658325195, + "learning_rate": 0.00016976983024225218, + "loss": 3.0042, + "step": 5009 + }, + { + "epoch": 0.45389685397839236, + "grad_norm": 0.7677553296089172, + "learning_rate": 0.0001697637890412614, + "loss": 2.8774, + "step": 5010 + }, + { + "epoch": 0.45398745215283914, + "grad_norm": 0.6977699398994446, + "learning_rate": 0.00016975774784027065, + "loss": 2.2904, + "step": 5011 + }, + { + "epoch": 0.45407805032728593, + "grad_norm": 0.8317604660987854, + "learning_rate": 0.0001697517066392799, + "loss": 2.7998, + "step": 5012 + }, + { + "epoch": 0.45416864850173266, + "grad_norm": 0.7973432540893555, + "learning_rate": 0.00016974566543828914, + "loss": 2.7928, + "step": 5013 + }, + { + "epoch": 0.45425924667617945, + "grad_norm": 0.645167350769043, + "learning_rate": 0.00016973962423729838, + "loss": 2.0928, + "step": 5014 + }, + { + "epoch": 0.45434984485062624, + "grad_norm": 0.7350099682807922, + "learning_rate": 0.0001697335830363076, + "loss": 2.794, + "step": 5015 + }, + { + "epoch": 0.454440443025073, + "grad_norm": 0.7790660262107849, + "learning_rate": 0.00016972754183531687, + "loss": 2.8705, + "step": 5016 + }, + { + "epoch": 0.4545310411995198, + "grad_norm": 0.8343010544776917, + "learning_rate": 0.0001697215006343261, + "loss": 2.7611, + "step": 5017 + }, + { + "epoch": 0.4546216393739666, + "grad_norm": 0.7641253471374512, + "learning_rate": 0.00016971545943333537, + "loss": 2.8084, + "step": 5018 + }, + { + "epoch": 0.4547122375484134, + "grad_norm": 0.7949053645133972, + "learning_rate": 0.0001697094182323446, + "loss": 2.921, + "step": 5019 + }, + { + "epoch": 0.4548028357228602, + "grad_norm": 0.7487255334854126, + "learning_rate": 0.00016970337703135384, + "loss": 2.8112, + "step": 5020 + }, + { + "epoch": 0.45489343389730696, + "grad_norm": 0.7823983430862427, + "learning_rate": 0.0001696973358303631, + "loss": 3.0073, + "step": 5021 + }, + { + "epoch": 0.45498403207175375, + "grad_norm": 0.7159098982810974, + "learning_rate": 0.00016969129462937233, + "loss": 2.1412, + "step": 5022 + }, + { + "epoch": 0.45507463024620054, + "grad_norm": 0.7626204490661621, + "learning_rate": 0.00016968525342838156, + "loss": 2.1884, + "step": 5023 + }, + { + "epoch": 0.4551652284206473, + "grad_norm": 0.7375723123550415, + "learning_rate": 0.0001696792122273908, + "loss": 2.7692, + "step": 5024 + }, + { + "epoch": 0.4552558265950941, + "grad_norm": 0.7489473223686218, + "learning_rate": 0.00016967317102640006, + "loss": 2.804, + "step": 5025 + }, + { + "epoch": 0.4553464247695409, + "grad_norm": 0.7877167463302612, + "learning_rate": 0.00016966712982540932, + "loss": 2.9024, + "step": 5026 + }, + { + "epoch": 0.4554370229439877, + "grad_norm": 0.7699241638183594, + "learning_rate": 0.00016966108862441853, + "loss": 3.0354, + "step": 5027 + }, + { + "epoch": 0.4555276211184345, + "grad_norm": 0.8364891409873962, + "learning_rate": 0.0001696550474234278, + "loss": 3.0045, + "step": 5028 + }, + { + "epoch": 0.45561821929288127, + "grad_norm": 0.7637171745300293, + "learning_rate": 0.00016964900622243702, + "loss": 2.8961, + "step": 5029 + }, + { + "epoch": 0.45570881746732805, + "grad_norm": 0.752342700958252, + "learning_rate": 0.00016964296502144628, + "loss": 2.6938, + "step": 5030 + }, + { + "epoch": 0.45579941564177484, + "grad_norm": 0.8519414067268372, + "learning_rate": 0.00016963692382045552, + "loss": 2.9146, + "step": 5031 + }, + { + "epoch": 0.45589001381622163, + "grad_norm": 0.7490873336791992, + "learning_rate": 0.00016963088261946475, + "loss": 2.4625, + "step": 5032 + }, + { + "epoch": 0.45598061199066836, + "grad_norm": 0.9337019324302673, + "learning_rate": 0.00016962484141847399, + "loss": 2.7037, + "step": 5033 + }, + { + "epoch": 0.45607121016511515, + "grad_norm": 0.8887954950332642, + "learning_rate": 0.00016961880021748325, + "loss": 2.67, + "step": 5034 + }, + { + "epoch": 0.45616180833956194, + "grad_norm": 0.7799996733665466, + "learning_rate": 0.0001696127590164925, + "loss": 3.0885, + "step": 5035 + }, + { + "epoch": 0.4562524065140087, + "grad_norm": 0.767702579498291, + "learning_rate": 0.00016960671781550172, + "loss": 2.8491, + "step": 5036 + }, + { + "epoch": 0.4563430046884555, + "grad_norm": 0.7910240888595581, + "learning_rate": 0.00016960067661451098, + "loss": 2.7624, + "step": 5037 + }, + { + "epoch": 0.4564336028629023, + "grad_norm": 0.8051721453666687, + "learning_rate": 0.0001695946354135202, + "loss": 2.7621, + "step": 5038 + }, + { + "epoch": 0.4565242010373491, + "grad_norm": 0.77924644947052, + "learning_rate": 0.00016958859421252947, + "loss": 2.8434, + "step": 5039 + }, + { + "epoch": 0.4566147992117959, + "grad_norm": 0.6660674214363098, + "learning_rate": 0.00016958255301153868, + "loss": 2.0442, + "step": 5040 + }, + { + "epoch": 0.45670539738624266, + "grad_norm": 0.802824854850769, + "learning_rate": 0.00016957651181054794, + "loss": 2.8568, + "step": 5041 + }, + { + "epoch": 0.45679599556068945, + "grad_norm": 0.7182466983795166, + "learning_rate": 0.0001695704706095572, + "loss": 2.7517, + "step": 5042 + }, + { + "epoch": 0.45688659373513624, + "grad_norm": 0.8069086670875549, + "learning_rate": 0.00016956442940856644, + "loss": 2.857, + "step": 5043 + }, + { + "epoch": 0.456977191909583, + "grad_norm": 0.7852795720100403, + "learning_rate": 0.00016955838820757567, + "loss": 2.8813, + "step": 5044 + }, + { + "epoch": 0.4570677900840298, + "grad_norm": 0.8065798878669739, + "learning_rate": 0.0001695523470065849, + "loss": 2.8364, + "step": 5045 + }, + { + "epoch": 0.4571583882584766, + "grad_norm": 0.7545676231384277, + "learning_rate": 0.00016954630580559416, + "loss": 2.7208, + "step": 5046 + }, + { + "epoch": 0.4572489864329234, + "grad_norm": 0.7547599077224731, + "learning_rate": 0.0001695402646046034, + "loss": 2.6947, + "step": 5047 + }, + { + "epoch": 0.4573395846073702, + "grad_norm": 0.7957379817962646, + "learning_rate": 0.00016953422340361266, + "loss": 2.9856, + "step": 5048 + }, + { + "epoch": 0.45743018278181696, + "grad_norm": 0.7059924006462097, + "learning_rate": 0.0001695281822026219, + "loss": 2.6324, + "step": 5049 + }, + { + "epoch": 0.45752078095626375, + "grad_norm": 0.7658380270004272, + "learning_rate": 0.00016952214100163113, + "loss": 2.7686, + "step": 5050 + }, + { + "epoch": 0.45761137913071054, + "grad_norm": 0.7406240105628967, + "learning_rate": 0.0001695160998006404, + "loss": 2.6468, + "step": 5051 + }, + { + "epoch": 0.4577019773051573, + "grad_norm": 0.6989923715591431, + "learning_rate": 0.00016951005859964962, + "loss": 2.56, + "step": 5052 + }, + { + "epoch": 0.4577925754796041, + "grad_norm": 0.794014573097229, + "learning_rate": 0.00016950401739865886, + "loss": 2.853, + "step": 5053 + }, + { + "epoch": 0.45788317365405085, + "grad_norm": 0.7469330430030823, + "learning_rate": 0.0001694979761976681, + "loss": 2.7599, + "step": 5054 + }, + { + "epoch": 0.45797377182849763, + "grad_norm": 0.7118890285491943, + "learning_rate": 0.00016949193499667735, + "loss": 2.8359, + "step": 5055 + }, + { + "epoch": 0.4580643700029444, + "grad_norm": 0.7773594856262207, + "learning_rate": 0.0001694858937956866, + "loss": 2.7834, + "step": 5056 + }, + { + "epoch": 0.4581549681773912, + "grad_norm": 0.8349024653434753, + "learning_rate": 0.00016947985259469582, + "loss": 3.0337, + "step": 5057 + }, + { + "epoch": 0.458245566351838, + "grad_norm": 0.7494614124298096, + "learning_rate": 0.00016947381139370508, + "loss": 2.734, + "step": 5058 + }, + { + "epoch": 0.4583361645262848, + "grad_norm": 0.774398148059845, + "learning_rate": 0.00016946777019271432, + "loss": 2.8089, + "step": 5059 + }, + { + "epoch": 0.45842676270073157, + "grad_norm": 0.7685260772705078, + "learning_rate": 0.00016946172899172358, + "loss": 3.115, + "step": 5060 + }, + { + "epoch": 0.45851736087517836, + "grad_norm": 0.7624256014823914, + "learning_rate": 0.0001694556877907328, + "loss": 2.7806, + "step": 5061 + }, + { + "epoch": 0.45860795904962515, + "grad_norm": 0.7913388013839722, + "learning_rate": 0.00016944964658974204, + "loss": 2.7278, + "step": 5062 + }, + { + "epoch": 0.45869855722407193, + "grad_norm": 0.7345370650291443, + "learning_rate": 0.00016944360538875128, + "loss": 2.1185, + "step": 5063 + }, + { + "epoch": 0.4587891553985187, + "grad_norm": 0.7513041496276855, + "learning_rate": 0.00016943756418776054, + "loss": 2.9597, + "step": 5064 + }, + { + "epoch": 0.4588797535729655, + "grad_norm": 0.7864038348197937, + "learning_rate": 0.00016943152298676977, + "loss": 2.9037, + "step": 5065 + }, + { + "epoch": 0.4589703517474123, + "grad_norm": 0.8560469150543213, + "learning_rate": 0.000169425481785779, + "loss": 2.7294, + "step": 5066 + }, + { + "epoch": 0.4590609499218591, + "grad_norm": 0.679572582244873, + "learning_rate": 0.00016941944058478827, + "loss": 2.2746, + "step": 5067 + }, + { + "epoch": 0.4591515480963059, + "grad_norm": 0.7598720788955688, + "learning_rate": 0.0001694133993837975, + "loss": 2.7437, + "step": 5068 + }, + { + "epoch": 0.45924214627075266, + "grad_norm": 0.7750831246376038, + "learning_rate": 0.00016940735818280676, + "loss": 2.7302, + "step": 5069 + }, + { + "epoch": 0.45933274444519945, + "grad_norm": 0.7897775173187256, + "learning_rate": 0.00016940131698181597, + "loss": 2.8964, + "step": 5070 + }, + { + "epoch": 0.45942334261964624, + "grad_norm": 0.7293692827224731, + "learning_rate": 0.00016939527578082523, + "loss": 2.0707, + "step": 5071 + }, + { + "epoch": 0.459513940794093, + "grad_norm": 0.6218019127845764, + "learning_rate": 0.0001693892345798345, + "loss": 1.9576, + "step": 5072 + }, + { + "epoch": 0.4596045389685398, + "grad_norm": 0.740358829498291, + "learning_rate": 0.00016938319337884373, + "loss": 2.6269, + "step": 5073 + }, + { + "epoch": 0.45969513714298654, + "grad_norm": 0.7571755051612854, + "learning_rate": 0.00016937715217785296, + "loss": 2.7163, + "step": 5074 + }, + { + "epoch": 0.45978573531743333, + "grad_norm": 0.7377550601959229, + "learning_rate": 0.0001693711109768622, + "loss": 2.6788, + "step": 5075 + }, + { + "epoch": 0.4598763334918801, + "grad_norm": 0.8424228429794312, + "learning_rate": 0.00016936506977587146, + "loss": 2.7626, + "step": 5076 + }, + { + "epoch": 0.4599669316663269, + "grad_norm": 0.7657473087310791, + "learning_rate": 0.0001693590285748807, + "loss": 2.7948, + "step": 5077 + }, + { + "epoch": 0.4600575298407737, + "grad_norm": 0.7830277681350708, + "learning_rate": 0.00016935298737388993, + "loss": 2.8828, + "step": 5078 + }, + { + "epoch": 0.4601481280152205, + "grad_norm": 0.8069890737533569, + "learning_rate": 0.00016934694617289919, + "loss": 2.7414, + "step": 5079 + }, + { + "epoch": 0.46023872618966727, + "grad_norm": 0.8267145156860352, + "learning_rate": 0.00016934090497190842, + "loss": 3.0989, + "step": 5080 + }, + { + "epoch": 0.46032932436411406, + "grad_norm": 0.7650925517082214, + "learning_rate": 0.00016933486377091768, + "loss": 2.6374, + "step": 5081 + }, + { + "epoch": 0.46041992253856084, + "grad_norm": 0.7330838441848755, + "learning_rate": 0.00016932882256992692, + "loss": 2.3002, + "step": 5082 + }, + { + "epoch": 0.46051052071300763, + "grad_norm": 0.8721592426300049, + "learning_rate": 0.00016932278136893615, + "loss": 3.1839, + "step": 5083 + }, + { + "epoch": 0.4606011188874544, + "grad_norm": 0.7417718768119812, + "learning_rate": 0.00016931674016794538, + "loss": 2.7921, + "step": 5084 + }, + { + "epoch": 0.4606917170619012, + "grad_norm": 0.7621380686759949, + "learning_rate": 0.00016931069896695464, + "loss": 3.0633, + "step": 5085 + }, + { + "epoch": 0.460782315236348, + "grad_norm": 0.6851346492767334, + "learning_rate": 0.00016930465776596388, + "loss": 2.2882, + "step": 5086 + }, + { + "epoch": 0.4608729134107948, + "grad_norm": 0.7303885221481323, + "learning_rate": 0.0001692986165649731, + "loss": 2.6339, + "step": 5087 + }, + { + "epoch": 0.46096351158524157, + "grad_norm": 0.7586783170700073, + "learning_rate": 0.00016929257536398237, + "loss": 2.2232, + "step": 5088 + }, + { + "epoch": 0.46105410975968836, + "grad_norm": 0.665046215057373, + "learning_rate": 0.0001692865341629916, + "loss": 2.1443, + "step": 5089 + }, + { + "epoch": 0.46114470793413515, + "grad_norm": 0.7913390398025513, + "learning_rate": 0.00016928049296200087, + "loss": 2.9334, + "step": 5090 + }, + { + "epoch": 0.46123530610858193, + "grad_norm": 0.7833670973777771, + "learning_rate": 0.00016927445176101008, + "loss": 2.7976, + "step": 5091 + }, + { + "epoch": 0.4613259042830287, + "grad_norm": 0.7661607265472412, + "learning_rate": 0.00016926841056001934, + "loss": 2.908, + "step": 5092 + }, + { + "epoch": 0.4614165024574755, + "grad_norm": 0.7801427841186523, + "learning_rate": 0.00016926236935902857, + "loss": 2.6612, + "step": 5093 + }, + { + "epoch": 0.46150710063192224, + "grad_norm": 0.7626616954803467, + "learning_rate": 0.00016925632815803783, + "loss": 2.7736, + "step": 5094 + }, + { + "epoch": 0.46159769880636903, + "grad_norm": 0.7194672226905823, + "learning_rate": 0.00016925028695704707, + "loss": 2.7919, + "step": 5095 + }, + { + "epoch": 0.4616882969808158, + "grad_norm": 0.6458209753036499, + "learning_rate": 0.0001692442457560563, + "loss": 2.1061, + "step": 5096 + }, + { + "epoch": 0.4617788951552626, + "grad_norm": 0.6443226933479309, + "learning_rate": 0.00016923820455506556, + "loss": 2.1241, + "step": 5097 + }, + { + "epoch": 0.4618694933297094, + "grad_norm": 0.807992160320282, + "learning_rate": 0.0001692321633540748, + "loss": 2.9651, + "step": 5098 + }, + { + "epoch": 0.4619600915041562, + "grad_norm": 0.7547837495803833, + "learning_rate": 0.00016922612215308403, + "loss": 2.1105, + "step": 5099 + }, + { + "epoch": 0.46205068967860297, + "grad_norm": 0.7800476551055908, + "learning_rate": 0.00016922008095209326, + "loss": 2.9309, + "step": 5100 + }, + { + "epoch": 0.46214128785304975, + "grad_norm": 0.7517395615577698, + "learning_rate": 0.00016921403975110253, + "loss": 2.9995, + "step": 5101 + }, + { + "epoch": 0.46223188602749654, + "grad_norm": 0.7896794676780701, + "learning_rate": 0.0001692079985501118, + "loss": 2.7443, + "step": 5102 + }, + { + "epoch": 0.46232248420194333, + "grad_norm": 0.8102203011512756, + "learning_rate": 0.00016920195734912102, + "loss": 2.9095, + "step": 5103 + }, + { + "epoch": 0.4624130823763901, + "grad_norm": 0.8467516899108887, + "learning_rate": 0.00016919591614813025, + "loss": 2.7697, + "step": 5104 + }, + { + "epoch": 0.4625036805508369, + "grad_norm": 0.7562469244003296, + "learning_rate": 0.0001691898749471395, + "loss": 2.9448, + "step": 5105 + }, + { + "epoch": 0.4625942787252837, + "grad_norm": 0.7826890349388123, + "learning_rate": 0.00016918383374614875, + "loss": 2.8749, + "step": 5106 + }, + { + "epoch": 0.4626848768997305, + "grad_norm": 0.753295361995697, + "learning_rate": 0.00016917779254515798, + "loss": 2.5068, + "step": 5107 + }, + { + "epoch": 0.46277547507417727, + "grad_norm": 0.7716496586799622, + "learning_rate": 0.00016917175134416722, + "loss": 2.6028, + "step": 5108 + }, + { + "epoch": 0.46286607324862405, + "grad_norm": 0.7607266902923584, + "learning_rate": 0.00016916571014317648, + "loss": 2.761, + "step": 5109 + }, + { + "epoch": 0.46295667142307084, + "grad_norm": 0.7362285256385803, + "learning_rate": 0.0001691596689421857, + "loss": 2.2295, + "step": 5110 + }, + { + "epoch": 0.46304726959751763, + "grad_norm": 0.7450121641159058, + "learning_rate": 0.00016915362774119497, + "loss": 2.8611, + "step": 5111 + }, + { + "epoch": 0.4631378677719644, + "grad_norm": 0.7322564721107483, + "learning_rate": 0.0001691475865402042, + "loss": 2.1063, + "step": 5112 + }, + { + "epoch": 0.4632284659464112, + "grad_norm": 0.7924812436103821, + "learning_rate": 0.00016914154533921344, + "loss": 2.6947, + "step": 5113 + }, + { + "epoch": 0.46331906412085794, + "grad_norm": 0.747995138168335, + "learning_rate": 0.00016913550413822268, + "loss": 2.9404, + "step": 5114 + }, + { + "epoch": 0.4634096622953047, + "grad_norm": 0.862003743648529, + "learning_rate": 0.00016912946293723194, + "loss": 2.9843, + "step": 5115 + }, + { + "epoch": 0.4635002604697515, + "grad_norm": 0.765844464302063, + "learning_rate": 0.00016912342173624117, + "loss": 2.7854, + "step": 5116 + }, + { + "epoch": 0.4635908586441983, + "grad_norm": 0.7996437549591064, + "learning_rate": 0.0001691173805352504, + "loss": 2.9475, + "step": 5117 + }, + { + "epoch": 0.4636814568186451, + "grad_norm": 0.7964616417884827, + "learning_rate": 0.00016911133933425967, + "loss": 3.2657, + "step": 5118 + }, + { + "epoch": 0.4637720549930919, + "grad_norm": 0.7524400353431702, + "learning_rate": 0.0001691052981332689, + "loss": 2.9463, + "step": 5119 + }, + { + "epoch": 0.46386265316753866, + "grad_norm": 0.7774505019187927, + "learning_rate": 0.00016909925693227816, + "loss": 2.9906, + "step": 5120 + }, + { + "epoch": 0.46395325134198545, + "grad_norm": 0.8234672546386719, + "learning_rate": 0.00016909321573128737, + "loss": 2.6364, + "step": 5121 + }, + { + "epoch": 0.46404384951643224, + "grad_norm": 0.7490497827529907, + "learning_rate": 0.00016908717453029663, + "loss": 2.7744, + "step": 5122 + }, + { + "epoch": 0.464134447690879, + "grad_norm": 0.7243006229400635, + "learning_rate": 0.00016908113332930586, + "loss": 2.7285, + "step": 5123 + }, + { + "epoch": 0.4642250458653258, + "grad_norm": 0.7356950044631958, + "learning_rate": 0.00016907509212831513, + "loss": 2.9216, + "step": 5124 + }, + { + "epoch": 0.4643156440397726, + "grad_norm": 0.8962894082069397, + "learning_rate": 0.00016906905092732436, + "loss": 2.7328, + "step": 5125 + }, + { + "epoch": 0.4644062422142194, + "grad_norm": 0.7054429054260254, + "learning_rate": 0.0001690630097263336, + "loss": 2.7846, + "step": 5126 + }, + { + "epoch": 0.4644968403886662, + "grad_norm": 0.7601459622383118, + "learning_rate": 0.00016905696852534285, + "loss": 2.4638, + "step": 5127 + }, + { + "epoch": 0.46458743856311296, + "grad_norm": 0.8157910108566284, + "learning_rate": 0.0001690509273243521, + "loss": 2.6275, + "step": 5128 + }, + { + "epoch": 0.46467803673755975, + "grad_norm": 0.7457374930381775, + "learning_rate": 0.00016904488612336132, + "loss": 2.8197, + "step": 5129 + }, + { + "epoch": 0.46476863491200654, + "grad_norm": 0.7267411351203918, + "learning_rate": 0.00016903884492237056, + "loss": 2.5885, + "step": 5130 + }, + { + "epoch": 0.4648592330864533, + "grad_norm": 0.8111249804496765, + "learning_rate": 0.00016903280372137982, + "loss": 2.8694, + "step": 5131 + }, + { + "epoch": 0.4649498312609001, + "grad_norm": 0.7139849662780762, + "learning_rate": 0.00016902676252038908, + "loss": 2.3167, + "step": 5132 + }, + { + "epoch": 0.4650404294353469, + "grad_norm": 0.889630138874054, + "learning_rate": 0.0001690207213193983, + "loss": 2.9134, + "step": 5133 + }, + { + "epoch": 0.4651310276097937, + "grad_norm": 0.7416114807128906, + "learning_rate": 0.00016901468011840755, + "loss": 2.7517, + "step": 5134 + }, + { + "epoch": 0.4652216257842404, + "grad_norm": 0.7338166236877441, + "learning_rate": 0.00016900863891741678, + "loss": 3.1279, + "step": 5135 + }, + { + "epoch": 0.4653122239586872, + "grad_norm": 0.8670511841773987, + "learning_rate": 0.00016900259771642604, + "loss": 2.9469, + "step": 5136 + }, + { + "epoch": 0.465402822133134, + "grad_norm": 0.8166145086288452, + "learning_rate": 0.00016899655651543528, + "loss": 2.8117, + "step": 5137 + }, + { + "epoch": 0.4654934203075808, + "grad_norm": 0.7483142614364624, + "learning_rate": 0.0001689905153144445, + "loss": 2.7452, + "step": 5138 + }, + { + "epoch": 0.4655840184820276, + "grad_norm": 0.7961382269859314, + "learning_rate": 0.00016898447411345377, + "loss": 3.0453, + "step": 5139 + }, + { + "epoch": 0.46567461665647436, + "grad_norm": 0.7443453073501587, + "learning_rate": 0.000168978432912463, + "loss": 2.8275, + "step": 5140 + }, + { + "epoch": 0.46576521483092115, + "grad_norm": 0.7552531957626343, + "learning_rate": 0.00016897239171147227, + "loss": 2.9705, + "step": 5141 + }, + { + "epoch": 0.46585581300536794, + "grad_norm": 0.742904007434845, + "learning_rate": 0.00016896635051048147, + "loss": 2.9288, + "step": 5142 + }, + { + "epoch": 0.4659464111798147, + "grad_norm": 0.7839171886444092, + "learning_rate": 0.00016896030930949074, + "loss": 2.7449, + "step": 5143 + }, + { + "epoch": 0.4660370093542615, + "grad_norm": 0.727437436580658, + "learning_rate": 0.00016895426810849997, + "loss": 2.7867, + "step": 5144 + }, + { + "epoch": 0.4661276075287083, + "grad_norm": 0.7325690388679504, + "learning_rate": 0.00016894822690750923, + "loss": 2.3202, + "step": 5145 + }, + { + "epoch": 0.4662182057031551, + "grad_norm": 0.7400582432746887, + "learning_rate": 0.00016894218570651846, + "loss": 2.8614, + "step": 5146 + }, + { + "epoch": 0.4663088038776019, + "grad_norm": 0.8204727172851562, + "learning_rate": 0.0001689361445055277, + "loss": 2.8474, + "step": 5147 + }, + { + "epoch": 0.46639940205204866, + "grad_norm": 0.8120856881141663, + "learning_rate": 0.00016893010330453696, + "loss": 2.9579, + "step": 5148 + }, + { + "epoch": 0.46649000022649545, + "grad_norm": 0.7278134822845459, + "learning_rate": 0.0001689240621035462, + "loss": 2.6707, + "step": 5149 + }, + { + "epoch": 0.46658059840094224, + "grad_norm": 0.7518181800842285, + "learning_rate": 0.00016891802090255543, + "loss": 2.7471, + "step": 5150 + }, + { + "epoch": 0.466671196575389, + "grad_norm": 0.7501921057701111, + "learning_rate": 0.00016891197970156466, + "loss": 2.93, + "step": 5151 + }, + { + "epoch": 0.4667617947498358, + "grad_norm": 0.7792571187019348, + "learning_rate": 0.00016890593850057392, + "loss": 3.1812, + "step": 5152 + }, + { + "epoch": 0.4668523929242826, + "grad_norm": 0.7791383266448975, + "learning_rate": 0.00016889989729958318, + "loss": 2.9548, + "step": 5153 + }, + { + "epoch": 0.4669429910987294, + "grad_norm": 0.7710021138191223, + "learning_rate": 0.00016889385609859242, + "loss": 2.7159, + "step": 5154 + }, + { + "epoch": 0.4670335892731761, + "grad_norm": 0.8034720420837402, + "learning_rate": 0.00016888781489760165, + "loss": 2.8211, + "step": 5155 + }, + { + "epoch": 0.4671241874476229, + "grad_norm": 0.7916325330734253, + "learning_rate": 0.00016888177369661089, + "loss": 2.9221, + "step": 5156 + }, + { + "epoch": 0.4672147856220697, + "grad_norm": 0.8168415427207947, + "learning_rate": 0.00016887573249562015, + "loss": 2.6704, + "step": 5157 + }, + { + "epoch": 0.4673053837965165, + "grad_norm": 0.8288335204124451, + "learning_rate": 0.00016886969129462938, + "loss": 2.8201, + "step": 5158 + }, + { + "epoch": 0.46739598197096327, + "grad_norm": 0.7475084662437439, + "learning_rate": 0.00016886365009363862, + "loss": 2.956, + "step": 5159 + }, + { + "epoch": 0.46748658014541006, + "grad_norm": 0.7752896547317505, + "learning_rate": 0.00016885760889264785, + "loss": 3.1065, + "step": 5160 + }, + { + "epoch": 0.46757717831985685, + "grad_norm": 0.7951609492301941, + "learning_rate": 0.0001688515676916571, + "loss": 2.7932, + "step": 5161 + }, + { + "epoch": 0.46766777649430363, + "grad_norm": 0.7307072877883911, + "learning_rate": 0.00016884552649066637, + "loss": 2.4268, + "step": 5162 + }, + { + "epoch": 0.4677583746687504, + "grad_norm": 0.8264990448951721, + "learning_rate": 0.00016883948528967558, + "loss": 2.6683, + "step": 5163 + }, + { + "epoch": 0.4678489728431972, + "grad_norm": 0.7722190618515015, + "learning_rate": 0.00016883344408868484, + "loss": 2.7266, + "step": 5164 + }, + { + "epoch": 0.467939571017644, + "grad_norm": 0.7864798307418823, + "learning_rate": 0.00016882740288769407, + "loss": 3.0345, + "step": 5165 + }, + { + "epoch": 0.4680301691920908, + "grad_norm": 0.8906176686286926, + "learning_rate": 0.00016882136168670334, + "loss": 2.7917, + "step": 5166 + }, + { + "epoch": 0.46812076736653757, + "grad_norm": 0.7304551601409912, + "learning_rate": 0.00016881532048571257, + "loss": 3.0312, + "step": 5167 + }, + { + "epoch": 0.46821136554098436, + "grad_norm": 0.8245891332626343, + "learning_rate": 0.0001688092792847218, + "loss": 3.0537, + "step": 5168 + }, + { + "epoch": 0.46830196371543115, + "grad_norm": 0.7323639988899231, + "learning_rate": 0.00016880323808373106, + "loss": 2.9531, + "step": 5169 + }, + { + "epoch": 0.46839256188987793, + "grad_norm": 0.7237206697463989, + "learning_rate": 0.0001687971968827403, + "loss": 2.7444, + "step": 5170 + }, + { + "epoch": 0.4684831600643247, + "grad_norm": 0.7948635220527649, + "learning_rate": 0.00016879115568174956, + "loss": 2.9833, + "step": 5171 + }, + { + "epoch": 0.4685737582387715, + "grad_norm": 0.7182056307792664, + "learning_rate": 0.00016878511448075877, + "loss": 2.6463, + "step": 5172 + }, + { + "epoch": 0.4686643564132183, + "grad_norm": 0.7666380405426025, + "learning_rate": 0.00016877907327976803, + "loss": 2.7968, + "step": 5173 + }, + { + "epoch": 0.4687549545876651, + "grad_norm": 0.7636033892631531, + "learning_rate": 0.00016877303207877726, + "loss": 2.9441, + "step": 5174 + }, + { + "epoch": 0.4688455527621118, + "grad_norm": 0.7432243227958679, + "learning_rate": 0.00016876699087778652, + "loss": 2.8908, + "step": 5175 + }, + { + "epoch": 0.4689361509365586, + "grad_norm": 0.7290223240852356, + "learning_rate": 0.00016876094967679576, + "loss": 2.8077, + "step": 5176 + }, + { + "epoch": 0.4690267491110054, + "grad_norm": 0.8003751039505005, + "learning_rate": 0.000168754908475805, + "loss": 2.8954, + "step": 5177 + }, + { + "epoch": 0.4691173472854522, + "grad_norm": 0.7642250657081604, + "learning_rate": 0.00016874886727481425, + "loss": 3.0195, + "step": 5178 + }, + { + "epoch": 0.46920794545989897, + "grad_norm": 0.7288727760314941, + "learning_rate": 0.00016874282607382349, + "loss": 2.9213, + "step": 5179 + }, + { + "epoch": 0.46929854363434576, + "grad_norm": 0.7535523176193237, + "learning_rate": 0.00016873678487283272, + "loss": 2.8115, + "step": 5180 + }, + { + "epoch": 0.46938914180879254, + "grad_norm": 0.7587772607803345, + "learning_rate": 0.00016873074367184195, + "loss": 2.9173, + "step": 5181 + }, + { + "epoch": 0.46947973998323933, + "grad_norm": 0.753482460975647, + "learning_rate": 0.00016872470247085122, + "loss": 2.7245, + "step": 5182 + }, + { + "epoch": 0.4695703381576861, + "grad_norm": 0.778948962688446, + "learning_rate": 0.00016871866126986048, + "loss": 2.725, + "step": 5183 + }, + { + "epoch": 0.4696609363321329, + "grad_norm": 0.7641888856887817, + "learning_rate": 0.0001687126200688697, + "loss": 2.5885, + "step": 5184 + }, + { + "epoch": 0.4697515345065797, + "grad_norm": 0.7935823798179626, + "learning_rate": 0.00016870657886787894, + "loss": 2.8739, + "step": 5185 + }, + { + "epoch": 0.4698421326810265, + "grad_norm": 0.7317373156547546, + "learning_rate": 0.00016870053766688818, + "loss": 2.1485, + "step": 5186 + }, + { + "epoch": 0.46993273085547327, + "grad_norm": 0.7910207509994507, + "learning_rate": 0.00016869449646589744, + "loss": 2.8671, + "step": 5187 + }, + { + "epoch": 0.47002332902992006, + "grad_norm": 0.7806037068367004, + "learning_rate": 0.00016868845526490667, + "loss": 2.8697, + "step": 5188 + }, + { + "epoch": 0.47011392720436684, + "grad_norm": 0.7591358423233032, + "learning_rate": 0.0001686824140639159, + "loss": 2.8203, + "step": 5189 + }, + { + "epoch": 0.47020452537881363, + "grad_norm": 0.7965754270553589, + "learning_rate": 0.00016867637286292514, + "loss": 2.7098, + "step": 5190 + }, + { + "epoch": 0.4702951235532604, + "grad_norm": 0.7651569247245789, + "learning_rate": 0.0001686703316619344, + "loss": 2.768, + "step": 5191 + }, + { + "epoch": 0.4703857217277072, + "grad_norm": 0.718097984790802, + "learning_rate": 0.00016866429046094366, + "loss": 2.9084, + "step": 5192 + }, + { + "epoch": 0.470476319902154, + "grad_norm": 0.7542867660522461, + "learning_rate": 0.00016865824925995287, + "loss": 2.8948, + "step": 5193 + }, + { + "epoch": 0.4705669180766008, + "grad_norm": 0.8145817518234253, + "learning_rate": 0.00016865220805896213, + "loss": 2.5653, + "step": 5194 + }, + { + "epoch": 0.4706575162510475, + "grad_norm": 0.7923183441162109, + "learning_rate": 0.00016864616685797137, + "loss": 2.9484, + "step": 5195 + }, + { + "epoch": 0.4707481144254943, + "grad_norm": 0.7430731058120728, + "learning_rate": 0.00016864012565698063, + "loss": 2.7818, + "step": 5196 + }, + { + "epoch": 0.4708387125999411, + "grad_norm": 0.7718393206596375, + "learning_rate": 0.00016863408445598986, + "loss": 2.8797, + "step": 5197 + }, + { + "epoch": 0.4709293107743879, + "grad_norm": 0.764340877532959, + "learning_rate": 0.0001686280432549991, + "loss": 2.8853, + "step": 5198 + }, + { + "epoch": 0.47101990894883466, + "grad_norm": 0.7259171605110168, + "learning_rate": 0.00016862200205400836, + "loss": 2.8827, + "step": 5199 + }, + { + "epoch": 0.47111050712328145, + "grad_norm": 0.6985301375389099, + "learning_rate": 0.0001686159608530176, + "loss": 2.6329, + "step": 5200 + }, + { + "epoch": 0.47120110529772824, + "grad_norm": 0.5451188087463379, + "learning_rate": 0.00016860991965202683, + "loss": 1.3663, + "step": 5201 + }, + { + "epoch": 0.471291703472175, + "grad_norm": 0.8115806579589844, + "learning_rate": 0.00016860387845103606, + "loss": 2.8382, + "step": 5202 + }, + { + "epoch": 0.4713823016466218, + "grad_norm": 0.7737947106361389, + "learning_rate": 0.00016859783725004532, + "loss": 2.8346, + "step": 5203 + }, + { + "epoch": 0.4714728998210686, + "grad_norm": 0.7439373731613159, + "learning_rate": 0.00016859179604905455, + "loss": 2.7843, + "step": 5204 + }, + { + "epoch": 0.4715634979955154, + "grad_norm": 0.7381324172019958, + "learning_rate": 0.00016858575484806382, + "loss": 2.905, + "step": 5205 + }, + { + "epoch": 0.4716540961699622, + "grad_norm": 0.9033580422401428, + "learning_rate": 0.00016857971364707305, + "loss": 3.0418, + "step": 5206 + }, + { + "epoch": 0.47174469434440897, + "grad_norm": 0.7161144614219666, + "learning_rate": 0.00016857367244608228, + "loss": 2.2512, + "step": 5207 + }, + { + "epoch": 0.47183529251885575, + "grad_norm": 0.7700833082199097, + "learning_rate": 0.00016856763124509154, + "loss": 2.8618, + "step": 5208 + }, + { + "epoch": 0.47192589069330254, + "grad_norm": 0.7835800051689148, + "learning_rate": 0.00016856159004410078, + "loss": 2.7351, + "step": 5209 + }, + { + "epoch": 0.47201648886774933, + "grad_norm": 0.8038104772567749, + "learning_rate": 0.00016855554884311, + "loss": 2.7759, + "step": 5210 + }, + { + "epoch": 0.4721070870421961, + "grad_norm": 0.665627121925354, + "learning_rate": 0.00016854950764211925, + "loss": 2.2258, + "step": 5211 + }, + { + "epoch": 0.4721976852166429, + "grad_norm": 0.7585942149162292, + "learning_rate": 0.0001685434664411285, + "loss": 2.9848, + "step": 5212 + }, + { + "epoch": 0.4722882833910897, + "grad_norm": 0.7553530931472778, + "learning_rate": 0.00016853742524013777, + "loss": 2.3906, + "step": 5213 + }, + { + "epoch": 0.4723788815655365, + "grad_norm": 0.768183708190918, + "learning_rate": 0.00016853138403914698, + "loss": 2.8451, + "step": 5214 + }, + { + "epoch": 0.47246947973998327, + "grad_norm": 0.7397485375404358, + "learning_rate": 0.00016852534283815624, + "loss": 2.6467, + "step": 5215 + }, + { + "epoch": 0.47256007791443, + "grad_norm": 0.7794460654258728, + "learning_rate": 0.00016851930163716547, + "loss": 2.9936, + "step": 5216 + }, + { + "epoch": 0.4726506760888768, + "grad_norm": 0.7763407826423645, + "learning_rate": 0.00016851326043617473, + "loss": 2.6758, + "step": 5217 + }, + { + "epoch": 0.4727412742633236, + "grad_norm": 0.7525830864906311, + "learning_rate": 0.00016850721923518397, + "loss": 2.8392, + "step": 5218 + }, + { + "epoch": 0.47283187243777036, + "grad_norm": 0.8010318279266357, + "learning_rate": 0.0001685011780341932, + "loss": 2.776, + "step": 5219 + }, + { + "epoch": 0.47292247061221715, + "grad_norm": 0.7990502119064331, + "learning_rate": 0.00016849513683320243, + "loss": 2.9625, + "step": 5220 + }, + { + "epoch": 0.47301306878666394, + "grad_norm": 0.7917962670326233, + "learning_rate": 0.0001684890956322117, + "loss": 2.7727, + "step": 5221 + }, + { + "epoch": 0.4731036669611107, + "grad_norm": 0.8128529191017151, + "learning_rate": 0.00016848305443122096, + "loss": 2.6536, + "step": 5222 + }, + { + "epoch": 0.4731942651355575, + "grad_norm": 0.7917459607124329, + "learning_rate": 0.00016847701323023016, + "loss": 2.7066, + "step": 5223 + }, + { + "epoch": 0.4732848633100043, + "grad_norm": 0.8167937397956848, + "learning_rate": 0.00016847097202923943, + "loss": 2.8021, + "step": 5224 + }, + { + "epoch": 0.4733754614844511, + "grad_norm": 0.7886762619018555, + "learning_rate": 0.00016846493082824866, + "loss": 2.808, + "step": 5225 + }, + { + "epoch": 0.4734660596588979, + "grad_norm": 0.7690802216529846, + "learning_rate": 0.00016845888962725792, + "loss": 2.8856, + "step": 5226 + }, + { + "epoch": 0.47355665783334466, + "grad_norm": 0.7133263349533081, + "learning_rate": 0.00016845284842626713, + "loss": 2.6318, + "step": 5227 + }, + { + "epoch": 0.47364725600779145, + "grad_norm": 0.7753891348838806, + "learning_rate": 0.0001684468072252764, + "loss": 2.8247, + "step": 5228 + }, + { + "epoch": 0.47373785418223824, + "grad_norm": 0.8098220229148865, + "learning_rate": 0.00016844076602428565, + "loss": 3.0371, + "step": 5229 + }, + { + "epoch": 0.473828452356685, + "grad_norm": 0.7325533032417297, + "learning_rate": 0.00016843472482329488, + "loss": 2.8723, + "step": 5230 + }, + { + "epoch": 0.4739190505311318, + "grad_norm": 0.7415362000465393, + "learning_rate": 0.00016842868362230412, + "loss": 2.574, + "step": 5231 + }, + { + "epoch": 0.4740096487055786, + "grad_norm": 0.775285005569458, + "learning_rate": 0.00016842264242131335, + "loss": 2.6826, + "step": 5232 + }, + { + "epoch": 0.4741002468800254, + "grad_norm": 0.7562990188598633, + "learning_rate": 0.0001684166012203226, + "loss": 2.9728, + "step": 5233 + }, + { + "epoch": 0.4741908450544722, + "grad_norm": 0.6506154537200928, + "learning_rate": 0.00016841056001933185, + "loss": 1.961, + "step": 5234 + }, + { + "epoch": 0.47428144322891896, + "grad_norm": 0.7880546450614929, + "learning_rate": 0.0001684045188183411, + "loss": 2.816, + "step": 5235 + }, + { + "epoch": 0.4743720414033657, + "grad_norm": 0.8200824856758118, + "learning_rate": 0.00016839847761735034, + "loss": 2.9784, + "step": 5236 + }, + { + "epoch": 0.4744626395778125, + "grad_norm": 0.7353612184524536, + "learning_rate": 0.00016839243641635958, + "loss": 2.7352, + "step": 5237 + }, + { + "epoch": 0.47455323775225927, + "grad_norm": 0.8115986585617065, + "learning_rate": 0.00016838639521536884, + "loss": 2.9622, + "step": 5238 + }, + { + "epoch": 0.47464383592670606, + "grad_norm": 0.7366397380828857, + "learning_rate": 0.00016838035401437807, + "loss": 2.6119, + "step": 5239 + }, + { + "epoch": 0.47473443410115285, + "grad_norm": 0.7868549823760986, + "learning_rate": 0.0001683743128133873, + "loss": 3.0945, + "step": 5240 + }, + { + "epoch": 0.47482503227559963, + "grad_norm": 0.7752383947372437, + "learning_rate": 0.00016836827161239654, + "loss": 2.9045, + "step": 5241 + }, + { + "epoch": 0.4749156304500464, + "grad_norm": 0.7892537117004395, + "learning_rate": 0.0001683622304114058, + "loss": 2.6907, + "step": 5242 + }, + { + "epoch": 0.4750062286244932, + "grad_norm": 0.7652416229248047, + "learning_rate": 0.00016835618921041506, + "loss": 2.7859, + "step": 5243 + }, + { + "epoch": 0.47509682679894, + "grad_norm": 0.7339704632759094, + "learning_rate": 0.00016835014800942427, + "loss": 2.7065, + "step": 5244 + }, + { + "epoch": 0.4751874249733868, + "grad_norm": 0.7952680587768555, + "learning_rate": 0.00016834410680843353, + "loss": 2.7442, + "step": 5245 + }, + { + "epoch": 0.4752780231478336, + "grad_norm": 0.8084836006164551, + "learning_rate": 0.00016833806560744276, + "loss": 2.7824, + "step": 5246 + }, + { + "epoch": 0.47536862132228036, + "grad_norm": 0.8043961524963379, + "learning_rate": 0.00016833202440645203, + "loss": 2.7211, + "step": 5247 + }, + { + "epoch": 0.47545921949672715, + "grad_norm": 0.7545416355133057, + "learning_rate": 0.00016832598320546126, + "loss": 3.1317, + "step": 5248 + }, + { + "epoch": 0.47554981767117394, + "grad_norm": 0.7776731252670288, + "learning_rate": 0.0001683199420044705, + "loss": 2.5886, + "step": 5249 + }, + { + "epoch": 0.4756404158456207, + "grad_norm": 0.7615066766738892, + "learning_rate": 0.00016831390080347973, + "loss": 2.6949, + "step": 5250 + }, + { + "epoch": 0.4757310140200675, + "grad_norm": 0.6420223712921143, + "learning_rate": 0.000168307859602489, + "loss": 2.0903, + "step": 5251 + }, + { + "epoch": 0.4758216121945143, + "grad_norm": 0.7811489105224609, + "learning_rate": 0.00016830181840149822, + "loss": 2.6988, + "step": 5252 + }, + { + "epoch": 0.4759122103689611, + "grad_norm": 0.7462693452835083, + "learning_rate": 0.00016829577720050746, + "loss": 2.625, + "step": 5253 + }, + { + "epoch": 0.4760028085434079, + "grad_norm": 0.6549460291862488, + "learning_rate": 0.00016828973599951672, + "loss": 2.2083, + "step": 5254 + }, + { + "epoch": 0.47609340671785466, + "grad_norm": 0.7360005378723145, + "learning_rate": 0.00016828369479852595, + "loss": 2.7693, + "step": 5255 + }, + { + "epoch": 0.4761840048923014, + "grad_norm": 0.989733874797821, + "learning_rate": 0.0001682776535975352, + "loss": 2.6774, + "step": 5256 + }, + { + "epoch": 0.4762746030667482, + "grad_norm": 0.9153986573219299, + "learning_rate": 0.00016827161239654442, + "loss": 2.9887, + "step": 5257 + }, + { + "epoch": 0.47636520124119497, + "grad_norm": 0.7295084595680237, + "learning_rate": 0.00016826557119555368, + "loss": 2.6522, + "step": 5258 + }, + { + "epoch": 0.47645579941564176, + "grad_norm": 0.7270078063011169, + "learning_rate": 0.00016825952999456294, + "loss": 2.7806, + "step": 5259 + }, + { + "epoch": 0.47654639759008854, + "grad_norm": 0.7459527850151062, + "learning_rate": 0.00016825348879357218, + "loss": 2.9729, + "step": 5260 + }, + { + "epoch": 0.47663699576453533, + "grad_norm": 0.7275234460830688, + "learning_rate": 0.0001682474475925814, + "loss": 2.8464, + "step": 5261 + }, + { + "epoch": 0.4767275939389821, + "grad_norm": 0.7385685443878174, + "learning_rate": 0.00016824140639159064, + "loss": 2.7757, + "step": 5262 + }, + { + "epoch": 0.4768181921134289, + "grad_norm": 0.8057173490524292, + "learning_rate": 0.0001682353651905999, + "loss": 2.8934, + "step": 5263 + }, + { + "epoch": 0.4769087902878757, + "grad_norm": 0.8006224036216736, + "learning_rate": 0.00016822932398960914, + "loss": 2.809, + "step": 5264 + }, + { + "epoch": 0.4769993884623225, + "grad_norm": 0.7459072470664978, + "learning_rate": 0.00016822328278861837, + "loss": 2.7171, + "step": 5265 + }, + { + "epoch": 0.47708998663676927, + "grad_norm": 0.7879559397697449, + "learning_rate": 0.00016821724158762763, + "loss": 2.9485, + "step": 5266 + }, + { + "epoch": 0.47718058481121606, + "grad_norm": 0.7824838161468506, + "learning_rate": 0.00016821120038663687, + "loss": 2.7805, + "step": 5267 + }, + { + "epoch": 0.47727118298566285, + "grad_norm": 0.7532554268836975, + "learning_rate": 0.00016820515918564613, + "loss": 2.8799, + "step": 5268 + }, + { + "epoch": 0.47736178116010963, + "grad_norm": 0.7465601563453674, + "learning_rate": 0.00016819911798465536, + "loss": 2.7887, + "step": 5269 + }, + { + "epoch": 0.4774523793345564, + "grad_norm": 0.7497247457504272, + "learning_rate": 0.0001681930767836646, + "loss": 2.7475, + "step": 5270 + }, + { + "epoch": 0.4775429775090032, + "grad_norm": 0.7830521464347839, + "learning_rate": 0.00016818703558267383, + "loss": 2.8966, + "step": 5271 + }, + { + "epoch": 0.47763357568345, + "grad_norm": 0.7462111115455627, + "learning_rate": 0.0001681809943816831, + "loss": 2.8275, + "step": 5272 + }, + { + "epoch": 0.4777241738578968, + "grad_norm": 0.722071647644043, + "learning_rate": 0.00016817495318069233, + "loss": 2.8563, + "step": 5273 + }, + { + "epoch": 0.47781477203234357, + "grad_norm": 0.690037190914154, + "learning_rate": 0.00016816891197970156, + "loss": 2.3474, + "step": 5274 + }, + { + "epoch": 0.47790537020679036, + "grad_norm": 0.7435261011123657, + "learning_rate": 0.00016816287077871082, + "loss": 2.7808, + "step": 5275 + }, + { + "epoch": 0.4779959683812371, + "grad_norm": 0.7084172368049622, + "learning_rate": 0.00016815682957772006, + "loss": 2.0364, + "step": 5276 + }, + { + "epoch": 0.4780865665556839, + "grad_norm": 0.7702202796936035, + "learning_rate": 0.00016815078837672932, + "loss": 2.9106, + "step": 5277 + }, + { + "epoch": 0.47817716473013067, + "grad_norm": 0.7568758726119995, + "learning_rate": 0.00016814474717573852, + "loss": 2.6745, + "step": 5278 + }, + { + "epoch": 0.47826776290457745, + "grad_norm": 0.776443600654602, + "learning_rate": 0.00016813870597474779, + "loss": 2.8308, + "step": 5279 + }, + { + "epoch": 0.47835836107902424, + "grad_norm": 0.8102123737335205, + "learning_rate": 0.00016813266477375702, + "loss": 2.7681, + "step": 5280 + }, + { + "epoch": 0.47844895925347103, + "grad_norm": 0.7669890522956848, + "learning_rate": 0.00016812662357276628, + "loss": 3.0803, + "step": 5281 + }, + { + "epoch": 0.4785395574279178, + "grad_norm": 0.6858260035514832, + "learning_rate": 0.00016812058237177552, + "loss": 2.1824, + "step": 5282 + }, + { + "epoch": 0.4786301556023646, + "grad_norm": 0.7573320269584656, + "learning_rate": 0.00016811454117078475, + "loss": 2.5739, + "step": 5283 + }, + { + "epoch": 0.4787207537768114, + "grad_norm": 0.7429168224334717, + "learning_rate": 0.000168108499969794, + "loss": 2.7613, + "step": 5284 + }, + { + "epoch": 0.4788113519512582, + "grad_norm": 0.5418465733528137, + "learning_rate": 0.00016810245876880324, + "loss": 1.4431, + "step": 5285 + }, + { + "epoch": 0.47890195012570497, + "grad_norm": 0.734359622001648, + "learning_rate": 0.00016809641756781248, + "loss": 2.8991, + "step": 5286 + }, + { + "epoch": 0.47899254830015175, + "grad_norm": 0.8223562836647034, + "learning_rate": 0.0001680903763668217, + "loss": 2.9976, + "step": 5287 + }, + { + "epoch": 0.47908314647459854, + "grad_norm": 0.6573883891105652, + "learning_rate": 0.00016808433516583097, + "loss": 2.2767, + "step": 5288 + }, + { + "epoch": 0.47917374464904533, + "grad_norm": 0.7763997912406921, + "learning_rate": 0.00016807829396484023, + "loss": 2.825, + "step": 5289 + }, + { + "epoch": 0.4792643428234921, + "grad_norm": 0.8027622699737549, + "learning_rate": 0.00016807225276384947, + "loss": 3.0101, + "step": 5290 + }, + { + "epoch": 0.4793549409979389, + "grad_norm": 0.8179006576538086, + "learning_rate": 0.0001680662115628587, + "loss": 2.6795, + "step": 5291 + }, + { + "epoch": 0.4794455391723857, + "grad_norm": 0.7624850869178772, + "learning_rate": 0.00016806017036186794, + "loss": 3.01, + "step": 5292 + }, + { + "epoch": 0.4795361373468325, + "grad_norm": 0.7514864206314087, + "learning_rate": 0.0001680541291608772, + "loss": 2.744, + "step": 5293 + }, + { + "epoch": 0.47962673552127927, + "grad_norm": 0.7715620398521423, + "learning_rate": 0.00016804808795988643, + "loss": 2.7038, + "step": 5294 + }, + { + "epoch": 0.47971733369572606, + "grad_norm": 0.7169265151023865, + "learning_rate": 0.00016804204675889567, + "loss": 2.076, + "step": 5295 + }, + { + "epoch": 0.47980793187017284, + "grad_norm": 0.6203566193580627, + "learning_rate": 0.00016803600555790493, + "loss": 2.177, + "step": 5296 + }, + { + "epoch": 0.4798985300446196, + "grad_norm": 0.7527379989624023, + "learning_rate": 0.00016802996435691416, + "loss": 2.988, + "step": 5297 + }, + { + "epoch": 0.47998912821906636, + "grad_norm": 0.747346818447113, + "learning_rate": 0.00016802392315592342, + "loss": 2.8837, + "step": 5298 + }, + { + "epoch": 0.48007972639351315, + "grad_norm": 0.7919115424156189, + "learning_rate": 0.00016801788195493266, + "loss": 2.6609, + "step": 5299 + }, + { + "epoch": 0.48017032456795994, + "grad_norm": 0.8260490298271179, + "learning_rate": 0.0001680118407539419, + "loss": 2.7032, + "step": 5300 + }, + { + "epoch": 0.4802609227424067, + "grad_norm": 0.7567138075828552, + "learning_rate": 0.00016800579955295112, + "loss": 2.728, + "step": 5301 + }, + { + "epoch": 0.4803515209168535, + "grad_norm": 0.6673864126205444, + "learning_rate": 0.00016799975835196039, + "loss": 2.2007, + "step": 5302 + }, + { + "epoch": 0.4804421190913003, + "grad_norm": 0.7150009274482727, + "learning_rate": 0.00016799371715096962, + "loss": 2.1147, + "step": 5303 + }, + { + "epoch": 0.4805327172657471, + "grad_norm": 0.7457982301712036, + "learning_rate": 0.00016798767594997885, + "loss": 2.6724, + "step": 5304 + }, + { + "epoch": 0.4806233154401939, + "grad_norm": 0.7058779001235962, + "learning_rate": 0.00016798163474898812, + "loss": 2.5939, + "step": 5305 + }, + { + "epoch": 0.48071391361464066, + "grad_norm": 0.8325377106666565, + "learning_rate": 0.00016797559354799735, + "loss": 2.8707, + "step": 5306 + }, + { + "epoch": 0.48080451178908745, + "grad_norm": 0.7150831818580627, + "learning_rate": 0.0001679695523470066, + "loss": 2.8121, + "step": 5307 + }, + { + "epoch": 0.48089510996353424, + "grad_norm": 0.7476849555969238, + "learning_rate": 0.00016796351114601582, + "loss": 2.9517, + "step": 5308 + }, + { + "epoch": 0.480985708137981, + "grad_norm": 0.7419776916503906, + "learning_rate": 0.00016795746994502508, + "loss": 2.8022, + "step": 5309 + }, + { + "epoch": 0.4810763063124278, + "grad_norm": 0.7570226788520813, + "learning_rate": 0.0001679514287440343, + "loss": 2.76, + "step": 5310 + }, + { + "epoch": 0.4811669044868746, + "grad_norm": 0.6730136275291443, + "learning_rate": 0.00016794538754304357, + "loss": 1.9947, + "step": 5311 + }, + { + "epoch": 0.4812575026613214, + "grad_norm": 0.7773056626319885, + "learning_rate": 0.0001679393463420528, + "loss": 2.8859, + "step": 5312 + }, + { + "epoch": 0.4813481008357682, + "grad_norm": 0.7541865706443787, + "learning_rate": 0.00016793330514106204, + "loss": 2.565, + "step": 5313 + }, + { + "epoch": 0.48143869901021497, + "grad_norm": 0.7680559158325195, + "learning_rate": 0.0001679272639400713, + "loss": 2.9931, + "step": 5314 + }, + { + "epoch": 0.48152929718466175, + "grad_norm": 0.6429498195648193, + "learning_rate": 0.00016792122273908054, + "loss": 1.9159, + "step": 5315 + }, + { + "epoch": 0.48161989535910854, + "grad_norm": 0.7840405702590942, + "learning_rate": 0.00016791518153808977, + "loss": 2.8059, + "step": 5316 + }, + { + "epoch": 0.4817104935335553, + "grad_norm": 0.7458990216255188, + "learning_rate": 0.000167909140337099, + "loss": 2.8279, + "step": 5317 + }, + { + "epoch": 0.48180109170800206, + "grad_norm": 0.7174658179283142, + "learning_rate": 0.00016790309913610827, + "loss": 2.7366, + "step": 5318 + }, + { + "epoch": 0.48189168988244885, + "grad_norm": 0.7912449836730957, + "learning_rate": 0.00016789705793511753, + "loss": 2.6802, + "step": 5319 + }, + { + "epoch": 0.48198228805689564, + "grad_norm": 0.7776302099227905, + "learning_rate": 0.00016789101673412676, + "loss": 2.772, + "step": 5320 + }, + { + "epoch": 0.4820728862313424, + "grad_norm": 0.8202427625656128, + "learning_rate": 0.000167884975533136, + "loss": 3.0435, + "step": 5321 + }, + { + "epoch": 0.4821634844057892, + "grad_norm": 0.7255066633224487, + "learning_rate": 0.00016787893433214523, + "loss": 2.78, + "step": 5322 + }, + { + "epoch": 0.482254082580236, + "grad_norm": 0.7869598269462585, + "learning_rate": 0.0001678728931311545, + "loss": 3.0364, + "step": 5323 + }, + { + "epoch": 0.4823446807546828, + "grad_norm": 0.785122811794281, + "learning_rate": 0.00016786685193016372, + "loss": 2.8401, + "step": 5324 + }, + { + "epoch": 0.4824352789291296, + "grad_norm": 0.7620280385017395, + "learning_rate": 0.00016786081072917296, + "loss": 2.824, + "step": 5325 + }, + { + "epoch": 0.48252587710357636, + "grad_norm": 0.6588999629020691, + "learning_rate": 0.00016785476952818222, + "loss": 2.136, + "step": 5326 + }, + { + "epoch": 0.48261647527802315, + "grad_norm": 0.777341365814209, + "learning_rate": 0.00016784872832719145, + "loss": 3.0475, + "step": 5327 + }, + { + "epoch": 0.48270707345246994, + "grad_norm": 0.8056312799453735, + "learning_rate": 0.00016784268712620072, + "loss": 2.6953, + "step": 5328 + }, + { + "epoch": 0.4827976716269167, + "grad_norm": 0.7633291482925415, + "learning_rate": 0.00016783664592520992, + "loss": 2.874, + "step": 5329 + }, + { + "epoch": 0.4828882698013635, + "grad_norm": 0.6528100371360779, + "learning_rate": 0.00016783060472421918, + "loss": 2.0542, + "step": 5330 + }, + { + "epoch": 0.4829788679758103, + "grad_norm": 0.7644476294517517, + "learning_rate": 0.00016782456352322842, + "loss": 2.5553, + "step": 5331 + }, + { + "epoch": 0.4830694661502571, + "grad_norm": 0.739971935749054, + "learning_rate": 0.00016781852232223768, + "loss": 2.9762, + "step": 5332 + }, + { + "epoch": 0.4831600643247039, + "grad_norm": 0.7578445076942444, + "learning_rate": 0.0001678124811212469, + "loss": 2.7663, + "step": 5333 + }, + { + "epoch": 0.48325066249915066, + "grad_norm": 0.7738564610481262, + "learning_rate": 0.00016780643992025615, + "loss": 2.7752, + "step": 5334 + }, + { + "epoch": 0.48334126067359745, + "grad_norm": 0.8855776786804199, + "learning_rate": 0.0001678003987192654, + "loss": 2.872, + "step": 5335 + }, + { + "epoch": 0.48343185884804424, + "grad_norm": 0.7578370571136475, + "learning_rate": 0.00016779435751827464, + "loss": 2.8346, + "step": 5336 + }, + { + "epoch": 0.48352245702249097, + "grad_norm": 0.7363061904907227, + "learning_rate": 0.00016778831631728388, + "loss": 2.8034, + "step": 5337 + }, + { + "epoch": 0.48361305519693776, + "grad_norm": 0.8635277152061462, + "learning_rate": 0.0001677822751162931, + "loss": 2.8785, + "step": 5338 + }, + { + "epoch": 0.48370365337138455, + "grad_norm": 0.7596970796585083, + "learning_rate": 0.00016777623391530237, + "loss": 2.8705, + "step": 5339 + }, + { + "epoch": 0.48379425154583133, + "grad_norm": 0.8351067304611206, + "learning_rate": 0.0001677701927143116, + "loss": 3.1447, + "step": 5340 + }, + { + "epoch": 0.4838848497202781, + "grad_norm": 0.8702050447463989, + "learning_rate": 0.00016776415151332087, + "loss": 2.7393, + "step": 5341 + }, + { + "epoch": 0.4839754478947249, + "grad_norm": 0.7326213121414185, + "learning_rate": 0.0001677581103123301, + "loss": 2.8418, + "step": 5342 + }, + { + "epoch": 0.4840660460691717, + "grad_norm": 0.7964458465576172, + "learning_rate": 0.00016775206911133933, + "loss": 2.7356, + "step": 5343 + }, + { + "epoch": 0.4841566442436185, + "grad_norm": 0.7734220623970032, + "learning_rate": 0.0001677460279103486, + "loss": 2.9408, + "step": 5344 + }, + { + "epoch": 0.48424724241806527, + "grad_norm": 0.7455647587776184, + "learning_rate": 0.00016773998670935783, + "loss": 2.7188, + "step": 5345 + }, + { + "epoch": 0.48433784059251206, + "grad_norm": 0.7800576686859131, + "learning_rate": 0.00016773394550836706, + "loss": 2.7575, + "step": 5346 + }, + { + "epoch": 0.48442843876695885, + "grad_norm": 0.7805792093276978, + "learning_rate": 0.0001677279043073763, + "loss": 2.6544, + "step": 5347 + }, + { + "epoch": 0.48451903694140563, + "grad_norm": 0.8167234659194946, + "learning_rate": 0.00016772186310638556, + "loss": 2.8336, + "step": 5348 + }, + { + "epoch": 0.4846096351158524, + "grad_norm": 0.7602725625038147, + "learning_rate": 0.00016771582190539482, + "loss": 2.6827, + "step": 5349 + }, + { + "epoch": 0.4847002332902992, + "grad_norm": 0.8358849287033081, + "learning_rate": 0.00016770978070440403, + "loss": 3.3113, + "step": 5350 + }, + { + "epoch": 0.484790831464746, + "grad_norm": 0.748202919960022, + "learning_rate": 0.0001677037395034133, + "loss": 2.9851, + "step": 5351 + }, + { + "epoch": 0.4848814296391928, + "grad_norm": 0.8003296256065369, + "learning_rate": 0.00016769769830242252, + "loss": 2.9743, + "step": 5352 + }, + { + "epoch": 0.4849720278136396, + "grad_norm": 0.768934965133667, + "learning_rate": 0.00016769165710143178, + "loss": 2.8035, + "step": 5353 + }, + { + "epoch": 0.48506262598808636, + "grad_norm": 0.7189128994941711, + "learning_rate": 0.00016768561590044102, + "loss": 1.9599, + "step": 5354 + }, + { + "epoch": 0.48515322416253315, + "grad_norm": 0.7923698425292969, + "learning_rate": 0.00016767957469945025, + "loss": 2.8277, + "step": 5355 + }, + { + "epoch": 0.48524382233697994, + "grad_norm": 0.8256731033325195, + "learning_rate": 0.0001676735334984595, + "loss": 2.7995, + "step": 5356 + }, + { + "epoch": 0.48533442051142667, + "grad_norm": 0.824791669845581, + "learning_rate": 0.00016766749229746875, + "loss": 2.851, + "step": 5357 + }, + { + "epoch": 0.48542501868587346, + "grad_norm": 0.7614158391952515, + "learning_rate": 0.000167661451096478, + "loss": 2.9405, + "step": 5358 + }, + { + "epoch": 0.48551561686032024, + "grad_norm": 0.7184038758277893, + "learning_rate": 0.00016765540989548722, + "loss": 2.6741, + "step": 5359 + }, + { + "epoch": 0.48560621503476703, + "grad_norm": 0.694442868232727, + "learning_rate": 0.00016764936869449648, + "loss": 2.0772, + "step": 5360 + }, + { + "epoch": 0.4856968132092138, + "grad_norm": 0.7317333221435547, + "learning_rate": 0.0001676433274935057, + "loss": 2.7777, + "step": 5361 + }, + { + "epoch": 0.4857874113836606, + "grad_norm": 0.6985698342323303, + "learning_rate": 0.00016763728629251497, + "loss": 2.6483, + "step": 5362 + }, + { + "epoch": 0.4858780095581074, + "grad_norm": 0.8094331622123718, + "learning_rate": 0.0001676312450915242, + "loss": 2.839, + "step": 5363 + }, + { + "epoch": 0.4859686077325542, + "grad_norm": 0.7877212166786194, + "learning_rate": 0.00016762520389053344, + "loss": 2.8993, + "step": 5364 + }, + { + "epoch": 0.48605920590700097, + "grad_norm": 0.7311065793037415, + "learning_rate": 0.0001676191626895427, + "loss": 2.6299, + "step": 5365 + }, + { + "epoch": 0.48614980408144776, + "grad_norm": 0.7864188551902771, + "learning_rate": 0.00016761312148855193, + "loss": 3.0479, + "step": 5366 + }, + { + "epoch": 0.48624040225589454, + "grad_norm": 0.7447869777679443, + "learning_rate": 0.00016760708028756117, + "loss": 3.1841, + "step": 5367 + }, + { + "epoch": 0.48633100043034133, + "grad_norm": 0.8247730135917664, + "learning_rate": 0.0001676010390865704, + "loss": 2.8755, + "step": 5368 + }, + { + "epoch": 0.4864215986047881, + "grad_norm": 0.6882030963897705, + "learning_rate": 0.00016759499788557966, + "loss": 2.2763, + "step": 5369 + }, + { + "epoch": 0.4865121967792349, + "grad_norm": 0.6561381220817566, + "learning_rate": 0.0001675889566845889, + "loss": 2.315, + "step": 5370 + }, + { + "epoch": 0.4866027949536817, + "grad_norm": 0.8564037084579468, + "learning_rate": 0.00016758291548359816, + "loss": 2.6198, + "step": 5371 + }, + { + "epoch": 0.4866933931281285, + "grad_norm": 0.6644406318664551, + "learning_rate": 0.0001675768742826074, + "loss": 2.1518, + "step": 5372 + }, + { + "epoch": 0.48678399130257527, + "grad_norm": 0.794067919254303, + "learning_rate": 0.00016757083308161663, + "loss": 2.8407, + "step": 5373 + }, + { + "epoch": 0.48687458947702206, + "grad_norm": 0.7348027229309082, + "learning_rate": 0.0001675647918806259, + "loss": 2.7626, + "step": 5374 + }, + { + "epoch": 0.48696518765146884, + "grad_norm": 0.7617695927619934, + "learning_rate": 0.00016755875067963512, + "loss": 2.6567, + "step": 5375 + }, + { + "epoch": 0.48705578582591563, + "grad_norm": 0.8093473315238953, + "learning_rate": 0.00016755270947864436, + "loss": 2.9659, + "step": 5376 + }, + { + "epoch": 0.4871463840003624, + "grad_norm": 0.7420236468315125, + "learning_rate": 0.0001675466682776536, + "loss": 2.7864, + "step": 5377 + }, + { + "epoch": 0.48723698217480915, + "grad_norm": 0.7461451292037964, + "learning_rate": 0.00016754062707666285, + "loss": 2.8532, + "step": 5378 + }, + { + "epoch": 0.48732758034925594, + "grad_norm": 0.7888150215148926, + "learning_rate": 0.0001675345858756721, + "loss": 2.8488, + "step": 5379 + }, + { + "epoch": 0.4874181785237027, + "grad_norm": 0.7869457006454468, + "learning_rate": 0.00016752854467468132, + "loss": 2.7189, + "step": 5380 + }, + { + "epoch": 0.4875087766981495, + "grad_norm": 0.769819438457489, + "learning_rate": 0.00016752250347369058, + "loss": 2.7867, + "step": 5381 + }, + { + "epoch": 0.4875993748725963, + "grad_norm": 0.7493003606796265, + "learning_rate": 0.00016751646227269982, + "loss": 2.6426, + "step": 5382 + }, + { + "epoch": 0.4876899730470431, + "grad_norm": 0.6982594728469849, + "learning_rate": 0.00016751042107170908, + "loss": 2.0365, + "step": 5383 + }, + { + "epoch": 0.4877805712214899, + "grad_norm": 0.7105021476745605, + "learning_rate": 0.0001675043798707183, + "loss": 2.6728, + "step": 5384 + }, + { + "epoch": 0.48787116939593667, + "grad_norm": 0.8306036591529846, + "learning_rate": 0.00016749833866972754, + "loss": 2.7375, + "step": 5385 + }, + { + "epoch": 0.48796176757038345, + "grad_norm": 0.7910940051078796, + "learning_rate": 0.0001674922974687368, + "loss": 2.7297, + "step": 5386 + }, + { + "epoch": 0.48805236574483024, + "grad_norm": 0.8233325481414795, + "learning_rate": 0.00016748625626774604, + "loss": 2.8399, + "step": 5387 + }, + { + "epoch": 0.48814296391927703, + "grad_norm": 0.7748681306838989, + "learning_rate": 0.00016748021506675527, + "loss": 2.6304, + "step": 5388 + }, + { + "epoch": 0.4882335620937238, + "grad_norm": 0.7908282279968262, + "learning_rate": 0.0001674741738657645, + "loss": 3.0426, + "step": 5389 + }, + { + "epoch": 0.4883241602681706, + "grad_norm": 0.7761780619621277, + "learning_rate": 0.00016746813266477377, + "loss": 2.7867, + "step": 5390 + }, + { + "epoch": 0.4884147584426174, + "grad_norm": 0.7997323870658875, + "learning_rate": 0.000167462091463783, + "loss": 2.7391, + "step": 5391 + }, + { + "epoch": 0.4885053566170642, + "grad_norm": 0.7594248056411743, + "learning_rate": 0.00016745605026279226, + "loss": 2.7986, + "step": 5392 + }, + { + "epoch": 0.48859595479151097, + "grad_norm": 0.7734372615814209, + "learning_rate": 0.0001674500090618015, + "loss": 2.5594, + "step": 5393 + }, + { + "epoch": 0.48868655296595775, + "grad_norm": 0.7769542336463928, + "learning_rate": 0.00016744396786081073, + "loss": 2.1305, + "step": 5394 + }, + { + "epoch": 0.48877715114040454, + "grad_norm": 0.7588053345680237, + "learning_rate": 0.00016743792665982, + "loss": 2.5477, + "step": 5395 + }, + { + "epoch": 0.48886774931485133, + "grad_norm": 0.7890965938568115, + "learning_rate": 0.00016743188545882923, + "loss": 2.533, + "step": 5396 + }, + { + "epoch": 0.4889583474892981, + "grad_norm": 0.7553604245185852, + "learning_rate": 0.00016742584425783846, + "loss": 2.7955, + "step": 5397 + }, + { + "epoch": 0.48904894566374485, + "grad_norm": 0.7218215465545654, + "learning_rate": 0.0001674198030568477, + "loss": 2.4209, + "step": 5398 + }, + { + "epoch": 0.48913954383819164, + "grad_norm": 0.7924960255622864, + "learning_rate": 0.00016741376185585696, + "loss": 2.6764, + "step": 5399 + }, + { + "epoch": 0.4892301420126384, + "grad_norm": 0.815432608127594, + "learning_rate": 0.0001674077206548662, + "loss": 2.9996, + "step": 5400 + }, + { + "epoch": 0.4893207401870852, + "grad_norm": 0.7805432677268982, + "learning_rate": 0.00016740167945387542, + "loss": 3.0133, + "step": 5401 + }, + { + "epoch": 0.489411338361532, + "grad_norm": 0.8187296986579895, + "learning_rate": 0.00016739563825288469, + "loss": 2.8629, + "step": 5402 + }, + { + "epoch": 0.4895019365359788, + "grad_norm": 0.8058576583862305, + "learning_rate": 0.00016738959705189392, + "loss": 2.9258, + "step": 5403 + }, + { + "epoch": 0.4895925347104256, + "grad_norm": 0.752209484577179, + "learning_rate": 0.00016738355585090318, + "loss": 2.1645, + "step": 5404 + }, + { + "epoch": 0.48968313288487236, + "grad_norm": 0.7859792113304138, + "learning_rate": 0.00016737751464991242, + "loss": 2.7309, + "step": 5405 + }, + { + "epoch": 0.48977373105931915, + "grad_norm": 0.7800381779670715, + "learning_rate": 0.00016737147344892165, + "loss": 3.0983, + "step": 5406 + }, + { + "epoch": 0.48986432923376594, + "grad_norm": 1.080209732055664, + "learning_rate": 0.00016736543224793088, + "loss": 2.8233, + "step": 5407 + }, + { + "epoch": 0.4899549274082127, + "grad_norm": 0.7671409845352173, + "learning_rate": 0.00016735939104694014, + "loss": 2.7349, + "step": 5408 + }, + { + "epoch": 0.4900455255826595, + "grad_norm": 0.8111056685447693, + "learning_rate": 0.0001673533498459494, + "loss": 3.0996, + "step": 5409 + }, + { + "epoch": 0.4901361237571063, + "grad_norm": 0.8127857446670532, + "learning_rate": 0.0001673473086449586, + "loss": 2.6063, + "step": 5410 + }, + { + "epoch": 0.4902267219315531, + "grad_norm": 0.75286865234375, + "learning_rate": 0.00016734126744396787, + "loss": 2.94, + "step": 5411 + }, + { + "epoch": 0.4903173201059999, + "grad_norm": 0.7859688997268677, + "learning_rate": 0.0001673352262429771, + "loss": 2.9885, + "step": 5412 + }, + { + "epoch": 0.49040791828044666, + "grad_norm": 0.7176862955093384, + "learning_rate": 0.00016732918504198637, + "loss": 2.7853, + "step": 5413 + }, + { + "epoch": 0.49049851645489345, + "grad_norm": 0.7884936928749084, + "learning_rate": 0.00016732314384099558, + "loss": 2.962, + "step": 5414 + }, + { + "epoch": 0.49058911462934024, + "grad_norm": 0.7349588871002197, + "learning_rate": 0.00016731710264000484, + "loss": 2.6374, + "step": 5415 + }, + { + "epoch": 0.490679712803787, + "grad_norm": 0.735816478729248, + "learning_rate": 0.0001673110614390141, + "loss": 2.8272, + "step": 5416 + }, + { + "epoch": 0.4907703109782338, + "grad_norm": 0.8261759877204895, + "learning_rate": 0.00016730502023802333, + "loss": 2.8716, + "step": 5417 + }, + { + "epoch": 0.49086090915268055, + "grad_norm": 0.761186957359314, + "learning_rate": 0.00016729897903703257, + "loss": 2.6975, + "step": 5418 + }, + { + "epoch": 0.49095150732712733, + "grad_norm": 0.6805592775344849, + "learning_rate": 0.0001672929378360418, + "loss": 2.5464, + "step": 5419 + }, + { + "epoch": 0.4910421055015741, + "grad_norm": 0.7448706030845642, + "learning_rate": 0.00016728689663505106, + "loss": 2.9114, + "step": 5420 + }, + { + "epoch": 0.4911327036760209, + "grad_norm": 0.8335459232330322, + "learning_rate": 0.0001672808554340603, + "loss": 2.7312, + "step": 5421 + }, + { + "epoch": 0.4912233018504677, + "grad_norm": 0.7607126235961914, + "learning_rate": 0.00016727481423306956, + "loss": 2.8935, + "step": 5422 + }, + { + "epoch": 0.4913139000249145, + "grad_norm": 0.8172277212142944, + "learning_rate": 0.0001672687730320788, + "loss": 2.902, + "step": 5423 + }, + { + "epoch": 0.4914044981993613, + "grad_norm": 0.8398140072822571, + "learning_rate": 0.00016726273183108802, + "loss": 3.0379, + "step": 5424 + }, + { + "epoch": 0.49149509637380806, + "grad_norm": 0.7425488233566284, + "learning_rate": 0.00016725669063009729, + "loss": 2.7074, + "step": 5425 + }, + { + "epoch": 0.49158569454825485, + "grad_norm": 0.7993810772895813, + "learning_rate": 0.00016725064942910652, + "loss": 2.9121, + "step": 5426 + }, + { + "epoch": 0.49167629272270164, + "grad_norm": 0.7821193337440491, + "learning_rate": 0.00016724460822811575, + "loss": 3.0911, + "step": 5427 + }, + { + "epoch": 0.4917668908971484, + "grad_norm": 0.7917747497558594, + "learning_rate": 0.000167238567027125, + "loss": 2.6187, + "step": 5428 + }, + { + "epoch": 0.4918574890715952, + "grad_norm": 0.7077671885490417, + "learning_rate": 0.00016723252582613425, + "loss": 2.696, + "step": 5429 + }, + { + "epoch": 0.491948087246042, + "grad_norm": 0.7889499664306641, + "learning_rate": 0.00016722648462514348, + "loss": 2.1919, + "step": 5430 + }, + { + "epoch": 0.4920386854204888, + "grad_norm": 0.7869011759757996, + "learning_rate": 0.00016722044342415272, + "loss": 2.769, + "step": 5431 + }, + { + "epoch": 0.4921292835949356, + "grad_norm": 0.7717429399490356, + "learning_rate": 0.00016721440222316198, + "loss": 3.4117, + "step": 5432 + }, + { + "epoch": 0.49221988176938236, + "grad_norm": 0.9020236134529114, + "learning_rate": 0.0001672083610221712, + "loss": 3.1751, + "step": 5433 + }, + { + "epoch": 0.49231047994382915, + "grad_norm": 0.6743930578231812, + "learning_rate": 0.00016720231982118047, + "loss": 1.95, + "step": 5434 + }, + { + "epoch": 0.49240107811827594, + "grad_norm": 0.7822391986846924, + "learning_rate": 0.0001671962786201897, + "loss": 2.7362, + "step": 5435 + }, + { + "epoch": 0.4924916762927227, + "grad_norm": 0.7344678044319153, + "learning_rate": 0.00016719023741919894, + "loss": 2.6803, + "step": 5436 + }, + { + "epoch": 0.4925822744671695, + "grad_norm": 0.7562480568885803, + "learning_rate": 0.00016718419621820818, + "loss": 2.931, + "step": 5437 + }, + { + "epoch": 0.49267287264161624, + "grad_norm": 0.8281113505363464, + "learning_rate": 0.00016717815501721744, + "loss": 2.7342, + "step": 5438 + }, + { + "epoch": 0.49276347081606303, + "grad_norm": 0.8264453411102295, + "learning_rate": 0.00016717211381622667, + "loss": 3.1913, + "step": 5439 + }, + { + "epoch": 0.4928540689905098, + "grad_norm": 0.8070917129516602, + "learning_rate": 0.0001671660726152359, + "loss": 2.9998, + "step": 5440 + }, + { + "epoch": 0.4929446671649566, + "grad_norm": 0.7394355535507202, + "learning_rate": 0.00016716003141424517, + "loss": 2.7214, + "step": 5441 + }, + { + "epoch": 0.4930352653394034, + "grad_norm": 0.7219634056091309, + "learning_rate": 0.0001671539902132544, + "loss": 2.6079, + "step": 5442 + }, + { + "epoch": 0.4931258635138502, + "grad_norm": 0.7246356010437012, + "learning_rate": 0.00016714794901226366, + "loss": 2.8946, + "step": 5443 + }, + { + "epoch": 0.49321646168829697, + "grad_norm": 0.7554160356521606, + "learning_rate": 0.00016714190781127287, + "loss": 2.9248, + "step": 5444 + }, + { + "epoch": 0.49330705986274376, + "grad_norm": 0.7433953285217285, + "learning_rate": 0.00016713586661028213, + "loss": 2.7704, + "step": 5445 + }, + { + "epoch": 0.49339765803719055, + "grad_norm": 0.8752137422561646, + "learning_rate": 0.0001671298254092914, + "loss": 2.6424, + "step": 5446 + }, + { + "epoch": 0.49348825621163733, + "grad_norm": 0.7365843057632446, + "learning_rate": 0.00016712378420830062, + "loss": 2.8551, + "step": 5447 + }, + { + "epoch": 0.4935788543860841, + "grad_norm": 0.7550005316734314, + "learning_rate": 0.00016711774300730986, + "loss": 2.8514, + "step": 5448 + }, + { + "epoch": 0.4936694525605309, + "grad_norm": 0.7392608523368835, + "learning_rate": 0.0001671117018063191, + "loss": 2.5832, + "step": 5449 + }, + { + "epoch": 0.4937600507349777, + "grad_norm": 0.6806595921516418, + "learning_rate": 0.00016710566060532835, + "loss": 2.3728, + "step": 5450 + }, + { + "epoch": 0.4938506489094245, + "grad_norm": 0.8622460961341858, + "learning_rate": 0.0001670996194043376, + "loss": 2.8936, + "step": 5451 + }, + { + "epoch": 0.49394124708387127, + "grad_norm": 0.7443419098854065, + "learning_rate": 0.00016709357820334682, + "loss": 2.8296, + "step": 5452 + }, + { + "epoch": 0.49403184525831806, + "grad_norm": 0.7538564205169678, + "learning_rate": 0.00016708753700235608, + "loss": 2.943, + "step": 5453 + }, + { + "epoch": 0.49412244343276485, + "grad_norm": 0.7694092988967896, + "learning_rate": 0.00016708149580136532, + "loss": 2.7614, + "step": 5454 + }, + { + "epoch": 0.49421304160721163, + "grad_norm": 0.7339399456977844, + "learning_rate": 0.00016707545460037458, + "loss": 2.6399, + "step": 5455 + }, + { + "epoch": 0.4943036397816584, + "grad_norm": 0.7857187390327454, + "learning_rate": 0.0001670694133993838, + "loss": 3.0004, + "step": 5456 + }, + { + "epoch": 0.4943942379561052, + "grad_norm": 0.7544234395027161, + "learning_rate": 0.00016706337219839305, + "loss": 2.638, + "step": 5457 + }, + { + "epoch": 0.494484836130552, + "grad_norm": 0.7559523582458496, + "learning_rate": 0.00016705733099740228, + "loss": 2.7131, + "step": 5458 + }, + { + "epoch": 0.49457543430499873, + "grad_norm": 0.8187568783760071, + "learning_rate": 0.00016705128979641154, + "loss": 3.0591, + "step": 5459 + }, + { + "epoch": 0.4946660324794455, + "grad_norm": 0.7585282921791077, + "learning_rate": 0.00016704524859542078, + "loss": 2.7509, + "step": 5460 + }, + { + "epoch": 0.4947566306538923, + "grad_norm": 0.7666134834289551, + "learning_rate": 0.00016703920739443, + "loss": 2.8803, + "step": 5461 + }, + { + "epoch": 0.4948472288283391, + "grad_norm": 0.7621480822563171, + "learning_rate": 0.00016703316619343927, + "loss": 2.6972, + "step": 5462 + }, + { + "epoch": 0.4949378270027859, + "grad_norm": 0.7382242679595947, + "learning_rate": 0.0001670271249924485, + "loss": 2.9088, + "step": 5463 + }, + { + "epoch": 0.49502842517723267, + "grad_norm": 0.7141303420066833, + "learning_rate": 0.00016702108379145777, + "loss": 2.5338, + "step": 5464 + }, + { + "epoch": 0.49511902335167945, + "grad_norm": 0.7536666393280029, + "learning_rate": 0.00016701504259046697, + "loss": 2.8397, + "step": 5465 + }, + { + "epoch": 0.49520962152612624, + "grad_norm": 0.7431158423423767, + "learning_rate": 0.00016700900138947623, + "loss": 2.7018, + "step": 5466 + }, + { + "epoch": 0.49530021970057303, + "grad_norm": 0.6794232726097107, + "learning_rate": 0.00016700296018848547, + "loss": 2.0967, + "step": 5467 + }, + { + "epoch": 0.4953908178750198, + "grad_norm": 0.7819642424583435, + "learning_rate": 0.00016699691898749473, + "loss": 3.0809, + "step": 5468 + }, + { + "epoch": 0.4954814160494666, + "grad_norm": 0.787426769733429, + "learning_rate": 0.00016699087778650396, + "loss": 2.6854, + "step": 5469 + }, + { + "epoch": 0.4955720142239134, + "grad_norm": 0.798134446144104, + "learning_rate": 0.0001669848365855132, + "loss": 2.9871, + "step": 5470 + }, + { + "epoch": 0.4956626123983602, + "grad_norm": 0.8160821795463562, + "learning_rate": 0.00016697879538452246, + "loss": 2.8533, + "step": 5471 + }, + { + "epoch": 0.49575321057280697, + "grad_norm": 0.7633422613143921, + "learning_rate": 0.0001669727541835317, + "loss": 2.9159, + "step": 5472 + }, + { + "epoch": 0.49584380874725376, + "grad_norm": 0.7239334583282471, + "learning_rate": 0.00016696671298254093, + "loss": 2.6564, + "step": 5473 + }, + { + "epoch": 0.49593440692170054, + "grad_norm": 0.7744816541671753, + "learning_rate": 0.00016696067178155016, + "loss": 2.7131, + "step": 5474 + }, + { + "epoch": 0.49602500509614733, + "grad_norm": 0.791863203048706, + "learning_rate": 0.00016695463058055942, + "loss": 2.7234, + "step": 5475 + }, + { + "epoch": 0.4961156032705941, + "grad_norm": 0.7213311791419983, + "learning_rate": 0.00016694858937956868, + "loss": 2.8321, + "step": 5476 + }, + { + "epoch": 0.4962062014450409, + "grad_norm": 0.7311343550682068, + "learning_rate": 0.00016694254817857792, + "loss": 2.8673, + "step": 5477 + }, + { + "epoch": 0.4962967996194877, + "grad_norm": 0.7829902172088623, + "learning_rate": 0.00016693650697758715, + "loss": 2.7481, + "step": 5478 + }, + { + "epoch": 0.4963873977939344, + "grad_norm": 0.7710520625114441, + "learning_rate": 0.00016693046577659639, + "loss": 2.2263, + "step": 5479 + }, + { + "epoch": 0.4964779959683812, + "grad_norm": 0.7335677146911621, + "learning_rate": 0.00016692442457560565, + "loss": 2.8276, + "step": 5480 + }, + { + "epoch": 0.496568594142828, + "grad_norm": 0.7686776518821716, + "learning_rate": 0.00016691838337461488, + "loss": 2.7758, + "step": 5481 + }, + { + "epoch": 0.4966591923172748, + "grad_norm": 0.7477664351463318, + "learning_rate": 0.00016691234217362411, + "loss": 2.6383, + "step": 5482 + }, + { + "epoch": 0.4967497904917216, + "grad_norm": 0.7717370986938477, + "learning_rate": 0.00016690630097263338, + "loss": 2.7348, + "step": 5483 + }, + { + "epoch": 0.49684038866616836, + "grad_norm": 0.7307156324386597, + "learning_rate": 0.0001669002597716426, + "loss": 2.6311, + "step": 5484 + }, + { + "epoch": 0.49693098684061515, + "grad_norm": 0.73916095495224, + "learning_rate": 0.00016689421857065187, + "loss": 2.7254, + "step": 5485 + }, + { + "epoch": 0.49702158501506194, + "grad_norm": 0.8185293078422546, + "learning_rate": 0.00016688817736966108, + "loss": 2.9329, + "step": 5486 + }, + { + "epoch": 0.4971121831895087, + "grad_norm": 0.7787230014801025, + "learning_rate": 0.00016688213616867034, + "loss": 2.6541, + "step": 5487 + }, + { + "epoch": 0.4972027813639555, + "grad_norm": 0.7948181629180908, + "learning_rate": 0.00016687609496767957, + "loss": 2.8648, + "step": 5488 + }, + { + "epoch": 0.4972933795384023, + "grad_norm": 0.6358432173728943, + "learning_rate": 0.00016687005376668883, + "loss": 1.9508, + "step": 5489 + }, + { + "epoch": 0.4973839777128491, + "grad_norm": 0.734417200088501, + "learning_rate": 0.00016686401256569807, + "loss": 2.3001, + "step": 5490 + }, + { + "epoch": 0.4974745758872959, + "grad_norm": 0.7768407464027405, + "learning_rate": 0.0001668579713647073, + "loss": 2.8268, + "step": 5491 + }, + { + "epoch": 0.49756517406174267, + "grad_norm": 0.8950443863868713, + "learning_rate": 0.00016685193016371656, + "loss": 2.9603, + "step": 5492 + }, + { + "epoch": 0.49765577223618945, + "grad_norm": 0.7674424648284912, + "learning_rate": 0.0001668458889627258, + "loss": 2.9696, + "step": 5493 + }, + { + "epoch": 0.49774637041063624, + "grad_norm": 0.8085172772407532, + "learning_rate": 0.00016683984776173506, + "loss": 2.9545, + "step": 5494 + }, + { + "epoch": 0.49783696858508303, + "grad_norm": 0.7673991322517395, + "learning_rate": 0.00016683380656074427, + "loss": 2.7298, + "step": 5495 + }, + { + "epoch": 0.4979275667595298, + "grad_norm": 0.8113872408866882, + "learning_rate": 0.00016682776535975353, + "loss": 2.8638, + "step": 5496 + }, + { + "epoch": 0.4980181649339766, + "grad_norm": 0.7918789982795715, + "learning_rate": 0.00016682172415876276, + "loss": 2.613, + "step": 5497 + }, + { + "epoch": 0.4981087631084234, + "grad_norm": 0.8121913075447083, + "learning_rate": 0.00016681568295777202, + "loss": 2.7095, + "step": 5498 + }, + { + "epoch": 0.4981993612828701, + "grad_norm": 0.7398341298103333, + "learning_rate": 0.00016680964175678126, + "loss": 2.8715, + "step": 5499 + }, + { + "epoch": 0.4982899594573169, + "grad_norm": 0.7521448135375977, + "learning_rate": 0.0001668036005557905, + "loss": 2.7953, + "step": 5500 + }, + { + "epoch": 0.4983805576317637, + "grad_norm": 0.6573072075843811, + "learning_rate": 0.00016679755935479975, + "loss": 2.302, + "step": 5501 + }, + { + "epoch": 0.4984711558062105, + "grad_norm": 0.7973517179489136, + "learning_rate": 0.00016679151815380899, + "loss": 2.8234, + "step": 5502 + }, + { + "epoch": 0.4985617539806573, + "grad_norm": 0.754153847694397, + "learning_rate": 0.00016678547695281822, + "loss": 2.7378, + "step": 5503 + }, + { + "epoch": 0.49865235215510406, + "grad_norm": 0.7374809980392456, + "learning_rate": 0.00016677943575182745, + "loss": 2.0667, + "step": 5504 + }, + { + "epoch": 0.49874295032955085, + "grad_norm": 0.6569148898124695, + "learning_rate": 0.00016677339455083671, + "loss": 2.1629, + "step": 5505 + }, + { + "epoch": 0.49883354850399764, + "grad_norm": 0.6821491122245789, + "learning_rate": 0.00016676735334984598, + "loss": 2.1081, + "step": 5506 + }, + { + "epoch": 0.4989241466784444, + "grad_norm": 0.807829737663269, + "learning_rate": 0.0001667613121488552, + "loss": 3.0665, + "step": 5507 + }, + { + "epoch": 0.4990147448528912, + "grad_norm": 0.7262287139892578, + "learning_rate": 0.00016675527094786444, + "loss": 2.6613, + "step": 5508 + }, + { + "epoch": 0.499105343027338, + "grad_norm": 0.7749921083450317, + "learning_rate": 0.00016674922974687368, + "loss": 2.9529, + "step": 5509 + }, + { + "epoch": 0.4991959412017848, + "grad_norm": 0.7398866415023804, + "learning_rate": 0.00016674318854588294, + "loss": 2.8278, + "step": 5510 + }, + { + "epoch": 0.4992865393762316, + "grad_norm": 0.7638974189758301, + "learning_rate": 0.00016673714734489217, + "loss": 2.7088, + "step": 5511 + }, + { + "epoch": 0.49937713755067836, + "grad_norm": 0.7721091508865356, + "learning_rate": 0.0001667311061439014, + "loss": 2.759, + "step": 5512 + }, + { + "epoch": 0.49946773572512515, + "grad_norm": 0.7628786563873291, + "learning_rate": 0.00016672506494291067, + "loss": 2.8013, + "step": 5513 + }, + { + "epoch": 0.49955833389957194, + "grad_norm": 0.7896884083747864, + "learning_rate": 0.0001667190237419199, + "loss": 2.7976, + "step": 5514 + }, + { + "epoch": 0.4996489320740187, + "grad_norm": 0.6369234919548035, + "learning_rate": 0.00016671298254092916, + "loss": 1.9738, + "step": 5515 + }, + { + "epoch": 0.4997395302484655, + "grad_norm": 0.6649976372718811, + "learning_rate": 0.00016670694133993837, + "loss": 2.228, + "step": 5516 + }, + { + "epoch": 0.4998301284229123, + "grad_norm": 0.7922840714454651, + "learning_rate": 0.00016670090013894763, + "loss": 3.0303, + "step": 5517 + }, + { + "epoch": 0.4999207265973591, + "grad_norm": 0.8513602018356323, + "learning_rate": 0.00016669485893795687, + "loss": 2.8254, + "step": 5518 + }, + { + "epoch": 0.5000113247718059, + "grad_norm": 0.7991888523101807, + "learning_rate": 0.00016668881773696613, + "loss": 2.8184, + "step": 5519 + }, + { + "epoch": 0.5001019229462527, + "grad_norm": 0.8111244440078735, + "learning_rate": 0.00016668277653597536, + "loss": 2.8237, + "step": 5520 + }, + { + "epoch": 0.5001925211206995, + "grad_norm": 0.7742196917533875, + "learning_rate": 0.0001666767353349846, + "loss": 2.6116, + "step": 5521 + }, + { + "epoch": 0.5002831192951462, + "grad_norm": 0.787329912185669, + "learning_rate": 0.00016667069413399386, + "loss": 3.2368, + "step": 5522 + }, + { + "epoch": 0.500373717469593, + "grad_norm": 0.7711870670318604, + "learning_rate": 0.0001666646529330031, + "loss": 2.8698, + "step": 5523 + }, + { + "epoch": 0.5004643156440398, + "grad_norm": 0.7556888461112976, + "learning_rate": 0.00016665861173201232, + "loss": 2.8058, + "step": 5524 + }, + { + "epoch": 0.5005549138184866, + "grad_norm": 0.8029634952545166, + "learning_rate": 0.00016665257053102156, + "loss": 2.5161, + "step": 5525 + }, + { + "epoch": 0.5006455119929334, + "grad_norm": 0.7439908981323242, + "learning_rate": 0.00016664652933003082, + "loss": 2.8634, + "step": 5526 + }, + { + "epoch": 0.5007361101673802, + "grad_norm": 0.8461180925369263, + "learning_rate": 0.00016664048812904005, + "loss": 2.6891, + "step": 5527 + }, + { + "epoch": 0.500826708341827, + "grad_norm": 0.7791764140129089, + "learning_rate": 0.00016663444692804931, + "loss": 2.9503, + "step": 5528 + }, + { + "epoch": 0.5009173065162738, + "grad_norm": 0.755513608455658, + "learning_rate": 0.00016662840572705855, + "loss": 2.7299, + "step": 5529 + }, + { + "epoch": 0.5010079046907204, + "grad_norm": 0.7501204013824463, + "learning_rate": 0.00016662236452606778, + "loss": 2.8243, + "step": 5530 + }, + { + "epoch": 0.5010985028651672, + "grad_norm": 0.7256134152412415, + "learning_rate": 0.00016661632332507704, + "loss": 2.1388, + "step": 5531 + }, + { + "epoch": 0.501189101039614, + "grad_norm": 0.7715451717376709, + "learning_rate": 0.00016661028212408628, + "loss": 2.9863, + "step": 5532 + }, + { + "epoch": 0.5012796992140608, + "grad_norm": 0.7602688670158386, + "learning_rate": 0.0001666042409230955, + "loss": 2.9333, + "step": 5533 + }, + { + "epoch": 0.5013702973885076, + "grad_norm": 0.7060196399688721, + "learning_rate": 0.00016659819972210475, + "loss": 2.3266, + "step": 5534 + }, + { + "epoch": 0.5014608955629544, + "grad_norm": 0.7603363990783691, + "learning_rate": 0.000166592158521114, + "loss": 2.5436, + "step": 5535 + }, + { + "epoch": 0.5015514937374012, + "grad_norm": 0.7565180063247681, + "learning_rate": 0.00016658611732012327, + "loss": 2.728, + "step": 5536 + }, + { + "epoch": 0.5016420919118479, + "grad_norm": 0.6691232919692993, + "learning_rate": 0.00016658007611913248, + "loss": 1.8, + "step": 5537 + }, + { + "epoch": 0.5017326900862947, + "grad_norm": 0.7478617429733276, + "learning_rate": 0.00016657403491814174, + "loss": 2.9297, + "step": 5538 + }, + { + "epoch": 0.5018232882607415, + "grad_norm": 0.606614351272583, + "learning_rate": 0.00016656799371715097, + "loss": 1.9795, + "step": 5539 + }, + { + "epoch": 0.5019138864351883, + "grad_norm": 0.7428659796714783, + "learning_rate": 0.00016656195251616023, + "loss": 2.6995, + "step": 5540 + }, + { + "epoch": 0.5020044846096351, + "grad_norm": 0.7772549390792847, + "learning_rate": 0.00016655591131516947, + "loss": 2.6908, + "step": 5541 + }, + { + "epoch": 0.5020950827840819, + "grad_norm": 0.6310734748840332, + "learning_rate": 0.0001665498701141787, + "loss": 2.1164, + "step": 5542 + }, + { + "epoch": 0.5021856809585287, + "grad_norm": 0.7089002132415771, + "learning_rate": 0.00016654382891318796, + "loss": 2.6968, + "step": 5543 + }, + { + "epoch": 0.5022762791329755, + "grad_norm": 0.779255211353302, + "learning_rate": 0.0001665377877121972, + "loss": 2.9638, + "step": 5544 + }, + { + "epoch": 0.5023668773074222, + "grad_norm": 0.7379576563835144, + "learning_rate": 0.00016653174651120646, + "loss": 2.3287, + "step": 5545 + }, + { + "epoch": 0.502457475481869, + "grad_norm": 0.7383188605308533, + "learning_rate": 0.00016652570531021566, + "loss": 2.6507, + "step": 5546 + }, + { + "epoch": 0.5025480736563158, + "grad_norm": 0.7339186668395996, + "learning_rate": 0.00016651966410922492, + "loss": 3.0522, + "step": 5547 + }, + { + "epoch": 0.5026386718307626, + "grad_norm": 0.7612391114234924, + "learning_rate": 0.00016651362290823416, + "loss": 2.7286, + "step": 5548 + }, + { + "epoch": 0.5027292700052094, + "grad_norm": 0.7435015439987183, + "learning_rate": 0.00016650758170724342, + "loss": 2.908, + "step": 5549 + }, + { + "epoch": 0.5028198681796562, + "grad_norm": 0.8957889080047607, + "learning_rate": 0.00016650154050625263, + "loss": 2.9738, + "step": 5550 + }, + { + "epoch": 0.502910466354103, + "grad_norm": 0.7297683358192444, + "learning_rate": 0.0001664954993052619, + "loss": 2.7267, + "step": 5551 + }, + { + "epoch": 0.5030010645285498, + "grad_norm": 0.7685043811798096, + "learning_rate": 0.00016648945810427115, + "loss": 2.592, + "step": 5552 + }, + { + "epoch": 0.5030916627029965, + "grad_norm": 0.7566169500350952, + "learning_rate": 0.00016648341690328038, + "loss": 2.9646, + "step": 5553 + }, + { + "epoch": 0.5031822608774433, + "grad_norm": 0.753410279750824, + "learning_rate": 0.00016647737570228962, + "loss": 2.7372, + "step": 5554 + }, + { + "epoch": 0.5032728590518901, + "grad_norm": 0.6606966853141785, + "learning_rate": 0.00016647133450129885, + "loss": 1.9443, + "step": 5555 + }, + { + "epoch": 0.5033634572263369, + "grad_norm": 0.8035297989845276, + "learning_rate": 0.0001664652933003081, + "loss": 2.8927, + "step": 5556 + }, + { + "epoch": 0.5034540554007837, + "grad_norm": 0.7895997166633606, + "learning_rate": 0.00016645925209931735, + "loss": 2.6817, + "step": 5557 + }, + { + "epoch": 0.5035446535752305, + "grad_norm": 0.742048442363739, + "learning_rate": 0.0001664532108983266, + "loss": 2.8441, + "step": 5558 + }, + { + "epoch": 0.5036352517496773, + "grad_norm": 0.7334046959877014, + "learning_rate": 0.00016644716969733584, + "loss": 2.7803, + "step": 5559 + }, + { + "epoch": 0.5037258499241241, + "grad_norm": 0.8116430640220642, + "learning_rate": 0.00016644112849634508, + "loss": 2.7632, + "step": 5560 + }, + { + "epoch": 0.5038164480985708, + "grad_norm": 0.8022090196609497, + "learning_rate": 0.00016643508729535434, + "loss": 3.1188, + "step": 5561 + }, + { + "epoch": 0.5039070462730176, + "grad_norm": 0.6598437428474426, + "learning_rate": 0.00016642904609436357, + "loss": 2.1126, + "step": 5562 + }, + { + "epoch": 0.5039976444474644, + "grad_norm": 0.7187564373016357, + "learning_rate": 0.0001664230048933728, + "loss": 2.0804, + "step": 5563 + }, + { + "epoch": 0.5040882426219112, + "grad_norm": 0.8076379299163818, + "learning_rate": 0.00016641696369238204, + "loss": 2.7787, + "step": 5564 + }, + { + "epoch": 0.504178840796358, + "grad_norm": 0.7720252871513367, + "learning_rate": 0.0001664109224913913, + "loss": 3.0434, + "step": 5565 + }, + { + "epoch": 0.5042694389708048, + "grad_norm": 0.7270326018333435, + "learning_rate": 0.00016640488129040056, + "loss": 2.76, + "step": 5566 + }, + { + "epoch": 0.5043600371452516, + "grad_norm": 0.8655713200569153, + "learning_rate": 0.00016639884008940977, + "loss": 2.7022, + "step": 5567 + }, + { + "epoch": 0.5044506353196984, + "grad_norm": 0.7685008645057678, + "learning_rate": 0.00016639279888841903, + "loss": 2.8771, + "step": 5568 + }, + { + "epoch": 0.5045412334941451, + "grad_norm": 0.7583226561546326, + "learning_rate": 0.00016638675768742826, + "loss": 3.0057, + "step": 5569 + }, + { + "epoch": 0.5046318316685918, + "grad_norm": 0.7670584321022034, + "learning_rate": 0.00016638071648643752, + "loss": 2.9498, + "step": 5570 + }, + { + "epoch": 0.5047224298430386, + "grad_norm": 0.7311130166053772, + "learning_rate": 0.00016637467528544676, + "loss": 2.8184, + "step": 5571 + }, + { + "epoch": 0.5048130280174854, + "grad_norm": 0.7829018235206604, + "learning_rate": 0.000166368634084456, + "loss": 2.9024, + "step": 5572 + }, + { + "epoch": 0.5049036261919322, + "grad_norm": 0.7401725053787231, + "learning_rate": 0.00016636259288346525, + "loss": 2.9137, + "step": 5573 + }, + { + "epoch": 0.504994224366379, + "grad_norm": 0.7437192797660828, + "learning_rate": 0.0001663565516824745, + "loss": 2.783, + "step": 5574 + }, + { + "epoch": 0.5050848225408258, + "grad_norm": 0.7854348421096802, + "learning_rate": 0.00016635051048148372, + "loss": 2.8859, + "step": 5575 + }, + { + "epoch": 0.5051754207152725, + "grad_norm": 0.7540155649185181, + "learning_rate": 0.00016634446928049296, + "loss": 2.7306, + "step": 5576 + }, + { + "epoch": 0.5052660188897193, + "grad_norm": 0.7717779874801636, + "learning_rate": 0.00016633842807950222, + "loss": 2.8819, + "step": 5577 + }, + { + "epoch": 0.5053566170641661, + "grad_norm": 0.7683731913566589, + "learning_rate": 0.00016633238687851145, + "loss": 2.6938, + "step": 5578 + }, + { + "epoch": 0.5054472152386129, + "grad_norm": 0.821087121963501, + "learning_rate": 0.0001663263456775207, + "loss": 2.9786, + "step": 5579 + }, + { + "epoch": 0.5055378134130597, + "grad_norm": 0.7218408584594727, + "learning_rate": 0.00016632030447652992, + "loss": 2.6785, + "step": 5580 + }, + { + "epoch": 0.5056284115875065, + "grad_norm": 0.7819834351539612, + "learning_rate": 0.00016631426327553918, + "loss": 2.6618, + "step": 5581 + }, + { + "epoch": 0.5057190097619533, + "grad_norm": 0.6732271909713745, + "learning_rate": 0.00016630822207454844, + "loss": 2.1977, + "step": 5582 + }, + { + "epoch": 0.5058096079364001, + "grad_norm": 0.7882545590400696, + "learning_rate": 0.00016630218087355768, + "loss": 3.1957, + "step": 5583 + }, + { + "epoch": 0.5059002061108469, + "grad_norm": 0.7992120385169983, + "learning_rate": 0.0001662961396725669, + "loss": 2.9454, + "step": 5584 + }, + { + "epoch": 0.5059908042852936, + "grad_norm": 0.820397675037384, + "learning_rate": 0.00016629009847157614, + "loss": 2.9144, + "step": 5585 + }, + { + "epoch": 0.5060814024597404, + "grad_norm": 0.7534196972846985, + "learning_rate": 0.0001662840572705854, + "loss": 2.8069, + "step": 5586 + }, + { + "epoch": 0.5061720006341872, + "grad_norm": 0.7143990397453308, + "learning_rate": 0.00016627801606959464, + "loss": 2.7607, + "step": 5587 + }, + { + "epoch": 0.506262598808634, + "grad_norm": 0.7273293137550354, + "learning_rate": 0.00016627197486860387, + "loss": 2.8593, + "step": 5588 + }, + { + "epoch": 0.5063531969830808, + "grad_norm": 0.8105553388595581, + "learning_rate": 0.00016626593366761313, + "loss": 2.9458, + "step": 5589 + }, + { + "epoch": 0.5064437951575276, + "grad_norm": 0.7726716995239258, + "learning_rate": 0.00016625989246662237, + "loss": 2.875, + "step": 5590 + }, + { + "epoch": 0.5065343933319744, + "grad_norm": 0.9224415421485901, + "learning_rate": 0.00016625385126563163, + "loss": 3.0272, + "step": 5591 + }, + { + "epoch": 0.5066249915064212, + "grad_norm": 0.7702038884162903, + "learning_rate": 0.00016624781006464086, + "loss": 2.8664, + "step": 5592 + }, + { + "epoch": 0.5067155896808679, + "grad_norm": 0.7467065453529358, + "learning_rate": 0.0001662417688636501, + "loss": 2.8908, + "step": 5593 + }, + { + "epoch": 0.5068061878553147, + "grad_norm": 0.6970253586769104, + "learning_rate": 0.00016623572766265933, + "loss": 2.2249, + "step": 5594 + }, + { + "epoch": 0.5068967860297615, + "grad_norm": 0.8058832883834839, + "learning_rate": 0.0001662296864616686, + "loss": 2.7016, + "step": 5595 + }, + { + "epoch": 0.5069873842042083, + "grad_norm": 0.7809755206108093, + "learning_rate": 0.00016622364526067785, + "loss": 2.8164, + "step": 5596 + }, + { + "epoch": 0.5070779823786551, + "grad_norm": 0.780070960521698, + "learning_rate": 0.00016621760405968706, + "loss": 2.7818, + "step": 5597 + }, + { + "epoch": 0.5071685805531019, + "grad_norm": 0.8084604740142822, + "learning_rate": 0.00016621156285869632, + "loss": 2.7571, + "step": 5598 + }, + { + "epoch": 0.5072591787275487, + "grad_norm": 0.7668792009353638, + "learning_rate": 0.00016620552165770556, + "loss": 2.6487, + "step": 5599 + }, + { + "epoch": 0.5073497769019955, + "grad_norm": 0.834679126739502, + "learning_rate": 0.00016619948045671482, + "loss": 3.04, + "step": 5600 + }, + { + "epoch": 0.5074403750764422, + "grad_norm": 0.7047902941703796, + "learning_rate": 0.00016619343925572402, + "loss": 2.4412, + "step": 5601 + }, + { + "epoch": 0.507530973250889, + "grad_norm": 0.7374272346496582, + "learning_rate": 0.00016618739805473329, + "loss": 2.6519, + "step": 5602 + }, + { + "epoch": 0.5076215714253358, + "grad_norm": 0.7204418778419495, + "learning_rate": 0.00016618135685374255, + "loss": 2.7818, + "step": 5603 + }, + { + "epoch": 0.5077121695997826, + "grad_norm": 0.7668619751930237, + "learning_rate": 0.00016617531565275178, + "loss": 2.749, + "step": 5604 + }, + { + "epoch": 0.5078027677742294, + "grad_norm": 0.7553606629371643, + "learning_rate": 0.00016616927445176101, + "loss": 2.7691, + "step": 5605 + }, + { + "epoch": 0.5078933659486762, + "grad_norm": 0.7717010378837585, + "learning_rate": 0.00016616323325077025, + "loss": 2.5685, + "step": 5606 + }, + { + "epoch": 0.507983964123123, + "grad_norm": 0.7248255014419556, + "learning_rate": 0.0001661571920497795, + "loss": 2.5586, + "step": 5607 + }, + { + "epoch": 0.5080745622975698, + "grad_norm": 0.6611232757568359, + "learning_rate": 0.00016615115084878874, + "loss": 1.9957, + "step": 5608 + }, + { + "epoch": 0.5081651604720165, + "grad_norm": 0.6745007038116455, + "learning_rate": 0.000166145109647798, + "loss": 1.981, + "step": 5609 + }, + { + "epoch": 0.5082557586464633, + "grad_norm": 0.5603225827217102, + "learning_rate": 0.0001661390684468072, + "loss": 1.3168, + "step": 5610 + }, + { + "epoch": 0.50834635682091, + "grad_norm": 0.9067627787590027, + "learning_rate": 0.00016613302724581647, + "loss": 2.7825, + "step": 5611 + }, + { + "epoch": 0.5084369549953568, + "grad_norm": 0.7832474708557129, + "learning_rate": 0.00016612698604482573, + "loss": 2.7083, + "step": 5612 + }, + { + "epoch": 0.5085275531698036, + "grad_norm": 0.771190881729126, + "learning_rate": 0.00016612094484383497, + "loss": 2.9821, + "step": 5613 + }, + { + "epoch": 0.5086181513442504, + "grad_norm": 0.6995805501937866, + "learning_rate": 0.0001661149036428442, + "loss": 2.1378, + "step": 5614 + }, + { + "epoch": 0.5087087495186972, + "grad_norm": 0.7728263735771179, + "learning_rate": 0.00016610886244185344, + "loss": 2.8135, + "step": 5615 + }, + { + "epoch": 0.5087993476931439, + "grad_norm": 0.7260735034942627, + "learning_rate": 0.0001661028212408627, + "loss": 2.7205, + "step": 5616 + }, + { + "epoch": 0.5088899458675907, + "grad_norm": 0.8420956134796143, + "learning_rate": 0.00016609678003987193, + "loss": 2.7827, + "step": 5617 + }, + { + "epoch": 0.5089805440420375, + "grad_norm": 0.8321720957756042, + "learning_rate": 0.00016609073883888117, + "loss": 2.9581, + "step": 5618 + }, + { + "epoch": 0.5090711422164843, + "grad_norm": 0.8100752234458923, + "learning_rate": 0.00016608469763789043, + "loss": 2.75, + "step": 5619 + }, + { + "epoch": 0.5091617403909311, + "grad_norm": 0.68099445104599, + "learning_rate": 0.00016607865643689966, + "loss": 2.2344, + "step": 5620 + }, + { + "epoch": 0.5092523385653779, + "grad_norm": 0.8177551627159119, + "learning_rate": 0.00016607261523590892, + "loss": 2.7605, + "step": 5621 + }, + { + "epoch": 0.5093429367398247, + "grad_norm": 0.7909172177314758, + "learning_rate": 0.00016606657403491816, + "loss": 2.6669, + "step": 5622 + }, + { + "epoch": 0.5094335349142715, + "grad_norm": 0.7159810662269592, + "learning_rate": 0.0001660605328339274, + "loss": 2.8094, + "step": 5623 + }, + { + "epoch": 0.5095241330887182, + "grad_norm": 0.7526470422744751, + "learning_rate": 0.00016605449163293662, + "loss": 2.5632, + "step": 5624 + }, + { + "epoch": 0.509614731263165, + "grad_norm": 0.8265487551689148, + "learning_rate": 0.00016604845043194589, + "loss": 2.5375, + "step": 5625 + }, + { + "epoch": 0.5097053294376118, + "grad_norm": 0.7713657021522522, + "learning_rate": 0.00016604240923095512, + "loss": 2.8706, + "step": 5626 + }, + { + "epoch": 0.5097959276120586, + "grad_norm": 0.7755748629570007, + "learning_rate": 0.00016603636802996435, + "loss": 3.0118, + "step": 5627 + }, + { + "epoch": 0.5098865257865054, + "grad_norm": 0.7955690026283264, + "learning_rate": 0.00016603032682897361, + "loss": 3.002, + "step": 5628 + }, + { + "epoch": 0.5099771239609522, + "grad_norm": 0.7532292604446411, + "learning_rate": 0.00016602428562798285, + "loss": 2.6664, + "step": 5629 + }, + { + "epoch": 0.510067722135399, + "grad_norm": 0.7493876218795776, + "learning_rate": 0.0001660182444269921, + "loss": 2.6504, + "step": 5630 + }, + { + "epoch": 0.5101583203098458, + "grad_norm": 0.8347432613372803, + "learning_rate": 0.00016601220322600132, + "loss": 3.0868, + "step": 5631 + }, + { + "epoch": 0.5102489184842925, + "grad_norm": 0.8172168731689453, + "learning_rate": 0.00016600616202501058, + "loss": 2.6447, + "step": 5632 + }, + { + "epoch": 0.5103395166587393, + "grad_norm": 0.7744309306144714, + "learning_rate": 0.00016600012082401984, + "loss": 2.8721, + "step": 5633 + }, + { + "epoch": 0.5104301148331861, + "grad_norm": 0.8131820559501648, + "learning_rate": 0.00016599407962302907, + "loss": 2.8904, + "step": 5634 + }, + { + "epoch": 0.5105207130076329, + "grad_norm": 0.8080336451530457, + "learning_rate": 0.0001659880384220383, + "loss": 2.8082, + "step": 5635 + }, + { + "epoch": 0.5106113111820797, + "grad_norm": 0.8070424795150757, + "learning_rate": 0.00016598199722104754, + "loss": 2.7369, + "step": 5636 + }, + { + "epoch": 0.5107019093565265, + "grad_norm": 0.7441325187683105, + "learning_rate": 0.0001659759560200568, + "loss": 2.6849, + "step": 5637 + }, + { + "epoch": 0.5107925075309733, + "grad_norm": 0.7527042627334595, + "learning_rate": 0.00016596991481906604, + "loss": 3.1364, + "step": 5638 + }, + { + "epoch": 0.5108831057054201, + "grad_norm": 0.7711227536201477, + "learning_rate": 0.00016596387361807527, + "loss": 2.7951, + "step": 5639 + }, + { + "epoch": 0.5109737038798668, + "grad_norm": 0.7086138725280762, + "learning_rate": 0.0001659578324170845, + "loss": 2.2593, + "step": 5640 + }, + { + "epoch": 0.5110643020543136, + "grad_norm": 0.7938376665115356, + "learning_rate": 0.00016595179121609377, + "loss": 2.7169, + "step": 5641 + }, + { + "epoch": 0.5111549002287604, + "grad_norm": 0.6616971492767334, + "learning_rate": 0.00016594575001510303, + "loss": 2.0932, + "step": 5642 + }, + { + "epoch": 0.5112454984032072, + "grad_norm": 0.7573007941246033, + "learning_rate": 0.00016593970881411226, + "loss": 2.9592, + "step": 5643 + }, + { + "epoch": 0.511336096577654, + "grad_norm": 0.7975010871887207, + "learning_rate": 0.0001659336676131215, + "loss": 2.8557, + "step": 5644 + }, + { + "epoch": 0.5114266947521008, + "grad_norm": 0.7393587231636047, + "learning_rate": 0.00016592762641213073, + "loss": 2.8462, + "step": 5645 + }, + { + "epoch": 0.5115172929265476, + "grad_norm": 0.7291321158409119, + "learning_rate": 0.00016592158521114, + "loss": 2.6671, + "step": 5646 + }, + { + "epoch": 0.5116078911009944, + "grad_norm": 0.7774960398674011, + "learning_rate": 0.00016591554401014922, + "loss": 3.0395, + "step": 5647 + }, + { + "epoch": 0.5116984892754411, + "grad_norm": 0.727083146572113, + "learning_rate": 0.00016590950280915846, + "loss": 2.4914, + "step": 5648 + }, + { + "epoch": 0.5117890874498879, + "grad_norm": 0.7127039432525635, + "learning_rate": 0.00016590346160816772, + "loss": 1.9744, + "step": 5649 + }, + { + "epoch": 0.5118796856243347, + "grad_norm": 0.7610482573509216, + "learning_rate": 0.00016589742040717695, + "loss": 2.8106, + "step": 5650 + }, + { + "epoch": 0.5119702837987814, + "grad_norm": 0.7412326335906982, + "learning_rate": 0.00016589137920618621, + "loss": 2.8487, + "step": 5651 + }, + { + "epoch": 0.5120608819732282, + "grad_norm": 0.7320615649223328, + "learning_rate": 0.00016588533800519542, + "loss": 2.8231, + "step": 5652 + }, + { + "epoch": 0.512151480147675, + "grad_norm": 0.9086033701896667, + "learning_rate": 0.00016587929680420468, + "loss": 3.0562, + "step": 5653 + }, + { + "epoch": 0.5122420783221218, + "grad_norm": 0.7050548195838928, + "learning_rate": 0.00016587325560321392, + "loss": 2.7718, + "step": 5654 + }, + { + "epoch": 0.5123326764965686, + "grad_norm": 0.751365065574646, + "learning_rate": 0.00016586721440222318, + "loss": 2.6397, + "step": 5655 + }, + { + "epoch": 0.5124232746710153, + "grad_norm": 0.8145926594734192, + "learning_rate": 0.0001658611732012324, + "loss": 2.9687, + "step": 5656 + }, + { + "epoch": 0.5125138728454621, + "grad_norm": 0.7817577123641968, + "learning_rate": 0.00016585513200024165, + "loss": 2.5972, + "step": 5657 + }, + { + "epoch": 0.5126044710199089, + "grad_norm": 0.7726883888244629, + "learning_rate": 0.0001658490907992509, + "loss": 2.9554, + "step": 5658 + }, + { + "epoch": 0.5126950691943557, + "grad_norm": 0.8084436655044556, + "learning_rate": 0.00016584304959826014, + "loss": 2.7935, + "step": 5659 + }, + { + "epoch": 0.5127856673688025, + "grad_norm": 0.8217757940292358, + "learning_rate": 0.00016583700839726938, + "loss": 2.8511, + "step": 5660 + }, + { + "epoch": 0.5128762655432493, + "grad_norm": 0.8181508183479309, + "learning_rate": 0.0001658309671962786, + "loss": 2.855, + "step": 5661 + }, + { + "epoch": 0.5129668637176961, + "grad_norm": 0.7996970415115356, + "learning_rate": 0.00016582492599528787, + "loss": 2.7963, + "step": 5662 + }, + { + "epoch": 0.5130574618921429, + "grad_norm": 0.6957526803016663, + "learning_rate": 0.00016581888479429713, + "loss": 2.8662, + "step": 5663 + }, + { + "epoch": 0.5131480600665896, + "grad_norm": 0.7715067863464355, + "learning_rate": 0.00016581284359330637, + "loss": 2.9756, + "step": 5664 + }, + { + "epoch": 0.5132386582410364, + "grad_norm": 0.8331395387649536, + "learning_rate": 0.0001658068023923156, + "loss": 2.9022, + "step": 5665 + }, + { + "epoch": 0.5133292564154832, + "grad_norm": 0.7467517256736755, + "learning_rate": 0.00016580076119132483, + "loss": 2.2596, + "step": 5666 + }, + { + "epoch": 0.51341985458993, + "grad_norm": 0.8420605659484863, + "learning_rate": 0.0001657947199903341, + "loss": 2.6914, + "step": 5667 + }, + { + "epoch": 0.5135104527643768, + "grad_norm": 0.7712684273719788, + "learning_rate": 0.00016578867878934333, + "loss": 2.919, + "step": 5668 + }, + { + "epoch": 0.5136010509388236, + "grad_norm": 0.8019439578056335, + "learning_rate": 0.00016578263758835256, + "loss": 2.5609, + "step": 5669 + }, + { + "epoch": 0.5136916491132704, + "grad_norm": 0.7783631682395935, + "learning_rate": 0.0001657765963873618, + "loss": 2.8455, + "step": 5670 + }, + { + "epoch": 0.5137822472877172, + "grad_norm": 0.637123167514801, + "learning_rate": 0.00016577055518637106, + "loss": 1.8848, + "step": 5671 + }, + { + "epoch": 0.5138728454621639, + "grad_norm": 0.7585150003433228, + "learning_rate": 0.00016576451398538032, + "loss": 2.7307, + "step": 5672 + }, + { + "epoch": 0.5139634436366107, + "grad_norm": 0.7800832986831665, + "learning_rate": 0.00016575847278438953, + "loss": 2.8186, + "step": 5673 + }, + { + "epoch": 0.5140540418110575, + "grad_norm": 0.7745206356048584, + "learning_rate": 0.0001657524315833988, + "loss": 2.7318, + "step": 5674 + }, + { + "epoch": 0.5141446399855043, + "grad_norm": 0.7835695743560791, + "learning_rate": 0.00016574639038240802, + "loss": 2.9096, + "step": 5675 + }, + { + "epoch": 0.5142352381599511, + "grad_norm": 0.837411642074585, + "learning_rate": 0.00016574034918141728, + "loss": 2.9973, + "step": 5676 + }, + { + "epoch": 0.5143258363343979, + "grad_norm": 0.7738702297210693, + "learning_rate": 0.00016573430798042652, + "loss": 2.7218, + "step": 5677 + }, + { + "epoch": 0.5144164345088447, + "grad_norm": 0.8285644054412842, + "learning_rate": 0.00016572826677943575, + "loss": 2.68, + "step": 5678 + }, + { + "epoch": 0.5145070326832915, + "grad_norm": 0.7797390222549438, + "learning_rate": 0.000165722225578445, + "loss": 2.7101, + "step": 5679 + }, + { + "epoch": 0.5145976308577382, + "grad_norm": 0.8201199769973755, + "learning_rate": 0.00016571618437745425, + "loss": 2.9292, + "step": 5680 + }, + { + "epoch": 0.514688229032185, + "grad_norm": 0.7480183243751526, + "learning_rate": 0.0001657101431764635, + "loss": 2.6415, + "step": 5681 + }, + { + "epoch": 0.5147788272066318, + "grad_norm": 0.7357466816902161, + "learning_rate": 0.00016570410197547271, + "loss": 2.7688, + "step": 5682 + }, + { + "epoch": 0.5148694253810786, + "grad_norm": 0.7376413941383362, + "learning_rate": 0.00016569806077448198, + "loss": 2.5902, + "step": 5683 + }, + { + "epoch": 0.5149600235555254, + "grad_norm": 0.7994638085365295, + "learning_rate": 0.0001656920195734912, + "loss": 2.8095, + "step": 5684 + }, + { + "epoch": 0.5150506217299722, + "grad_norm": 0.8265538215637207, + "learning_rate": 0.00016568597837250047, + "loss": 3.0608, + "step": 5685 + }, + { + "epoch": 0.515141219904419, + "grad_norm": 0.7837026715278625, + "learning_rate": 0.0001656799371715097, + "loss": 2.8281, + "step": 5686 + }, + { + "epoch": 0.5152318180788658, + "grad_norm": 0.7369375228881836, + "learning_rate": 0.00016567389597051894, + "loss": 2.8017, + "step": 5687 + }, + { + "epoch": 0.5153224162533125, + "grad_norm": 0.7917884588241577, + "learning_rate": 0.0001656678547695282, + "loss": 2.6857, + "step": 5688 + }, + { + "epoch": 0.5154130144277593, + "grad_norm": 0.7895687222480774, + "learning_rate": 0.00016566181356853743, + "loss": 2.2231, + "step": 5689 + }, + { + "epoch": 0.5155036126022061, + "grad_norm": 0.7741201519966125, + "learning_rate": 0.00016565577236754667, + "loss": 2.8506, + "step": 5690 + }, + { + "epoch": 0.5155942107766529, + "grad_norm": 0.7667773962020874, + "learning_rate": 0.0001656497311665559, + "loss": 2.7333, + "step": 5691 + }, + { + "epoch": 0.5156848089510996, + "grad_norm": 0.7931208610534668, + "learning_rate": 0.00016564368996556516, + "loss": 2.8761, + "step": 5692 + }, + { + "epoch": 0.5157754071255464, + "grad_norm": 0.7762443423271179, + "learning_rate": 0.00016563764876457442, + "loss": 2.9262, + "step": 5693 + }, + { + "epoch": 0.5158660052999932, + "grad_norm": 0.7813248634338379, + "learning_rate": 0.00016563160756358366, + "loss": 2.878, + "step": 5694 + }, + { + "epoch": 0.51595660347444, + "grad_norm": 0.768079936504364, + "learning_rate": 0.0001656255663625929, + "loss": 2.7869, + "step": 5695 + }, + { + "epoch": 0.5160472016488867, + "grad_norm": 0.7684898972511292, + "learning_rate": 0.00016561952516160213, + "loss": 2.7925, + "step": 5696 + }, + { + "epoch": 0.5161377998233335, + "grad_norm": 0.7273120284080505, + "learning_rate": 0.0001656134839606114, + "loss": 2.6451, + "step": 5697 + }, + { + "epoch": 0.5162283979977803, + "grad_norm": 0.6772921681404114, + "learning_rate": 0.00016560744275962062, + "loss": 2.148, + "step": 5698 + }, + { + "epoch": 0.5163189961722271, + "grad_norm": 0.7026914358139038, + "learning_rate": 0.00016560140155862986, + "loss": 2.2821, + "step": 5699 + }, + { + "epoch": 0.5164095943466739, + "grad_norm": 0.7427133321762085, + "learning_rate": 0.0001655953603576391, + "loss": 2.9553, + "step": 5700 + }, + { + "epoch": 0.5165001925211207, + "grad_norm": 0.881084144115448, + "learning_rate": 0.00016558931915664835, + "loss": 2.8812, + "step": 5701 + }, + { + "epoch": 0.5165907906955675, + "grad_norm": 0.7153615355491638, + "learning_rate": 0.0001655832779556576, + "loss": 2.2611, + "step": 5702 + }, + { + "epoch": 0.5166813888700142, + "grad_norm": 0.8991145491600037, + "learning_rate": 0.00016557723675466682, + "loss": 2.675, + "step": 5703 + }, + { + "epoch": 0.516771987044461, + "grad_norm": 0.7699596881866455, + "learning_rate": 0.00016557119555367608, + "loss": 3.1036, + "step": 5704 + }, + { + "epoch": 0.5168625852189078, + "grad_norm": 0.796552300453186, + "learning_rate": 0.00016556515435268531, + "loss": 2.7911, + "step": 5705 + }, + { + "epoch": 0.5169531833933546, + "grad_norm": 0.808149516582489, + "learning_rate": 0.00016555911315169458, + "loss": 3.1086, + "step": 5706 + }, + { + "epoch": 0.5170437815678014, + "grad_norm": 0.8045104742050171, + "learning_rate": 0.0001655530719507038, + "loss": 2.9832, + "step": 5707 + }, + { + "epoch": 0.5171343797422482, + "grad_norm": 0.8579520583152771, + "learning_rate": 0.00016554703074971304, + "loss": 2.805, + "step": 5708 + }, + { + "epoch": 0.517224977916695, + "grad_norm": 0.7594131231307983, + "learning_rate": 0.0001655409895487223, + "loss": 2.7066, + "step": 5709 + }, + { + "epoch": 0.5173155760911418, + "grad_norm": 0.7862403988838196, + "learning_rate": 0.00016553494834773154, + "loss": 2.7988, + "step": 5710 + }, + { + "epoch": 0.5174061742655885, + "grad_norm": 0.8069267868995667, + "learning_rate": 0.00016552890714674077, + "loss": 2.9953, + "step": 5711 + }, + { + "epoch": 0.5174967724400353, + "grad_norm": 0.744811475276947, + "learning_rate": 0.00016552286594575, + "loss": 2.8521, + "step": 5712 + }, + { + "epoch": 0.5175873706144821, + "grad_norm": 0.7877512574195862, + "learning_rate": 0.00016551682474475927, + "loss": 2.8358, + "step": 5713 + }, + { + "epoch": 0.5176779687889289, + "grad_norm": 0.9423094391822815, + "learning_rate": 0.0001655107835437685, + "loss": 2.6743, + "step": 5714 + }, + { + "epoch": 0.5177685669633757, + "grad_norm": 0.8092348575592041, + "learning_rate": 0.00016550474234277776, + "loss": 2.9472, + "step": 5715 + }, + { + "epoch": 0.5178591651378225, + "grad_norm": 0.8876063227653503, + "learning_rate": 0.000165498701141787, + "loss": 2.9043, + "step": 5716 + }, + { + "epoch": 0.5179497633122693, + "grad_norm": 0.7481822371482849, + "learning_rate": 0.00016549265994079623, + "loss": 2.6938, + "step": 5717 + }, + { + "epoch": 0.5180403614867161, + "grad_norm": 0.8531092405319214, + "learning_rate": 0.0001654866187398055, + "loss": 2.9792, + "step": 5718 + }, + { + "epoch": 0.5181309596611628, + "grad_norm": 0.8007026314735413, + "learning_rate": 0.00016548057753881473, + "loss": 2.8391, + "step": 5719 + }, + { + "epoch": 0.5182215578356096, + "grad_norm": 0.8363354802131653, + "learning_rate": 0.00016547453633782396, + "loss": 2.7312, + "step": 5720 + }, + { + "epoch": 0.5183121560100564, + "grad_norm": 0.6777344942092896, + "learning_rate": 0.0001654684951368332, + "loss": 2.0771, + "step": 5721 + }, + { + "epoch": 0.5184027541845032, + "grad_norm": 0.7982129454612732, + "learning_rate": 0.00016546245393584246, + "loss": 2.7243, + "step": 5722 + }, + { + "epoch": 0.51849335235895, + "grad_norm": 0.7611583471298218, + "learning_rate": 0.00016545641273485172, + "loss": 2.4947, + "step": 5723 + }, + { + "epoch": 0.5185839505333968, + "grad_norm": 0.7729705572128296, + "learning_rate": 0.00016545037153386092, + "loss": 2.7343, + "step": 5724 + }, + { + "epoch": 0.5186745487078436, + "grad_norm": 0.8414747714996338, + "learning_rate": 0.00016544433033287019, + "loss": 3.0329, + "step": 5725 + }, + { + "epoch": 0.5187651468822904, + "grad_norm": 0.8167032599449158, + "learning_rate": 0.00016543828913187942, + "loss": 2.7928, + "step": 5726 + }, + { + "epoch": 0.5188557450567371, + "grad_norm": 0.9262503385543823, + "learning_rate": 0.00016543224793088868, + "loss": 2.5885, + "step": 5727 + }, + { + "epoch": 0.5189463432311839, + "grad_norm": 0.8005927801132202, + "learning_rate": 0.00016542620672989791, + "loss": 2.8858, + "step": 5728 + }, + { + "epoch": 0.5190369414056307, + "grad_norm": 0.7927110195159912, + "learning_rate": 0.00016542016552890715, + "loss": 2.9976, + "step": 5729 + }, + { + "epoch": 0.5191275395800775, + "grad_norm": 0.7378653287887573, + "learning_rate": 0.0001654141243279164, + "loss": 2.8024, + "step": 5730 + }, + { + "epoch": 0.5192181377545243, + "grad_norm": 0.787685751914978, + "learning_rate": 0.00016540808312692564, + "loss": 2.5906, + "step": 5731 + }, + { + "epoch": 0.519308735928971, + "grad_norm": 0.7690780162811279, + "learning_rate": 0.0001654020419259349, + "loss": 2.988, + "step": 5732 + }, + { + "epoch": 0.5193993341034178, + "grad_norm": 0.7960997819900513, + "learning_rate": 0.0001653960007249441, + "loss": 2.8206, + "step": 5733 + }, + { + "epoch": 0.5194899322778646, + "grad_norm": 0.7949032187461853, + "learning_rate": 0.00016538995952395337, + "loss": 2.8693, + "step": 5734 + }, + { + "epoch": 0.5195805304523113, + "grad_norm": 0.7963933944702148, + "learning_rate": 0.0001653839183229626, + "loss": 2.8846, + "step": 5735 + }, + { + "epoch": 0.5196711286267581, + "grad_norm": 0.7649208903312683, + "learning_rate": 0.00016537787712197187, + "loss": 2.9894, + "step": 5736 + }, + { + "epoch": 0.5197617268012049, + "grad_norm": 0.7987843751907349, + "learning_rate": 0.00016537183592098108, + "loss": 2.6205, + "step": 5737 + }, + { + "epoch": 0.5198523249756517, + "grad_norm": 0.8255748152732849, + "learning_rate": 0.00016536579471999034, + "loss": 2.7711, + "step": 5738 + }, + { + "epoch": 0.5199429231500985, + "grad_norm": 0.7613734602928162, + "learning_rate": 0.0001653597535189996, + "loss": 2.8994, + "step": 5739 + }, + { + "epoch": 0.5200335213245453, + "grad_norm": 0.7100048661231995, + "learning_rate": 0.00016535371231800883, + "loss": 2.8408, + "step": 5740 + }, + { + "epoch": 0.5201241194989921, + "grad_norm": 0.8202680349349976, + "learning_rate": 0.00016534767111701807, + "loss": 3.0505, + "step": 5741 + }, + { + "epoch": 0.5202147176734389, + "grad_norm": 0.8184162378311157, + "learning_rate": 0.0001653416299160273, + "loss": 2.7368, + "step": 5742 + }, + { + "epoch": 0.5203053158478856, + "grad_norm": 0.7492666840553284, + "learning_rate": 0.00016533558871503656, + "loss": 2.6387, + "step": 5743 + }, + { + "epoch": 0.5203959140223324, + "grad_norm": 0.8638722896575928, + "learning_rate": 0.0001653295475140458, + "loss": 2.9043, + "step": 5744 + }, + { + "epoch": 0.5204865121967792, + "grad_norm": 0.7974038124084473, + "learning_rate": 0.00016532350631305506, + "loss": 2.9251, + "step": 5745 + }, + { + "epoch": 0.520577110371226, + "grad_norm": 0.683721125125885, + "learning_rate": 0.0001653174651120643, + "loss": 2.0845, + "step": 5746 + }, + { + "epoch": 0.5206677085456728, + "grad_norm": 0.759930431842804, + "learning_rate": 0.00016531142391107352, + "loss": 2.7707, + "step": 5747 + }, + { + "epoch": 0.5207583067201196, + "grad_norm": 0.7622895836830139, + "learning_rate": 0.00016530538271008279, + "loss": 2.9547, + "step": 5748 + }, + { + "epoch": 0.5208489048945664, + "grad_norm": 0.7774733304977417, + "learning_rate": 0.00016529934150909202, + "loss": 2.7315, + "step": 5749 + }, + { + "epoch": 0.5209395030690132, + "grad_norm": 0.7747950553894043, + "learning_rate": 0.00016529330030810125, + "loss": 2.7652, + "step": 5750 + }, + { + "epoch": 0.5210301012434599, + "grad_norm": 0.8742499947547913, + "learning_rate": 0.0001652872591071105, + "loss": 2.9157, + "step": 5751 + }, + { + "epoch": 0.5211206994179067, + "grad_norm": 0.7698177099227905, + "learning_rate": 0.00016528121790611975, + "loss": 2.5068, + "step": 5752 + }, + { + "epoch": 0.5212112975923535, + "grad_norm": 0.790360689163208, + "learning_rate": 0.000165275176705129, + "loss": 2.93, + "step": 5753 + }, + { + "epoch": 0.5213018957668003, + "grad_norm": 0.7619889974594116, + "learning_rate": 0.00016526913550413822, + "loss": 2.8821, + "step": 5754 + }, + { + "epoch": 0.5213924939412471, + "grad_norm": 0.7317619323730469, + "learning_rate": 0.00016526309430314748, + "loss": 2.8471, + "step": 5755 + }, + { + "epoch": 0.5214830921156939, + "grad_norm": 0.7840265035629272, + "learning_rate": 0.0001652570531021567, + "loss": 2.7503, + "step": 5756 + }, + { + "epoch": 0.5215736902901407, + "grad_norm": 0.7496140003204346, + "learning_rate": 0.00016525101190116597, + "loss": 2.9963, + "step": 5757 + }, + { + "epoch": 0.5216642884645875, + "grad_norm": 0.8258744478225708, + "learning_rate": 0.0001652449707001752, + "loss": 2.7905, + "step": 5758 + }, + { + "epoch": 0.5217548866390342, + "grad_norm": 0.7609494924545288, + "learning_rate": 0.00016523892949918444, + "loss": 2.5807, + "step": 5759 + }, + { + "epoch": 0.521845484813481, + "grad_norm": 0.7606557011604309, + "learning_rate": 0.0001652328882981937, + "loss": 2.6057, + "step": 5760 + }, + { + "epoch": 0.5219360829879278, + "grad_norm": 0.7828856706619263, + "learning_rate": 0.00016522684709720294, + "loss": 2.6411, + "step": 5761 + }, + { + "epoch": 0.5220266811623746, + "grad_norm": 0.7680474519729614, + "learning_rate": 0.00016522080589621217, + "loss": 2.7062, + "step": 5762 + }, + { + "epoch": 0.5221172793368214, + "grad_norm": 0.7650892734527588, + "learning_rate": 0.0001652147646952214, + "loss": 2.7671, + "step": 5763 + }, + { + "epoch": 0.5222078775112682, + "grad_norm": 0.7836150527000427, + "learning_rate": 0.00016520872349423067, + "loss": 2.5688, + "step": 5764 + }, + { + "epoch": 0.522298475685715, + "grad_norm": 0.7341196537017822, + "learning_rate": 0.0001652026822932399, + "loss": 2.8753, + "step": 5765 + }, + { + "epoch": 0.5223890738601618, + "grad_norm": 0.7604097723960876, + "learning_rate": 0.00016519664109224916, + "loss": 2.7987, + "step": 5766 + }, + { + "epoch": 0.5224796720346085, + "grad_norm": 0.8153210878372192, + "learning_rate": 0.00016519059989125837, + "loss": 2.9285, + "step": 5767 + }, + { + "epoch": 0.5225702702090553, + "grad_norm": 0.7475562691688538, + "learning_rate": 0.00016518455869026763, + "loss": 2.6676, + "step": 5768 + }, + { + "epoch": 0.5226608683835021, + "grad_norm": 0.7430313229560852, + "learning_rate": 0.0001651785174892769, + "loss": 2.8908, + "step": 5769 + }, + { + "epoch": 0.5227514665579489, + "grad_norm": 0.8179738521575928, + "learning_rate": 0.00016517247628828612, + "loss": 2.8741, + "step": 5770 + }, + { + "epoch": 0.5228420647323957, + "grad_norm": 0.7642006278038025, + "learning_rate": 0.00016516643508729536, + "loss": 2.7899, + "step": 5771 + }, + { + "epoch": 0.5229326629068425, + "grad_norm": 0.745780348777771, + "learning_rate": 0.0001651603938863046, + "loss": 2.9406, + "step": 5772 + }, + { + "epoch": 0.5230232610812892, + "grad_norm": 0.7861371636390686, + "learning_rate": 0.00016515435268531385, + "loss": 2.7697, + "step": 5773 + }, + { + "epoch": 0.523113859255736, + "grad_norm": 0.7661114931106567, + "learning_rate": 0.0001651483114843231, + "loss": 2.7413, + "step": 5774 + }, + { + "epoch": 0.5232044574301827, + "grad_norm": 0.6854845881462097, + "learning_rate": 0.00016514227028333232, + "loss": 1.998, + "step": 5775 + }, + { + "epoch": 0.5232950556046295, + "grad_norm": 0.7150222659111023, + "learning_rate": 0.00016513622908234158, + "loss": 2.0703, + "step": 5776 + }, + { + "epoch": 0.5233856537790763, + "grad_norm": 0.7745054364204407, + "learning_rate": 0.00016513018788135082, + "loss": 2.6899, + "step": 5777 + }, + { + "epoch": 0.5234762519535231, + "grad_norm": 0.7550756335258484, + "learning_rate": 0.00016512414668036008, + "loss": 2.688, + "step": 5778 + }, + { + "epoch": 0.5235668501279699, + "grad_norm": 0.7891982197761536, + "learning_rate": 0.0001651181054793693, + "loss": 2.873, + "step": 5779 + }, + { + "epoch": 0.5236574483024167, + "grad_norm": 0.8616702556610107, + "learning_rate": 0.00016511206427837855, + "loss": 2.7681, + "step": 5780 + }, + { + "epoch": 0.5237480464768635, + "grad_norm": 0.6690250039100647, + "learning_rate": 0.00016510602307738778, + "loss": 2.0628, + "step": 5781 + }, + { + "epoch": 0.5238386446513102, + "grad_norm": 0.7467344999313354, + "learning_rate": 0.00016509998187639704, + "loss": 2.8049, + "step": 5782 + }, + { + "epoch": 0.523929242825757, + "grad_norm": 0.6531656980514526, + "learning_rate": 0.00016509394067540628, + "loss": 2.1537, + "step": 5783 + }, + { + "epoch": 0.5240198410002038, + "grad_norm": 0.7735799551010132, + "learning_rate": 0.0001650878994744155, + "loss": 2.9926, + "step": 5784 + }, + { + "epoch": 0.5241104391746506, + "grad_norm": 0.7571062445640564, + "learning_rate": 0.00016508185827342477, + "loss": 2.7067, + "step": 5785 + }, + { + "epoch": 0.5242010373490974, + "grad_norm": 0.7884806394577026, + "learning_rate": 0.000165075817072434, + "loss": 2.8515, + "step": 5786 + }, + { + "epoch": 0.5242916355235442, + "grad_norm": 0.6884248852729797, + "learning_rate": 0.00016506977587144327, + "loss": 2.2617, + "step": 5787 + }, + { + "epoch": 0.524382233697991, + "grad_norm": 0.8682228326797485, + "learning_rate": 0.00016506373467045247, + "loss": 2.7509, + "step": 5788 + }, + { + "epoch": 0.5244728318724378, + "grad_norm": 0.8501055240631104, + "learning_rate": 0.00016505769346946173, + "loss": 2.6268, + "step": 5789 + }, + { + "epoch": 0.5245634300468845, + "grad_norm": 0.7646764516830444, + "learning_rate": 0.000165051652268471, + "loss": 2.5725, + "step": 5790 + }, + { + "epoch": 0.5246540282213313, + "grad_norm": 0.8020423650741577, + "learning_rate": 0.00016504561106748023, + "loss": 2.915, + "step": 5791 + }, + { + "epoch": 0.5247446263957781, + "grad_norm": 0.7467533946037292, + "learning_rate": 0.00016503956986648946, + "loss": 2.6826, + "step": 5792 + }, + { + "epoch": 0.5248352245702249, + "grad_norm": 0.7775886058807373, + "learning_rate": 0.0001650335286654987, + "loss": 2.6206, + "step": 5793 + }, + { + "epoch": 0.5249258227446717, + "grad_norm": 0.6982570886611938, + "learning_rate": 0.00016502748746450796, + "loss": 2.0578, + "step": 5794 + }, + { + "epoch": 0.5250164209191185, + "grad_norm": 0.8176747560501099, + "learning_rate": 0.0001650214462635172, + "loss": 2.6137, + "step": 5795 + }, + { + "epoch": 0.5251070190935653, + "grad_norm": 0.7619027495384216, + "learning_rate": 0.00016501540506252645, + "loss": 2.5577, + "step": 5796 + }, + { + "epoch": 0.5251976172680121, + "grad_norm": 0.8105587959289551, + "learning_rate": 0.00016500936386153566, + "loss": 2.9099, + "step": 5797 + }, + { + "epoch": 0.5252882154424589, + "grad_norm": 0.7223200798034668, + "learning_rate": 0.00016500332266054492, + "loss": 2.8628, + "step": 5798 + }, + { + "epoch": 0.5253788136169056, + "grad_norm": 0.7393845319747925, + "learning_rate": 0.00016499728145955418, + "loss": 2.8065, + "step": 5799 + }, + { + "epoch": 0.5254694117913524, + "grad_norm": 0.7890602350234985, + "learning_rate": 0.00016499124025856342, + "loss": 3.0617, + "step": 5800 + }, + { + "epoch": 0.5255600099657992, + "grad_norm": 0.8333385586738586, + "learning_rate": 0.00016498519905757265, + "loss": 2.9002, + "step": 5801 + }, + { + "epoch": 0.525650608140246, + "grad_norm": 0.8604364991188049, + "learning_rate": 0.00016497915785658189, + "loss": 2.9137, + "step": 5802 + }, + { + "epoch": 0.5257412063146928, + "grad_norm": 0.7887088060379028, + "learning_rate": 0.00016497311665559115, + "loss": 2.8102, + "step": 5803 + }, + { + "epoch": 0.5258318044891396, + "grad_norm": 0.7783533930778503, + "learning_rate": 0.00016496707545460038, + "loss": 2.7609, + "step": 5804 + }, + { + "epoch": 0.5259224026635864, + "grad_norm": 0.7695934772491455, + "learning_rate": 0.00016496103425360961, + "loss": 2.7517, + "step": 5805 + }, + { + "epoch": 0.5260130008380332, + "grad_norm": 0.7747960090637207, + "learning_rate": 0.00016495499305261888, + "loss": 2.9962, + "step": 5806 + }, + { + "epoch": 0.5261035990124799, + "grad_norm": 0.7346100807189941, + "learning_rate": 0.0001649489518516281, + "loss": 2.7367, + "step": 5807 + }, + { + "epoch": 0.5261941971869267, + "grad_norm": 0.7297483682632446, + "learning_rate": 0.00016494291065063737, + "loss": 2.8138, + "step": 5808 + }, + { + "epoch": 0.5262847953613735, + "grad_norm": 0.7883182168006897, + "learning_rate": 0.0001649368694496466, + "loss": 2.8091, + "step": 5809 + }, + { + "epoch": 0.5263753935358203, + "grad_norm": 0.7247925996780396, + "learning_rate": 0.00016493082824865584, + "loss": 2.8433, + "step": 5810 + }, + { + "epoch": 0.5264659917102671, + "grad_norm": 0.8443247675895691, + "learning_rate": 0.00016492478704766507, + "loss": 2.6534, + "step": 5811 + }, + { + "epoch": 0.5265565898847139, + "grad_norm": 0.8089460730552673, + "learning_rate": 0.00016491874584667433, + "loss": 2.7807, + "step": 5812 + }, + { + "epoch": 0.5266471880591606, + "grad_norm": 0.816027045249939, + "learning_rate": 0.00016491270464568357, + "loss": 3.0911, + "step": 5813 + }, + { + "epoch": 0.5267377862336073, + "grad_norm": 0.808955729007721, + "learning_rate": 0.0001649066634446928, + "loss": 2.8561, + "step": 5814 + }, + { + "epoch": 0.5268283844080541, + "grad_norm": 0.7334979176521301, + "learning_rate": 0.00016490062224370206, + "loss": 2.7831, + "step": 5815 + }, + { + "epoch": 0.5269189825825009, + "grad_norm": 0.7614395618438721, + "learning_rate": 0.0001648945810427113, + "loss": 2.9836, + "step": 5816 + }, + { + "epoch": 0.5270095807569477, + "grad_norm": 0.7541818618774414, + "learning_rate": 0.00016488853984172056, + "loss": 2.887, + "step": 5817 + }, + { + "epoch": 0.5271001789313945, + "grad_norm": 0.7931084632873535, + "learning_rate": 0.00016488249864072977, + "loss": 2.6764, + "step": 5818 + }, + { + "epoch": 0.5271907771058413, + "grad_norm": 0.7468187212944031, + "learning_rate": 0.00016487645743973903, + "loss": 2.8801, + "step": 5819 + }, + { + "epoch": 0.5272813752802881, + "grad_norm": 0.7190799117088318, + "learning_rate": 0.0001648704162387483, + "loss": 2.7249, + "step": 5820 + }, + { + "epoch": 0.5273719734547349, + "grad_norm": 0.8043505549430847, + "learning_rate": 0.00016486437503775752, + "loss": 2.2248, + "step": 5821 + }, + { + "epoch": 0.5274625716291816, + "grad_norm": 0.8578393459320068, + "learning_rate": 0.00016485833383676676, + "loss": 2.9785, + "step": 5822 + }, + { + "epoch": 0.5275531698036284, + "grad_norm": 0.7980063557624817, + "learning_rate": 0.000164852292635776, + "loss": 2.9035, + "step": 5823 + }, + { + "epoch": 0.5276437679780752, + "grad_norm": 0.8123658299446106, + "learning_rate": 0.00016484625143478525, + "loss": 3.0818, + "step": 5824 + }, + { + "epoch": 0.527734366152522, + "grad_norm": 0.8496807217597961, + "learning_rate": 0.00016484021023379449, + "loss": 2.7593, + "step": 5825 + }, + { + "epoch": 0.5278249643269688, + "grad_norm": 0.8117905855178833, + "learning_rate": 0.00016483416903280372, + "loss": 2.7409, + "step": 5826 + }, + { + "epoch": 0.5279155625014156, + "grad_norm": 0.8170349597930908, + "learning_rate": 0.00016482812783181295, + "loss": 2.9015, + "step": 5827 + }, + { + "epoch": 0.5280061606758624, + "grad_norm": 0.7967211008071899, + "learning_rate": 0.00016482208663082221, + "loss": 2.897, + "step": 5828 + }, + { + "epoch": 0.5280967588503092, + "grad_norm": 0.7416833639144897, + "learning_rate": 0.00016481604542983148, + "loss": 2.8546, + "step": 5829 + }, + { + "epoch": 0.5281873570247559, + "grad_norm": 0.8382271528244019, + "learning_rate": 0.0001648100042288407, + "loss": 2.8958, + "step": 5830 + }, + { + "epoch": 0.5282779551992027, + "grad_norm": 0.8219386339187622, + "learning_rate": 0.00016480396302784994, + "loss": 2.9748, + "step": 5831 + }, + { + "epoch": 0.5283685533736495, + "grad_norm": 0.7365836501121521, + "learning_rate": 0.00016479792182685918, + "loss": 2.6633, + "step": 5832 + }, + { + "epoch": 0.5284591515480963, + "grad_norm": 0.7786422967910767, + "learning_rate": 0.00016479188062586844, + "loss": 2.7372, + "step": 5833 + }, + { + "epoch": 0.5285497497225431, + "grad_norm": 0.696376621723175, + "learning_rate": 0.00016478583942487767, + "loss": 2.1668, + "step": 5834 + }, + { + "epoch": 0.5286403478969899, + "grad_norm": 0.8148064613342285, + "learning_rate": 0.0001647797982238869, + "loss": 2.7443, + "step": 5835 + }, + { + "epoch": 0.5287309460714367, + "grad_norm": 0.737956702709198, + "learning_rate": 0.00016477375702289617, + "loss": 2.7454, + "step": 5836 + }, + { + "epoch": 0.5288215442458835, + "grad_norm": 0.6727933883666992, + "learning_rate": 0.0001647677158219054, + "loss": 2.1343, + "step": 5837 + }, + { + "epoch": 0.5289121424203302, + "grad_norm": 0.7531753778457642, + "learning_rate": 0.00016476167462091466, + "loss": 2.9574, + "step": 5838 + }, + { + "epoch": 0.529002740594777, + "grad_norm": 0.810757040977478, + "learning_rate": 0.00016475563341992387, + "loss": 3.1412, + "step": 5839 + }, + { + "epoch": 0.5290933387692238, + "grad_norm": 0.7123261094093323, + "learning_rate": 0.00016474959221893313, + "loss": 2.521, + "step": 5840 + }, + { + "epoch": 0.5291839369436706, + "grad_norm": 0.7522170543670654, + "learning_rate": 0.00016474355101794237, + "loss": 2.8756, + "step": 5841 + }, + { + "epoch": 0.5292745351181174, + "grad_norm": 0.7510632276535034, + "learning_rate": 0.00016473750981695163, + "loss": 2.8306, + "step": 5842 + }, + { + "epoch": 0.5293651332925642, + "grad_norm": 0.7737167477607727, + "learning_rate": 0.00016473146861596086, + "loss": 2.6374, + "step": 5843 + }, + { + "epoch": 0.529455731467011, + "grad_norm": 0.8179709911346436, + "learning_rate": 0.0001647254274149701, + "loss": 2.7084, + "step": 5844 + }, + { + "epoch": 0.5295463296414578, + "grad_norm": 0.7787980437278748, + "learning_rate": 0.00016471938621397936, + "loss": 2.8346, + "step": 5845 + }, + { + "epoch": 0.5296369278159045, + "grad_norm": 0.7218495607376099, + "learning_rate": 0.0001647133450129886, + "loss": 2.7991, + "step": 5846 + }, + { + "epoch": 0.5297275259903513, + "grad_norm": 0.7792003154754639, + "learning_rate": 0.00016470730381199782, + "loss": 2.9865, + "step": 5847 + }, + { + "epoch": 0.5298181241647981, + "grad_norm": 0.7663440108299255, + "learning_rate": 0.00016470126261100706, + "loss": 2.9197, + "step": 5848 + }, + { + "epoch": 0.5299087223392449, + "grad_norm": 0.7736614346504211, + "learning_rate": 0.00016469522141001632, + "loss": 2.7953, + "step": 5849 + }, + { + "epoch": 0.5299993205136917, + "grad_norm": 0.7872914671897888, + "learning_rate": 0.00016468918020902558, + "loss": 2.9763, + "step": 5850 + }, + { + "epoch": 0.5300899186881385, + "grad_norm": 0.7466636896133423, + "learning_rate": 0.00016468313900803481, + "loss": 2.8041, + "step": 5851 + }, + { + "epoch": 0.5301805168625853, + "grad_norm": 0.751136064529419, + "learning_rate": 0.00016467709780704405, + "loss": 2.2704, + "step": 5852 + }, + { + "epoch": 0.5302711150370321, + "grad_norm": 0.7613030672073364, + "learning_rate": 0.00016467105660605328, + "loss": 2.7737, + "step": 5853 + }, + { + "epoch": 0.5303617132114787, + "grad_norm": 0.8209874629974365, + "learning_rate": 0.00016466501540506254, + "loss": 2.9358, + "step": 5854 + }, + { + "epoch": 0.5304523113859255, + "grad_norm": 0.811555027961731, + "learning_rate": 0.00016465897420407178, + "loss": 2.9498, + "step": 5855 + }, + { + "epoch": 0.5305429095603723, + "grad_norm": 0.6645377278327942, + "learning_rate": 0.000164652933003081, + "loss": 2.0241, + "step": 5856 + }, + { + "epoch": 0.5306335077348191, + "grad_norm": 0.7775384187698364, + "learning_rate": 0.00016464689180209025, + "loss": 3.03, + "step": 5857 + }, + { + "epoch": 0.5307241059092659, + "grad_norm": 0.7698490023612976, + "learning_rate": 0.0001646408506010995, + "loss": 2.796, + "step": 5858 + }, + { + "epoch": 0.5308147040837127, + "grad_norm": 0.8355410099029541, + "learning_rate": 0.00016463480940010877, + "loss": 3.082, + "step": 5859 + }, + { + "epoch": 0.5309053022581595, + "grad_norm": 0.7756670117378235, + "learning_rate": 0.00016462876819911798, + "loss": 2.6291, + "step": 5860 + }, + { + "epoch": 0.5309959004326062, + "grad_norm": 0.7726898193359375, + "learning_rate": 0.00016462272699812724, + "loss": 2.8368, + "step": 5861 + }, + { + "epoch": 0.531086498607053, + "grad_norm": 0.7948160767555237, + "learning_rate": 0.00016461668579713647, + "loss": 2.8334, + "step": 5862 + }, + { + "epoch": 0.5311770967814998, + "grad_norm": 0.7605547308921814, + "learning_rate": 0.00016461064459614573, + "loss": 2.6697, + "step": 5863 + }, + { + "epoch": 0.5312676949559466, + "grad_norm": 0.7611485719680786, + "learning_rate": 0.00016460460339515497, + "loss": 2.8781, + "step": 5864 + }, + { + "epoch": 0.5313582931303934, + "grad_norm": 0.7749800682067871, + "learning_rate": 0.0001645985621941642, + "loss": 2.7139, + "step": 5865 + }, + { + "epoch": 0.5314488913048402, + "grad_norm": 0.7690423727035522, + "learning_rate": 0.00016459252099317346, + "loss": 2.7232, + "step": 5866 + }, + { + "epoch": 0.531539489479287, + "grad_norm": 0.7610759139060974, + "learning_rate": 0.0001645864797921827, + "loss": 2.9449, + "step": 5867 + }, + { + "epoch": 0.5316300876537338, + "grad_norm": 0.7499331831932068, + "learning_rate": 0.00016458043859119196, + "loss": 2.8366, + "step": 5868 + }, + { + "epoch": 0.5317206858281806, + "grad_norm": 0.7986124753952026, + "learning_rate": 0.00016457439739020116, + "loss": 3.0704, + "step": 5869 + }, + { + "epoch": 0.5318112840026273, + "grad_norm": 0.7006113529205322, + "learning_rate": 0.00016456835618921042, + "loss": 2.1247, + "step": 5870 + }, + { + "epoch": 0.5319018821770741, + "grad_norm": 0.7539955973625183, + "learning_rate": 0.00016456231498821966, + "loss": 2.785, + "step": 5871 + }, + { + "epoch": 0.5319924803515209, + "grad_norm": 0.7711989879608154, + "learning_rate": 0.00016455627378722892, + "loss": 2.7957, + "step": 5872 + }, + { + "epoch": 0.5320830785259677, + "grad_norm": 0.667991042137146, + "learning_rate": 0.00016455023258623815, + "loss": 2.0939, + "step": 5873 + }, + { + "epoch": 0.5321736767004145, + "grad_norm": 0.9111413955688477, + "learning_rate": 0.0001645441913852474, + "loss": 2.6441, + "step": 5874 + }, + { + "epoch": 0.5322642748748613, + "grad_norm": 0.7929190397262573, + "learning_rate": 0.00016453815018425665, + "loss": 2.9073, + "step": 5875 + }, + { + "epoch": 0.5323548730493081, + "grad_norm": 0.7797077894210815, + "learning_rate": 0.00016453210898326588, + "loss": 2.8123, + "step": 5876 + }, + { + "epoch": 0.5324454712237549, + "grad_norm": 0.7939873337745667, + "learning_rate": 0.00016452606778227512, + "loss": 3.0037, + "step": 5877 + }, + { + "epoch": 0.5325360693982016, + "grad_norm": 0.8070234656333923, + "learning_rate": 0.00016452002658128435, + "loss": 2.802, + "step": 5878 + }, + { + "epoch": 0.5326266675726484, + "grad_norm": 0.8013607859611511, + "learning_rate": 0.0001645139853802936, + "loss": 3.08, + "step": 5879 + }, + { + "epoch": 0.5327172657470952, + "grad_norm": 0.8176683187484741, + "learning_rate": 0.00016450794417930287, + "loss": 3.0263, + "step": 5880 + }, + { + "epoch": 0.532807863921542, + "grad_norm": 0.7466903924942017, + "learning_rate": 0.0001645019029783121, + "loss": 2.8536, + "step": 5881 + }, + { + "epoch": 0.5328984620959888, + "grad_norm": 0.7904576063156128, + "learning_rate": 0.00016449586177732134, + "loss": 2.7289, + "step": 5882 + }, + { + "epoch": 0.5329890602704356, + "grad_norm": 0.7902599573135376, + "learning_rate": 0.00016448982057633058, + "loss": 3.0018, + "step": 5883 + }, + { + "epoch": 0.5330796584448824, + "grad_norm": 0.8714344501495361, + "learning_rate": 0.00016448377937533984, + "loss": 2.1624, + "step": 5884 + }, + { + "epoch": 0.5331702566193292, + "grad_norm": 0.7940675616264343, + "learning_rate": 0.00016447773817434907, + "loss": 2.9381, + "step": 5885 + }, + { + "epoch": 0.5332608547937759, + "grad_norm": 0.7674645185470581, + "learning_rate": 0.0001644716969733583, + "loss": 2.7362, + "step": 5886 + }, + { + "epoch": 0.5333514529682227, + "grad_norm": 0.7393640875816345, + "learning_rate": 0.00016446565577236754, + "loss": 2.665, + "step": 5887 + }, + { + "epoch": 0.5334420511426695, + "grad_norm": 0.7977780103683472, + "learning_rate": 0.0001644596145713768, + "loss": 2.7083, + "step": 5888 + }, + { + "epoch": 0.5335326493171163, + "grad_norm": 0.8367840647697449, + "learning_rate": 0.00016445357337038606, + "loss": 2.8584, + "step": 5889 + }, + { + "epoch": 0.5336232474915631, + "grad_norm": 0.7505356073379517, + "learning_rate": 0.00016444753216939527, + "loss": 2.6512, + "step": 5890 + }, + { + "epoch": 0.5337138456660099, + "grad_norm": 0.7848244905471802, + "learning_rate": 0.00016444149096840453, + "loss": 2.5747, + "step": 5891 + }, + { + "epoch": 0.5338044438404567, + "grad_norm": 0.7619044780731201, + "learning_rate": 0.00016443544976741376, + "loss": 2.7549, + "step": 5892 + }, + { + "epoch": 0.5338950420149035, + "grad_norm": 0.8073681592941284, + "learning_rate": 0.00016442940856642302, + "loss": 2.8188, + "step": 5893 + }, + { + "epoch": 0.5339856401893501, + "grad_norm": 0.794746994972229, + "learning_rate": 0.00016442336736543226, + "loss": 2.7407, + "step": 5894 + }, + { + "epoch": 0.5340762383637969, + "grad_norm": 0.8163253664970398, + "learning_rate": 0.0001644173261644415, + "loss": 2.7756, + "step": 5895 + }, + { + "epoch": 0.5341668365382437, + "grad_norm": 0.83976811170578, + "learning_rate": 0.00016441128496345075, + "loss": 2.7837, + "step": 5896 + }, + { + "epoch": 0.5342574347126905, + "grad_norm": 0.7808994054794312, + "learning_rate": 0.00016440524376246, + "loss": 2.9875, + "step": 5897 + }, + { + "epoch": 0.5343480328871373, + "grad_norm": 0.7553728818893433, + "learning_rate": 0.00016439920256146922, + "loss": 2.5335, + "step": 5898 + }, + { + "epoch": 0.5344386310615841, + "grad_norm": 0.7666693329811096, + "learning_rate": 0.00016439316136047846, + "loss": 2.5995, + "step": 5899 + }, + { + "epoch": 0.5345292292360309, + "grad_norm": 0.7630746960639954, + "learning_rate": 0.00016438712015948772, + "loss": 2.8885, + "step": 5900 + }, + { + "epoch": 0.5346198274104776, + "grad_norm": 0.7696202993392944, + "learning_rate": 0.00016438107895849695, + "loss": 2.417, + "step": 5901 + }, + { + "epoch": 0.5347104255849244, + "grad_norm": 0.8004668354988098, + "learning_rate": 0.0001643750377575062, + "loss": 2.8314, + "step": 5902 + }, + { + "epoch": 0.5348010237593712, + "grad_norm": 0.8097230792045593, + "learning_rate": 0.00016436899655651545, + "loss": 2.801, + "step": 5903 + }, + { + "epoch": 0.534891621933818, + "grad_norm": 0.7569288611412048, + "learning_rate": 0.00016436295535552468, + "loss": 2.6024, + "step": 5904 + }, + { + "epoch": 0.5349822201082648, + "grad_norm": 0.6989501118659973, + "learning_rate": 0.00016435691415453394, + "loss": 2.0196, + "step": 5905 + }, + { + "epoch": 0.5350728182827116, + "grad_norm": 0.8008301258087158, + "learning_rate": 0.00016435087295354318, + "loss": 2.7774, + "step": 5906 + }, + { + "epoch": 0.5351634164571584, + "grad_norm": 0.7860605716705322, + "learning_rate": 0.0001643448317525524, + "loss": 2.7831, + "step": 5907 + }, + { + "epoch": 0.5352540146316052, + "grad_norm": 0.7790439128875732, + "learning_rate": 0.00016433879055156164, + "loss": 2.9258, + "step": 5908 + }, + { + "epoch": 0.535344612806052, + "grad_norm": 0.6422746181488037, + "learning_rate": 0.0001643327493505709, + "loss": 2.1189, + "step": 5909 + }, + { + "epoch": 0.5354352109804987, + "grad_norm": 0.7550760507583618, + "learning_rate": 0.00016432670814958017, + "loss": 2.7467, + "step": 5910 + }, + { + "epoch": 0.5355258091549455, + "grad_norm": 0.8347893357276917, + "learning_rate": 0.00016432066694858937, + "loss": 3.1426, + "step": 5911 + }, + { + "epoch": 0.5356164073293923, + "grad_norm": 0.736731231212616, + "learning_rate": 0.00016431462574759863, + "loss": 2.629, + "step": 5912 + }, + { + "epoch": 0.5357070055038391, + "grad_norm": 0.8759426474571228, + "learning_rate": 0.00016430858454660787, + "loss": 2.5921, + "step": 5913 + }, + { + "epoch": 0.5357976036782859, + "grad_norm": 0.7682211399078369, + "learning_rate": 0.00016430254334561713, + "loss": 2.7353, + "step": 5914 + }, + { + "epoch": 0.5358882018527327, + "grad_norm": 0.7206417918205261, + "learning_rate": 0.00016429650214462636, + "loss": 2.035, + "step": 5915 + }, + { + "epoch": 0.5359788000271795, + "grad_norm": 0.8494240045547485, + "learning_rate": 0.0001642904609436356, + "loss": 2.8497, + "step": 5916 + }, + { + "epoch": 0.5360693982016262, + "grad_norm": 0.7820627093315125, + "learning_rate": 0.00016428441974264483, + "loss": 3.0918, + "step": 5917 + }, + { + "epoch": 0.536159996376073, + "grad_norm": 0.7399786710739136, + "learning_rate": 0.0001642783785416541, + "loss": 2.7878, + "step": 5918 + }, + { + "epoch": 0.5362505945505198, + "grad_norm": 0.7022741436958313, + "learning_rate": 0.00016427233734066335, + "loss": 2.0811, + "step": 5919 + }, + { + "epoch": 0.5363411927249666, + "grad_norm": 0.8070629239082336, + "learning_rate": 0.00016426629613967256, + "loss": 2.8538, + "step": 5920 + }, + { + "epoch": 0.5364317908994134, + "grad_norm": 0.7599188089370728, + "learning_rate": 0.00016426025493868182, + "loss": 2.6364, + "step": 5921 + }, + { + "epoch": 0.5365223890738602, + "grad_norm": 0.7621845602989197, + "learning_rate": 0.00016425421373769106, + "loss": 2.9964, + "step": 5922 + }, + { + "epoch": 0.536612987248307, + "grad_norm": 0.7614776492118835, + "learning_rate": 0.00016424817253670032, + "loss": 2.683, + "step": 5923 + }, + { + "epoch": 0.5367035854227538, + "grad_norm": 0.8209134340286255, + "learning_rate": 0.00016424213133570952, + "loss": 3.3488, + "step": 5924 + }, + { + "epoch": 0.5367941835972005, + "grad_norm": 0.8493196368217468, + "learning_rate": 0.00016423609013471878, + "loss": 2.7604, + "step": 5925 + }, + { + "epoch": 0.5368847817716473, + "grad_norm": 0.7399532794952393, + "learning_rate": 0.00016423004893372805, + "loss": 2.9408, + "step": 5926 + }, + { + "epoch": 0.5369753799460941, + "grad_norm": 0.7820083498954773, + "learning_rate": 0.00016422400773273728, + "loss": 3.0347, + "step": 5927 + }, + { + "epoch": 0.5370659781205409, + "grad_norm": 0.7633047103881836, + "learning_rate": 0.00016421796653174651, + "loss": 2.7401, + "step": 5928 + }, + { + "epoch": 0.5371565762949877, + "grad_norm": 0.6799317002296448, + "learning_rate": 0.00016421192533075575, + "loss": 2.1682, + "step": 5929 + }, + { + "epoch": 0.5372471744694345, + "grad_norm": 0.7610468864440918, + "learning_rate": 0.000164205884129765, + "loss": 2.8807, + "step": 5930 + }, + { + "epoch": 0.5373377726438813, + "grad_norm": 0.7473232746124268, + "learning_rate": 0.00016419984292877424, + "loss": 2.7545, + "step": 5931 + }, + { + "epoch": 0.5374283708183281, + "grad_norm": 0.771636426448822, + "learning_rate": 0.0001641938017277835, + "loss": 2.9715, + "step": 5932 + }, + { + "epoch": 0.5375189689927748, + "grad_norm": 0.7137282490730286, + "learning_rate": 0.00016418776052679274, + "loss": 2.8375, + "step": 5933 + }, + { + "epoch": 0.5376095671672216, + "grad_norm": 0.727220356464386, + "learning_rate": 0.00016418171932580197, + "loss": 2.7321, + "step": 5934 + }, + { + "epoch": 0.5377001653416683, + "grad_norm": 0.7273107767105103, + "learning_rate": 0.00016417567812481123, + "loss": 2.6909, + "step": 5935 + }, + { + "epoch": 0.5377907635161151, + "grad_norm": 0.767292320728302, + "learning_rate": 0.00016416963692382047, + "loss": 3.0173, + "step": 5936 + }, + { + "epoch": 0.5378813616905619, + "grad_norm": 0.7486748099327087, + "learning_rate": 0.0001641635957228297, + "loss": 2.5089, + "step": 5937 + }, + { + "epoch": 0.5379719598650087, + "grad_norm": 0.7986935377120972, + "learning_rate": 0.00016415755452183894, + "loss": 2.888, + "step": 5938 + }, + { + "epoch": 0.5380625580394555, + "grad_norm": 0.7429930567741394, + "learning_rate": 0.0001641515133208482, + "loss": 2.7997, + "step": 5939 + }, + { + "epoch": 0.5381531562139023, + "grad_norm": 0.6547744870185852, + "learning_rate": 0.00016414547211985746, + "loss": 2.3235, + "step": 5940 + }, + { + "epoch": 0.538243754388349, + "grad_norm": 0.760724663734436, + "learning_rate": 0.00016413943091886667, + "loss": 2.745, + "step": 5941 + }, + { + "epoch": 0.5383343525627958, + "grad_norm": 0.7762205004692078, + "learning_rate": 0.00016413338971787593, + "loss": 2.9378, + "step": 5942 + }, + { + "epoch": 0.5384249507372426, + "grad_norm": 0.7662271857261658, + "learning_rate": 0.00016412734851688516, + "loss": 2.7522, + "step": 5943 + }, + { + "epoch": 0.5385155489116894, + "grad_norm": 0.7994664311408997, + "learning_rate": 0.00016412130731589442, + "loss": 2.9504, + "step": 5944 + }, + { + "epoch": 0.5386061470861362, + "grad_norm": 0.788619875907898, + "learning_rate": 0.00016411526611490366, + "loss": 2.8862, + "step": 5945 + }, + { + "epoch": 0.538696745260583, + "grad_norm": 0.7972175478935242, + "learning_rate": 0.0001641092249139129, + "loss": 3.1342, + "step": 5946 + }, + { + "epoch": 0.5387873434350298, + "grad_norm": 0.8594028353691101, + "learning_rate": 0.00016410318371292212, + "loss": 2.7881, + "step": 5947 + }, + { + "epoch": 0.5388779416094766, + "grad_norm": 0.7613884806632996, + "learning_rate": 0.00016409714251193138, + "loss": 2.6779, + "step": 5948 + }, + { + "epoch": 0.5389685397839233, + "grad_norm": 0.7938780784606934, + "learning_rate": 0.00016409110131094062, + "loss": 2.9171, + "step": 5949 + }, + { + "epoch": 0.5390591379583701, + "grad_norm": 0.75136798620224, + "learning_rate": 0.00016408506010994985, + "loss": 2.679, + "step": 5950 + }, + { + "epoch": 0.5391497361328169, + "grad_norm": 0.7651563882827759, + "learning_rate": 0.00016407901890895911, + "loss": 2.7844, + "step": 5951 + }, + { + "epoch": 0.5392403343072637, + "grad_norm": 0.6774317026138306, + "learning_rate": 0.00016407297770796835, + "loss": 1.9745, + "step": 5952 + }, + { + "epoch": 0.5393309324817105, + "grad_norm": 0.7718104720115662, + "learning_rate": 0.0001640669365069776, + "loss": 2.9358, + "step": 5953 + }, + { + "epoch": 0.5394215306561573, + "grad_norm": 0.7979744672775269, + "learning_rate": 0.00016406089530598682, + "loss": 2.7016, + "step": 5954 + }, + { + "epoch": 0.5395121288306041, + "grad_norm": 0.8062502145767212, + "learning_rate": 0.00016405485410499608, + "loss": 2.9239, + "step": 5955 + }, + { + "epoch": 0.5396027270050509, + "grad_norm": 0.8228276371955872, + "learning_rate": 0.00016404881290400534, + "loss": 2.6167, + "step": 5956 + }, + { + "epoch": 0.5396933251794976, + "grad_norm": 0.7508929967880249, + "learning_rate": 0.00016404277170301457, + "loss": 2.5868, + "step": 5957 + }, + { + "epoch": 0.5397839233539444, + "grad_norm": 0.7004851698875427, + "learning_rate": 0.0001640367305020238, + "loss": 2.3448, + "step": 5958 + }, + { + "epoch": 0.5398745215283912, + "grad_norm": 0.7788493633270264, + "learning_rate": 0.00016403068930103304, + "loss": 2.5445, + "step": 5959 + }, + { + "epoch": 0.539965119702838, + "grad_norm": 0.8437797427177429, + "learning_rate": 0.0001640246481000423, + "loss": 2.9666, + "step": 5960 + }, + { + "epoch": 0.5400557178772848, + "grad_norm": 0.7431809306144714, + "learning_rate": 0.00016401860689905154, + "loss": 2.8767, + "step": 5961 + }, + { + "epoch": 0.5401463160517316, + "grad_norm": 0.6578434109687805, + "learning_rate": 0.00016401256569806077, + "loss": 1.9777, + "step": 5962 + }, + { + "epoch": 0.5402369142261784, + "grad_norm": 0.6600267887115479, + "learning_rate": 0.00016400652449707003, + "loss": 2.1811, + "step": 5963 + }, + { + "epoch": 0.5403275124006252, + "grad_norm": 0.8163760304450989, + "learning_rate": 0.00016400048329607927, + "loss": 2.7893, + "step": 5964 + }, + { + "epoch": 0.5404181105750719, + "grad_norm": 0.9903095960617065, + "learning_rate": 0.00016399444209508853, + "loss": 2.7576, + "step": 5965 + }, + { + "epoch": 0.5405087087495187, + "grad_norm": 0.8250631093978882, + "learning_rate": 0.00016398840089409776, + "loss": 2.8286, + "step": 5966 + }, + { + "epoch": 0.5405993069239655, + "grad_norm": 0.7625171542167664, + "learning_rate": 0.000163982359693107, + "loss": 2.5734, + "step": 5967 + }, + { + "epoch": 0.5406899050984123, + "grad_norm": 0.8026340007781982, + "learning_rate": 0.00016397631849211623, + "loss": 2.8434, + "step": 5968 + }, + { + "epoch": 0.5407805032728591, + "grad_norm": 0.8000808358192444, + "learning_rate": 0.0001639702772911255, + "loss": 2.2422, + "step": 5969 + }, + { + "epoch": 0.5408711014473059, + "grad_norm": 0.7694547176361084, + "learning_rate": 0.00016396423609013472, + "loss": 2.8137, + "step": 5970 + }, + { + "epoch": 0.5409616996217527, + "grad_norm": 0.7535328269004822, + "learning_rate": 0.00016395819488914396, + "loss": 2.7554, + "step": 5971 + }, + { + "epoch": 0.5410522977961995, + "grad_norm": 0.6721504330635071, + "learning_rate": 0.00016395215368815322, + "loss": 2.0488, + "step": 5972 + }, + { + "epoch": 0.5411428959706462, + "grad_norm": 0.7878772616386414, + "learning_rate": 0.00016394611248716245, + "loss": 2.814, + "step": 5973 + }, + { + "epoch": 0.541233494145093, + "grad_norm": 0.7763349413871765, + "learning_rate": 0.00016394007128617171, + "loss": 2.9673, + "step": 5974 + }, + { + "epoch": 0.5413240923195397, + "grad_norm": 0.7524158358573914, + "learning_rate": 0.00016393403008518092, + "loss": 2.3268, + "step": 5975 + }, + { + "epoch": 0.5414146904939865, + "grad_norm": 0.8234326243400574, + "learning_rate": 0.00016392798888419018, + "loss": 2.6451, + "step": 5976 + }, + { + "epoch": 0.5415052886684333, + "grad_norm": 0.7684634923934937, + "learning_rate": 0.00016392194768319942, + "loss": 2.8145, + "step": 5977 + }, + { + "epoch": 0.5415958868428801, + "grad_norm": 0.752798318862915, + "learning_rate": 0.00016391590648220868, + "loss": 2.7851, + "step": 5978 + }, + { + "epoch": 0.5416864850173269, + "grad_norm": 0.7257302403450012, + "learning_rate": 0.0001639098652812179, + "loss": 2.6356, + "step": 5979 + }, + { + "epoch": 0.5417770831917736, + "grad_norm": 0.6989591121673584, + "learning_rate": 0.00016390382408022715, + "loss": 2.148, + "step": 5980 + }, + { + "epoch": 0.5418676813662204, + "grad_norm": 0.797608494758606, + "learning_rate": 0.0001638977828792364, + "loss": 3.0926, + "step": 5981 + }, + { + "epoch": 0.5419582795406672, + "grad_norm": 0.8449243307113647, + "learning_rate": 0.00016389174167824564, + "loss": 2.6913, + "step": 5982 + }, + { + "epoch": 0.542048877715114, + "grad_norm": 0.7667732834815979, + "learning_rate": 0.0001638857004772549, + "loss": 2.7807, + "step": 5983 + }, + { + "epoch": 0.5421394758895608, + "grad_norm": 0.7211959362030029, + "learning_rate": 0.0001638796592762641, + "loss": 1.8871, + "step": 5984 + }, + { + "epoch": 0.5422300740640076, + "grad_norm": 0.7860173583030701, + "learning_rate": 0.00016387361807527337, + "loss": 2.9521, + "step": 5985 + }, + { + "epoch": 0.5423206722384544, + "grad_norm": 0.8462398052215576, + "learning_rate": 0.00016386757687428263, + "loss": 2.701, + "step": 5986 + }, + { + "epoch": 0.5424112704129012, + "grad_norm": 0.7628206610679626, + "learning_rate": 0.00016386153567329187, + "loss": 3.009, + "step": 5987 + }, + { + "epoch": 0.542501868587348, + "grad_norm": 0.761093258857727, + "learning_rate": 0.0001638554944723011, + "loss": 2.8167, + "step": 5988 + }, + { + "epoch": 0.5425924667617947, + "grad_norm": 0.7654920816421509, + "learning_rate": 0.00016384945327131033, + "loss": 2.6583, + "step": 5989 + }, + { + "epoch": 0.5426830649362415, + "grad_norm": 0.7280387282371521, + "learning_rate": 0.0001638434120703196, + "loss": 2.6908, + "step": 5990 + }, + { + "epoch": 0.5427736631106883, + "grad_norm": 0.7690613865852356, + "learning_rate": 0.00016383737086932883, + "loss": 2.7402, + "step": 5991 + }, + { + "epoch": 0.5428642612851351, + "grad_norm": 0.7907320857048035, + "learning_rate": 0.00016383132966833806, + "loss": 2.7898, + "step": 5992 + }, + { + "epoch": 0.5429548594595819, + "grad_norm": 0.7246724367141724, + "learning_rate": 0.00016382528846734732, + "loss": 2.8166, + "step": 5993 + }, + { + "epoch": 0.5430454576340287, + "grad_norm": 0.7980607151985168, + "learning_rate": 0.00016381924726635656, + "loss": 2.5174, + "step": 5994 + }, + { + "epoch": 0.5431360558084755, + "grad_norm": 0.7892653942108154, + "learning_rate": 0.00016381320606536582, + "loss": 2.9212, + "step": 5995 + }, + { + "epoch": 0.5432266539829222, + "grad_norm": 0.7590755224227905, + "learning_rate": 0.00016380716486437505, + "loss": 2.7648, + "step": 5996 + }, + { + "epoch": 0.543317252157369, + "grad_norm": 0.7696068286895752, + "learning_rate": 0.0001638011236633843, + "loss": 3.0289, + "step": 5997 + }, + { + "epoch": 0.5434078503318158, + "grad_norm": 0.8114903569221497, + "learning_rate": 0.00016379508246239352, + "loss": 3.0931, + "step": 5998 + }, + { + "epoch": 0.5434984485062626, + "grad_norm": 0.8486101031303406, + "learning_rate": 0.00016378904126140278, + "loss": 2.7005, + "step": 5999 + }, + { + "epoch": 0.5435890466807094, + "grad_norm": 0.7712904810905457, + "learning_rate": 0.00016378300006041202, + "loss": 3.2335, + "step": 6000 + }, + { + "epoch": 0.5436796448551562, + "grad_norm": 0.6863800287246704, + "learning_rate": 0.00016377695885942125, + "loss": 2.1791, + "step": 6001 + }, + { + "epoch": 0.543770243029603, + "grad_norm": 0.6917300820350647, + "learning_rate": 0.0001637709176584305, + "loss": 2.0772, + "step": 6002 + }, + { + "epoch": 0.5438608412040498, + "grad_norm": 0.7521175742149353, + "learning_rate": 0.00016376487645743975, + "loss": 2.7285, + "step": 6003 + }, + { + "epoch": 0.5439514393784965, + "grad_norm": 0.8785490989685059, + "learning_rate": 0.000163758835256449, + "loss": 2.7236, + "step": 6004 + }, + { + "epoch": 0.5440420375529433, + "grad_norm": 0.7851468920707703, + "learning_rate": 0.00016375279405545821, + "loss": 2.9679, + "step": 6005 + }, + { + "epoch": 0.5441326357273901, + "grad_norm": 0.786919355392456, + "learning_rate": 0.00016374675285446748, + "loss": 2.6864, + "step": 6006 + }, + { + "epoch": 0.5442232339018369, + "grad_norm": 0.659155547618866, + "learning_rate": 0.0001637407116534767, + "loss": 2.0361, + "step": 6007 + }, + { + "epoch": 0.5443138320762837, + "grad_norm": 0.8397767543792725, + "learning_rate": 0.00016373467045248597, + "loss": 2.98, + "step": 6008 + }, + { + "epoch": 0.5444044302507305, + "grad_norm": 0.8041370511054993, + "learning_rate": 0.0001637286292514952, + "loss": 2.5801, + "step": 6009 + }, + { + "epoch": 0.5444950284251773, + "grad_norm": 0.8039294481277466, + "learning_rate": 0.00016372258805050444, + "loss": 2.7742, + "step": 6010 + }, + { + "epoch": 0.5445856265996241, + "grad_norm": 0.820553183555603, + "learning_rate": 0.0001637165468495137, + "loss": 2.7899, + "step": 6011 + }, + { + "epoch": 0.5446762247740708, + "grad_norm": 0.8493404984474182, + "learning_rate": 0.00016371050564852293, + "loss": 2.8208, + "step": 6012 + }, + { + "epoch": 0.5447668229485176, + "grad_norm": 0.8028607964515686, + "learning_rate": 0.00016370446444753217, + "loss": 2.9781, + "step": 6013 + }, + { + "epoch": 0.5448574211229644, + "grad_norm": 0.7472384572029114, + "learning_rate": 0.0001636984232465414, + "loss": 2.666, + "step": 6014 + }, + { + "epoch": 0.5449480192974112, + "grad_norm": 0.7758143544197083, + "learning_rate": 0.00016369238204555066, + "loss": 3.0482, + "step": 6015 + }, + { + "epoch": 0.5450386174718579, + "grad_norm": 0.806374192237854, + "learning_rate": 0.00016368634084455992, + "loss": 2.8576, + "step": 6016 + }, + { + "epoch": 0.5451292156463047, + "grad_norm": 0.9006339311599731, + "learning_rate": 0.00016368029964356916, + "loss": 2.8085, + "step": 6017 + }, + { + "epoch": 0.5452198138207515, + "grad_norm": 0.7699541449546814, + "learning_rate": 0.0001636742584425784, + "loss": 2.6936, + "step": 6018 + }, + { + "epoch": 0.5453104119951983, + "grad_norm": 0.7732337713241577, + "learning_rate": 0.00016366821724158763, + "loss": 2.534, + "step": 6019 + }, + { + "epoch": 0.545401010169645, + "grad_norm": 0.6932775974273682, + "learning_rate": 0.0001636621760405969, + "loss": 2.0695, + "step": 6020 + }, + { + "epoch": 0.5454916083440918, + "grad_norm": 0.8066011071205139, + "learning_rate": 0.00016365613483960612, + "loss": 2.836, + "step": 6021 + }, + { + "epoch": 0.5455822065185386, + "grad_norm": 0.7834087610244751, + "learning_rate": 0.00016365009363861536, + "loss": 3.0293, + "step": 6022 + }, + { + "epoch": 0.5456728046929854, + "grad_norm": 0.7746036648750305, + "learning_rate": 0.00016364405243762462, + "loss": 2.8583, + "step": 6023 + }, + { + "epoch": 0.5457634028674322, + "grad_norm": 0.7967986464500427, + "learning_rate": 0.00016363801123663385, + "loss": 2.8976, + "step": 6024 + }, + { + "epoch": 0.545854001041879, + "grad_norm": 0.7950435876846313, + "learning_rate": 0.0001636319700356431, + "loss": 2.6055, + "step": 6025 + }, + { + "epoch": 0.5459445992163258, + "grad_norm": 0.7570753693580627, + "learning_rate": 0.00016362592883465232, + "loss": 2.7291, + "step": 6026 + }, + { + "epoch": 0.5460351973907726, + "grad_norm": 0.6386801600456238, + "learning_rate": 0.00016361988763366158, + "loss": 2.2184, + "step": 6027 + }, + { + "epoch": 0.5461257955652193, + "grad_norm": 0.7840952277183533, + "learning_rate": 0.00016361384643267081, + "loss": 2.7379, + "step": 6028 + }, + { + "epoch": 0.5462163937396661, + "grad_norm": 0.7609872817993164, + "learning_rate": 0.00016360780523168008, + "loss": 2.7888, + "step": 6029 + }, + { + "epoch": 0.5463069919141129, + "grad_norm": 0.7242224216461182, + "learning_rate": 0.0001636017640306893, + "loss": 2.8769, + "step": 6030 + }, + { + "epoch": 0.5463975900885597, + "grad_norm": 0.6762561202049255, + "learning_rate": 0.00016359572282969854, + "loss": 1.8972, + "step": 6031 + }, + { + "epoch": 0.5464881882630065, + "grad_norm": 0.8269873857498169, + "learning_rate": 0.0001635896816287078, + "loss": 2.9532, + "step": 6032 + }, + { + "epoch": 0.5465787864374533, + "grad_norm": 0.7746034860610962, + "learning_rate": 0.00016358364042771704, + "loss": 2.6347, + "step": 6033 + }, + { + "epoch": 0.5466693846119001, + "grad_norm": 0.6772369742393494, + "learning_rate": 0.00016357759922672627, + "loss": 1.9906, + "step": 6034 + }, + { + "epoch": 0.5467599827863469, + "grad_norm": 0.8104320764541626, + "learning_rate": 0.0001635715580257355, + "loss": 3.0602, + "step": 6035 + }, + { + "epoch": 0.5468505809607936, + "grad_norm": 0.7418733239173889, + "learning_rate": 0.00016356551682474477, + "loss": 2.6542, + "step": 6036 + }, + { + "epoch": 0.5469411791352404, + "grad_norm": 0.8161481022834778, + "learning_rate": 0.000163559475623754, + "loss": 2.9902, + "step": 6037 + }, + { + "epoch": 0.5470317773096872, + "grad_norm": 0.7878612279891968, + "learning_rate": 0.00016355343442276326, + "loss": 2.8696, + "step": 6038 + }, + { + "epoch": 0.547122375484134, + "grad_norm": 0.7599819898605347, + "learning_rate": 0.0001635473932217725, + "loss": 2.8101, + "step": 6039 + }, + { + "epoch": 0.5472129736585808, + "grad_norm": 0.7854194641113281, + "learning_rate": 0.00016354135202078173, + "loss": 2.7561, + "step": 6040 + }, + { + "epoch": 0.5473035718330276, + "grad_norm": 0.7968162894248962, + "learning_rate": 0.000163535310819791, + "loss": 2.9389, + "step": 6041 + }, + { + "epoch": 0.5473941700074744, + "grad_norm": 0.8403955698013306, + "learning_rate": 0.00016352926961880023, + "loss": 2.9706, + "step": 6042 + }, + { + "epoch": 0.5474847681819212, + "grad_norm": 0.6956443190574646, + "learning_rate": 0.00016352322841780946, + "loss": 2.743, + "step": 6043 + }, + { + "epoch": 0.5475753663563679, + "grad_norm": 0.7717607617378235, + "learning_rate": 0.0001635171872168187, + "loss": 2.882, + "step": 6044 + }, + { + "epoch": 0.5476659645308147, + "grad_norm": 0.7759180068969727, + "learning_rate": 0.00016351114601582796, + "loss": 2.8318, + "step": 6045 + }, + { + "epoch": 0.5477565627052615, + "grad_norm": 0.7752950191497803, + "learning_rate": 0.00016350510481483722, + "loss": 2.7426, + "step": 6046 + }, + { + "epoch": 0.5478471608797083, + "grad_norm": 0.8043593764305115, + "learning_rate": 0.00016349906361384642, + "loss": 2.5999, + "step": 6047 + }, + { + "epoch": 0.5479377590541551, + "grad_norm": 0.778914749622345, + "learning_rate": 0.00016349302241285568, + "loss": 2.8639, + "step": 6048 + }, + { + "epoch": 0.5480283572286019, + "grad_norm": 0.7827397584915161, + "learning_rate": 0.00016348698121186492, + "loss": 2.4778, + "step": 6049 + }, + { + "epoch": 0.5481189554030487, + "grad_norm": 0.7892813682556152, + "learning_rate": 0.00016348094001087418, + "loss": 2.8566, + "step": 6050 + }, + { + "epoch": 0.5482095535774955, + "grad_norm": 0.7457934617996216, + "learning_rate": 0.00016347489880988341, + "loss": 2.8494, + "step": 6051 + }, + { + "epoch": 0.5483001517519422, + "grad_norm": 0.7352835536003113, + "learning_rate": 0.00016346885760889265, + "loss": 2.9513, + "step": 6052 + }, + { + "epoch": 0.548390749926389, + "grad_norm": 0.7162964344024658, + "learning_rate": 0.0001634628164079019, + "loss": 2.2658, + "step": 6053 + }, + { + "epoch": 0.5484813481008358, + "grad_norm": 0.7449924945831299, + "learning_rate": 0.00016345677520691114, + "loss": 3.0304, + "step": 6054 + }, + { + "epoch": 0.5485719462752826, + "grad_norm": 0.6794970035552979, + "learning_rate": 0.0001634507340059204, + "loss": 2.1866, + "step": 6055 + }, + { + "epoch": 0.5486625444497293, + "grad_norm": 0.6880223155021667, + "learning_rate": 0.0001634446928049296, + "loss": 2.1507, + "step": 6056 + }, + { + "epoch": 0.5487531426241761, + "grad_norm": 0.7457188963890076, + "learning_rate": 0.00016343865160393887, + "loss": 2.7958, + "step": 6057 + }, + { + "epoch": 0.5488437407986229, + "grad_norm": 0.7793325781822205, + "learning_rate": 0.0001634326104029481, + "loss": 3.0926, + "step": 6058 + }, + { + "epoch": 0.5489343389730696, + "grad_norm": 0.8188062310218811, + "learning_rate": 0.00016342656920195737, + "loss": 2.8596, + "step": 6059 + }, + { + "epoch": 0.5490249371475164, + "grad_norm": 0.7878708839416504, + "learning_rate": 0.0001634205280009666, + "loss": 2.6649, + "step": 6060 + }, + { + "epoch": 0.5491155353219632, + "grad_norm": 0.7527418732643127, + "learning_rate": 0.00016341448679997584, + "loss": 2.8024, + "step": 6061 + }, + { + "epoch": 0.54920613349641, + "grad_norm": 0.6875285506248474, + "learning_rate": 0.0001634084455989851, + "loss": 2.1585, + "step": 6062 + }, + { + "epoch": 0.5492967316708568, + "grad_norm": 0.8431931734085083, + "learning_rate": 0.00016340240439799433, + "loss": 2.3718, + "step": 6063 + }, + { + "epoch": 0.5493873298453036, + "grad_norm": 0.7858365178108215, + "learning_rate": 0.00016339636319700357, + "loss": 2.7134, + "step": 6064 + }, + { + "epoch": 0.5494779280197504, + "grad_norm": 0.756395161151886, + "learning_rate": 0.0001633903219960128, + "loss": 2.9098, + "step": 6065 + }, + { + "epoch": 0.5495685261941972, + "grad_norm": 0.74216628074646, + "learning_rate": 0.00016338428079502206, + "loss": 2.7637, + "step": 6066 + }, + { + "epoch": 0.549659124368644, + "grad_norm": 0.800971508026123, + "learning_rate": 0.0001633782395940313, + "loss": 3.019, + "step": 6067 + }, + { + "epoch": 0.5497497225430907, + "grad_norm": 0.7574571967124939, + "learning_rate": 0.00016337219839304056, + "loss": 2.7077, + "step": 6068 + }, + { + "epoch": 0.5498403207175375, + "grad_norm": 0.8317729830741882, + "learning_rate": 0.0001633661571920498, + "loss": 2.7903, + "step": 6069 + }, + { + "epoch": 0.5499309188919843, + "grad_norm": 0.824051558971405, + "learning_rate": 0.00016336011599105902, + "loss": 2.8768, + "step": 6070 + }, + { + "epoch": 0.5500215170664311, + "grad_norm": 0.7327190637588501, + "learning_rate": 0.00016335407479006828, + "loss": 2.8044, + "step": 6071 + }, + { + "epoch": 0.5501121152408779, + "grad_norm": 0.7725288271903992, + "learning_rate": 0.00016334803358907752, + "loss": 2.8791, + "step": 6072 + }, + { + "epoch": 0.5502027134153247, + "grad_norm": 0.7613720297813416, + "learning_rate": 0.00016334199238808675, + "loss": 2.9432, + "step": 6073 + }, + { + "epoch": 0.5502933115897715, + "grad_norm": 0.7782252430915833, + "learning_rate": 0.000163335951187096, + "loss": 3.0299, + "step": 6074 + }, + { + "epoch": 0.5503839097642182, + "grad_norm": 0.8583107590675354, + "learning_rate": 0.00016332990998610525, + "loss": 2.6257, + "step": 6075 + }, + { + "epoch": 0.550474507938665, + "grad_norm": 0.9111669063568115, + "learning_rate": 0.0001633238687851145, + "loss": 2.8103, + "step": 6076 + }, + { + "epoch": 0.5505651061131118, + "grad_norm": 0.6613486409187317, + "learning_rate": 0.00016331782758412372, + "loss": 2.2464, + "step": 6077 + }, + { + "epoch": 0.5506557042875586, + "grad_norm": 0.7557885050773621, + "learning_rate": 0.00016331178638313298, + "loss": 2.8222, + "step": 6078 + }, + { + "epoch": 0.5507463024620054, + "grad_norm": 0.7906992435455322, + "learning_rate": 0.0001633057451821422, + "loss": 2.7844, + "step": 6079 + }, + { + "epoch": 0.5508369006364522, + "grad_norm": 0.7571853995323181, + "learning_rate": 0.00016329970398115147, + "loss": 2.8088, + "step": 6080 + }, + { + "epoch": 0.550927498810899, + "grad_norm": 0.7717328667640686, + "learning_rate": 0.0001632936627801607, + "loss": 2.6979, + "step": 6081 + }, + { + "epoch": 0.5510180969853458, + "grad_norm": 0.8604846596717834, + "learning_rate": 0.00016328762157916994, + "loss": 2.8489, + "step": 6082 + }, + { + "epoch": 0.5511086951597925, + "grad_norm": 0.8074079155921936, + "learning_rate": 0.0001632815803781792, + "loss": 2.7272, + "step": 6083 + }, + { + "epoch": 0.5511992933342393, + "grad_norm": 0.7863456606864929, + "learning_rate": 0.00016327553917718844, + "loss": 3.0715, + "step": 6084 + }, + { + "epoch": 0.5512898915086861, + "grad_norm": 0.8495341539382935, + "learning_rate": 0.00016326949797619767, + "loss": 3.0031, + "step": 6085 + }, + { + "epoch": 0.5513804896831329, + "grad_norm": 0.8246411681175232, + "learning_rate": 0.0001632634567752069, + "loss": 2.9212, + "step": 6086 + }, + { + "epoch": 0.5514710878575797, + "grad_norm": 0.7230509519577026, + "learning_rate": 0.00016325741557421617, + "loss": 2.2519, + "step": 6087 + }, + { + "epoch": 0.5515616860320265, + "grad_norm": 0.8659189343452454, + "learning_rate": 0.0001632513743732254, + "loss": 2.7533, + "step": 6088 + }, + { + "epoch": 0.5516522842064733, + "grad_norm": 0.7307212352752686, + "learning_rate": 0.00016324533317223466, + "loss": 2.8833, + "step": 6089 + }, + { + "epoch": 0.5517428823809201, + "grad_norm": 0.7988254427909851, + "learning_rate": 0.0001632392919712439, + "loss": 2.8948, + "step": 6090 + }, + { + "epoch": 0.5518334805553669, + "grad_norm": 0.7466258406639099, + "learning_rate": 0.00016323325077025313, + "loss": 3.1149, + "step": 6091 + }, + { + "epoch": 0.5519240787298136, + "grad_norm": 0.7458983063697815, + "learning_rate": 0.0001632272095692624, + "loss": 2.6463, + "step": 6092 + }, + { + "epoch": 0.5520146769042604, + "grad_norm": 0.7210414409637451, + "learning_rate": 0.00016322116836827162, + "loss": 2.4085, + "step": 6093 + }, + { + "epoch": 0.5521052750787072, + "grad_norm": 0.7596874237060547, + "learning_rate": 0.00016321512716728086, + "loss": 3.1261, + "step": 6094 + }, + { + "epoch": 0.552195873253154, + "grad_norm": 0.7595881223678589, + "learning_rate": 0.0001632090859662901, + "loss": 2.6557, + "step": 6095 + }, + { + "epoch": 0.5522864714276008, + "grad_norm": 0.7748221158981323, + "learning_rate": 0.00016320304476529935, + "loss": 2.9194, + "step": 6096 + }, + { + "epoch": 0.5523770696020475, + "grad_norm": 0.7419552206993103, + "learning_rate": 0.0001631970035643086, + "loss": 2.9567, + "step": 6097 + }, + { + "epoch": 0.5524676677764943, + "grad_norm": 0.8029980659484863, + "learning_rate": 0.00016319096236331782, + "loss": 2.5969, + "step": 6098 + }, + { + "epoch": 0.552558265950941, + "grad_norm": 0.8517844676971436, + "learning_rate": 0.00016318492116232708, + "loss": 2.8816, + "step": 6099 + }, + { + "epoch": 0.5526488641253878, + "grad_norm": 0.8219412565231323, + "learning_rate": 0.00016317887996133632, + "loss": 2.8959, + "step": 6100 + }, + { + "epoch": 0.5527394622998346, + "grad_norm": 0.799318790435791, + "learning_rate": 0.00016317283876034558, + "loss": 2.8575, + "step": 6101 + }, + { + "epoch": 0.5528300604742814, + "grad_norm": 0.7620834708213806, + "learning_rate": 0.0001631667975593548, + "loss": 2.7554, + "step": 6102 + }, + { + "epoch": 0.5529206586487282, + "grad_norm": 0.824093759059906, + "learning_rate": 0.00016316075635836405, + "loss": 2.7337, + "step": 6103 + }, + { + "epoch": 0.553011256823175, + "grad_norm": 0.7657593488693237, + "learning_rate": 0.00016315471515737328, + "loss": 3.0029, + "step": 6104 + }, + { + "epoch": 0.5531018549976218, + "grad_norm": 0.8146747350692749, + "learning_rate": 0.00016314867395638254, + "loss": 2.94, + "step": 6105 + }, + { + "epoch": 0.5531924531720686, + "grad_norm": 0.8592027425765991, + "learning_rate": 0.0001631426327553918, + "loss": 2.812, + "step": 6106 + }, + { + "epoch": 0.5532830513465153, + "grad_norm": 0.8154405951499939, + "learning_rate": 0.000163136591554401, + "loss": 2.7539, + "step": 6107 + }, + { + "epoch": 0.5533736495209621, + "grad_norm": 0.8328912854194641, + "learning_rate": 0.00016313055035341027, + "loss": 2.7746, + "step": 6108 + }, + { + "epoch": 0.5534642476954089, + "grad_norm": 0.7297109365463257, + "learning_rate": 0.0001631245091524195, + "loss": 2.7874, + "step": 6109 + }, + { + "epoch": 0.5535548458698557, + "grad_norm": 0.8187052607536316, + "learning_rate": 0.00016311846795142877, + "loss": 2.8081, + "step": 6110 + }, + { + "epoch": 0.5536454440443025, + "grad_norm": 0.7537796497344971, + "learning_rate": 0.00016311242675043797, + "loss": 2.8435, + "step": 6111 + }, + { + "epoch": 0.5537360422187493, + "grad_norm": 0.8326866030693054, + "learning_rate": 0.00016310638554944723, + "loss": 2.89, + "step": 6112 + }, + { + "epoch": 0.5538266403931961, + "grad_norm": 0.8041486740112305, + "learning_rate": 0.0001631003443484565, + "loss": 2.7136, + "step": 6113 + }, + { + "epoch": 0.5539172385676429, + "grad_norm": 0.8619235754013062, + "learning_rate": 0.00016309430314746573, + "loss": 2.8139, + "step": 6114 + }, + { + "epoch": 0.5540078367420896, + "grad_norm": 0.7124544382095337, + "learning_rate": 0.00016308826194647496, + "loss": 2.4679, + "step": 6115 + }, + { + "epoch": 0.5540984349165364, + "grad_norm": 0.8220873475074768, + "learning_rate": 0.0001630822207454842, + "loss": 2.8854, + "step": 6116 + }, + { + "epoch": 0.5541890330909832, + "grad_norm": 0.7526025772094727, + "learning_rate": 0.00016307617954449346, + "loss": 2.6606, + "step": 6117 + }, + { + "epoch": 0.55427963126543, + "grad_norm": 0.7919800281524658, + "learning_rate": 0.0001630701383435027, + "loss": 2.7968, + "step": 6118 + }, + { + "epoch": 0.5543702294398768, + "grad_norm": 0.705896258354187, + "learning_rate": 0.00016306409714251195, + "loss": 2.7764, + "step": 6119 + }, + { + "epoch": 0.5544608276143236, + "grad_norm": 0.714896023273468, + "learning_rate": 0.0001630580559415212, + "loss": 2.544, + "step": 6120 + }, + { + "epoch": 0.5545514257887704, + "grad_norm": 0.7659037113189697, + "learning_rate": 0.00016305201474053042, + "loss": 2.7142, + "step": 6121 + }, + { + "epoch": 0.5546420239632172, + "grad_norm": 0.7859302163124084, + "learning_rate": 0.00016304597353953968, + "loss": 2.8156, + "step": 6122 + }, + { + "epoch": 0.5547326221376639, + "grad_norm": 0.7856906056404114, + "learning_rate": 0.00016303993233854892, + "loss": 3.0822, + "step": 6123 + }, + { + "epoch": 0.5548232203121107, + "grad_norm": 0.7538726329803467, + "learning_rate": 0.00016303389113755815, + "loss": 2.9214, + "step": 6124 + }, + { + "epoch": 0.5549138184865575, + "grad_norm": 0.827820897102356, + "learning_rate": 0.00016302784993656738, + "loss": 2.8194, + "step": 6125 + }, + { + "epoch": 0.5550044166610043, + "grad_norm": 0.758987307548523, + "learning_rate": 0.00016302180873557665, + "loss": 2.8973, + "step": 6126 + }, + { + "epoch": 0.5550950148354511, + "grad_norm": 0.7419729828834534, + "learning_rate": 0.00016301576753458588, + "loss": 2.8927, + "step": 6127 + }, + { + "epoch": 0.5551856130098979, + "grad_norm": 0.7677034735679626, + "learning_rate": 0.00016300972633359511, + "loss": 2.6851, + "step": 6128 + }, + { + "epoch": 0.5552762111843447, + "grad_norm": 0.8131447434425354, + "learning_rate": 0.00016300368513260437, + "loss": 2.8974, + "step": 6129 + }, + { + "epoch": 0.5553668093587915, + "grad_norm": 0.7519810795783997, + "learning_rate": 0.0001629976439316136, + "loss": 2.8756, + "step": 6130 + }, + { + "epoch": 0.5554574075332382, + "grad_norm": 0.8235570192337036, + "learning_rate": 0.00016299160273062287, + "loss": 2.9286, + "step": 6131 + }, + { + "epoch": 0.555548005707685, + "grad_norm": 0.7852466702461243, + "learning_rate": 0.0001629855615296321, + "loss": 2.9373, + "step": 6132 + }, + { + "epoch": 0.5556386038821318, + "grad_norm": 0.8290079236030579, + "learning_rate": 0.00016297952032864134, + "loss": 2.8675, + "step": 6133 + }, + { + "epoch": 0.5557292020565786, + "grad_norm": 0.8202714323997498, + "learning_rate": 0.00016297347912765057, + "loss": 2.6272, + "step": 6134 + }, + { + "epoch": 0.5558198002310254, + "grad_norm": 0.7570961713790894, + "learning_rate": 0.00016296743792665983, + "loss": 2.876, + "step": 6135 + }, + { + "epoch": 0.5559103984054722, + "grad_norm": 0.8090668320655823, + "learning_rate": 0.00016296139672566907, + "loss": 2.9222, + "step": 6136 + }, + { + "epoch": 0.5560009965799189, + "grad_norm": 0.7601586580276489, + "learning_rate": 0.0001629553555246783, + "loss": 2.7415, + "step": 6137 + }, + { + "epoch": 0.5560915947543656, + "grad_norm": 0.7561345100402832, + "learning_rate": 0.00016294931432368756, + "loss": 2.7252, + "step": 6138 + }, + { + "epoch": 0.5561821929288124, + "grad_norm": 0.7659820914268494, + "learning_rate": 0.0001629432731226968, + "loss": 2.9118, + "step": 6139 + }, + { + "epoch": 0.5562727911032592, + "grad_norm": 0.7714248895645142, + "learning_rate": 0.00016293723192170606, + "loss": 3.0518, + "step": 6140 + }, + { + "epoch": 0.556363389277706, + "grad_norm": 0.7298742532730103, + "learning_rate": 0.00016293119072071526, + "loss": 2.6499, + "step": 6141 + }, + { + "epoch": 0.5564539874521528, + "grad_norm": 0.7697960734367371, + "learning_rate": 0.00016292514951972453, + "loss": 2.7282, + "step": 6142 + }, + { + "epoch": 0.5565445856265996, + "grad_norm": 0.7400776743888855, + "learning_rate": 0.0001629191083187338, + "loss": 2.8027, + "step": 6143 + }, + { + "epoch": 0.5566351838010464, + "grad_norm": 0.7760695815086365, + "learning_rate": 0.00016291306711774302, + "loss": 2.8481, + "step": 6144 + }, + { + "epoch": 0.5567257819754932, + "grad_norm": 0.7497555017471313, + "learning_rate": 0.00016290702591675226, + "loss": 2.8952, + "step": 6145 + }, + { + "epoch": 0.55681638014994, + "grad_norm": 0.7487077116966248, + "learning_rate": 0.0001629009847157615, + "loss": 2.8787, + "step": 6146 + }, + { + "epoch": 0.5569069783243867, + "grad_norm": 0.8638854622840881, + "learning_rate": 0.00016289494351477075, + "loss": 2.0743, + "step": 6147 + }, + { + "epoch": 0.5569975764988335, + "grad_norm": 0.7938999533653259, + "learning_rate": 0.00016288890231377998, + "loss": 2.5969, + "step": 6148 + }, + { + "epoch": 0.5570881746732803, + "grad_norm": 0.7840208411216736, + "learning_rate": 0.00016288286111278922, + "loss": 3.0868, + "step": 6149 + }, + { + "epoch": 0.5571787728477271, + "grad_norm": 0.6657220721244812, + "learning_rate": 0.00016287681991179848, + "loss": 2.061, + "step": 6150 + }, + { + "epoch": 0.5572693710221739, + "grad_norm": 0.7426849007606506, + "learning_rate": 0.00016287077871080771, + "loss": 2.7336, + "step": 6151 + }, + { + "epoch": 0.5573599691966207, + "grad_norm": 0.7771461009979248, + "learning_rate": 0.00016286473750981697, + "loss": 2.6088, + "step": 6152 + }, + { + "epoch": 0.5574505673710675, + "grad_norm": 0.7520039081573486, + "learning_rate": 0.0001628586963088262, + "loss": 2.813, + "step": 6153 + }, + { + "epoch": 0.5575411655455143, + "grad_norm": 0.7925395369529724, + "learning_rate": 0.00016285265510783544, + "loss": 2.8899, + "step": 6154 + }, + { + "epoch": 0.557631763719961, + "grad_norm": 0.8061308264732361, + "learning_rate": 0.00016284661390684468, + "loss": 2.9088, + "step": 6155 + }, + { + "epoch": 0.5577223618944078, + "grad_norm": 0.8168323636054993, + "learning_rate": 0.00016284057270585394, + "loss": 2.9481, + "step": 6156 + }, + { + "epoch": 0.5578129600688546, + "grad_norm": 0.7991609573364258, + "learning_rate": 0.00016283453150486317, + "loss": 2.9608, + "step": 6157 + }, + { + "epoch": 0.5579035582433014, + "grad_norm": 0.8567960858345032, + "learning_rate": 0.0001628284903038724, + "loss": 2.7364, + "step": 6158 + }, + { + "epoch": 0.5579941564177482, + "grad_norm": 0.82466721534729, + "learning_rate": 0.00016282244910288167, + "loss": 2.9255, + "step": 6159 + }, + { + "epoch": 0.558084754592195, + "grad_norm": 0.7914303541183472, + "learning_rate": 0.0001628164079018909, + "loss": 2.8233, + "step": 6160 + }, + { + "epoch": 0.5581753527666418, + "grad_norm": 0.7465413808822632, + "learning_rate": 0.00016281036670090016, + "loss": 2.6799, + "step": 6161 + }, + { + "epoch": 0.5582659509410886, + "grad_norm": 0.8071081042289734, + "learning_rate": 0.00016280432549990937, + "loss": 2.8849, + "step": 6162 + }, + { + "epoch": 0.5583565491155353, + "grad_norm": 0.8202202916145325, + "learning_rate": 0.00016279828429891863, + "loss": 3.1217, + "step": 6163 + }, + { + "epoch": 0.5584471472899821, + "grad_norm": 0.8514063358306885, + "learning_rate": 0.00016279224309792786, + "loss": 2.9097, + "step": 6164 + }, + { + "epoch": 0.5585377454644289, + "grad_norm": 0.7712492942810059, + "learning_rate": 0.00016278620189693713, + "loss": 2.8948, + "step": 6165 + }, + { + "epoch": 0.5586283436388757, + "grad_norm": 0.7902787327766418, + "learning_rate": 0.00016278016069594636, + "loss": 2.9856, + "step": 6166 + }, + { + "epoch": 0.5587189418133225, + "grad_norm": 0.7993704676628113, + "learning_rate": 0.0001627741194949556, + "loss": 2.9513, + "step": 6167 + }, + { + "epoch": 0.5588095399877693, + "grad_norm": 0.8246023654937744, + "learning_rate": 0.00016276807829396486, + "loss": 3.328, + "step": 6168 + }, + { + "epoch": 0.5589001381622161, + "grad_norm": 0.7984826564788818, + "learning_rate": 0.0001627620370929741, + "loss": 2.8297, + "step": 6169 + }, + { + "epoch": 0.5589907363366629, + "grad_norm": 0.786972165107727, + "learning_rate": 0.00016275599589198335, + "loss": 3.1649, + "step": 6170 + }, + { + "epoch": 0.5590813345111096, + "grad_norm": 0.73220294713974, + "learning_rate": 0.00016274995469099256, + "loss": 2.7748, + "step": 6171 + }, + { + "epoch": 0.5591719326855564, + "grad_norm": 0.7594836354255676, + "learning_rate": 0.00016274391349000182, + "loss": 2.6682, + "step": 6172 + }, + { + "epoch": 0.5592625308600032, + "grad_norm": 0.7627028226852417, + "learning_rate": 0.00016273787228901108, + "loss": 2.7614, + "step": 6173 + }, + { + "epoch": 0.55935312903445, + "grad_norm": 0.6990233063697815, + "learning_rate": 0.00016273183108802031, + "loss": 2.2423, + "step": 6174 + }, + { + "epoch": 0.5594437272088968, + "grad_norm": 0.6916767358779907, + "learning_rate": 0.00016272578988702955, + "loss": 2.2573, + "step": 6175 + }, + { + "epoch": 0.5595343253833436, + "grad_norm": 0.8669309020042419, + "learning_rate": 0.00016271974868603878, + "loss": 2.784, + "step": 6176 + }, + { + "epoch": 0.5596249235577904, + "grad_norm": 0.7545742392539978, + "learning_rate": 0.00016271370748504804, + "loss": 2.665, + "step": 6177 + }, + { + "epoch": 0.559715521732237, + "grad_norm": 0.7916483879089355, + "learning_rate": 0.00016270766628405728, + "loss": 2.8097, + "step": 6178 + }, + { + "epoch": 0.5598061199066838, + "grad_norm": 0.705786406993866, + "learning_rate": 0.0001627016250830665, + "loss": 2.2693, + "step": 6179 + }, + { + "epoch": 0.5598967180811306, + "grad_norm": 0.7388074994087219, + "learning_rate": 0.00016269558388207577, + "loss": 2.6162, + "step": 6180 + }, + { + "epoch": 0.5599873162555774, + "grad_norm": 0.7033690214157104, + "learning_rate": 0.000162689542681085, + "loss": 2.2147, + "step": 6181 + }, + { + "epoch": 0.5600779144300242, + "grad_norm": 0.7666773796081543, + "learning_rate": 0.00016268350148009427, + "loss": 2.5602, + "step": 6182 + }, + { + "epoch": 0.560168512604471, + "grad_norm": 0.8362269997596741, + "learning_rate": 0.0001626774602791035, + "loss": 3.0187, + "step": 6183 + }, + { + "epoch": 0.5602591107789178, + "grad_norm": 0.7818682193756104, + "learning_rate": 0.00016267141907811274, + "loss": 2.7435, + "step": 6184 + }, + { + "epoch": 0.5603497089533646, + "grad_norm": 0.8378534913063049, + "learning_rate": 0.00016266537787712197, + "loss": 2.8191, + "step": 6185 + }, + { + "epoch": 0.5604403071278113, + "grad_norm": 0.7734033465385437, + "learning_rate": 0.00016265933667613123, + "loss": 2.8722, + "step": 6186 + }, + { + "epoch": 0.5605309053022581, + "grad_norm": 0.8163118958473206, + "learning_rate": 0.00016265329547514046, + "loss": 2.788, + "step": 6187 + }, + { + "epoch": 0.5606215034767049, + "grad_norm": 0.7792544364929199, + "learning_rate": 0.0001626472542741497, + "loss": 2.9219, + "step": 6188 + }, + { + "epoch": 0.5607121016511517, + "grad_norm": 0.862827479839325, + "learning_rate": 0.00016264121307315896, + "loss": 2.678, + "step": 6189 + }, + { + "epoch": 0.5608026998255985, + "grad_norm": 0.791549026966095, + "learning_rate": 0.0001626351718721682, + "loss": 2.6851, + "step": 6190 + }, + { + "epoch": 0.5608932980000453, + "grad_norm": 0.7912705540657043, + "learning_rate": 0.00016262913067117746, + "loss": 2.9816, + "step": 6191 + }, + { + "epoch": 0.5609838961744921, + "grad_norm": 0.8179735541343689, + "learning_rate": 0.00016262308947018666, + "loss": 2.9653, + "step": 6192 + }, + { + "epoch": 0.5610744943489389, + "grad_norm": 0.8157753348350525, + "learning_rate": 0.00016261704826919592, + "loss": 3.0162, + "step": 6193 + }, + { + "epoch": 0.5611650925233856, + "grad_norm": 0.7362911105155945, + "learning_rate": 0.00016261100706820516, + "loss": 2.5039, + "step": 6194 + }, + { + "epoch": 0.5612556906978324, + "grad_norm": 0.8251895308494568, + "learning_rate": 0.00016260496586721442, + "loss": 2.5825, + "step": 6195 + }, + { + "epoch": 0.5613462888722792, + "grad_norm": 0.7618198394775391, + "learning_rate": 0.00016259892466622365, + "loss": 2.7736, + "step": 6196 + }, + { + "epoch": 0.561436887046726, + "grad_norm": 0.7946454286575317, + "learning_rate": 0.0001625928834652329, + "loss": 2.8669, + "step": 6197 + }, + { + "epoch": 0.5615274852211728, + "grad_norm": 0.6841808557510376, + "learning_rate": 0.00016258684226424215, + "loss": 2.503, + "step": 6198 + }, + { + "epoch": 0.5616180833956196, + "grad_norm": 0.7649664282798767, + "learning_rate": 0.00016258080106325138, + "loss": 2.7687, + "step": 6199 + }, + { + "epoch": 0.5617086815700664, + "grad_norm": 0.8022119998931885, + "learning_rate": 0.00016257475986226062, + "loss": 3.1057, + "step": 6200 + }, + { + "epoch": 0.5617992797445132, + "grad_norm": 0.8436130285263062, + "learning_rate": 0.00016256871866126985, + "loss": 2.7967, + "step": 6201 + }, + { + "epoch": 0.56188987791896, + "grad_norm": 0.7661660313606262, + "learning_rate": 0.0001625626774602791, + "loss": 2.9506, + "step": 6202 + }, + { + "epoch": 0.5619804760934067, + "grad_norm": 0.6762027740478516, + "learning_rate": 0.00016255663625928837, + "loss": 2.0521, + "step": 6203 + }, + { + "epoch": 0.5620710742678535, + "grad_norm": 0.7132807374000549, + "learning_rate": 0.0001625505950582976, + "loss": 2.9005, + "step": 6204 + }, + { + "epoch": 0.5621616724423003, + "grad_norm": 0.7522382140159607, + "learning_rate": 0.00016254455385730684, + "loss": 2.337, + "step": 6205 + }, + { + "epoch": 0.5622522706167471, + "grad_norm": 0.8473376035690308, + "learning_rate": 0.00016253851265631607, + "loss": 2.8124, + "step": 6206 + }, + { + "epoch": 0.5623428687911939, + "grad_norm": 0.7903246283531189, + "learning_rate": 0.00016253247145532534, + "loss": 2.8063, + "step": 6207 + }, + { + "epoch": 0.5624334669656407, + "grad_norm": 0.7263258099555969, + "learning_rate": 0.00016252643025433457, + "loss": 2.8375, + "step": 6208 + }, + { + "epoch": 0.5625240651400875, + "grad_norm": 0.7863811254501343, + "learning_rate": 0.0001625203890533438, + "loss": 2.7062, + "step": 6209 + }, + { + "epoch": 0.5626146633145342, + "grad_norm": 0.7084877490997314, + "learning_rate": 0.00016251434785235307, + "loss": 2.1199, + "step": 6210 + }, + { + "epoch": 0.562705261488981, + "grad_norm": 0.7452782988548279, + "learning_rate": 0.0001625083066513623, + "loss": 2.5909, + "step": 6211 + }, + { + "epoch": 0.5627958596634278, + "grad_norm": 0.723782479763031, + "learning_rate": 0.00016250226545037156, + "loss": 2.066, + "step": 6212 + }, + { + "epoch": 0.5628864578378746, + "grad_norm": 0.8155089020729065, + "learning_rate": 0.00016249622424938077, + "loss": 3.0024, + "step": 6213 + }, + { + "epoch": 0.5629770560123214, + "grad_norm": 0.7419770956039429, + "learning_rate": 0.00016249018304839003, + "loss": 2.7957, + "step": 6214 + }, + { + "epoch": 0.5630676541867682, + "grad_norm": 0.7788947820663452, + "learning_rate": 0.00016248414184739926, + "loss": 2.7111, + "step": 6215 + }, + { + "epoch": 0.563158252361215, + "grad_norm": 0.9158602356910706, + "learning_rate": 0.00016247810064640852, + "loss": 2.9307, + "step": 6216 + }, + { + "epoch": 0.5632488505356618, + "grad_norm": 0.7781678438186646, + "learning_rate": 0.00016247205944541776, + "loss": 2.5062, + "step": 6217 + }, + { + "epoch": 0.5633394487101084, + "grad_norm": 0.7470220923423767, + "learning_rate": 0.000162466018244427, + "loss": 2.8065, + "step": 6218 + }, + { + "epoch": 0.5634300468845552, + "grad_norm": 0.699614405632019, + "learning_rate": 0.00016245997704343625, + "loss": 2.1861, + "step": 6219 + }, + { + "epoch": 0.563520645059002, + "grad_norm": 0.8087307214736938, + "learning_rate": 0.0001624539358424455, + "loss": 2.9335, + "step": 6220 + }, + { + "epoch": 0.5636112432334488, + "grad_norm": 0.8331238031387329, + "learning_rate": 0.00016244789464145472, + "loss": 2.9929, + "step": 6221 + }, + { + "epoch": 0.5637018414078956, + "grad_norm": 0.731889545917511, + "learning_rate": 0.00016244185344046396, + "loss": 2.9129, + "step": 6222 + }, + { + "epoch": 0.5637924395823424, + "grad_norm": 0.8423002362251282, + "learning_rate": 0.00016243581223947322, + "loss": 2.845, + "step": 6223 + }, + { + "epoch": 0.5638830377567892, + "grad_norm": 0.7558349370956421, + "learning_rate": 0.00016242977103848245, + "loss": 2.9547, + "step": 6224 + }, + { + "epoch": 0.563973635931236, + "grad_norm": 0.9569439888000488, + "learning_rate": 0.0001624237298374917, + "loss": 2.607, + "step": 6225 + }, + { + "epoch": 0.5640642341056827, + "grad_norm": 0.7600277066230774, + "learning_rate": 0.00016241768863650095, + "loss": 2.88, + "step": 6226 + }, + { + "epoch": 0.5641548322801295, + "grad_norm": 0.746267557144165, + "learning_rate": 0.00016241164743551018, + "loss": 2.7382, + "step": 6227 + }, + { + "epoch": 0.5642454304545763, + "grad_norm": 0.7729396224021912, + "learning_rate": 0.00016240560623451944, + "loss": 2.7883, + "step": 6228 + }, + { + "epoch": 0.5643360286290231, + "grad_norm": 0.8345381021499634, + "learning_rate": 0.00016239956503352867, + "loss": 2.9088, + "step": 6229 + }, + { + "epoch": 0.5644266268034699, + "grad_norm": 0.728637158870697, + "learning_rate": 0.0001623935238325379, + "loss": 2.8457, + "step": 6230 + }, + { + "epoch": 0.5645172249779167, + "grad_norm": 0.7938954830169678, + "learning_rate": 0.00016238748263154714, + "loss": 2.6457, + "step": 6231 + }, + { + "epoch": 0.5646078231523635, + "grad_norm": 0.6799490451812744, + "learning_rate": 0.0001623814414305564, + "loss": 2.2786, + "step": 6232 + }, + { + "epoch": 0.5646984213268103, + "grad_norm": 0.7681643962860107, + "learning_rate": 0.00016237540022956567, + "loss": 2.784, + "step": 6233 + }, + { + "epoch": 0.564789019501257, + "grad_norm": 0.6946690082550049, + "learning_rate": 0.00016236935902857487, + "loss": 2.0781, + "step": 6234 + }, + { + "epoch": 0.5648796176757038, + "grad_norm": 0.7819913029670715, + "learning_rate": 0.00016236331782758413, + "loss": 2.9397, + "step": 6235 + }, + { + "epoch": 0.5649702158501506, + "grad_norm": 0.7799591422080994, + "learning_rate": 0.00016235727662659337, + "loss": 2.4891, + "step": 6236 + }, + { + "epoch": 0.5650608140245974, + "grad_norm": 0.7342369556427002, + "learning_rate": 0.00016235123542560263, + "loss": 2.079, + "step": 6237 + }, + { + "epoch": 0.5651514121990442, + "grad_norm": 0.8405458331108093, + "learning_rate": 0.00016234519422461186, + "loss": 2.7825, + "step": 6238 + }, + { + "epoch": 0.565242010373491, + "grad_norm": 0.8234740495681763, + "learning_rate": 0.0001623391530236211, + "loss": 2.829, + "step": 6239 + }, + { + "epoch": 0.5653326085479378, + "grad_norm": 0.8261618614196777, + "learning_rate": 0.00016233311182263036, + "loss": 3.1853, + "step": 6240 + }, + { + "epoch": 0.5654232067223846, + "grad_norm": 0.7556267380714417, + "learning_rate": 0.0001623270706216396, + "loss": 2.9503, + "step": 6241 + }, + { + "epoch": 0.5655138048968313, + "grad_norm": 0.6740286946296692, + "learning_rate": 0.00016232102942064885, + "loss": 2.0364, + "step": 6242 + }, + { + "epoch": 0.5656044030712781, + "grad_norm": 0.8094225525856018, + "learning_rate": 0.00016231498821965806, + "loss": 2.908, + "step": 6243 + }, + { + "epoch": 0.5656950012457249, + "grad_norm": 0.7453715801239014, + "learning_rate": 0.00016230894701866732, + "loss": 3.0039, + "step": 6244 + }, + { + "epoch": 0.5657855994201717, + "grad_norm": 0.7226675748825073, + "learning_rate": 0.00016230290581767656, + "loss": 2.5421, + "step": 6245 + }, + { + "epoch": 0.5658761975946185, + "grad_norm": 0.8107357621192932, + "learning_rate": 0.00016229686461668582, + "loss": 2.7686, + "step": 6246 + }, + { + "epoch": 0.5659667957690653, + "grad_norm": 0.8599118590354919, + "learning_rate": 0.00016229082341569505, + "loss": 2.9619, + "step": 6247 + }, + { + "epoch": 0.5660573939435121, + "grad_norm": 0.7224686145782471, + "learning_rate": 0.00016228478221470428, + "loss": 2.3326, + "step": 6248 + }, + { + "epoch": 0.5661479921179589, + "grad_norm": 0.8077912926673889, + "learning_rate": 0.00016227874101371355, + "loss": 2.8884, + "step": 6249 + }, + { + "epoch": 0.5662385902924056, + "grad_norm": 0.7744174599647522, + "learning_rate": 0.00016227269981272278, + "loss": 2.8131, + "step": 6250 + }, + { + "epoch": 0.5663291884668524, + "grad_norm": 0.7633171081542969, + "learning_rate": 0.00016226665861173201, + "loss": 2.6179, + "step": 6251 + }, + { + "epoch": 0.5664197866412992, + "grad_norm": 0.9466992616653442, + "learning_rate": 0.00016226061741074125, + "loss": 2.6739, + "step": 6252 + }, + { + "epoch": 0.566510384815746, + "grad_norm": 0.8098570704460144, + "learning_rate": 0.0001622545762097505, + "loss": 2.769, + "step": 6253 + }, + { + "epoch": 0.5666009829901928, + "grad_norm": 0.8163718581199646, + "learning_rate": 0.00016224853500875974, + "loss": 2.8105, + "step": 6254 + }, + { + "epoch": 0.5666915811646396, + "grad_norm": 0.800478458404541, + "learning_rate": 0.000162242493807769, + "loss": 2.7619, + "step": 6255 + }, + { + "epoch": 0.5667821793390864, + "grad_norm": 0.8026400804519653, + "learning_rate": 0.00016223645260677824, + "loss": 2.9985, + "step": 6256 + }, + { + "epoch": 0.5668727775135332, + "grad_norm": 0.6526283621788025, + "learning_rate": 0.00016223041140578747, + "loss": 1.9796, + "step": 6257 + }, + { + "epoch": 0.5669633756879799, + "grad_norm": 0.7539329528808594, + "learning_rate": 0.00016222437020479673, + "loss": 2.7302, + "step": 6258 + }, + { + "epoch": 0.5670539738624266, + "grad_norm": 0.7476037740707397, + "learning_rate": 0.00016221832900380597, + "loss": 2.838, + "step": 6259 + }, + { + "epoch": 0.5671445720368734, + "grad_norm": 0.8274138569831848, + "learning_rate": 0.0001622122878028152, + "loss": 2.7913, + "step": 6260 + }, + { + "epoch": 0.5672351702113202, + "grad_norm": 0.7820761203765869, + "learning_rate": 0.00016220624660182444, + "loss": 2.817, + "step": 6261 + }, + { + "epoch": 0.567325768385767, + "grad_norm": 0.7192254066467285, + "learning_rate": 0.0001622002054008337, + "loss": 2.0941, + "step": 6262 + }, + { + "epoch": 0.5674163665602138, + "grad_norm": 0.8004590272903442, + "learning_rate": 0.00016219416419984296, + "loss": 2.565, + "step": 6263 + }, + { + "epoch": 0.5675069647346606, + "grad_norm": 0.766886293888092, + "learning_rate": 0.00016218812299885216, + "loss": 3.0483, + "step": 6264 + }, + { + "epoch": 0.5675975629091073, + "grad_norm": 0.8089662790298462, + "learning_rate": 0.00016218208179786143, + "loss": 2.8379, + "step": 6265 + }, + { + "epoch": 0.5676881610835541, + "grad_norm": 0.778570830821991, + "learning_rate": 0.00016217604059687066, + "loss": 2.6673, + "step": 6266 + }, + { + "epoch": 0.5677787592580009, + "grad_norm": 0.7702764868736267, + "learning_rate": 0.00016216999939587992, + "loss": 2.7715, + "step": 6267 + }, + { + "epoch": 0.5678693574324477, + "grad_norm": 0.8354554772377014, + "learning_rate": 0.00016216395819488916, + "loss": 2.6872, + "step": 6268 + }, + { + "epoch": 0.5679599556068945, + "grad_norm": 0.7413842678070068, + "learning_rate": 0.0001621579169938984, + "loss": 2.847, + "step": 6269 + }, + { + "epoch": 0.5680505537813413, + "grad_norm": 0.8334892988204956, + "learning_rate": 0.00016215187579290765, + "loss": 2.8933, + "step": 6270 + }, + { + "epoch": 0.5681411519557881, + "grad_norm": 0.8512803316116333, + "learning_rate": 0.00016214583459191688, + "loss": 2.9172, + "step": 6271 + }, + { + "epoch": 0.5682317501302349, + "grad_norm": 0.7767369151115417, + "learning_rate": 0.00016213979339092612, + "loss": 2.8014, + "step": 6272 + }, + { + "epoch": 0.5683223483046816, + "grad_norm": 0.8278610706329346, + "learning_rate": 0.00016213375218993535, + "loss": 2.7349, + "step": 6273 + }, + { + "epoch": 0.5684129464791284, + "grad_norm": 0.762817919254303, + "learning_rate": 0.00016212771098894461, + "loss": 2.6438, + "step": 6274 + }, + { + "epoch": 0.5685035446535752, + "grad_norm": 0.8500766158103943, + "learning_rate": 0.00016212166978795385, + "loss": 2.8862, + "step": 6275 + }, + { + "epoch": 0.568594142828022, + "grad_norm": 0.8097584247589111, + "learning_rate": 0.0001621156285869631, + "loss": 2.8575, + "step": 6276 + }, + { + "epoch": 0.5686847410024688, + "grad_norm": 0.8393802046775818, + "learning_rate": 0.00016210958738597232, + "loss": 2.6877, + "step": 6277 + }, + { + "epoch": 0.5687753391769156, + "grad_norm": 0.8198975324630737, + "learning_rate": 0.00016210354618498158, + "loss": 2.8106, + "step": 6278 + }, + { + "epoch": 0.5688659373513624, + "grad_norm": 0.81657874584198, + "learning_rate": 0.00016209750498399084, + "loss": 2.7996, + "step": 6279 + }, + { + "epoch": 0.5689565355258092, + "grad_norm": 0.8058813810348511, + "learning_rate": 0.00016209146378300007, + "loss": 2.704, + "step": 6280 + }, + { + "epoch": 0.569047133700256, + "grad_norm": 0.8439697623252869, + "learning_rate": 0.0001620854225820093, + "loss": 2.8299, + "step": 6281 + }, + { + "epoch": 0.5691377318747027, + "grad_norm": 0.7437413334846497, + "learning_rate": 0.00016207938138101854, + "loss": 2.527, + "step": 6282 + }, + { + "epoch": 0.5692283300491495, + "grad_norm": 0.7843703031539917, + "learning_rate": 0.0001620733401800278, + "loss": 2.8103, + "step": 6283 + }, + { + "epoch": 0.5693189282235963, + "grad_norm": 0.807651698589325, + "learning_rate": 0.00016206729897903704, + "loss": 2.8772, + "step": 6284 + }, + { + "epoch": 0.5694095263980431, + "grad_norm": 0.8125895857810974, + "learning_rate": 0.00016206125777804627, + "loss": 2.8498, + "step": 6285 + }, + { + "epoch": 0.5695001245724899, + "grad_norm": 0.8116030693054199, + "learning_rate": 0.00016205521657705553, + "loss": 2.6434, + "step": 6286 + }, + { + "epoch": 0.5695907227469367, + "grad_norm": 0.7705584764480591, + "learning_rate": 0.00016204917537606476, + "loss": 2.9563, + "step": 6287 + }, + { + "epoch": 0.5696813209213835, + "grad_norm": 0.7871513962745667, + "learning_rate": 0.00016204313417507403, + "loss": 2.6124, + "step": 6288 + }, + { + "epoch": 0.5697719190958302, + "grad_norm": 0.7982051372528076, + "learning_rate": 0.00016203709297408326, + "loss": 2.922, + "step": 6289 + }, + { + "epoch": 0.569862517270277, + "grad_norm": 0.9271914958953857, + "learning_rate": 0.0001620310517730925, + "loss": 2.8428, + "step": 6290 + }, + { + "epoch": 0.5699531154447238, + "grad_norm": 0.79289710521698, + "learning_rate": 0.00016202501057210173, + "loss": 3.0345, + "step": 6291 + }, + { + "epoch": 0.5700437136191706, + "grad_norm": 0.6825013756752014, + "learning_rate": 0.000162018969371111, + "loss": 1.9749, + "step": 6292 + }, + { + "epoch": 0.5701343117936174, + "grad_norm": 0.6973628401756287, + "learning_rate": 0.00016201292817012025, + "loss": 1.9399, + "step": 6293 + }, + { + "epoch": 0.5702249099680642, + "grad_norm": 0.795417070388794, + "learning_rate": 0.00016200688696912946, + "loss": 3.0225, + "step": 6294 + }, + { + "epoch": 0.570315508142511, + "grad_norm": 0.9466925859451294, + "learning_rate": 0.00016200084576813872, + "loss": 2.7962, + "step": 6295 + }, + { + "epoch": 0.5704061063169578, + "grad_norm": 0.7579772472381592, + "learning_rate": 0.00016199480456714795, + "loss": 2.9371, + "step": 6296 + }, + { + "epoch": 0.5704967044914045, + "grad_norm": 0.7853136658668518, + "learning_rate": 0.00016198876336615721, + "loss": 2.7715, + "step": 6297 + }, + { + "epoch": 0.5705873026658513, + "grad_norm": 0.7593656778335571, + "learning_rate": 0.00016198272216516642, + "loss": 2.5996, + "step": 6298 + }, + { + "epoch": 0.570677900840298, + "grad_norm": 0.7362257242202759, + "learning_rate": 0.00016197668096417568, + "loss": 2.7613, + "step": 6299 + }, + { + "epoch": 0.5707684990147448, + "grad_norm": 0.7541530132293701, + "learning_rate": 0.00016197063976318494, + "loss": 2.9087, + "step": 6300 + }, + { + "epoch": 0.5708590971891916, + "grad_norm": 0.7787705063819885, + "learning_rate": 0.00016196459856219418, + "loss": 2.7178, + "step": 6301 + }, + { + "epoch": 0.5709496953636384, + "grad_norm": 0.8766541481018066, + "learning_rate": 0.0001619585573612034, + "loss": 2.9621, + "step": 6302 + }, + { + "epoch": 0.5710402935380852, + "grad_norm": 0.7506852746009827, + "learning_rate": 0.00016195251616021265, + "loss": 2.3512, + "step": 6303 + }, + { + "epoch": 0.571130891712532, + "grad_norm": 0.7967430949211121, + "learning_rate": 0.0001619464749592219, + "loss": 3.1007, + "step": 6304 + }, + { + "epoch": 0.5712214898869787, + "grad_norm": 0.8082072734832764, + "learning_rate": 0.00016194043375823114, + "loss": 3.0033, + "step": 6305 + }, + { + "epoch": 0.5713120880614255, + "grad_norm": 0.7418798804283142, + "learning_rate": 0.0001619343925572404, + "loss": 2.715, + "step": 6306 + }, + { + "epoch": 0.5714026862358723, + "grad_norm": 0.7335873246192932, + "learning_rate": 0.00016192835135624964, + "loss": 2.6279, + "step": 6307 + }, + { + "epoch": 0.5714932844103191, + "grad_norm": 0.7993500828742981, + "learning_rate": 0.00016192231015525887, + "loss": 2.797, + "step": 6308 + }, + { + "epoch": 0.5715838825847659, + "grad_norm": 0.819999098777771, + "learning_rate": 0.00016191626895426813, + "loss": 2.8678, + "step": 6309 + }, + { + "epoch": 0.5716744807592127, + "grad_norm": 0.7493383884429932, + "learning_rate": 0.00016191022775327736, + "loss": 2.6728, + "step": 6310 + }, + { + "epoch": 0.5717650789336595, + "grad_norm": 0.6953831911087036, + "learning_rate": 0.0001619041865522866, + "loss": 2.0169, + "step": 6311 + }, + { + "epoch": 0.5718556771081063, + "grad_norm": 0.6483675241470337, + "learning_rate": 0.00016189814535129583, + "loss": 2.009, + "step": 6312 + }, + { + "epoch": 0.571946275282553, + "grad_norm": 0.780102550983429, + "learning_rate": 0.0001618921041503051, + "loss": 3.0371, + "step": 6313 + }, + { + "epoch": 0.5720368734569998, + "grad_norm": 0.78737872838974, + "learning_rate": 0.00016188606294931433, + "loss": 2.9853, + "step": 6314 + }, + { + "epoch": 0.5721274716314466, + "grad_norm": 0.8017177581787109, + "learning_rate": 0.00016188002174832356, + "loss": 2.8708, + "step": 6315 + }, + { + "epoch": 0.5722180698058934, + "grad_norm": 0.8083885908126831, + "learning_rate": 0.00016187398054733282, + "loss": 2.8388, + "step": 6316 + }, + { + "epoch": 0.5723086679803402, + "grad_norm": 0.7940637469291687, + "learning_rate": 0.00016186793934634206, + "loss": 3.0154, + "step": 6317 + }, + { + "epoch": 0.572399266154787, + "grad_norm": 0.7338160276412964, + "learning_rate": 0.00016186189814535132, + "loss": 2.7762, + "step": 6318 + }, + { + "epoch": 0.5724898643292338, + "grad_norm": 0.7956347465515137, + "learning_rate": 0.00016185585694436055, + "loss": 3.0553, + "step": 6319 + }, + { + "epoch": 0.5725804625036806, + "grad_norm": 0.7973214387893677, + "learning_rate": 0.0001618498157433698, + "loss": 2.7615, + "step": 6320 + }, + { + "epoch": 0.5726710606781273, + "grad_norm": 0.7039952874183655, + "learning_rate": 0.00016184377454237902, + "loss": 2.0535, + "step": 6321 + }, + { + "epoch": 0.5727616588525741, + "grad_norm": 0.7380589842796326, + "learning_rate": 0.00016183773334138828, + "loss": 2.7514, + "step": 6322 + }, + { + "epoch": 0.5728522570270209, + "grad_norm": 0.7824957966804504, + "learning_rate": 0.00016183169214039752, + "loss": 2.9539, + "step": 6323 + }, + { + "epoch": 0.5729428552014677, + "grad_norm": 0.764854371547699, + "learning_rate": 0.00016182565093940675, + "loss": 3.0041, + "step": 6324 + }, + { + "epoch": 0.5730334533759145, + "grad_norm": 0.6893315315246582, + "learning_rate": 0.000161819609738416, + "loss": 2.2033, + "step": 6325 + }, + { + "epoch": 0.5731240515503613, + "grad_norm": 0.728300929069519, + "learning_rate": 0.00016181356853742525, + "loss": 2.6915, + "step": 6326 + }, + { + "epoch": 0.5732146497248081, + "grad_norm": 0.7640861868858337, + "learning_rate": 0.0001618075273364345, + "loss": 2.8375, + "step": 6327 + }, + { + "epoch": 0.5733052478992549, + "grad_norm": 0.8046134114265442, + "learning_rate": 0.0001618014861354437, + "loss": 2.7613, + "step": 6328 + }, + { + "epoch": 0.5733958460737016, + "grad_norm": 0.7674505710601807, + "learning_rate": 0.00016179544493445297, + "loss": 2.8923, + "step": 6329 + }, + { + "epoch": 0.5734864442481484, + "grad_norm": 0.8612422347068787, + "learning_rate": 0.00016178940373346224, + "loss": 2.8117, + "step": 6330 + }, + { + "epoch": 0.5735770424225952, + "grad_norm": 0.7853430509567261, + "learning_rate": 0.00016178336253247147, + "loss": 2.6149, + "step": 6331 + }, + { + "epoch": 0.573667640597042, + "grad_norm": 0.7410385608673096, + "learning_rate": 0.0001617773213314807, + "loss": 2.7414, + "step": 6332 + }, + { + "epoch": 0.5737582387714888, + "grad_norm": 0.6175351142883301, + "learning_rate": 0.00016177128013048994, + "loss": 2.0678, + "step": 6333 + }, + { + "epoch": 0.5738488369459356, + "grad_norm": 0.8344235420227051, + "learning_rate": 0.0001617652389294992, + "loss": 2.9479, + "step": 6334 + }, + { + "epoch": 0.5739394351203824, + "grad_norm": 0.7771464586257935, + "learning_rate": 0.00016175919772850843, + "loss": 2.916, + "step": 6335 + }, + { + "epoch": 0.5740300332948292, + "grad_norm": 0.8433598875999451, + "learning_rate": 0.00016175315652751767, + "loss": 2.7233, + "step": 6336 + }, + { + "epoch": 0.5741206314692759, + "grad_norm": 0.8196492195129395, + "learning_rate": 0.00016174711532652693, + "loss": 2.975, + "step": 6337 + }, + { + "epoch": 0.5742112296437227, + "grad_norm": 0.8609060645103455, + "learning_rate": 0.00016174107412553616, + "loss": 2.615, + "step": 6338 + }, + { + "epoch": 0.5743018278181695, + "grad_norm": 0.8442268967628479, + "learning_rate": 0.00016173503292454542, + "loss": 2.6539, + "step": 6339 + }, + { + "epoch": 0.5743924259926162, + "grad_norm": 0.7429385185241699, + "learning_rate": 0.00016172899172355466, + "loss": 2.862, + "step": 6340 + }, + { + "epoch": 0.574483024167063, + "grad_norm": 0.7979495525360107, + "learning_rate": 0.0001617229505225639, + "loss": 2.9834, + "step": 6341 + }, + { + "epoch": 0.5745736223415098, + "grad_norm": 0.7842235565185547, + "learning_rate": 0.00016171690932157313, + "loss": 2.6657, + "step": 6342 + }, + { + "epoch": 0.5746642205159566, + "grad_norm": 0.8722321391105652, + "learning_rate": 0.0001617108681205824, + "loss": 2.8266, + "step": 6343 + }, + { + "epoch": 0.5747548186904033, + "grad_norm": 0.7728561758995056, + "learning_rate": 0.00016170482691959162, + "loss": 2.753, + "step": 6344 + }, + { + "epoch": 0.5748454168648501, + "grad_norm": 0.7405296564102173, + "learning_rate": 0.00016169878571860085, + "loss": 2.7724, + "step": 6345 + }, + { + "epoch": 0.5749360150392969, + "grad_norm": 0.7696117162704468, + "learning_rate": 0.00016169274451761012, + "loss": 2.8388, + "step": 6346 + }, + { + "epoch": 0.5750266132137437, + "grad_norm": 0.7526652216911316, + "learning_rate": 0.00016168670331661935, + "loss": 2.7604, + "step": 6347 + }, + { + "epoch": 0.5751172113881905, + "grad_norm": 0.8246147632598877, + "learning_rate": 0.0001616806621156286, + "loss": 3.0024, + "step": 6348 + }, + { + "epoch": 0.5752078095626373, + "grad_norm": 0.7276580333709717, + "learning_rate": 0.00016167462091463782, + "loss": 2.3483, + "step": 6349 + }, + { + "epoch": 0.5752984077370841, + "grad_norm": 0.7754582166671753, + "learning_rate": 0.00016166857971364708, + "loss": 3.0184, + "step": 6350 + }, + { + "epoch": 0.5753890059115309, + "grad_norm": 0.7870290279388428, + "learning_rate": 0.0001616625385126563, + "loss": 2.8787, + "step": 6351 + }, + { + "epoch": 0.5754796040859776, + "grad_norm": 0.8699702024459839, + "learning_rate": 0.00016165649731166557, + "loss": 2.8946, + "step": 6352 + }, + { + "epoch": 0.5755702022604244, + "grad_norm": 0.8178938031196594, + "learning_rate": 0.0001616504561106748, + "loss": 2.9069, + "step": 6353 + }, + { + "epoch": 0.5756608004348712, + "grad_norm": 0.7773289084434509, + "learning_rate": 0.00016164441490968404, + "loss": 2.9727, + "step": 6354 + }, + { + "epoch": 0.575751398609318, + "grad_norm": 0.6770817041397095, + "learning_rate": 0.0001616383737086933, + "loss": 2.2581, + "step": 6355 + }, + { + "epoch": 0.5758419967837648, + "grad_norm": 0.7598217129707336, + "learning_rate": 0.00016163233250770254, + "loss": 2.6718, + "step": 6356 + }, + { + "epoch": 0.5759325949582116, + "grad_norm": 0.7699580192565918, + "learning_rate": 0.0001616262913067118, + "loss": 2.6803, + "step": 6357 + }, + { + "epoch": 0.5760231931326584, + "grad_norm": 0.777739942073822, + "learning_rate": 0.000161620250105721, + "loss": 2.7356, + "step": 6358 + }, + { + "epoch": 0.5761137913071052, + "grad_norm": 0.7166391015052795, + "learning_rate": 0.00016161420890473027, + "loss": 2.7618, + "step": 6359 + }, + { + "epoch": 0.576204389481552, + "grad_norm": 0.7785729169845581, + "learning_rate": 0.00016160816770373953, + "loss": 2.7829, + "step": 6360 + }, + { + "epoch": 0.5762949876559987, + "grad_norm": 0.7960578203201294, + "learning_rate": 0.00016160212650274876, + "loss": 2.9317, + "step": 6361 + }, + { + "epoch": 0.5763855858304455, + "grad_norm": 0.7839518785476685, + "learning_rate": 0.000161596085301758, + "loss": 2.8181, + "step": 6362 + }, + { + "epoch": 0.5764761840048923, + "grad_norm": 0.7592917680740356, + "learning_rate": 0.00016159004410076723, + "loss": 2.8788, + "step": 6363 + }, + { + "epoch": 0.5765667821793391, + "grad_norm": 0.8114678263664246, + "learning_rate": 0.0001615840028997765, + "loss": 2.6597, + "step": 6364 + }, + { + "epoch": 0.5766573803537859, + "grad_norm": 0.887795090675354, + "learning_rate": 0.00016157796169878573, + "loss": 2.84, + "step": 6365 + }, + { + "epoch": 0.5767479785282327, + "grad_norm": 0.8321828842163086, + "learning_rate": 0.00016157192049779496, + "loss": 2.9087, + "step": 6366 + }, + { + "epoch": 0.5768385767026795, + "grad_norm": 0.7864603400230408, + "learning_rate": 0.00016156587929680422, + "loss": 2.7093, + "step": 6367 + }, + { + "epoch": 0.5769291748771262, + "grad_norm": 0.7920430898666382, + "learning_rate": 0.00016155983809581345, + "loss": 2.8988, + "step": 6368 + }, + { + "epoch": 0.577019773051573, + "grad_norm": 0.8224665522575378, + "learning_rate": 0.00016155379689482272, + "loss": 2.7406, + "step": 6369 + }, + { + "epoch": 0.5771103712260198, + "grad_norm": 0.7845863103866577, + "learning_rate": 0.00016154775569383195, + "loss": 2.9224, + "step": 6370 + }, + { + "epoch": 0.5772009694004666, + "grad_norm": 0.6631881594657898, + "learning_rate": 0.00016154171449284118, + "loss": 2.1181, + "step": 6371 + }, + { + "epoch": 0.5772915675749134, + "grad_norm": 0.7688816785812378, + "learning_rate": 0.00016153567329185042, + "loss": 2.5949, + "step": 6372 + }, + { + "epoch": 0.5773821657493602, + "grad_norm": 0.7653917074203491, + "learning_rate": 0.00016152963209085968, + "loss": 2.7931, + "step": 6373 + }, + { + "epoch": 0.577472763923807, + "grad_norm": 0.6814138293266296, + "learning_rate": 0.0001615235908898689, + "loss": 1.438, + "step": 6374 + }, + { + "epoch": 0.5775633620982538, + "grad_norm": 0.7644531726837158, + "learning_rate": 0.00016151754968887815, + "loss": 2.7095, + "step": 6375 + }, + { + "epoch": 0.5776539602727006, + "grad_norm": 0.7840530872344971, + "learning_rate": 0.0001615115084878874, + "loss": 2.9139, + "step": 6376 + }, + { + "epoch": 0.5777445584471473, + "grad_norm": 0.8011008501052856, + "learning_rate": 0.00016150546728689664, + "loss": 2.9153, + "step": 6377 + }, + { + "epoch": 0.5778351566215941, + "grad_norm": 0.8154099583625793, + "learning_rate": 0.0001614994260859059, + "loss": 2.7751, + "step": 6378 + }, + { + "epoch": 0.5779257547960409, + "grad_norm": 0.6862921118736267, + "learning_rate": 0.0001614933848849151, + "loss": 2.0148, + "step": 6379 + }, + { + "epoch": 0.5780163529704876, + "grad_norm": 0.806822657585144, + "learning_rate": 0.00016148734368392437, + "loss": 2.7579, + "step": 6380 + }, + { + "epoch": 0.5781069511449344, + "grad_norm": 0.7706897258758545, + "learning_rate": 0.0001614813024829336, + "loss": 2.76, + "step": 6381 + }, + { + "epoch": 0.5781975493193812, + "grad_norm": 0.7495807409286499, + "learning_rate": 0.00016147526128194287, + "loss": 2.8037, + "step": 6382 + }, + { + "epoch": 0.578288147493828, + "grad_norm": 0.788901686668396, + "learning_rate": 0.0001614692200809521, + "loss": 2.852, + "step": 6383 + }, + { + "epoch": 0.5783787456682747, + "grad_norm": 0.7709506750106812, + "learning_rate": 0.00016146317887996134, + "loss": 2.6916, + "step": 6384 + }, + { + "epoch": 0.5784693438427215, + "grad_norm": 0.8013147115707397, + "learning_rate": 0.0001614571376789706, + "loss": 2.8938, + "step": 6385 + }, + { + "epoch": 0.5785599420171683, + "grad_norm": 0.6397950649261475, + "learning_rate": 0.00016145109647797983, + "loss": 2.0629, + "step": 6386 + }, + { + "epoch": 0.5786505401916151, + "grad_norm": 0.81270432472229, + "learning_rate": 0.00016144505527698906, + "loss": 2.6804, + "step": 6387 + }, + { + "epoch": 0.5787411383660619, + "grad_norm": 0.7682207226753235, + "learning_rate": 0.0001614390140759983, + "loss": 3.0153, + "step": 6388 + }, + { + "epoch": 0.5788317365405087, + "grad_norm": 0.7949522137641907, + "learning_rate": 0.00016143297287500756, + "loss": 2.7722, + "step": 6389 + }, + { + "epoch": 0.5789223347149555, + "grad_norm": 0.7892931699752808, + "learning_rate": 0.00016142693167401682, + "loss": 2.8845, + "step": 6390 + }, + { + "epoch": 0.5790129328894023, + "grad_norm": 0.7468647360801697, + "learning_rate": 0.00016142089047302605, + "loss": 2.6844, + "step": 6391 + }, + { + "epoch": 0.579103531063849, + "grad_norm": 0.6787067651748657, + "learning_rate": 0.0001614148492720353, + "loss": 2.3629, + "step": 6392 + }, + { + "epoch": 0.5791941292382958, + "grad_norm": 0.8104769587516785, + "learning_rate": 0.00016140880807104452, + "loss": 2.0008, + "step": 6393 + }, + { + "epoch": 0.5792847274127426, + "grad_norm": 0.7477659583091736, + "learning_rate": 0.00016140276687005378, + "loss": 2.7226, + "step": 6394 + }, + { + "epoch": 0.5793753255871894, + "grad_norm": 0.7664437294006348, + "learning_rate": 0.00016139672566906302, + "loss": 2.7022, + "step": 6395 + }, + { + "epoch": 0.5794659237616362, + "grad_norm": 0.822920024394989, + "learning_rate": 0.00016139068446807225, + "loss": 2.5632, + "step": 6396 + }, + { + "epoch": 0.579556521936083, + "grad_norm": 0.7349152565002441, + "learning_rate": 0.0001613846432670815, + "loss": 2.8419, + "step": 6397 + }, + { + "epoch": 0.5796471201105298, + "grad_norm": 0.7024494409561157, + "learning_rate": 0.00016137860206609075, + "loss": 2.6936, + "step": 6398 + }, + { + "epoch": 0.5797377182849766, + "grad_norm": 0.7474285960197449, + "learning_rate": 0.0001613725608651, + "loss": 2.7814, + "step": 6399 + }, + { + "epoch": 0.5798283164594233, + "grad_norm": 0.7557839751243591, + "learning_rate": 0.00016136651966410922, + "loss": 2.8451, + "step": 6400 + }, + { + "epoch": 0.5799189146338701, + "grad_norm": 0.8118436336517334, + "learning_rate": 0.00016136047846311848, + "loss": 2.5565, + "step": 6401 + }, + { + "epoch": 0.5800095128083169, + "grad_norm": 0.7639405131340027, + "learning_rate": 0.0001613544372621277, + "loss": 2.8622, + "step": 6402 + }, + { + "epoch": 0.5801001109827637, + "grad_norm": 0.6928056478500366, + "learning_rate": 0.00016134839606113697, + "loss": 1.9891, + "step": 6403 + }, + { + "epoch": 0.5801907091572105, + "grad_norm": 0.7664768099784851, + "learning_rate": 0.0001613423548601462, + "loss": 2.6239, + "step": 6404 + }, + { + "epoch": 0.5802813073316573, + "grad_norm": 0.7377654910087585, + "learning_rate": 0.00016133631365915544, + "loss": 2.6659, + "step": 6405 + }, + { + "epoch": 0.5803719055061041, + "grad_norm": 0.7598743438720703, + "learning_rate": 0.0001613302724581647, + "loss": 2.7737, + "step": 6406 + }, + { + "epoch": 0.5804625036805509, + "grad_norm": 0.8322122097015381, + "learning_rate": 0.00016132423125717394, + "loss": 2.6985, + "step": 6407 + }, + { + "epoch": 0.5805531018549976, + "grad_norm": 0.6703125238418579, + "learning_rate": 0.00016131819005618317, + "loss": 2.1024, + "step": 6408 + }, + { + "epoch": 0.5806437000294444, + "grad_norm": 0.761530876159668, + "learning_rate": 0.0001613121488551924, + "loss": 2.7439, + "step": 6409 + }, + { + "epoch": 0.5807342982038912, + "grad_norm": 0.7716554999351501, + "learning_rate": 0.00016130610765420166, + "loss": 2.8426, + "step": 6410 + }, + { + "epoch": 0.580824896378338, + "grad_norm": 0.7860034704208374, + "learning_rate": 0.0001613000664532109, + "loss": 2.7488, + "step": 6411 + }, + { + "epoch": 0.5809154945527848, + "grad_norm": 0.8370069265365601, + "learning_rate": 0.00016129402525222016, + "loss": 2.7834, + "step": 6412 + }, + { + "epoch": 0.5810060927272316, + "grad_norm": 0.7840654253959656, + "learning_rate": 0.0001612879840512294, + "loss": 2.8497, + "step": 6413 + }, + { + "epoch": 0.5810966909016784, + "grad_norm": 0.7560189962387085, + "learning_rate": 0.00016128194285023863, + "loss": 2.8745, + "step": 6414 + }, + { + "epoch": 0.5811872890761252, + "grad_norm": 0.7474507093429565, + "learning_rate": 0.0001612759016492479, + "loss": 2.7905, + "step": 6415 + }, + { + "epoch": 0.581277887250572, + "grad_norm": 0.8146980404853821, + "learning_rate": 0.00016126986044825712, + "loss": 2.9411, + "step": 6416 + }, + { + "epoch": 0.5813684854250187, + "grad_norm": 0.7983516454696655, + "learning_rate": 0.00016126381924726636, + "loss": 2.9305, + "step": 6417 + }, + { + "epoch": 0.5814590835994655, + "grad_norm": 0.6596072912216187, + "learning_rate": 0.0001612577780462756, + "loss": 2.0595, + "step": 6418 + }, + { + "epoch": 0.5815496817739123, + "grad_norm": 0.7429381012916565, + "learning_rate": 0.00016125173684528485, + "loss": 2.7154, + "step": 6419 + }, + { + "epoch": 0.5816402799483591, + "grad_norm": 0.7922447323799133, + "learning_rate": 0.00016124569564429411, + "loss": 2.8134, + "step": 6420 + }, + { + "epoch": 0.5817308781228058, + "grad_norm": 0.9508923292160034, + "learning_rate": 0.00016123965444330332, + "loss": 2.7989, + "step": 6421 + }, + { + "epoch": 0.5818214762972526, + "grad_norm": 0.8530473709106445, + "learning_rate": 0.00016123361324231258, + "loss": 2.8887, + "step": 6422 + }, + { + "epoch": 0.5819120744716993, + "grad_norm": 0.6009452939033508, + "learning_rate": 0.00016122757204132182, + "loss": 1.4705, + "step": 6423 + }, + { + "epoch": 0.5820026726461461, + "grad_norm": 0.752027153968811, + "learning_rate": 0.00016122153084033108, + "loss": 2.6816, + "step": 6424 + }, + { + "epoch": 0.5820932708205929, + "grad_norm": 0.6596143245697021, + "learning_rate": 0.0001612154896393403, + "loss": 2.0362, + "step": 6425 + }, + { + "epoch": 0.5821838689950397, + "grad_norm": 0.7489548921585083, + "learning_rate": 0.00016120944843834955, + "loss": 2.9026, + "step": 6426 + }, + { + "epoch": 0.5822744671694865, + "grad_norm": 0.7773759961128235, + "learning_rate": 0.0001612034072373588, + "loss": 2.9083, + "step": 6427 + }, + { + "epoch": 0.5823650653439333, + "grad_norm": 0.7974605560302734, + "learning_rate": 0.00016119736603636804, + "loss": 2.8257, + "step": 6428 + }, + { + "epoch": 0.5824556635183801, + "grad_norm": 0.7759786248207092, + "learning_rate": 0.0001611913248353773, + "loss": 2.8738, + "step": 6429 + }, + { + "epoch": 0.5825462616928269, + "grad_norm": 0.7428448796272278, + "learning_rate": 0.0001611852836343865, + "loss": 2.6497, + "step": 6430 + }, + { + "epoch": 0.5826368598672736, + "grad_norm": 0.7758772373199463, + "learning_rate": 0.00016117924243339577, + "loss": 2.7906, + "step": 6431 + }, + { + "epoch": 0.5827274580417204, + "grad_norm": 0.8017013669013977, + "learning_rate": 0.000161173201232405, + "loss": 3.1068, + "step": 6432 + }, + { + "epoch": 0.5828180562161672, + "grad_norm": 0.7973453402519226, + "learning_rate": 0.00016116716003141426, + "loss": 2.7737, + "step": 6433 + }, + { + "epoch": 0.582908654390614, + "grad_norm": 0.7974139451980591, + "learning_rate": 0.0001611611188304235, + "loss": 2.6185, + "step": 6434 + }, + { + "epoch": 0.5829992525650608, + "grad_norm": 0.7749601602554321, + "learning_rate": 0.00016115507762943273, + "loss": 2.7913, + "step": 6435 + }, + { + "epoch": 0.5830898507395076, + "grad_norm": 0.8787773847579956, + "learning_rate": 0.000161149036428442, + "loss": 2.8886, + "step": 6436 + }, + { + "epoch": 0.5831804489139544, + "grad_norm": 0.8011559247970581, + "learning_rate": 0.00016114299522745123, + "loss": 2.5257, + "step": 6437 + }, + { + "epoch": 0.5832710470884012, + "grad_norm": 0.8116858601570129, + "learning_rate": 0.00016113695402646046, + "loss": 2.9328, + "step": 6438 + }, + { + "epoch": 0.583361645262848, + "grad_norm": 0.7493641972541809, + "learning_rate": 0.0001611309128254697, + "loss": 2.7718, + "step": 6439 + }, + { + "epoch": 0.5834522434372947, + "grad_norm": 0.8379961252212524, + "learning_rate": 0.00016112487162447896, + "loss": 2.8216, + "step": 6440 + }, + { + "epoch": 0.5835428416117415, + "grad_norm": 0.8543705940246582, + "learning_rate": 0.0001611188304234882, + "loss": 3.2005, + "step": 6441 + }, + { + "epoch": 0.5836334397861883, + "grad_norm": 0.7062990665435791, + "learning_rate": 0.00016111278922249745, + "loss": 1.9438, + "step": 6442 + }, + { + "epoch": 0.5837240379606351, + "grad_norm": 0.791081428527832, + "learning_rate": 0.0001611067480215067, + "loss": 2.1388, + "step": 6443 + }, + { + "epoch": 0.5838146361350819, + "grad_norm": 0.7172688841819763, + "learning_rate": 0.00016110070682051592, + "loss": 2.0885, + "step": 6444 + }, + { + "epoch": 0.5839052343095287, + "grad_norm": 0.6634475588798523, + "learning_rate": 0.00016109466561952518, + "loss": 2.3398, + "step": 6445 + }, + { + "epoch": 0.5839958324839755, + "grad_norm": 0.841059148311615, + "learning_rate": 0.00016108862441853442, + "loss": 2.9119, + "step": 6446 + }, + { + "epoch": 0.5840864306584223, + "grad_norm": 0.7874518036842346, + "learning_rate": 0.00016108258321754365, + "loss": 2.8603, + "step": 6447 + }, + { + "epoch": 0.584177028832869, + "grad_norm": 0.7932372689247131, + "learning_rate": 0.00016107654201655288, + "loss": 2.7582, + "step": 6448 + }, + { + "epoch": 0.5842676270073158, + "grad_norm": 0.7304374575614929, + "learning_rate": 0.00016107050081556215, + "loss": 2.0432, + "step": 6449 + }, + { + "epoch": 0.5843582251817626, + "grad_norm": 0.8004328012466431, + "learning_rate": 0.0001610644596145714, + "loss": 2.7999, + "step": 6450 + }, + { + "epoch": 0.5844488233562094, + "grad_norm": 0.7787051796913147, + "learning_rate": 0.0001610584184135806, + "loss": 2.8391, + "step": 6451 + }, + { + "epoch": 0.5845394215306562, + "grad_norm": 0.7956358194351196, + "learning_rate": 0.00016105237721258987, + "loss": 2.7458, + "step": 6452 + }, + { + "epoch": 0.584630019705103, + "grad_norm": 0.8608744740486145, + "learning_rate": 0.0001610463360115991, + "loss": 3.1649, + "step": 6453 + }, + { + "epoch": 0.5847206178795498, + "grad_norm": 0.78754061460495, + "learning_rate": 0.00016104029481060837, + "loss": 2.8628, + "step": 6454 + }, + { + "epoch": 0.5848112160539966, + "grad_norm": 0.7743457555770874, + "learning_rate": 0.0001610342536096176, + "loss": 2.9929, + "step": 6455 + }, + { + "epoch": 0.5849018142284433, + "grad_norm": 0.788504421710968, + "learning_rate": 0.00016102821240862684, + "loss": 2.6696, + "step": 6456 + }, + { + "epoch": 0.5849924124028901, + "grad_norm": 0.7133967280387878, + "learning_rate": 0.0001610221712076361, + "loss": 2.1752, + "step": 6457 + }, + { + "epoch": 0.5850830105773369, + "grad_norm": 0.744251012802124, + "learning_rate": 0.00016101613000664533, + "loss": 2.7641, + "step": 6458 + }, + { + "epoch": 0.5851736087517837, + "grad_norm": 0.8123493790626526, + "learning_rate": 0.00016101008880565457, + "loss": 2.9402, + "step": 6459 + }, + { + "epoch": 0.5852642069262305, + "grad_norm": 0.787761926651001, + "learning_rate": 0.0001610040476046638, + "loss": 2.8782, + "step": 6460 + }, + { + "epoch": 0.5853548051006772, + "grad_norm": 0.7515963912010193, + "learning_rate": 0.00016099800640367306, + "loss": 2.7784, + "step": 6461 + }, + { + "epoch": 0.585445403275124, + "grad_norm": 0.7830501198768616, + "learning_rate": 0.0001609919652026823, + "loss": 2.832, + "step": 6462 + }, + { + "epoch": 0.5855360014495707, + "grad_norm": 0.8656880259513855, + "learning_rate": 0.00016098592400169156, + "loss": 2.831, + "step": 6463 + }, + { + "epoch": 0.5856265996240175, + "grad_norm": 0.7740803360939026, + "learning_rate": 0.00016097988280070076, + "loss": 2.6919, + "step": 6464 + }, + { + "epoch": 0.5857171977984643, + "grad_norm": 0.8261739015579224, + "learning_rate": 0.00016097384159971003, + "loss": 2.9029, + "step": 6465 + }, + { + "epoch": 0.5858077959729111, + "grad_norm": 0.7470955848693848, + "learning_rate": 0.0001609678003987193, + "loss": 2.4994, + "step": 6466 + }, + { + "epoch": 0.5858983941473579, + "grad_norm": 0.7991675734519958, + "learning_rate": 0.00016096175919772852, + "loss": 3.1002, + "step": 6467 + }, + { + "epoch": 0.5859889923218047, + "grad_norm": 0.8144655823707581, + "learning_rate": 0.00016095571799673775, + "loss": 2.8284, + "step": 6468 + }, + { + "epoch": 0.5860795904962515, + "grad_norm": 0.8051585555076599, + "learning_rate": 0.000160949676795747, + "loss": 2.9348, + "step": 6469 + }, + { + "epoch": 0.5861701886706983, + "grad_norm": 0.7626333236694336, + "learning_rate": 0.00016094363559475625, + "loss": 2.8017, + "step": 6470 + }, + { + "epoch": 0.586260786845145, + "grad_norm": 0.7284559607505798, + "learning_rate": 0.00016093759439376548, + "loss": 2.4366, + "step": 6471 + }, + { + "epoch": 0.5863513850195918, + "grad_norm": 0.7642302513122559, + "learning_rate": 0.00016093155319277472, + "loss": 2.9119, + "step": 6472 + }, + { + "epoch": 0.5864419831940386, + "grad_norm": 0.8452093005180359, + "learning_rate": 0.00016092551199178398, + "loss": 2.6877, + "step": 6473 + }, + { + "epoch": 0.5865325813684854, + "grad_norm": 0.690949559211731, + "learning_rate": 0.0001609194707907932, + "loss": 2.2569, + "step": 6474 + }, + { + "epoch": 0.5866231795429322, + "grad_norm": 0.7978082895278931, + "learning_rate": 0.00016091342958980247, + "loss": 3.1042, + "step": 6475 + }, + { + "epoch": 0.586713777717379, + "grad_norm": 0.8915961384773254, + "learning_rate": 0.0001609073883888117, + "loss": 2.8431, + "step": 6476 + }, + { + "epoch": 0.5868043758918258, + "grad_norm": 0.7540754079818726, + "learning_rate": 0.00016090134718782094, + "loss": 2.691, + "step": 6477 + }, + { + "epoch": 0.5868949740662726, + "grad_norm": 0.8110822439193726, + "learning_rate": 0.00016089530598683018, + "loss": 2.9386, + "step": 6478 + }, + { + "epoch": 0.5869855722407193, + "grad_norm": 0.7790345549583435, + "learning_rate": 0.00016088926478583944, + "loss": 2.7107, + "step": 6479 + }, + { + "epoch": 0.5870761704151661, + "grad_norm": 0.7488500475883484, + "learning_rate": 0.0001608832235848487, + "loss": 2.7061, + "step": 6480 + }, + { + "epoch": 0.5871667685896129, + "grad_norm": 0.6446399688720703, + "learning_rate": 0.0001608771823838579, + "loss": 2.1028, + "step": 6481 + }, + { + "epoch": 0.5872573667640597, + "grad_norm": 0.7716556191444397, + "learning_rate": 0.00016087114118286717, + "loss": 2.9633, + "step": 6482 + }, + { + "epoch": 0.5873479649385065, + "grad_norm": 0.7848048210144043, + "learning_rate": 0.0001608650999818764, + "loss": 2.6664, + "step": 6483 + }, + { + "epoch": 0.5874385631129533, + "grad_norm": 0.8099701404571533, + "learning_rate": 0.00016085905878088566, + "loss": 2.6234, + "step": 6484 + }, + { + "epoch": 0.5875291612874001, + "grad_norm": 0.8177139759063721, + "learning_rate": 0.00016085301757989487, + "loss": 2.9192, + "step": 6485 + }, + { + "epoch": 0.5876197594618469, + "grad_norm": 0.686674952507019, + "learning_rate": 0.00016084697637890413, + "loss": 1.9381, + "step": 6486 + }, + { + "epoch": 0.5877103576362936, + "grad_norm": 0.8900063633918762, + "learning_rate": 0.0001608409351779134, + "loss": 2.7094, + "step": 6487 + }, + { + "epoch": 0.5878009558107404, + "grad_norm": 0.7796790599822998, + "learning_rate": 0.00016083489397692263, + "loss": 2.9183, + "step": 6488 + }, + { + "epoch": 0.5878915539851872, + "grad_norm": 0.8017303347587585, + "learning_rate": 0.00016082885277593186, + "loss": 2.7645, + "step": 6489 + }, + { + "epoch": 0.587982152159634, + "grad_norm": 0.7949511408805847, + "learning_rate": 0.0001608228115749411, + "loss": 2.9549, + "step": 6490 + }, + { + "epoch": 0.5880727503340808, + "grad_norm": 0.783331036567688, + "learning_rate": 0.00016081677037395035, + "loss": 2.5497, + "step": 6491 + }, + { + "epoch": 0.5881633485085276, + "grad_norm": 0.7851982712745667, + "learning_rate": 0.0001608107291729596, + "loss": 2.895, + "step": 6492 + }, + { + "epoch": 0.5882539466829744, + "grad_norm": 0.737775981426239, + "learning_rate": 0.00016080468797196885, + "loss": 2.8824, + "step": 6493 + }, + { + "epoch": 0.5883445448574212, + "grad_norm": 0.8411592245101929, + "learning_rate": 0.00016079864677097806, + "loss": 3.1612, + "step": 6494 + }, + { + "epoch": 0.588435143031868, + "grad_norm": 0.8364894986152649, + "learning_rate": 0.00016079260556998732, + "loss": 2.5787, + "step": 6495 + }, + { + "epoch": 0.5885257412063147, + "grad_norm": 0.7791295051574707, + "learning_rate": 0.00016078656436899658, + "loss": 2.6159, + "step": 6496 + }, + { + "epoch": 0.5886163393807615, + "grad_norm": 0.7746856212615967, + "learning_rate": 0.0001607805231680058, + "loss": 2.8951, + "step": 6497 + }, + { + "epoch": 0.5887069375552083, + "grad_norm": 0.7391599416732788, + "learning_rate": 0.00016077448196701505, + "loss": 2.779, + "step": 6498 + }, + { + "epoch": 0.5887975357296551, + "grad_norm": 0.7901140451431274, + "learning_rate": 0.00016076844076602428, + "loss": 2.5429, + "step": 6499 + }, + { + "epoch": 0.5888881339041019, + "grad_norm": 0.775015115737915, + "learning_rate": 0.00016076239956503354, + "loss": 2.8422, + "step": 6500 + }, + { + "epoch": 0.5889787320785487, + "grad_norm": 0.8196051120758057, + "learning_rate": 0.00016075635836404278, + "loss": 2.8141, + "step": 6501 + }, + { + "epoch": 0.5890693302529953, + "grad_norm": 0.7669282555580139, + "learning_rate": 0.000160750317163052, + "loss": 2.2307, + "step": 6502 + }, + { + "epoch": 0.5891599284274421, + "grad_norm": 0.8245154023170471, + "learning_rate": 0.00016074427596206127, + "loss": 2.9578, + "step": 6503 + }, + { + "epoch": 0.5892505266018889, + "grad_norm": 0.7778146266937256, + "learning_rate": 0.0001607382347610705, + "loss": 2.8048, + "step": 6504 + }, + { + "epoch": 0.5893411247763357, + "grad_norm": 0.7662192583084106, + "learning_rate": 0.00016073219356007977, + "loss": 2.8426, + "step": 6505 + }, + { + "epoch": 0.5894317229507825, + "grad_norm": 0.7568835020065308, + "learning_rate": 0.000160726152359089, + "loss": 2.7141, + "step": 6506 + }, + { + "epoch": 0.5895223211252293, + "grad_norm": 0.9593003392219543, + "learning_rate": 0.00016072011115809824, + "loss": 2.8458, + "step": 6507 + }, + { + "epoch": 0.5896129192996761, + "grad_norm": 0.7980571985244751, + "learning_rate": 0.00016071406995710747, + "loss": 2.6849, + "step": 6508 + }, + { + "epoch": 0.5897035174741229, + "grad_norm": 0.8382912278175354, + "learning_rate": 0.00016070802875611673, + "loss": 3.1465, + "step": 6509 + }, + { + "epoch": 0.5897941156485697, + "grad_norm": 0.8097947835922241, + "learning_rate": 0.00016070198755512596, + "loss": 3.0361, + "step": 6510 + }, + { + "epoch": 0.5898847138230164, + "grad_norm": 0.8103305697441101, + "learning_rate": 0.0001606959463541352, + "loss": 2.9413, + "step": 6511 + }, + { + "epoch": 0.5899753119974632, + "grad_norm": 0.8065044283866882, + "learning_rate": 0.00016068990515314446, + "loss": 2.6072, + "step": 6512 + }, + { + "epoch": 0.59006591017191, + "grad_norm": 0.8610251545906067, + "learning_rate": 0.0001606838639521537, + "loss": 2.9939, + "step": 6513 + }, + { + "epoch": 0.5901565083463568, + "grad_norm": 0.7356152534484863, + "learning_rate": 0.00016067782275116295, + "loss": 3.1089, + "step": 6514 + }, + { + "epoch": 0.5902471065208036, + "grad_norm": 0.7799525856971741, + "learning_rate": 0.00016067178155017216, + "loss": 2.5834, + "step": 6515 + }, + { + "epoch": 0.5903377046952504, + "grad_norm": 0.7188210487365723, + "learning_rate": 0.00016066574034918142, + "loss": 2.9014, + "step": 6516 + }, + { + "epoch": 0.5904283028696972, + "grad_norm": 0.823655366897583, + "learning_rate": 0.00016065969914819068, + "loss": 2.8719, + "step": 6517 + }, + { + "epoch": 0.590518901044144, + "grad_norm": 0.7726443409919739, + "learning_rate": 0.00016065365794719992, + "loss": 2.7623, + "step": 6518 + }, + { + "epoch": 0.5906094992185907, + "grad_norm": 0.7380010485649109, + "learning_rate": 0.00016064761674620915, + "loss": 2.5824, + "step": 6519 + }, + { + "epoch": 0.5907000973930375, + "grad_norm": 0.7995994091033936, + "learning_rate": 0.00016064157554521839, + "loss": 2.9286, + "step": 6520 + }, + { + "epoch": 0.5907906955674843, + "grad_norm": 0.8472757339477539, + "learning_rate": 0.00016063553434422765, + "loss": 2.8608, + "step": 6521 + }, + { + "epoch": 0.5908812937419311, + "grad_norm": 0.7722655534744263, + "learning_rate": 0.00016062949314323688, + "loss": 2.673, + "step": 6522 + }, + { + "epoch": 0.5909718919163779, + "grad_norm": 0.7869336605072021, + "learning_rate": 0.00016062345194224612, + "loss": 2.8508, + "step": 6523 + }, + { + "epoch": 0.5910624900908247, + "grad_norm": 0.7871437072753906, + "learning_rate": 0.00016061741074125535, + "loss": 2.9905, + "step": 6524 + }, + { + "epoch": 0.5911530882652715, + "grad_norm": 0.8068969249725342, + "learning_rate": 0.0001606113695402646, + "loss": 2.9481, + "step": 6525 + }, + { + "epoch": 0.5912436864397183, + "grad_norm": 0.7685253620147705, + "learning_rate": 0.00016060532833927387, + "loss": 2.7626, + "step": 6526 + }, + { + "epoch": 0.591334284614165, + "grad_norm": 0.7365028858184814, + "learning_rate": 0.0001605992871382831, + "loss": 2.6907, + "step": 6527 + }, + { + "epoch": 0.5914248827886118, + "grad_norm": 0.7344130277633667, + "learning_rate": 0.00016059324593729234, + "loss": 2.8236, + "step": 6528 + }, + { + "epoch": 0.5915154809630586, + "grad_norm": 0.8090617060661316, + "learning_rate": 0.00016058720473630157, + "loss": 2.9186, + "step": 6529 + }, + { + "epoch": 0.5916060791375054, + "grad_norm": 0.8450279235839844, + "learning_rate": 0.00016058116353531084, + "loss": 2.7505, + "step": 6530 + }, + { + "epoch": 0.5916966773119522, + "grad_norm": 0.7733344435691833, + "learning_rate": 0.00016057512233432007, + "loss": 2.7041, + "step": 6531 + }, + { + "epoch": 0.591787275486399, + "grad_norm": 0.8578661680221558, + "learning_rate": 0.0001605690811333293, + "loss": 2.856, + "step": 6532 + }, + { + "epoch": 0.5918778736608458, + "grad_norm": 0.784783124923706, + "learning_rate": 0.00016056303993233856, + "loss": 2.7098, + "step": 6533 + }, + { + "epoch": 0.5919684718352926, + "grad_norm": 0.8310110569000244, + "learning_rate": 0.0001605569987313478, + "loss": 2.9505, + "step": 6534 + }, + { + "epoch": 0.5920590700097393, + "grad_norm": 0.7613919973373413, + "learning_rate": 0.00016055095753035706, + "loss": 2.6198, + "step": 6535 + }, + { + "epoch": 0.5921496681841861, + "grad_norm": 0.7711179852485657, + "learning_rate": 0.00016054491632936627, + "loss": 2.7799, + "step": 6536 + }, + { + "epoch": 0.5922402663586329, + "grad_norm": 0.96076899766922, + "learning_rate": 0.00016053887512837553, + "loss": 2.9473, + "step": 6537 + }, + { + "epoch": 0.5923308645330797, + "grad_norm": 0.8636482357978821, + "learning_rate": 0.00016053283392738476, + "loss": 2.9895, + "step": 6538 + }, + { + "epoch": 0.5924214627075265, + "grad_norm": 0.7390627264976501, + "learning_rate": 0.00016052679272639402, + "loss": 2.7462, + "step": 6539 + }, + { + "epoch": 0.5925120608819733, + "grad_norm": 0.782942533493042, + "learning_rate": 0.00016052075152540326, + "loss": 2.6957, + "step": 6540 + }, + { + "epoch": 0.5926026590564201, + "grad_norm": 0.7931856513023376, + "learning_rate": 0.0001605147103244125, + "loss": 2.8144, + "step": 6541 + }, + { + "epoch": 0.5926932572308667, + "grad_norm": 0.7939469814300537, + "learning_rate": 0.00016050866912342175, + "loss": 2.8289, + "step": 6542 + }, + { + "epoch": 0.5927838554053135, + "grad_norm": 0.8060559034347534, + "learning_rate": 0.00016050262792243099, + "loss": 2.5013, + "step": 6543 + }, + { + "epoch": 0.5928744535797603, + "grad_norm": 0.7780488133430481, + "learning_rate": 0.00016049658672144025, + "loss": 2.5814, + "step": 6544 + }, + { + "epoch": 0.5929650517542071, + "grad_norm": 0.8803808689117432, + "learning_rate": 0.00016049054552044945, + "loss": 3.0234, + "step": 6545 + }, + { + "epoch": 0.5930556499286539, + "grad_norm": 0.8060812950134277, + "learning_rate": 0.00016048450431945872, + "loss": 2.4415, + "step": 6546 + }, + { + "epoch": 0.5931462481031007, + "grad_norm": 0.7975788116455078, + "learning_rate": 0.00016047846311846798, + "loss": 2.7275, + "step": 6547 + }, + { + "epoch": 0.5932368462775475, + "grad_norm": 0.7284592390060425, + "learning_rate": 0.0001604724219174772, + "loss": 2.6777, + "step": 6548 + }, + { + "epoch": 0.5933274444519943, + "grad_norm": 0.7972930669784546, + "learning_rate": 0.00016046638071648644, + "loss": 2.7634, + "step": 6549 + }, + { + "epoch": 0.593418042626441, + "grad_norm": 0.8338993787765503, + "learning_rate": 0.00016046033951549568, + "loss": 2.6611, + "step": 6550 + }, + { + "epoch": 0.5935086408008878, + "grad_norm": 0.7916128635406494, + "learning_rate": 0.00016045429831450494, + "loss": 2.7206, + "step": 6551 + }, + { + "epoch": 0.5935992389753346, + "grad_norm": 0.8957263827323914, + "learning_rate": 0.00016044825711351417, + "loss": 2.2261, + "step": 6552 + }, + { + "epoch": 0.5936898371497814, + "grad_norm": 0.7940071821212769, + "learning_rate": 0.0001604422159125234, + "loss": 2.7093, + "step": 6553 + }, + { + "epoch": 0.5937804353242282, + "grad_norm": 0.8617628216743469, + "learning_rate": 0.00016043617471153264, + "loss": 2.5908, + "step": 6554 + }, + { + "epoch": 0.593871033498675, + "grad_norm": 0.7818888425827026, + "learning_rate": 0.0001604301335105419, + "loss": 2.6788, + "step": 6555 + }, + { + "epoch": 0.5939616316731218, + "grad_norm": 0.8790663480758667, + "learning_rate": 0.00016042409230955116, + "loss": 2.818, + "step": 6556 + }, + { + "epoch": 0.5940522298475686, + "grad_norm": 0.7547175288200378, + "learning_rate": 0.0001604180511085604, + "loss": 2.8233, + "step": 6557 + }, + { + "epoch": 0.5941428280220153, + "grad_norm": 0.7859234809875488, + "learning_rate": 0.00016041200990756963, + "loss": 2.7231, + "step": 6558 + }, + { + "epoch": 0.5942334261964621, + "grad_norm": 0.8081414103507996, + "learning_rate": 0.00016040596870657887, + "loss": 2.6858, + "step": 6559 + }, + { + "epoch": 0.5943240243709089, + "grad_norm": 0.7832096815109253, + "learning_rate": 0.00016039992750558813, + "loss": 2.8234, + "step": 6560 + }, + { + "epoch": 0.5944146225453557, + "grad_norm": 0.8027554750442505, + "learning_rate": 0.00016039388630459736, + "loss": 2.6806, + "step": 6561 + }, + { + "epoch": 0.5945052207198025, + "grad_norm": 0.843209445476532, + "learning_rate": 0.0001603878451036066, + "loss": 3.2031, + "step": 6562 + }, + { + "epoch": 0.5945958188942493, + "grad_norm": 0.7923314571380615, + "learning_rate": 0.00016038180390261586, + "loss": 2.6668, + "step": 6563 + }, + { + "epoch": 0.5946864170686961, + "grad_norm": 0.739385187625885, + "learning_rate": 0.0001603757627016251, + "loss": 2.0047, + "step": 6564 + }, + { + "epoch": 0.5947770152431429, + "grad_norm": 0.785069465637207, + "learning_rate": 0.00016036972150063435, + "loss": 2.9413, + "step": 6565 + }, + { + "epoch": 0.5948676134175896, + "grad_norm": 0.8528835773468018, + "learning_rate": 0.00016036368029964356, + "loss": 2.8998, + "step": 6566 + }, + { + "epoch": 0.5949582115920364, + "grad_norm": 0.8175220489501953, + "learning_rate": 0.00016035763909865282, + "loss": 2.996, + "step": 6567 + }, + { + "epoch": 0.5950488097664832, + "grad_norm": 0.823988676071167, + "learning_rate": 0.00016035159789766205, + "loss": 2.8678, + "step": 6568 + }, + { + "epoch": 0.59513940794093, + "grad_norm": 0.786704421043396, + "learning_rate": 0.00016034555669667132, + "loss": 2.8152, + "step": 6569 + }, + { + "epoch": 0.5952300061153768, + "grad_norm": 0.7507853507995605, + "learning_rate": 0.00016033951549568055, + "loss": 2.698, + "step": 6570 + }, + { + "epoch": 0.5953206042898236, + "grad_norm": 0.8256428837776184, + "learning_rate": 0.00016033347429468978, + "loss": 2.7333, + "step": 6571 + }, + { + "epoch": 0.5954112024642704, + "grad_norm": 0.8105140924453735, + "learning_rate": 0.00016032743309369904, + "loss": 2.5104, + "step": 6572 + }, + { + "epoch": 0.5955018006387172, + "grad_norm": 0.8746134042739868, + "learning_rate": 0.00016032139189270828, + "loss": 3.0551, + "step": 6573 + }, + { + "epoch": 0.595592398813164, + "grad_norm": 0.7458591461181641, + "learning_rate": 0.0001603153506917175, + "loss": 2.5574, + "step": 6574 + }, + { + "epoch": 0.5956829969876107, + "grad_norm": 0.8049378395080566, + "learning_rate": 0.00016030930949072675, + "loss": 2.8361, + "step": 6575 + }, + { + "epoch": 0.5957735951620575, + "grad_norm": 0.7955208420753479, + "learning_rate": 0.000160303268289736, + "loss": 2.7921, + "step": 6576 + }, + { + "epoch": 0.5958641933365043, + "grad_norm": 0.757216215133667, + "learning_rate": 0.00016029722708874527, + "loss": 2.9664, + "step": 6577 + }, + { + "epoch": 0.5959547915109511, + "grad_norm": 0.7388872504234314, + "learning_rate": 0.0001602911858877545, + "loss": 2.6519, + "step": 6578 + }, + { + "epoch": 0.5960453896853979, + "grad_norm": 0.6987183094024658, + "learning_rate": 0.00016028514468676374, + "loss": 2.5843, + "step": 6579 + }, + { + "epoch": 0.5961359878598447, + "grad_norm": 0.7318403124809265, + "learning_rate": 0.00016027910348577297, + "loss": 2.8428, + "step": 6580 + }, + { + "epoch": 0.5962265860342915, + "grad_norm": 0.7496508955955505, + "learning_rate": 0.00016027306228478223, + "loss": 2.9881, + "step": 6581 + }, + { + "epoch": 0.5963171842087382, + "grad_norm": 0.8198869824409485, + "learning_rate": 0.00016026702108379147, + "loss": 2.5493, + "step": 6582 + }, + { + "epoch": 0.5964077823831849, + "grad_norm": 0.7645756006240845, + "learning_rate": 0.0001602609798828007, + "loss": 2.7858, + "step": 6583 + }, + { + "epoch": 0.5964983805576317, + "grad_norm": 0.7609567642211914, + "learning_rate": 0.00016025493868180993, + "loss": 2.6699, + "step": 6584 + }, + { + "epoch": 0.5965889787320785, + "grad_norm": 0.7598527669906616, + "learning_rate": 0.0001602488974808192, + "loss": 2.67, + "step": 6585 + }, + { + "epoch": 0.5966795769065253, + "grad_norm": 0.7763387560844421, + "learning_rate": 0.00016024285627982846, + "loss": 2.901, + "step": 6586 + }, + { + "epoch": 0.5967701750809721, + "grad_norm": 0.7954226732254028, + "learning_rate": 0.00016023681507883766, + "loss": 3.0992, + "step": 6587 + }, + { + "epoch": 0.5968607732554189, + "grad_norm": 0.8120779991149902, + "learning_rate": 0.00016023077387784693, + "loss": 2.8494, + "step": 6588 + }, + { + "epoch": 0.5969513714298657, + "grad_norm": 0.8109497427940369, + "learning_rate": 0.00016022473267685616, + "loss": 2.6046, + "step": 6589 + }, + { + "epoch": 0.5970419696043124, + "grad_norm": 0.7611082196235657, + "learning_rate": 0.00016021869147586542, + "loss": 2.8101, + "step": 6590 + }, + { + "epoch": 0.5971325677787592, + "grad_norm": 0.754813015460968, + "learning_rate": 0.00016021265027487465, + "loss": 2.8852, + "step": 6591 + }, + { + "epoch": 0.597223165953206, + "grad_norm": 0.7436503767967224, + "learning_rate": 0.0001602066090738839, + "loss": 2.1271, + "step": 6592 + }, + { + "epoch": 0.5973137641276528, + "grad_norm": 0.8401960730552673, + "learning_rate": 0.00016020056787289315, + "loss": 2.7961, + "step": 6593 + }, + { + "epoch": 0.5974043623020996, + "grad_norm": 0.8043657541275024, + "learning_rate": 0.00016019452667190238, + "loss": 3.026, + "step": 6594 + }, + { + "epoch": 0.5974949604765464, + "grad_norm": 0.5495526194572449, + "learning_rate": 0.00016018848547091162, + "loss": 1.3658, + "step": 6595 + }, + { + "epoch": 0.5975855586509932, + "grad_norm": 0.7082830667495728, + "learning_rate": 0.00016018244426992085, + "loss": 1.9756, + "step": 6596 + }, + { + "epoch": 0.59767615682544, + "grad_norm": 0.8310166597366333, + "learning_rate": 0.0001601764030689301, + "loss": 2.7933, + "step": 6597 + }, + { + "epoch": 0.5977667549998867, + "grad_norm": 0.8568918108940125, + "learning_rate": 0.00016017036186793935, + "loss": 2.8256, + "step": 6598 + }, + { + "epoch": 0.5978573531743335, + "grad_norm": 0.8112151622772217, + "learning_rate": 0.0001601643206669486, + "loss": 2.6916, + "step": 6599 + }, + { + "epoch": 0.5979479513487803, + "grad_norm": 0.7834407091140747, + "learning_rate": 0.00016015827946595784, + "loss": 2.6967, + "step": 6600 + }, + { + "epoch": 0.5980385495232271, + "grad_norm": 0.7485979199409485, + "learning_rate": 0.00016015223826496708, + "loss": 2.6967, + "step": 6601 + }, + { + "epoch": 0.5981291476976739, + "grad_norm": 0.7376247644424438, + "learning_rate": 0.00016014619706397634, + "loss": 2.8665, + "step": 6602 + }, + { + "epoch": 0.5982197458721207, + "grad_norm": 0.7891947031021118, + "learning_rate": 0.00016014015586298557, + "loss": 2.8035, + "step": 6603 + }, + { + "epoch": 0.5983103440465675, + "grad_norm": 0.7704862356185913, + "learning_rate": 0.0001601341146619948, + "loss": 2.6442, + "step": 6604 + }, + { + "epoch": 0.5984009422210143, + "grad_norm": 0.7943727374076843, + "learning_rate": 0.00016012807346100404, + "loss": 2.7035, + "step": 6605 + }, + { + "epoch": 0.598491540395461, + "grad_norm": 0.8000490069389343, + "learning_rate": 0.0001601220322600133, + "loss": 2.6352, + "step": 6606 + }, + { + "epoch": 0.5985821385699078, + "grad_norm": 0.8097966909408569, + "learning_rate": 0.00016011599105902256, + "loss": 2.8598, + "step": 6607 + }, + { + "epoch": 0.5986727367443546, + "grad_norm": 0.7767142057418823, + "learning_rate": 0.00016010994985803177, + "loss": 2.7076, + "step": 6608 + }, + { + "epoch": 0.5987633349188014, + "grad_norm": 1.0142897367477417, + "learning_rate": 0.00016010390865704103, + "loss": 2.0175, + "step": 6609 + }, + { + "epoch": 0.5988539330932482, + "grad_norm": 0.7936180233955383, + "learning_rate": 0.00016009786745605026, + "loss": 2.0871, + "step": 6610 + }, + { + "epoch": 0.598944531267695, + "grad_norm": 0.9381776452064514, + "learning_rate": 0.00016009182625505953, + "loss": 2.9072, + "step": 6611 + }, + { + "epoch": 0.5990351294421418, + "grad_norm": 0.7899720072746277, + "learning_rate": 0.00016008578505406876, + "loss": 2.7629, + "step": 6612 + }, + { + "epoch": 0.5991257276165886, + "grad_norm": 0.7649264335632324, + "learning_rate": 0.000160079743853078, + "loss": 2.6787, + "step": 6613 + }, + { + "epoch": 0.5992163257910353, + "grad_norm": 0.778034508228302, + "learning_rate": 0.00016007370265208723, + "loss": 2.7891, + "step": 6614 + }, + { + "epoch": 0.5993069239654821, + "grad_norm": 0.9064620733261108, + "learning_rate": 0.0001600676614510965, + "loss": 2.7754, + "step": 6615 + }, + { + "epoch": 0.5993975221399289, + "grad_norm": 0.7628629207611084, + "learning_rate": 0.00016006162025010575, + "loss": 2.8696, + "step": 6616 + }, + { + "epoch": 0.5994881203143757, + "grad_norm": 0.8229329586029053, + "learning_rate": 0.00016005557904911496, + "loss": 2.9502, + "step": 6617 + }, + { + "epoch": 0.5995787184888225, + "grad_norm": 0.7661387920379639, + "learning_rate": 0.00016004953784812422, + "loss": 2.7606, + "step": 6618 + }, + { + "epoch": 0.5996693166632693, + "grad_norm": 0.7762496471405029, + "learning_rate": 0.00016004349664713345, + "loss": 2.872, + "step": 6619 + }, + { + "epoch": 0.5997599148377161, + "grad_norm": 0.8897504806518555, + "learning_rate": 0.0001600374554461427, + "loss": 2.778, + "step": 6620 + }, + { + "epoch": 0.5998505130121629, + "grad_norm": 0.7513941526412964, + "learning_rate": 0.00016003141424515195, + "loss": 2.5198, + "step": 6621 + }, + { + "epoch": 0.5999411111866096, + "grad_norm": 0.7726816534996033, + "learning_rate": 0.00016002537304416118, + "loss": 2.8393, + "step": 6622 + }, + { + "epoch": 0.6000317093610563, + "grad_norm": 0.8351532220840454, + "learning_rate": 0.00016001933184317044, + "loss": 2.7591, + "step": 6623 + }, + { + "epoch": 0.6001223075355031, + "grad_norm": 0.7962231636047363, + "learning_rate": 0.00016001329064217968, + "loss": 3.0368, + "step": 6624 + }, + { + "epoch": 0.6002129057099499, + "grad_norm": 0.816051721572876, + "learning_rate": 0.0001600072494411889, + "loss": 2.9915, + "step": 6625 + }, + { + "epoch": 0.6003035038843967, + "grad_norm": 0.7930405735969543, + "learning_rate": 0.00016000120824019814, + "loss": 2.7336, + "step": 6626 + }, + { + "epoch": 0.6003941020588435, + "grad_norm": 0.7935348153114319, + "learning_rate": 0.0001599951670392074, + "loss": 2.7852, + "step": 6627 + }, + { + "epoch": 0.6004847002332903, + "grad_norm": 0.7383541464805603, + "learning_rate": 0.00015998912583821664, + "loss": 2.6328, + "step": 6628 + }, + { + "epoch": 0.600575298407737, + "grad_norm": 0.7017726302146912, + "learning_rate": 0.0001599830846372259, + "loss": 2.5483, + "step": 6629 + }, + { + "epoch": 0.6006658965821838, + "grad_norm": 0.7681573033332825, + "learning_rate": 0.00015997704343623514, + "loss": 2.9355, + "step": 6630 + }, + { + "epoch": 0.6007564947566306, + "grad_norm": 0.7767881751060486, + "learning_rate": 0.00015997100223524437, + "loss": 3.0002, + "step": 6631 + }, + { + "epoch": 0.6008470929310774, + "grad_norm": 0.783509373664856, + "learning_rate": 0.00015996496103425363, + "loss": 2.8591, + "step": 6632 + }, + { + "epoch": 0.6009376911055242, + "grad_norm": 0.8367683291435242, + "learning_rate": 0.00015995891983326286, + "loss": 3.099, + "step": 6633 + }, + { + "epoch": 0.601028289279971, + "grad_norm": 0.8080604076385498, + "learning_rate": 0.0001599528786322721, + "loss": 2.8314, + "step": 6634 + }, + { + "epoch": 0.6011188874544178, + "grad_norm": 0.8402804732322693, + "learning_rate": 0.00015994683743128133, + "loss": 2.7928, + "step": 6635 + }, + { + "epoch": 0.6012094856288646, + "grad_norm": 0.7700327634811401, + "learning_rate": 0.0001599407962302906, + "loss": 2.7823, + "step": 6636 + }, + { + "epoch": 0.6013000838033113, + "grad_norm": 0.8575711846351624, + "learning_rate": 0.00015993475502929985, + "loss": 2.8459, + "step": 6637 + }, + { + "epoch": 0.6013906819777581, + "grad_norm": 0.7835549712181091, + "learning_rate": 0.00015992871382830906, + "loss": 2.8692, + "step": 6638 + }, + { + "epoch": 0.6014812801522049, + "grad_norm": 0.7348049879074097, + "learning_rate": 0.00015992267262731832, + "loss": 2.6704, + "step": 6639 + }, + { + "epoch": 0.6015718783266517, + "grad_norm": 0.7324451208114624, + "learning_rate": 0.00015991663142632756, + "loss": 2.5358, + "step": 6640 + }, + { + "epoch": 0.6016624765010985, + "grad_norm": 0.8055888414382935, + "learning_rate": 0.00015991059022533682, + "loss": 2.8011, + "step": 6641 + }, + { + "epoch": 0.6017530746755453, + "grad_norm": 0.7886210680007935, + "learning_rate": 0.00015990454902434605, + "loss": 2.7492, + "step": 6642 + }, + { + "epoch": 0.6018436728499921, + "grad_norm": 0.7068564891815186, + "learning_rate": 0.00015989850782335529, + "loss": 1.9904, + "step": 6643 + }, + { + "epoch": 0.6019342710244389, + "grad_norm": 0.7671238780021667, + "learning_rate": 0.00015989246662236452, + "loss": 2.6966, + "step": 6644 + }, + { + "epoch": 0.6020248691988856, + "grad_norm": 0.6548976302146912, + "learning_rate": 0.00015988642542137378, + "loss": 2.0043, + "step": 6645 + }, + { + "epoch": 0.6021154673733324, + "grad_norm": 0.8043086528778076, + "learning_rate": 0.00015988038422038302, + "loss": 2.799, + "step": 6646 + }, + { + "epoch": 0.6022060655477792, + "grad_norm": 0.8283980488777161, + "learning_rate": 0.00015987434301939225, + "loss": 2.4905, + "step": 6647 + }, + { + "epoch": 0.602296663722226, + "grad_norm": 0.8465111255645752, + "learning_rate": 0.0001598683018184015, + "loss": 2.7628, + "step": 6648 + }, + { + "epoch": 0.6023872618966728, + "grad_norm": 0.7429111003875732, + "learning_rate": 0.00015986226061741074, + "loss": 2.6278, + "step": 6649 + }, + { + "epoch": 0.6024778600711196, + "grad_norm": 0.815032958984375, + "learning_rate": 0.00015985621941642, + "loss": 2.9033, + "step": 6650 + }, + { + "epoch": 0.6025684582455664, + "grad_norm": 0.7748801708221436, + "learning_rate": 0.0001598501782154292, + "loss": 2.7556, + "step": 6651 + }, + { + "epoch": 0.6026590564200132, + "grad_norm": 0.6249948143959045, + "learning_rate": 0.00015984413701443847, + "loss": 1.9679, + "step": 6652 + }, + { + "epoch": 0.60274965459446, + "grad_norm": 0.7404276132583618, + "learning_rate": 0.00015983809581344774, + "loss": 2.9107, + "step": 6653 + }, + { + "epoch": 0.6028402527689067, + "grad_norm": 0.8347627520561218, + "learning_rate": 0.00015983205461245697, + "loss": 3.0484, + "step": 6654 + }, + { + "epoch": 0.6029308509433535, + "grad_norm": 0.7852662205696106, + "learning_rate": 0.0001598260134114662, + "loss": 2.5518, + "step": 6655 + }, + { + "epoch": 0.6030214491178003, + "grad_norm": 0.8639278411865234, + "learning_rate": 0.00015981997221047544, + "loss": 2.8284, + "step": 6656 + }, + { + "epoch": 0.6031120472922471, + "grad_norm": 0.8407073020935059, + "learning_rate": 0.0001598139310094847, + "loss": 2.9287, + "step": 6657 + }, + { + "epoch": 0.6032026454666939, + "grad_norm": 0.9046632051467896, + "learning_rate": 0.00015980788980849393, + "loss": 2.8617, + "step": 6658 + }, + { + "epoch": 0.6032932436411407, + "grad_norm": 0.8088057637214661, + "learning_rate": 0.00015980184860750317, + "loss": 2.8785, + "step": 6659 + }, + { + "epoch": 0.6033838418155875, + "grad_norm": 0.7630606293678284, + "learning_rate": 0.00015979580740651243, + "loss": 2.8523, + "step": 6660 + }, + { + "epoch": 0.6034744399900343, + "grad_norm": 0.7412778735160828, + "learning_rate": 0.00015978976620552166, + "loss": 2.469, + "step": 6661 + }, + { + "epoch": 0.603565038164481, + "grad_norm": 0.7334986329078674, + "learning_rate": 0.00015978372500453092, + "loss": 2.8235, + "step": 6662 + }, + { + "epoch": 0.6036556363389278, + "grad_norm": 0.7753661274909973, + "learning_rate": 0.00015977768380354016, + "loss": 2.0617, + "step": 6663 + }, + { + "epoch": 0.6037462345133745, + "grad_norm": 0.7910802960395813, + "learning_rate": 0.0001597716426025494, + "loss": 3.0337, + "step": 6664 + }, + { + "epoch": 0.6038368326878213, + "grad_norm": 0.7716887593269348, + "learning_rate": 0.00015976560140155863, + "loss": 2.6533, + "step": 6665 + }, + { + "epoch": 0.6039274308622681, + "grad_norm": 0.7972207069396973, + "learning_rate": 0.00015975956020056789, + "loss": 2.8934, + "step": 6666 + }, + { + "epoch": 0.6040180290367149, + "grad_norm": 0.7730864882469177, + "learning_rate": 0.00015975351899957715, + "loss": 2.8652, + "step": 6667 + }, + { + "epoch": 0.6041086272111617, + "grad_norm": 0.7707040309906006, + "learning_rate": 0.00015974747779858635, + "loss": 2.8301, + "step": 6668 + }, + { + "epoch": 0.6041992253856084, + "grad_norm": 0.6569299697875977, + "learning_rate": 0.00015974143659759562, + "loss": 1.9408, + "step": 6669 + }, + { + "epoch": 0.6042898235600552, + "grad_norm": 0.8505617380142212, + "learning_rate": 0.00015973539539660485, + "loss": 2.9905, + "step": 6670 + }, + { + "epoch": 0.604380421734502, + "grad_norm": 0.7697169780731201, + "learning_rate": 0.0001597293541956141, + "loss": 2.4682, + "step": 6671 + }, + { + "epoch": 0.6044710199089488, + "grad_norm": 0.6966012120246887, + "learning_rate": 0.00015972331299462332, + "loss": 2.2534, + "step": 6672 + }, + { + "epoch": 0.6045616180833956, + "grad_norm": 0.7773188948631287, + "learning_rate": 0.00015971727179363258, + "loss": 2.8246, + "step": 6673 + }, + { + "epoch": 0.6046522162578424, + "grad_norm": 0.6210180521011353, + "learning_rate": 0.0001597112305926418, + "loss": 2.2715, + "step": 6674 + }, + { + "epoch": 0.6047428144322892, + "grad_norm": 0.7689477205276489, + "learning_rate": 0.00015970518939165107, + "loss": 2.5528, + "step": 6675 + }, + { + "epoch": 0.604833412606736, + "grad_norm": 0.7456188201904297, + "learning_rate": 0.0001596991481906603, + "loss": 2.7083, + "step": 6676 + }, + { + "epoch": 0.6049240107811827, + "grad_norm": 0.8803488612174988, + "learning_rate": 0.00015969310698966954, + "loss": 2.6191, + "step": 6677 + }, + { + "epoch": 0.6050146089556295, + "grad_norm": 0.7911258339881897, + "learning_rate": 0.0001596870657886788, + "loss": 2.8825, + "step": 6678 + }, + { + "epoch": 0.6051052071300763, + "grad_norm": 0.8071193695068359, + "learning_rate": 0.00015968102458768804, + "loss": 2.6862, + "step": 6679 + }, + { + "epoch": 0.6051958053045231, + "grad_norm": 0.7705795764923096, + "learning_rate": 0.0001596749833866973, + "loss": 2.9527, + "step": 6680 + }, + { + "epoch": 0.6052864034789699, + "grad_norm": 0.7600675821304321, + "learning_rate": 0.0001596689421857065, + "loss": 2.8081, + "step": 6681 + }, + { + "epoch": 0.6053770016534167, + "grad_norm": 0.8250626921653748, + "learning_rate": 0.00015966290098471577, + "loss": 2.8991, + "step": 6682 + }, + { + "epoch": 0.6054675998278635, + "grad_norm": 0.7790247201919556, + "learning_rate": 0.00015965685978372503, + "loss": 3.036, + "step": 6683 + }, + { + "epoch": 0.6055581980023103, + "grad_norm": 0.7686272263526917, + "learning_rate": 0.00015965081858273426, + "loss": 2.7705, + "step": 6684 + }, + { + "epoch": 0.605648796176757, + "grad_norm": 0.7810840606689453, + "learning_rate": 0.0001596447773817435, + "loss": 2.8832, + "step": 6685 + }, + { + "epoch": 0.6057393943512038, + "grad_norm": 0.8177449107170105, + "learning_rate": 0.00015963873618075273, + "loss": 2.8964, + "step": 6686 + }, + { + "epoch": 0.6058299925256506, + "grad_norm": 0.7174443006515503, + "learning_rate": 0.000159632694979762, + "loss": 2.637, + "step": 6687 + }, + { + "epoch": 0.6059205907000974, + "grad_norm": 0.8231069445610046, + "learning_rate": 0.00015962665377877123, + "loss": 2.4729, + "step": 6688 + }, + { + "epoch": 0.6060111888745442, + "grad_norm": 0.7702984809875488, + "learning_rate": 0.00015962061257778046, + "loss": 2.6265, + "step": 6689 + }, + { + "epoch": 0.606101787048991, + "grad_norm": 0.7481586337089539, + "learning_rate": 0.00015961457137678972, + "loss": 2.9147, + "step": 6690 + }, + { + "epoch": 0.6061923852234378, + "grad_norm": 0.7374228239059448, + "learning_rate": 0.00015960853017579895, + "loss": 2.7554, + "step": 6691 + }, + { + "epoch": 0.6062829833978846, + "grad_norm": 0.7876877188682556, + "learning_rate": 0.00015960248897480822, + "loss": 2.8816, + "step": 6692 + }, + { + "epoch": 0.6063735815723313, + "grad_norm": 0.7620639204978943, + "learning_rate": 0.00015959644777381745, + "loss": 2.6838, + "step": 6693 + }, + { + "epoch": 0.6064641797467781, + "grad_norm": 0.8010193705558777, + "learning_rate": 0.00015959040657282668, + "loss": 2.9627, + "step": 6694 + }, + { + "epoch": 0.6065547779212249, + "grad_norm": 0.7842471599578857, + "learning_rate": 0.00015958436537183592, + "loss": 2.7285, + "step": 6695 + }, + { + "epoch": 0.6066453760956717, + "grad_norm": 0.7904655933380127, + "learning_rate": 0.00015957832417084518, + "loss": 2.82, + "step": 6696 + }, + { + "epoch": 0.6067359742701185, + "grad_norm": 0.8400886058807373, + "learning_rate": 0.0001595722829698544, + "loss": 2.8261, + "step": 6697 + }, + { + "epoch": 0.6068265724445653, + "grad_norm": 0.8253859877586365, + "learning_rate": 0.00015956624176886365, + "loss": 2.9574, + "step": 6698 + }, + { + "epoch": 0.6069171706190121, + "grad_norm": 0.7892544865608215, + "learning_rate": 0.0001595602005678729, + "loss": 2.7256, + "step": 6699 + }, + { + "epoch": 0.6070077687934589, + "grad_norm": 0.7319737672805786, + "learning_rate": 0.00015955415936688214, + "loss": 2.1233, + "step": 6700 + }, + { + "epoch": 0.6070983669679056, + "grad_norm": 0.751402735710144, + "learning_rate": 0.0001595481181658914, + "loss": 2.9187, + "step": 6701 + }, + { + "epoch": 0.6071889651423524, + "grad_norm": 0.7592307329177856, + "learning_rate": 0.0001595420769649006, + "loss": 2.7956, + "step": 6702 + }, + { + "epoch": 0.6072795633167992, + "grad_norm": 0.822433590888977, + "learning_rate": 0.00015953603576390987, + "loss": 2.7574, + "step": 6703 + }, + { + "epoch": 0.6073701614912459, + "grad_norm": 0.7512436509132385, + "learning_rate": 0.0001595299945629191, + "loss": 2.3779, + "step": 6704 + }, + { + "epoch": 0.6074607596656927, + "grad_norm": 0.8235759139060974, + "learning_rate": 0.00015952395336192837, + "loss": 2.8866, + "step": 6705 + }, + { + "epoch": 0.6075513578401395, + "grad_norm": 0.8796727061271667, + "learning_rate": 0.0001595179121609376, + "loss": 2.7627, + "step": 6706 + }, + { + "epoch": 0.6076419560145863, + "grad_norm": 0.7658399343490601, + "learning_rate": 0.00015951187095994683, + "loss": 2.6696, + "step": 6707 + }, + { + "epoch": 0.607732554189033, + "grad_norm": 0.7948396801948547, + "learning_rate": 0.0001595058297589561, + "loss": 2.742, + "step": 6708 + }, + { + "epoch": 0.6078231523634798, + "grad_norm": 0.8151334524154663, + "learning_rate": 0.00015949978855796533, + "loss": 3.0149, + "step": 6709 + }, + { + "epoch": 0.6079137505379266, + "grad_norm": 0.8242236971855164, + "learning_rate": 0.00015949374735697456, + "loss": 3.0625, + "step": 6710 + }, + { + "epoch": 0.6080043487123734, + "grad_norm": 0.8796868920326233, + "learning_rate": 0.0001594877061559838, + "loss": 3.1916, + "step": 6711 + }, + { + "epoch": 0.6080949468868202, + "grad_norm": 0.7683547735214233, + "learning_rate": 0.00015948166495499306, + "loss": 2.9251, + "step": 6712 + }, + { + "epoch": 0.608185545061267, + "grad_norm": 0.8171910643577576, + "learning_rate": 0.00015947562375400232, + "loss": 2.8377, + "step": 6713 + }, + { + "epoch": 0.6082761432357138, + "grad_norm": 0.7421539425849915, + "learning_rate": 0.00015946958255301155, + "loss": 2.7872, + "step": 6714 + }, + { + "epoch": 0.6083667414101606, + "grad_norm": 0.7631475329399109, + "learning_rate": 0.0001594635413520208, + "loss": 2.9172, + "step": 6715 + }, + { + "epoch": 0.6084573395846073, + "grad_norm": 0.7372889518737793, + "learning_rate": 0.00015945750015103002, + "loss": 2.7094, + "step": 6716 + }, + { + "epoch": 0.6085479377590541, + "grad_norm": 0.690965473651886, + "learning_rate": 0.00015945145895003928, + "loss": 2.1022, + "step": 6717 + }, + { + "epoch": 0.6086385359335009, + "grad_norm": 0.7455119490623474, + "learning_rate": 0.00015944541774904852, + "loss": 2.7405, + "step": 6718 + }, + { + "epoch": 0.6087291341079477, + "grad_norm": 0.7901667356491089, + "learning_rate": 0.00015943937654805775, + "loss": 3.1029, + "step": 6719 + }, + { + "epoch": 0.6088197322823945, + "grad_norm": 0.8017676472663879, + "learning_rate": 0.000159433335347067, + "loss": 2.7477, + "step": 6720 + }, + { + "epoch": 0.6089103304568413, + "grad_norm": 0.7874942421913147, + "learning_rate": 0.00015942729414607625, + "loss": 3.0609, + "step": 6721 + }, + { + "epoch": 0.6090009286312881, + "grad_norm": 0.8128302693367004, + "learning_rate": 0.0001594212529450855, + "loss": 2.7827, + "step": 6722 + }, + { + "epoch": 0.6090915268057349, + "grad_norm": 0.7595224976539612, + "learning_rate": 0.00015941521174409472, + "loss": 2.519, + "step": 6723 + }, + { + "epoch": 0.6091821249801816, + "grad_norm": 0.7455273866653442, + "learning_rate": 0.00015940917054310398, + "loss": 2.6077, + "step": 6724 + }, + { + "epoch": 0.6092727231546284, + "grad_norm": 0.7704893946647644, + "learning_rate": 0.0001594031293421132, + "loss": 2.7311, + "step": 6725 + }, + { + "epoch": 0.6093633213290752, + "grad_norm": 0.8304439187049866, + "learning_rate": 0.00015939708814112247, + "loss": 2.8566, + "step": 6726 + }, + { + "epoch": 0.609453919503522, + "grad_norm": 0.7819035053253174, + "learning_rate": 0.0001593910469401317, + "loss": 3.0583, + "step": 6727 + }, + { + "epoch": 0.6095445176779688, + "grad_norm": 0.8917064070701599, + "learning_rate": 0.00015938500573914094, + "loss": 2.6802, + "step": 6728 + }, + { + "epoch": 0.6096351158524156, + "grad_norm": 0.6940508484840393, + "learning_rate": 0.0001593789645381502, + "loss": 2.2175, + "step": 6729 + }, + { + "epoch": 0.6097257140268624, + "grad_norm": 0.7980599403381348, + "learning_rate": 0.00015937292333715943, + "loss": 2.5519, + "step": 6730 + }, + { + "epoch": 0.6098163122013092, + "grad_norm": 0.8163037896156311, + "learning_rate": 0.0001593668821361687, + "loss": 3.14, + "step": 6731 + }, + { + "epoch": 0.609906910375756, + "grad_norm": 0.8622370362281799, + "learning_rate": 0.0001593608409351779, + "loss": 2.867, + "step": 6732 + }, + { + "epoch": 0.6099975085502027, + "grad_norm": 0.7476710081100464, + "learning_rate": 0.00015935479973418716, + "loss": 2.7519, + "step": 6733 + }, + { + "epoch": 0.6100881067246495, + "grad_norm": 0.8114187717437744, + "learning_rate": 0.0001593487585331964, + "loss": 2.9156, + "step": 6734 + }, + { + "epoch": 0.6101787048990963, + "grad_norm": 0.8585984706878662, + "learning_rate": 0.00015934271733220566, + "loss": 2.5057, + "step": 6735 + }, + { + "epoch": 0.6102693030735431, + "grad_norm": 0.736662745475769, + "learning_rate": 0.0001593366761312149, + "loss": 2.7137, + "step": 6736 + }, + { + "epoch": 0.6103599012479899, + "grad_norm": 0.7911975979804993, + "learning_rate": 0.00015933063493022413, + "loss": 2.8497, + "step": 6737 + }, + { + "epoch": 0.6104504994224367, + "grad_norm": 0.8675116300582886, + "learning_rate": 0.0001593245937292334, + "loss": 2.6994, + "step": 6738 + }, + { + "epoch": 0.6105410975968835, + "grad_norm": 0.7969244718551636, + "learning_rate": 0.00015931855252824262, + "loss": 2.9033, + "step": 6739 + }, + { + "epoch": 0.6106316957713303, + "grad_norm": 0.7109357118606567, + "learning_rate": 0.00015931251132725186, + "loss": 2.371, + "step": 6740 + }, + { + "epoch": 0.610722293945777, + "grad_norm": 0.7784517407417297, + "learning_rate": 0.0001593064701262611, + "loss": 2.664, + "step": 6741 + }, + { + "epoch": 0.6108128921202238, + "grad_norm": 0.7700798511505127, + "learning_rate": 0.00015930042892527035, + "loss": 2.8234, + "step": 6742 + }, + { + "epoch": 0.6109034902946706, + "grad_norm": 0.7740976810455322, + "learning_rate": 0.0001592943877242796, + "loss": 3.1093, + "step": 6743 + }, + { + "epoch": 0.6109940884691174, + "grad_norm": 0.7599223852157593, + "learning_rate": 0.00015928834652328885, + "loss": 2.9526, + "step": 6744 + }, + { + "epoch": 0.6110846866435641, + "grad_norm": 0.8517208099365234, + "learning_rate": 0.00015928230532229808, + "loss": 3.0817, + "step": 6745 + }, + { + "epoch": 0.6111752848180109, + "grad_norm": 0.7649569511413574, + "learning_rate": 0.00015927626412130732, + "loss": 2.7913, + "step": 6746 + }, + { + "epoch": 0.6112658829924577, + "grad_norm": 0.7613916397094727, + "learning_rate": 0.00015927022292031658, + "loss": 2.9123, + "step": 6747 + }, + { + "epoch": 0.6113564811669044, + "grad_norm": 0.80134117603302, + "learning_rate": 0.0001592641817193258, + "loss": 2.7636, + "step": 6748 + }, + { + "epoch": 0.6114470793413512, + "grad_norm": 0.779915452003479, + "learning_rate": 0.00015925814051833504, + "loss": 2.734, + "step": 6749 + }, + { + "epoch": 0.611537677515798, + "grad_norm": 0.6870887279510498, + "learning_rate": 0.0001592520993173443, + "loss": 2.1009, + "step": 6750 + }, + { + "epoch": 0.6116282756902448, + "grad_norm": 0.6830922961235046, + "learning_rate": 0.00015924605811635354, + "loss": 2.0995, + "step": 6751 + }, + { + "epoch": 0.6117188738646916, + "grad_norm": 0.875725507736206, + "learning_rate": 0.0001592400169153628, + "loss": 2.8369, + "step": 6752 + }, + { + "epoch": 0.6118094720391384, + "grad_norm": 0.7341935634613037, + "learning_rate": 0.000159233975714372, + "loss": 2.4116, + "step": 6753 + }, + { + "epoch": 0.6119000702135852, + "grad_norm": 0.7825195789337158, + "learning_rate": 0.00015922793451338127, + "loss": 2.7746, + "step": 6754 + }, + { + "epoch": 0.611990668388032, + "grad_norm": 0.7839562892913818, + "learning_rate": 0.0001592218933123905, + "loss": 3.0353, + "step": 6755 + }, + { + "epoch": 0.6120812665624787, + "grad_norm": 0.8704665303230286, + "learning_rate": 0.00015921585211139976, + "loss": 2.985, + "step": 6756 + }, + { + "epoch": 0.6121718647369255, + "grad_norm": 0.8237364888191223, + "learning_rate": 0.000159209810910409, + "loss": 3.0972, + "step": 6757 + }, + { + "epoch": 0.6122624629113723, + "grad_norm": 0.7822195291519165, + "learning_rate": 0.00015920376970941823, + "loss": 2.795, + "step": 6758 + }, + { + "epoch": 0.6123530610858191, + "grad_norm": 0.7179099321365356, + "learning_rate": 0.0001591977285084275, + "loss": 2.5546, + "step": 6759 + }, + { + "epoch": 0.6124436592602659, + "grad_norm": 0.8299955725669861, + "learning_rate": 0.00015919168730743673, + "loss": 2.9422, + "step": 6760 + }, + { + "epoch": 0.6125342574347127, + "grad_norm": 0.9942171573638916, + "learning_rate": 0.00015918564610644596, + "loss": 2.8485, + "step": 6761 + }, + { + "epoch": 0.6126248556091595, + "grad_norm": 0.7924450635910034, + "learning_rate": 0.0001591796049054552, + "loss": 2.8398, + "step": 6762 + }, + { + "epoch": 0.6127154537836063, + "grad_norm": 0.768669843673706, + "learning_rate": 0.00015917356370446446, + "loss": 2.7596, + "step": 6763 + }, + { + "epoch": 0.612806051958053, + "grad_norm": 0.8036745190620422, + "learning_rate": 0.0001591675225034737, + "loss": 2.8748, + "step": 6764 + }, + { + "epoch": 0.6128966501324998, + "grad_norm": 0.8319517970085144, + "learning_rate": 0.00015916148130248295, + "loss": 2.9977, + "step": 6765 + }, + { + "epoch": 0.6129872483069466, + "grad_norm": 0.9252572059631348, + "learning_rate": 0.00015915544010149219, + "loss": 2.7136, + "step": 6766 + }, + { + "epoch": 0.6130778464813934, + "grad_norm": 0.8500002026557922, + "learning_rate": 0.00015914939890050142, + "loss": 2.8257, + "step": 6767 + }, + { + "epoch": 0.6131684446558402, + "grad_norm": 0.8663365840911865, + "learning_rate": 0.00015914335769951068, + "loss": 3.0216, + "step": 6768 + }, + { + "epoch": 0.613259042830287, + "grad_norm": 0.5847336053848267, + "learning_rate": 0.00015913731649851992, + "loss": 1.5172, + "step": 6769 + }, + { + "epoch": 0.6133496410047338, + "grad_norm": 0.823456346988678, + "learning_rate": 0.00015913127529752915, + "loss": 2.9235, + "step": 6770 + }, + { + "epoch": 0.6134402391791806, + "grad_norm": 0.7743830680847168, + "learning_rate": 0.00015912523409653838, + "loss": 2.6924, + "step": 6771 + }, + { + "epoch": 0.6135308373536273, + "grad_norm": 0.6526783108711243, + "learning_rate": 0.00015911919289554764, + "loss": 2.1621, + "step": 6772 + }, + { + "epoch": 0.6136214355280741, + "grad_norm": 0.7240053415298462, + "learning_rate": 0.0001591131516945569, + "loss": 2.1736, + "step": 6773 + }, + { + "epoch": 0.6137120337025209, + "grad_norm": 0.8289021253585815, + "learning_rate": 0.0001591071104935661, + "loss": 2.9454, + "step": 6774 + }, + { + "epoch": 0.6138026318769677, + "grad_norm": 0.7634495496749878, + "learning_rate": 0.00015910106929257537, + "loss": 2.6113, + "step": 6775 + }, + { + "epoch": 0.6138932300514145, + "grad_norm": 0.7603107690811157, + "learning_rate": 0.0001590950280915846, + "loss": 2.7135, + "step": 6776 + }, + { + "epoch": 0.6139838282258613, + "grad_norm": 0.7543471455574036, + "learning_rate": 0.00015908898689059387, + "loss": 2.6504, + "step": 6777 + }, + { + "epoch": 0.6140744264003081, + "grad_norm": 0.8087485432624817, + "learning_rate": 0.0001590829456896031, + "loss": 2.9417, + "step": 6778 + }, + { + "epoch": 0.6141650245747549, + "grad_norm": 0.7382306456565857, + "learning_rate": 0.00015907690448861234, + "loss": 2.7589, + "step": 6779 + }, + { + "epoch": 0.6142556227492016, + "grad_norm": 0.7799112796783447, + "learning_rate": 0.0001590708632876216, + "loss": 2.7637, + "step": 6780 + }, + { + "epoch": 0.6143462209236484, + "grad_norm": 0.7816524505615234, + "learning_rate": 0.00015906482208663083, + "loss": 2.8854, + "step": 6781 + }, + { + "epoch": 0.6144368190980952, + "grad_norm": 0.7929564714431763, + "learning_rate": 0.00015905878088564007, + "loss": 2.8229, + "step": 6782 + }, + { + "epoch": 0.614527417272542, + "grad_norm": 0.7778319120407104, + "learning_rate": 0.0001590527396846493, + "loss": 2.7824, + "step": 6783 + }, + { + "epoch": 0.6146180154469888, + "grad_norm": 0.7838743925094604, + "learning_rate": 0.00015904669848365856, + "loss": 2.7517, + "step": 6784 + }, + { + "epoch": 0.6147086136214355, + "grad_norm": 0.7931084036827087, + "learning_rate": 0.0001590406572826678, + "loss": 2.8238, + "step": 6785 + }, + { + "epoch": 0.6147992117958823, + "grad_norm": 0.7794212102890015, + "learning_rate": 0.00015903461608167706, + "loss": 2.7453, + "step": 6786 + }, + { + "epoch": 0.614889809970329, + "grad_norm": 0.9551914930343628, + "learning_rate": 0.0001590285748806863, + "loss": 2.6643, + "step": 6787 + }, + { + "epoch": 0.6149804081447758, + "grad_norm": 0.7542933821678162, + "learning_rate": 0.00015902253367969552, + "loss": 2.2156, + "step": 6788 + }, + { + "epoch": 0.6150710063192226, + "grad_norm": 0.7840610146522522, + "learning_rate": 0.00015901649247870479, + "loss": 2.9515, + "step": 6789 + }, + { + "epoch": 0.6151616044936694, + "grad_norm": 0.8024063110351562, + "learning_rate": 0.00015901045127771402, + "loss": 2.7723, + "step": 6790 + }, + { + "epoch": 0.6152522026681162, + "grad_norm": 0.7757372260093689, + "learning_rate": 0.00015900441007672325, + "loss": 3.0636, + "step": 6791 + }, + { + "epoch": 0.615342800842563, + "grad_norm": 0.7741333246231079, + "learning_rate": 0.0001589983688757325, + "loss": 2.9098, + "step": 6792 + }, + { + "epoch": 0.6154333990170098, + "grad_norm": 0.7835542559623718, + "learning_rate": 0.00015899232767474175, + "loss": 2.6944, + "step": 6793 + }, + { + "epoch": 0.6155239971914566, + "grad_norm": 0.8319432139396667, + "learning_rate": 0.00015898628647375098, + "loss": 2.8069, + "step": 6794 + }, + { + "epoch": 0.6156145953659033, + "grad_norm": 0.7641199827194214, + "learning_rate": 0.00015898024527276022, + "loss": 3.0577, + "step": 6795 + }, + { + "epoch": 0.6157051935403501, + "grad_norm": 0.8146473169326782, + "learning_rate": 0.00015897420407176948, + "loss": 2.7443, + "step": 6796 + }, + { + "epoch": 0.6157957917147969, + "grad_norm": 0.7360274195671082, + "learning_rate": 0.0001589681628707787, + "loss": 2.5655, + "step": 6797 + }, + { + "epoch": 0.6158863898892437, + "grad_norm": 0.8030917644500732, + "learning_rate": 0.00015896212166978797, + "loss": 2.6955, + "step": 6798 + }, + { + "epoch": 0.6159769880636905, + "grad_norm": 0.7956218719482422, + "learning_rate": 0.0001589560804687972, + "loss": 2.8243, + "step": 6799 + }, + { + "epoch": 0.6160675862381373, + "grad_norm": 0.7606529593467712, + "learning_rate": 0.00015895003926780644, + "loss": 2.6984, + "step": 6800 + }, + { + "epoch": 0.6161581844125841, + "grad_norm": 0.8499045968055725, + "learning_rate": 0.00015894399806681568, + "loss": 2.8801, + "step": 6801 + }, + { + "epoch": 0.6162487825870309, + "grad_norm": 0.8473526835441589, + "learning_rate": 0.00015893795686582494, + "loss": 2.8452, + "step": 6802 + }, + { + "epoch": 0.6163393807614777, + "grad_norm": 0.8199546933174133, + "learning_rate": 0.0001589319156648342, + "loss": 3.2056, + "step": 6803 + }, + { + "epoch": 0.6164299789359244, + "grad_norm": 0.8357741832733154, + "learning_rate": 0.0001589258744638434, + "loss": 3.2317, + "step": 6804 + }, + { + "epoch": 0.6165205771103712, + "grad_norm": 0.7643887400627136, + "learning_rate": 0.00015891983326285267, + "loss": 2.8058, + "step": 6805 + }, + { + "epoch": 0.616611175284818, + "grad_norm": 0.79172682762146, + "learning_rate": 0.0001589137920618619, + "loss": 2.8237, + "step": 6806 + }, + { + "epoch": 0.6167017734592648, + "grad_norm": 0.7370193004608154, + "learning_rate": 0.00015890775086087116, + "loss": 2.0156, + "step": 6807 + }, + { + "epoch": 0.6167923716337116, + "grad_norm": 0.9160693287849426, + "learning_rate": 0.0001589017096598804, + "loss": 2.7626, + "step": 6808 + }, + { + "epoch": 0.6168829698081584, + "grad_norm": 0.7714483141899109, + "learning_rate": 0.00015889566845888963, + "loss": 2.8628, + "step": 6809 + }, + { + "epoch": 0.6169735679826052, + "grad_norm": 0.6832020878791809, + "learning_rate": 0.0001588896272578989, + "loss": 2.1509, + "step": 6810 + }, + { + "epoch": 0.617064166157052, + "grad_norm": 0.792529284954071, + "learning_rate": 0.00015888358605690812, + "loss": 2.9375, + "step": 6811 + }, + { + "epoch": 0.6171547643314987, + "grad_norm": 0.7864300608634949, + "learning_rate": 0.00015887754485591736, + "loss": 2.8306, + "step": 6812 + }, + { + "epoch": 0.6172453625059455, + "grad_norm": 0.7756305932998657, + "learning_rate": 0.0001588715036549266, + "loss": 2.724, + "step": 6813 + }, + { + "epoch": 0.6173359606803923, + "grad_norm": 0.669203519821167, + "learning_rate": 0.00015886546245393585, + "loss": 2.059, + "step": 6814 + }, + { + "epoch": 0.6174265588548391, + "grad_norm": 0.7677441835403442, + "learning_rate": 0.0001588594212529451, + "loss": 2.8885, + "step": 6815 + }, + { + "epoch": 0.6175171570292859, + "grad_norm": 0.7690462470054626, + "learning_rate": 0.00015885338005195435, + "loss": 3.0225, + "step": 6816 + }, + { + "epoch": 0.6176077552037327, + "grad_norm": 0.8709571957588196, + "learning_rate": 0.00015884733885096358, + "loss": 2.9256, + "step": 6817 + }, + { + "epoch": 0.6176983533781795, + "grad_norm": 0.7647148370742798, + "learning_rate": 0.00015884129764997282, + "loss": 2.6707, + "step": 6818 + }, + { + "epoch": 0.6177889515526263, + "grad_norm": 0.7619794011116028, + "learning_rate": 0.00015883525644898208, + "loss": 3.0719, + "step": 6819 + }, + { + "epoch": 0.617879549727073, + "grad_norm": 0.7495009303092957, + "learning_rate": 0.0001588292152479913, + "loss": 2.8371, + "step": 6820 + }, + { + "epoch": 0.6179701479015198, + "grad_norm": 0.7852463126182556, + "learning_rate": 0.00015882317404700055, + "loss": 2.8354, + "step": 6821 + }, + { + "epoch": 0.6180607460759666, + "grad_norm": 0.7674608826637268, + "learning_rate": 0.00015881713284600978, + "loss": 2.7676, + "step": 6822 + }, + { + "epoch": 0.6181513442504134, + "grad_norm": 0.7817456722259521, + "learning_rate": 0.00015881109164501904, + "loss": 2.7611, + "step": 6823 + }, + { + "epoch": 0.6182419424248602, + "grad_norm": 0.781477689743042, + "learning_rate": 0.00015880505044402828, + "loss": 2.7528, + "step": 6824 + }, + { + "epoch": 0.618332540599307, + "grad_norm": 0.8241608738899231, + "learning_rate": 0.0001587990092430375, + "loss": 2.6811, + "step": 6825 + }, + { + "epoch": 0.6184231387737537, + "grad_norm": 0.6565054655075073, + "learning_rate": 0.00015879296804204677, + "loss": 2.1449, + "step": 6826 + }, + { + "epoch": 0.6185137369482004, + "grad_norm": 0.7831161022186279, + "learning_rate": 0.000158786926841056, + "loss": 2.9889, + "step": 6827 + }, + { + "epoch": 0.6186043351226472, + "grad_norm": 0.8045616149902344, + "learning_rate": 0.00015878088564006527, + "loss": 2.7094, + "step": 6828 + }, + { + "epoch": 0.618694933297094, + "grad_norm": 0.8100526928901672, + "learning_rate": 0.0001587748444390745, + "loss": 2.846, + "step": 6829 + }, + { + "epoch": 0.6187855314715408, + "grad_norm": 0.7457093596458435, + "learning_rate": 0.00015876880323808373, + "loss": 2.6457, + "step": 6830 + }, + { + "epoch": 0.6188761296459876, + "grad_norm": 0.8146907091140747, + "learning_rate": 0.00015876276203709297, + "loss": 3.04, + "step": 6831 + }, + { + "epoch": 0.6189667278204344, + "grad_norm": 0.7732624411582947, + "learning_rate": 0.00015875672083610223, + "loss": 2.9729, + "step": 6832 + }, + { + "epoch": 0.6190573259948812, + "grad_norm": 0.7445349097251892, + "learning_rate": 0.00015875067963511146, + "loss": 2.6276, + "step": 6833 + }, + { + "epoch": 0.619147924169328, + "grad_norm": 0.8101680874824524, + "learning_rate": 0.0001587446384341207, + "loss": 3.1504, + "step": 6834 + }, + { + "epoch": 0.6192385223437747, + "grad_norm": 0.7965414524078369, + "learning_rate": 0.00015873859723312996, + "loss": 2.8544, + "step": 6835 + }, + { + "epoch": 0.6193291205182215, + "grad_norm": 0.8288535475730896, + "learning_rate": 0.0001587325560321392, + "loss": 2.7324, + "step": 6836 + }, + { + "epoch": 0.6194197186926683, + "grad_norm": 0.7407006025314331, + "learning_rate": 0.00015872651483114845, + "loss": 2.8829, + "step": 6837 + }, + { + "epoch": 0.6195103168671151, + "grad_norm": 0.7358027100563049, + "learning_rate": 0.00015872047363015766, + "loss": 1.9156, + "step": 6838 + }, + { + "epoch": 0.6196009150415619, + "grad_norm": 0.7825627326965332, + "learning_rate": 0.00015871443242916692, + "loss": 2.8572, + "step": 6839 + }, + { + "epoch": 0.6196915132160087, + "grad_norm": 0.8376271724700928, + "learning_rate": 0.00015870839122817618, + "loss": 2.843, + "step": 6840 + }, + { + "epoch": 0.6197821113904555, + "grad_norm": 0.7592083215713501, + "learning_rate": 0.00015870235002718542, + "loss": 2.4748, + "step": 6841 + }, + { + "epoch": 0.6198727095649023, + "grad_norm": 0.7430919408798218, + "learning_rate": 0.00015869630882619465, + "loss": 2.9009, + "step": 6842 + }, + { + "epoch": 0.619963307739349, + "grad_norm": 0.7796184420585632, + "learning_rate": 0.00015869026762520389, + "loss": 2.7063, + "step": 6843 + }, + { + "epoch": 0.6200539059137958, + "grad_norm": 0.7969872951507568, + "learning_rate": 0.00015868422642421315, + "loss": 2.7719, + "step": 6844 + }, + { + "epoch": 0.6201445040882426, + "grad_norm": 0.7737727761268616, + "learning_rate": 0.00015867818522322238, + "loss": 2.6065, + "step": 6845 + }, + { + "epoch": 0.6202351022626894, + "grad_norm": 0.7963882684707642, + "learning_rate": 0.00015867214402223161, + "loss": 2.9552, + "step": 6846 + }, + { + "epoch": 0.6203257004371362, + "grad_norm": 0.8672401905059814, + "learning_rate": 0.00015866610282124088, + "loss": 3.007, + "step": 6847 + }, + { + "epoch": 0.620416298611583, + "grad_norm": 0.6803562641143799, + "learning_rate": 0.0001586600616202501, + "loss": 2.0357, + "step": 6848 + }, + { + "epoch": 0.6205068967860298, + "grad_norm": 0.7654628753662109, + "learning_rate": 0.00015865402041925937, + "loss": 2.7834, + "step": 6849 + }, + { + "epoch": 0.6205974949604766, + "grad_norm": 0.8663042187690735, + "learning_rate": 0.0001586479792182686, + "loss": 3.0187, + "step": 6850 + }, + { + "epoch": 0.6206880931349233, + "grad_norm": 0.7966555953025818, + "learning_rate": 0.00015864193801727784, + "loss": 2.857, + "step": 6851 + }, + { + "epoch": 0.6207786913093701, + "grad_norm": 0.8171558976173401, + "learning_rate": 0.00015863589681628707, + "loss": 2.8439, + "step": 6852 + }, + { + "epoch": 0.6208692894838169, + "grad_norm": 0.8102526664733887, + "learning_rate": 0.00015862985561529633, + "loss": 2.7655, + "step": 6853 + }, + { + "epoch": 0.6209598876582637, + "grad_norm": 0.7806975841522217, + "learning_rate": 0.00015862381441430557, + "loss": 2.6146, + "step": 6854 + }, + { + "epoch": 0.6210504858327105, + "grad_norm": 0.8601168990135193, + "learning_rate": 0.0001586177732133148, + "loss": 2.9063, + "step": 6855 + }, + { + "epoch": 0.6211410840071573, + "grad_norm": 0.7940048575401306, + "learning_rate": 0.00015861173201232406, + "loss": 2.7338, + "step": 6856 + }, + { + "epoch": 0.6212316821816041, + "grad_norm": 0.7839202284812927, + "learning_rate": 0.0001586056908113333, + "loss": 2.8239, + "step": 6857 + }, + { + "epoch": 0.6213222803560509, + "grad_norm": 0.7860508561134338, + "learning_rate": 0.00015859964961034256, + "loss": 2.5885, + "step": 6858 + }, + { + "epoch": 0.6214128785304976, + "grad_norm": 0.8230621218681335, + "learning_rate": 0.00015859360840935177, + "loss": 2.7517, + "step": 6859 + }, + { + "epoch": 0.6215034767049444, + "grad_norm": 0.796114444732666, + "learning_rate": 0.00015858756720836103, + "loss": 2.9323, + "step": 6860 + }, + { + "epoch": 0.6215940748793912, + "grad_norm": 0.7696070075035095, + "learning_rate": 0.00015858152600737026, + "loss": 2.7361, + "step": 6861 + }, + { + "epoch": 0.621684673053838, + "grad_norm": 0.8226678371429443, + "learning_rate": 0.00015857548480637952, + "loss": 2.723, + "step": 6862 + }, + { + "epoch": 0.6217752712282848, + "grad_norm": 0.811801552772522, + "learning_rate": 0.00015856944360538876, + "loss": 2.5235, + "step": 6863 + }, + { + "epoch": 0.6218658694027316, + "grad_norm": 0.8085715174674988, + "learning_rate": 0.000158563402404398, + "loss": 2.8844, + "step": 6864 + }, + { + "epoch": 0.6219564675771784, + "grad_norm": 0.8231387138366699, + "learning_rate": 0.00015855736120340725, + "loss": 2.6065, + "step": 6865 + }, + { + "epoch": 0.622047065751625, + "grad_norm": 0.8237628936767578, + "learning_rate": 0.00015855132000241649, + "loss": 2.9229, + "step": 6866 + }, + { + "epoch": 0.6221376639260718, + "grad_norm": 0.7953349947929382, + "learning_rate": 0.00015854527880142575, + "loss": 2.8026, + "step": 6867 + }, + { + "epoch": 0.6222282621005186, + "grad_norm": 0.6811199188232422, + "learning_rate": 0.00015853923760043495, + "loss": 2.1139, + "step": 6868 + }, + { + "epoch": 0.6223188602749654, + "grad_norm": 0.79582679271698, + "learning_rate": 0.00015853319639944422, + "loss": 2.7048, + "step": 6869 + }, + { + "epoch": 0.6224094584494122, + "grad_norm": 0.7507842183113098, + "learning_rate": 0.00015852715519845348, + "loss": 2.5814, + "step": 6870 + }, + { + "epoch": 0.622500056623859, + "grad_norm": 0.7628829479217529, + "learning_rate": 0.0001585211139974627, + "loss": 2.8273, + "step": 6871 + }, + { + "epoch": 0.6225906547983058, + "grad_norm": 0.7631202340126038, + "learning_rate": 0.00015851507279647194, + "loss": 2.7333, + "step": 6872 + }, + { + "epoch": 0.6226812529727526, + "grad_norm": 0.8006885647773743, + "learning_rate": 0.00015850903159548118, + "loss": 2.8696, + "step": 6873 + }, + { + "epoch": 0.6227718511471994, + "grad_norm": 0.7788715958595276, + "learning_rate": 0.00015850299039449044, + "loss": 2.7, + "step": 6874 + }, + { + "epoch": 0.6228624493216461, + "grad_norm": 0.7845468521118164, + "learning_rate": 0.00015849694919349967, + "loss": 2.9467, + "step": 6875 + }, + { + "epoch": 0.6229530474960929, + "grad_norm": 0.7755083441734314, + "learning_rate": 0.0001584909079925089, + "loss": 2.6649, + "step": 6876 + }, + { + "epoch": 0.6230436456705397, + "grad_norm": 0.7604110836982727, + "learning_rate": 0.00015848486679151817, + "loss": 2.7665, + "step": 6877 + }, + { + "epoch": 0.6231342438449865, + "grad_norm": 0.8034870028495789, + "learning_rate": 0.0001584788255905274, + "loss": 2.7346, + "step": 6878 + }, + { + "epoch": 0.6232248420194333, + "grad_norm": 0.7580821514129639, + "learning_rate": 0.00015847278438953666, + "loss": 2.7801, + "step": 6879 + }, + { + "epoch": 0.6233154401938801, + "grad_norm": 0.7094796895980835, + "learning_rate": 0.0001584667431885459, + "loss": 2.4315, + "step": 6880 + }, + { + "epoch": 0.6234060383683269, + "grad_norm": 0.777010440826416, + "learning_rate": 0.00015846070198755513, + "loss": 2.7746, + "step": 6881 + }, + { + "epoch": 0.6234966365427737, + "grad_norm": 0.7518843412399292, + "learning_rate": 0.00015845466078656437, + "loss": 2.8335, + "step": 6882 + }, + { + "epoch": 0.6235872347172204, + "grad_norm": 0.7488223910331726, + "learning_rate": 0.00015844861958557363, + "loss": 2.6176, + "step": 6883 + }, + { + "epoch": 0.6236778328916672, + "grad_norm": 0.7711161971092224, + "learning_rate": 0.00015844257838458286, + "loss": 2.8062, + "step": 6884 + }, + { + "epoch": 0.623768431066114, + "grad_norm": 0.8440073132514954, + "learning_rate": 0.0001584365371835921, + "loss": 2.8401, + "step": 6885 + }, + { + "epoch": 0.6238590292405608, + "grad_norm": 0.6978834271430969, + "learning_rate": 0.00015843049598260136, + "loss": 2.088, + "step": 6886 + }, + { + "epoch": 0.6239496274150076, + "grad_norm": 0.770609974861145, + "learning_rate": 0.0001584244547816106, + "loss": 2.8148, + "step": 6887 + }, + { + "epoch": 0.6240402255894544, + "grad_norm": 0.8531061410903931, + "learning_rate": 0.00015841841358061985, + "loss": 2.847, + "step": 6888 + }, + { + "epoch": 0.6241308237639012, + "grad_norm": 0.8163027167320251, + "learning_rate": 0.00015841237237962906, + "loss": 2.8808, + "step": 6889 + }, + { + "epoch": 0.624221421938348, + "grad_norm": 0.8074580430984497, + "learning_rate": 0.00015840633117863832, + "loss": 2.9566, + "step": 6890 + }, + { + "epoch": 0.6243120201127947, + "grad_norm": 0.8512511253356934, + "learning_rate": 0.00015840028997764755, + "loss": 3.0115, + "step": 6891 + }, + { + "epoch": 0.6244026182872415, + "grad_norm": 0.6609553098678589, + "learning_rate": 0.00015839424877665682, + "loss": 2.3086, + "step": 6892 + }, + { + "epoch": 0.6244932164616883, + "grad_norm": 0.7413296103477478, + "learning_rate": 0.00015838820757566605, + "loss": 2.8248, + "step": 6893 + }, + { + "epoch": 0.6245838146361351, + "grad_norm": 0.7559319734573364, + "learning_rate": 0.00015838216637467528, + "loss": 2.8022, + "step": 6894 + }, + { + "epoch": 0.6246744128105819, + "grad_norm": 0.7248803973197937, + "learning_rate": 0.00015837612517368454, + "loss": 2.1758, + "step": 6895 + }, + { + "epoch": 0.6247650109850287, + "grad_norm": 0.6419049501419067, + "learning_rate": 0.00015837008397269378, + "loss": 1.8701, + "step": 6896 + }, + { + "epoch": 0.6248556091594755, + "grad_norm": 0.7151840329170227, + "learning_rate": 0.000158364042771703, + "loss": 2.6491, + "step": 6897 + }, + { + "epoch": 0.6249462073339223, + "grad_norm": 0.691300094127655, + "learning_rate": 0.00015835800157071225, + "loss": 2.4052, + "step": 6898 + }, + { + "epoch": 0.625036805508369, + "grad_norm": 0.7806651592254639, + "learning_rate": 0.0001583519603697215, + "loss": 2.8347, + "step": 6899 + }, + { + "epoch": 0.6251274036828158, + "grad_norm": 0.8255139589309692, + "learning_rate": 0.00015834591916873077, + "loss": 2.6712, + "step": 6900 + }, + { + "epoch": 0.6252180018572626, + "grad_norm": 0.7060269117355347, + "learning_rate": 0.00015833987796774, + "loss": 2.4431, + "step": 6901 + }, + { + "epoch": 0.6253086000317094, + "grad_norm": 0.6855252385139465, + "learning_rate": 0.00015833383676674924, + "loss": 2.2437, + "step": 6902 + }, + { + "epoch": 0.6253991982061562, + "grad_norm": 0.7533145546913147, + "learning_rate": 0.00015832779556575847, + "loss": 2.9996, + "step": 6903 + }, + { + "epoch": 0.625489796380603, + "grad_norm": 0.8386768102645874, + "learning_rate": 0.00015832175436476773, + "loss": 2.7922, + "step": 6904 + }, + { + "epoch": 0.6255803945550498, + "grad_norm": 0.7786985039710999, + "learning_rate": 0.00015831571316377697, + "loss": 2.8866, + "step": 6905 + }, + { + "epoch": 0.6256709927294966, + "grad_norm": 0.78391432762146, + "learning_rate": 0.0001583096719627862, + "loss": 3.2893, + "step": 6906 + }, + { + "epoch": 0.6257615909039432, + "grad_norm": 0.7309778332710266, + "learning_rate": 0.00015830363076179546, + "loss": 2.8449, + "step": 6907 + }, + { + "epoch": 0.62585218907839, + "grad_norm": 0.8799160122871399, + "learning_rate": 0.0001582975895608047, + "loss": 2.68, + "step": 6908 + }, + { + "epoch": 0.6259427872528368, + "grad_norm": 0.7707398533821106, + "learning_rate": 0.00015829154835981396, + "loss": 3.1228, + "step": 6909 + }, + { + "epoch": 0.6260333854272836, + "grad_norm": 0.8501266241073608, + "learning_rate": 0.00015828550715882316, + "loss": 2.9212, + "step": 6910 + }, + { + "epoch": 0.6261239836017304, + "grad_norm": 0.9080597758293152, + "learning_rate": 0.00015827946595783242, + "loss": 2.544, + "step": 6911 + }, + { + "epoch": 0.6262145817761772, + "grad_norm": 0.6826556921005249, + "learning_rate": 0.00015827342475684166, + "loss": 2.1193, + "step": 6912 + }, + { + "epoch": 0.626305179950624, + "grad_norm": 0.8093412518501282, + "learning_rate": 0.00015826738355585092, + "loss": 2.8889, + "step": 6913 + }, + { + "epoch": 0.6263957781250707, + "grad_norm": 0.7561994194984436, + "learning_rate": 0.00015826134235486015, + "loss": 2.7012, + "step": 6914 + }, + { + "epoch": 0.6264863762995175, + "grad_norm": 0.7471259832382202, + "learning_rate": 0.0001582553011538694, + "loss": 2.7984, + "step": 6915 + }, + { + "epoch": 0.6265769744739643, + "grad_norm": 0.6833937168121338, + "learning_rate": 0.00015824925995287865, + "loss": 1.9816, + "step": 6916 + }, + { + "epoch": 0.6266675726484111, + "grad_norm": 0.7466819882392883, + "learning_rate": 0.00015824321875188788, + "loss": 2.8825, + "step": 6917 + }, + { + "epoch": 0.6267581708228579, + "grad_norm": 0.7382583618164062, + "learning_rate": 0.00015823717755089714, + "loss": 2.6687, + "step": 6918 + }, + { + "epoch": 0.6268487689973047, + "grad_norm": 0.8255411982536316, + "learning_rate": 0.00015823113634990635, + "loss": 2.9454, + "step": 6919 + }, + { + "epoch": 0.6269393671717515, + "grad_norm": 0.7412514686584473, + "learning_rate": 0.0001582250951489156, + "loss": 2.7688, + "step": 6920 + }, + { + "epoch": 0.6270299653461983, + "grad_norm": 0.6891207695007324, + "learning_rate": 0.00015821905394792485, + "loss": 2.052, + "step": 6921 + }, + { + "epoch": 0.627120563520645, + "grad_norm": 0.8409563302993774, + "learning_rate": 0.0001582130127469341, + "loss": 2.7864, + "step": 6922 + }, + { + "epoch": 0.6272111616950918, + "grad_norm": 0.7649407982826233, + "learning_rate": 0.00015820697154594334, + "loss": 2.7487, + "step": 6923 + }, + { + "epoch": 0.6273017598695386, + "grad_norm": 0.8224073052406311, + "learning_rate": 0.00015820093034495258, + "loss": 2.762, + "step": 6924 + }, + { + "epoch": 0.6273923580439854, + "grad_norm": 0.6786626577377319, + "learning_rate": 0.00015819488914396184, + "loss": 2.2915, + "step": 6925 + }, + { + "epoch": 0.6274829562184322, + "grad_norm": 0.813910722732544, + "learning_rate": 0.00015818884794297107, + "loss": 2.9798, + "step": 6926 + }, + { + "epoch": 0.627573554392879, + "grad_norm": 0.6788438558578491, + "learning_rate": 0.0001581828067419803, + "loss": 1.9127, + "step": 6927 + }, + { + "epoch": 0.6276641525673258, + "grad_norm": 0.7544005513191223, + "learning_rate": 0.00015817676554098954, + "loss": 2.7169, + "step": 6928 + }, + { + "epoch": 0.6277547507417726, + "grad_norm": 0.7773034572601318, + "learning_rate": 0.0001581707243399988, + "loss": 2.9646, + "step": 6929 + }, + { + "epoch": 0.6278453489162193, + "grad_norm": 0.7432611584663391, + "learning_rate": 0.00015816468313900806, + "loss": 2.7772, + "step": 6930 + }, + { + "epoch": 0.6279359470906661, + "grad_norm": 0.7112497091293335, + "learning_rate": 0.0001581586419380173, + "loss": 2.775, + "step": 6931 + }, + { + "epoch": 0.6280265452651129, + "grad_norm": 0.8867354989051819, + "learning_rate": 0.00015815260073702653, + "loss": 2.9059, + "step": 6932 + }, + { + "epoch": 0.6281171434395597, + "grad_norm": 0.8099237680435181, + "learning_rate": 0.00015814655953603576, + "loss": 2.6852, + "step": 6933 + }, + { + "epoch": 0.6282077416140065, + "grad_norm": 0.7984824180603027, + "learning_rate": 0.00015814051833504502, + "loss": 3.0087, + "step": 6934 + }, + { + "epoch": 0.6282983397884533, + "grad_norm": 0.8392012119293213, + "learning_rate": 0.00015813447713405426, + "loss": 2.8533, + "step": 6935 + }, + { + "epoch": 0.6283889379629001, + "grad_norm": 0.7669112682342529, + "learning_rate": 0.0001581284359330635, + "loss": 2.8618, + "step": 6936 + }, + { + "epoch": 0.6284795361373469, + "grad_norm": 0.7102333307266235, + "learning_rate": 0.00015812239473207275, + "loss": 2.4793, + "step": 6937 + }, + { + "epoch": 0.6285701343117936, + "grad_norm": 0.7388784289360046, + "learning_rate": 0.000158116353531082, + "loss": 2.5903, + "step": 6938 + }, + { + "epoch": 0.6286607324862404, + "grad_norm": 0.834050714969635, + "learning_rate": 0.00015811031233009125, + "loss": 3.248, + "step": 6939 + }, + { + "epoch": 0.6287513306606872, + "grad_norm": 0.7900153994560242, + "learning_rate": 0.00015810427112910046, + "loss": 3.1099, + "step": 6940 + }, + { + "epoch": 0.628841928835134, + "grad_norm": 0.6271913051605225, + "learning_rate": 0.00015809822992810972, + "loss": 2.0117, + "step": 6941 + }, + { + "epoch": 0.6289325270095808, + "grad_norm": 0.7790738940238953, + "learning_rate": 0.00015809218872711895, + "loss": 2.8358, + "step": 6942 + }, + { + "epoch": 0.6290231251840276, + "grad_norm": 0.7965233325958252, + "learning_rate": 0.0001580861475261282, + "loss": 3.2268, + "step": 6943 + }, + { + "epoch": 0.6291137233584744, + "grad_norm": 0.7591434121131897, + "learning_rate": 0.00015808010632513745, + "loss": 2.7257, + "step": 6944 + }, + { + "epoch": 0.6292043215329212, + "grad_norm": 0.7173876166343689, + "learning_rate": 0.00015807406512414668, + "loss": 2.1055, + "step": 6945 + }, + { + "epoch": 0.629294919707368, + "grad_norm": 0.7739055156707764, + "learning_rate": 0.00015806802392315594, + "loss": 2.9483, + "step": 6946 + }, + { + "epoch": 0.6293855178818146, + "grad_norm": 0.8069486021995544, + "learning_rate": 0.00015806198272216518, + "loss": 2.9238, + "step": 6947 + }, + { + "epoch": 0.6294761160562614, + "grad_norm": 0.6894083619117737, + "learning_rate": 0.0001580559415211744, + "loss": 2.1309, + "step": 6948 + }, + { + "epoch": 0.6295667142307082, + "grad_norm": 0.7608951330184937, + "learning_rate": 0.00015804990032018364, + "loss": 2.7744, + "step": 6949 + }, + { + "epoch": 0.629657312405155, + "grad_norm": 0.5633809566497803, + "learning_rate": 0.0001580438591191929, + "loss": 1.5475, + "step": 6950 + }, + { + "epoch": 0.6297479105796018, + "grad_norm": 0.7939672470092773, + "learning_rate": 0.00015803781791820214, + "loss": 2.8517, + "step": 6951 + }, + { + "epoch": 0.6298385087540486, + "grad_norm": 0.750761866569519, + "learning_rate": 0.0001580317767172114, + "loss": 2.7522, + "step": 6952 + }, + { + "epoch": 0.6299291069284954, + "grad_norm": 0.9143991470336914, + "learning_rate": 0.00015802573551622063, + "loss": 2.7043, + "step": 6953 + }, + { + "epoch": 0.6300197051029421, + "grad_norm": 0.8288782835006714, + "learning_rate": 0.00015801969431522987, + "loss": 2.7834, + "step": 6954 + }, + { + "epoch": 0.6301103032773889, + "grad_norm": 0.7431500554084778, + "learning_rate": 0.00015801365311423913, + "loss": 2.7954, + "step": 6955 + }, + { + "epoch": 0.6302009014518357, + "grad_norm": 0.7779028415679932, + "learning_rate": 0.00015800761191324836, + "loss": 2.682, + "step": 6956 + }, + { + "epoch": 0.6302914996262825, + "grad_norm": 0.7555111646652222, + "learning_rate": 0.0001580015707122576, + "loss": 2.7673, + "step": 6957 + }, + { + "epoch": 0.6303820978007293, + "grad_norm": 0.6661214232444763, + "learning_rate": 0.00015799552951126683, + "loss": 2.2289, + "step": 6958 + }, + { + "epoch": 0.6304726959751761, + "grad_norm": 0.7217296957969666, + "learning_rate": 0.0001579894883102761, + "loss": 2.1538, + "step": 6959 + }, + { + "epoch": 0.6305632941496229, + "grad_norm": 0.8262364864349365, + "learning_rate": 0.00015798344710928535, + "loss": 2.7263, + "step": 6960 + }, + { + "epoch": 0.6306538923240697, + "grad_norm": 0.7958664298057556, + "learning_rate": 0.00015797740590829456, + "loss": 2.8259, + "step": 6961 + }, + { + "epoch": 0.6307444904985164, + "grad_norm": 0.7932977080345154, + "learning_rate": 0.00015797136470730382, + "loss": 2.7108, + "step": 6962 + }, + { + "epoch": 0.6308350886729632, + "grad_norm": 0.7847117185592651, + "learning_rate": 0.00015796532350631306, + "loss": 2.7988, + "step": 6963 + }, + { + "epoch": 0.63092568684741, + "grad_norm": 0.695652425289154, + "learning_rate": 0.00015795928230532232, + "loss": 2.0839, + "step": 6964 + }, + { + "epoch": 0.6310162850218568, + "grad_norm": 0.7461254000663757, + "learning_rate": 0.00015795324110433155, + "loss": 2.8459, + "step": 6965 + }, + { + "epoch": 0.6311068831963036, + "grad_norm": 0.8055736422538757, + "learning_rate": 0.00015794719990334079, + "loss": 2.9073, + "step": 6966 + }, + { + "epoch": 0.6311974813707504, + "grad_norm": 0.8499023914337158, + "learning_rate": 0.00015794115870235005, + "loss": 3.1609, + "step": 6967 + }, + { + "epoch": 0.6312880795451972, + "grad_norm": 0.7529367208480835, + "learning_rate": 0.00015793511750135928, + "loss": 2.5993, + "step": 6968 + }, + { + "epoch": 0.631378677719644, + "grad_norm": 0.7293235063552856, + "learning_rate": 0.00015792907630036851, + "loss": 2.231, + "step": 6969 + }, + { + "epoch": 0.6314692758940907, + "grad_norm": 0.8191384673118591, + "learning_rate": 0.00015792303509937775, + "loss": 2.6206, + "step": 6970 + }, + { + "epoch": 0.6315598740685375, + "grad_norm": 0.8350822329521179, + "learning_rate": 0.000157916993898387, + "loss": 2.9282, + "step": 6971 + }, + { + "epoch": 0.6316504722429843, + "grad_norm": 0.7527723908424377, + "learning_rate": 0.00015791095269739624, + "loss": 2.7256, + "step": 6972 + }, + { + "epoch": 0.6317410704174311, + "grad_norm": 0.7916616797447205, + "learning_rate": 0.0001579049114964055, + "loss": 2.9669, + "step": 6973 + }, + { + "epoch": 0.6318316685918779, + "grad_norm": 0.890182375907898, + "learning_rate": 0.00015789887029541474, + "loss": 2.9335, + "step": 6974 + }, + { + "epoch": 0.6319222667663247, + "grad_norm": 0.7865279912948608, + "learning_rate": 0.00015789282909442397, + "loss": 2.9179, + "step": 6975 + }, + { + "epoch": 0.6320128649407715, + "grad_norm": 0.8894097208976746, + "learning_rate": 0.00015788678789343323, + "loss": 2.8234, + "step": 6976 + }, + { + "epoch": 0.6321034631152183, + "grad_norm": 0.7824143171310425, + "learning_rate": 0.00015788074669244247, + "loss": 2.8983, + "step": 6977 + }, + { + "epoch": 0.632194061289665, + "grad_norm": 0.7450358867645264, + "learning_rate": 0.0001578747054914517, + "loss": 2.6771, + "step": 6978 + }, + { + "epoch": 0.6322846594641118, + "grad_norm": 0.7943394780158997, + "learning_rate": 0.00015786866429046094, + "loss": 2.9505, + "step": 6979 + }, + { + "epoch": 0.6323752576385586, + "grad_norm": 0.80960613489151, + "learning_rate": 0.0001578626230894702, + "loss": 2.9717, + "step": 6980 + }, + { + "epoch": 0.6324658558130054, + "grad_norm": 0.79057776927948, + "learning_rate": 0.00015785658188847943, + "loss": 2.8738, + "step": 6981 + }, + { + "epoch": 0.6325564539874522, + "grad_norm": 0.8674218058586121, + "learning_rate": 0.00015785054068748867, + "loss": 2.2433, + "step": 6982 + }, + { + "epoch": 0.632647052161899, + "grad_norm": 0.7619445323944092, + "learning_rate": 0.00015784449948649793, + "loss": 2.7821, + "step": 6983 + }, + { + "epoch": 0.6327376503363458, + "grad_norm": 0.8083558082580566, + "learning_rate": 0.00015783845828550716, + "loss": 2.7339, + "step": 6984 + }, + { + "epoch": 0.6328282485107926, + "grad_norm": 0.7860944271087646, + "learning_rate": 0.00015783241708451642, + "loss": 2.7594, + "step": 6985 + }, + { + "epoch": 0.6329188466852393, + "grad_norm": 0.8355899453163147, + "learning_rate": 0.00015782637588352566, + "loss": 2.8088, + "step": 6986 + }, + { + "epoch": 0.6330094448596861, + "grad_norm": 0.7532414197921753, + "learning_rate": 0.0001578203346825349, + "loss": 2.7769, + "step": 6987 + }, + { + "epoch": 0.6331000430341328, + "grad_norm": 0.7451673150062561, + "learning_rate": 0.00015781429348154412, + "loss": 2.749, + "step": 6988 + }, + { + "epoch": 0.6331906412085796, + "grad_norm": 0.7879685759544373, + "learning_rate": 0.00015780825228055339, + "loss": 2.7576, + "step": 6989 + }, + { + "epoch": 0.6332812393830264, + "grad_norm": 0.7849060297012329, + "learning_rate": 0.00015780221107956265, + "loss": 2.9763, + "step": 6990 + }, + { + "epoch": 0.6333718375574732, + "grad_norm": 0.7245126366615295, + "learning_rate": 0.00015779616987857185, + "loss": 2.2409, + "step": 6991 + }, + { + "epoch": 0.63346243573192, + "grad_norm": 0.7917807698249817, + "learning_rate": 0.00015779012867758111, + "loss": 2.6589, + "step": 6992 + }, + { + "epoch": 0.6335530339063667, + "grad_norm": 0.6788161396980286, + "learning_rate": 0.00015778408747659035, + "loss": 2.0393, + "step": 6993 + }, + { + "epoch": 0.6336436320808135, + "grad_norm": 0.7794530987739563, + "learning_rate": 0.0001577780462755996, + "loss": 2.7844, + "step": 6994 + }, + { + "epoch": 0.6337342302552603, + "grad_norm": 0.7293229699134827, + "learning_rate": 0.00015777200507460884, + "loss": 2.6573, + "step": 6995 + }, + { + "epoch": 0.6338248284297071, + "grad_norm": 0.8243412375450134, + "learning_rate": 0.00015776596387361808, + "loss": 2.8666, + "step": 6996 + }, + { + "epoch": 0.6339154266041539, + "grad_norm": 0.7772268652915955, + "learning_rate": 0.00015775992267262734, + "loss": 2.9441, + "step": 6997 + }, + { + "epoch": 0.6340060247786007, + "grad_norm": 0.7226855158805847, + "learning_rate": 0.00015775388147163657, + "loss": 2.6963, + "step": 6998 + }, + { + "epoch": 0.6340966229530475, + "grad_norm": 0.7401133179664612, + "learning_rate": 0.0001577478402706458, + "loss": 2.4823, + "step": 6999 + }, + { + "epoch": 0.6341872211274943, + "grad_norm": 0.7030699253082275, + "learning_rate": 0.00015774179906965504, + "loss": 2.2106, + "step": 7000 + }, + { + "epoch": 0.634277819301941, + "grad_norm": 0.7634336352348328, + "learning_rate": 0.0001577357578686643, + "loss": 2.7751, + "step": 7001 + }, + { + "epoch": 0.6343684174763878, + "grad_norm": 0.8417442440986633, + "learning_rate": 0.00015772971666767354, + "loss": 2.8209, + "step": 7002 + }, + { + "epoch": 0.6344590156508346, + "grad_norm": 0.7628512978553772, + "learning_rate": 0.0001577236754666828, + "loss": 2.7616, + "step": 7003 + }, + { + "epoch": 0.6345496138252814, + "grad_norm": 0.6590785980224609, + "learning_rate": 0.00015771763426569203, + "loss": 2.2206, + "step": 7004 + }, + { + "epoch": 0.6346402119997282, + "grad_norm": 0.7651087641716003, + "learning_rate": 0.00015771159306470127, + "loss": 2.8058, + "step": 7005 + }, + { + "epoch": 0.634730810174175, + "grad_norm": 0.8067734837532043, + "learning_rate": 0.00015770555186371053, + "loss": 2.6968, + "step": 7006 + }, + { + "epoch": 0.6348214083486218, + "grad_norm": 0.8131486177444458, + "learning_rate": 0.00015769951066271976, + "loss": 3.0025, + "step": 7007 + }, + { + "epoch": 0.6349120065230686, + "grad_norm": 0.7525383830070496, + "learning_rate": 0.000157693469461729, + "loss": 2.5851, + "step": 7008 + }, + { + "epoch": 0.6350026046975153, + "grad_norm": 0.8089686632156372, + "learning_rate": 0.00015768742826073823, + "loss": 2.8495, + "step": 7009 + }, + { + "epoch": 0.6350932028719621, + "grad_norm": 0.8128174543380737, + "learning_rate": 0.0001576813870597475, + "loss": 3.0867, + "step": 7010 + }, + { + "epoch": 0.6351838010464089, + "grad_norm": 0.7810013890266418, + "learning_rate": 0.00015767534585875672, + "loss": 2.7273, + "step": 7011 + }, + { + "epoch": 0.6352743992208557, + "grad_norm": 0.6908231377601624, + "learning_rate": 0.00015766930465776596, + "loss": 2.3225, + "step": 7012 + }, + { + "epoch": 0.6353649973953025, + "grad_norm": 0.6458523869514465, + "learning_rate": 0.00015766326345677522, + "loss": 2.0261, + "step": 7013 + }, + { + "epoch": 0.6354555955697493, + "grad_norm": 0.7720932960510254, + "learning_rate": 0.00015765722225578445, + "loss": 2.8038, + "step": 7014 + }, + { + "epoch": 0.6355461937441961, + "grad_norm": 0.8168903589248657, + "learning_rate": 0.00015765118105479371, + "loss": 2.7328, + "step": 7015 + }, + { + "epoch": 0.6356367919186429, + "grad_norm": 0.7885969877243042, + "learning_rate": 0.00015764513985380295, + "loss": 2.8693, + "step": 7016 + }, + { + "epoch": 0.6357273900930897, + "grad_norm": 0.77680903673172, + "learning_rate": 0.00015763909865281218, + "loss": 2.7362, + "step": 7017 + }, + { + "epoch": 0.6358179882675364, + "grad_norm": 0.8204694986343384, + "learning_rate": 0.00015763305745182142, + "loss": 3.0461, + "step": 7018 + }, + { + "epoch": 0.6359085864419832, + "grad_norm": 0.7783116698265076, + "learning_rate": 0.00015762701625083068, + "loss": 2.8161, + "step": 7019 + }, + { + "epoch": 0.63599918461643, + "grad_norm": 0.9187782406806946, + "learning_rate": 0.0001576209750498399, + "loss": 3.0518, + "step": 7020 + }, + { + "epoch": 0.6360897827908768, + "grad_norm": 0.7534744739532471, + "learning_rate": 0.00015761493384884915, + "loss": 2.6419, + "step": 7021 + }, + { + "epoch": 0.6361803809653236, + "grad_norm": 0.7974609136581421, + "learning_rate": 0.0001576088926478584, + "loss": 2.8862, + "step": 7022 + }, + { + "epoch": 0.6362709791397704, + "grad_norm": 0.7395187616348267, + "learning_rate": 0.00015760285144686764, + "loss": 2.7092, + "step": 7023 + }, + { + "epoch": 0.6363615773142172, + "grad_norm": 0.8404020667076111, + "learning_rate": 0.0001575968102458769, + "loss": 2.8085, + "step": 7024 + }, + { + "epoch": 0.636452175488664, + "grad_norm": 0.6509790420532227, + "learning_rate": 0.0001575907690448861, + "loss": 2.2033, + "step": 7025 + }, + { + "epoch": 0.6365427736631107, + "grad_norm": 0.8843257427215576, + "learning_rate": 0.00015758472784389537, + "loss": 2.7413, + "step": 7026 + }, + { + "epoch": 0.6366333718375575, + "grad_norm": 0.8206231594085693, + "learning_rate": 0.00015757868664290463, + "loss": 2.9308, + "step": 7027 + }, + { + "epoch": 0.6367239700120042, + "grad_norm": 0.8692829012870789, + "learning_rate": 0.00015757264544191387, + "loss": 2.869, + "step": 7028 + }, + { + "epoch": 0.636814568186451, + "grad_norm": 0.8071771264076233, + "learning_rate": 0.0001575666042409231, + "loss": 2.7897, + "step": 7029 + }, + { + "epoch": 0.6369051663608978, + "grad_norm": 0.8275893330574036, + "learning_rate": 0.00015756056303993233, + "loss": 2.5226, + "step": 7030 + }, + { + "epoch": 0.6369957645353446, + "grad_norm": 0.732168972492218, + "learning_rate": 0.0001575545218389416, + "loss": 2.6802, + "step": 7031 + }, + { + "epoch": 0.6370863627097914, + "grad_norm": 0.7590142488479614, + "learning_rate": 0.00015754848063795083, + "loss": 2.7462, + "step": 7032 + }, + { + "epoch": 0.6371769608842381, + "grad_norm": 0.7747914791107178, + "learning_rate": 0.00015754243943696006, + "loss": 2.8147, + "step": 7033 + }, + { + "epoch": 0.6372675590586849, + "grad_norm": 0.69512939453125, + "learning_rate": 0.00015753639823596932, + "loss": 2.2414, + "step": 7034 + }, + { + "epoch": 0.6373581572331317, + "grad_norm": 0.8166472315788269, + "learning_rate": 0.00015753035703497856, + "loss": 2.9197, + "step": 7035 + }, + { + "epoch": 0.6374487554075785, + "grad_norm": 0.8076759576797485, + "learning_rate": 0.00015752431583398782, + "loss": 2.9253, + "step": 7036 + }, + { + "epoch": 0.6375393535820253, + "grad_norm": 0.7500279545783997, + "learning_rate": 0.00015751827463299705, + "loss": 2.5937, + "step": 7037 + }, + { + "epoch": 0.6376299517564721, + "grad_norm": 0.842692494392395, + "learning_rate": 0.0001575122334320063, + "loss": 2.8206, + "step": 7038 + }, + { + "epoch": 0.6377205499309189, + "grad_norm": 0.7335516810417175, + "learning_rate": 0.00015750619223101552, + "loss": 2.76, + "step": 7039 + }, + { + "epoch": 0.6378111481053657, + "grad_norm": 0.7115073204040527, + "learning_rate": 0.00015750015103002478, + "loss": 1.9871, + "step": 7040 + }, + { + "epoch": 0.6379017462798124, + "grad_norm": 0.9061356782913208, + "learning_rate": 0.00015749410982903402, + "loss": 2.69, + "step": 7041 + }, + { + "epoch": 0.6379923444542592, + "grad_norm": 0.760604977607727, + "learning_rate": 0.00015748806862804325, + "loss": 2.8234, + "step": 7042 + }, + { + "epoch": 0.638082942628706, + "grad_norm": 0.7106902599334717, + "learning_rate": 0.0001574820274270525, + "loss": 2.5569, + "step": 7043 + }, + { + "epoch": 0.6381735408031528, + "grad_norm": 0.8121486306190491, + "learning_rate": 0.00015747598622606175, + "loss": 2.8632, + "step": 7044 + }, + { + "epoch": 0.6382641389775996, + "grad_norm": 0.8205869197845459, + "learning_rate": 0.000157469945025071, + "loss": 2.7399, + "step": 7045 + }, + { + "epoch": 0.6383547371520464, + "grad_norm": 0.8226275444030762, + "learning_rate": 0.00015746390382408021, + "loss": 2.9403, + "step": 7046 + }, + { + "epoch": 0.6384453353264932, + "grad_norm": 0.7816091179847717, + "learning_rate": 0.00015745786262308948, + "loss": 2.556, + "step": 7047 + }, + { + "epoch": 0.63853593350094, + "grad_norm": 0.8333846926689148, + "learning_rate": 0.0001574518214220987, + "loss": 2.9474, + "step": 7048 + }, + { + "epoch": 0.6386265316753867, + "grad_norm": 0.7924343347549438, + "learning_rate": 0.00015744578022110797, + "loss": 2.6248, + "step": 7049 + }, + { + "epoch": 0.6387171298498335, + "grad_norm": 0.811528742313385, + "learning_rate": 0.0001574397390201172, + "loss": 2.916, + "step": 7050 + }, + { + "epoch": 0.6388077280242803, + "grad_norm": 0.8486295938491821, + "learning_rate": 0.00015743369781912644, + "loss": 2.9364, + "step": 7051 + }, + { + "epoch": 0.6388983261987271, + "grad_norm": 0.7398650646209717, + "learning_rate": 0.0001574276566181357, + "loss": 2.952, + "step": 7052 + }, + { + "epoch": 0.6389889243731739, + "grad_norm": 0.7732760906219482, + "learning_rate": 0.00015742161541714493, + "loss": 2.7423, + "step": 7053 + }, + { + "epoch": 0.6390795225476207, + "grad_norm": 0.8021649122238159, + "learning_rate": 0.0001574155742161542, + "loss": 2.6826, + "step": 7054 + }, + { + "epoch": 0.6391701207220675, + "grad_norm": 0.7807809710502625, + "learning_rate": 0.0001574095330151634, + "loss": 2.7834, + "step": 7055 + }, + { + "epoch": 0.6392607188965143, + "grad_norm": 0.7585616707801819, + "learning_rate": 0.00015740349181417266, + "loss": 2.4918, + "step": 7056 + }, + { + "epoch": 0.639351317070961, + "grad_norm": 0.8276301026344299, + "learning_rate": 0.00015739745061318192, + "loss": 2.8519, + "step": 7057 + }, + { + "epoch": 0.6394419152454078, + "grad_norm": 0.7949022054672241, + "learning_rate": 0.00015739140941219116, + "loss": 2.7743, + "step": 7058 + }, + { + "epoch": 0.6395325134198546, + "grad_norm": 0.762509822845459, + "learning_rate": 0.0001573853682112004, + "loss": 2.6271, + "step": 7059 + }, + { + "epoch": 0.6396231115943014, + "grad_norm": 0.8353800177574158, + "learning_rate": 0.00015737932701020963, + "loss": 2.1776, + "step": 7060 + }, + { + "epoch": 0.6397137097687482, + "grad_norm": 0.8574018478393555, + "learning_rate": 0.0001573732858092189, + "loss": 2.9432, + "step": 7061 + }, + { + "epoch": 0.639804307943195, + "grad_norm": 0.7639187574386597, + "learning_rate": 0.00015736724460822812, + "loss": 2.6213, + "step": 7062 + }, + { + "epoch": 0.6398949061176418, + "grad_norm": 0.6923154592514038, + "learning_rate": 0.00015736120340723736, + "loss": 2.2117, + "step": 7063 + }, + { + "epoch": 0.6399855042920886, + "grad_norm": 0.7983418703079224, + "learning_rate": 0.00015735516220624662, + "loss": 2.8683, + "step": 7064 + }, + { + "epoch": 0.6400761024665353, + "grad_norm": 0.8000441789627075, + "learning_rate": 0.00015734912100525585, + "loss": 2.815, + "step": 7065 + }, + { + "epoch": 0.6401667006409821, + "grad_norm": 0.7874900102615356, + "learning_rate": 0.0001573430798042651, + "loss": 2.7836, + "step": 7066 + }, + { + "epoch": 0.6402572988154289, + "grad_norm": 0.7597563862800598, + "learning_rate": 0.00015733703860327435, + "loss": 3.0182, + "step": 7067 + }, + { + "epoch": 0.6403478969898757, + "grad_norm": 0.8042964935302734, + "learning_rate": 0.00015733099740228358, + "loss": 2.5768, + "step": 7068 + }, + { + "epoch": 0.6404384951643224, + "grad_norm": 0.7641382217407227, + "learning_rate": 0.00015732495620129281, + "loss": 2.799, + "step": 7069 + }, + { + "epoch": 0.6405290933387692, + "grad_norm": 0.7671838998794556, + "learning_rate": 0.00015731891500030208, + "loss": 2.6609, + "step": 7070 + }, + { + "epoch": 0.640619691513216, + "grad_norm": 0.7961508631706238, + "learning_rate": 0.0001573128737993113, + "loss": 3.0206, + "step": 7071 + }, + { + "epoch": 0.6407102896876627, + "grad_norm": 0.8777120113372803, + "learning_rate": 0.00015730683259832054, + "loss": 2.7138, + "step": 7072 + }, + { + "epoch": 0.6408008878621095, + "grad_norm": 0.7753223776817322, + "learning_rate": 0.0001573007913973298, + "loss": 2.7297, + "step": 7073 + }, + { + "epoch": 0.6408914860365563, + "grad_norm": 0.8379204869270325, + "learning_rate": 0.00015729475019633904, + "loss": 2.5314, + "step": 7074 + }, + { + "epoch": 0.6409820842110031, + "grad_norm": 0.7645781636238098, + "learning_rate": 0.0001572887089953483, + "loss": 2.7003, + "step": 7075 + }, + { + "epoch": 0.6410726823854499, + "grad_norm": 0.8353139162063599, + "learning_rate": 0.0001572826677943575, + "loss": 2.96, + "step": 7076 + }, + { + "epoch": 0.6411632805598967, + "grad_norm": 0.7625341415405273, + "learning_rate": 0.00015727662659336677, + "loss": 2.7342, + "step": 7077 + }, + { + "epoch": 0.6412538787343435, + "grad_norm": 0.79698246717453, + "learning_rate": 0.000157270585392376, + "loss": 2.9814, + "step": 7078 + }, + { + "epoch": 0.6413444769087903, + "grad_norm": 0.8075357675552368, + "learning_rate": 0.00015726454419138526, + "loss": 2.5203, + "step": 7079 + }, + { + "epoch": 0.641435075083237, + "grad_norm": 0.7880104184150696, + "learning_rate": 0.0001572585029903945, + "loss": 2.9128, + "step": 7080 + }, + { + "epoch": 0.6415256732576838, + "grad_norm": 0.7461639046669006, + "learning_rate": 0.00015725246178940373, + "loss": 2.7157, + "step": 7081 + }, + { + "epoch": 0.6416162714321306, + "grad_norm": 0.7657484412193298, + "learning_rate": 0.000157246420588413, + "loss": 2.7967, + "step": 7082 + }, + { + "epoch": 0.6417068696065774, + "grad_norm": 0.8521610498428345, + "learning_rate": 0.00015724037938742223, + "loss": 3.0967, + "step": 7083 + }, + { + "epoch": 0.6417974677810242, + "grad_norm": 0.7892771363258362, + "learning_rate": 0.00015723433818643146, + "loss": 2.813, + "step": 7084 + }, + { + "epoch": 0.641888065955471, + "grad_norm": 0.7561718821525574, + "learning_rate": 0.0001572282969854407, + "loss": 3.0533, + "step": 7085 + }, + { + "epoch": 0.6419786641299178, + "grad_norm": 0.7745482921600342, + "learning_rate": 0.00015722225578444996, + "loss": 2.5506, + "step": 7086 + }, + { + "epoch": 0.6420692623043646, + "grad_norm": 0.8109975457191467, + "learning_rate": 0.00015721621458345922, + "loss": 2.5918, + "step": 7087 + }, + { + "epoch": 0.6421598604788114, + "grad_norm": 0.7873353958129883, + "learning_rate": 0.00015721017338246845, + "loss": 2.7515, + "step": 7088 + }, + { + "epoch": 0.6422504586532581, + "grad_norm": 0.7670327425003052, + "learning_rate": 0.00015720413218147769, + "loss": 2.1873, + "step": 7089 + }, + { + "epoch": 0.6423410568277049, + "grad_norm": 0.7371611595153809, + "learning_rate": 0.00015719809098048692, + "loss": 2.0295, + "step": 7090 + }, + { + "epoch": 0.6424316550021517, + "grad_norm": 0.7668536901473999, + "learning_rate": 0.00015719204977949618, + "loss": 2.7105, + "step": 7091 + }, + { + "epoch": 0.6425222531765985, + "grad_norm": 0.839102566242218, + "learning_rate": 0.00015718600857850541, + "loss": 2.8362, + "step": 7092 + }, + { + "epoch": 0.6426128513510453, + "grad_norm": 0.8562485575675964, + "learning_rate": 0.00015717996737751465, + "loss": 2.9469, + "step": 7093 + }, + { + "epoch": 0.6427034495254921, + "grad_norm": 0.7659716010093689, + "learning_rate": 0.0001571739261765239, + "loss": 2.7895, + "step": 7094 + }, + { + "epoch": 0.6427940476999389, + "grad_norm": 0.6936764121055603, + "learning_rate": 0.00015716788497553314, + "loss": 2.0127, + "step": 7095 + }, + { + "epoch": 0.6428846458743857, + "grad_norm": 0.7728007435798645, + "learning_rate": 0.0001571618437745424, + "loss": 2.7295, + "step": 7096 + }, + { + "epoch": 0.6429752440488324, + "grad_norm": 0.7851961255073547, + "learning_rate": 0.0001571558025735516, + "loss": 2.9222, + "step": 7097 + }, + { + "epoch": 0.6430658422232792, + "grad_norm": 0.8209002017974854, + "learning_rate": 0.00015714976137256087, + "loss": 2.8503, + "step": 7098 + }, + { + "epoch": 0.643156440397726, + "grad_norm": 0.7487711906433105, + "learning_rate": 0.0001571437201715701, + "loss": 2.7023, + "step": 7099 + }, + { + "epoch": 0.6432470385721728, + "grad_norm": 0.7776013612747192, + "learning_rate": 0.00015713767897057937, + "loss": 2.7495, + "step": 7100 + }, + { + "epoch": 0.6433376367466196, + "grad_norm": 0.8291208744049072, + "learning_rate": 0.0001571316377695886, + "loss": 2.8284, + "step": 7101 + }, + { + "epoch": 0.6434282349210664, + "grad_norm": 0.7607172131538391, + "learning_rate": 0.00015712559656859784, + "loss": 2.7454, + "step": 7102 + }, + { + "epoch": 0.6435188330955132, + "grad_norm": 0.7925736308097839, + "learning_rate": 0.0001571195553676071, + "loss": 2.9781, + "step": 7103 + }, + { + "epoch": 0.64360943126996, + "grad_norm": 0.8175441026687622, + "learning_rate": 0.00015711351416661633, + "loss": 2.7526, + "step": 7104 + }, + { + "epoch": 0.6437000294444067, + "grad_norm": 0.8110489845275879, + "learning_rate": 0.0001571074729656256, + "loss": 2.7961, + "step": 7105 + }, + { + "epoch": 0.6437906276188535, + "grad_norm": 0.8226502537727356, + "learning_rate": 0.0001571014317646348, + "loss": 2.9736, + "step": 7106 + }, + { + "epoch": 0.6438812257933003, + "grad_norm": 0.7711653709411621, + "learning_rate": 0.00015709539056364406, + "loss": 2.6009, + "step": 7107 + }, + { + "epoch": 0.6439718239677471, + "grad_norm": 0.685596227645874, + "learning_rate": 0.0001570893493626533, + "loss": 2.4639, + "step": 7108 + }, + { + "epoch": 0.6440624221421938, + "grad_norm": 0.7965496182441711, + "learning_rate": 0.00015708330816166256, + "loss": 2.9251, + "step": 7109 + }, + { + "epoch": 0.6441530203166406, + "grad_norm": 0.6884186267852783, + "learning_rate": 0.0001570772669606718, + "loss": 2.0997, + "step": 7110 + }, + { + "epoch": 0.6442436184910874, + "grad_norm": 0.6520992517471313, + "learning_rate": 0.00015707122575968102, + "loss": 2.2073, + "step": 7111 + }, + { + "epoch": 0.6443342166655341, + "grad_norm": 0.757624626159668, + "learning_rate": 0.00015706518455869029, + "loss": 2.7304, + "step": 7112 + }, + { + "epoch": 0.6444248148399809, + "grad_norm": 0.765316903591156, + "learning_rate": 0.00015705914335769952, + "loss": 2.5662, + "step": 7113 + }, + { + "epoch": 0.6445154130144277, + "grad_norm": 0.7797386646270752, + "learning_rate": 0.00015705310215670875, + "loss": 2.9636, + "step": 7114 + }, + { + "epoch": 0.6446060111888745, + "grad_norm": 0.8502938747406006, + "learning_rate": 0.000157047060955718, + "loss": 3.0553, + "step": 7115 + }, + { + "epoch": 0.6446966093633213, + "grad_norm": 0.7835056185722351, + "learning_rate": 0.00015704101975472725, + "loss": 2.9345, + "step": 7116 + }, + { + "epoch": 0.6447872075377681, + "grad_norm": 0.8195754885673523, + "learning_rate": 0.0001570349785537365, + "loss": 3.2524, + "step": 7117 + }, + { + "epoch": 0.6448778057122149, + "grad_norm": 0.8418869972229004, + "learning_rate": 0.00015702893735274574, + "loss": 2.7903, + "step": 7118 + }, + { + "epoch": 0.6449684038866617, + "grad_norm": 0.8083350658416748, + "learning_rate": 0.00015702289615175498, + "loss": 3.0523, + "step": 7119 + }, + { + "epoch": 0.6450590020611084, + "grad_norm": 0.8527089953422546, + "learning_rate": 0.0001570168549507642, + "loss": 2.8238, + "step": 7120 + }, + { + "epoch": 0.6451496002355552, + "grad_norm": 0.5876594185829163, + "learning_rate": 0.00015701081374977347, + "loss": 1.4628, + "step": 7121 + }, + { + "epoch": 0.645240198410002, + "grad_norm": 0.7712540030479431, + "learning_rate": 0.0001570047725487827, + "loss": 2.7688, + "step": 7122 + }, + { + "epoch": 0.6453307965844488, + "grad_norm": 0.77640300989151, + "learning_rate": 0.00015699873134779194, + "loss": 2.8341, + "step": 7123 + }, + { + "epoch": 0.6454213947588956, + "grad_norm": 0.7626445293426514, + "learning_rate": 0.0001569926901468012, + "loss": 2.7733, + "step": 7124 + }, + { + "epoch": 0.6455119929333424, + "grad_norm": 0.7505971789360046, + "learning_rate": 0.00015698664894581044, + "loss": 2.8184, + "step": 7125 + }, + { + "epoch": 0.6456025911077892, + "grad_norm": 0.7433246970176697, + "learning_rate": 0.0001569806077448197, + "loss": 2.703, + "step": 7126 + }, + { + "epoch": 0.645693189282236, + "grad_norm": 0.9005174040794373, + "learning_rate": 0.0001569745665438289, + "loss": 2.629, + "step": 7127 + }, + { + "epoch": 0.6457837874566827, + "grad_norm": 0.7752107977867126, + "learning_rate": 0.00015696852534283817, + "loss": 2.6958, + "step": 7128 + }, + { + "epoch": 0.6458743856311295, + "grad_norm": 0.7804620265960693, + "learning_rate": 0.0001569624841418474, + "loss": 2.7358, + "step": 7129 + }, + { + "epoch": 0.6459649838055763, + "grad_norm": 0.6996299028396606, + "learning_rate": 0.00015695644294085666, + "loss": 2.0078, + "step": 7130 + }, + { + "epoch": 0.6460555819800231, + "grad_norm": 0.7706143856048584, + "learning_rate": 0.0001569504017398659, + "loss": 2.7974, + "step": 7131 + }, + { + "epoch": 0.6461461801544699, + "grad_norm": 0.598340630531311, + "learning_rate": 0.00015694436053887513, + "loss": 1.7283, + "step": 7132 + }, + { + "epoch": 0.6462367783289167, + "grad_norm": 0.8167099952697754, + "learning_rate": 0.0001569383193378844, + "loss": 2.7064, + "step": 7133 + }, + { + "epoch": 0.6463273765033635, + "grad_norm": 0.782568097114563, + "learning_rate": 0.00015693227813689362, + "loss": 2.9489, + "step": 7134 + }, + { + "epoch": 0.6464179746778103, + "grad_norm": 0.7846044301986694, + "learning_rate": 0.00015692623693590286, + "loss": 2.8103, + "step": 7135 + }, + { + "epoch": 0.646508572852257, + "grad_norm": 0.7256118059158325, + "learning_rate": 0.0001569201957349121, + "loss": 2.8687, + "step": 7136 + }, + { + "epoch": 0.6465991710267038, + "grad_norm": 0.774036169052124, + "learning_rate": 0.00015691415453392135, + "loss": 2.7842, + "step": 7137 + }, + { + "epoch": 0.6466897692011506, + "grad_norm": 0.7053288221359253, + "learning_rate": 0.0001569081133329306, + "loss": 2.1031, + "step": 7138 + }, + { + "epoch": 0.6467803673755974, + "grad_norm": 0.7443202137947083, + "learning_rate": 0.00015690207213193985, + "loss": 2.6275, + "step": 7139 + }, + { + "epoch": 0.6468709655500442, + "grad_norm": 0.7482186555862427, + "learning_rate": 0.00015689603093094908, + "loss": 2.6478, + "step": 7140 + }, + { + "epoch": 0.646961563724491, + "grad_norm": 0.7790105938911438, + "learning_rate": 0.00015688998972995832, + "loss": 2.7504, + "step": 7141 + }, + { + "epoch": 0.6470521618989378, + "grad_norm": 0.7654740214347839, + "learning_rate": 0.00015688394852896758, + "loss": 2.6498, + "step": 7142 + }, + { + "epoch": 0.6471427600733846, + "grad_norm": 0.8270073533058167, + "learning_rate": 0.0001568779073279768, + "loss": 2.7354, + "step": 7143 + }, + { + "epoch": 0.6472333582478313, + "grad_norm": 0.7203041315078735, + "learning_rate": 0.00015687186612698605, + "loss": 2.0854, + "step": 7144 + }, + { + "epoch": 0.6473239564222781, + "grad_norm": 0.7537549734115601, + "learning_rate": 0.00015686582492599528, + "loss": 2.6934, + "step": 7145 + }, + { + "epoch": 0.6474145545967249, + "grad_norm": 0.8047160506248474, + "learning_rate": 0.00015685978372500454, + "loss": 3.0123, + "step": 7146 + }, + { + "epoch": 0.6475051527711717, + "grad_norm": 0.764214038848877, + "learning_rate": 0.0001568537425240138, + "loss": 2.8106, + "step": 7147 + }, + { + "epoch": 0.6475957509456185, + "grad_norm": 1.046268105506897, + "learning_rate": 0.000156847701323023, + "loss": 2.6831, + "step": 7148 + }, + { + "epoch": 0.6476863491200653, + "grad_norm": 0.8015698790550232, + "learning_rate": 0.00015684166012203227, + "loss": 2.7084, + "step": 7149 + }, + { + "epoch": 0.647776947294512, + "grad_norm": 0.7498407363891602, + "learning_rate": 0.0001568356189210415, + "loss": 2.5817, + "step": 7150 + }, + { + "epoch": 0.6478675454689587, + "grad_norm": 0.8691050410270691, + "learning_rate": 0.00015682957772005077, + "loss": 2.8888, + "step": 7151 + }, + { + "epoch": 0.6479581436434055, + "grad_norm": 0.7540866732597351, + "learning_rate": 0.00015682353651906, + "loss": 2.7272, + "step": 7152 + }, + { + "epoch": 0.6480487418178523, + "grad_norm": 0.7820520401000977, + "learning_rate": 0.00015681749531806923, + "loss": 2.8812, + "step": 7153 + }, + { + "epoch": 0.6481393399922991, + "grad_norm": 0.839358925819397, + "learning_rate": 0.0001568114541170785, + "loss": 2.8856, + "step": 7154 + }, + { + "epoch": 0.6482299381667459, + "grad_norm": 0.7329369783401489, + "learning_rate": 0.00015680541291608773, + "loss": 2.0349, + "step": 7155 + }, + { + "epoch": 0.6483205363411927, + "grad_norm": 0.7801313996315002, + "learning_rate": 0.00015679937171509696, + "loss": 2.8317, + "step": 7156 + }, + { + "epoch": 0.6484111345156395, + "grad_norm": 0.9333421587944031, + "learning_rate": 0.0001567933305141062, + "loss": 3.0576, + "step": 7157 + }, + { + "epoch": 0.6485017326900863, + "grad_norm": 0.7718250155448914, + "learning_rate": 0.00015678728931311546, + "loss": 2.611, + "step": 7158 + }, + { + "epoch": 0.648592330864533, + "grad_norm": 0.7998504638671875, + "learning_rate": 0.0001567812481121247, + "loss": 2.909, + "step": 7159 + }, + { + "epoch": 0.6486829290389798, + "grad_norm": 0.7955465912818909, + "learning_rate": 0.00015677520691113395, + "loss": 2.9158, + "step": 7160 + }, + { + "epoch": 0.6487735272134266, + "grad_norm": 0.7760422229766846, + "learning_rate": 0.00015676916571014316, + "loss": 2.7121, + "step": 7161 + }, + { + "epoch": 0.6488641253878734, + "grad_norm": 0.7811868786811829, + "learning_rate": 0.00015676312450915242, + "loss": 2.8794, + "step": 7162 + }, + { + "epoch": 0.6489547235623202, + "grad_norm": 0.7678468227386475, + "learning_rate": 0.00015675708330816168, + "loss": 2.5061, + "step": 7163 + }, + { + "epoch": 0.649045321736767, + "grad_norm": 0.7853142619132996, + "learning_rate": 0.00015675104210717092, + "loss": 2.7637, + "step": 7164 + }, + { + "epoch": 0.6491359199112138, + "grad_norm": 0.8809463977813721, + "learning_rate": 0.00015674500090618015, + "loss": 2.7913, + "step": 7165 + }, + { + "epoch": 0.6492265180856606, + "grad_norm": 0.808484673500061, + "learning_rate": 0.00015673895970518939, + "loss": 2.8198, + "step": 7166 + }, + { + "epoch": 0.6493171162601074, + "grad_norm": 0.8155338168144226, + "learning_rate": 0.00015673291850419865, + "loss": 2.8145, + "step": 7167 + }, + { + "epoch": 0.6494077144345541, + "grad_norm": 0.7522750496864319, + "learning_rate": 0.00015672687730320788, + "loss": 2.6749, + "step": 7168 + }, + { + "epoch": 0.6494983126090009, + "grad_norm": 0.8672066926956177, + "learning_rate": 0.00015672083610221711, + "loss": 3.0427, + "step": 7169 + }, + { + "epoch": 0.6495889107834477, + "grad_norm": 0.7745272517204285, + "learning_rate": 0.00015671479490122638, + "loss": 3.0066, + "step": 7170 + }, + { + "epoch": 0.6496795089578945, + "grad_norm": 0.7679499983787537, + "learning_rate": 0.0001567087537002356, + "loss": 2.7759, + "step": 7171 + }, + { + "epoch": 0.6497701071323413, + "grad_norm": 0.8137831687927246, + "learning_rate": 0.00015670271249924487, + "loss": 2.6598, + "step": 7172 + }, + { + "epoch": 0.6498607053067881, + "grad_norm": 0.8636423945426941, + "learning_rate": 0.0001566966712982541, + "loss": 2.9804, + "step": 7173 + }, + { + "epoch": 0.6499513034812349, + "grad_norm": 0.7932905554771423, + "learning_rate": 0.00015669063009726334, + "loss": 2.8852, + "step": 7174 + }, + { + "epoch": 0.6500419016556817, + "grad_norm": 0.7591701745986938, + "learning_rate": 0.00015668458889627257, + "loss": 2.614, + "step": 7175 + }, + { + "epoch": 0.6501324998301284, + "grad_norm": 0.5920486450195312, + "learning_rate": 0.00015667854769528183, + "loss": 1.3435, + "step": 7176 + }, + { + "epoch": 0.6502230980045752, + "grad_norm": 0.7862117886543274, + "learning_rate": 0.0001566725064942911, + "loss": 2.6568, + "step": 7177 + }, + { + "epoch": 0.650313696179022, + "grad_norm": 0.8015889525413513, + "learning_rate": 0.0001566664652933003, + "loss": 2.8441, + "step": 7178 + }, + { + "epoch": 0.6504042943534688, + "grad_norm": 0.8199909329414368, + "learning_rate": 0.00015666042409230956, + "loss": 3.02, + "step": 7179 + }, + { + "epoch": 0.6504948925279156, + "grad_norm": 0.7799011468887329, + "learning_rate": 0.0001566543828913188, + "loss": 3.0221, + "step": 7180 + }, + { + "epoch": 0.6505854907023624, + "grad_norm": 0.7971848845481873, + "learning_rate": 0.00015664834169032806, + "loss": 3.2006, + "step": 7181 + }, + { + "epoch": 0.6506760888768092, + "grad_norm": 0.8258726596832275, + "learning_rate": 0.0001566423004893373, + "loss": 3.0606, + "step": 7182 + }, + { + "epoch": 0.650766687051256, + "grad_norm": 0.7385668158531189, + "learning_rate": 0.00015663625928834653, + "loss": 2.6939, + "step": 7183 + }, + { + "epoch": 0.6508572852257027, + "grad_norm": 0.8077927231788635, + "learning_rate": 0.0001566302180873558, + "loss": 2.6804, + "step": 7184 + }, + { + "epoch": 0.6509478834001495, + "grad_norm": 0.7573215961456299, + "learning_rate": 0.00015662417688636502, + "loss": 2.7015, + "step": 7185 + }, + { + "epoch": 0.6510384815745963, + "grad_norm": 0.7825071215629578, + "learning_rate": 0.00015661813568537426, + "loss": 2.6687, + "step": 7186 + }, + { + "epoch": 0.6511290797490431, + "grad_norm": 0.7521699666976929, + "learning_rate": 0.0001566120944843835, + "loss": 2.7517, + "step": 7187 + }, + { + "epoch": 0.6512196779234899, + "grad_norm": 0.694229245185852, + "learning_rate": 0.00015660605328339275, + "loss": 2.4427, + "step": 7188 + }, + { + "epoch": 0.6513102760979367, + "grad_norm": 0.8046308159828186, + "learning_rate": 0.00015660001208240199, + "loss": 2.8164, + "step": 7189 + }, + { + "epoch": 0.6514008742723834, + "grad_norm": 0.7516562342643738, + "learning_rate": 0.00015659397088141125, + "loss": 2.7331, + "step": 7190 + }, + { + "epoch": 0.6514914724468301, + "grad_norm": 0.811676025390625, + "learning_rate": 0.00015658792968042045, + "loss": 2.7787, + "step": 7191 + }, + { + "epoch": 0.6515820706212769, + "grad_norm": 0.8960166573524475, + "learning_rate": 0.00015658188847942971, + "loss": 2.6614, + "step": 7192 + }, + { + "epoch": 0.6516726687957237, + "grad_norm": 0.7341805696487427, + "learning_rate": 0.00015657584727843898, + "loss": 2.5056, + "step": 7193 + }, + { + "epoch": 0.6517632669701705, + "grad_norm": 0.7384070754051208, + "learning_rate": 0.0001565698060774482, + "loss": 2.5248, + "step": 7194 + }, + { + "epoch": 0.6518538651446173, + "grad_norm": 0.7450034618377686, + "learning_rate": 0.00015656376487645744, + "loss": 2.8385, + "step": 7195 + }, + { + "epoch": 0.6519444633190641, + "grad_norm": 0.7600663900375366, + "learning_rate": 0.00015655772367546668, + "loss": 2.5257, + "step": 7196 + }, + { + "epoch": 0.6520350614935109, + "grad_norm": 0.7619049549102783, + "learning_rate": 0.00015655168247447594, + "loss": 2.8278, + "step": 7197 + }, + { + "epoch": 0.6521256596679577, + "grad_norm": 0.8092134594917297, + "learning_rate": 0.00015654564127348517, + "loss": 3.1985, + "step": 7198 + }, + { + "epoch": 0.6522162578424044, + "grad_norm": 0.958672285079956, + "learning_rate": 0.0001565396000724944, + "loss": 2.8961, + "step": 7199 + }, + { + "epoch": 0.6523068560168512, + "grad_norm": 0.7848172783851624, + "learning_rate": 0.00015653355887150367, + "loss": 2.8356, + "step": 7200 + }, + { + "epoch": 0.652397454191298, + "grad_norm": 0.7418564558029175, + "learning_rate": 0.0001565275176705129, + "loss": 2.7976, + "step": 7201 + }, + { + "epoch": 0.6524880523657448, + "grad_norm": 0.7698329091072083, + "learning_rate": 0.00015652147646952216, + "loss": 2.9695, + "step": 7202 + }, + { + "epoch": 0.6525786505401916, + "grad_norm": 0.7409756183624268, + "learning_rate": 0.0001565154352685314, + "loss": 2.819, + "step": 7203 + }, + { + "epoch": 0.6526692487146384, + "grad_norm": 0.8100387454032898, + "learning_rate": 0.00015650939406754063, + "loss": 2.7516, + "step": 7204 + }, + { + "epoch": 0.6527598468890852, + "grad_norm": 0.8059036135673523, + "learning_rate": 0.00015650335286654987, + "loss": 2.7453, + "step": 7205 + }, + { + "epoch": 0.652850445063532, + "grad_norm": 0.7366684079170227, + "learning_rate": 0.00015649731166555913, + "loss": 2.6249, + "step": 7206 + }, + { + "epoch": 0.6529410432379787, + "grad_norm": 0.7689147591590881, + "learning_rate": 0.00015649127046456836, + "loss": 2.8157, + "step": 7207 + }, + { + "epoch": 0.6530316414124255, + "grad_norm": 0.7962402105331421, + "learning_rate": 0.0001564852292635776, + "loss": 3.1325, + "step": 7208 + }, + { + "epoch": 0.6531222395868723, + "grad_norm": 0.7366228699684143, + "learning_rate": 0.00015647918806258686, + "loss": 2.6431, + "step": 7209 + }, + { + "epoch": 0.6532128377613191, + "grad_norm": 0.7772252559661865, + "learning_rate": 0.0001564731468615961, + "loss": 2.693, + "step": 7210 + }, + { + "epoch": 0.6533034359357659, + "grad_norm": 0.7566537857055664, + "learning_rate": 0.00015646710566060535, + "loss": 2.7855, + "step": 7211 + }, + { + "epoch": 0.6533940341102127, + "grad_norm": 0.7521429657936096, + "learning_rate": 0.00015646106445961456, + "loss": 2.8519, + "step": 7212 + }, + { + "epoch": 0.6534846322846595, + "grad_norm": 0.7411168217658997, + "learning_rate": 0.00015645502325862382, + "loss": 2.8857, + "step": 7213 + }, + { + "epoch": 0.6535752304591063, + "grad_norm": 0.8834652900695801, + "learning_rate": 0.00015644898205763308, + "loss": 2.6211, + "step": 7214 + }, + { + "epoch": 0.653665828633553, + "grad_norm": 0.832001805305481, + "learning_rate": 0.00015644294085664231, + "loss": 2.6795, + "step": 7215 + }, + { + "epoch": 0.6537564268079998, + "grad_norm": 0.8013150691986084, + "learning_rate": 0.00015643689965565155, + "loss": 2.8788, + "step": 7216 + }, + { + "epoch": 0.6538470249824466, + "grad_norm": 0.7439073324203491, + "learning_rate": 0.00015643085845466078, + "loss": 2.6413, + "step": 7217 + }, + { + "epoch": 0.6539376231568934, + "grad_norm": 0.7680752277374268, + "learning_rate": 0.00015642481725367004, + "loss": 2.7578, + "step": 7218 + }, + { + "epoch": 0.6540282213313402, + "grad_norm": 0.771808922290802, + "learning_rate": 0.00015641877605267928, + "loss": 2.7255, + "step": 7219 + }, + { + "epoch": 0.654118819505787, + "grad_norm": 0.8005893230438232, + "learning_rate": 0.0001564127348516885, + "loss": 2.8394, + "step": 7220 + }, + { + "epoch": 0.6542094176802338, + "grad_norm": 0.8161522746086121, + "learning_rate": 0.00015640669365069775, + "loss": 2.9371, + "step": 7221 + }, + { + "epoch": 0.6543000158546806, + "grad_norm": 0.9289745688438416, + "learning_rate": 0.000156400652449707, + "loss": 3.079, + "step": 7222 + }, + { + "epoch": 0.6543906140291273, + "grad_norm": 0.7802804708480835, + "learning_rate": 0.00015639461124871627, + "loss": 2.5855, + "step": 7223 + }, + { + "epoch": 0.6544812122035741, + "grad_norm": 0.7467418313026428, + "learning_rate": 0.0001563885700477255, + "loss": 2.6705, + "step": 7224 + }, + { + "epoch": 0.6545718103780209, + "grad_norm": 0.7641212940216064, + "learning_rate": 0.00015638252884673474, + "loss": 2.5874, + "step": 7225 + }, + { + "epoch": 0.6546624085524677, + "grad_norm": 0.6190158724784851, + "learning_rate": 0.00015637648764574397, + "loss": 1.3979, + "step": 7226 + }, + { + "epoch": 0.6547530067269145, + "grad_norm": 0.7720859050750732, + "learning_rate": 0.00015637044644475323, + "loss": 1.9252, + "step": 7227 + }, + { + "epoch": 0.6548436049013613, + "grad_norm": 0.7755964398384094, + "learning_rate": 0.00015636440524376247, + "loss": 2.7564, + "step": 7228 + }, + { + "epoch": 0.6549342030758081, + "grad_norm": 0.8444638252258301, + "learning_rate": 0.0001563583640427717, + "loss": 2.698, + "step": 7229 + }, + { + "epoch": 0.6550248012502549, + "grad_norm": 0.9187250137329102, + "learning_rate": 0.00015635232284178096, + "loss": 2.7814, + "step": 7230 + }, + { + "epoch": 0.6551153994247015, + "grad_norm": 0.8031560182571411, + "learning_rate": 0.0001563462816407902, + "loss": 2.6531, + "step": 7231 + }, + { + "epoch": 0.6552059975991483, + "grad_norm": 0.754973828792572, + "learning_rate": 0.00015634024043979946, + "loss": 2.6821, + "step": 7232 + }, + { + "epoch": 0.6552965957735951, + "grad_norm": 0.7790125012397766, + "learning_rate": 0.00015633419923880866, + "loss": 2.976, + "step": 7233 + }, + { + "epoch": 0.6553871939480419, + "grad_norm": 0.7937088012695312, + "learning_rate": 0.00015632815803781792, + "loss": 2.7631, + "step": 7234 + }, + { + "epoch": 0.6554777921224887, + "grad_norm": 0.822517991065979, + "learning_rate": 0.00015632211683682716, + "loss": 3.0563, + "step": 7235 + }, + { + "epoch": 0.6555683902969355, + "grad_norm": 0.8051598072052002, + "learning_rate": 0.00015631607563583642, + "loss": 2.7698, + "step": 7236 + }, + { + "epoch": 0.6556589884713823, + "grad_norm": 0.7294332981109619, + "learning_rate": 0.00015631003443484565, + "loss": 1.9277, + "step": 7237 + }, + { + "epoch": 0.655749586645829, + "grad_norm": 0.8874070644378662, + "learning_rate": 0.0001563039932338549, + "loss": 2.6116, + "step": 7238 + }, + { + "epoch": 0.6558401848202758, + "grad_norm": 0.825710117816925, + "learning_rate": 0.00015629795203286415, + "loss": 2.7716, + "step": 7239 + }, + { + "epoch": 0.6559307829947226, + "grad_norm": 0.7577652335166931, + "learning_rate": 0.00015629191083187338, + "loss": 2.5239, + "step": 7240 + }, + { + "epoch": 0.6560213811691694, + "grad_norm": 0.8164464235305786, + "learning_rate": 0.00015628586963088264, + "loss": 2.9229, + "step": 7241 + }, + { + "epoch": 0.6561119793436162, + "grad_norm": 0.8221468329429626, + "learning_rate": 0.00015627982842989185, + "loss": 2.8126, + "step": 7242 + }, + { + "epoch": 0.656202577518063, + "grad_norm": 0.7524198293685913, + "learning_rate": 0.0001562737872289011, + "loss": 2.6969, + "step": 7243 + }, + { + "epoch": 0.6562931756925098, + "grad_norm": 0.7724577784538269, + "learning_rate": 0.00015626774602791037, + "loss": 2.6979, + "step": 7244 + }, + { + "epoch": 0.6563837738669566, + "grad_norm": 0.7568369507789612, + "learning_rate": 0.0001562617048269196, + "loss": 2.6336, + "step": 7245 + }, + { + "epoch": 0.6564743720414034, + "grad_norm": 0.7027232646942139, + "learning_rate": 0.00015625566362592884, + "loss": 2.2213, + "step": 7246 + }, + { + "epoch": 0.6565649702158501, + "grad_norm": 0.6872568726539612, + "learning_rate": 0.00015624962242493808, + "loss": 2.2257, + "step": 7247 + }, + { + "epoch": 0.6566555683902969, + "grad_norm": 0.7831913828849792, + "learning_rate": 0.00015624358122394734, + "loss": 2.9617, + "step": 7248 + }, + { + "epoch": 0.6567461665647437, + "grad_norm": 0.8203994631767273, + "learning_rate": 0.00015623754002295657, + "loss": 2.8292, + "step": 7249 + }, + { + "epoch": 0.6568367647391905, + "grad_norm": 0.8654953241348267, + "learning_rate": 0.0001562314988219658, + "loss": 3.0704, + "step": 7250 + }, + { + "epoch": 0.6569273629136373, + "grad_norm": 0.7954952120780945, + "learning_rate": 0.00015622545762097504, + "loss": 2.9826, + "step": 7251 + }, + { + "epoch": 0.6570179610880841, + "grad_norm": 0.8077565431594849, + "learning_rate": 0.0001562194164199843, + "loss": 2.9692, + "step": 7252 + }, + { + "epoch": 0.6571085592625309, + "grad_norm": 0.8474165797233582, + "learning_rate": 0.00015621337521899356, + "loss": 3.166, + "step": 7253 + }, + { + "epoch": 0.6571991574369777, + "grad_norm": 0.8303209543228149, + "learning_rate": 0.0001562073340180028, + "loss": 3.0586, + "step": 7254 + }, + { + "epoch": 0.6572897556114244, + "grad_norm": 0.771838366985321, + "learning_rate": 0.00015620129281701203, + "loss": 2.517, + "step": 7255 + }, + { + "epoch": 0.6573803537858712, + "grad_norm": 0.8426068425178528, + "learning_rate": 0.00015619525161602126, + "loss": 2.8246, + "step": 7256 + }, + { + "epoch": 0.657470951960318, + "grad_norm": 0.7469764351844788, + "learning_rate": 0.00015618921041503052, + "loss": 2.6398, + "step": 7257 + }, + { + "epoch": 0.6575615501347648, + "grad_norm": 0.9033700227737427, + "learning_rate": 0.00015618316921403976, + "loss": 2.8605, + "step": 7258 + }, + { + "epoch": 0.6576521483092116, + "grad_norm": 0.8014668822288513, + "learning_rate": 0.000156177128013049, + "loss": 2.8085, + "step": 7259 + }, + { + "epoch": 0.6577427464836584, + "grad_norm": 0.8181038498878479, + "learning_rate": 0.00015617108681205825, + "loss": 2.641, + "step": 7260 + }, + { + "epoch": 0.6578333446581052, + "grad_norm": 0.8099126219749451, + "learning_rate": 0.0001561650456110675, + "loss": 2.9934, + "step": 7261 + }, + { + "epoch": 0.657923942832552, + "grad_norm": 0.7616696357727051, + "learning_rate": 0.00015615900441007675, + "loss": 2.6654, + "step": 7262 + }, + { + "epoch": 0.6580145410069987, + "grad_norm": 0.7380683422088623, + "learning_rate": 0.00015615296320908596, + "loss": 2.8366, + "step": 7263 + }, + { + "epoch": 0.6581051391814455, + "grad_norm": 0.8364399075508118, + "learning_rate": 0.00015614692200809522, + "loss": 2.8078, + "step": 7264 + }, + { + "epoch": 0.6581957373558923, + "grad_norm": 0.7541628479957581, + "learning_rate": 0.00015614088080710445, + "loss": 2.7518, + "step": 7265 + }, + { + "epoch": 0.6582863355303391, + "grad_norm": 0.6499919295310974, + "learning_rate": 0.0001561348396061137, + "loss": 1.9568, + "step": 7266 + }, + { + "epoch": 0.6583769337047859, + "grad_norm": 0.7784257531166077, + "learning_rate": 0.00015612879840512295, + "loss": 2.7767, + "step": 7267 + }, + { + "epoch": 0.6584675318792327, + "grad_norm": 0.7227312326431274, + "learning_rate": 0.00015612275720413218, + "loss": 2.7202, + "step": 7268 + }, + { + "epoch": 0.6585581300536795, + "grad_norm": 0.7347221970558167, + "learning_rate": 0.00015611671600314144, + "loss": 2.7027, + "step": 7269 + }, + { + "epoch": 0.6586487282281263, + "grad_norm": 0.8338147401809692, + "learning_rate": 0.00015611067480215068, + "loss": 2.9689, + "step": 7270 + }, + { + "epoch": 0.6587393264025729, + "grad_norm": 0.7702489495277405, + "learning_rate": 0.0001561046336011599, + "loss": 2.7426, + "step": 7271 + }, + { + "epoch": 0.6588299245770197, + "grad_norm": 0.8105913400650024, + "learning_rate": 0.00015609859240016914, + "loss": 2.6835, + "step": 7272 + }, + { + "epoch": 0.6589205227514665, + "grad_norm": 0.8654102087020874, + "learning_rate": 0.0001560925511991784, + "loss": 2.7504, + "step": 7273 + }, + { + "epoch": 0.6590111209259133, + "grad_norm": 0.7874342203140259, + "learning_rate": 0.00015608650999818767, + "loss": 2.7736, + "step": 7274 + }, + { + "epoch": 0.6591017191003601, + "grad_norm": 0.8677444458007812, + "learning_rate": 0.0001560804687971969, + "loss": 2.8029, + "step": 7275 + }, + { + "epoch": 0.6591923172748069, + "grad_norm": 0.6980878114700317, + "learning_rate": 0.00015607442759620613, + "loss": 2.4906, + "step": 7276 + }, + { + "epoch": 0.6592829154492537, + "grad_norm": 0.916812539100647, + "learning_rate": 0.00015606838639521537, + "loss": 3.1364, + "step": 7277 + }, + { + "epoch": 0.6593735136237004, + "grad_norm": 0.7759513258934021, + "learning_rate": 0.00015606234519422463, + "loss": 2.5718, + "step": 7278 + }, + { + "epoch": 0.6594641117981472, + "grad_norm": 0.8244889378547668, + "learning_rate": 0.00015605630399323386, + "loss": 2.7968, + "step": 7279 + }, + { + "epoch": 0.659554709972594, + "grad_norm": 0.7820501327514648, + "learning_rate": 0.0001560502627922431, + "loss": 2.4439, + "step": 7280 + }, + { + "epoch": 0.6596453081470408, + "grad_norm": 0.7556466460227966, + "learning_rate": 0.00015604422159125233, + "loss": 2.0661, + "step": 7281 + }, + { + "epoch": 0.6597359063214876, + "grad_norm": 0.7344481348991394, + "learning_rate": 0.0001560381803902616, + "loss": 2.9992, + "step": 7282 + }, + { + "epoch": 0.6598265044959344, + "grad_norm": 0.739254891872406, + "learning_rate": 0.00015603213918927085, + "loss": 2.6717, + "step": 7283 + }, + { + "epoch": 0.6599171026703812, + "grad_norm": 0.7798665761947632, + "learning_rate": 0.00015602609798828006, + "loss": 2.4446, + "step": 7284 + }, + { + "epoch": 0.660007700844828, + "grad_norm": 0.6789169907569885, + "learning_rate": 0.00015602005678728932, + "loss": 2.2963, + "step": 7285 + }, + { + "epoch": 0.6600982990192747, + "grad_norm": 0.8107040524482727, + "learning_rate": 0.00015601401558629856, + "loss": 2.9611, + "step": 7286 + }, + { + "epoch": 0.6601888971937215, + "grad_norm": 0.7999677062034607, + "learning_rate": 0.00015600797438530782, + "loss": 2.8721, + "step": 7287 + }, + { + "epoch": 0.6602794953681683, + "grad_norm": 0.6556376218795776, + "learning_rate": 0.00015600193318431705, + "loss": 2.1146, + "step": 7288 + }, + { + "epoch": 0.6603700935426151, + "grad_norm": 0.8645402789115906, + "learning_rate": 0.00015599589198332629, + "loss": 2.7495, + "step": 7289 + }, + { + "epoch": 0.6604606917170619, + "grad_norm": 0.8557880520820618, + "learning_rate": 0.00015598985078233555, + "loss": 3.0428, + "step": 7290 + }, + { + "epoch": 0.6605512898915087, + "grad_norm": 0.7954426407814026, + "learning_rate": 0.00015598380958134478, + "loss": 2.651, + "step": 7291 + }, + { + "epoch": 0.6606418880659555, + "grad_norm": 0.721440851688385, + "learning_rate": 0.00015597776838035404, + "loss": 2.7061, + "step": 7292 + }, + { + "epoch": 0.6607324862404023, + "grad_norm": 0.7758052349090576, + "learning_rate": 0.00015597172717936325, + "loss": 2.9759, + "step": 7293 + }, + { + "epoch": 0.660823084414849, + "grad_norm": 0.7159985303878784, + "learning_rate": 0.0001559656859783725, + "loss": 2.2386, + "step": 7294 + }, + { + "epoch": 0.6609136825892958, + "grad_norm": 0.7803939580917358, + "learning_rate": 0.00015595964477738174, + "loss": 2.5923, + "step": 7295 + }, + { + "epoch": 0.6610042807637426, + "grad_norm": 0.6946631073951721, + "learning_rate": 0.000155953603576391, + "loss": 2.0351, + "step": 7296 + }, + { + "epoch": 0.6610948789381894, + "grad_norm": 0.7257297039031982, + "learning_rate": 0.00015594756237540024, + "loss": 2.4494, + "step": 7297 + }, + { + "epoch": 0.6611854771126362, + "grad_norm": 0.7393313646316528, + "learning_rate": 0.00015594152117440947, + "loss": 2.6917, + "step": 7298 + }, + { + "epoch": 0.661276075287083, + "grad_norm": 0.7583003640174866, + "learning_rate": 0.00015593547997341873, + "loss": 2.9095, + "step": 7299 + }, + { + "epoch": 0.6613666734615298, + "grad_norm": 0.7919238805770874, + "learning_rate": 0.00015592943877242797, + "loss": 2.6278, + "step": 7300 + }, + { + "epoch": 0.6614572716359766, + "grad_norm": 0.7910330891609192, + "learning_rate": 0.0001559233975714372, + "loss": 2.5977, + "step": 7301 + }, + { + "epoch": 0.6615478698104234, + "grad_norm": 0.7241906523704529, + "learning_rate": 0.00015591735637044644, + "loss": 1.9946, + "step": 7302 + }, + { + "epoch": 0.6616384679848701, + "grad_norm": 0.7681901454925537, + "learning_rate": 0.0001559113151694557, + "loss": 2.8051, + "step": 7303 + }, + { + "epoch": 0.6617290661593169, + "grad_norm": 0.8705862164497375, + "learning_rate": 0.00015590527396846496, + "loss": 2.7029, + "step": 7304 + }, + { + "epoch": 0.6618196643337637, + "grad_norm": 0.9075371623039246, + "learning_rate": 0.0001558992327674742, + "loss": 2.9206, + "step": 7305 + }, + { + "epoch": 0.6619102625082105, + "grad_norm": 0.7970467209815979, + "learning_rate": 0.00015589319156648343, + "loss": 3.0078, + "step": 7306 + }, + { + "epoch": 0.6620008606826573, + "grad_norm": 0.7680588960647583, + "learning_rate": 0.00015588715036549266, + "loss": 2.6993, + "step": 7307 + }, + { + "epoch": 0.6620914588571041, + "grad_norm": 0.7806230187416077, + "learning_rate": 0.00015588110916450192, + "loss": 2.8657, + "step": 7308 + }, + { + "epoch": 0.6621820570315509, + "grad_norm": 0.8113061785697937, + "learning_rate": 0.00015587506796351116, + "loss": 2.973, + "step": 7309 + }, + { + "epoch": 0.6622726552059977, + "grad_norm": 0.8042522072792053, + "learning_rate": 0.0001558690267625204, + "loss": 2.8747, + "step": 7310 + }, + { + "epoch": 0.6623632533804444, + "grad_norm": 0.7432583570480347, + "learning_rate": 0.00015586298556152962, + "loss": 2.5984, + "step": 7311 + }, + { + "epoch": 0.6624538515548911, + "grad_norm": 0.7837374806404114, + "learning_rate": 0.00015585694436053889, + "loss": 2.7229, + "step": 7312 + }, + { + "epoch": 0.6625444497293379, + "grad_norm": 0.7928093075752258, + "learning_rate": 0.00015585090315954815, + "loss": 2.8665, + "step": 7313 + }, + { + "epoch": 0.6626350479037847, + "grad_norm": 0.7946595549583435, + "learning_rate": 0.00015584486195855735, + "loss": 2.6919, + "step": 7314 + }, + { + "epoch": 0.6627256460782315, + "grad_norm": 0.7415473461151123, + "learning_rate": 0.00015583882075756661, + "loss": 3.0501, + "step": 7315 + }, + { + "epoch": 0.6628162442526783, + "grad_norm": 0.8271952271461487, + "learning_rate": 0.00015583277955657585, + "loss": 2.8393, + "step": 7316 + }, + { + "epoch": 0.662906842427125, + "grad_norm": 0.7971226572990417, + "learning_rate": 0.0001558267383555851, + "loss": 2.6206, + "step": 7317 + }, + { + "epoch": 0.6629974406015718, + "grad_norm": 0.8688997626304626, + "learning_rate": 0.00015582069715459434, + "loss": 2.4429, + "step": 7318 + }, + { + "epoch": 0.6630880387760186, + "grad_norm": 0.7603760957717896, + "learning_rate": 0.00015581465595360358, + "loss": 2.9682, + "step": 7319 + }, + { + "epoch": 0.6631786369504654, + "grad_norm": 0.8421613574028015, + "learning_rate": 0.00015580861475261284, + "loss": 2.7372, + "step": 7320 + }, + { + "epoch": 0.6632692351249122, + "grad_norm": 0.7556456923484802, + "learning_rate": 0.00015580257355162207, + "loss": 2.6879, + "step": 7321 + }, + { + "epoch": 0.663359833299359, + "grad_norm": 0.8173494338989258, + "learning_rate": 0.0001557965323506313, + "loss": 3.1094, + "step": 7322 + }, + { + "epoch": 0.6634504314738058, + "grad_norm": 0.7635543346405029, + "learning_rate": 0.00015579049114964054, + "loss": 2.7366, + "step": 7323 + }, + { + "epoch": 0.6635410296482526, + "grad_norm": 0.8046569228172302, + "learning_rate": 0.0001557844499486498, + "loss": 2.8178, + "step": 7324 + }, + { + "epoch": 0.6636316278226994, + "grad_norm": 0.7939793467521667, + "learning_rate": 0.00015577840874765904, + "loss": 2.6348, + "step": 7325 + }, + { + "epoch": 0.6637222259971461, + "grad_norm": 0.7213746905326843, + "learning_rate": 0.0001557723675466683, + "loss": 2.501, + "step": 7326 + }, + { + "epoch": 0.6638128241715929, + "grad_norm": 0.8822300434112549, + "learning_rate": 0.00015576632634567753, + "loss": 2.7973, + "step": 7327 + }, + { + "epoch": 0.6639034223460397, + "grad_norm": 0.6955375671386719, + "learning_rate": 0.00015576028514468677, + "loss": 2.1082, + "step": 7328 + }, + { + "epoch": 0.6639940205204865, + "grad_norm": 0.8044022917747498, + "learning_rate": 0.00015575424394369603, + "loss": 2.8562, + "step": 7329 + }, + { + "epoch": 0.6640846186949333, + "grad_norm": 0.745799720287323, + "learning_rate": 0.00015574820274270526, + "loss": 2.7725, + "step": 7330 + }, + { + "epoch": 0.6641752168693801, + "grad_norm": 0.7632625699043274, + "learning_rate": 0.0001557421615417145, + "loss": 2.6439, + "step": 7331 + }, + { + "epoch": 0.6642658150438269, + "grad_norm": 0.7096785306930542, + "learning_rate": 0.00015573612034072373, + "loss": 2.1943, + "step": 7332 + }, + { + "epoch": 0.6643564132182737, + "grad_norm": 0.7842033505439758, + "learning_rate": 0.000155730079139733, + "loss": 2.8674, + "step": 7333 + }, + { + "epoch": 0.6644470113927204, + "grad_norm": 0.8101304173469543, + "learning_rate": 0.00015572403793874225, + "loss": 2.6985, + "step": 7334 + }, + { + "epoch": 0.6645376095671672, + "grad_norm": 0.8138712644577026, + "learning_rate": 0.00015571799673775146, + "loss": 2.7419, + "step": 7335 + }, + { + "epoch": 0.664628207741614, + "grad_norm": 0.8262655735015869, + "learning_rate": 0.00015571195553676072, + "loss": 2.7057, + "step": 7336 + }, + { + "epoch": 0.6647188059160608, + "grad_norm": 0.8036178350448608, + "learning_rate": 0.00015570591433576995, + "loss": 2.7534, + "step": 7337 + }, + { + "epoch": 0.6648094040905076, + "grad_norm": 0.7976306080818176, + "learning_rate": 0.00015569987313477921, + "loss": 2.7569, + "step": 7338 + }, + { + "epoch": 0.6649000022649544, + "grad_norm": 0.7915976047515869, + "learning_rate": 0.00015569383193378845, + "loss": 2.9076, + "step": 7339 + }, + { + "epoch": 0.6649906004394012, + "grad_norm": 0.7597315907478333, + "learning_rate": 0.00015568779073279768, + "loss": 2.8239, + "step": 7340 + }, + { + "epoch": 0.665081198613848, + "grad_norm": 0.8053820729255676, + "learning_rate": 0.00015568174953180692, + "loss": 2.6229, + "step": 7341 + }, + { + "epoch": 0.6651717967882947, + "grad_norm": 0.8033503293991089, + "learning_rate": 0.00015567570833081618, + "loss": 3.0619, + "step": 7342 + }, + { + "epoch": 0.6652623949627415, + "grad_norm": 0.8817557096481323, + "learning_rate": 0.0001556696671298254, + "loss": 2.5291, + "step": 7343 + }, + { + "epoch": 0.6653529931371883, + "grad_norm": 0.8528518080711365, + "learning_rate": 0.00015566362592883465, + "loss": 2.7678, + "step": 7344 + }, + { + "epoch": 0.6654435913116351, + "grad_norm": 0.7526767253875732, + "learning_rate": 0.0001556575847278439, + "loss": 2.9874, + "step": 7345 + }, + { + "epoch": 0.6655341894860819, + "grad_norm": 0.8056029677391052, + "learning_rate": 0.00015565154352685314, + "loss": 3.0556, + "step": 7346 + }, + { + "epoch": 0.6656247876605287, + "grad_norm": 0.8193864226341248, + "learning_rate": 0.0001556455023258624, + "loss": 2.8796, + "step": 7347 + }, + { + "epoch": 0.6657153858349755, + "grad_norm": 0.7373614311218262, + "learning_rate": 0.0001556394611248716, + "loss": 2.7215, + "step": 7348 + }, + { + "epoch": 0.6658059840094223, + "grad_norm": 0.8236273527145386, + "learning_rate": 0.00015563341992388087, + "loss": 2.7834, + "step": 7349 + }, + { + "epoch": 0.665896582183869, + "grad_norm": 0.7525055408477783, + "learning_rate": 0.00015562737872289013, + "loss": 2.805, + "step": 7350 + }, + { + "epoch": 0.6659871803583158, + "grad_norm": 0.8415221571922302, + "learning_rate": 0.00015562133752189937, + "loss": 3.1167, + "step": 7351 + }, + { + "epoch": 0.6660777785327625, + "grad_norm": 0.8601372838020325, + "learning_rate": 0.0001556152963209086, + "loss": 2.7551, + "step": 7352 + }, + { + "epoch": 0.6661683767072093, + "grad_norm": 0.8252455592155457, + "learning_rate": 0.00015560925511991783, + "loss": 2.9037, + "step": 7353 + }, + { + "epoch": 0.6662589748816561, + "grad_norm": 0.8369109034538269, + "learning_rate": 0.0001556032139189271, + "loss": 2.7829, + "step": 7354 + }, + { + "epoch": 0.6663495730561029, + "grad_norm": 0.7202902436256409, + "learning_rate": 0.00015559717271793633, + "loss": 2.7711, + "step": 7355 + }, + { + "epoch": 0.6664401712305497, + "grad_norm": 0.8473628163337708, + "learning_rate": 0.00015559113151694556, + "loss": 2.6438, + "step": 7356 + }, + { + "epoch": 0.6665307694049964, + "grad_norm": 0.8407551050186157, + "learning_rate": 0.00015558509031595482, + "loss": 2.8757, + "step": 7357 + }, + { + "epoch": 0.6666213675794432, + "grad_norm": 0.7362101674079895, + "learning_rate": 0.00015557904911496406, + "loss": 3.0129, + "step": 7358 + }, + { + "epoch": 0.66671196575389, + "grad_norm": 0.7005151510238647, + "learning_rate": 0.00015557300791397332, + "loss": 2.0918, + "step": 7359 + }, + { + "epoch": 0.6668025639283368, + "grad_norm": 0.8024141192436218, + "learning_rate": 0.00015556696671298255, + "loss": 3.0369, + "step": 7360 + }, + { + "epoch": 0.6668931621027836, + "grad_norm": 0.7561980485916138, + "learning_rate": 0.0001555609255119918, + "loss": 2.819, + "step": 7361 + }, + { + "epoch": 0.6669837602772304, + "grad_norm": 0.8832188248634338, + "learning_rate": 0.00015555488431100102, + "loss": 2.9051, + "step": 7362 + }, + { + "epoch": 0.6670743584516772, + "grad_norm": 0.7998010516166687, + "learning_rate": 0.00015554884311001028, + "loss": 2.709, + "step": 7363 + }, + { + "epoch": 0.667164956626124, + "grad_norm": 0.7917867302894592, + "learning_rate": 0.00015554280190901954, + "loss": 2.8735, + "step": 7364 + }, + { + "epoch": 0.6672555548005707, + "grad_norm": 0.7706528902053833, + "learning_rate": 0.00015553676070802875, + "loss": 2.2612, + "step": 7365 + }, + { + "epoch": 0.6673461529750175, + "grad_norm": 0.7952792048454285, + "learning_rate": 0.000155530719507038, + "loss": 2.7966, + "step": 7366 + }, + { + "epoch": 0.6674367511494643, + "grad_norm": 0.8434330224990845, + "learning_rate": 0.00015552467830604725, + "loss": 2.8969, + "step": 7367 + }, + { + "epoch": 0.6675273493239111, + "grad_norm": 0.7947022318840027, + "learning_rate": 0.0001555186371050565, + "loss": 2.8318, + "step": 7368 + }, + { + "epoch": 0.6676179474983579, + "grad_norm": 0.7761032581329346, + "learning_rate": 0.00015551259590406574, + "loss": 2.8089, + "step": 7369 + }, + { + "epoch": 0.6677085456728047, + "grad_norm": 0.7662448287010193, + "learning_rate": 0.00015550655470307498, + "loss": 2.7638, + "step": 7370 + }, + { + "epoch": 0.6677991438472515, + "grad_norm": 0.7817125916481018, + "learning_rate": 0.0001555005135020842, + "loss": 3.0617, + "step": 7371 + }, + { + "epoch": 0.6678897420216983, + "grad_norm": 0.7575405240058899, + "learning_rate": 0.00015549447230109347, + "loss": 2.6601, + "step": 7372 + }, + { + "epoch": 0.667980340196145, + "grad_norm": 0.7421769499778748, + "learning_rate": 0.0001554884311001027, + "loss": 2.9151, + "step": 7373 + }, + { + "epoch": 0.6680709383705918, + "grad_norm": 0.7940061688423157, + "learning_rate": 0.00015548238989911194, + "loss": 2.9489, + "step": 7374 + }, + { + "epoch": 0.6681615365450386, + "grad_norm": 0.7282138466835022, + "learning_rate": 0.0001554763486981212, + "loss": 2.6307, + "step": 7375 + }, + { + "epoch": 0.6682521347194854, + "grad_norm": 0.7839386463165283, + "learning_rate": 0.00015547030749713043, + "loss": 2.8636, + "step": 7376 + }, + { + "epoch": 0.6683427328939322, + "grad_norm": 0.8015424609184265, + "learning_rate": 0.0001554642662961397, + "loss": 2.9613, + "step": 7377 + }, + { + "epoch": 0.668433331068379, + "grad_norm": 0.7734625935554504, + "learning_rate": 0.0001554582250951489, + "loss": 2.7951, + "step": 7378 + }, + { + "epoch": 0.6685239292428258, + "grad_norm": 0.8189271688461304, + "learning_rate": 0.00015545218389415816, + "loss": 2.8565, + "step": 7379 + }, + { + "epoch": 0.6686145274172726, + "grad_norm": 0.757046103477478, + "learning_rate": 0.00015544614269316742, + "loss": 2.844, + "step": 7380 + }, + { + "epoch": 0.6687051255917194, + "grad_norm": 0.8604857325553894, + "learning_rate": 0.00015544010149217666, + "loss": 3.0304, + "step": 7381 + }, + { + "epoch": 0.6687957237661661, + "grad_norm": 1.012380838394165, + "learning_rate": 0.0001554340602911859, + "loss": 2.599, + "step": 7382 + }, + { + "epoch": 0.6688863219406129, + "grad_norm": 0.9489734768867493, + "learning_rate": 0.00015542801909019513, + "loss": 3.3298, + "step": 7383 + }, + { + "epoch": 0.6689769201150597, + "grad_norm": 0.8390312790870667, + "learning_rate": 0.0001554219778892044, + "loss": 2.9288, + "step": 7384 + }, + { + "epoch": 0.6690675182895065, + "grad_norm": 0.8196800947189331, + "learning_rate": 0.00015541593668821362, + "loss": 2.96, + "step": 7385 + }, + { + "epoch": 0.6691581164639533, + "grad_norm": 0.781050980091095, + "learning_rate": 0.00015540989548722286, + "loss": 2.9601, + "step": 7386 + }, + { + "epoch": 0.6692487146384001, + "grad_norm": 0.744533896446228, + "learning_rate": 0.00015540385428623212, + "loss": 2.7609, + "step": 7387 + }, + { + "epoch": 0.6693393128128469, + "grad_norm": 0.750312328338623, + "learning_rate": 0.00015539781308524135, + "loss": 2.6963, + "step": 7388 + }, + { + "epoch": 0.6694299109872937, + "grad_norm": 0.7890178561210632, + "learning_rate": 0.0001553917718842506, + "loss": 2.5256, + "step": 7389 + }, + { + "epoch": 0.6695205091617404, + "grad_norm": 0.7176899909973145, + "learning_rate": 0.00015538573068325985, + "loss": 2.1561, + "step": 7390 + }, + { + "epoch": 0.6696111073361872, + "grad_norm": 0.7809820771217346, + "learning_rate": 0.00015537968948226908, + "loss": 2.8025, + "step": 7391 + }, + { + "epoch": 0.669701705510634, + "grad_norm": 0.7549856305122375, + "learning_rate": 0.00015537364828127831, + "loss": 2.8701, + "step": 7392 + }, + { + "epoch": 0.6697923036850807, + "grad_norm": 0.8220723271369934, + "learning_rate": 0.00015536760708028758, + "loss": 2.6261, + "step": 7393 + }, + { + "epoch": 0.6698829018595275, + "grad_norm": 0.7666531205177307, + "learning_rate": 0.0001553615658792968, + "loss": 2.8464, + "step": 7394 + }, + { + "epoch": 0.6699735000339743, + "grad_norm": 0.7531270980834961, + "learning_rate": 0.00015535552467830604, + "loss": 2.8027, + "step": 7395 + }, + { + "epoch": 0.670064098208421, + "grad_norm": 0.810744047164917, + "learning_rate": 0.0001553494834773153, + "loss": 2.9696, + "step": 7396 + }, + { + "epoch": 0.6701546963828678, + "grad_norm": 0.8246064186096191, + "learning_rate": 0.00015534344227632454, + "loss": 3.0185, + "step": 7397 + }, + { + "epoch": 0.6702452945573146, + "grad_norm": 0.7859065532684326, + "learning_rate": 0.0001553374010753338, + "loss": 2.8885, + "step": 7398 + }, + { + "epoch": 0.6703358927317614, + "grad_norm": 0.8272415995597839, + "learning_rate": 0.000155331359874343, + "loss": 2.8489, + "step": 7399 + }, + { + "epoch": 0.6704264909062082, + "grad_norm": 0.8384962677955627, + "learning_rate": 0.00015532531867335227, + "loss": 2.9537, + "step": 7400 + }, + { + "epoch": 0.670517089080655, + "grad_norm": 0.8032721877098083, + "learning_rate": 0.0001553192774723615, + "loss": 2.8632, + "step": 7401 + }, + { + "epoch": 0.6706076872551018, + "grad_norm": 0.8313271999359131, + "learning_rate": 0.00015531323627137076, + "loss": 2.8028, + "step": 7402 + }, + { + "epoch": 0.6706982854295486, + "grad_norm": 0.8187699317932129, + "learning_rate": 0.00015530719507038, + "loss": 2.7599, + "step": 7403 + }, + { + "epoch": 0.6707888836039954, + "grad_norm": 0.7086654305458069, + "learning_rate": 0.00015530115386938923, + "loss": 2.6734, + "step": 7404 + }, + { + "epoch": 0.6708794817784421, + "grad_norm": 0.7729156613349915, + "learning_rate": 0.0001552951126683985, + "loss": 2.7125, + "step": 7405 + }, + { + "epoch": 0.6709700799528889, + "grad_norm": 0.812049150466919, + "learning_rate": 0.00015528907146740773, + "loss": 2.8848, + "step": 7406 + }, + { + "epoch": 0.6710606781273357, + "grad_norm": 0.7689921855926514, + "learning_rate": 0.00015528303026641696, + "loss": 2.9499, + "step": 7407 + }, + { + "epoch": 0.6711512763017825, + "grad_norm": 0.8504211902618408, + "learning_rate": 0.0001552769890654262, + "loss": 2.9372, + "step": 7408 + }, + { + "epoch": 0.6712418744762293, + "grad_norm": 0.7990657687187195, + "learning_rate": 0.00015527094786443546, + "loss": 3.0063, + "step": 7409 + }, + { + "epoch": 0.6713324726506761, + "grad_norm": 0.8406283855438232, + "learning_rate": 0.00015526490666344472, + "loss": 2.7891, + "step": 7410 + }, + { + "epoch": 0.6714230708251229, + "grad_norm": 0.7798230051994324, + "learning_rate": 0.00015525886546245395, + "loss": 2.9022, + "step": 7411 + }, + { + "epoch": 0.6715136689995697, + "grad_norm": 0.8415400385856628, + "learning_rate": 0.00015525282426146318, + "loss": 2.9204, + "step": 7412 + }, + { + "epoch": 0.6716042671740164, + "grad_norm": 0.7841160893440247, + "learning_rate": 0.00015524678306047242, + "loss": 2.6912, + "step": 7413 + }, + { + "epoch": 0.6716948653484632, + "grad_norm": 0.775570809841156, + "learning_rate": 0.00015524074185948168, + "loss": 2.9284, + "step": 7414 + }, + { + "epoch": 0.67178546352291, + "grad_norm": 0.768926203250885, + "learning_rate": 0.00015523470065849091, + "loss": 2.7134, + "step": 7415 + }, + { + "epoch": 0.6718760616973568, + "grad_norm": 0.7987819314002991, + "learning_rate": 0.00015522865945750015, + "loss": 2.8604, + "step": 7416 + }, + { + "epoch": 0.6719666598718036, + "grad_norm": 0.7515825629234314, + "learning_rate": 0.0001552226182565094, + "loss": 2.0882, + "step": 7417 + }, + { + "epoch": 0.6720572580462504, + "grad_norm": 0.7921103239059448, + "learning_rate": 0.00015521657705551864, + "loss": 2.799, + "step": 7418 + }, + { + "epoch": 0.6721478562206972, + "grad_norm": 0.7675628662109375, + "learning_rate": 0.0001552105358545279, + "loss": 2.8215, + "step": 7419 + }, + { + "epoch": 0.672238454395144, + "grad_norm": 0.8855920433998108, + "learning_rate": 0.0001552044946535371, + "loss": 2.9965, + "step": 7420 + }, + { + "epoch": 0.6723290525695907, + "grad_norm": 0.7557165026664734, + "learning_rate": 0.00015519845345254637, + "loss": 2.717, + "step": 7421 + }, + { + "epoch": 0.6724196507440375, + "grad_norm": 0.8151484131813049, + "learning_rate": 0.0001551924122515556, + "loss": 2.7903, + "step": 7422 + }, + { + "epoch": 0.6725102489184843, + "grad_norm": 0.7726618051528931, + "learning_rate": 0.00015518637105056487, + "loss": 2.6691, + "step": 7423 + }, + { + "epoch": 0.6726008470929311, + "grad_norm": 0.7676268219947815, + "learning_rate": 0.0001551803298495741, + "loss": 2.8111, + "step": 7424 + }, + { + "epoch": 0.6726914452673779, + "grad_norm": 0.8124436736106873, + "learning_rate": 0.00015517428864858334, + "loss": 2.6461, + "step": 7425 + }, + { + "epoch": 0.6727820434418247, + "grad_norm": 0.7410966157913208, + "learning_rate": 0.0001551682474475926, + "loss": 2.6946, + "step": 7426 + }, + { + "epoch": 0.6728726416162715, + "grad_norm": 0.7767509818077087, + "learning_rate": 0.00015516220624660183, + "loss": 2.7166, + "step": 7427 + }, + { + "epoch": 0.6729632397907183, + "grad_norm": 0.8046210408210754, + "learning_rate": 0.0001551561650456111, + "loss": 2.7925, + "step": 7428 + }, + { + "epoch": 0.673053837965165, + "grad_norm": 0.8941406607627869, + "learning_rate": 0.0001551501238446203, + "loss": 2.7148, + "step": 7429 + }, + { + "epoch": 0.6731444361396118, + "grad_norm": 0.7692221403121948, + "learning_rate": 0.00015514408264362956, + "loss": 2.8503, + "step": 7430 + }, + { + "epoch": 0.6732350343140586, + "grad_norm": 0.7991165518760681, + "learning_rate": 0.00015513804144263882, + "loss": 2.8561, + "step": 7431 + }, + { + "epoch": 0.6733256324885054, + "grad_norm": 0.5419783592224121, + "learning_rate": 0.00015513200024164806, + "loss": 1.2, + "step": 7432 + }, + { + "epoch": 0.6734162306629521, + "grad_norm": 0.7754290699958801, + "learning_rate": 0.0001551259590406573, + "loss": 2.8211, + "step": 7433 + }, + { + "epoch": 0.6735068288373989, + "grad_norm": 0.7549805045127869, + "learning_rate": 0.00015511991783966652, + "loss": 2.6007, + "step": 7434 + }, + { + "epoch": 0.6735974270118457, + "grad_norm": 0.7720409631729126, + "learning_rate": 0.00015511387663867578, + "loss": 2.7591, + "step": 7435 + }, + { + "epoch": 0.6736880251862924, + "grad_norm": 0.7719963788986206, + "learning_rate": 0.00015510783543768502, + "loss": 2.7551, + "step": 7436 + }, + { + "epoch": 0.6737786233607392, + "grad_norm": 0.7369260787963867, + "learning_rate": 0.00015510179423669425, + "loss": 2.5497, + "step": 7437 + }, + { + "epoch": 0.673869221535186, + "grad_norm": 1.0344539880752563, + "learning_rate": 0.0001550957530357035, + "loss": 2.707, + "step": 7438 + }, + { + "epoch": 0.6739598197096328, + "grad_norm": 0.7255682945251465, + "learning_rate": 0.00015508971183471275, + "loss": 2.7285, + "step": 7439 + }, + { + "epoch": 0.6740504178840796, + "grad_norm": 0.9144217371940613, + "learning_rate": 0.000155083670633722, + "loss": 2.5005, + "step": 7440 + }, + { + "epoch": 0.6741410160585264, + "grad_norm": 0.8209573030471802, + "learning_rate": 0.00015507762943273124, + "loss": 2.9622, + "step": 7441 + }, + { + "epoch": 0.6742316142329732, + "grad_norm": 0.8095264434814453, + "learning_rate": 0.00015507158823174048, + "loss": 2.5952, + "step": 7442 + }, + { + "epoch": 0.67432221240742, + "grad_norm": 0.7525365948677063, + "learning_rate": 0.0001550655470307497, + "loss": 2.872, + "step": 7443 + }, + { + "epoch": 0.6744128105818668, + "grad_norm": 0.8029952049255371, + "learning_rate": 0.00015505950582975897, + "loss": 2.8209, + "step": 7444 + }, + { + "epoch": 0.6745034087563135, + "grad_norm": 0.9575304985046387, + "learning_rate": 0.0001550534646287682, + "loss": 2.993, + "step": 7445 + }, + { + "epoch": 0.6745940069307603, + "grad_norm": 0.8123261332511902, + "learning_rate": 0.00015504742342777744, + "loss": 2.8947, + "step": 7446 + }, + { + "epoch": 0.6746846051052071, + "grad_norm": 0.7919873595237732, + "learning_rate": 0.0001550413822267867, + "loss": 2.9143, + "step": 7447 + }, + { + "epoch": 0.6747752032796539, + "grad_norm": 0.7428973913192749, + "learning_rate": 0.00015503534102579594, + "loss": 1.9439, + "step": 7448 + }, + { + "epoch": 0.6748658014541007, + "grad_norm": 0.8031386733055115, + "learning_rate": 0.0001550292998248052, + "loss": 2.6086, + "step": 7449 + }, + { + "epoch": 0.6749563996285475, + "grad_norm": 0.793606698513031, + "learning_rate": 0.0001550232586238144, + "loss": 1.9595, + "step": 7450 + }, + { + "epoch": 0.6750469978029943, + "grad_norm": 0.9845926761627197, + "learning_rate": 0.00015501721742282367, + "loss": 2.8279, + "step": 7451 + }, + { + "epoch": 0.675137595977441, + "grad_norm": 0.8875769972801208, + "learning_rate": 0.0001550111762218329, + "loss": 2.8799, + "step": 7452 + }, + { + "epoch": 0.6752281941518878, + "grad_norm": 0.7860258221626282, + "learning_rate": 0.00015500513502084216, + "loss": 2.8114, + "step": 7453 + }, + { + "epoch": 0.6753187923263346, + "grad_norm": 0.8451524376869202, + "learning_rate": 0.0001549990938198514, + "loss": 2.8224, + "step": 7454 + }, + { + "epoch": 0.6754093905007814, + "grad_norm": 0.8326388001441956, + "learning_rate": 0.00015499305261886063, + "loss": 2.8629, + "step": 7455 + }, + { + "epoch": 0.6754999886752282, + "grad_norm": 0.7709252238273621, + "learning_rate": 0.0001549870114178699, + "loss": 2.8969, + "step": 7456 + }, + { + "epoch": 0.675590586849675, + "grad_norm": 0.7519417405128479, + "learning_rate": 0.00015498097021687912, + "loss": 2.5368, + "step": 7457 + }, + { + "epoch": 0.6756811850241218, + "grad_norm": 0.8117855191230774, + "learning_rate": 0.00015497492901588836, + "loss": 2.6997, + "step": 7458 + }, + { + "epoch": 0.6757717831985686, + "grad_norm": 0.7795746326446533, + "learning_rate": 0.0001549688878148976, + "loss": 2.6841, + "step": 7459 + }, + { + "epoch": 0.6758623813730154, + "grad_norm": 0.7704284191131592, + "learning_rate": 0.00015496284661390685, + "loss": 2.842, + "step": 7460 + }, + { + "epoch": 0.6759529795474621, + "grad_norm": 0.838564932346344, + "learning_rate": 0.00015495680541291611, + "loss": 2.6797, + "step": 7461 + }, + { + "epoch": 0.6760435777219089, + "grad_norm": 0.7720712423324585, + "learning_rate": 0.00015495076421192535, + "loss": 2.8156, + "step": 7462 + }, + { + "epoch": 0.6761341758963557, + "grad_norm": 0.7437976002693176, + "learning_rate": 0.00015494472301093458, + "loss": 2.6574, + "step": 7463 + }, + { + "epoch": 0.6762247740708025, + "grad_norm": 0.7774302959442139, + "learning_rate": 0.00015493868180994382, + "loss": 2.9871, + "step": 7464 + }, + { + "epoch": 0.6763153722452493, + "grad_norm": 0.904835045337677, + "learning_rate": 0.00015493264060895308, + "loss": 2.6133, + "step": 7465 + }, + { + "epoch": 0.6764059704196961, + "grad_norm": 0.7781492471694946, + "learning_rate": 0.0001549265994079623, + "loss": 2.8266, + "step": 7466 + }, + { + "epoch": 0.6764965685941429, + "grad_norm": 0.8087309002876282, + "learning_rate": 0.00015492055820697155, + "loss": 2.7273, + "step": 7467 + }, + { + "epoch": 0.6765871667685897, + "grad_norm": 0.7677531242370605, + "learning_rate": 0.00015491451700598078, + "loss": 2.6005, + "step": 7468 + }, + { + "epoch": 0.6766777649430364, + "grad_norm": 0.7872472405433655, + "learning_rate": 0.00015490847580499004, + "loss": 2.8009, + "step": 7469 + }, + { + "epoch": 0.6767683631174832, + "grad_norm": 0.7815655469894409, + "learning_rate": 0.0001549024346039993, + "loss": 2.823, + "step": 7470 + }, + { + "epoch": 0.67685896129193, + "grad_norm": 0.8161665201187134, + "learning_rate": 0.0001548963934030085, + "loss": 2.7197, + "step": 7471 + }, + { + "epoch": 0.6769495594663768, + "grad_norm": 0.8262826204299927, + "learning_rate": 0.00015489035220201777, + "loss": 2.8345, + "step": 7472 + }, + { + "epoch": 0.6770401576408236, + "grad_norm": 0.8102785348892212, + "learning_rate": 0.000154884311001027, + "loss": 2.7805, + "step": 7473 + }, + { + "epoch": 0.6771307558152703, + "grad_norm": 0.7973754405975342, + "learning_rate": 0.00015487826980003627, + "loss": 2.6725, + "step": 7474 + }, + { + "epoch": 0.6772213539897171, + "grad_norm": 0.8210186958312988, + "learning_rate": 0.0001548722285990455, + "loss": 2.7528, + "step": 7475 + }, + { + "epoch": 0.6773119521641638, + "grad_norm": 0.7375604510307312, + "learning_rate": 0.00015486618739805473, + "loss": 2.7173, + "step": 7476 + }, + { + "epoch": 0.6774025503386106, + "grad_norm": 0.7294208407402039, + "learning_rate": 0.000154860146197064, + "loss": 2.939, + "step": 7477 + }, + { + "epoch": 0.6774931485130574, + "grad_norm": 0.7925382852554321, + "learning_rate": 0.00015485410499607323, + "loss": 2.9478, + "step": 7478 + }, + { + "epoch": 0.6775837466875042, + "grad_norm": 0.6837471723556519, + "learning_rate": 0.0001548480637950825, + "loss": 2.1284, + "step": 7479 + }, + { + "epoch": 0.677674344861951, + "grad_norm": 0.732401430606842, + "learning_rate": 0.0001548420225940917, + "loss": 2.5745, + "step": 7480 + }, + { + "epoch": 0.6777649430363978, + "grad_norm": 0.7511626482009888, + "learning_rate": 0.00015483598139310096, + "loss": 2.7348, + "step": 7481 + }, + { + "epoch": 0.6778555412108446, + "grad_norm": 0.7945426106452942, + "learning_rate": 0.0001548299401921102, + "loss": 2.9181, + "step": 7482 + }, + { + "epoch": 0.6779461393852914, + "grad_norm": 0.7590388655662537, + "learning_rate": 0.00015482389899111945, + "loss": 2.9354, + "step": 7483 + }, + { + "epoch": 0.6780367375597381, + "grad_norm": 0.8519288301467896, + "learning_rate": 0.0001548178577901287, + "loss": 2.7692, + "step": 7484 + }, + { + "epoch": 0.6781273357341849, + "grad_norm": 0.8291605710983276, + "learning_rate": 0.00015481181658913792, + "loss": 2.7952, + "step": 7485 + }, + { + "epoch": 0.6782179339086317, + "grad_norm": 0.7914581298828125, + "learning_rate": 0.00015480577538814718, + "loss": 2.9481, + "step": 7486 + }, + { + "epoch": 0.6783085320830785, + "grad_norm": 0.885530412197113, + "learning_rate": 0.00015479973418715642, + "loss": 2.9845, + "step": 7487 + }, + { + "epoch": 0.6783991302575253, + "grad_norm": 0.6503614783287048, + "learning_rate": 0.00015479369298616565, + "loss": 2.0261, + "step": 7488 + }, + { + "epoch": 0.6784897284319721, + "grad_norm": 0.7855043411254883, + "learning_rate": 0.00015478765178517488, + "loss": 2.9996, + "step": 7489 + }, + { + "epoch": 0.6785803266064189, + "grad_norm": 0.7265236377716064, + "learning_rate": 0.00015478161058418415, + "loss": 2.6142, + "step": 7490 + }, + { + "epoch": 0.6786709247808657, + "grad_norm": 0.7905483841896057, + "learning_rate": 0.0001547755693831934, + "loss": 2.9335, + "step": 7491 + }, + { + "epoch": 0.6787615229553124, + "grad_norm": 0.813042163848877, + "learning_rate": 0.00015476952818220264, + "loss": 2.9659, + "step": 7492 + }, + { + "epoch": 0.6788521211297592, + "grad_norm": 0.8011922240257263, + "learning_rate": 0.00015476348698121188, + "loss": 2.7032, + "step": 7493 + }, + { + "epoch": 0.678942719304206, + "grad_norm": 0.7819845080375671, + "learning_rate": 0.0001547574457802211, + "loss": 2.7541, + "step": 7494 + }, + { + "epoch": 0.6790333174786528, + "grad_norm": 0.7678344249725342, + "learning_rate": 0.00015475140457923037, + "loss": 2.6586, + "step": 7495 + }, + { + "epoch": 0.6791239156530996, + "grad_norm": 0.8654134273529053, + "learning_rate": 0.0001547453633782396, + "loss": 2.1679, + "step": 7496 + }, + { + "epoch": 0.6792145138275464, + "grad_norm": 0.7945859432220459, + "learning_rate": 0.00015473932217724884, + "loss": 2.8584, + "step": 7497 + }, + { + "epoch": 0.6793051120019932, + "grad_norm": 0.7867735624313354, + "learning_rate": 0.00015473328097625807, + "loss": 2.6919, + "step": 7498 + }, + { + "epoch": 0.67939571017644, + "grad_norm": 0.7877516746520996, + "learning_rate": 0.00015472723977526733, + "loss": 2.7536, + "step": 7499 + }, + { + "epoch": 0.6794863083508867, + "grad_norm": 0.747519850730896, + "learning_rate": 0.0001547211985742766, + "loss": 2.3252, + "step": 7500 + }, + { + "epoch": 0.6795769065253335, + "grad_norm": 0.7867474555969238, + "learning_rate": 0.0001547151573732858, + "loss": 2.872, + "step": 7501 + }, + { + "epoch": 0.6796675046997803, + "grad_norm": 0.8208909630775452, + "learning_rate": 0.00015470911617229506, + "loss": 2.682, + "step": 7502 + }, + { + "epoch": 0.6797581028742271, + "grad_norm": 0.9172671437263489, + "learning_rate": 0.0001547030749713043, + "loss": 2.6561, + "step": 7503 + }, + { + "epoch": 0.6798487010486739, + "grad_norm": 0.7627586126327515, + "learning_rate": 0.00015469703377031356, + "loss": 2.7274, + "step": 7504 + }, + { + "epoch": 0.6799392992231207, + "grad_norm": 0.8106408715248108, + "learning_rate": 0.0001546909925693228, + "loss": 3.0148, + "step": 7505 + }, + { + "epoch": 0.6800298973975675, + "grad_norm": 0.8616191148757935, + "learning_rate": 0.00015468495136833203, + "loss": 2.6896, + "step": 7506 + }, + { + "epoch": 0.6801204955720143, + "grad_norm": 0.7891805171966553, + "learning_rate": 0.0001546789101673413, + "loss": 2.8468, + "step": 7507 + }, + { + "epoch": 0.680211093746461, + "grad_norm": 0.7972517013549805, + "learning_rate": 0.00015467286896635052, + "loss": 2.9555, + "step": 7508 + }, + { + "epoch": 0.6803016919209078, + "grad_norm": 0.7949333190917969, + "learning_rate": 0.00015466682776535976, + "loss": 2.6453, + "step": 7509 + }, + { + "epoch": 0.6803922900953546, + "grad_norm": 0.8110067248344421, + "learning_rate": 0.000154660786564369, + "loss": 2.9859, + "step": 7510 + }, + { + "epoch": 0.6804828882698014, + "grad_norm": 0.8269016146659851, + "learning_rate": 0.00015465474536337825, + "loss": 2.8881, + "step": 7511 + }, + { + "epoch": 0.6805734864442482, + "grad_norm": 0.8383034467697144, + "learning_rate": 0.00015464870416238748, + "loss": 2.7762, + "step": 7512 + }, + { + "epoch": 0.680664084618695, + "grad_norm": 0.7870425581932068, + "learning_rate": 0.00015464266296139675, + "loss": 2.7416, + "step": 7513 + }, + { + "epoch": 0.6807546827931417, + "grad_norm": 0.757256805896759, + "learning_rate": 0.00015463662176040598, + "loss": 2.0495, + "step": 7514 + }, + { + "epoch": 0.6808452809675885, + "grad_norm": 0.8238621950149536, + "learning_rate": 0.00015463058055941521, + "loss": 3.0212, + "step": 7515 + }, + { + "epoch": 0.6809358791420352, + "grad_norm": 0.9017121195793152, + "learning_rate": 0.00015462453935842448, + "loss": 2.8885, + "step": 7516 + }, + { + "epoch": 0.681026477316482, + "grad_norm": 0.8056957721710205, + "learning_rate": 0.0001546184981574337, + "loss": 2.6785, + "step": 7517 + }, + { + "epoch": 0.6811170754909288, + "grad_norm": 0.7624282240867615, + "learning_rate": 0.00015461245695644294, + "loss": 2.8559, + "step": 7518 + }, + { + "epoch": 0.6812076736653756, + "grad_norm": 0.738038957118988, + "learning_rate": 0.00015460641575545218, + "loss": 2.7021, + "step": 7519 + }, + { + "epoch": 0.6812982718398224, + "grad_norm": 0.8403162360191345, + "learning_rate": 0.00015460037455446144, + "loss": 2.7712, + "step": 7520 + }, + { + "epoch": 0.6813888700142692, + "grad_norm": 0.7750338912010193, + "learning_rate": 0.0001545943333534707, + "loss": 3.0248, + "step": 7521 + }, + { + "epoch": 0.681479468188716, + "grad_norm": 0.8099494576454163, + "learning_rate": 0.0001545882921524799, + "loss": 2.7561, + "step": 7522 + }, + { + "epoch": 0.6815700663631628, + "grad_norm": 0.8244946002960205, + "learning_rate": 0.00015458225095148917, + "loss": 2.7826, + "step": 7523 + }, + { + "epoch": 0.6816606645376095, + "grad_norm": 0.7904675602912903, + "learning_rate": 0.0001545762097504984, + "loss": 2.7469, + "step": 7524 + }, + { + "epoch": 0.6817512627120563, + "grad_norm": 0.6762490272521973, + "learning_rate": 0.00015457016854950766, + "loss": 2.1495, + "step": 7525 + }, + { + "epoch": 0.6818418608865031, + "grad_norm": 0.7817524671554565, + "learning_rate": 0.0001545641273485169, + "loss": 2.6189, + "step": 7526 + }, + { + "epoch": 0.6819324590609499, + "grad_norm": 0.7685588598251343, + "learning_rate": 0.00015455808614752613, + "loss": 2.9233, + "step": 7527 + }, + { + "epoch": 0.6820230572353967, + "grad_norm": 0.8074855208396912, + "learning_rate": 0.00015455204494653537, + "loss": 2.0455, + "step": 7528 + }, + { + "epoch": 0.6821136554098435, + "grad_norm": 0.7534730434417725, + "learning_rate": 0.00015454600374554463, + "loss": 2.688, + "step": 7529 + }, + { + "epoch": 0.6822042535842903, + "grad_norm": 0.8489305973052979, + "learning_rate": 0.00015453996254455386, + "loss": 3.0212, + "step": 7530 + }, + { + "epoch": 0.682294851758737, + "grad_norm": 0.7809769511222839, + "learning_rate": 0.0001545339213435631, + "loss": 2.6223, + "step": 7531 + }, + { + "epoch": 0.6823854499331838, + "grad_norm": 0.8152599930763245, + "learning_rate": 0.00015452788014257236, + "loss": 2.9314, + "step": 7532 + }, + { + "epoch": 0.6824760481076306, + "grad_norm": 0.79545658826828, + "learning_rate": 0.0001545218389415816, + "loss": 2.6807, + "step": 7533 + }, + { + "epoch": 0.6825666462820774, + "grad_norm": 0.7742835879325867, + "learning_rate": 0.00015451579774059085, + "loss": 2.0795, + "step": 7534 + }, + { + "epoch": 0.6826572444565242, + "grad_norm": 0.8037658333778381, + "learning_rate": 0.00015450975653960006, + "loss": 3.0734, + "step": 7535 + }, + { + "epoch": 0.682747842630971, + "grad_norm": 0.7789700031280518, + "learning_rate": 0.00015450371533860932, + "loss": 2.8671, + "step": 7536 + }, + { + "epoch": 0.6828384408054178, + "grad_norm": 0.7636655569076538, + "learning_rate": 0.00015449767413761858, + "loss": 2.7142, + "step": 7537 + }, + { + "epoch": 0.6829290389798646, + "grad_norm": 0.8135193586349487, + "learning_rate": 0.00015449163293662781, + "loss": 2.9321, + "step": 7538 + }, + { + "epoch": 0.6830196371543114, + "grad_norm": 0.846886157989502, + "learning_rate": 0.00015448559173563705, + "loss": 2.8359, + "step": 7539 + }, + { + "epoch": 0.6831102353287581, + "grad_norm": 0.7404038906097412, + "learning_rate": 0.00015447955053464628, + "loss": 2.4248, + "step": 7540 + }, + { + "epoch": 0.6832008335032049, + "grad_norm": 0.753184974193573, + "learning_rate": 0.00015447350933365554, + "loss": 2.7087, + "step": 7541 + }, + { + "epoch": 0.6832914316776517, + "grad_norm": 0.7626619338989258, + "learning_rate": 0.00015446746813266478, + "loss": 2.9681, + "step": 7542 + }, + { + "epoch": 0.6833820298520985, + "grad_norm": 0.7743733525276184, + "learning_rate": 0.000154461426931674, + "loss": 2.7117, + "step": 7543 + }, + { + "epoch": 0.6834726280265453, + "grad_norm": 0.8250136971473694, + "learning_rate": 0.00015445538573068327, + "loss": 2.799, + "step": 7544 + }, + { + "epoch": 0.6835632262009921, + "grad_norm": 0.7786048650741577, + "learning_rate": 0.0001544493445296925, + "loss": 2.8803, + "step": 7545 + }, + { + "epoch": 0.6836538243754389, + "grad_norm": 0.7638477683067322, + "learning_rate": 0.00015444330332870177, + "loss": 2.8697, + "step": 7546 + }, + { + "epoch": 0.6837444225498857, + "grad_norm": 0.7815786004066467, + "learning_rate": 0.000154437262127711, + "loss": 2.7538, + "step": 7547 + }, + { + "epoch": 0.6838350207243324, + "grad_norm": 0.7829124331474304, + "learning_rate": 0.00015443122092672024, + "loss": 2.8276, + "step": 7548 + }, + { + "epoch": 0.6839256188987792, + "grad_norm": 0.7980673313140869, + "learning_rate": 0.00015442517972572947, + "loss": 2.9321, + "step": 7549 + }, + { + "epoch": 0.684016217073226, + "grad_norm": 0.7556538581848145, + "learning_rate": 0.00015441913852473873, + "loss": 2.7629, + "step": 7550 + }, + { + "epoch": 0.6841068152476728, + "grad_norm": 0.7825047373771667, + "learning_rate": 0.000154413097323748, + "loss": 2.8342, + "step": 7551 + }, + { + "epoch": 0.6841974134221196, + "grad_norm": 0.7558799982070923, + "learning_rate": 0.0001544070561227572, + "loss": 2.7742, + "step": 7552 + }, + { + "epoch": 0.6842880115965664, + "grad_norm": 0.7427237033843994, + "learning_rate": 0.00015440101492176646, + "loss": 2.7895, + "step": 7553 + }, + { + "epoch": 0.6843786097710132, + "grad_norm": 0.7915409803390503, + "learning_rate": 0.0001543949737207757, + "loss": 2.6774, + "step": 7554 + }, + { + "epoch": 0.6844692079454598, + "grad_norm": 0.8128548264503479, + "learning_rate": 0.00015438893251978496, + "loss": 2.8869, + "step": 7555 + }, + { + "epoch": 0.6845598061199066, + "grad_norm": 0.8470263481140137, + "learning_rate": 0.0001543828913187942, + "loss": 2.9722, + "step": 7556 + }, + { + "epoch": 0.6846504042943534, + "grad_norm": 0.757692277431488, + "learning_rate": 0.00015437685011780342, + "loss": 2.7774, + "step": 7557 + }, + { + "epoch": 0.6847410024688002, + "grad_norm": 0.7870338559150696, + "learning_rate": 0.00015437080891681266, + "loss": 2.8425, + "step": 7558 + }, + { + "epoch": 0.684831600643247, + "grad_norm": 0.7925295829772949, + "learning_rate": 0.00015436476771582192, + "loss": 2.9911, + "step": 7559 + }, + { + "epoch": 0.6849221988176938, + "grad_norm": 0.7866104245185852, + "learning_rate": 0.00015435872651483115, + "loss": 2.7965, + "step": 7560 + }, + { + "epoch": 0.6850127969921406, + "grad_norm": 0.790723979473114, + "learning_rate": 0.0001543526853138404, + "loss": 2.7222, + "step": 7561 + }, + { + "epoch": 0.6851033951665874, + "grad_norm": 0.837399423122406, + "learning_rate": 0.00015434664411284965, + "loss": 3.3062, + "step": 7562 + }, + { + "epoch": 0.6851939933410341, + "grad_norm": 0.7540143728256226, + "learning_rate": 0.00015434060291185888, + "loss": 2.7956, + "step": 7563 + }, + { + "epoch": 0.6852845915154809, + "grad_norm": 0.8613789081573486, + "learning_rate": 0.00015433456171086814, + "loss": 2.8619, + "step": 7564 + }, + { + "epoch": 0.6853751896899277, + "grad_norm": 0.7518636584281921, + "learning_rate": 0.00015432852050987735, + "loss": 2.6646, + "step": 7565 + }, + { + "epoch": 0.6854657878643745, + "grad_norm": 0.7530277371406555, + "learning_rate": 0.0001543224793088866, + "loss": 2.6941, + "step": 7566 + }, + { + "epoch": 0.6855563860388213, + "grad_norm": 0.7520513534545898, + "learning_rate": 0.00015431643810789587, + "loss": 2.7739, + "step": 7567 + }, + { + "epoch": 0.6856469842132681, + "grad_norm": 0.7579111456871033, + "learning_rate": 0.0001543103969069051, + "loss": 2.6239, + "step": 7568 + }, + { + "epoch": 0.6857375823877149, + "grad_norm": 0.7417479753494263, + "learning_rate": 0.00015430435570591434, + "loss": 2.9354, + "step": 7569 + }, + { + "epoch": 0.6858281805621617, + "grad_norm": 0.7866603136062622, + "learning_rate": 0.00015429831450492357, + "loss": 2.9271, + "step": 7570 + }, + { + "epoch": 0.6859187787366084, + "grad_norm": 0.7717056274414062, + "learning_rate": 0.00015429227330393284, + "loss": 2.8153, + "step": 7571 + }, + { + "epoch": 0.6860093769110552, + "grad_norm": 0.7712816596031189, + "learning_rate": 0.00015428623210294207, + "loss": 2.7954, + "step": 7572 + }, + { + "epoch": 0.686099975085502, + "grad_norm": 0.7964044213294983, + "learning_rate": 0.0001542801909019513, + "loss": 2.8285, + "step": 7573 + }, + { + "epoch": 0.6861905732599488, + "grad_norm": 0.8093847632408142, + "learning_rate": 0.00015427414970096057, + "loss": 2.7167, + "step": 7574 + }, + { + "epoch": 0.6862811714343956, + "grad_norm": 0.7555229067802429, + "learning_rate": 0.0001542681084999698, + "loss": 2.5882, + "step": 7575 + }, + { + "epoch": 0.6863717696088424, + "grad_norm": 0.8036696314811707, + "learning_rate": 0.00015426206729897906, + "loss": 2.7547, + "step": 7576 + }, + { + "epoch": 0.6864623677832892, + "grad_norm": 0.7290729880332947, + "learning_rate": 0.0001542560260979883, + "loss": 2.5497, + "step": 7577 + }, + { + "epoch": 0.686552965957736, + "grad_norm": 0.676535427570343, + "learning_rate": 0.00015424998489699753, + "loss": 2.0552, + "step": 7578 + }, + { + "epoch": 0.6866435641321827, + "grad_norm": 0.7938894033432007, + "learning_rate": 0.00015424394369600676, + "loss": 2.6428, + "step": 7579 + }, + { + "epoch": 0.6867341623066295, + "grad_norm": 0.8095124959945679, + "learning_rate": 0.00015423790249501602, + "loss": 2.9834, + "step": 7580 + }, + { + "epoch": 0.6868247604810763, + "grad_norm": 0.8195133209228516, + "learning_rate": 0.00015423186129402526, + "loss": 2.8355, + "step": 7581 + }, + { + "epoch": 0.6869153586555231, + "grad_norm": 0.7471675872802734, + "learning_rate": 0.0001542258200930345, + "loss": 2.8134, + "step": 7582 + }, + { + "epoch": 0.6870059568299699, + "grad_norm": 0.8057431578636169, + "learning_rate": 0.00015421977889204375, + "loss": 2.8225, + "step": 7583 + }, + { + "epoch": 0.6870965550044167, + "grad_norm": 0.802756130695343, + "learning_rate": 0.000154213737691053, + "loss": 2.8674, + "step": 7584 + }, + { + "epoch": 0.6871871531788635, + "grad_norm": 0.7600396871566772, + "learning_rate": 0.00015420769649006225, + "loss": 2.8762, + "step": 7585 + }, + { + "epoch": 0.6872777513533103, + "grad_norm": 0.7813625335693359, + "learning_rate": 0.00015420165528907146, + "loss": 3.1554, + "step": 7586 + }, + { + "epoch": 0.687368349527757, + "grad_norm": 0.7734324336051941, + "learning_rate": 0.00015419561408808072, + "loss": 2.7395, + "step": 7587 + }, + { + "epoch": 0.6874589477022038, + "grad_norm": 0.7528666853904724, + "learning_rate": 0.00015418957288708995, + "loss": 2.7315, + "step": 7588 + }, + { + "epoch": 0.6875495458766506, + "grad_norm": 0.7696382403373718, + "learning_rate": 0.0001541835316860992, + "loss": 2.9069, + "step": 7589 + }, + { + "epoch": 0.6876401440510974, + "grad_norm": 0.8370119333267212, + "learning_rate": 0.00015417749048510845, + "loss": 2.7635, + "step": 7590 + }, + { + "epoch": 0.6877307422255442, + "grad_norm": 0.8012731671333313, + "learning_rate": 0.00015417144928411768, + "loss": 2.7538, + "step": 7591 + }, + { + "epoch": 0.687821340399991, + "grad_norm": 0.7768990993499756, + "learning_rate": 0.00015416540808312694, + "loss": 2.9933, + "step": 7592 + }, + { + "epoch": 0.6879119385744378, + "grad_norm": 0.7826328873634338, + "learning_rate": 0.00015415936688213617, + "loss": 2.6737, + "step": 7593 + }, + { + "epoch": 0.6880025367488846, + "grad_norm": 0.8613027930259705, + "learning_rate": 0.0001541533256811454, + "loss": 2.7127, + "step": 7594 + }, + { + "epoch": 0.6880931349233312, + "grad_norm": 0.851296067237854, + "learning_rate": 0.00015414728448015464, + "loss": 2.6402, + "step": 7595 + }, + { + "epoch": 0.688183733097778, + "grad_norm": 0.816107988357544, + "learning_rate": 0.0001541412432791639, + "loss": 3.1428, + "step": 7596 + }, + { + "epoch": 0.6882743312722248, + "grad_norm": 0.8224785923957825, + "learning_rate": 0.00015413520207817317, + "loss": 2.8309, + "step": 7597 + }, + { + "epoch": 0.6883649294466716, + "grad_norm": 0.7262051701545715, + "learning_rate": 0.0001541291608771824, + "loss": 2.5108, + "step": 7598 + }, + { + "epoch": 0.6884555276211184, + "grad_norm": 0.7988467216491699, + "learning_rate": 0.00015412311967619163, + "loss": 2.8777, + "step": 7599 + }, + { + "epoch": 0.6885461257955652, + "grad_norm": 0.8398341536521912, + "learning_rate": 0.00015411707847520087, + "loss": 3.0257, + "step": 7600 + }, + { + "epoch": 0.688636723970012, + "grad_norm": 0.7610725164413452, + "learning_rate": 0.00015411103727421013, + "loss": 2.6582, + "step": 7601 + }, + { + "epoch": 0.6887273221444588, + "grad_norm": 0.8263300061225891, + "learning_rate": 0.00015410499607321936, + "loss": 2.858, + "step": 7602 + }, + { + "epoch": 0.6888179203189055, + "grad_norm": 0.7225262522697449, + "learning_rate": 0.0001540989548722286, + "loss": 2.0566, + "step": 7603 + }, + { + "epoch": 0.6889085184933523, + "grad_norm": 0.7747693657875061, + "learning_rate": 0.00015409291367123786, + "loss": 2.7099, + "step": 7604 + }, + { + "epoch": 0.6889991166677991, + "grad_norm": 0.7878962755203247, + "learning_rate": 0.0001540868724702471, + "loss": 2.6348, + "step": 7605 + }, + { + "epoch": 0.6890897148422459, + "grad_norm": 0.7959868311882019, + "learning_rate": 0.00015408083126925635, + "loss": 2.5745, + "step": 7606 + }, + { + "epoch": 0.6891803130166927, + "grad_norm": 0.7982162833213806, + "learning_rate": 0.00015407479006826556, + "loss": 2.6366, + "step": 7607 + }, + { + "epoch": 0.6892709111911395, + "grad_norm": 0.8138356804847717, + "learning_rate": 0.00015406874886727482, + "loss": 2.7824, + "step": 7608 + }, + { + "epoch": 0.6893615093655863, + "grad_norm": 0.803126335144043, + "learning_rate": 0.00015406270766628406, + "loss": 2.8247, + "step": 7609 + }, + { + "epoch": 0.689452107540033, + "grad_norm": 0.7989891767501831, + "learning_rate": 0.00015405666646529332, + "loss": 2.9266, + "step": 7610 + }, + { + "epoch": 0.6895427057144798, + "grad_norm": 0.7921611070632935, + "learning_rate": 0.00015405062526430255, + "loss": 2.8157, + "step": 7611 + }, + { + "epoch": 0.6896333038889266, + "grad_norm": 0.7977306246757507, + "learning_rate": 0.00015404458406331178, + "loss": 2.8268, + "step": 7612 + }, + { + "epoch": 0.6897239020633734, + "grad_norm": 0.778037428855896, + "learning_rate": 0.00015403854286232105, + "loss": 2.9179, + "step": 7613 + }, + { + "epoch": 0.6898145002378202, + "grad_norm": 0.8137833476066589, + "learning_rate": 0.00015403250166133028, + "loss": 2.8044, + "step": 7614 + }, + { + "epoch": 0.689905098412267, + "grad_norm": 0.8530728220939636, + "learning_rate": 0.00015402646046033954, + "loss": 2.9251, + "step": 7615 + }, + { + "epoch": 0.6899956965867138, + "grad_norm": 0.7559429407119751, + "learning_rate": 0.00015402041925934875, + "loss": 2.6778, + "step": 7616 + }, + { + "epoch": 0.6900862947611606, + "grad_norm": 0.8116037249565125, + "learning_rate": 0.000154014378058358, + "loss": 2.8226, + "step": 7617 + }, + { + "epoch": 0.6901768929356074, + "grad_norm": 0.8453320264816284, + "learning_rate": 0.00015400833685736724, + "loss": 2.84, + "step": 7618 + }, + { + "epoch": 0.6902674911100541, + "grad_norm": 0.6276211142539978, + "learning_rate": 0.0001540022956563765, + "loss": 1.9363, + "step": 7619 + }, + { + "epoch": 0.6903580892845009, + "grad_norm": 0.6778476238250732, + "learning_rate": 0.00015399625445538574, + "loss": 1.9989, + "step": 7620 + }, + { + "epoch": 0.6904486874589477, + "grad_norm": 0.7800390124320984, + "learning_rate": 0.00015399021325439497, + "loss": 2.8846, + "step": 7621 + }, + { + "epoch": 0.6905392856333945, + "grad_norm": 0.7912356853485107, + "learning_rate": 0.00015398417205340423, + "loss": 3.1103, + "step": 7622 + }, + { + "epoch": 0.6906298838078413, + "grad_norm": 0.8396485447883606, + "learning_rate": 0.00015397813085241347, + "loss": 2.7235, + "step": 7623 + }, + { + "epoch": 0.6907204819822881, + "grad_norm": 0.817734956741333, + "learning_rate": 0.0001539720896514227, + "loss": 2.6133, + "step": 7624 + }, + { + "epoch": 0.6908110801567349, + "grad_norm": 0.850747287273407, + "learning_rate": 0.00015396604845043194, + "loss": 2.5258, + "step": 7625 + }, + { + "epoch": 0.6909016783311817, + "grad_norm": 0.8047150373458862, + "learning_rate": 0.0001539600072494412, + "loss": 2.9354, + "step": 7626 + }, + { + "epoch": 0.6909922765056284, + "grad_norm": 0.8094454407691956, + "learning_rate": 0.00015395396604845046, + "loss": 2.8607, + "step": 7627 + }, + { + "epoch": 0.6910828746800752, + "grad_norm": 0.8268592953681946, + "learning_rate": 0.0001539479248474597, + "loss": 2.0583, + "step": 7628 + }, + { + "epoch": 0.691173472854522, + "grad_norm": 0.8232091069221497, + "learning_rate": 0.00015394188364646893, + "loss": 2.9096, + "step": 7629 + }, + { + "epoch": 0.6912640710289688, + "grad_norm": 0.8495098948478699, + "learning_rate": 0.00015393584244547816, + "loss": 2.6706, + "step": 7630 + }, + { + "epoch": 0.6913546692034156, + "grad_norm": 0.7920978665351868, + "learning_rate": 0.00015392980124448742, + "loss": 2.8765, + "step": 7631 + }, + { + "epoch": 0.6914452673778624, + "grad_norm": 0.7352113723754883, + "learning_rate": 0.00015392376004349666, + "loss": 2.7832, + "step": 7632 + }, + { + "epoch": 0.6915358655523092, + "grad_norm": 0.7548392415046692, + "learning_rate": 0.0001539177188425059, + "loss": 2.7503, + "step": 7633 + }, + { + "epoch": 0.691626463726756, + "grad_norm": 0.7999079823493958, + "learning_rate": 0.00015391167764151515, + "loss": 2.9759, + "step": 7634 + }, + { + "epoch": 0.6917170619012027, + "grad_norm": 0.778206467628479, + "learning_rate": 0.00015390563644052438, + "loss": 2.8768, + "step": 7635 + }, + { + "epoch": 0.6918076600756494, + "grad_norm": 0.7751704454421997, + "learning_rate": 0.00015389959523953365, + "loss": 2.9831, + "step": 7636 + }, + { + "epoch": 0.6918982582500962, + "grad_norm": 1.024889588356018, + "learning_rate": 0.00015389355403854285, + "loss": 2.1489, + "step": 7637 + }, + { + "epoch": 0.691988856424543, + "grad_norm": 0.8251684904098511, + "learning_rate": 0.00015388751283755211, + "loss": 2.9047, + "step": 7638 + }, + { + "epoch": 0.6920794545989898, + "grad_norm": 0.7770611643791199, + "learning_rate": 0.00015388147163656135, + "loss": 2.7967, + "step": 7639 + }, + { + "epoch": 0.6921700527734366, + "grad_norm": 0.7939862608909607, + "learning_rate": 0.0001538754304355706, + "loss": 2.7847, + "step": 7640 + }, + { + "epoch": 0.6922606509478834, + "grad_norm": 0.7556101679801941, + "learning_rate": 0.00015386938923457984, + "loss": 2.8536, + "step": 7641 + }, + { + "epoch": 0.6923512491223301, + "grad_norm": 0.7946643829345703, + "learning_rate": 0.00015386334803358908, + "loss": 2.7648, + "step": 7642 + }, + { + "epoch": 0.6924418472967769, + "grad_norm": 0.7586665153503418, + "learning_rate": 0.00015385730683259834, + "loss": 2.5502, + "step": 7643 + }, + { + "epoch": 0.6925324454712237, + "grad_norm": 0.7844125628471375, + "learning_rate": 0.00015385126563160757, + "loss": 2.9038, + "step": 7644 + }, + { + "epoch": 0.6926230436456705, + "grad_norm": 0.8460304141044617, + "learning_rate": 0.0001538452244306168, + "loss": 2.9947, + "step": 7645 + }, + { + "epoch": 0.6927136418201173, + "grad_norm": 0.7415556311607361, + "learning_rate": 0.00015383918322962604, + "loss": 2.8417, + "step": 7646 + }, + { + "epoch": 0.6928042399945641, + "grad_norm": 0.776774525642395, + "learning_rate": 0.0001538331420286353, + "loss": 2.7251, + "step": 7647 + }, + { + "epoch": 0.6928948381690109, + "grad_norm": 0.7708441019058228, + "learning_rate": 0.00015382710082764454, + "loss": 2.8795, + "step": 7648 + }, + { + "epoch": 0.6929854363434577, + "grad_norm": 0.7800931930541992, + "learning_rate": 0.0001538210596266538, + "loss": 2.8051, + "step": 7649 + }, + { + "epoch": 0.6930760345179044, + "grad_norm": 0.7769211530685425, + "learning_rate": 0.00015381501842566303, + "loss": 2.9264, + "step": 7650 + }, + { + "epoch": 0.6931666326923512, + "grad_norm": 0.7864177227020264, + "learning_rate": 0.00015380897722467226, + "loss": 3.07, + "step": 7651 + }, + { + "epoch": 0.693257230866798, + "grad_norm": 0.8212213516235352, + "learning_rate": 0.00015380293602368153, + "loss": 2.8828, + "step": 7652 + }, + { + "epoch": 0.6933478290412448, + "grad_norm": 1.0753145217895508, + "learning_rate": 0.00015379689482269076, + "loss": 2.1432, + "step": 7653 + }, + { + "epoch": 0.6934384272156916, + "grad_norm": 0.7669535279273987, + "learning_rate": 0.0001537908536217, + "loss": 2.6064, + "step": 7654 + }, + { + "epoch": 0.6935290253901384, + "grad_norm": 0.7521100640296936, + "learning_rate": 0.00015378481242070923, + "loss": 2.7741, + "step": 7655 + }, + { + "epoch": 0.6936196235645852, + "grad_norm": 0.8176746368408203, + "learning_rate": 0.0001537787712197185, + "loss": 2.9297, + "step": 7656 + }, + { + "epoch": 0.693710221739032, + "grad_norm": 0.7976363897323608, + "learning_rate": 0.00015377273001872775, + "loss": 2.8117, + "step": 7657 + }, + { + "epoch": 0.6938008199134788, + "grad_norm": 0.8535690903663635, + "learning_rate": 0.00015376668881773696, + "loss": 2.8026, + "step": 7658 + }, + { + "epoch": 0.6938914180879255, + "grad_norm": 0.6112192273139954, + "learning_rate": 0.00015376064761674622, + "loss": 2.01, + "step": 7659 + }, + { + "epoch": 0.6939820162623723, + "grad_norm": 0.7744585275650024, + "learning_rate": 0.00015375460641575545, + "loss": 2.7665, + "step": 7660 + }, + { + "epoch": 0.6940726144368191, + "grad_norm": 0.8453598022460938, + "learning_rate": 0.00015374856521476471, + "loss": 2.9693, + "step": 7661 + }, + { + "epoch": 0.6941632126112659, + "grad_norm": 0.8995106816291809, + "learning_rate": 0.00015374252401377395, + "loss": 2.7658, + "step": 7662 + }, + { + "epoch": 0.6942538107857127, + "grad_norm": 0.7923425436019897, + "learning_rate": 0.00015373648281278318, + "loss": 2.8185, + "step": 7663 + }, + { + "epoch": 0.6943444089601595, + "grad_norm": 0.8055721521377563, + "learning_rate": 0.00015373044161179244, + "loss": 3.0459, + "step": 7664 + }, + { + "epoch": 0.6944350071346063, + "grad_norm": 0.6745054721832275, + "learning_rate": 0.00015372440041080168, + "loss": 2.0501, + "step": 7665 + }, + { + "epoch": 0.694525605309053, + "grad_norm": 0.779097855091095, + "learning_rate": 0.00015371835920981094, + "loss": 2.7964, + "step": 7666 + }, + { + "epoch": 0.6946162034834998, + "grad_norm": 0.7901269793510437, + "learning_rate": 0.00015371231800882015, + "loss": 2.8665, + "step": 7667 + }, + { + "epoch": 0.6947068016579466, + "grad_norm": 0.9417189359664917, + "learning_rate": 0.0001537062768078294, + "loss": 2.8053, + "step": 7668 + }, + { + "epoch": 0.6947973998323934, + "grad_norm": 0.7801570296287537, + "learning_rate": 0.00015370023560683864, + "loss": 2.7202, + "step": 7669 + }, + { + "epoch": 0.6948879980068402, + "grad_norm": 0.8088344931602478, + "learning_rate": 0.0001536941944058479, + "loss": 2.6478, + "step": 7670 + }, + { + "epoch": 0.694978596181287, + "grad_norm": 0.7895446419715881, + "learning_rate": 0.00015368815320485714, + "loss": 2.7918, + "step": 7671 + }, + { + "epoch": 0.6950691943557338, + "grad_norm": 0.7112696766853333, + "learning_rate": 0.00015368211200386637, + "loss": 2.1295, + "step": 7672 + }, + { + "epoch": 0.6951597925301806, + "grad_norm": 0.7986565828323364, + "learning_rate": 0.00015367607080287563, + "loss": 2.9101, + "step": 7673 + }, + { + "epoch": 0.6952503907046274, + "grad_norm": 0.8611263632774353, + "learning_rate": 0.00015367002960188486, + "loss": 2.8753, + "step": 7674 + }, + { + "epoch": 0.6953409888790741, + "grad_norm": 0.7618210911750793, + "learning_rate": 0.0001536639884008941, + "loss": 2.771, + "step": 7675 + }, + { + "epoch": 0.6954315870535208, + "grad_norm": 0.7660027742385864, + "learning_rate": 0.00015365794719990333, + "loss": 2.5349, + "step": 7676 + }, + { + "epoch": 0.6955221852279676, + "grad_norm": 0.7933700680732727, + "learning_rate": 0.0001536519059989126, + "loss": 2.7694, + "step": 7677 + }, + { + "epoch": 0.6956127834024144, + "grad_norm": 0.8118396997451782, + "learning_rate": 0.00015364586479792183, + "loss": 2.7587, + "step": 7678 + }, + { + "epoch": 0.6957033815768612, + "grad_norm": 0.6807992458343506, + "learning_rate": 0.0001536398235969311, + "loss": 2.1136, + "step": 7679 + }, + { + "epoch": 0.695793979751308, + "grad_norm": 0.6658110618591309, + "learning_rate": 0.00015363378239594032, + "loss": 1.4599, + "step": 7680 + }, + { + "epoch": 0.6958845779257548, + "grad_norm": 0.814122200012207, + "learning_rate": 0.00015362774119494956, + "loss": 2.9412, + "step": 7681 + }, + { + "epoch": 0.6959751761002015, + "grad_norm": 0.7562648057937622, + "learning_rate": 0.00015362169999395882, + "loss": 2.1985, + "step": 7682 + }, + { + "epoch": 0.6960657742746483, + "grad_norm": 0.8942673206329346, + "learning_rate": 0.00015361565879296805, + "loss": 2.9574, + "step": 7683 + }, + { + "epoch": 0.6961563724490951, + "grad_norm": 0.797749936580658, + "learning_rate": 0.0001536096175919773, + "loss": 2.7573, + "step": 7684 + }, + { + "epoch": 0.6962469706235419, + "grad_norm": 0.7599079012870789, + "learning_rate": 0.00015360357639098652, + "loss": 2.8927, + "step": 7685 + }, + { + "epoch": 0.6963375687979887, + "grad_norm": 0.7546654343605042, + "learning_rate": 0.00015359753518999578, + "loss": 2.0697, + "step": 7686 + }, + { + "epoch": 0.6964281669724355, + "grad_norm": 0.7571536302566528, + "learning_rate": 0.00015359149398900504, + "loss": 2.6582, + "step": 7687 + }, + { + "epoch": 0.6965187651468823, + "grad_norm": 0.8672663569450378, + "learning_rate": 0.00015358545278801425, + "loss": 2.8856, + "step": 7688 + }, + { + "epoch": 0.6966093633213291, + "grad_norm": 0.7976168394088745, + "learning_rate": 0.0001535794115870235, + "loss": 2.9226, + "step": 7689 + }, + { + "epoch": 0.6966999614957758, + "grad_norm": 0.7856960296630859, + "learning_rate": 0.00015357337038603275, + "loss": 2.7702, + "step": 7690 + }, + { + "epoch": 0.6967905596702226, + "grad_norm": 0.7451328635215759, + "learning_rate": 0.000153567329185042, + "loss": 2.9302, + "step": 7691 + }, + { + "epoch": 0.6968811578446694, + "grad_norm": 0.7855068445205688, + "learning_rate": 0.00015356128798405124, + "loss": 2.734, + "step": 7692 + }, + { + "epoch": 0.6969717560191162, + "grad_norm": 0.7567856311798096, + "learning_rate": 0.00015355524678306047, + "loss": 2.74, + "step": 7693 + }, + { + "epoch": 0.697062354193563, + "grad_norm": 0.7970122694969177, + "learning_rate": 0.00015354920558206974, + "loss": 2.7209, + "step": 7694 + }, + { + "epoch": 0.6971529523680098, + "grad_norm": 0.6490296125411987, + "learning_rate": 0.00015354316438107897, + "loss": 1.946, + "step": 7695 + }, + { + "epoch": 0.6972435505424566, + "grad_norm": 0.8055873513221741, + "learning_rate": 0.0001535371231800882, + "loss": 2.9231, + "step": 7696 + }, + { + "epoch": 0.6973341487169034, + "grad_norm": 0.8313952684402466, + "learning_rate": 0.00015353108197909744, + "loss": 2.6463, + "step": 7697 + }, + { + "epoch": 0.6974247468913501, + "grad_norm": 0.7707017064094543, + "learning_rate": 0.0001535250407781067, + "loss": 2.7736, + "step": 7698 + }, + { + "epoch": 0.6975153450657969, + "grad_norm": 0.8141201138496399, + "learning_rate": 0.00015351899957711593, + "loss": 2.8386, + "step": 7699 + }, + { + "epoch": 0.6976059432402437, + "grad_norm": 0.826134979724884, + "learning_rate": 0.0001535129583761252, + "loss": 2.9824, + "step": 7700 + }, + { + "epoch": 0.6976965414146905, + "grad_norm": 0.8246468901634216, + "learning_rate": 0.00015350691717513443, + "loss": 3.1612, + "step": 7701 + }, + { + "epoch": 0.6977871395891373, + "grad_norm": 0.7713882327079773, + "learning_rate": 0.00015350087597414366, + "loss": 2.572, + "step": 7702 + }, + { + "epoch": 0.6978777377635841, + "grad_norm": 0.8286321759223938, + "learning_rate": 0.00015349483477315292, + "loss": 2.7426, + "step": 7703 + }, + { + "epoch": 0.6979683359380309, + "grad_norm": 0.8012830018997192, + "learning_rate": 0.00015348879357216216, + "loss": 2.8821, + "step": 7704 + }, + { + "epoch": 0.6980589341124777, + "grad_norm": 0.8472405672073364, + "learning_rate": 0.0001534827523711714, + "loss": 2.959, + "step": 7705 + }, + { + "epoch": 0.6981495322869244, + "grad_norm": 0.8280558586120605, + "learning_rate": 0.00015347671117018063, + "loss": 3.0004, + "step": 7706 + }, + { + "epoch": 0.6982401304613712, + "grad_norm": 0.9089266657829285, + "learning_rate": 0.0001534706699691899, + "loss": 2.7926, + "step": 7707 + }, + { + "epoch": 0.698330728635818, + "grad_norm": 0.7990745902061462, + "learning_rate": 0.00015346462876819912, + "loss": 2.8796, + "step": 7708 + }, + { + "epoch": 0.6984213268102648, + "grad_norm": 0.7948095202445984, + "learning_rate": 0.00015345858756720835, + "loss": 2.3369, + "step": 7709 + }, + { + "epoch": 0.6985119249847116, + "grad_norm": 0.7704527378082275, + "learning_rate": 0.00015345254636621762, + "loss": 2.653, + "step": 7710 + }, + { + "epoch": 0.6986025231591584, + "grad_norm": 0.7840922474861145, + "learning_rate": 0.00015344650516522685, + "loss": 2.662, + "step": 7711 + }, + { + "epoch": 0.6986931213336052, + "grad_norm": 0.7423478364944458, + "learning_rate": 0.0001534404639642361, + "loss": 2.5577, + "step": 7712 + }, + { + "epoch": 0.698783719508052, + "grad_norm": 0.744530975818634, + "learning_rate": 0.00015343442276324535, + "loss": 2.6022, + "step": 7713 + }, + { + "epoch": 0.6988743176824987, + "grad_norm": 0.795498251914978, + "learning_rate": 0.00015342838156225458, + "loss": 2.7197, + "step": 7714 + }, + { + "epoch": 0.6989649158569455, + "grad_norm": 0.8964200019836426, + "learning_rate": 0.0001534223403612638, + "loss": 2.9917, + "step": 7715 + }, + { + "epoch": 0.6990555140313923, + "grad_norm": 0.799309492111206, + "learning_rate": 0.00015341629916027307, + "loss": 2.6895, + "step": 7716 + }, + { + "epoch": 0.699146112205839, + "grad_norm": 0.8076914548873901, + "learning_rate": 0.0001534102579592823, + "loss": 2.9228, + "step": 7717 + }, + { + "epoch": 0.6992367103802858, + "grad_norm": 0.8553876876831055, + "learning_rate": 0.00015340421675829154, + "loss": 2.9344, + "step": 7718 + }, + { + "epoch": 0.6993273085547326, + "grad_norm": 0.8341047763824463, + "learning_rate": 0.0001533981755573008, + "loss": 2.8317, + "step": 7719 + }, + { + "epoch": 0.6994179067291794, + "grad_norm": 0.8305470943450928, + "learning_rate": 0.00015339213435631004, + "loss": 2.1062, + "step": 7720 + }, + { + "epoch": 0.6995085049036261, + "grad_norm": 0.8547671437263489, + "learning_rate": 0.0001533860931553193, + "loss": 2.8017, + "step": 7721 + }, + { + "epoch": 0.6995991030780729, + "grad_norm": 0.7941144108772278, + "learning_rate": 0.0001533800519543285, + "loss": 2.8584, + "step": 7722 + }, + { + "epoch": 0.6996897012525197, + "grad_norm": 0.7370090484619141, + "learning_rate": 0.00015337401075333777, + "loss": 1.9299, + "step": 7723 + }, + { + "epoch": 0.6997802994269665, + "grad_norm": 0.7648054957389832, + "learning_rate": 0.00015336796955234703, + "loss": 2.8334, + "step": 7724 + }, + { + "epoch": 0.6998708976014133, + "grad_norm": 0.7919084429740906, + "learning_rate": 0.00015336192835135626, + "loss": 2.9988, + "step": 7725 + }, + { + "epoch": 0.6999614957758601, + "grad_norm": 0.905188798904419, + "learning_rate": 0.0001533558871503655, + "loss": 2.9863, + "step": 7726 + }, + { + "epoch": 0.7000520939503069, + "grad_norm": 0.7997329235076904, + "learning_rate": 0.00015334984594937473, + "loss": 2.7796, + "step": 7727 + }, + { + "epoch": 0.7001426921247537, + "grad_norm": 0.7647366523742676, + "learning_rate": 0.000153343804748384, + "loss": 2.7375, + "step": 7728 + }, + { + "epoch": 0.7002332902992005, + "grad_norm": 0.7685820460319519, + "learning_rate": 0.00015333776354739323, + "loss": 2.8857, + "step": 7729 + }, + { + "epoch": 0.7003238884736472, + "grad_norm": 0.9432979226112366, + "learning_rate": 0.00015333172234640246, + "loss": 2.9625, + "step": 7730 + }, + { + "epoch": 0.700414486648094, + "grad_norm": 0.7769181132316589, + "learning_rate": 0.00015332568114541172, + "loss": 2.5091, + "step": 7731 + }, + { + "epoch": 0.7005050848225408, + "grad_norm": 0.8524624109268188, + "learning_rate": 0.00015331963994442096, + "loss": 2.73, + "step": 7732 + }, + { + "epoch": 0.7005956829969876, + "grad_norm": 0.800821840763092, + "learning_rate": 0.00015331359874343022, + "loss": 2.9608, + "step": 7733 + }, + { + "epoch": 0.7006862811714344, + "grad_norm": 0.8270512223243713, + "learning_rate": 0.00015330755754243945, + "loss": 2.7627, + "step": 7734 + }, + { + "epoch": 0.7007768793458812, + "grad_norm": 0.7476973533630371, + "learning_rate": 0.00015330151634144868, + "loss": 2.9012, + "step": 7735 + }, + { + "epoch": 0.700867477520328, + "grad_norm": 0.8137900233268738, + "learning_rate": 0.00015329547514045792, + "loss": 2.8197, + "step": 7736 + }, + { + "epoch": 0.7009580756947748, + "grad_norm": 0.7764749526977539, + "learning_rate": 0.00015328943393946718, + "loss": 2.9454, + "step": 7737 + }, + { + "epoch": 0.7010486738692215, + "grad_norm": 0.8532440662384033, + "learning_rate": 0.00015328339273847641, + "loss": 2.9231, + "step": 7738 + }, + { + "epoch": 0.7011392720436683, + "grad_norm": 0.7593422532081604, + "learning_rate": 0.00015327735153748565, + "loss": 2.8993, + "step": 7739 + }, + { + "epoch": 0.7012298702181151, + "grad_norm": 0.7800534963607788, + "learning_rate": 0.0001532713103364949, + "loss": 2.8817, + "step": 7740 + }, + { + "epoch": 0.7013204683925619, + "grad_norm": 0.8955763578414917, + "learning_rate": 0.00015326526913550414, + "loss": 2.0997, + "step": 7741 + }, + { + "epoch": 0.7014110665670087, + "grad_norm": 0.7681538462638855, + "learning_rate": 0.0001532592279345134, + "loss": 2.9056, + "step": 7742 + }, + { + "epoch": 0.7015016647414555, + "grad_norm": 0.825357973575592, + "learning_rate": 0.00015325318673352264, + "loss": 2.6816, + "step": 7743 + }, + { + "epoch": 0.7015922629159023, + "grad_norm": 0.780554473400116, + "learning_rate": 0.00015324714553253187, + "loss": 2.7247, + "step": 7744 + }, + { + "epoch": 0.701682861090349, + "grad_norm": 0.6385935544967651, + "learning_rate": 0.0001532411043315411, + "loss": 1.6046, + "step": 7745 + }, + { + "epoch": 0.7017734592647958, + "grad_norm": 0.8212732672691345, + "learning_rate": 0.00015323506313055037, + "loss": 2.8818, + "step": 7746 + }, + { + "epoch": 0.7018640574392426, + "grad_norm": 0.7167004346847534, + "learning_rate": 0.0001532290219295596, + "loss": 2.4407, + "step": 7747 + }, + { + "epoch": 0.7019546556136894, + "grad_norm": 0.8613130450248718, + "learning_rate": 0.00015322298072856884, + "loss": 2.8973, + "step": 7748 + }, + { + "epoch": 0.7020452537881362, + "grad_norm": 0.9321388602256775, + "learning_rate": 0.0001532169395275781, + "loss": 2.8569, + "step": 7749 + }, + { + "epoch": 0.702135851962583, + "grad_norm": 0.8089168667793274, + "learning_rate": 0.00015321089832658733, + "loss": 2.9548, + "step": 7750 + }, + { + "epoch": 0.7022264501370298, + "grad_norm": 0.7819808721542358, + "learning_rate": 0.0001532048571255966, + "loss": 2.8625, + "step": 7751 + }, + { + "epoch": 0.7023170483114766, + "grad_norm": 0.7577162981033325, + "learning_rate": 0.0001531988159246058, + "loss": 2.5082, + "step": 7752 + }, + { + "epoch": 0.7024076464859234, + "grad_norm": 0.8223294615745544, + "learning_rate": 0.00015319277472361506, + "loss": 2.6288, + "step": 7753 + }, + { + "epoch": 0.7024982446603701, + "grad_norm": 0.8009559512138367, + "learning_rate": 0.00015318673352262432, + "loss": 2.9502, + "step": 7754 + }, + { + "epoch": 0.7025888428348169, + "grad_norm": 0.780040979385376, + "learning_rate": 0.00015318069232163356, + "loss": 2.9007, + "step": 7755 + }, + { + "epoch": 0.7026794410092637, + "grad_norm": 0.7956690788269043, + "learning_rate": 0.0001531746511206428, + "loss": 3.0878, + "step": 7756 + }, + { + "epoch": 0.7027700391837104, + "grad_norm": 0.7651402354240417, + "learning_rate": 0.00015316860991965202, + "loss": 2.5644, + "step": 7757 + }, + { + "epoch": 0.7028606373581572, + "grad_norm": 0.8338688611984253, + "learning_rate": 0.00015316256871866128, + "loss": 2.7254, + "step": 7758 + }, + { + "epoch": 0.702951235532604, + "grad_norm": 0.7919600605964661, + "learning_rate": 0.00015315652751767052, + "loss": 2.6188, + "step": 7759 + }, + { + "epoch": 0.7030418337070508, + "grad_norm": 0.7842077612876892, + "learning_rate": 0.00015315048631667975, + "loss": 2.9945, + "step": 7760 + }, + { + "epoch": 0.7031324318814975, + "grad_norm": 0.735845148563385, + "learning_rate": 0.00015314444511568901, + "loss": 2.8162, + "step": 7761 + }, + { + "epoch": 0.7032230300559443, + "grad_norm": 0.8642950057983398, + "learning_rate": 0.00015313840391469825, + "loss": 2.9787, + "step": 7762 + }, + { + "epoch": 0.7033136282303911, + "grad_norm": 0.7076333165168762, + "learning_rate": 0.0001531323627137075, + "loss": 1.9451, + "step": 7763 + }, + { + "epoch": 0.7034042264048379, + "grad_norm": 0.7726010680198669, + "learning_rate": 0.00015312632151271674, + "loss": 2.7363, + "step": 7764 + }, + { + "epoch": 0.7034948245792847, + "grad_norm": 0.6823406219482422, + "learning_rate": 0.00015312028031172598, + "loss": 1.6067, + "step": 7765 + }, + { + "epoch": 0.7035854227537315, + "grad_norm": 0.8034241199493408, + "learning_rate": 0.0001531142391107352, + "loss": 2.8554, + "step": 7766 + }, + { + "epoch": 0.7036760209281783, + "grad_norm": 0.8174206018447876, + "learning_rate": 0.00015310819790974447, + "loss": 2.8445, + "step": 7767 + }, + { + "epoch": 0.7037666191026251, + "grad_norm": 0.7509902119636536, + "learning_rate": 0.0001531021567087537, + "loss": 2.797, + "step": 7768 + }, + { + "epoch": 0.7038572172770718, + "grad_norm": 0.811248242855072, + "learning_rate": 0.00015309611550776294, + "loss": 2.6805, + "step": 7769 + }, + { + "epoch": 0.7039478154515186, + "grad_norm": 0.7388079166412354, + "learning_rate": 0.0001530900743067722, + "loss": 2.4878, + "step": 7770 + }, + { + "epoch": 0.7040384136259654, + "grad_norm": 0.6308068633079529, + "learning_rate": 0.00015308403310578144, + "loss": 2.1286, + "step": 7771 + }, + { + "epoch": 0.7041290118004122, + "grad_norm": 0.6670313477516174, + "learning_rate": 0.0001530779919047907, + "loss": 2.2234, + "step": 7772 + }, + { + "epoch": 0.704219609974859, + "grad_norm": 0.7995680570602417, + "learning_rate": 0.0001530719507037999, + "loss": 2.7733, + "step": 7773 + }, + { + "epoch": 0.7043102081493058, + "grad_norm": 0.8058984875679016, + "learning_rate": 0.00015306590950280916, + "loss": 2.8197, + "step": 7774 + }, + { + "epoch": 0.7044008063237526, + "grad_norm": 0.7850799560546875, + "learning_rate": 0.0001530598683018184, + "loss": 3.168, + "step": 7775 + }, + { + "epoch": 0.7044914044981994, + "grad_norm": 0.8971701264381409, + "learning_rate": 0.00015305382710082766, + "loss": 2.8861, + "step": 7776 + }, + { + "epoch": 0.7045820026726461, + "grad_norm": 0.8062689900398254, + "learning_rate": 0.0001530477858998369, + "loss": 3.0323, + "step": 7777 + }, + { + "epoch": 0.7046726008470929, + "grad_norm": 0.8582597374916077, + "learning_rate": 0.00015304174469884613, + "loss": 2.6587, + "step": 7778 + }, + { + "epoch": 0.7047631990215397, + "grad_norm": 0.8180885910987854, + "learning_rate": 0.0001530357034978554, + "loss": 2.6335, + "step": 7779 + }, + { + "epoch": 0.7048537971959865, + "grad_norm": 0.8696692585945129, + "learning_rate": 0.00015302966229686462, + "loss": 2.5843, + "step": 7780 + }, + { + "epoch": 0.7049443953704333, + "grad_norm": 0.7650071382522583, + "learning_rate": 0.00015302362109587386, + "loss": 2.6272, + "step": 7781 + }, + { + "epoch": 0.7050349935448801, + "grad_norm": 0.7992003560066223, + "learning_rate": 0.0001530175798948831, + "loss": 2.8851, + "step": 7782 + }, + { + "epoch": 0.7051255917193269, + "grad_norm": 0.837904155254364, + "learning_rate": 0.00015301153869389235, + "loss": 2.6131, + "step": 7783 + }, + { + "epoch": 0.7052161898937737, + "grad_norm": 0.8678693175315857, + "learning_rate": 0.00015300549749290161, + "loss": 1.9664, + "step": 7784 + }, + { + "epoch": 0.7053067880682204, + "grad_norm": 0.8034924864768982, + "learning_rate": 0.00015299945629191085, + "loss": 2.76, + "step": 7785 + }, + { + "epoch": 0.7053973862426672, + "grad_norm": 0.7838320136070251, + "learning_rate": 0.00015299341509092008, + "loss": 2.9546, + "step": 7786 + }, + { + "epoch": 0.705487984417114, + "grad_norm": 0.8068084716796875, + "learning_rate": 0.00015298737388992932, + "loss": 2.5372, + "step": 7787 + }, + { + "epoch": 0.7055785825915608, + "grad_norm": 0.7004544138908386, + "learning_rate": 0.00015298133268893858, + "loss": 2.0915, + "step": 7788 + }, + { + "epoch": 0.7056691807660076, + "grad_norm": 0.7926044464111328, + "learning_rate": 0.0001529752914879478, + "loss": 2.8945, + "step": 7789 + }, + { + "epoch": 0.7057597789404544, + "grad_norm": 0.7664875984191895, + "learning_rate": 0.00015296925028695705, + "loss": 2.7508, + "step": 7790 + }, + { + "epoch": 0.7058503771149012, + "grad_norm": 0.8238847255706787, + "learning_rate": 0.0001529632090859663, + "loss": 2.8149, + "step": 7791 + }, + { + "epoch": 0.705940975289348, + "grad_norm": 0.808998167514801, + "learning_rate": 0.00015295716788497554, + "loss": 2.6532, + "step": 7792 + }, + { + "epoch": 0.7060315734637947, + "grad_norm": 0.8245424032211304, + "learning_rate": 0.0001529511266839848, + "loss": 2.82, + "step": 7793 + }, + { + "epoch": 0.7061221716382415, + "grad_norm": 0.8327363133430481, + "learning_rate": 0.000152945085482994, + "loss": 2.8679, + "step": 7794 + }, + { + "epoch": 0.7062127698126883, + "grad_norm": 0.7760666608810425, + "learning_rate": 0.00015293904428200327, + "loss": 2.7385, + "step": 7795 + }, + { + "epoch": 0.7063033679871351, + "grad_norm": 0.7758881449699402, + "learning_rate": 0.0001529330030810125, + "loss": 2.716, + "step": 7796 + }, + { + "epoch": 0.7063939661615819, + "grad_norm": 0.8607891201972961, + "learning_rate": 0.00015292696188002176, + "loss": 2.939, + "step": 7797 + }, + { + "epoch": 0.7064845643360286, + "grad_norm": 0.762657880783081, + "learning_rate": 0.000152920920679031, + "loss": 2.8461, + "step": 7798 + }, + { + "epoch": 0.7065751625104754, + "grad_norm": 0.780203640460968, + "learning_rate": 0.00015291487947804023, + "loss": 3.043, + "step": 7799 + }, + { + "epoch": 0.7066657606849222, + "grad_norm": 0.7369453310966492, + "learning_rate": 0.0001529088382770495, + "loss": 2.602, + "step": 7800 + }, + { + "epoch": 0.7067563588593689, + "grad_norm": 0.7824944853782654, + "learning_rate": 0.00015290279707605873, + "loss": 2.7986, + "step": 7801 + }, + { + "epoch": 0.7068469570338157, + "grad_norm": 0.8141531944274902, + "learning_rate": 0.000152896755875068, + "loss": 2.9927, + "step": 7802 + }, + { + "epoch": 0.7069375552082625, + "grad_norm": 0.8234567046165466, + "learning_rate": 0.0001528907146740772, + "loss": 2.8927, + "step": 7803 + }, + { + "epoch": 0.7070281533827093, + "grad_norm": 0.7627398371696472, + "learning_rate": 0.00015288467347308646, + "loss": 2.6751, + "step": 7804 + }, + { + "epoch": 0.7071187515571561, + "grad_norm": 0.763616144657135, + "learning_rate": 0.0001528786322720957, + "loss": 2.923, + "step": 7805 + }, + { + "epoch": 0.7072093497316029, + "grad_norm": 0.7811461687088013, + "learning_rate": 0.00015287259107110495, + "loss": 2.9939, + "step": 7806 + }, + { + "epoch": 0.7072999479060497, + "grad_norm": 0.8026955723762512, + "learning_rate": 0.0001528665498701142, + "loss": 2.8123, + "step": 7807 + }, + { + "epoch": 0.7073905460804965, + "grad_norm": 0.7760587930679321, + "learning_rate": 0.00015286050866912342, + "loss": 2.6821, + "step": 7808 + }, + { + "epoch": 0.7074811442549432, + "grad_norm": 0.756841778755188, + "learning_rate": 0.00015285446746813268, + "loss": 2.724, + "step": 7809 + }, + { + "epoch": 0.70757174242939, + "grad_norm": 0.819538414478302, + "learning_rate": 0.00015284842626714192, + "loss": 2.7857, + "step": 7810 + }, + { + "epoch": 0.7076623406038368, + "grad_norm": 0.7611005306243896, + "learning_rate": 0.00015284238506615115, + "loss": 2.9349, + "step": 7811 + }, + { + "epoch": 0.7077529387782836, + "grad_norm": 0.756467878818512, + "learning_rate": 0.00015283634386516038, + "loss": 2.6928, + "step": 7812 + }, + { + "epoch": 0.7078435369527304, + "grad_norm": 0.8619431853294373, + "learning_rate": 0.00015283030266416965, + "loss": 2.9642, + "step": 7813 + }, + { + "epoch": 0.7079341351271772, + "grad_norm": 0.8485718965530396, + "learning_rate": 0.0001528242614631789, + "loss": 2.9445, + "step": 7814 + }, + { + "epoch": 0.708024733301624, + "grad_norm": 0.7815572619438171, + "learning_rate": 0.00015281822026218814, + "loss": 2.8703, + "step": 7815 + }, + { + "epoch": 0.7081153314760708, + "grad_norm": 0.7448770999908447, + "learning_rate": 0.00015281217906119737, + "loss": 2.7683, + "step": 7816 + }, + { + "epoch": 0.7082059296505175, + "grad_norm": 0.8197051286697388, + "learning_rate": 0.0001528061378602066, + "loss": 2.7679, + "step": 7817 + }, + { + "epoch": 0.7082965278249643, + "grad_norm": 0.8391551971435547, + "learning_rate": 0.00015280009665921587, + "loss": 2.8405, + "step": 7818 + }, + { + "epoch": 0.7083871259994111, + "grad_norm": 0.6796146631240845, + "learning_rate": 0.0001527940554582251, + "loss": 2.2028, + "step": 7819 + }, + { + "epoch": 0.7084777241738579, + "grad_norm": 0.8627543449401855, + "learning_rate": 0.00015278801425723434, + "loss": 3.0918, + "step": 7820 + }, + { + "epoch": 0.7085683223483047, + "grad_norm": 0.9146549701690674, + "learning_rate": 0.0001527819730562436, + "loss": 2.829, + "step": 7821 + }, + { + "epoch": 0.7086589205227515, + "grad_norm": 0.7786087989807129, + "learning_rate": 0.00015277593185525283, + "loss": 2.6262, + "step": 7822 + }, + { + "epoch": 0.7087495186971983, + "grad_norm": 0.8535957336425781, + "learning_rate": 0.0001527698906542621, + "loss": 2.9781, + "step": 7823 + }, + { + "epoch": 0.708840116871645, + "grad_norm": 0.8194676041603088, + "learning_rate": 0.0001527638494532713, + "loss": 2.9596, + "step": 7824 + }, + { + "epoch": 0.7089307150460918, + "grad_norm": 0.7682968378067017, + "learning_rate": 0.00015275780825228056, + "loss": 2.9569, + "step": 7825 + }, + { + "epoch": 0.7090213132205386, + "grad_norm": 0.7862314581871033, + "learning_rate": 0.0001527517670512898, + "loss": 2.8017, + "step": 7826 + }, + { + "epoch": 0.7091119113949854, + "grad_norm": 0.8154558539390564, + "learning_rate": 0.00015274572585029906, + "loss": 2.7998, + "step": 7827 + }, + { + "epoch": 0.7092025095694322, + "grad_norm": 0.8089265823364258, + "learning_rate": 0.0001527396846493083, + "loss": 2.7769, + "step": 7828 + }, + { + "epoch": 0.709293107743879, + "grad_norm": 0.7608136534690857, + "learning_rate": 0.00015273364344831753, + "loss": 1.8414, + "step": 7829 + }, + { + "epoch": 0.7093837059183258, + "grad_norm": 0.817176342010498, + "learning_rate": 0.0001527276022473268, + "loss": 2.8005, + "step": 7830 + }, + { + "epoch": 0.7094743040927726, + "grad_norm": 0.8266512155532837, + "learning_rate": 0.00015272156104633602, + "loss": 2.8304, + "step": 7831 + }, + { + "epoch": 0.7095649022672194, + "grad_norm": 0.788881242275238, + "learning_rate": 0.00015271551984534525, + "loss": 2.9303, + "step": 7832 + }, + { + "epoch": 0.7096555004416661, + "grad_norm": 0.8984748125076294, + "learning_rate": 0.0001527094786443545, + "loss": 2.9079, + "step": 7833 + }, + { + "epoch": 0.7097460986161129, + "grad_norm": 0.7909278869628906, + "learning_rate": 0.00015270343744336375, + "loss": 2.7016, + "step": 7834 + }, + { + "epoch": 0.7098366967905597, + "grad_norm": 0.7962185144424438, + "learning_rate": 0.00015269739624237298, + "loss": 2.8103, + "step": 7835 + }, + { + "epoch": 0.7099272949650065, + "grad_norm": 0.7889731526374817, + "learning_rate": 0.00015269135504138225, + "loss": 2.9499, + "step": 7836 + }, + { + "epoch": 0.7100178931394533, + "grad_norm": 0.7559102177619934, + "learning_rate": 0.00015268531384039148, + "loss": 2.7208, + "step": 7837 + }, + { + "epoch": 0.7101084913139, + "grad_norm": 0.746688187122345, + "learning_rate": 0.0001526792726394007, + "loss": 2.7597, + "step": 7838 + }, + { + "epoch": 0.7101990894883468, + "grad_norm": 0.7198464274406433, + "learning_rate": 0.00015267323143840997, + "loss": 2.2466, + "step": 7839 + }, + { + "epoch": 0.7102896876627935, + "grad_norm": 0.7738919854164124, + "learning_rate": 0.0001526671902374192, + "loss": 2.7804, + "step": 7840 + }, + { + "epoch": 0.7103802858372403, + "grad_norm": 0.7913113832473755, + "learning_rate": 0.00015266114903642844, + "loss": 2.5136, + "step": 7841 + }, + { + "epoch": 0.7104708840116871, + "grad_norm": 0.7764995098114014, + "learning_rate": 0.00015265510783543768, + "loss": 3.1311, + "step": 7842 + }, + { + "epoch": 0.7105614821861339, + "grad_norm": 0.7850528955459595, + "learning_rate": 0.00015264906663444694, + "loss": 2.7609, + "step": 7843 + }, + { + "epoch": 0.7106520803605807, + "grad_norm": 0.7799997329711914, + "learning_rate": 0.0001526430254334562, + "loss": 2.7814, + "step": 7844 + }, + { + "epoch": 0.7107426785350275, + "grad_norm": 0.7300565838813782, + "learning_rate": 0.0001526369842324654, + "loss": 2.5114, + "step": 7845 + }, + { + "epoch": 0.7108332767094743, + "grad_norm": 0.8560929298400879, + "learning_rate": 0.00015263094303147467, + "loss": 2.875, + "step": 7846 + }, + { + "epoch": 0.7109238748839211, + "grad_norm": 0.7503306269645691, + "learning_rate": 0.0001526249018304839, + "loss": 2.6843, + "step": 7847 + }, + { + "epoch": 0.7110144730583678, + "grad_norm": 0.8782474994659424, + "learning_rate": 0.00015261886062949316, + "loss": 2.7114, + "step": 7848 + }, + { + "epoch": 0.7111050712328146, + "grad_norm": 0.8241926431655884, + "learning_rate": 0.0001526128194285024, + "loss": 2.7532, + "step": 7849 + }, + { + "epoch": 0.7111956694072614, + "grad_norm": 0.7524107098579407, + "learning_rate": 0.00015260677822751163, + "loss": 2.9118, + "step": 7850 + }, + { + "epoch": 0.7112862675817082, + "grad_norm": 0.8122356534004211, + "learning_rate": 0.0001526007370265209, + "loss": 2.8581, + "step": 7851 + }, + { + "epoch": 0.711376865756155, + "grad_norm": 0.7194710373878479, + "learning_rate": 0.00015259469582553013, + "loss": 2.2436, + "step": 7852 + }, + { + "epoch": 0.7114674639306018, + "grad_norm": 0.7940911650657654, + "learning_rate": 0.0001525886546245394, + "loss": 2.9654, + "step": 7853 + }, + { + "epoch": 0.7115580621050486, + "grad_norm": 0.7378663420677185, + "learning_rate": 0.0001525826134235486, + "loss": 2.9331, + "step": 7854 + }, + { + "epoch": 0.7116486602794954, + "grad_norm": 0.7672275900840759, + "learning_rate": 0.00015257657222255785, + "loss": 2.6977, + "step": 7855 + }, + { + "epoch": 0.7117392584539421, + "grad_norm": 0.7006607055664062, + "learning_rate": 0.0001525705310215671, + "loss": 2.1853, + "step": 7856 + }, + { + "epoch": 0.7118298566283889, + "grad_norm": 0.7658519744873047, + "learning_rate": 0.00015256448982057635, + "loss": 2.7253, + "step": 7857 + }, + { + "epoch": 0.7119204548028357, + "grad_norm": 0.8639529943466187, + "learning_rate": 0.00015255844861958556, + "loss": 2.8966, + "step": 7858 + }, + { + "epoch": 0.7120110529772825, + "grad_norm": 0.7803829908370972, + "learning_rate": 0.00015255240741859482, + "loss": 2.9723, + "step": 7859 + }, + { + "epoch": 0.7121016511517293, + "grad_norm": 0.8187565803527832, + "learning_rate": 0.00015254636621760408, + "loss": 2.8235, + "step": 7860 + }, + { + "epoch": 0.7121922493261761, + "grad_norm": 0.8160563707351685, + "learning_rate": 0.0001525403250166133, + "loss": 2.7259, + "step": 7861 + }, + { + "epoch": 0.7122828475006229, + "grad_norm": 0.7496848106384277, + "learning_rate": 0.00015253428381562255, + "loss": 2.9722, + "step": 7862 + }, + { + "epoch": 0.7123734456750697, + "grad_norm": 0.8108861446380615, + "learning_rate": 0.00015252824261463178, + "loss": 2.99, + "step": 7863 + }, + { + "epoch": 0.7124640438495164, + "grad_norm": 0.8036268949508667, + "learning_rate": 0.00015252220141364104, + "loss": 2.8659, + "step": 7864 + }, + { + "epoch": 0.7125546420239632, + "grad_norm": 0.7717123627662659, + "learning_rate": 0.00015251616021265028, + "loss": 2.8267, + "step": 7865 + }, + { + "epoch": 0.71264524019841, + "grad_norm": 0.7826629877090454, + "learning_rate": 0.00015251011901165954, + "loss": 2.8386, + "step": 7866 + }, + { + "epoch": 0.7127358383728568, + "grad_norm": 0.7698405981063843, + "learning_rate": 0.00015250407781066877, + "loss": 2.8768, + "step": 7867 + }, + { + "epoch": 0.7128264365473036, + "grad_norm": 0.7310419678688049, + "learning_rate": 0.000152498036609678, + "loss": 2.6195, + "step": 7868 + }, + { + "epoch": 0.7129170347217504, + "grad_norm": 0.8055012226104736, + "learning_rate": 0.00015249199540868727, + "loss": 2.7906, + "step": 7869 + }, + { + "epoch": 0.7130076328961972, + "grad_norm": 0.8460975885391235, + "learning_rate": 0.0001524859542076965, + "loss": 2.7753, + "step": 7870 + }, + { + "epoch": 0.713098231070644, + "grad_norm": 0.8028121590614319, + "learning_rate": 0.00015247991300670574, + "loss": 2.7489, + "step": 7871 + }, + { + "epoch": 0.7131888292450907, + "grad_norm": 0.661232590675354, + "learning_rate": 0.00015247387180571497, + "loss": 1.9671, + "step": 7872 + }, + { + "epoch": 0.7132794274195375, + "grad_norm": 0.8148142695426941, + "learning_rate": 0.00015246783060472423, + "loss": 2.7124, + "step": 7873 + }, + { + "epoch": 0.7133700255939843, + "grad_norm": 0.7649940848350525, + "learning_rate": 0.0001524617894037335, + "loss": 2.5998, + "step": 7874 + }, + { + "epoch": 0.7134606237684311, + "grad_norm": 1.037742257118225, + "learning_rate": 0.0001524557482027427, + "loss": 2.7395, + "step": 7875 + }, + { + "epoch": 0.7135512219428779, + "grad_norm": 0.7633486986160278, + "learning_rate": 0.00015244970700175196, + "loss": 2.7602, + "step": 7876 + }, + { + "epoch": 0.7136418201173247, + "grad_norm": 0.8090881705284119, + "learning_rate": 0.0001524436658007612, + "loss": 2.8813, + "step": 7877 + }, + { + "epoch": 0.7137324182917715, + "grad_norm": 0.7992528676986694, + "learning_rate": 0.00015243762459977045, + "loss": 2.6564, + "step": 7878 + }, + { + "epoch": 0.7138230164662182, + "grad_norm": 0.7617413401603699, + "learning_rate": 0.0001524315833987797, + "loss": 2.7576, + "step": 7879 + }, + { + "epoch": 0.7139136146406649, + "grad_norm": 0.6209245324134827, + "learning_rate": 0.00015242554219778892, + "loss": 1.2305, + "step": 7880 + }, + { + "epoch": 0.7140042128151117, + "grad_norm": 0.8269597887992859, + "learning_rate": 0.00015241950099679818, + "loss": 3.0071, + "step": 7881 + }, + { + "epoch": 0.7140948109895585, + "grad_norm": 0.7980844378471375, + "learning_rate": 0.00015241345979580742, + "loss": 3.0567, + "step": 7882 + }, + { + "epoch": 0.7141854091640053, + "grad_norm": 0.736655056476593, + "learning_rate": 0.00015240741859481665, + "loss": 2.0634, + "step": 7883 + }, + { + "epoch": 0.7142760073384521, + "grad_norm": 0.7464339137077332, + "learning_rate": 0.00015240137739382589, + "loss": 2.6171, + "step": 7884 + }, + { + "epoch": 0.7143666055128989, + "grad_norm": 0.7662394046783447, + "learning_rate": 0.00015239533619283515, + "loss": 2.846, + "step": 7885 + }, + { + "epoch": 0.7144572036873457, + "grad_norm": 0.8524727821350098, + "learning_rate": 0.00015238929499184438, + "loss": 2.8543, + "step": 7886 + }, + { + "epoch": 0.7145478018617925, + "grad_norm": 0.7990630269050598, + "learning_rate": 0.00015238325379085364, + "loss": 2.6894, + "step": 7887 + }, + { + "epoch": 0.7146384000362392, + "grad_norm": 0.7820594906806946, + "learning_rate": 0.00015237721258986285, + "loss": 2.7905, + "step": 7888 + }, + { + "epoch": 0.714728998210686, + "grad_norm": 0.6733222603797913, + "learning_rate": 0.0001523711713888721, + "loss": 2.028, + "step": 7889 + }, + { + "epoch": 0.7148195963851328, + "grad_norm": 0.7789149880409241, + "learning_rate": 0.00015236513018788137, + "loss": 2.6678, + "step": 7890 + }, + { + "epoch": 0.7149101945595796, + "grad_norm": 0.8828368782997131, + "learning_rate": 0.0001523590889868906, + "loss": 2.7827, + "step": 7891 + }, + { + "epoch": 0.7150007927340264, + "grad_norm": 0.7669426798820496, + "learning_rate": 0.00015235304778589984, + "loss": 2.9455, + "step": 7892 + }, + { + "epoch": 0.7150913909084732, + "grad_norm": 0.7919607162475586, + "learning_rate": 0.00015234700658490907, + "loss": 2.96, + "step": 7893 + }, + { + "epoch": 0.71518198908292, + "grad_norm": 0.7993003129959106, + "learning_rate": 0.00015234096538391834, + "loss": 3.1534, + "step": 7894 + }, + { + "epoch": 0.7152725872573668, + "grad_norm": 0.7653346657752991, + "learning_rate": 0.00015233492418292757, + "loss": 2.642, + "step": 7895 + }, + { + "epoch": 0.7153631854318135, + "grad_norm": 0.8297927975654602, + "learning_rate": 0.0001523288829819368, + "loss": 2.9006, + "step": 7896 + }, + { + "epoch": 0.7154537836062603, + "grad_norm": 0.7865745425224304, + "learning_rate": 0.00015232284178094606, + "loss": 2.8172, + "step": 7897 + }, + { + "epoch": 0.7155443817807071, + "grad_norm": 0.7397478222846985, + "learning_rate": 0.0001523168005799553, + "loss": 2.7827, + "step": 7898 + }, + { + "epoch": 0.7156349799551539, + "grad_norm": 0.7897258400917053, + "learning_rate": 0.00015231075937896456, + "loss": 2.6549, + "step": 7899 + }, + { + "epoch": 0.7157255781296007, + "grad_norm": 0.8693187832832336, + "learning_rate": 0.0001523047181779738, + "loss": 3.0795, + "step": 7900 + }, + { + "epoch": 0.7158161763040475, + "grad_norm": 0.7862616777420044, + "learning_rate": 0.00015229867697698303, + "loss": 2.8405, + "step": 7901 + }, + { + "epoch": 0.7159067744784943, + "grad_norm": 0.7799757719039917, + "learning_rate": 0.00015229263577599226, + "loss": 2.6614, + "step": 7902 + }, + { + "epoch": 0.715997372652941, + "grad_norm": 0.7549499273300171, + "learning_rate": 0.00015228659457500152, + "loss": 2.691, + "step": 7903 + }, + { + "epoch": 0.7160879708273878, + "grad_norm": 0.6551882028579712, + "learning_rate": 0.00015228055337401076, + "loss": 2.011, + "step": 7904 + }, + { + "epoch": 0.7161785690018346, + "grad_norm": 0.840137243270874, + "learning_rate": 0.00015227451217302, + "loss": 2.7468, + "step": 7905 + }, + { + "epoch": 0.7162691671762814, + "grad_norm": 0.7525712251663208, + "learning_rate": 0.00015226847097202925, + "loss": 2.8441, + "step": 7906 + }, + { + "epoch": 0.7163597653507282, + "grad_norm": 0.7841417789459229, + "learning_rate": 0.00015226242977103849, + "loss": 2.8488, + "step": 7907 + }, + { + "epoch": 0.716450363525175, + "grad_norm": 0.7356937527656555, + "learning_rate": 0.00015225638857004775, + "loss": 2.1542, + "step": 7908 + }, + { + "epoch": 0.7165409616996218, + "grad_norm": 0.780267596244812, + "learning_rate": 0.00015225034736905695, + "loss": 2.7246, + "step": 7909 + }, + { + "epoch": 0.7166315598740686, + "grad_norm": 0.790817141532898, + "learning_rate": 0.00015224430616806622, + "loss": 2.8545, + "step": 7910 + }, + { + "epoch": 0.7167221580485154, + "grad_norm": 0.7700903415679932, + "learning_rate": 0.00015223826496707548, + "loss": 2.6443, + "step": 7911 + }, + { + "epoch": 0.7168127562229621, + "grad_norm": 0.7823769450187683, + "learning_rate": 0.0001522322237660847, + "loss": 3.0069, + "step": 7912 + }, + { + "epoch": 0.7169033543974089, + "grad_norm": 0.8042650818824768, + "learning_rate": 0.00015222618256509394, + "loss": 2.845, + "step": 7913 + }, + { + "epoch": 0.7169939525718557, + "grad_norm": 0.8496611714363098, + "learning_rate": 0.00015222014136410318, + "loss": 3.0132, + "step": 7914 + }, + { + "epoch": 0.7170845507463025, + "grad_norm": 0.7948789000511169, + "learning_rate": 0.00015221410016311244, + "loss": 3.0308, + "step": 7915 + }, + { + "epoch": 0.7171751489207493, + "grad_norm": 0.7383041977882385, + "learning_rate": 0.00015220805896212167, + "loss": 2.9551, + "step": 7916 + }, + { + "epoch": 0.7172657470951961, + "grad_norm": 0.7649462819099426, + "learning_rate": 0.0001522020177611309, + "loss": 2.7069, + "step": 7917 + }, + { + "epoch": 0.7173563452696429, + "grad_norm": 0.8219998478889465, + "learning_rate": 0.00015219597656014014, + "loss": 2.8784, + "step": 7918 + }, + { + "epoch": 0.7174469434440895, + "grad_norm": 0.820809006690979, + "learning_rate": 0.0001521899353591494, + "loss": 2.8398, + "step": 7919 + }, + { + "epoch": 0.7175375416185363, + "grad_norm": 0.7471908330917358, + "learning_rate": 0.00015218389415815866, + "loss": 2.6057, + "step": 7920 + }, + { + "epoch": 0.7176281397929831, + "grad_norm": 0.8574039936065674, + "learning_rate": 0.0001521778529571679, + "loss": 2.8308, + "step": 7921 + }, + { + "epoch": 0.7177187379674299, + "grad_norm": 0.7521113753318787, + "learning_rate": 0.00015217181175617713, + "loss": 2.8447, + "step": 7922 + }, + { + "epoch": 0.7178093361418767, + "grad_norm": 0.8330748677253723, + "learning_rate": 0.00015216577055518637, + "loss": 3.2138, + "step": 7923 + }, + { + "epoch": 0.7178999343163235, + "grad_norm": 0.8233445882797241, + "learning_rate": 0.00015215972935419563, + "loss": 3.0386, + "step": 7924 + }, + { + "epoch": 0.7179905324907703, + "grad_norm": 0.7521651983261108, + "learning_rate": 0.00015215368815320486, + "loss": 2.7336, + "step": 7925 + }, + { + "epoch": 0.7180811306652171, + "grad_norm": 0.8500820398330688, + "learning_rate": 0.0001521476469522141, + "loss": 2.8414, + "step": 7926 + }, + { + "epoch": 0.7181717288396638, + "grad_norm": 0.924479067325592, + "learning_rate": 0.00015214160575122336, + "loss": 2.9271, + "step": 7927 + }, + { + "epoch": 0.7182623270141106, + "grad_norm": 0.8110459446907043, + "learning_rate": 0.0001521355645502326, + "loss": 2.7782, + "step": 7928 + }, + { + "epoch": 0.7183529251885574, + "grad_norm": 0.6636673212051392, + "learning_rate": 0.00015212952334924185, + "loss": 2.18, + "step": 7929 + }, + { + "epoch": 0.7184435233630042, + "grad_norm": 0.7800582051277161, + "learning_rate": 0.0001521234821482511, + "loss": 2.9692, + "step": 7930 + }, + { + "epoch": 0.718534121537451, + "grad_norm": 0.6916937232017517, + "learning_rate": 0.00015211744094726032, + "loss": 2.3024, + "step": 7931 + }, + { + "epoch": 0.7186247197118978, + "grad_norm": 0.794119656085968, + "learning_rate": 0.00015211139974626955, + "loss": 2.7895, + "step": 7932 + }, + { + "epoch": 0.7187153178863446, + "grad_norm": 0.7769298553466797, + "learning_rate": 0.00015210535854527882, + "loss": 3.0454, + "step": 7933 + }, + { + "epoch": 0.7188059160607914, + "grad_norm": 0.7768879532814026, + "learning_rate": 0.00015209931734428805, + "loss": 2.6435, + "step": 7934 + }, + { + "epoch": 0.7188965142352381, + "grad_norm": 0.8199659585952759, + "learning_rate": 0.00015209327614329728, + "loss": 3.0773, + "step": 7935 + }, + { + "epoch": 0.7189871124096849, + "grad_norm": 0.8291743397712708, + "learning_rate": 0.00015208723494230655, + "loss": 2.7907, + "step": 7936 + }, + { + "epoch": 0.7190777105841317, + "grad_norm": 0.8200724720954895, + "learning_rate": 0.00015208119374131578, + "loss": 2.8533, + "step": 7937 + }, + { + "epoch": 0.7191683087585785, + "grad_norm": 0.7685272097587585, + "learning_rate": 0.00015207515254032504, + "loss": 2.8375, + "step": 7938 + }, + { + "epoch": 0.7192589069330253, + "grad_norm": 0.6736952662467957, + "learning_rate": 0.00015206911133933425, + "loss": 1.8974, + "step": 7939 + }, + { + "epoch": 0.7193495051074721, + "grad_norm": 0.7906716465950012, + "learning_rate": 0.0001520630701383435, + "loss": 2.9827, + "step": 7940 + }, + { + "epoch": 0.7194401032819189, + "grad_norm": 0.8555290102958679, + "learning_rate": 0.00015205702893735277, + "loss": 2.8791, + "step": 7941 + }, + { + "epoch": 0.7195307014563657, + "grad_norm": 0.7984578013420105, + "learning_rate": 0.000152050987736362, + "loss": 2.8248, + "step": 7942 + }, + { + "epoch": 0.7196212996308124, + "grad_norm": 0.8255154490470886, + "learning_rate": 0.00015204494653537124, + "loss": 2.7422, + "step": 7943 + }, + { + "epoch": 0.7197118978052592, + "grad_norm": 0.8163098096847534, + "learning_rate": 0.00015203890533438047, + "loss": 2.9325, + "step": 7944 + }, + { + "epoch": 0.719802495979706, + "grad_norm": 0.8281236886978149, + "learning_rate": 0.00015203286413338973, + "loss": 2.8667, + "step": 7945 + }, + { + "epoch": 0.7198930941541528, + "grad_norm": 0.8083235621452332, + "learning_rate": 0.00015202682293239897, + "loss": 2.6775, + "step": 7946 + }, + { + "epoch": 0.7199836923285996, + "grad_norm": 0.9867482781410217, + "learning_rate": 0.0001520207817314082, + "loss": 2.7521, + "step": 7947 + }, + { + "epoch": 0.7200742905030464, + "grad_norm": 0.8019083142280579, + "learning_rate": 0.00015201474053041743, + "loss": 2.5797, + "step": 7948 + }, + { + "epoch": 0.7201648886774932, + "grad_norm": 0.871514618396759, + "learning_rate": 0.0001520086993294267, + "loss": 2.7487, + "step": 7949 + }, + { + "epoch": 0.72025548685194, + "grad_norm": 0.8093435764312744, + "learning_rate": 0.00015200265812843596, + "loss": 2.8779, + "step": 7950 + }, + { + "epoch": 0.7203460850263868, + "grad_norm": 0.7968536615371704, + "learning_rate": 0.0001519966169274452, + "loss": 2.5426, + "step": 7951 + }, + { + "epoch": 0.7204366832008335, + "grad_norm": 0.7540304660797119, + "learning_rate": 0.00015199057572645443, + "loss": 2.5187, + "step": 7952 + }, + { + "epoch": 0.7205272813752803, + "grad_norm": 0.7880854606628418, + "learning_rate": 0.00015198453452546366, + "loss": 2.1931, + "step": 7953 + }, + { + "epoch": 0.7206178795497271, + "grad_norm": 0.8600770831108093, + "learning_rate": 0.00015197849332447292, + "loss": 3.0099, + "step": 7954 + }, + { + "epoch": 0.7207084777241739, + "grad_norm": 0.9004960656166077, + "learning_rate": 0.00015197245212348215, + "loss": 2.8031, + "step": 7955 + }, + { + "epoch": 0.7207990758986207, + "grad_norm": 0.7550205588340759, + "learning_rate": 0.0001519664109224914, + "loss": 2.6564, + "step": 7956 + }, + { + "epoch": 0.7208896740730675, + "grad_norm": 0.8121493458747864, + "learning_rate": 0.00015196036972150065, + "loss": 2.998, + "step": 7957 + }, + { + "epoch": 0.7209802722475143, + "grad_norm": 0.8140687942504883, + "learning_rate": 0.00015195432852050988, + "loss": 2.7742, + "step": 7958 + }, + { + "epoch": 0.721070870421961, + "grad_norm": 0.7852210998535156, + "learning_rate": 0.00015194828731951915, + "loss": 2.7823, + "step": 7959 + }, + { + "epoch": 0.7211614685964077, + "grad_norm": 0.8290153741836548, + "learning_rate": 0.00015194224611852835, + "loss": 2.9071, + "step": 7960 + }, + { + "epoch": 0.7212520667708545, + "grad_norm": 0.7669640779495239, + "learning_rate": 0.0001519362049175376, + "loss": 2.8847, + "step": 7961 + }, + { + "epoch": 0.7213426649453013, + "grad_norm": 0.7642165422439575, + "learning_rate": 0.00015193016371654685, + "loss": 2.8524, + "step": 7962 + }, + { + "epoch": 0.7214332631197481, + "grad_norm": 0.8251994252204895, + "learning_rate": 0.0001519241225155561, + "loss": 2.913, + "step": 7963 + }, + { + "epoch": 0.7215238612941949, + "grad_norm": 0.8998884558677673, + "learning_rate": 0.00015191808131456534, + "loss": 2.5798, + "step": 7964 + }, + { + "epoch": 0.7216144594686417, + "grad_norm": 0.7390356659889221, + "learning_rate": 0.00015191204011357458, + "loss": 2.85, + "step": 7965 + }, + { + "epoch": 0.7217050576430885, + "grad_norm": 0.8352174758911133, + "learning_rate": 0.00015190599891258384, + "loss": 2.9081, + "step": 7966 + }, + { + "epoch": 0.7217956558175352, + "grad_norm": 0.8782383799552917, + "learning_rate": 0.00015189995771159307, + "loss": 2.6472, + "step": 7967 + }, + { + "epoch": 0.721886253991982, + "grad_norm": 0.7945238351821899, + "learning_rate": 0.0001518939165106023, + "loss": 2.9566, + "step": 7968 + }, + { + "epoch": 0.7219768521664288, + "grad_norm": 0.7355581521987915, + "learning_rate": 0.00015188787530961154, + "loss": 2.8546, + "step": 7969 + }, + { + "epoch": 0.7220674503408756, + "grad_norm": 0.8017829656600952, + "learning_rate": 0.0001518818341086208, + "loss": 2.914, + "step": 7970 + }, + { + "epoch": 0.7221580485153224, + "grad_norm": 0.744662344455719, + "learning_rate": 0.00015187579290763006, + "loss": 2.6225, + "step": 7971 + }, + { + "epoch": 0.7222486466897692, + "grad_norm": 0.7005282640457153, + "learning_rate": 0.0001518697517066393, + "loss": 2.2391, + "step": 7972 + }, + { + "epoch": 0.722339244864216, + "grad_norm": 0.8197852969169617, + "learning_rate": 0.00015186371050564853, + "loss": 2.6435, + "step": 7973 + }, + { + "epoch": 0.7224298430386628, + "grad_norm": 0.7818012833595276, + "learning_rate": 0.00015185766930465776, + "loss": 2.8102, + "step": 7974 + }, + { + "epoch": 0.7225204412131095, + "grad_norm": 0.8374640941619873, + "learning_rate": 0.00015185162810366703, + "loss": 2.7306, + "step": 7975 + }, + { + "epoch": 0.7226110393875563, + "grad_norm": 0.7934338450431824, + "learning_rate": 0.00015184558690267626, + "loss": 2.8341, + "step": 7976 + }, + { + "epoch": 0.7227016375620031, + "grad_norm": 0.7636714577674866, + "learning_rate": 0.0001518395457016855, + "loss": 2.7917, + "step": 7977 + }, + { + "epoch": 0.7227922357364499, + "grad_norm": 0.7724885940551758, + "learning_rate": 0.00015183350450069473, + "loss": 2.8446, + "step": 7978 + }, + { + "epoch": 0.7228828339108967, + "grad_norm": 0.7915485501289368, + "learning_rate": 0.000151827463299704, + "loss": 2.1853, + "step": 7979 + }, + { + "epoch": 0.7229734320853435, + "grad_norm": 0.7780755758285522, + "learning_rate": 0.00015182142209871325, + "loss": 2.8782, + "step": 7980 + }, + { + "epoch": 0.7230640302597903, + "grad_norm": 0.6934331655502319, + "learning_rate": 0.00015181538089772246, + "loss": 2.0618, + "step": 7981 + }, + { + "epoch": 0.7231546284342371, + "grad_norm": 0.8305524587631226, + "learning_rate": 0.00015180933969673172, + "loss": 2.7432, + "step": 7982 + }, + { + "epoch": 0.7232452266086838, + "grad_norm": 0.8227525949478149, + "learning_rate": 0.00015180329849574095, + "loss": 2.6663, + "step": 7983 + }, + { + "epoch": 0.7233358247831306, + "grad_norm": 0.8256049752235413, + "learning_rate": 0.0001517972572947502, + "loss": 2.8495, + "step": 7984 + }, + { + "epoch": 0.7234264229575774, + "grad_norm": 0.8470938801765442, + "learning_rate": 0.00015179121609375945, + "loss": 2.7339, + "step": 7985 + }, + { + "epoch": 0.7235170211320242, + "grad_norm": 0.7851350903511047, + "learning_rate": 0.00015178517489276868, + "loss": 2.8276, + "step": 7986 + }, + { + "epoch": 0.723607619306471, + "grad_norm": 0.8712218403816223, + "learning_rate": 0.00015177913369177794, + "loss": 2.7755, + "step": 7987 + }, + { + "epoch": 0.7236982174809178, + "grad_norm": 0.7026036381721497, + "learning_rate": 0.00015177309249078718, + "loss": 2.1344, + "step": 7988 + }, + { + "epoch": 0.7237888156553646, + "grad_norm": 0.8137809038162231, + "learning_rate": 0.00015176705128979644, + "loss": 2.3039, + "step": 7989 + }, + { + "epoch": 0.7238794138298114, + "grad_norm": 0.7379831075668335, + "learning_rate": 0.00015176101008880564, + "loss": 1.9505, + "step": 7990 + }, + { + "epoch": 0.7239700120042581, + "grad_norm": 0.7779869437217712, + "learning_rate": 0.0001517549688878149, + "loss": 2.5982, + "step": 7991 + }, + { + "epoch": 0.7240606101787049, + "grad_norm": 0.8222345113754272, + "learning_rate": 0.00015174892768682414, + "loss": 2.7436, + "step": 7992 + }, + { + "epoch": 0.7241512083531517, + "grad_norm": 0.8121806383132935, + "learning_rate": 0.0001517428864858334, + "loss": 2.6802, + "step": 7993 + }, + { + "epoch": 0.7242418065275985, + "grad_norm": 0.7902358174324036, + "learning_rate": 0.00015173684528484264, + "loss": 2.6457, + "step": 7994 + }, + { + "epoch": 0.7243324047020453, + "grad_norm": 0.7635499238967896, + "learning_rate": 0.00015173080408385187, + "loss": 2.7695, + "step": 7995 + }, + { + "epoch": 0.7244230028764921, + "grad_norm": 0.8015406727790833, + "learning_rate": 0.00015172476288286113, + "loss": 2.8042, + "step": 7996 + }, + { + "epoch": 0.7245136010509389, + "grad_norm": 0.8023317456245422, + "learning_rate": 0.00015171872168187036, + "loss": 2.746, + "step": 7997 + }, + { + "epoch": 0.7246041992253857, + "grad_norm": 0.6676498651504517, + "learning_rate": 0.0001517126804808796, + "loss": 2.0944, + "step": 7998 + }, + { + "epoch": 0.7246947973998324, + "grad_norm": 0.7859655618667603, + "learning_rate": 0.00015170663927988883, + "loss": 2.8853, + "step": 7999 + }, + { + "epoch": 0.7247853955742791, + "grad_norm": 0.7699310779571533, + "learning_rate": 0.0001517005980788981, + "loss": 2.494, + "step": 8000 + }, + { + "epoch": 0.7248759937487259, + "grad_norm": 0.8139808177947998, + "learning_rate": 0.00015169455687790735, + "loss": 2.7534, + "step": 8001 + }, + { + "epoch": 0.7249665919231727, + "grad_norm": 0.8189457654953003, + "learning_rate": 0.0001516885156769166, + "loss": 2.7673, + "step": 8002 + }, + { + "epoch": 0.7250571900976195, + "grad_norm": 0.7980149388313293, + "learning_rate": 0.00015168247447592582, + "loss": 2.84, + "step": 8003 + }, + { + "epoch": 0.7251477882720663, + "grad_norm": 0.8251537084579468, + "learning_rate": 0.00015167643327493506, + "loss": 2.9807, + "step": 8004 + }, + { + "epoch": 0.7252383864465131, + "grad_norm": 0.8198680877685547, + "learning_rate": 0.00015167039207394432, + "loss": 2.8982, + "step": 8005 + }, + { + "epoch": 0.7253289846209598, + "grad_norm": 0.7921887040138245, + "learning_rate": 0.00015166435087295355, + "loss": 2.8439, + "step": 8006 + }, + { + "epoch": 0.7254195827954066, + "grad_norm": 0.7981971502304077, + "learning_rate": 0.00015165830967196279, + "loss": 2.5894, + "step": 8007 + }, + { + "epoch": 0.7255101809698534, + "grad_norm": 0.7453425526618958, + "learning_rate": 0.00015165226847097205, + "loss": 2.6144, + "step": 8008 + }, + { + "epoch": 0.7256007791443002, + "grad_norm": 0.7786526679992676, + "learning_rate": 0.00015164622726998128, + "loss": 2.8745, + "step": 8009 + }, + { + "epoch": 0.725691377318747, + "grad_norm": 0.7711648344993591, + "learning_rate": 0.00015164018606899054, + "loss": 3.0019, + "step": 8010 + }, + { + "epoch": 0.7257819754931938, + "grad_norm": 0.7882023453712463, + "learning_rate": 0.00015163414486799975, + "loss": 2.8506, + "step": 8011 + }, + { + "epoch": 0.7258725736676406, + "grad_norm": 0.8007714748382568, + "learning_rate": 0.000151628103667009, + "loss": 2.7217, + "step": 8012 + }, + { + "epoch": 0.7259631718420874, + "grad_norm": 0.8703241348266602, + "learning_rate": 0.00015162206246601824, + "loss": 2.9889, + "step": 8013 + }, + { + "epoch": 0.7260537700165342, + "grad_norm": 0.7522998452186584, + "learning_rate": 0.0001516160212650275, + "loss": 2.5017, + "step": 8014 + }, + { + "epoch": 0.7261443681909809, + "grad_norm": 0.7699607014656067, + "learning_rate": 0.00015160998006403674, + "loss": 2.7669, + "step": 8015 + }, + { + "epoch": 0.7262349663654277, + "grad_norm": 0.7643697261810303, + "learning_rate": 0.00015160393886304597, + "loss": 2.7192, + "step": 8016 + }, + { + "epoch": 0.7263255645398745, + "grad_norm": 0.8011612892150879, + "learning_rate": 0.00015159789766205524, + "loss": 2.7562, + "step": 8017 + }, + { + "epoch": 0.7264161627143213, + "grad_norm": 0.8892515897750854, + "learning_rate": 0.00015159185646106447, + "loss": 2.9006, + "step": 8018 + }, + { + "epoch": 0.7265067608887681, + "grad_norm": 0.6375832557678223, + "learning_rate": 0.0001515858152600737, + "loss": 1.8886, + "step": 8019 + }, + { + "epoch": 0.7265973590632149, + "grad_norm": 0.7510844469070435, + "learning_rate": 0.00015157977405908294, + "loss": 2.0197, + "step": 8020 + }, + { + "epoch": 0.7266879572376617, + "grad_norm": 0.8424631357192993, + "learning_rate": 0.0001515737328580922, + "loss": 3.1669, + "step": 8021 + }, + { + "epoch": 0.7267785554121085, + "grad_norm": 0.7318916320800781, + "learning_rate": 0.00015156769165710143, + "loss": 2.5871, + "step": 8022 + }, + { + "epoch": 0.7268691535865552, + "grad_norm": 0.842099666595459, + "learning_rate": 0.0001515616504561107, + "loss": 2.74, + "step": 8023 + }, + { + "epoch": 0.726959751761002, + "grad_norm": 0.7925246357917786, + "learning_rate": 0.00015155560925511993, + "loss": 2.7438, + "step": 8024 + }, + { + "epoch": 0.7270503499354488, + "grad_norm": 0.8313971757888794, + "learning_rate": 0.00015154956805412916, + "loss": 2.1203, + "step": 8025 + }, + { + "epoch": 0.7271409481098956, + "grad_norm": 0.7562773823738098, + "learning_rate": 0.00015154352685313842, + "loss": 2.8071, + "step": 8026 + }, + { + "epoch": 0.7272315462843424, + "grad_norm": 0.7999769449234009, + "learning_rate": 0.00015153748565214766, + "loss": 2.7775, + "step": 8027 + }, + { + "epoch": 0.7273221444587892, + "grad_norm": 0.8449512124061584, + "learning_rate": 0.0001515314444511569, + "loss": 2.8928, + "step": 8028 + }, + { + "epoch": 0.727412742633236, + "grad_norm": 0.768398642539978, + "learning_rate": 0.00015152540325016613, + "loss": 2.698, + "step": 8029 + }, + { + "epoch": 0.7275033408076828, + "grad_norm": 0.7446209788322449, + "learning_rate": 0.00015151936204917539, + "loss": 2.6917, + "step": 8030 + }, + { + "epoch": 0.7275939389821295, + "grad_norm": 0.7828193306922913, + "learning_rate": 0.00015151332084818465, + "loss": 2.9215, + "step": 8031 + }, + { + "epoch": 0.7276845371565763, + "grad_norm": 0.8132421374320984, + "learning_rate": 0.00015150727964719385, + "loss": 2.7849, + "step": 8032 + }, + { + "epoch": 0.7277751353310231, + "grad_norm": 0.7638842463493347, + "learning_rate": 0.00015150123844620312, + "loss": 2.6905, + "step": 8033 + }, + { + "epoch": 0.7278657335054699, + "grad_norm": 0.761504590511322, + "learning_rate": 0.00015149519724521235, + "loss": 2.7877, + "step": 8034 + }, + { + "epoch": 0.7279563316799167, + "grad_norm": 0.7312592267990112, + "learning_rate": 0.0001514891560442216, + "loss": 2.5281, + "step": 8035 + }, + { + "epoch": 0.7280469298543635, + "grad_norm": 0.7807655930519104, + "learning_rate": 0.00015148311484323084, + "loss": 2.7251, + "step": 8036 + }, + { + "epoch": 0.7281375280288103, + "grad_norm": 0.753803014755249, + "learning_rate": 0.00015147707364224008, + "loss": 2.5778, + "step": 8037 + }, + { + "epoch": 0.728228126203257, + "grad_norm": 0.7852795720100403, + "learning_rate": 0.00015147103244124934, + "loss": 2.6938, + "step": 8038 + }, + { + "epoch": 0.7283187243777038, + "grad_norm": 0.7564572095870972, + "learning_rate": 0.00015146499124025857, + "loss": 2.3956, + "step": 8039 + }, + { + "epoch": 0.7284093225521506, + "grad_norm": 0.8124843835830688, + "learning_rate": 0.00015145895003926784, + "loss": 2.8939, + "step": 8040 + }, + { + "epoch": 0.7284999207265973, + "grad_norm": 0.7567859888076782, + "learning_rate": 0.00015145290883827704, + "loss": 2.7894, + "step": 8041 + }, + { + "epoch": 0.7285905189010441, + "grad_norm": 0.8462041616439819, + "learning_rate": 0.0001514468676372863, + "loss": 2.7059, + "step": 8042 + }, + { + "epoch": 0.7286811170754909, + "grad_norm": 0.7630500197410583, + "learning_rate": 0.00015144082643629554, + "loss": 2.8134, + "step": 8043 + }, + { + "epoch": 0.7287717152499377, + "grad_norm": 0.7636144161224365, + "learning_rate": 0.0001514347852353048, + "loss": 2.8195, + "step": 8044 + }, + { + "epoch": 0.7288623134243845, + "grad_norm": 0.8082332015037537, + "learning_rate": 0.000151428744034314, + "loss": 2.8336, + "step": 8045 + }, + { + "epoch": 0.7289529115988312, + "grad_norm": 0.7665285468101501, + "learning_rate": 0.00015142270283332327, + "loss": 2.8097, + "step": 8046 + }, + { + "epoch": 0.729043509773278, + "grad_norm": 0.7977038621902466, + "learning_rate": 0.00015141666163233253, + "loss": 2.9964, + "step": 8047 + }, + { + "epoch": 0.7291341079477248, + "grad_norm": 0.8072904348373413, + "learning_rate": 0.00015141062043134176, + "loss": 2.843, + "step": 8048 + }, + { + "epoch": 0.7292247061221716, + "grad_norm": 0.675471305847168, + "learning_rate": 0.000151404579230351, + "loss": 2.0529, + "step": 8049 + }, + { + "epoch": 0.7293153042966184, + "grad_norm": 0.7145544290542603, + "learning_rate": 0.00015139853802936023, + "loss": 2.2994, + "step": 8050 + }, + { + "epoch": 0.7294059024710652, + "grad_norm": 0.7222967147827148, + "learning_rate": 0.0001513924968283695, + "loss": 2.7539, + "step": 8051 + }, + { + "epoch": 0.729496500645512, + "grad_norm": 0.8886818885803223, + "learning_rate": 0.00015138645562737873, + "loss": 3.0369, + "step": 8052 + }, + { + "epoch": 0.7295870988199588, + "grad_norm": 0.7434291243553162, + "learning_rate": 0.00015138041442638799, + "loss": 2.5136, + "step": 8053 + }, + { + "epoch": 0.7296776969944055, + "grad_norm": 0.7984306812286377, + "learning_rate": 0.00015137437322539722, + "loss": 2.7699, + "step": 8054 + }, + { + "epoch": 0.7297682951688523, + "grad_norm": 0.903583288192749, + "learning_rate": 0.00015136833202440645, + "loss": 2.8588, + "step": 8055 + }, + { + "epoch": 0.7298588933432991, + "grad_norm": 0.7800313830375671, + "learning_rate": 0.00015136229082341572, + "loss": 2.686, + "step": 8056 + }, + { + "epoch": 0.7299494915177459, + "grad_norm": 0.734288215637207, + "learning_rate": 0.00015135624962242495, + "loss": 2.8006, + "step": 8057 + }, + { + "epoch": 0.7300400896921927, + "grad_norm": 0.7994083166122437, + "learning_rate": 0.00015135020842143418, + "loss": 2.649, + "step": 8058 + }, + { + "epoch": 0.7301306878666395, + "grad_norm": 0.7459543347358704, + "learning_rate": 0.00015134416722044342, + "loss": 2.8097, + "step": 8059 + }, + { + "epoch": 0.7302212860410863, + "grad_norm": 0.8062729835510254, + "learning_rate": 0.00015133812601945268, + "loss": 2.7649, + "step": 8060 + }, + { + "epoch": 0.7303118842155331, + "grad_norm": 0.8051264882087708, + "learning_rate": 0.00015133208481846194, + "loss": 2.8587, + "step": 8061 + }, + { + "epoch": 0.7304024823899798, + "grad_norm": 0.8425242900848389, + "learning_rate": 0.00015132604361747115, + "loss": 2.8767, + "step": 8062 + }, + { + "epoch": 0.7304930805644266, + "grad_norm": 0.7666087746620178, + "learning_rate": 0.0001513200024164804, + "loss": 2.7051, + "step": 8063 + }, + { + "epoch": 0.7305836787388734, + "grad_norm": 0.5782164335250854, + "learning_rate": 0.00015131396121548964, + "loss": 1.3568, + "step": 8064 + }, + { + "epoch": 0.7306742769133202, + "grad_norm": 0.7774218320846558, + "learning_rate": 0.0001513079200144989, + "loss": 2.8095, + "step": 8065 + }, + { + "epoch": 0.730764875087767, + "grad_norm": 0.7901633977890015, + "learning_rate": 0.00015130187881350814, + "loss": 2.8662, + "step": 8066 + }, + { + "epoch": 0.7308554732622138, + "grad_norm": 0.6577836871147156, + "learning_rate": 0.00015129583761251737, + "loss": 2.0663, + "step": 8067 + }, + { + "epoch": 0.7309460714366606, + "grad_norm": 0.7710526585578918, + "learning_rate": 0.00015128979641152663, + "loss": 2.5178, + "step": 8068 + }, + { + "epoch": 0.7310366696111074, + "grad_norm": 0.7493706941604614, + "learning_rate": 0.00015128375521053587, + "loss": 2.251, + "step": 8069 + }, + { + "epoch": 0.7311272677855541, + "grad_norm": 0.8587595224380493, + "learning_rate": 0.0001512777140095451, + "loss": 2.7742, + "step": 8070 + }, + { + "epoch": 0.7312178659600009, + "grad_norm": 0.7653665542602539, + "learning_rate": 0.00015127167280855433, + "loss": 2.7403, + "step": 8071 + }, + { + "epoch": 0.7313084641344477, + "grad_norm": 1.0834550857543945, + "learning_rate": 0.0001512656316075636, + "loss": 2.5012, + "step": 8072 + }, + { + "epoch": 0.7313990623088945, + "grad_norm": 0.7888209819793701, + "learning_rate": 0.00015125959040657283, + "loss": 3.0398, + "step": 8073 + }, + { + "epoch": 0.7314896604833413, + "grad_norm": 0.829411506652832, + "learning_rate": 0.0001512535492055821, + "loss": 2.657, + "step": 8074 + }, + { + "epoch": 0.7315802586577881, + "grad_norm": 0.7830390930175781, + "learning_rate": 0.0001512475080045913, + "loss": 2.7741, + "step": 8075 + }, + { + "epoch": 0.7316708568322349, + "grad_norm": 0.6969447731971741, + "learning_rate": 0.00015124146680360056, + "loss": 2.3566, + "step": 8076 + }, + { + "epoch": 0.7317614550066817, + "grad_norm": 0.8352934718132019, + "learning_rate": 0.00015123542560260982, + "loss": 2.9523, + "step": 8077 + }, + { + "epoch": 0.7318520531811284, + "grad_norm": 0.804000735282898, + "learning_rate": 0.00015122938440161905, + "loss": 2.8614, + "step": 8078 + }, + { + "epoch": 0.7319426513555752, + "grad_norm": 0.8365309834480286, + "learning_rate": 0.0001512233432006283, + "loss": 2.9903, + "step": 8079 + }, + { + "epoch": 0.732033249530022, + "grad_norm": 0.7942440509796143, + "learning_rate": 0.00015121730199963752, + "loss": 2.7183, + "step": 8080 + }, + { + "epoch": 0.7321238477044687, + "grad_norm": 0.8065480589866638, + "learning_rate": 0.00015121126079864678, + "loss": 2.8357, + "step": 8081 + }, + { + "epoch": 0.7322144458789155, + "grad_norm": 0.7666229605674744, + "learning_rate": 0.00015120521959765602, + "loss": 2.5862, + "step": 8082 + }, + { + "epoch": 0.7323050440533623, + "grad_norm": 0.8118290901184082, + "learning_rate": 0.00015119917839666525, + "loss": 2.8198, + "step": 8083 + }, + { + "epoch": 0.7323956422278091, + "grad_norm": 0.8087276220321655, + "learning_rate": 0.0001511931371956745, + "loss": 2.5965, + "step": 8084 + }, + { + "epoch": 0.7324862404022559, + "grad_norm": 0.8516177535057068, + "learning_rate": 0.00015118709599468375, + "loss": 2.8209, + "step": 8085 + }, + { + "epoch": 0.7325768385767026, + "grad_norm": 0.7744507789611816, + "learning_rate": 0.000151181054793693, + "loss": 2.682, + "step": 8086 + }, + { + "epoch": 0.7326674367511494, + "grad_norm": 0.8935883641242981, + "learning_rate": 0.00015117501359270224, + "loss": 2.801, + "step": 8087 + }, + { + "epoch": 0.7327580349255962, + "grad_norm": 0.8209714889526367, + "learning_rate": 0.00015116897239171148, + "loss": 2.5208, + "step": 8088 + }, + { + "epoch": 0.732848633100043, + "grad_norm": 0.8326624035835266, + "learning_rate": 0.0001511629311907207, + "loss": 2.7641, + "step": 8089 + }, + { + "epoch": 0.7329392312744898, + "grad_norm": 0.8379979729652405, + "learning_rate": 0.00015115688998972997, + "loss": 2.8974, + "step": 8090 + }, + { + "epoch": 0.7330298294489366, + "grad_norm": 0.8416929841041565, + "learning_rate": 0.0001511508487887392, + "loss": 3.0637, + "step": 8091 + }, + { + "epoch": 0.7331204276233834, + "grad_norm": 0.8635523915290833, + "learning_rate": 0.00015114480758774844, + "loss": 2.855, + "step": 8092 + }, + { + "epoch": 0.7332110257978302, + "grad_norm": 0.7678226828575134, + "learning_rate": 0.0001511387663867577, + "loss": 2.9572, + "step": 8093 + }, + { + "epoch": 0.7333016239722769, + "grad_norm": 0.7894672155380249, + "learning_rate": 0.00015113272518576693, + "loss": 2.7321, + "step": 8094 + }, + { + "epoch": 0.7333922221467237, + "grad_norm": 0.7564792633056641, + "learning_rate": 0.0001511266839847762, + "loss": 2.9961, + "step": 8095 + }, + { + "epoch": 0.7334828203211705, + "grad_norm": 0.8007658123970032, + "learning_rate": 0.0001511206427837854, + "loss": 2.8604, + "step": 8096 + }, + { + "epoch": 0.7335734184956173, + "grad_norm": 0.7798882722854614, + "learning_rate": 0.00015111460158279466, + "loss": 2.9971, + "step": 8097 + }, + { + "epoch": 0.7336640166700641, + "grad_norm": 0.7558030486106873, + "learning_rate": 0.00015110856038180393, + "loss": 2.7371, + "step": 8098 + }, + { + "epoch": 0.7337546148445109, + "grad_norm": 0.76128751039505, + "learning_rate": 0.00015110251918081316, + "loss": 3.2381, + "step": 8099 + }, + { + "epoch": 0.7338452130189577, + "grad_norm": 0.8326403498649597, + "learning_rate": 0.0001510964779798224, + "loss": 2.722, + "step": 8100 + }, + { + "epoch": 0.7339358111934045, + "grad_norm": 0.8082150816917419, + "learning_rate": 0.00015109043677883163, + "loss": 2.7916, + "step": 8101 + }, + { + "epoch": 0.7340264093678512, + "grad_norm": 0.7031673192977905, + "learning_rate": 0.0001510843955778409, + "loss": 2.2143, + "step": 8102 + }, + { + "epoch": 0.734117007542298, + "grad_norm": 0.8176169991493225, + "learning_rate": 0.00015107835437685012, + "loss": 2.897, + "step": 8103 + }, + { + "epoch": 0.7342076057167448, + "grad_norm": 0.8314171433448792, + "learning_rate": 0.00015107231317585936, + "loss": 3.0706, + "step": 8104 + }, + { + "epoch": 0.7342982038911916, + "grad_norm": 0.7718626856803894, + "learning_rate": 0.0001510662719748686, + "loss": 2.7626, + "step": 8105 + }, + { + "epoch": 0.7343888020656384, + "grad_norm": 0.8511891961097717, + "learning_rate": 0.00015106023077387785, + "loss": 2.7415, + "step": 8106 + }, + { + "epoch": 0.7344794002400852, + "grad_norm": 0.800544023513794, + "learning_rate": 0.0001510541895728871, + "loss": 2.714, + "step": 8107 + }, + { + "epoch": 0.734569998414532, + "grad_norm": 0.8550249338150024, + "learning_rate": 0.00015104814837189635, + "loss": 2.8673, + "step": 8108 + }, + { + "epoch": 0.7346605965889788, + "grad_norm": 0.8807050585746765, + "learning_rate": 0.00015104210717090558, + "loss": 2.7637, + "step": 8109 + }, + { + "epoch": 0.7347511947634255, + "grad_norm": 0.7423498034477234, + "learning_rate": 0.00015103606596991482, + "loss": 2.8075, + "step": 8110 + }, + { + "epoch": 0.7348417929378723, + "grad_norm": 0.8794116377830505, + "learning_rate": 0.00015103002476892408, + "loss": 2.7731, + "step": 8111 + }, + { + "epoch": 0.7349323911123191, + "grad_norm": 0.7563779354095459, + "learning_rate": 0.0001510239835679333, + "loss": 2.8988, + "step": 8112 + }, + { + "epoch": 0.7350229892867659, + "grad_norm": 0.8731781244277954, + "learning_rate": 0.00015101794236694254, + "loss": 2.7733, + "step": 8113 + }, + { + "epoch": 0.7351135874612127, + "grad_norm": 0.7963098287582397, + "learning_rate": 0.0001510119011659518, + "loss": 2.8788, + "step": 8114 + }, + { + "epoch": 0.7352041856356595, + "grad_norm": 0.8374744653701782, + "learning_rate": 0.00015100585996496104, + "loss": 2.2868, + "step": 8115 + }, + { + "epoch": 0.7352947838101063, + "grad_norm": 0.7495047450065613, + "learning_rate": 0.0001509998187639703, + "loss": 2.7216, + "step": 8116 + }, + { + "epoch": 0.735385381984553, + "grad_norm": 0.7969992160797119, + "learning_rate": 0.00015099377756297953, + "loss": 2.8815, + "step": 8117 + }, + { + "epoch": 0.7354759801589998, + "grad_norm": 0.7918328642845154, + "learning_rate": 0.00015098773636198877, + "loss": 3.0426, + "step": 8118 + }, + { + "epoch": 0.7355665783334466, + "grad_norm": 0.7801067233085632, + "learning_rate": 0.000150981695160998, + "loss": 2.996, + "step": 8119 + }, + { + "epoch": 0.7356571765078934, + "grad_norm": 0.7649794816970825, + "learning_rate": 0.00015097565396000726, + "loss": 2.6069, + "step": 8120 + }, + { + "epoch": 0.7357477746823402, + "grad_norm": 0.7359362244606018, + "learning_rate": 0.0001509696127590165, + "loss": 2.7092, + "step": 8121 + }, + { + "epoch": 0.7358383728567869, + "grad_norm": 0.8884406685829163, + "learning_rate": 0.00015096357155802573, + "loss": 2.7236, + "step": 8122 + }, + { + "epoch": 0.7359289710312337, + "grad_norm": 0.8623157739639282, + "learning_rate": 0.000150957530357035, + "loss": 3.041, + "step": 8123 + }, + { + "epoch": 0.7360195692056805, + "grad_norm": 0.7882740497589111, + "learning_rate": 0.00015095148915604423, + "loss": 2.8731, + "step": 8124 + }, + { + "epoch": 0.7361101673801272, + "grad_norm": 0.7338725328445435, + "learning_rate": 0.0001509454479550535, + "loss": 2.3467, + "step": 8125 + }, + { + "epoch": 0.736200765554574, + "grad_norm": 0.6756443977355957, + "learning_rate": 0.0001509394067540627, + "loss": 1.7711, + "step": 8126 + }, + { + "epoch": 0.7362913637290208, + "grad_norm": 0.7424505352973938, + "learning_rate": 0.00015093336555307196, + "loss": 2.7533, + "step": 8127 + }, + { + "epoch": 0.7363819619034676, + "grad_norm": 0.8601559400558472, + "learning_rate": 0.00015092732435208122, + "loss": 3.1101, + "step": 8128 + }, + { + "epoch": 0.7364725600779144, + "grad_norm": 0.751442551612854, + "learning_rate": 0.00015092128315109045, + "loss": 2.4992, + "step": 8129 + }, + { + "epoch": 0.7365631582523612, + "grad_norm": 0.7789798378944397, + "learning_rate": 0.00015091524195009969, + "loss": 2.771, + "step": 8130 + }, + { + "epoch": 0.736653756426808, + "grad_norm": 0.7982369661331177, + "learning_rate": 0.00015090920074910892, + "loss": 2.6206, + "step": 8131 + }, + { + "epoch": 0.7367443546012548, + "grad_norm": 0.7583799362182617, + "learning_rate": 0.00015090315954811818, + "loss": 2.7348, + "step": 8132 + }, + { + "epoch": 0.7368349527757015, + "grad_norm": 0.8267589211463928, + "learning_rate": 0.00015089711834712742, + "loss": 2.8476, + "step": 8133 + }, + { + "epoch": 0.7369255509501483, + "grad_norm": 0.7858834266662598, + "learning_rate": 0.00015089107714613665, + "loss": 2.8168, + "step": 8134 + }, + { + "epoch": 0.7370161491245951, + "grad_norm": 0.8483680486679077, + "learning_rate": 0.00015088503594514588, + "loss": 3.065, + "step": 8135 + }, + { + "epoch": 0.7371067472990419, + "grad_norm": 0.7776409387588501, + "learning_rate": 0.00015087899474415514, + "loss": 2.9092, + "step": 8136 + }, + { + "epoch": 0.7371973454734887, + "grad_norm": 0.6970564723014832, + "learning_rate": 0.0001508729535431644, + "loss": 1.972, + "step": 8137 + }, + { + "epoch": 0.7372879436479355, + "grad_norm": 0.7956661581993103, + "learning_rate": 0.00015086691234217364, + "loss": 2.6438, + "step": 8138 + }, + { + "epoch": 0.7373785418223823, + "grad_norm": 0.7764764428138733, + "learning_rate": 0.00015086087114118287, + "loss": 2.9132, + "step": 8139 + }, + { + "epoch": 0.7374691399968291, + "grad_norm": 0.8249651789665222, + "learning_rate": 0.0001508548299401921, + "loss": 2.9428, + "step": 8140 + }, + { + "epoch": 0.7375597381712758, + "grad_norm": 0.6682234406471252, + "learning_rate": 0.00015084878873920137, + "loss": 1.9138, + "step": 8141 + }, + { + "epoch": 0.7376503363457226, + "grad_norm": 0.7362225651741028, + "learning_rate": 0.0001508427475382106, + "loss": 2.7573, + "step": 8142 + }, + { + "epoch": 0.7377409345201694, + "grad_norm": 0.7749296426773071, + "learning_rate": 0.00015083670633721984, + "loss": 2.878, + "step": 8143 + }, + { + "epoch": 0.7378315326946162, + "grad_norm": 0.7829998135566711, + "learning_rate": 0.0001508306651362291, + "loss": 2.5595, + "step": 8144 + }, + { + "epoch": 0.737922130869063, + "grad_norm": 0.8021520376205444, + "learning_rate": 0.00015082462393523833, + "loss": 2.7804, + "step": 8145 + }, + { + "epoch": 0.7380127290435098, + "grad_norm": 0.8018382787704468, + "learning_rate": 0.0001508185827342476, + "loss": 2.7549, + "step": 8146 + }, + { + "epoch": 0.7381033272179566, + "grad_norm": 0.7971715331077576, + "learning_rate": 0.0001508125415332568, + "loss": 2.9256, + "step": 8147 + }, + { + "epoch": 0.7381939253924034, + "grad_norm": 0.8214972615242004, + "learning_rate": 0.00015080650033226606, + "loss": 2.8892, + "step": 8148 + }, + { + "epoch": 0.7382845235668501, + "grad_norm": 0.7814722061157227, + "learning_rate": 0.0001508004591312753, + "loss": 2.8677, + "step": 8149 + }, + { + "epoch": 0.7383751217412969, + "grad_norm": 0.7490960955619812, + "learning_rate": 0.00015079441793028456, + "loss": 2.7137, + "step": 8150 + }, + { + "epoch": 0.7384657199157437, + "grad_norm": 0.7475077509880066, + "learning_rate": 0.0001507883767292938, + "loss": 2.7846, + "step": 8151 + }, + { + "epoch": 0.7385563180901905, + "grad_norm": 0.7122424244880676, + "learning_rate": 0.00015078233552830302, + "loss": 2.2351, + "step": 8152 + }, + { + "epoch": 0.7386469162646373, + "grad_norm": 0.7815283536911011, + "learning_rate": 0.00015077629432731229, + "loss": 2.4943, + "step": 8153 + }, + { + "epoch": 0.7387375144390841, + "grad_norm": 0.6564634442329407, + "learning_rate": 0.00015077025312632152, + "loss": 2.0051, + "step": 8154 + }, + { + "epoch": 0.7388281126135309, + "grad_norm": 0.8062174320220947, + "learning_rate": 0.00015076421192533075, + "loss": 2.6347, + "step": 8155 + }, + { + "epoch": 0.7389187107879777, + "grad_norm": 0.7254067659378052, + "learning_rate": 0.00015075817072434, + "loss": 2.7288, + "step": 8156 + }, + { + "epoch": 0.7390093089624244, + "grad_norm": 0.8018463253974915, + "learning_rate": 0.00015075212952334925, + "loss": 2.7228, + "step": 8157 + }, + { + "epoch": 0.7390999071368712, + "grad_norm": 0.7695730924606323, + "learning_rate": 0.0001507460883223585, + "loss": 2.6778, + "step": 8158 + }, + { + "epoch": 0.739190505311318, + "grad_norm": 0.7925087213516235, + "learning_rate": 0.00015074004712136774, + "loss": 2.8076, + "step": 8159 + }, + { + "epoch": 0.7392811034857648, + "grad_norm": 0.7158840894699097, + "learning_rate": 0.00015073400592037698, + "loss": 2.2133, + "step": 8160 + }, + { + "epoch": 0.7393717016602116, + "grad_norm": 0.8070570230484009, + "learning_rate": 0.0001507279647193862, + "loss": 2.7574, + "step": 8161 + }, + { + "epoch": 0.7394622998346583, + "grad_norm": 0.7042269110679626, + "learning_rate": 0.00015072192351839547, + "loss": 2.0871, + "step": 8162 + }, + { + "epoch": 0.7395528980091051, + "grad_norm": 0.8776463866233826, + "learning_rate": 0.0001507158823174047, + "loss": 2.8422, + "step": 8163 + }, + { + "epoch": 0.7396434961835519, + "grad_norm": 0.774111807346344, + "learning_rate": 0.00015070984111641394, + "loss": 2.8615, + "step": 8164 + }, + { + "epoch": 0.7397340943579986, + "grad_norm": 0.715196967124939, + "learning_rate": 0.00015070379991542318, + "loss": 2.7407, + "step": 8165 + }, + { + "epoch": 0.7398246925324454, + "grad_norm": 0.6899030804634094, + "learning_rate": 0.00015069775871443244, + "loss": 2.1466, + "step": 8166 + }, + { + "epoch": 0.7399152907068922, + "grad_norm": 0.6981901526451111, + "learning_rate": 0.0001506917175134417, + "loss": 2.0928, + "step": 8167 + }, + { + "epoch": 0.740005888881339, + "grad_norm": 0.7781512141227722, + "learning_rate": 0.0001506856763124509, + "loss": 2.7078, + "step": 8168 + }, + { + "epoch": 0.7400964870557858, + "grad_norm": 0.7938280701637268, + "learning_rate": 0.00015067963511146017, + "loss": 2.8789, + "step": 8169 + }, + { + "epoch": 0.7401870852302326, + "grad_norm": 0.8007157444953918, + "learning_rate": 0.0001506735939104694, + "loss": 2.5221, + "step": 8170 + }, + { + "epoch": 0.7402776834046794, + "grad_norm": 0.8025911450386047, + "learning_rate": 0.00015066755270947866, + "loss": 2.5787, + "step": 8171 + }, + { + "epoch": 0.7403682815791262, + "grad_norm": 0.7926928997039795, + "learning_rate": 0.0001506615115084879, + "loss": 2.8084, + "step": 8172 + }, + { + "epoch": 0.7404588797535729, + "grad_norm": 0.8190704584121704, + "learning_rate": 0.00015065547030749713, + "loss": 2.9842, + "step": 8173 + }, + { + "epoch": 0.7405494779280197, + "grad_norm": 0.7815862894058228, + "learning_rate": 0.0001506494291065064, + "loss": 2.7548, + "step": 8174 + }, + { + "epoch": 0.7406400761024665, + "grad_norm": 0.8195329308509827, + "learning_rate": 0.00015064338790551563, + "loss": 2.718, + "step": 8175 + }, + { + "epoch": 0.7407306742769133, + "grad_norm": 0.8405175805091858, + "learning_rate": 0.00015063734670452489, + "loss": 2.8125, + "step": 8176 + }, + { + "epoch": 0.7408212724513601, + "grad_norm": 0.7892889976501465, + "learning_rate": 0.0001506313055035341, + "loss": 2.9933, + "step": 8177 + }, + { + "epoch": 0.7409118706258069, + "grad_norm": 0.7798622846603394, + "learning_rate": 0.00015062526430254335, + "loss": 2.5253, + "step": 8178 + }, + { + "epoch": 0.7410024688002537, + "grad_norm": 0.7784885168075562, + "learning_rate": 0.0001506192231015526, + "loss": 2.8295, + "step": 8179 + }, + { + "epoch": 0.7410930669747005, + "grad_norm": 0.7811000943183899, + "learning_rate": 0.00015061318190056185, + "loss": 3.0139, + "step": 8180 + }, + { + "epoch": 0.7411836651491472, + "grad_norm": 0.8312163352966309, + "learning_rate": 0.00015060714069957108, + "loss": 2.9946, + "step": 8181 + }, + { + "epoch": 0.741274263323594, + "grad_norm": 0.7683334946632385, + "learning_rate": 0.00015060109949858032, + "loss": 2.6466, + "step": 8182 + }, + { + "epoch": 0.7413648614980408, + "grad_norm": 0.7631383538246155, + "learning_rate": 0.00015059505829758958, + "loss": 2.7414, + "step": 8183 + }, + { + "epoch": 0.7414554596724876, + "grad_norm": 0.7350941896438599, + "learning_rate": 0.0001505890170965988, + "loss": 2.6647, + "step": 8184 + }, + { + "epoch": 0.7415460578469344, + "grad_norm": 0.8027393221855164, + "learning_rate": 0.00015058297589560805, + "loss": 2.8751, + "step": 8185 + }, + { + "epoch": 0.7416366560213812, + "grad_norm": 0.7778029441833496, + "learning_rate": 0.00015057693469461728, + "loss": 2.8707, + "step": 8186 + }, + { + "epoch": 0.741727254195828, + "grad_norm": 0.7368156909942627, + "learning_rate": 0.00015057089349362654, + "loss": 2.9308, + "step": 8187 + }, + { + "epoch": 0.7418178523702748, + "grad_norm": 0.7934113144874573, + "learning_rate": 0.0001505648522926358, + "loss": 2.689, + "step": 8188 + }, + { + "epoch": 0.7419084505447215, + "grad_norm": 0.748843252658844, + "learning_rate": 0.00015055881109164504, + "loss": 2.8146, + "step": 8189 + }, + { + "epoch": 0.7419990487191683, + "grad_norm": 0.7970103621482849, + "learning_rate": 0.00015055276989065427, + "loss": 2.9256, + "step": 8190 + }, + { + "epoch": 0.7420896468936151, + "grad_norm": 0.7941340804100037, + "learning_rate": 0.0001505467286896635, + "loss": 2.6205, + "step": 8191 + }, + { + "epoch": 0.7421802450680619, + "grad_norm": 0.809718668460846, + "learning_rate": 0.00015054068748867277, + "loss": 2.7141, + "step": 8192 + }, + { + "epoch": 0.7422708432425087, + "grad_norm": 0.8340876698493958, + "learning_rate": 0.000150534646287682, + "loss": 2.7784, + "step": 8193 + }, + { + "epoch": 0.7423614414169555, + "grad_norm": 0.8394103050231934, + "learning_rate": 0.00015052860508669123, + "loss": 2.9026, + "step": 8194 + }, + { + "epoch": 0.7424520395914023, + "grad_norm": 0.6991639733314514, + "learning_rate": 0.00015052256388570047, + "loss": 2.1859, + "step": 8195 + }, + { + "epoch": 0.7425426377658491, + "grad_norm": 0.8926108479499817, + "learning_rate": 0.00015051652268470973, + "loss": 2.764, + "step": 8196 + }, + { + "epoch": 0.7426332359402958, + "grad_norm": 0.8256886601448059, + "learning_rate": 0.000150510481483719, + "loss": 3.023, + "step": 8197 + }, + { + "epoch": 0.7427238341147426, + "grad_norm": 0.7117676734924316, + "learning_rate": 0.0001505044402827282, + "loss": 2.6916, + "step": 8198 + }, + { + "epoch": 0.7428144322891894, + "grad_norm": 0.8022922873497009, + "learning_rate": 0.00015049839908173746, + "loss": 2.8156, + "step": 8199 + }, + { + "epoch": 0.7429050304636362, + "grad_norm": 0.772811770439148, + "learning_rate": 0.0001504923578807467, + "loss": 2.9441, + "step": 8200 + }, + { + "epoch": 0.742995628638083, + "grad_norm": 0.7588112354278564, + "learning_rate": 0.00015048631667975595, + "loss": 3.2547, + "step": 8201 + }, + { + "epoch": 0.7430862268125298, + "grad_norm": 0.7627606391906738, + "learning_rate": 0.0001504802754787652, + "loss": 2.8397, + "step": 8202 + }, + { + "epoch": 0.7431768249869765, + "grad_norm": 0.7432065606117249, + "learning_rate": 0.00015047423427777442, + "loss": 2.9105, + "step": 8203 + }, + { + "epoch": 0.7432674231614232, + "grad_norm": 0.7897669672966003, + "learning_rate": 0.00015046819307678368, + "loss": 2.1841, + "step": 8204 + }, + { + "epoch": 0.74335802133587, + "grad_norm": 0.7979099154472351, + "learning_rate": 0.00015046215187579292, + "loss": 2.896, + "step": 8205 + }, + { + "epoch": 0.7434486195103168, + "grad_norm": 0.8358690738677979, + "learning_rate": 0.00015045611067480215, + "loss": 2.7512, + "step": 8206 + }, + { + "epoch": 0.7435392176847636, + "grad_norm": 0.8084738850593567, + "learning_rate": 0.00015045006947381139, + "loss": 2.8556, + "step": 8207 + }, + { + "epoch": 0.7436298158592104, + "grad_norm": 0.7017821073532104, + "learning_rate": 0.00015044402827282065, + "loss": 2.2311, + "step": 8208 + }, + { + "epoch": 0.7437204140336572, + "grad_norm": 0.8124151229858398, + "learning_rate": 0.00015043798707182988, + "loss": 2.8775, + "step": 8209 + }, + { + "epoch": 0.743811012208104, + "grad_norm": 0.8006890416145325, + "learning_rate": 0.00015043194587083914, + "loss": 2.6, + "step": 8210 + }, + { + "epoch": 0.7439016103825508, + "grad_norm": 0.7920641899108887, + "learning_rate": 0.00015042590466984838, + "loss": 2.686, + "step": 8211 + }, + { + "epoch": 0.7439922085569975, + "grad_norm": 0.9438324570655823, + "learning_rate": 0.0001504198634688576, + "loss": 2.9863, + "step": 8212 + }, + { + "epoch": 0.7440828067314443, + "grad_norm": 0.7402288317680359, + "learning_rate": 0.00015041382226786687, + "loss": 2.6343, + "step": 8213 + }, + { + "epoch": 0.7441734049058911, + "grad_norm": 0.7830679416656494, + "learning_rate": 0.0001504077810668761, + "loss": 2.7214, + "step": 8214 + }, + { + "epoch": 0.7442640030803379, + "grad_norm": 0.7660263180732727, + "learning_rate": 0.00015040173986588534, + "loss": 2.7866, + "step": 8215 + }, + { + "epoch": 0.7443546012547847, + "grad_norm": 0.7674909830093384, + "learning_rate": 0.00015039569866489457, + "loss": 2.5399, + "step": 8216 + }, + { + "epoch": 0.7444451994292315, + "grad_norm": 0.7791915535926819, + "learning_rate": 0.00015038965746390383, + "loss": 3.2772, + "step": 8217 + }, + { + "epoch": 0.7445357976036783, + "grad_norm": 0.8044906854629517, + "learning_rate": 0.0001503836162629131, + "loss": 2.884, + "step": 8218 + }, + { + "epoch": 0.7446263957781251, + "grad_norm": 0.7998222708702087, + "learning_rate": 0.0001503775750619223, + "loss": 2.6619, + "step": 8219 + }, + { + "epoch": 0.7447169939525718, + "grad_norm": 0.6764879822731018, + "learning_rate": 0.00015037153386093156, + "loss": 2.1301, + "step": 8220 + }, + { + "epoch": 0.7448075921270186, + "grad_norm": 0.7871766686439514, + "learning_rate": 0.0001503654926599408, + "loss": 2.6742, + "step": 8221 + }, + { + "epoch": 0.7448981903014654, + "grad_norm": 0.778917670249939, + "learning_rate": 0.00015035945145895006, + "loss": 2.891, + "step": 8222 + }, + { + "epoch": 0.7449887884759122, + "grad_norm": 0.8379752039909363, + "learning_rate": 0.0001503534102579593, + "loss": 2.6329, + "step": 8223 + }, + { + "epoch": 0.745079386650359, + "grad_norm": 0.8450531959533691, + "learning_rate": 0.00015034736905696853, + "loss": 2.7681, + "step": 8224 + }, + { + "epoch": 0.7451699848248058, + "grad_norm": 0.7611551284790039, + "learning_rate": 0.00015034132785597776, + "loss": 2.0761, + "step": 8225 + }, + { + "epoch": 0.7452605829992526, + "grad_norm": 0.8397864103317261, + "learning_rate": 0.00015033528665498702, + "loss": 2.8977, + "step": 8226 + }, + { + "epoch": 0.7453511811736994, + "grad_norm": 0.8079404830932617, + "learning_rate": 0.00015032924545399628, + "loss": 2.4832, + "step": 8227 + }, + { + "epoch": 0.7454417793481461, + "grad_norm": 0.8396058082580566, + "learning_rate": 0.0001503232042530055, + "loss": 2.6464, + "step": 8228 + }, + { + "epoch": 0.7455323775225929, + "grad_norm": 0.7680096626281738, + "learning_rate": 0.00015031716305201475, + "loss": 2.9104, + "step": 8229 + }, + { + "epoch": 0.7456229756970397, + "grad_norm": 0.8095062971115112, + "learning_rate": 0.00015031112185102399, + "loss": 2.7553, + "step": 8230 + }, + { + "epoch": 0.7457135738714865, + "grad_norm": 0.8109043836593628, + "learning_rate": 0.00015030508065003325, + "loss": 2.8054, + "step": 8231 + }, + { + "epoch": 0.7458041720459333, + "grad_norm": 0.8043054342269897, + "learning_rate": 0.00015029903944904245, + "loss": 2.9865, + "step": 8232 + }, + { + "epoch": 0.7458947702203801, + "grad_norm": 0.73074871301651, + "learning_rate": 0.00015029299824805172, + "loss": 2.6414, + "step": 8233 + }, + { + "epoch": 0.7459853683948269, + "grad_norm": 0.7723628878593445, + "learning_rate": 0.00015028695704706098, + "loss": 2.8426, + "step": 8234 + }, + { + "epoch": 0.7460759665692737, + "grad_norm": 0.8794986605644226, + "learning_rate": 0.0001502809158460702, + "loss": 2.5801, + "step": 8235 + }, + { + "epoch": 0.7461665647437205, + "grad_norm": 0.8022456169128418, + "learning_rate": 0.00015027487464507944, + "loss": 2.8977, + "step": 8236 + }, + { + "epoch": 0.7462571629181672, + "grad_norm": 0.7935547232627869, + "learning_rate": 0.00015026883344408868, + "loss": 2.7419, + "step": 8237 + }, + { + "epoch": 0.746347761092614, + "grad_norm": 0.8142284750938416, + "learning_rate": 0.00015026279224309794, + "loss": 3.0614, + "step": 8238 + }, + { + "epoch": 0.7464383592670608, + "grad_norm": 0.775635302066803, + "learning_rate": 0.00015025675104210717, + "loss": 2.5886, + "step": 8239 + }, + { + "epoch": 0.7465289574415076, + "grad_norm": 0.7671568393707275, + "learning_rate": 0.00015025070984111643, + "loss": 2.821, + "step": 8240 + }, + { + "epoch": 0.7466195556159544, + "grad_norm": 0.725212574005127, + "learning_rate": 0.00015024466864012567, + "loss": 2.1748, + "step": 8241 + }, + { + "epoch": 0.7467101537904012, + "grad_norm": 0.8088005781173706, + "learning_rate": 0.0001502386274391349, + "loss": 2.6096, + "step": 8242 + }, + { + "epoch": 0.7468007519648479, + "grad_norm": 0.779007613658905, + "learning_rate": 0.00015023258623814416, + "loss": 2.7353, + "step": 8243 + }, + { + "epoch": 0.7468913501392946, + "grad_norm": 0.7493333220481873, + "learning_rate": 0.0001502265450371534, + "loss": 2.727, + "step": 8244 + }, + { + "epoch": 0.7469819483137414, + "grad_norm": 0.8522852063179016, + "learning_rate": 0.00015022050383616263, + "loss": 2.8447, + "step": 8245 + }, + { + "epoch": 0.7470725464881882, + "grad_norm": 0.798122227191925, + "learning_rate": 0.00015021446263517187, + "loss": 2.9435, + "step": 8246 + }, + { + "epoch": 0.747163144662635, + "grad_norm": 0.7769741415977478, + "learning_rate": 0.00015020842143418113, + "loss": 2.724, + "step": 8247 + }, + { + "epoch": 0.7472537428370818, + "grad_norm": 0.8341935276985168, + "learning_rate": 0.0001502023802331904, + "loss": 3.0821, + "step": 8248 + }, + { + "epoch": 0.7473443410115286, + "grad_norm": 0.7941400408744812, + "learning_rate": 0.0001501963390321996, + "loss": 2.8635, + "step": 8249 + }, + { + "epoch": 0.7474349391859754, + "grad_norm": 0.8066178560256958, + "learning_rate": 0.00015019029783120886, + "loss": 2.6871, + "step": 8250 + }, + { + "epoch": 0.7475255373604222, + "grad_norm": 0.6836779713630676, + "learning_rate": 0.0001501842566302181, + "loss": 2.1368, + "step": 8251 + }, + { + "epoch": 0.7476161355348689, + "grad_norm": 0.7973634600639343, + "learning_rate": 0.00015017821542922735, + "loss": 2.7472, + "step": 8252 + }, + { + "epoch": 0.7477067337093157, + "grad_norm": 0.7698671221733093, + "learning_rate": 0.00015017217422823659, + "loss": 2.9001, + "step": 8253 + }, + { + "epoch": 0.7477973318837625, + "grad_norm": 0.8162696957588196, + "learning_rate": 0.00015016613302724582, + "loss": 2.6562, + "step": 8254 + }, + { + "epoch": 0.7478879300582093, + "grad_norm": 0.7873493432998657, + "learning_rate": 0.00015016009182625505, + "loss": 2.4963, + "step": 8255 + }, + { + "epoch": 0.7479785282326561, + "grad_norm": 0.8551124334335327, + "learning_rate": 0.00015015405062526432, + "loss": 2.7589, + "step": 8256 + }, + { + "epoch": 0.7480691264071029, + "grad_norm": 0.7836431860923767, + "learning_rate": 0.00015014800942427355, + "loss": 2.8749, + "step": 8257 + }, + { + "epoch": 0.7481597245815497, + "grad_norm": 0.8066478371620178, + "learning_rate": 0.00015014196822328278, + "loss": 2.6498, + "step": 8258 + }, + { + "epoch": 0.7482503227559965, + "grad_norm": 0.8164636492729187, + "learning_rate": 0.00015013592702229204, + "loss": 2.7013, + "step": 8259 + }, + { + "epoch": 0.7483409209304432, + "grad_norm": 0.7929725050926208, + "learning_rate": 0.00015012988582130128, + "loss": 2.2663, + "step": 8260 + }, + { + "epoch": 0.74843151910489, + "grad_norm": 0.7561323046684265, + "learning_rate": 0.00015012384462031054, + "loss": 2.7109, + "step": 8261 + }, + { + "epoch": 0.7485221172793368, + "grad_norm": 0.7878855466842651, + "learning_rate": 0.00015011780341931975, + "loss": 2.7436, + "step": 8262 + }, + { + "epoch": 0.7486127154537836, + "grad_norm": 0.7447091937065125, + "learning_rate": 0.000150111762218329, + "loss": 2.7173, + "step": 8263 + }, + { + "epoch": 0.7487033136282304, + "grad_norm": 0.8557795286178589, + "learning_rate": 0.00015010572101733827, + "loss": 2.7654, + "step": 8264 + }, + { + "epoch": 0.7487939118026772, + "grad_norm": 0.7877557277679443, + "learning_rate": 0.0001500996798163475, + "loss": 2.8076, + "step": 8265 + }, + { + "epoch": 0.748884509977124, + "grad_norm": 0.8552584052085876, + "learning_rate": 0.00015009363861535674, + "loss": 2.6788, + "step": 8266 + }, + { + "epoch": 0.7489751081515708, + "grad_norm": 0.7800004482269287, + "learning_rate": 0.00015008759741436597, + "loss": 2.9151, + "step": 8267 + }, + { + "epoch": 0.7490657063260175, + "grad_norm": 0.7502309679985046, + "learning_rate": 0.00015008155621337523, + "loss": 3.0726, + "step": 8268 + }, + { + "epoch": 0.7491563045004643, + "grad_norm": 0.7567452192306519, + "learning_rate": 0.00015007551501238447, + "loss": 2.5236, + "step": 8269 + }, + { + "epoch": 0.7492469026749111, + "grad_norm": 0.8662324547767639, + "learning_rate": 0.0001500694738113937, + "loss": 2.999, + "step": 8270 + }, + { + "epoch": 0.7493375008493579, + "grad_norm": 0.6663268208503723, + "learning_rate": 0.00015006343261040296, + "loss": 1.9932, + "step": 8271 + }, + { + "epoch": 0.7494280990238047, + "grad_norm": 0.8281658291816711, + "learning_rate": 0.0001500573914094122, + "loss": 2.7968, + "step": 8272 + }, + { + "epoch": 0.7495186971982515, + "grad_norm": 0.7794591784477234, + "learning_rate": 0.00015005135020842146, + "loss": 2.7411, + "step": 8273 + }, + { + "epoch": 0.7496092953726983, + "grad_norm": 0.9024602174758911, + "learning_rate": 0.0001500453090074307, + "loss": 2.7459, + "step": 8274 + }, + { + "epoch": 0.7496998935471451, + "grad_norm": 0.8341313600540161, + "learning_rate": 0.00015003926780643992, + "loss": 2.9549, + "step": 8275 + }, + { + "epoch": 0.7497904917215918, + "grad_norm": 0.8206439018249512, + "learning_rate": 0.00015003322660544916, + "loss": 2.7677, + "step": 8276 + }, + { + "epoch": 0.7498810898960386, + "grad_norm": 0.78900146484375, + "learning_rate": 0.00015002718540445842, + "loss": 2.6431, + "step": 8277 + }, + { + "epoch": 0.7499716880704854, + "grad_norm": 0.7555161118507385, + "learning_rate": 0.00015002114420346765, + "loss": 2.8632, + "step": 8278 + }, + { + "epoch": 0.7500622862449322, + "grad_norm": 0.8360323309898376, + "learning_rate": 0.0001500151030024769, + "loss": 3.0082, + "step": 8279 + }, + { + "epoch": 0.750152884419379, + "grad_norm": 0.8048120141029358, + "learning_rate": 0.00015000906180148615, + "loss": 2.7922, + "step": 8280 + }, + { + "epoch": 0.7502434825938258, + "grad_norm": 0.8168743252754211, + "learning_rate": 0.00015000302060049538, + "loss": 2.9, + "step": 8281 + }, + { + "epoch": 0.7503340807682726, + "grad_norm": 0.8216965198516846, + "learning_rate": 0.00014999697939950464, + "loss": 2.7857, + "step": 8282 + }, + { + "epoch": 0.7504246789427194, + "grad_norm": 0.7562044262886047, + "learning_rate": 0.00014999093819851385, + "loss": 2.7122, + "step": 8283 + }, + { + "epoch": 0.750515277117166, + "grad_norm": 0.7384498715400696, + "learning_rate": 0.0001499848969975231, + "loss": 2.0504, + "step": 8284 + }, + { + "epoch": 0.7506058752916128, + "grad_norm": 0.8429835438728333, + "learning_rate": 0.00014997885579653235, + "loss": 2.7519, + "step": 8285 + }, + { + "epoch": 0.7506964734660596, + "grad_norm": 0.7953433394432068, + "learning_rate": 0.0001499728145955416, + "loss": 2.859, + "step": 8286 + }, + { + "epoch": 0.7507870716405064, + "grad_norm": 0.7983808517456055, + "learning_rate": 0.00014996677339455084, + "loss": 2.8094, + "step": 8287 + }, + { + "epoch": 0.7508776698149532, + "grad_norm": 0.7625759840011597, + "learning_rate": 0.00014996073219356008, + "loss": 2.863, + "step": 8288 + }, + { + "epoch": 0.7509682679894, + "grad_norm": 0.8102598786354065, + "learning_rate": 0.00014995469099256934, + "loss": 2.7152, + "step": 8289 + }, + { + "epoch": 0.7510588661638468, + "grad_norm": 0.8246459364891052, + "learning_rate": 0.00014994864979157857, + "loss": 2.7491, + "step": 8290 + }, + { + "epoch": 0.7511494643382935, + "grad_norm": 0.8327946066856384, + "learning_rate": 0.0001499426085905878, + "loss": 2.0295, + "step": 8291 + }, + { + "epoch": 0.7512400625127403, + "grad_norm": 0.7482458353042603, + "learning_rate": 0.00014993656738959704, + "loss": 2.6497, + "step": 8292 + }, + { + "epoch": 0.7513306606871871, + "grad_norm": 0.8533065319061279, + "learning_rate": 0.0001499305261886063, + "loss": 3.2021, + "step": 8293 + }, + { + "epoch": 0.7514212588616339, + "grad_norm": 0.7593948245048523, + "learning_rate": 0.00014992448498761556, + "loss": 2.7001, + "step": 8294 + }, + { + "epoch": 0.7515118570360807, + "grad_norm": 0.9264819622039795, + "learning_rate": 0.0001499184437866248, + "loss": 2.7531, + "step": 8295 + }, + { + "epoch": 0.7516024552105275, + "grad_norm": 0.7691560983657837, + "learning_rate": 0.00014991240258563403, + "loss": 2.7054, + "step": 8296 + }, + { + "epoch": 0.7516930533849743, + "grad_norm": 0.6677315831184387, + "learning_rate": 0.00014990636138464326, + "loss": 2.028, + "step": 8297 + }, + { + "epoch": 0.7517836515594211, + "grad_norm": 0.7959808111190796, + "learning_rate": 0.00014990032018365252, + "loss": 2.7013, + "step": 8298 + }, + { + "epoch": 0.7518742497338678, + "grad_norm": 0.7843925356864929, + "learning_rate": 0.00014989427898266176, + "loss": 2.8213, + "step": 8299 + }, + { + "epoch": 0.7519648479083146, + "grad_norm": 0.8113759160041809, + "learning_rate": 0.000149888237781671, + "loss": 2.8956, + "step": 8300 + }, + { + "epoch": 0.7520554460827614, + "grad_norm": 0.8407657742500305, + "learning_rate": 0.00014988219658068025, + "loss": 2.5569, + "step": 8301 + }, + { + "epoch": 0.7521460442572082, + "grad_norm": 0.8044555187225342, + "learning_rate": 0.0001498761553796895, + "loss": 2.7135, + "step": 8302 + }, + { + "epoch": 0.752236642431655, + "grad_norm": 0.7863640189170837, + "learning_rate": 0.00014987011417869875, + "loss": 2.9331, + "step": 8303 + }, + { + "epoch": 0.7523272406061018, + "grad_norm": 0.7721131443977356, + "learning_rate": 0.00014986407297770798, + "loss": 2.8202, + "step": 8304 + }, + { + "epoch": 0.7524178387805486, + "grad_norm": 0.8638178110122681, + "learning_rate": 0.00014985803177671722, + "loss": 2.4716, + "step": 8305 + }, + { + "epoch": 0.7525084369549954, + "grad_norm": 0.7663964033126831, + "learning_rate": 0.00014985199057572645, + "loss": 3.0186, + "step": 8306 + }, + { + "epoch": 0.7525990351294422, + "grad_norm": 0.799282431602478, + "learning_rate": 0.0001498459493747357, + "loss": 2.7701, + "step": 8307 + }, + { + "epoch": 0.7526896333038889, + "grad_norm": 0.6415296196937561, + "learning_rate": 0.00014983990817374495, + "loss": 2.0469, + "step": 8308 + }, + { + "epoch": 0.7527802314783357, + "grad_norm": 0.8005272150039673, + "learning_rate": 0.00014983386697275418, + "loss": 2.6667, + "step": 8309 + }, + { + "epoch": 0.7528708296527825, + "grad_norm": 0.7407487034797668, + "learning_rate": 0.00014982782577176344, + "loss": 2.6029, + "step": 8310 + }, + { + "epoch": 0.7529614278272293, + "grad_norm": 0.7872732877731323, + "learning_rate": 0.00014982178457077268, + "loss": 2.8542, + "step": 8311 + }, + { + "epoch": 0.7530520260016761, + "grad_norm": 0.8058658838272095, + "learning_rate": 0.00014981574336978194, + "loss": 2.9958, + "step": 8312 + }, + { + "epoch": 0.7531426241761229, + "grad_norm": 0.7563486695289612, + "learning_rate": 0.00014980970216879114, + "loss": 2.6845, + "step": 8313 + }, + { + "epoch": 0.7532332223505697, + "grad_norm": 0.7493889331817627, + "learning_rate": 0.0001498036609678004, + "loss": 2.8973, + "step": 8314 + }, + { + "epoch": 0.7533238205250165, + "grad_norm": 0.7527517676353455, + "learning_rate": 0.00014979761976680964, + "loss": 2.9907, + "step": 8315 + }, + { + "epoch": 0.7534144186994632, + "grad_norm": 0.8219995498657227, + "learning_rate": 0.0001497915785658189, + "loss": 2.7072, + "step": 8316 + }, + { + "epoch": 0.75350501687391, + "grad_norm": 0.8575217127799988, + "learning_rate": 0.00014978553736482813, + "loss": 2.6453, + "step": 8317 + }, + { + "epoch": 0.7535956150483568, + "grad_norm": 0.6698116660118103, + "learning_rate": 0.00014977949616383737, + "loss": 2.0889, + "step": 8318 + }, + { + "epoch": 0.7536862132228036, + "grad_norm": 0.7524647116661072, + "learning_rate": 0.00014977345496284663, + "loss": 2.7306, + "step": 8319 + }, + { + "epoch": 0.7537768113972504, + "grad_norm": 0.7952456474304199, + "learning_rate": 0.00014976741376185586, + "loss": 2.8639, + "step": 8320 + }, + { + "epoch": 0.7538674095716972, + "grad_norm": 0.773780882358551, + "learning_rate": 0.0001497613725608651, + "loss": 2.6592, + "step": 8321 + }, + { + "epoch": 0.753958007746144, + "grad_norm": 0.6917534470558167, + "learning_rate": 0.00014975533135987433, + "loss": 2.0735, + "step": 8322 + }, + { + "epoch": 0.7540486059205908, + "grad_norm": 0.7678301930427551, + "learning_rate": 0.0001497492901588836, + "loss": 2.7474, + "step": 8323 + }, + { + "epoch": 0.7541392040950374, + "grad_norm": 0.6626254320144653, + "learning_rate": 0.00014974324895789285, + "loss": 2.1811, + "step": 8324 + }, + { + "epoch": 0.7542298022694842, + "grad_norm": 0.6966309547424316, + "learning_rate": 0.0001497372077569021, + "loss": 2.0214, + "step": 8325 + }, + { + "epoch": 0.754320400443931, + "grad_norm": 0.7988982796669006, + "learning_rate": 0.00014973116655591132, + "loss": 2.8172, + "step": 8326 + }, + { + "epoch": 0.7544109986183778, + "grad_norm": 0.7301539778709412, + "learning_rate": 0.00014972512535492056, + "loss": 2.4882, + "step": 8327 + }, + { + "epoch": 0.7545015967928246, + "grad_norm": 0.8253836631774902, + "learning_rate": 0.00014971908415392982, + "loss": 3.1276, + "step": 8328 + }, + { + "epoch": 0.7545921949672714, + "grad_norm": 0.7751532793045044, + "learning_rate": 0.00014971304295293905, + "loss": 2.9629, + "step": 8329 + }, + { + "epoch": 0.7546827931417182, + "grad_norm": 0.783893883228302, + "learning_rate": 0.00014970700175194829, + "loss": 3.0092, + "step": 8330 + }, + { + "epoch": 0.7547733913161649, + "grad_norm": 0.7952336072921753, + "learning_rate": 0.00014970096055095755, + "loss": 2.7341, + "step": 8331 + }, + { + "epoch": 0.7548639894906117, + "grad_norm": 0.852476179599762, + "learning_rate": 0.00014969491934996678, + "loss": 3.44, + "step": 8332 + }, + { + "epoch": 0.7549545876650585, + "grad_norm": 0.7930893301963806, + "learning_rate": 0.00014968887814897604, + "loss": 2.7611, + "step": 8333 + }, + { + "epoch": 0.7550451858395053, + "grad_norm": 0.8080412745475769, + "learning_rate": 0.00014968283694798525, + "loss": 2.79, + "step": 8334 + }, + { + "epoch": 0.7551357840139521, + "grad_norm": 0.7474736571311951, + "learning_rate": 0.0001496767957469945, + "loss": 2.9547, + "step": 8335 + }, + { + "epoch": 0.7552263821883989, + "grad_norm": 0.8062903881072998, + "learning_rate": 0.00014967075454600374, + "loss": 2.8882, + "step": 8336 + }, + { + "epoch": 0.7553169803628457, + "grad_norm": 0.7617515921592712, + "learning_rate": 0.000149664713345013, + "loss": 2.6093, + "step": 8337 + }, + { + "epoch": 0.7554075785372925, + "grad_norm": 0.7664965391159058, + "learning_rate": 0.00014965867214402224, + "loss": 2.7308, + "step": 8338 + }, + { + "epoch": 0.7554981767117392, + "grad_norm": 0.7932139039039612, + "learning_rate": 0.00014965263094303147, + "loss": 2.8509, + "step": 8339 + }, + { + "epoch": 0.755588774886186, + "grad_norm": 0.7435409426689148, + "learning_rate": 0.00014964658974204073, + "loss": 2.7907, + "step": 8340 + }, + { + "epoch": 0.7556793730606328, + "grad_norm": 0.7500289082527161, + "learning_rate": 0.00014964054854104997, + "loss": 2.8299, + "step": 8341 + }, + { + "epoch": 0.7557699712350796, + "grad_norm": 0.7996256351470947, + "learning_rate": 0.0001496345073400592, + "loss": 2.7502, + "step": 8342 + }, + { + "epoch": 0.7558605694095264, + "grad_norm": 0.8364930748939514, + "learning_rate": 0.00014962846613906844, + "loss": 2.6903, + "step": 8343 + }, + { + "epoch": 0.7559511675839732, + "grad_norm": 0.7695034146308899, + "learning_rate": 0.0001496224249380777, + "loss": 2.7803, + "step": 8344 + }, + { + "epoch": 0.75604176575842, + "grad_norm": 0.7612802386283875, + "learning_rate": 0.00014961638373708693, + "loss": 2.8002, + "step": 8345 + }, + { + "epoch": 0.7561323639328668, + "grad_norm": 0.8547050356864929, + "learning_rate": 0.0001496103425360962, + "loss": 2.9713, + "step": 8346 + }, + { + "epoch": 0.7562229621073135, + "grad_norm": 0.8190881013870239, + "learning_rate": 0.00014960430133510543, + "loss": 2.7646, + "step": 8347 + }, + { + "epoch": 0.7563135602817603, + "grad_norm": 0.7783765196800232, + "learning_rate": 0.00014959826013411466, + "loss": 2.9067, + "step": 8348 + }, + { + "epoch": 0.7564041584562071, + "grad_norm": 0.7582978010177612, + "learning_rate": 0.00014959221893312392, + "loss": 2.7761, + "step": 8349 + }, + { + "epoch": 0.7564947566306539, + "grad_norm": 0.7521493434906006, + "learning_rate": 0.00014958617773213316, + "loss": 2.999, + "step": 8350 + }, + { + "epoch": 0.7565853548051007, + "grad_norm": 0.7696526646614075, + "learning_rate": 0.0001495801365311424, + "loss": 2.9199, + "step": 8351 + }, + { + "epoch": 0.7566759529795475, + "grad_norm": 0.8151476979255676, + "learning_rate": 0.00014957409533015162, + "loss": 2.8538, + "step": 8352 + }, + { + "epoch": 0.7567665511539943, + "grad_norm": 0.8220531940460205, + "learning_rate": 0.00014956805412916089, + "loss": 2.7536, + "step": 8353 + }, + { + "epoch": 0.7568571493284411, + "grad_norm": 0.8062984347343445, + "learning_rate": 0.00014956201292817015, + "loss": 2.8585, + "step": 8354 + }, + { + "epoch": 0.7569477475028878, + "grad_norm": 0.792967677116394, + "learning_rate": 0.00014955597172717935, + "loss": 2.8083, + "step": 8355 + }, + { + "epoch": 0.7570383456773346, + "grad_norm": 0.7918829917907715, + "learning_rate": 0.00014954993052618862, + "loss": 3.0737, + "step": 8356 + }, + { + "epoch": 0.7571289438517814, + "grad_norm": 0.8327732682228088, + "learning_rate": 0.00014954388932519785, + "loss": 2.9036, + "step": 8357 + }, + { + "epoch": 0.7572195420262282, + "grad_norm": 0.793705940246582, + "learning_rate": 0.0001495378481242071, + "loss": 2.8475, + "step": 8358 + }, + { + "epoch": 0.757310140200675, + "grad_norm": 0.8155816197395325, + "learning_rate": 0.00014953180692321634, + "loss": 2.8206, + "step": 8359 + }, + { + "epoch": 0.7574007383751218, + "grad_norm": 0.808901309967041, + "learning_rate": 0.00014952576572222558, + "loss": 2.7496, + "step": 8360 + }, + { + "epoch": 0.7574913365495686, + "grad_norm": 0.746282160282135, + "learning_rate": 0.00014951972452123484, + "loss": 2.6233, + "step": 8361 + }, + { + "epoch": 0.7575819347240154, + "grad_norm": 0.8138017058372498, + "learning_rate": 0.00014951368332024407, + "loss": 2.7096, + "step": 8362 + }, + { + "epoch": 0.7576725328984621, + "grad_norm": 0.7885015606880188, + "learning_rate": 0.00014950764211925333, + "loss": 2.8895, + "step": 8363 + }, + { + "epoch": 0.7577631310729089, + "grad_norm": 0.83549964427948, + "learning_rate": 0.00014950160091826254, + "loss": 2.9169, + "step": 8364 + }, + { + "epoch": 0.7578537292473556, + "grad_norm": 0.8269827365875244, + "learning_rate": 0.0001494955597172718, + "loss": 3.0231, + "step": 8365 + }, + { + "epoch": 0.7579443274218024, + "grad_norm": 0.82182377576828, + "learning_rate": 0.00014948951851628104, + "loss": 2.8958, + "step": 8366 + }, + { + "epoch": 0.7580349255962492, + "grad_norm": 0.815694272518158, + "learning_rate": 0.0001494834773152903, + "loss": 2.9054, + "step": 8367 + }, + { + "epoch": 0.758125523770696, + "grad_norm": 0.8238771557807922, + "learning_rate": 0.00014947743611429953, + "loss": 2.7997, + "step": 8368 + }, + { + "epoch": 0.7582161219451428, + "grad_norm": 0.6622836589813232, + "learning_rate": 0.00014947139491330877, + "loss": 2.0964, + "step": 8369 + }, + { + "epoch": 0.7583067201195896, + "grad_norm": 0.7324483394622803, + "learning_rate": 0.00014946535371231803, + "loss": 2.1188, + "step": 8370 + }, + { + "epoch": 0.7583973182940363, + "grad_norm": 0.8210554718971252, + "learning_rate": 0.00014945931251132726, + "loss": 3.0003, + "step": 8371 + }, + { + "epoch": 0.7584879164684831, + "grad_norm": 0.8046831488609314, + "learning_rate": 0.0001494532713103365, + "loss": 2.8238, + "step": 8372 + }, + { + "epoch": 0.7585785146429299, + "grad_norm": 0.8538684844970703, + "learning_rate": 0.00014944723010934573, + "loss": 2.9115, + "step": 8373 + }, + { + "epoch": 0.7586691128173767, + "grad_norm": 0.7830027341842651, + "learning_rate": 0.000149441188908355, + "loss": 2.97, + "step": 8374 + }, + { + "epoch": 0.7587597109918235, + "grad_norm": 0.7691891193389893, + "learning_rate": 0.00014943514770736422, + "loss": 1.9854, + "step": 8375 + }, + { + "epoch": 0.7588503091662703, + "grad_norm": 0.7762609124183655, + "learning_rate": 0.00014942910650637349, + "loss": 2.3856, + "step": 8376 + }, + { + "epoch": 0.7589409073407171, + "grad_norm": 0.8697507381439209, + "learning_rate": 0.00014942306530538272, + "loss": 2.7067, + "step": 8377 + }, + { + "epoch": 0.7590315055151639, + "grad_norm": 0.873667299747467, + "learning_rate": 0.00014941702410439195, + "loss": 2.9436, + "step": 8378 + }, + { + "epoch": 0.7591221036896106, + "grad_norm": 0.844735860824585, + "learning_rate": 0.00014941098290340122, + "loss": 2.8218, + "step": 8379 + }, + { + "epoch": 0.7592127018640574, + "grad_norm": 0.8299157619476318, + "learning_rate": 0.00014940494170241045, + "loss": 2.8448, + "step": 8380 + }, + { + "epoch": 0.7593033000385042, + "grad_norm": 0.7666091322898865, + "learning_rate": 0.00014939890050141968, + "loss": 2.8939, + "step": 8381 + }, + { + "epoch": 0.759393898212951, + "grad_norm": 0.8082230091094971, + "learning_rate": 0.00014939285930042892, + "loss": 2.7569, + "step": 8382 + }, + { + "epoch": 0.7594844963873978, + "grad_norm": 0.6372170448303223, + "learning_rate": 0.00014938681809943818, + "loss": 1.9893, + "step": 8383 + }, + { + "epoch": 0.7595750945618446, + "grad_norm": 0.7857462763786316, + "learning_rate": 0.00014938077689844744, + "loss": 2.6956, + "step": 8384 + }, + { + "epoch": 0.7596656927362914, + "grad_norm": 0.8434740900993347, + "learning_rate": 0.00014937473569745665, + "loss": 3.0264, + "step": 8385 + }, + { + "epoch": 0.7597562909107382, + "grad_norm": 0.8125308752059937, + "learning_rate": 0.0001493686944964659, + "loss": 2.8024, + "step": 8386 + }, + { + "epoch": 0.7598468890851849, + "grad_norm": 0.8713995814323425, + "learning_rate": 0.00014936265329547514, + "loss": 2.9801, + "step": 8387 + }, + { + "epoch": 0.7599374872596317, + "grad_norm": 0.7323066592216492, + "learning_rate": 0.0001493566120944844, + "loss": 2.7552, + "step": 8388 + }, + { + "epoch": 0.7600280854340785, + "grad_norm": 0.8107891082763672, + "learning_rate": 0.00014935057089349364, + "loss": 2.7878, + "step": 8389 + }, + { + "epoch": 0.7601186836085253, + "grad_norm": 0.8444768786430359, + "learning_rate": 0.00014934452969250287, + "loss": 2.8199, + "step": 8390 + }, + { + "epoch": 0.7602092817829721, + "grad_norm": 0.8022921085357666, + "learning_rate": 0.00014933848849151213, + "loss": 2.9437, + "step": 8391 + }, + { + "epoch": 0.7602998799574189, + "grad_norm": 0.8644261956214905, + "learning_rate": 0.00014933244729052137, + "loss": 2.8566, + "step": 8392 + }, + { + "epoch": 0.7603904781318657, + "grad_norm": 0.6910907030105591, + "learning_rate": 0.0001493264060895306, + "loss": 2.0367, + "step": 8393 + }, + { + "epoch": 0.7604810763063125, + "grad_norm": 0.7490397691726685, + "learning_rate": 0.00014932036488853983, + "loss": 2.5837, + "step": 8394 + }, + { + "epoch": 0.7605716744807592, + "grad_norm": 0.7520521283149719, + "learning_rate": 0.0001493143236875491, + "loss": 2.702, + "step": 8395 + }, + { + "epoch": 0.760662272655206, + "grad_norm": 0.7878017425537109, + "learning_rate": 0.00014930828248655833, + "loss": 2.6578, + "step": 8396 + }, + { + "epoch": 0.7607528708296528, + "grad_norm": 0.7725675106048584, + "learning_rate": 0.0001493022412855676, + "loss": 2.8015, + "step": 8397 + }, + { + "epoch": 0.7608434690040996, + "grad_norm": 0.8518844246864319, + "learning_rate": 0.00014929620008457682, + "loss": 2.6982, + "step": 8398 + }, + { + "epoch": 0.7609340671785464, + "grad_norm": 0.8196699023246765, + "learning_rate": 0.00014929015888358606, + "loss": 2.7318, + "step": 8399 + }, + { + "epoch": 0.7610246653529932, + "grad_norm": 0.7890444993972778, + "learning_rate": 0.00014928411768259532, + "loss": 2.7442, + "step": 8400 + }, + { + "epoch": 0.76111526352744, + "grad_norm": 0.8199514150619507, + "learning_rate": 0.00014927807648160455, + "loss": 2.6439, + "step": 8401 + }, + { + "epoch": 0.7612058617018868, + "grad_norm": 0.6802409291267395, + "learning_rate": 0.0001492720352806138, + "loss": 2.2241, + "step": 8402 + }, + { + "epoch": 0.7612964598763335, + "grad_norm": 0.75575190782547, + "learning_rate": 0.00014926599407962302, + "loss": 2.7356, + "step": 8403 + }, + { + "epoch": 0.7613870580507803, + "grad_norm": 0.7978293895721436, + "learning_rate": 0.00014925995287863228, + "loss": 2.8187, + "step": 8404 + }, + { + "epoch": 0.761477656225227, + "grad_norm": 0.7883256077766418, + "learning_rate": 0.00014925391167764152, + "loss": 2.926, + "step": 8405 + }, + { + "epoch": 0.7615682543996738, + "grad_norm": 0.8404895067214966, + "learning_rate": 0.00014924787047665075, + "loss": 2.7372, + "step": 8406 + }, + { + "epoch": 0.7616588525741206, + "grad_norm": 0.8004569411277771, + "learning_rate": 0.00014924182927566, + "loss": 2.7396, + "step": 8407 + }, + { + "epoch": 0.7617494507485674, + "grad_norm": 0.7621678709983826, + "learning_rate": 0.00014923578807466925, + "loss": 2.8835, + "step": 8408 + }, + { + "epoch": 0.7618400489230142, + "grad_norm": 0.7763963341712952, + "learning_rate": 0.0001492297468736785, + "loss": 2.6082, + "step": 8409 + }, + { + "epoch": 0.761930647097461, + "grad_norm": 0.8524616360664368, + "learning_rate": 0.00014922370567268774, + "loss": 2.5264, + "step": 8410 + }, + { + "epoch": 0.7620212452719077, + "grad_norm": 0.8105716705322266, + "learning_rate": 0.00014921766447169698, + "loss": 2.7365, + "step": 8411 + }, + { + "epoch": 0.7621118434463545, + "grad_norm": 0.7843795418739319, + "learning_rate": 0.0001492116232707062, + "loss": 2.7672, + "step": 8412 + }, + { + "epoch": 0.7622024416208013, + "grad_norm": 0.7465866208076477, + "learning_rate": 0.00014920558206971547, + "loss": 2.7348, + "step": 8413 + }, + { + "epoch": 0.7622930397952481, + "grad_norm": 0.8267806768417358, + "learning_rate": 0.00014919954086872473, + "loss": 2.7517, + "step": 8414 + }, + { + "epoch": 0.7623836379696949, + "grad_norm": 0.8707207441329956, + "learning_rate": 0.00014919349966773394, + "loss": 2.6662, + "step": 8415 + }, + { + "epoch": 0.7624742361441417, + "grad_norm": 0.8335063457489014, + "learning_rate": 0.0001491874584667432, + "loss": 2.8243, + "step": 8416 + }, + { + "epoch": 0.7625648343185885, + "grad_norm": 0.794158399105072, + "learning_rate": 0.00014918141726575243, + "loss": 2.7472, + "step": 8417 + }, + { + "epoch": 0.7626554324930352, + "grad_norm": 0.819793164730072, + "learning_rate": 0.0001491753760647617, + "loss": 3.1242, + "step": 8418 + }, + { + "epoch": 0.762746030667482, + "grad_norm": 0.7984497547149658, + "learning_rate": 0.0001491693348637709, + "loss": 2.8464, + "step": 8419 + }, + { + "epoch": 0.7628366288419288, + "grad_norm": 0.8091386556625366, + "learning_rate": 0.00014916329366278016, + "loss": 2.7462, + "step": 8420 + }, + { + "epoch": 0.7629272270163756, + "grad_norm": 0.8956581354141235, + "learning_rate": 0.00014915725246178942, + "loss": 2.6758, + "step": 8421 + }, + { + "epoch": 0.7630178251908224, + "grad_norm": 0.7461825609207153, + "learning_rate": 0.00014915121126079866, + "loss": 2.7446, + "step": 8422 + }, + { + "epoch": 0.7631084233652692, + "grad_norm": 0.8480824828147888, + "learning_rate": 0.0001491451700598079, + "loss": 2.9701, + "step": 8423 + }, + { + "epoch": 0.763199021539716, + "grad_norm": 0.85847008228302, + "learning_rate": 0.00014913912885881713, + "loss": 2.7441, + "step": 8424 + }, + { + "epoch": 0.7632896197141628, + "grad_norm": 0.8435136079788208, + "learning_rate": 0.0001491330876578264, + "loss": 2.6459, + "step": 8425 + }, + { + "epoch": 0.7633802178886095, + "grad_norm": 0.8229952454566956, + "learning_rate": 0.00014912704645683562, + "loss": 2.9452, + "step": 8426 + }, + { + "epoch": 0.7634708160630563, + "grad_norm": 0.7464684844017029, + "learning_rate": 0.00014912100525584488, + "loss": 2.845, + "step": 8427 + }, + { + "epoch": 0.7635614142375031, + "grad_norm": 0.7838325500488281, + "learning_rate": 0.00014911496405485412, + "loss": 2.7888, + "step": 8428 + }, + { + "epoch": 0.7636520124119499, + "grad_norm": 0.7578606009483337, + "learning_rate": 0.00014910892285386335, + "loss": 2.8952, + "step": 8429 + }, + { + "epoch": 0.7637426105863967, + "grad_norm": 0.9590862989425659, + "learning_rate": 0.0001491028816528726, + "loss": 1.4574, + "step": 8430 + }, + { + "epoch": 0.7638332087608435, + "grad_norm": 0.7978525161743164, + "learning_rate": 0.00014909684045188185, + "loss": 3.0059, + "step": 8431 + }, + { + "epoch": 0.7639238069352903, + "grad_norm": 0.7686001062393188, + "learning_rate": 0.00014909079925089108, + "loss": 2.8554, + "step": 8432 + }, + { + "epoch": 0.7640144051097371, + "grad_norm": 0.7008143663406372, + "learning_rate": 0.00014908475804990031, + "loss": 1.9637, + "step": 8433 + }, + { + "epoch": 0.7641050032841838, + "grad_norm": 0.7942928671836853, + "learning_rate": 0.00014907871684890958, + "loss": 2.9373, + "step": 8434 + }, + { + "epoch": 0.7641956014586306, + "grad_norm": 0.8974775671958923, + "learning_rate": 0.0001490726756479188, + "loss": 2.7351, + "step": 8435 + }, + { + "epoch": 0.7642861996330774, + "grad_norm": 0.9465859532356262, + "learning_rate": 0.00014906663444692804, + "loss": 2.7921, + "step": 8436 + }, + { + "epoch": 0.7643767978075242, + "grad_norm": 0.8293324708938599, + "learning_rate": 0.0001490605932459373, + "loss": 2.6028, + "step": 8437 + }, + { + "epoch": 0.764467395981971, + "grad_norm": 0.7097412943840027, + "learning_rate": 0.00014905455204494654, + "loss": 2.1952, + "step": 8438 + }, + { + "epoch": 0.7645579941564178, + "grad_norm": 0.7964911460876465, + "learning_rate": 0.0001490485108439558, + "loss": 2.6555, + "step": 8439 + }, + { + "epoch": 0.7646485923308646, + "grad_norm": 0.6985941529273987, + "learning_rate": 0.00014904246964296503, + "loss": 1.9165, + "step": 8440 + }, + { + "epoch": 0.7647391905053114, + "grad_norm": 0.772733211517334, + "learning_rate": 0.00014903642844197427, + "loss": 2.6507, + "step": 8441 + }, + { + "epoch": 0.7648297886797581, + "grad_norm": 0.8489066362380981, + "learning_rate": 0.0001490303872409835, + "loss": 2.7422, + "step": 8442 + }, + { + "epoch": 0.7649203868542049, + "grad_norm": 0.7533143162727356, + "learning_rate": 0.00014902434603999276, + "loss": 2.797, + "step": 8443 + }, + { + "epoch": 0.7650109850286517, + "grad_norm": 0.7489643096923828, + "learning_rate": 0.000149018304839002, + "loss": 2.7921, + "step": 8444 + }, + { + "epoch": 0.7651015832030985, + "grad_norm": 0.8098774552345276, + "learning_rate": 0.00014901226363801123, + "loss": 2.7855, + "step": 8445 + }, + { + "epoch": 0.7651921813775452, + "grad_norm": 0.8623696565628052, + "learning_rate": 0.0001490062224370205, + "loss": 2.7723, + "step": 8446 + }, + { + "epoch": 0.765282779551992, + "grad_norm": 0.8105174899101257, + "learning_rate": 0.00014900018123602973, + "loss": 2.9716, + "step": 8447 + }, + { + "epoch": 0.7653733777264388, + "grad_norm": 0.7752507328987122, + "learning_rate": 0.000148994140035039, + "loss": 2.4434, + "step": 8448 + }, + { + "epoch": 0.7654639759008856, + "grad_norm": 0.8360505104064941, + "learning_rate": 0.0001489880988340482, + "loss": 2.6513, + "step": 8449 + }, + { + "epoch": 0.7655545740753323, + "grad_norm": 0.828926146030426, + "learning_rate": 0.00014898205763305746, + "loss": 2.6239, + "step": 8450 + }, + { + "epoch": 0.7656451722497791, + "grad_norm": 0.8478286266326904, + "learning_rate": 0.00014897601643206672, + "loss": 2.875, + "step": 8451 + }, + { + "epoch": 0.7657357704242259, + "grad_norm": 0.8011831045150757, + "learning_rate": 0.00014896997523107595, + "loss": 2.7114, + "step": 8452 + }, + { + "epoch": 0.7658263685986727, + "grad_norm": 0.6649369597434998, + "learning_rate": 0.00014896393403008519, + "loss": 1.9039, + "step": 8453 + }, + { + "epoch": 0.7659169667731195, + "grad_norm": 0.8399912714958191, + "learning_rate": 0.00014895789282909442, + "loss": 2.5734, + "step": 8454 + }, + { + "epoch": 0.7660075649475663, + "grad_norm": 0.7827432751655579, + "learning_rate": 0.00014895185162810368, + "loss": 2.859, + "step": 8455 + }, + { + "epoch": 0.7660981631220131, + "grad_norm": 0.7519256472587585, + "learning_rate": 0.00014894581042711291, + "loss": 2.671, + "step": 8456 + }, + { + "epoch": 0.7661887612964599, + "grad_norm": 0.793969452381134, + "learning_rate": 0.00014893976922612215, + "loss": 2.7288, + "step": 8457 + }, + { + "epoch": 0.7662793594709066, + "grad_norm": 0.7583494186401367, + "learning_rate": 0.0001489337280251314, + "loss": 2.6731, + "step": 8458 + }, + { + "epoch": 0.7663699576453534, + "grad_norm": 0.79815274477005, + "learning_rate": 0.00014892768682414064, + "loss": 2.8282, + "step": 8459 + }, + { + "epoch": 0.7664605558198002, + "grad_norm": 0.8001736998558044, + "learning_rate": 0.0001489216456231499, + "loss": 2.8548, + "step": 8460 + }, + { + "epoch": 0.766551153994247, + "grad_norm": 0.8000234365463257, + "learning_rate": 0.00014891560442215914, + "loss": 2.639, + "step": 8461 + }, + { + "epoch": 0.7666417521686938, + "grad_norm": 0.83097243309021, + "learning_rate": 0.00014890956322116837, + "loss": 3.0203, + "step": 8462 + }, + { + "epoch": 0.7667323503431406, + "grad_norm": 0.8749410510063171, + "learning_rate": 0.0001489035220201776, + "loss": 2.5215, + "step": 8463 + }, + { + "epoch": 0.7668229485175874, + "grad_norm": 0.8031538128852844, + "learning_rate": 0.00014889748081918687, + "loss": 2.9675, + "step": 8464 + }, + { + "epoch": 0.7669135466920342, + "grad_norm": 0.806049108505249, + "learning_rate": 0.0001488914396181961, + "loss": 2.7163, + "step": 8465 + }, + { + "epoch": 0.7670041448664809, + "grad_norm": 0.7398295402526855, + "learning_rate": 0.00014888539841720534, + "loss": 2.6635, + "step": 8466 + }, + { + "epoch": 0.7670947430409277, + "grad_norm": 0.7388654947280884, + "learning_rate": 0.0001488793572162146, + "loss": 2.823, + "step": 8467 + }, + { + "epoch": 0.7671853412153745, + "grad_norm": 0.8366036415100098, + "learning_rate": 0.00014887331601522383, + "loss": 3.2034, + "step": 8468 + }, + { + "epoch": 0.7672759393898213, + "grad_norm": 0.7535519599914551, + "learning_rate": 0.0001488672748142331, + "loss": 2.555, + "step": 8469 + }, + { + "epoch": 0.7673665375642681, + "grad_norm": 0.8182070851325989, + "learning_rate": 0.0001488612336132423, + "loss": 2.7901, + "step": 8470 + }, + { + "epoch": 0.7674571357387149, + "grad_norm": 0.8224934935569763, + "learning_rate": 0.00014885519241225156, + "loss": 2.9939, + "step": 8471 + }, + { + "epoch": 0.7675477339131617, + "grad_norm": 0.7390925884246826, + "learning_rate": 0.0001488491512112608, + "loss": 2.7032, + "step": 8472 + }, + { + "epoch": 0.7676383320876085, + "grad_norm": 0.8372147679328918, + "learning_rate": 0.00014884311001027006, + "loss": 2.9392, + "step": 8473 + }, + { + "epoch": 0.7677289302620552, + "grad_norm": 0.8144082427024841, + "learning_rate": 0.0001488370688092793, + "loss": 2.9858, + "step": 8474 + }, + { + "epoch": 0.767819528436502, + "grad_norm": 0.7901778221130371, + "learning_rate": 0.00014883102760828852, + "loss": 2.6557, + "step": 8475 + }, + { + "epoch": 0.7679101266109488, + "grad_norm": 0.7759279608726501, + "learning_rate": 0.00014882498640729779, + "loss": 2.9344, + "step": 8476 + }, + { + "epoch": 0.7680007247853956, + "grad_norm": 0.7907986044883728, + "learning_rate": 0.00014881894520630702, + "loss": 2.7107, + "step": 8477 + }, + { + "epoch": 0.7680913229598424, + "grad_norm": 0.5856641530990601, + "learning_rate": 0.00014881290400531625, + "loss": 1.3874, + "step": 8478 + }, + { + "epoch": 0.7681819211342892, + "grad_norm": 0.8302209973335266, + "learning_rate": 0.0001488068628043255, + "loss": 2.6835, + "step": 8479 + }, + { + "epoch": 0.768272519308736, + "grad_norm": 0.7936270833015442, + "learning_rate": 0.00014880082160333475, + "loss": 2.8331, + "step": 8480 + }, + { + "epoch": 0.7683631174831828, + "grad_norm": 0.8583191633224487, + "learning_rate": 0.000148794780402344, + "loss": 2.6329, + "step": 8481 + }, + { + "epoch": 0.7684537156576295, + "grad_norm": 0.7576243877410889, + "learning_rate": 0.00014878873920135324, + "loss": 2.7561, + "step": 8482 + }, + { + "epoch": 0.7685443138320763, + "grad_norm": 0.8294682502746582, + "learning_rate": 0.00014878269800036248, + "loss": 2.7536, + "step": 8483 + }, + { + "epoch": 0.7686349120065231, + "grad_norm": 0.8357402086257935, + "learning_rate": 0.0001487766567993717, + "loss": 2.6753, + "step": 8484 + }, + { + "epoch": 0.7687255101809699, + "grad_norm": 0.7935557961463928, + "learning_rate": 0.00014877061559838097, + "loss": 2.8369, + "step": 8485 + }, + { + "epoch": 0.7688161083554166, + "grad_norm": 0.8072317242622375, + "learning_rate": 0.0001487645743973902, + "loss": 2.8159, + "step": 8486 + }, + { + "epoch": 0.7689067065298634, + "grad_norm": 0.7792950868606567, + "learning_rate": 0.00014875853319639944, + "loss": 2.7041, + "step": 8487 + }, + { + "epoch": 0.7689973047043102, + "grad_norm": 0.7538738250732422, + "learning_rate": 0.0001487524919954087, + "loss": 2.6651, + "step": 8488 + }, + { + "epoch": 0.769087902878757, + "grad_norm": 0.7902202010154724, + "learning_rate": 0.00014874645079441794, + "loss": 2.9021, + "step": 8489 + }, + { + "epoch": 0.7691785010532037, + "grad_norm": 0.7937014102935791, + "learning_rate": 0.0001487404095934272, + "loss": 2.5182, + "step": 8490 + }, + { + "epoch": 0.7692690992276505, + "grad_norm": 0.6697866320610046, + "learning_rate": 0.00014873436839243643, + "loss": 2.0588, + "step": 8491 + }, + { + "epoch": 0.7693596974020973, + "grad_norm": 0.6730586290359497, + "learning_rate": 0.00014872832719144567, + "loss": 2.0306, + "step": 8492 + }, + { + "epoch": 0.7694502955765441, + "grad_norm": 0.8534904718399048, + "learning_rate": 0.0001487222859904549, + "loss": 2.6759, + "step": 8493 + }, + { + "epoch": 0.7695408937509909, + "grad_norm": 0.877548098564148, + "learning_rate": 0.00014871624478946416, + "loss": 2.9075, + "step": 8494 + }, + { + "epoch": 0.7696314919254377, + "grad_norm": 0.8351582884788513, + "learning_rate": 0.0001487102035884734, + "loss": 2.7985, + "step": 8495 + }, + { + "epoch": 0.7697220900998845, + "grad_norm": 0.7988496422767639, + "learning_rate": 0.00014870416238748263, + "loss": 2.6933, + "step": 8496 + }, + { + "epoch": 0.7698126882743312, + "grad_norm": 0.7604631781578064, + "learning_rate": 0.0001486981211864919, + "loss": 2.9689, + "step": 8497 + }, + { + "epoch": 0.769903286448778, + "grad_norm": 0.8287925124168396, + "learning_rate": 0.00014869207998550112, + "loss": 2.5875, + "step": 8498 + }, + { + "epoch": 0.7699938846232248, + "grad_norm": 0.8377761840820312, + "learning_rate": 0.00014868603878451039, + "loss": 2.9023, + "step": 8499 + }, + { + "epoch": 0.7700844827976716, + "grad_norm": 0.7878649234771729, + "learning_rate": 0.0001486799975835196, + "loss": 2.7386, + "step": 8500 + }, + { + "epoch": 0.7701750809721184, + "grad_norm": 0.7550784349441528, + "learning_rate": 0.00014867395638252885, + "loss": 2.7801, + "step": 8501 + }, + { + "epoch": 0.7702656791465652, + "grad_norm": 0.740136444568634, + "learning_rate": 0.0001486679151815381, + "loss": 2.6896, + "step": 8502 + }, + { + "epoch": 0.770356277321012, + "grad_norm": 0.7615137696266174, + "learning_rate": 0.00014866187398054735, + "loss": 2.7325, + "step": 8503 + }, + { + "epoch": 0.7704468754954588, + "grad_norm": 0.7459118366241455, + "learning_rate": 0.00014865583277955658, + "loss": 2.2208, + "step": 8504 + }, + { + "epoch": 0.7705374736699055, + "grad_norm": 0.786236047744751, + "learning_rate": 0.00014864979157856582, + "loss": 2.7802, + "step": 8505 + }, + { + "epoch": 0.7706280718443523, + "grad_norm": 0.7133588790893555, + "learning_rate": 0.00014864375037757508, + "loss": 2.1126, + "step": 8506 + }, + { + "epoch": 0.7707186700187991, + "grad_norm": 0.8442530632019043, + "learning_rate": 0.0001486377091765843, + "loss": 2.9195, + "step": 8507 + }, + { + "epoch": 0.7708092681932459, + "grad_norm": 0.8609490990638733, + "learning_rate": 0.00014863166797559355, + "loss": 2.8043, + "step": 8508 + }, + { + "epoch": 0.7708998663676927, + "grad_norm": 0.9233207702636719, + "learning_rate": 0.00014862562677460278, + "loss": 2.981, + "step": 8509 + }, + { + "epoch": 0.7709904645421395, + "grad_norm": 0.8009801506996155, + "learning_rate": 0.00014861958557361204, + "loss": 2.5901, + "step": 8510 + }, + { + "epoch": 0.7710810627165863, + "grad_norm": 0.807748019695282, + "learning_rate": 0.0001486135443726213, + "loss": 2.8333, + "step": 8511 + }, + { + "epoch": 0.7711716608910331, + "grad_norm": 0.9248517751693726, + "learning_rate": 0.00014860750317163054, + "loss": 2.8128, + "step": 8512 + }, + { + "epoch": 0.7712622590654798, + "grad_norm": 0.7440809607505798, + "learning_rate": 0.00014860146197063977, + "loss": 2.9456, + "step": 8513 + }, + { + "epoch": 0.7713528572399266, + "grad_norm": 0.8893543481826782, + "learning_rate": 0.000148595420769649, + "loss": 2.6971, + "step": 8514 + }, + { + "epoch": 0.7714434554143734, + "grad_norm": 0.7654691934585571, + "learning_rate": 0.00014858937956865827, + "loss": 2.6023, + "step": 8515 + }, + { + "epoch": 0.7715340535888202, + "grad_norm": 0.7777653336524963, + "learning_rate": 0.0001485833383676675, + "loss": 2.7083, + "step": 8516 + }, + { + "epoch": 0.771624651763267, + "grad_norm": 0.8284453749656677, + "learning_rate": 0.00014857729716667673, + "loss": 2.7169, + "step": 8517 + }, + { + "epoch": 0.7717152499377138, + "grad_norm": 0.7694072127342224, + "learning_rate": 0.000148571255965686, + "loss": 2.7759, + "step": 8518 + }, + { + "epoch": 0.7718058481121606, + "grad_norm": 0.8312073945999146, + "learning_rate": 0.00014856521476469523, + "loss": 2.9016, + "step": 8519 + }, + { + "epoch": 0.7718964462866074, + "grad_norm": 0.8004971742630005, + "learning_rate": 0.0001485591735637045, + "loss": 3.0887, + "step": 8520 + }, + { + "epoch": 0.7719870444610542, + "grad_norm": 0.7277240753173828, + "learning_rate": 0.0001485531323627137, + "loss": 2.6017, + "step": 8521 + }, + { + "epoch": 0.7720776426355009, + "grad_norm": 0.8307280540466309, + "learning_rate": 0.00014854709116172296, + "loss": 3.2356, + "step": 8522 + }, + { + "epoch": 0.7721682408099477, + "grad_norm": 0.8644726276397705, + "learning_rate": 0.0001485410499607322, + "loss": 2.7671, + "step": 8523 + }, + { + "epoch": 0.7722588389843945, + "grad_norm": 0.8250706195831299, + "learning_rate": 0.00014853500875974145, + "loss": 2.8907, + "step": 8524 + }, + { + "epoch": 0.7723494371588413, + "grad_norm": 0.7014891505241394, + "learning_rate": 0.0001485289675587507, + "loss": 2.2043, + "step": 8525 + }, + { + "epoch": 0.7724400353332881, + "grad_norm": 0.794587254524231, + "learning_rate": 0.00014852292635775992, + "loss": 2.7359, + "step": 8526 + }, + { + "epoch": 0.7725306335077348, + "grad_norm": 0.73177570104599, + "learning_rate": 0.00014851688515676918, + "loss": 2.2986, + "step": 8527 + }, + { + "epoch": 0.7726212316821816, + "grad_norm": 0.8173173069953918, + "learning_rate": 0.00014851084395577842, + "loss": 2.9158, + "step": 8528 + }, + { + "epoch": 0.7727118298566283, + "grad_norm": 0.8314335942268372, + "learning_rate": 0.00014850480275478765, + "loss": 3.0519, + "step": 8529 + }, + { + "epoch": 0.7728024280310751, + "grad_norm": 0.7668262720108032, + "learning_rate": 0.00014849876155379689, + "loss": 2.8858, + "step": 8530 + }, + { + "epoch": 0.7728930262055219, + "grad_norm": 0.7756307125091553, + "learning_rate": 0.00014849272035280615, + "loss": 2.8164, + "step": 8531 + }, + { + "epoch": 0.7729836243799687, + "grad_norm": 0.7659720778465271, + "learning_rate": 0.00014848667915181538, + "loss": 2.9016, + "step": 8532 + }, + { + "epoch": 0.7730742225544155, + "grad_norm": 0.7731379866600037, + "learning_rate": 0.00014848063795082464, + "loss": 2.7834, + "step": 8533 + }, + { + "epoch": 0.7731648207288623, + "grad_norm": 0.8059349656105042, + "learning_rate": 0.00014847459674983388, + "loss": 2.9236, + "step": 8534 + }, + { + "epoch": 0.7732554189033091, + "grad_norm": 0.7573506832122803, + "learning_rate": 0.0001484685555488431, + "loss": 2.5473, + "step": 8535 + }, + { + "epoch": 0.7733460170777559, + "grad_norm": 0.829278826713562, + "learning_rate": 0.00014846251434785237, + "loss": 3.0679, + "step": 8536 + }, + { + "epoch": 0.7734366152522026, + "grad_norm": 0.8666473031044006, + "learning_rate": 0.0001484564731468616, + "loss": 2.735, + "step": 8537 + }, + { + "epoch": 0.7735272134266494, + "grad_norm": 0.8533970713615417, + "learning_rate": 0.00014845043194587084, + "loss": 2.7285, + "step": 8538 + }, + { + "epoch": 0.7736178116010962, + "grad_norm": 0.6711745262145996, + "learning_rate": 0.00014844439074488007, + "loss": 1.8492, + "step": 8539 + }, + { + "epoch": 0.773708409775543, + "grad_norm": 0.8277530670166016, + "learning_rate": 0.00014843834954388933, + "loss": 2.8632, + "step": 8540 + }, + { + "epoch": 0.7737990079499898, + "grad_norm": 0.7902617454528809, + "learning_rate": 0.0001484323083428986, + "loss": 2.8416, + "step": 8541 + }, + { + "epoch": 0.7738896061244366, + "grad_norm": 0.7179662585258484, + "learning_rate": 0.0001484262671419078, + "loss": 2.1021, + "step": 8542 + }, + { + "epoch": 0.7739802042988834, + "grad_norm": 0.7771360278129578, + "learning_rate": 0.00014842022594091706, + "loss": 2.577, + "step": 8543 + }, + { + "epoch": 0.7740708024733302, + "grad_norm": 0.805556058883667, + "learning_rate": 0.0001484141847399263, + "loss": 2.7977, + "step": 8544 + }, + { + "epoch": 0.7741614006477769, + "grad_norm": 0.8210350871086121, + "learning_rate": 0.00014840814353893556, + "loss": 2.9143, + "step": 8545 + }, + { + "epoch": 0.7742519988222237, + "grad_norm": 0.6937916278839111, + "learning_rate": 0.0001484021023379448, + "loss": 2.1218, + "step": 8546 + }, + { + "epoch": 0.7743425969966705, + "grad_norm": 0.8206404447555542, + "learning_rate": 0.00014839606113695403, + "loss": 2.9967, + "step": 8547 + }, + { + "epoch": 0.7744331951711173, + "grad_norm": 0.828425407409668, + "learning_rate": 0.0001483900199359633, + "loss": 2.7338, + "step": 8548 + }, + { + "epoch": 0.7745237933455641, + "grad_norm": 0.7431089878082275, + "learning_rate": 0.00014838397873497252, + "loss": 1.839, + "step": 8549 + }, + { + "epoch": 0.7746143915200109, + "grad_norm": 0.753633975982666, + "learning_rate": 0.00014837793753398178, + "loss": 2.0149, + "step": 8550 + }, + { + "epoch": 0.7747049896944577, + "grad_norm": 0.645919144153595, + "learning_rate": 0.000148371896332991, + "loss": 1.9481, + "step": 8551 + }, + { + "epoch": 0.7747955878689045, + "grad_norm": 0.7816630601882935, + "learning_rate": 0.00014836585513200025, + "loss": 2.7481, + "step": 8552 + }, + { + "epoch": 0.7748861860433512, + "grad_norm": 0.9030842781066895, + "learning_rate": 0.00014835981393100949, + "loss": 2.4311, + "step": 8553 + }, + { + "epoch": 0.774976784217798, + "grad_norm": 0.8046669960021973, + "learning_rate": 0.00014835377273001875, + "loss": 2.9624, + "step": 8554 + }, + { + "epoch": 0.7750673823922448, + "grad_norm": 0.8168879747390747, + "learning_rate": 0.00014834773152902795, + "loss": 2.6256, + "step": 8555 + }, + { + "epoch": 0.7751579805666916, + "grad_norm": 0.8529312610626221, + "learning_rate": 0.00014834169032803721, + "loss": 2.7557, + "step": 8556 + }, + { + "epoch": 0.7752485787411384, + "grad_norm": 0.7763368487358093, + "learning_rate": 0.00014833564912704648, + "loss": 2.6161, + "step": 8557 + }, + { + "epoch": 0.7753391769155852, + "grad_norm": 0.7348354458808899, + "learning_rate": 0.0001483296079260557, + "loss": 2.73, + "step": 8558 + }, + { + "epoch": 0.775429775090032, + "grad_norm": 0.8104738593101501, + "learning_rate": 0.00014832356672506494, + "loss": 2.604, + "step": 8559 + }, + { + "epoch": 0.7755203732644788, + "grad_norm": 0.7946962118148804, + "learning_rate": 0.00014831752552407418, + "loss": 2.5968, + "step": 8560 + }, + { + "epoch": 0.7756109714389255, + "grad_norm": 0.77519291639328, + "learning_rate": 0.00014831148432308344, + "loss": 3.0435, + "step": 8561 + }, + { + "epoch": 0.7757015696133723, + "grad_norm": 0.8071316480636597, + "learning_rate": 0.00014830544312209267, + "loss": 2.8336, + "step": 8562 + }, + { + "epoch": 0.7757921677878191, + "grad_norm": 0.8014549612998962, + "learning_rate": 0.00014829940192110193, + "loss": 2.8515, + "step": 8563 + }, + { + "epoch": 0.7758827659622659, + "grad_norm": 0.7917729020118713, + "learning_rate": 0.00014829336072011117, + "loss": 2.8425, + "step": 8564 + }, + { + "epoch": 0.7759733641367127, + "grad_norm": 0.769550621509552, + "learning_rate": 0.0001482873195191204, + "loss": 2.741, + "step": 8565 + }, + { + "epoch": 0.7760639623111595, + "grad_norm": 0.7589368224143982, + "learning_rate": 0.00014828127831812966, + "loss": 2.7433, + "step": 8566 + }, + { + "epoch": 0.7761545604856062, + "grad_norm": 0.6581156253814697, + "learning_rate": 0.0001482752371171389, + "loss": 2.2202, + "step": 8567 + }, + { + "epoch": 0.776245158660053, + "grad_norm": 0.7990230917930603, + "learning_rate": 0.00014826919591614813, + "loss": 2.909, + "step": 8568 + }, + { + "epoch": 0.7763357568344997, + "grad_norm": 0.7747130990028381, + "learning_rate": 0.00014826315471515737, + "loss": 2.6816, + "step": 8569 + }, + { + "epoch": 0.7764263550089465, + "grad_norm": 0.7905811667442322, + "learning_rate": 0.00014825711351416663, + "loss": 3.0479, + "step": 8570 + }, + { + "epoch": 0.7765169531833933, + "grad_norm": 0.7439942955970764, + "learning_rate": 0.0001482510723131759, + "loss": 2.8468, + "step": 8571 + }, + { + "epoch": 0.7766075513578401, + "grad_norm": 0.809313952922821, + "learning_rate": 0.0001482450311121851, + "loss": 2.7106, + "step": 8572 + }, + { + "epoch": 0.7766981495322869, + "grad_norm": 0.768600583076477, + "learning_rate": 0.00014823898991119436, + "loss": 2.6386, + "step": 8573 + }, + { + "epoch": 0.7767887477067337, + "grad_norm": 0.7933810949325562, + "learning_rate": 0.0001482329487102036, + "loss": 2.769, + "step": 8574 + }, + { + "epoch": 0.7768793458811805, + "grad_norm": 0.7973137497901917, + "learning_rate": 0.00014822690750921285, + "loss": 2.5922, + "step": 8575 + }, + { + "epoch": 0.7769699440556272, + "grad_norm": 0.8396936655044556, + "learning_rate": 0.00014822086630822209, + "loss": 2.7634, + "step": 8576 + }, + { + "epoch": 0.777060542230074, + "grad_norm": 0.7518828511238098, + "learning_rate": 0.00014821482510723132, + "loss": 2.6311, + "step": 8577 + }, + { + "epoch": 0.7771511404045208, + "grad_norm": 0.8389133214950562, + "learning_rate": 0.00014820878390624058, + "loss": 2.7374, + "step": 8578 + }, + { + "epoch": 0.7772417385789676, + "grad_norm": 0.8448631167411804, + "learning_rate": 0.00014820274270524981, + "loss": 2.6915, + "step": 8579 + }, + { + "epoch": 0.7773323367534144, + "grad_norm": 0.7442335486412048, + "learning_rate": 0.00014819670150425905, + "loss": 3.0069, + "step": 8580 + }, + { + "epoch": 0.7774229349278612, + "grad_norm": 0.7361761331558228, + "learning_rate": 0.00014819066030326828, + "loss": 2.318, + "step": 8581 + }, + { + "epoch": 0.777513533102308, + "grad_norm": 0.8028291463851929, + "learning_rate": 0.00014818461910227754, + "loss": 2.8657, + "step": 8582 + }, + { + "epoch": 0.7776041312767548, + "grad_norm": 0.8905485272407532, + "learning_rate": 0.00014817857790128678, + "loss": 2.8834, + "step": 8583 + }, + { + "epoch": 0.7776947294512015, + "grad_norm": 0.8516442775726318, + "learning_rate": 0.00014817253670029604, + "loss": 2.6708, + "step": 8584 + }, + { + "epoch": 0.7777853276256483, + "grad_norm": 0.8545002937316895, + "learning_rate": 0.00014816649549930527, + "loss": 3.0086, + "step": 8585 + }, + { + "epoch": 0.7778759258000951, + "grad_norm": 0.8146714568138123, + "learning_rate": 0.0001481604542983145, + "loss": 2.612, + "step": 8586 + }, + { + "epoch": 0.7779665239745419, + "grad_norm": 0.7887700200080872, + "learning_rate": 0.00014815441309732377, + "loss": 2.9216, + "step": 8587 + }, + { + "epoch": 0.7780571221489887, + "grad_norm": 0.7851274609565735, + "learning_rate": 0.000148148371896333, + "loss": 2.7005, + "step": 8588 + }, + { + "epoch": 0.7781477203234355, + "grad_norm": 0.7738264799118042, + "learning_rate": 0.00014814233069534224, + "loss": 2.989, + "step": 8589 + }, + { + "epoch": 0.7782383184978823, + "grad_norm": 0.8023179173469543, + "learning_rate": 0.00014813628949435147, + "loss": 2.4457, + "step": 8590 + }, + { + "epoch": 0.7783289166723291, + "grad_norm": 0.8215933442115784, + "learning_rate": 0.00014813024829336073, + "loss": 2.8348, + "step": 8591 + }, + { + "epoch": 0.7784195148467759, + "grad_norm": 0.8079305291175842, + "learning_rate": 0.00014812420709236997, + "loss": 2.7107, + "step": 8592 + }, + { + "epoch": 0.7785101130212226, + "grad_norm": 0.8143686652183533, + "learning_rate": 0.0001481181658913792, + "loss": 2.7492, + "step": 8593 + }, + { + "epoch": 0.7786007111956694, + "grad_norm": 0.8056140542030334, + "learning_rate": 0.00014811212469038846, + "loss": 2.6823, + "step": 8594 + }, + { + "epoch": 0.7786913093701162, + "grad_norm": 0.8741399645805359, + "learning_rate": 0.0001481060834893977, + "loss": 2.5784, + "step": 8595 + }, + { + "epoch": 0.778781907544563, + "grad_norm": 0.830871045589447, + "learning_rate": 0.00014810004228840696, + "loss": 2.5409, + "step": 8596 + }, + { + "epoch": 0.7788725057190098, + "grad_norm": 0.8486402630805969, + "learning_rate": 0.0001480940010874162, + "loss": 3.0587, + "step": 8597 + }, + { + "epoch": 0.7789631038934566, + "grad_norm": 0.8004481196403503, + "learning_rate": 0.00014808795988642542, + "loss": 2.887, + "step": 8598 + }, + { + "epoch": 0.7790537020679034, + "grad_norm": 0.838428795337677, + "learning_rate": 0.00014808191868543466, + "loss": 2.7673, + "step": 8599 + }, + { + "epoch": 0.7791443002423502, + "grad_norm": 0.745265543460846, + "learning_rate": 0.00014807587748444392, + "loss": 2.7148, + "step": 8600 + }, + { + "epoch": 0.7792348984167969, + "grad_norm": 0.7437013387680054, + "learning_rate": 0.00014806983628345318, + "loss": 2.2711, + "step": 8601 + }, + { + "epoch": 0.7793254965912437, + "grad_norm": 0.9211732745170593, + "learning_rate": 0.0001480637950824624, + "loss": 3.1433, + "step": 8602 + }, + { + "epoch": 0.7794160947656905, + "grad_norm": 0.8364513516426086, + "learning_rate": 0.00014805775388147165, + "loss": 2.8334, + "step": 8603 + }, + { + "epoch": 0.7795066929401373, + "grad_norm": 0.789383590221405, + "learning_rate": 0.00014805171268048088, + "loss": 2.8643, + "step": 8604 + }, + { + "epoch": 0.7795972911145841, + "grad_norm": 0.7064113020896912, + "learning_rate": 0.00014804567147949014, + "loss": 2.3323, + "step": 8605 + }, + { + "epoch": 0.7796878892890309, + "grad_norm": 0.8131768107414246, + "learning_rate": 0.00014803963027849935, + "loss": 3.1394, + "step": 8606 + }, + { + "epoch": 0.7797784874634777, + "grad_norm": 0.794195830821991, + "learning_rate": 0.0001480335890775086, + "loss": 2.929, + "step": 8607 + }, + { + "epoch": 0.7798690856379243, + "grad_norm": 0.7934302687644958, + "learning_rate": 0.00014802754787651787, + "loss": 2.9108, + "step": 8608 + }, + { + "epoch": 0.7799596838123711, + "grad_norm": 0.7721412181854248, + "learning_rate": 0.0001480215066755271, + "loss": 2.7136, + "step": 8609 + }, + { + "epoch": 0.7800502819868179, + "grad_norm": 0.7593920826911926, + "learning_rate": 0.00014801546547453634, + "loss": 2.8116, + "step": 8610 + }, + { + "epoch": 0.7801408801612647, + "grad_norm": 0.797433078289032, + "learning_rate": 0.00014800942427354558, + "loss": 2.7577, + "step": 8611 + }, + { + "epoch": 0.7802314783357115, + "grad_norm": 0.7827565670013428, + "learning_rate": 0.00014800338307255484, + "loss": 2.793, + "step": 8612 + }, + { + "epoch": 0.7803220765101583, + "grad_norm": 0.7735447883605957, + "learning_rate": 0.00014799734187156407, + "loss": 3.0454, + "step": 8613 + }, + { + "epoch": 0.7804126746846051, + "grad_norm": 0.644696831703186, + "learning_rate": 0.00014799130067057333, + "loss": 2.2172, + "step": 8614 + }, + { + "epoch": 0.7805032728590519, + "grad_norm": 0.7641891837120056, + "learning_rate": 0.00014798525946958257, + "loss": 2.8248, + "step": 8615 + }, + { + "epoch": 0.7805938710334986, + "grad_norm": 0.7737988233566284, + "learning_rate": 0.0001479792182685918, + "loss": 2.9162, + "step": 8616 + }, + { + "epoch": 0.7806844692079454, + "grad_norm": 0.78532475233078, + "learning_rate": 0.00014797317706760106, + "loss": 3.0485, + "step": 8617 + }, + { + "epoch": 0.7807750673823922, + "grad_norm": 0.5501746535301208, + "learning_rate": 0.0001479671358666103, + "loss": 1.686, + "step": 8618 + }, + { + "epoch": 0.780865665556839, + "grad_norm": 0.809466540813446, + "learning_rate": 0.00014796109466561953, + "loss": 2.9767, + "step": 8619 + }, + { + "epoch": 0.7809562637312858, + "grad_norm": 0.801041841506958, + "learning_rate": 0.00014795505346462876, + "loss": 2.8438, + "step": 8620 + }, + { + "epoch": 0.7810468619057326, + "grad_norm": 0.6728940606117249, + "learning_rate": 0.00014794901226363802, + "loss": 2.0751, + "step": 8621 + }, + { + "epoch": 0.7811374600801794, + "grad_norm": 0.7518661022186279, + "learning_rate": 0.00014794297106264726, + "loss": 2.9436, + "step": 8622 + }, + { + "epoch": 0.7812280582546262, + "grad_norm": 0.7414668798446655, + "learning_rate": 0.0001479369298616565, + "loss": 2.5449, + "step": 8623 + }, + { + "epoch": 0.7813186564290729, + "grad_norm": 0.8107067346572876, + "learning_rate": 0.00014793088866066575, + "loss": 3.102, + "step": 8624 + }, + { + "epoch": 0.7814092546035197, + "grad_norm": 0.7886057496070862, + "learning_rate": 0.000147924847459675, + "loss": 1.9972, + "step": 8625 + }, + { + "epoch": 0.7814998527779665, + "grad_norm": 0.668975293636322, + "learning_rate": 0.00014791880625868425, + "loss": 2.1379, + "step": 8626 + }, + { + "epoch": 0.7815904509524133, + "grad_norm": 0.6357923746109009, + "learning_rate": 0.00014791276505769348, + "loss": 1.9424, + "step": 8627 + }, + { + "epoch": 0.7816810491268601, + "grad_norm": 0.6584326028823853, + "learning_rate": 0.00014790672385670272, + "loss": 1.9215, + "step": 8628 + }, + { + "epoch": 0.7817716473013069, + "grad_norm": 0.7105445265769958, + "learning_rate": 0.00014790068265571195, + "loss": 1.9438, + "step": 8629 + }, + { + "epoch": 0.7818622454757537, + "grad_norm": 0.7996451258659363, + "learning_rate": 0.0001478946414547212, + "loss": 2.8858, + "step": 8630 + }, + { + "epoch": 0.7819528436502005, + "grad_norm": 0.8207160830497742, + "learning_rate": 0.00014788860025373045, + "loss": 2.922, + "step": 8631 + }, + { + "epoch": 0.7820434418246472, + "grad_norm": 0.7695946097373962, + "learning_rate": 0.00014788255905273968, + "loss": 2.8241, + "step": 8632 + }, + { + "epoch": 0.782134039999094, + "grad_norm": 0.9017314314842224, + "learning_rate": 0.00014787651785174894, + "loss": 2.8182, + "step": 8633 + }, + { + "epoch": 0.7822246381735408, + "grad_norm": 0.6749444603919983, + "learning_rate": 0.00014787047665075818, + "loss": 2.124, + "step": 8634 + }, + { + "epoch": 0.7823152363479876, + "grad_norm": 0.7530258893966675, + "learning_rate": 0.00014786443544976744, + "loss": 2.0119, + "step": 8635 + }, + { + "epoch": 0.7824058345224344, + "grad_norm": 0.6887661814689636, + "learning_rate": 0.00014785839424877664, + "loss": 2.219, + "step": 8636 + }, + { + "epoch": 0.7824964326968812, + "grad_norm": 0.7802941203117371, + "learning_rate": 0.0001478523530477859, + "loss": 2.6366, + "step": 8637 + }, + { + "epoch": 0.782587030871328, + "grad_norm": 0.7701817750930786, + "learning_rate": 0.00014784631184679517, + "loss": 2.9344, + "step": 8638 + }, + { + "epoch": 0.7826776290457748, + "grad_norm": 0.7519884705543518, + "learning_rate": 0.0001478402706458044, + "loss": 2.722, + "step": 8639 + }, + { + "epoch": 0.7827682272202215, + "grad_norm": 0.6917465925216675, + "learning_rate": 0.00014783422944481363, + "loss": 2.0246, + "step": 8640 + }, + { + "epoch": 0.7828588253946683, + "grad_norm": 0.7773909568786621, + "learning_rate": 0.00014782818824382287, + "loss": 2.9324, + "step": 8641 + }, + { + "epoch": 0.7829494235691151, + "grad_norm": 0.7791172862052917, + "learning_rate": 0.00014782214704283213, + "loss": 3.001, + "step": 8642 + }, + { + "epoch": 0.7830400217435619, + "grad_norm": 0.7919048070907593, + "learning_rate": 0.00014781610584184136, + "loss": 2.839, + "step": 8643 + }, + { + "epoch": 0.7831306199180087, + "grad_norm": 0.8248032331466675, + "learning_rate": 0.0001478100646408506, + "loss": 2.7472, + "step": 8644 + }, + { + "epoch": 0.7832212180924555, + "grad_norm": 0.8593996167182922, + "learning_rate": 0.00014780402343985986, + "loss": 3.1804, + "step": 8645 + }, + { + "epoch": 0.7833118162669023, + "grad_norm": 0.818397581577301, + "learning_rate": 0.0001477979822388691, + "loss": 2.9421, + "step": 8646 + }, + { + "epoch": 0.7834024144413491, + "grad_norm": 0.7765064239501953, + "learning_rate": 0.00014779194103787835, + "loss": 2.6091, + "step": 8647 + }, + { + "epoch": 0.7834930126157957, + "grad_norm": 0.769512951374054, + "learning_rate": 0.0001477858998368876, + "loss": 2.8026, + "step": 8648 + }, + { + "epoch": 0.7835836107902425, + "grad_norm": 0.780407726764679, + "learning_rate": 0.00014777985863589682, + "loss": 2.8195, + "step": 8649 + }, + { + "epoch": 0.7836742089646893, + "grad_norm": 0.7806561589241028, + "learning_rate": 0.00014777381743490606, + "loss": 2.8581, + "step": 8650 + }, + { + "epoch": 0.7837648071391361, + "grad_norm": 0.7847487330436707, + "learning_rate": 0.00014776777623391532, + "loss": 2.7211, + "step": 8651 + }, + { + "epoch": 0.7838554053135829, + "grad_norm": 0.804573655128479, + "learning_rate": 0.00014776173503292455, + "loss": 2.5657, + "step": 8652 + }, + { + "epoch": 0.7839460034880297, + "grad_norm": 0.7811828255653381, + "learning_rate": 0.00014775569383193379, + "loss": 2.6499, + "step": 8653 + }, + { + "epoch": 0.7840366016624765, + "grad_norm": 0.8242861032485962, + "learning_rate": 0.00014774965263094305, + "loss": 3.042, + "step": 8654 + }, + { + "epoch": 0.7841271998369232, + "grad_norm": 0.8296350836753845, + "learning_rate": 0.00014774361142995228, + "loss": 2.8797, + "step": 8655 + }, + { + "epoch": 0.78421779801137, + "grad_norm": 0.811955988407135, + "learning_rate": 0.00014773757022896154, + "loss": 2.9148, + "step": 8656 + }, + { + "epoch": 0.7843083961858168, + "grad_norm": 0.8265569806098938, + "learning_rate": 0.00014773152902797075, + "loss": 2.8094, + "step": 8657 + }, + { + "epoch": 0.7843989943602636, + "grad_norm": 0.7842000722885132, + "learning_rate": 0.00014772548782698, + "loss": 3.0688, + "step": 8658 + }, + { + "epoch": 0.7844895925347104, + "grad_norm": 0.7550154328346252, + "learning_rate": 0.00014771944662598924, + "loss": 2.6801, + "step": 8659 + }, + { + "epoch": 0.7845801907091572, + "grad_norm": 0.7771438360214233, + "learning_rate": 0.0001477134054249985, + "loss": 3.1333, + "step": 8660 + }, + { + "epoch": 0.784670788883604, + "grad_norm": 0.7808748483657837, + "learning_rate": 0.00014770736422400774, + "loss": 2.9319, + "step": 8661 + }, + { + "epoch": 0.7847613870580508, + "grad_norm": 0.8817659616470337, + "learning_rate": 0.00014770132302301697, + "loss": 3.0566, + "step": 8662 + }, + { + "epoch": 0.7848519852324976, + "grad_norm": 0.9274734258651733, + "learning_rate": 0.00014769528182202623, + "loss": 2.9391, + "step": 8663 + }, + { + "epoch": 0.7849425834069443, + "grad_norm": 0.7661842107772827, + "learning_rate": 0.00014768924062103547, + "loss": 2.8317, + "step": 8664 + }, + { + "epoch": 0.7850331815813911, + "grad_norm": 0.7827793955802917, + "learning_rate": 0.0001476831994200447, + "loss": 2.9466, + "step": 8665 + }, + { + "epoch": 0.7851237797558379, + "grad_norm": 0.7496156692504883, + "learning_rate": 0.00014767715821905394, + "loss": 2.914, + "step": 8666 + }, + { + "epoch": 0.7852143779302847, + "grad_norm": 0.8063421249389648, + "learning_rate": 0.0001476711170180632, + "loss": 2.984, + "step": 8667 + }, + { + "epoch": 0.7853049761047315, + "grad_norm": 0.8234055638313293, + "learning_rate": 0.00014766507581707246, + "loss": 3.0118, + "step": 8668 + }, + { + "epoch": 0.7853955742791783, + "grad_norm": 0.9053353071212769, + "learning_rate": 0.0001476590346160817, + "loss": 2.7758, + "step": 8669 + }, + { + "epoch": 0.7854861724536251, + "grad_norm": 0.7942331433296204, + "learning_rate": 0.00014765299341509093, + "loss": 2.8511, + "step": 8670 + }, + { + "epoch": 0.7855767706280719, + "grad_norm": 0.7914716005325317, + "learning_rate": 0.00014764695221410016, + "loss": 2.7188, + "step": 8671 + }, + { + "epoch": 0.7856673688025186, + "grad_norm": 0.8186629414558411, + "learning_rate": 0.00014764091101310942, + "loss": 2.8307, + "step": 8672 + }, + { + "epoch": 0.7857579669769654, + "grad_norm": 0.7334097623825073, + "learning_rate": 0.00014763486981211866, + "loss": 2.4807, + "step": 8673 + }, + { + "epoch": 0.7858485651514122, + "grad_norm": 0.7371491193771362, + "learning_rate": 0.0001476288286111279, + "loss": 2.8582, + "step": 8674 + }, + { + "epoch": 0.785939163325859, + "grad_norm": 0.74226313829422, + "learning_rate": 0.00014762278741013715, + "loss": 2.8396, + "step": 8675 + }, + { + "epoch": 0.7860297615003058, + "grad_norm": 0.7828328609466553, + "learning_rate": 0.00014761674620914639, + "loss": 2.6224, + "step": 8676 + }, + { + "epoch": 0.7861203596747526, + "grad_norm": 0.8833471536636353, + "learning_rate": 0.00014761070500815565, + "loss": 2.749, + "step": 8677 + }, + { + "epoch": 0.7862109578491994, + "grad_norm": 0.7836803197860718, + "learning_rate": 0.00014760466380716488, + "loss": 2.6854, + "step": 8678 + }, + { + "epoch": 0.7863015560236462, + "grad_norm": 0.7633175849914551, + "learning_rate": 0.00014759862260617411, + "loss": 2.6613, + "step": 8679 + }, + { + "epoch": 0.7863921541980929, + "grad_norm": 0.7915408611297607, + "learning_rate": 0.00014759258140518335, + "loss": 2.7391, + "step": 8680 + }, + { + "epoch": 0.7864827523725397, + "grad_norm": 0.9370132088661194, + "learning_rate": 0.0001475865402041926, + "loss": 2.9475, + "step": 8681 + }, + { + "epoch": 0.7865733505469865, + "grad_norm": 0.8257835507392883, + "learning_rate": 0.00014758049900320184, + "loss": 2.7433, + "step": 8682 + }, + { + "epoch": 0.7866639487214333, + "grad_norm": 0.7704206109046936, + "learning_rate": 0.00014757445780221108, + "loss": 1.9602, + "step": 8683 + }, + { + "epoch": 0.7867545468958801, + "grad_norm": 0.820258617401123, + "learning_rate": 0.00014756841660122034, + "loss": 2.7173, + "step": 8684 + }, + { + "epoch": 0.7868451450703269, + "grad_norm": 0.8370013236999512, + "learning_rate": 0.00014756237540022957, + "loss": 2.9949, + "step": 8685 + }, + { + "epoch": 0.7869357432447737, + "grad_norm": 0.7663812041282654, + "learning_rate": 0.00014755633419923883, + "loss": 2.6809, + "step": 8686 + }, + { + "epoch": 0.7870263414192205, + "grad_norm": 0.7725523114204407, + "learning_rate": 0.00014755029299824804, + "loss": 3.0026, + "step": 8687 + }, + { + "epoch": 0.7871169395936672, + "grad_norm": 0.7906398177146912, + "learning_rate": 0.0001475442517972573, + "loss": 2.8284, + "step": 8688 + }, + { + "epoch": 0.7872075377681139, + "grad_norm": 0.7827199697494507, + "learning_rate": 0.00014753821059626654, + "loss": 2.5964, + "step": 8689 + }, + { + "epoch": 0.7872981359425607, + "grad_norm": 0.8441091179847717, + "learning_rate": 0.0001475321693952758, + "loss": 2.8823, + "step": 8690 + }, + { + "epoch": 0.7873887341170075, + "grad_norm": 0.7810313701629639, + "learning_rate": 0.00014752612819428503, + "loss": 2.5872, + "step": 8691 + }, + { + "epoch": 0.7874793322914543, + "grad_norm": 0.7749983072280884, + "learning_rate": 0.00014752008699329427, + "loss": 2.4531, + "step": 8692 + }, + { + "epoch": 0.7875699304659011, + "grad_norm": 0.830188512802124, + "learning_rate": 0.00014751404579230353, + "loss": 2.8768, + "step": 8693 + }, + { + "epoch": 0.7876605286403479, + "grad_norm": 0.773688554763794, + "learning_rate": 0.00014750800459131276, + "loss": 2.5843, + "step": 8694 + }, + { + "epoch": 0.7877511268147946, + "grad_norm": 0.7651098370552063, + "learning_rate": 0.000147501963390322, + "loss": 2.6625, + "step": 8695 + }, + { + "epoch": 0.7878417249892414, + "grad_norm": 0.7952204942703247, + "learning_rate": 0.00014749592218933123, + "loss": 2.9295, + "step": 8696 + }, + { + "epoch": 0.7879323231636882, + "grad_norm": 0.8045949339866638, + "learning_rate": 0.0001474898809883405, + "loss": 3.0074, + "step": 8697 + }, + { + "epoch": 0.788022921338135, + "grad_norm": 0.9160898923873901, + "learning_rate": 0.00014748383978734975, + "loss": 2.8072, + "step": 8698 + }, + { + "epoch": 0.7881135195125818, + "grad_norm": 0.661478579044342, + "learning_rate": 0.00014747779858635899, + "loss": 1.9709, + "step": 8699 + }, + { + "epoch": 0.7882041176870286, + "grad_norm": 0.7630544304847717, + "learning_rate": 0.00014747175738536822, + "loss": 2.847, + "step": 8700 + }, + { + "epoch": 0.7882947158614754, + "grad_norm": 0.7880386114120483, + "learning_rate": 0.00014746571618437745, + "loss": 2.8575, + "step": 8701 + }, + { + "epoch": 0.7883853140359222, + "grad_norm": 0.8049412369728088, + "learning_rate": 0.00014745967498338671, + "loss": 2.7482, + "step": 8702 + }, + { + "epoch": 0.788475912210369, + "grad_norm": 0.8050287961959839, + "learning_rate": 0.00014745363378239595, + "loss": 2.9397, + "step": 8703 + }, + { + "epoch": 0.7885665103848157, + "grad_norm": 0.781440794467926, + "learning_rate": 0.00014744759258140518, + "loss": 2.6829, + "step": 8704 + }, + { + "epoch": 0.7886571085592625, + "grad_norm": 0.8433947563171387, + "learning_rate": 0.00014744155138041444, + "loss": 2.6954, + "step": 8705 + }, + { + "epoch": 0.7887477067337093, + "grad_norm": 0.7846220135688782, + "learning_rate": 0.00014743551017942368, + "loss": 2.6819, + "step": 8706 + }, + { + "epoch": 0.7888383049081561, + "grad_norm": 0.8577587008476257, + "learning_rate": 0.00014742946897843294, + "loss": 2.9122, + "step": 8707 + }, + { + "epoch": 0.7889289030826029, + "grad_norm": 0.630641520023346, + "learning_rate": 0.00014742342777744215, + "loss": 2.0851, + "step": 8708 + }, + { + "epoch": 0.7890195012570497, + "grad_norm": 0.7821537256240845, + "learning_rate": 0.0001474173865764514, + "loss": 2.5869, + "step": 8709 + }, + { + "epoch": 0.7891100994314965, + "grad_norm": 0.8037095069885254, + "learning_rate": 0.00014741134537546064, + "loss": 2.8002, + "step": 8710 + }, + { + "epoch": 0.7892006976059432, + "grad_norm": 0.7228437066078186, + "learning_rate": 0.0001474053041744699, + "loss": 2.1898, + "step": 8711 + }, + { + "epoch": 0.78929129578039, + "grad_norm": 0.6586551666259766, + "learning_rate": 0.00014739926297347914, + "loss": 1.895, + "step": 8712 + }, + { + "epoch": 0.7893818939548368, + "grad_norm": 0.6922949552536011, + "learning_rate": 0.00014739322177248837, + "loss": 2.1388, + "step": 8713 + }, + { + "epoch": 0.7894724921292836, + "grad_norm": 0.8695371150970459, + "learning_rate": 0.00014738718057149763, + "loss": 2.9091, + "step": 8714 + }, + { + "epoch": 0.7895630903037304, + "grad_norm": 0.8348348736763, + "learning_rate": 0.00014738113937050687, + "loss": 3.0953, + "step": 8715 + }, + { + "epoch": 0.7896536884781772, + "grad_norm": 0.7727453708648682, + "learning_rate": 0.0001473750981695161, + "loss": 2.7205, + "step": 8716 + }, + { + "epoch": 0.789744286652624, + "grad_norm": 0.8555764555931091, + "learning_rate": 0.00014736905696852533, + "loss": 2.5347, + "step": 8717 + }, + { + "epoch": 0.7898348848270708, + "grad_norm": 0.8106939196586609, + "learning_rate": 0.0001473630157675346, + "loss": 2.5612, + "step": 8718 + }, + { + "epoch": 0.7899254830015175, + "grad_norm": 0.8361684679985046, + "learning_rate": 0.00014735697456654383, + "loss": 2.7044, + "step": 8719 + }, + { + "epoch": 0.7900160811759643, + "grad_norm": 0.8038851022720337, + "learning_rate": 0.0001473509333655531, + "loss": 2.8672, + "step": 8720 + }, + { + "epoch": 0.7901066793504111, + "grad_norm": 0.7912235856056213, + "learning_rate": 0.00014734489216456232, + "loss": 2.8488, + "step": 8721 + }, + { + "epoch": 0.7901972775248579, + "grad_norm": 0.7564436197280884, + "learning_rate": 0.00014733885096357156, + "loss": 2.6939, + "step": 8722 + }, + { + "epoch": 0.7902878756993047, + "grad_norm": 0.883049488067627, + "learning_rate": 0.00014733280976258082, + "loss": 2.8769, + "step": 8723 + }, + { + "epoch": 0.7903784738737515, + "grad_norm": 0.8107849359512329, + "learning_rate": 0.00014732676856159005, + "loss": 2.9635, + "step": 8724 + }, + { + "epoch": 0.7904690720481983, + "grad_norm": 0.8258549571037292, + "learning_rate": 0.0001473207273605993, + "loss": 2.8556, + "step": 8725 + }, + { + "epoch": 0.7905596702226451, + "grad_norm": 0.7767406702041626, + "learning_rate": 0.00014731468615960852, + "loss": 2.7151, + "step": 8726 + }, + { + "epoch": 0.7906502683970918, + "grad_norm": 0.8351162672042847, + "learning_rate": 0.00014730864495861778, + "loss": 2.8686, + "step": 8727 + }, + { + "epoch": 0.7907408665715386, + "grad_norm": 0.8291149735450745, + "learning_rate": 0.00014730260375762704, + "loss": 2.7977, + "step": 8728 + }, + { + "epoch": 0.7908314647459853, + "grad_norm": 0.8190923929214478, + "learning_rate": 0.00014729656255663625, + "loss": 2.9975, + "step": 8729 + }, + { + "epoch": 0.7909220629204321, + "grad_norm": 0.7572120428085327, + "learning_rate": 0.0001472905213556455, + "loss": 2.8148, + "step": 8730 + }, + { + "epoch": 0.7910126610948789, + "grad_norm": 0.7724706530570984, + "learning_rate": 0.00014728448015465475, + "loss": 2.6654, + "step": 8731 + }, + { + "epoch": 0.7911032592693257, + "grad_norm": 0.7532154321670532, + "learning_rate": 0.000147278438953664, + "loss": 2.6128, + "step": 8732 + }, + { + "epoch": 0.7911938574437725, + "grad_norm": 0.7866023182868958, + "learning_rate": 0.00014727239775267324, + "loss": 2.5652, + "step": 8733 + }, + { + "epoch": 0.7912844556182193, + "grad_norm": 0.7979137301445007, + "learning_rate": 0.00014726635655168248, + "loss": 2.8344, + "step": 8734 + }, + { + "epoch": 0.791375053792666, + "grad_norm": 0.7800737023353577, + "learning_rate": 0.00014726031535069174, + "loss": 2.8409, + "step": 8735 + }, + { + "epoch": 0.7914656519671128, + "grad_norm": 0.8172112107276917, + "learning_rate": 0.00014725427414970097, + "loss": 2.7898, + "step": 8736 + }, + { + "epoch": 0.7915562501415596, + "grad_norm": 0.7783122658729553, + "learning_rate": 0.00014724823294871023, + "loss": 2.9668, + "step": 8737 + }, + { + "epoch": 0.7916468483160064, + "grad_norm": 0.7993232607841492, + "learning_rate": 0.00014724219174771944, + "loss": 2.9026, + "step": 8738 + }, + { + "epoch": 0.7917374464904532, + "grad_norm": 0.8901943564414978, + "learning_rate": 0.0001472361505467287, + "loss": 2.8585, + "step": 8739 + }, + { + "epoch": 0.7918280446649, + "grad_norm": 0.8087913990020752, + "learning_rate": 0.00014723010934573793, + "loss": 2.6497, + "step": 8740 + }, + { + "epoch": 0.7919186428393468, + "grad_norm": 0.753854513168335, + "learning_rate": 0.0001472240681447472, + "loss": 2.6993, + "step": 8741 + }, + { + "epoch": 0.7920092410137936, + "grad_norm": 0.8032562136650085, + "learning_rate": 0.0001472180269437564, + "loss": 2.8255, + "step": 8742 + }, + { + "epoch": 0.7920998391882403, + "grad_norm": 0.7655273675918579, + "learning_rate": 0.00014721198574276566, + "loss": 2.8181, + "step": 8743 + }, + { + "epoch": 0.7921904373626871, + "grad_norm": 0.7803038954734802, + "learning_rate": 0.00014720594454177492, + "loss": 2.9387, + "step": 8744 + }, + { + "epoch": 0.7922810355371339, + "grad_norm": 0.7796838879585266, + "learning_rate": 0.00014719990334078416, + "loss": 2.8431, + "step": 8745 + }, + { + "epoch": 0.7923716337115807, + "grad_norm": 0.7973790168762207, + "learning_rate": 0.0001471938621397934, + "loss": 2.7568, + "step": 8746 + }, + { + "epoch": 0.7924622318860275, + "grad_norm": 0.7856976389884949, + "learning_rate": 0.00014718782093880263, + "loss": 2.7867, + "step": 8747 + }, + { + "epoch": 0.7925528300604743, + "grad_norm": 0.8031361699104309, + "learning_rate": 0.0001471817797378119, + "loss": 2.7777, + "step": 8748 + }, + { + "epoch": 0.7926434282349211, + "grad_norm": 0.8057293891906738, + "learning_rate": 0.00014717573853682112, + "loss": 2.5801, + "step": 8749 + }, + { + "epoch": 0.7927340264093679, + "grad_norm": 0.8586289286613464, + "learning_rate": 0.00014716969733583038, + "loss": 2.8966, + "step": 8750 + }, + { + "epoch": 0.7928246245838146, + "grad_norm": 0.7354238033294678, + "learning_rate": 0.00014716365613483962, + "loss": 2.7667, + "step": 8751 + }, + { + "epoch": 0.7929152227582614, + "grad_norm": 0.7661678194999695, + "learning_rate": 0.00014715761493384885, + "loss": 2.9625, + "step": 8752 + }, + { + "epoch": 0.7930058209327082, + "grad_norm": 0.811477541923523, + "learning_rate": 0.0001471515737328581, + "loss": 2.6798, + "step": 8753 + }, + { + "epoch": 0.793096419107155, + "grad_norm": 0.8529846668243408, + "learning_rate": 0.00014714553253186735, + "loss": 2.7371, + "step": 8754 + }, + { + "epoch": 0.7931870172816018, + "grad_norm": 0.7946705222129822, + "learning_rate": 0.00014713949133087658, + "loss": 2.9322, + "step": 8755 + }, + { + "epoch": 0.7932776154560486, + "grad_norm": 0.8192231059074402, + "learning_rate": 0.00014713345012988581, + "loss": 2.8234, + "step": 8756 + }, + { + "epoch": 0.7933682136304954, + "grad_norm": 0.8284642100334167, + "learning_rate": 0.00014712740892889508, + "loss": 2.8715, + "step": 8757 + }, + { + "epoch": 0.7934588118049422, + "grad_norm": 0.724774181842804, + "learning_rate": 0.00014712136772790434, + "loss": 2.5791, + "step": 8758 + }, + { + "epoch": 0.7935494099793889, + "grad_norm": 0.7889655232429504, + "learning_rate": 0.00014711532652691354, + "loss": 2.7145, + "step": 8759 + }, + { + "epoch": 0.7936400081538357, + "grad_norm": 0.8199071884155273, + "learning_rate": 0.0001471092853259228, + "loss": 2.9196, + "step": 8760 + }, + { + "epoch": 0.7937306063282825, + "grad_norm": 0.778055727481842, + "learning_rate": 0.00014710324412493204, + "loss": 2.7998, + "step": 8761 + }, + { + "epoch": 0.7938212045027293, + "grad_norm": 0.8268120884895325, + "learning_rate": 0.0001470972029239413, + "loss": 2.9693, + "step": 8762 + }, + { + "epoch": 0.7939118026771761, + "grad_norm": 0.8105575442314148, + "learning_rate": 0.00014709116172295053, + "loss": 3.0972, + "step": 8763 + }, + { + "epoch": 0.7940024008516229, + "grad_norm": 0.7896679639816284, + "learning_rate": 0.00014708512052195977, + "loss": 2.9082, + "step": 8764 + }, + { + "epoch": 0.7940929990260697, + "grad_norm": 0.7095540761947632, + "learning_rate": 0.00014707907932096903, + "loss": 2.2541, + "step": 8765 + }, + { + "epoch": 0.7941835972005165, + "grad_norm": 0.7808650135993958, + "learning_rate": 0.00014707303811997826, + "loss": 2.8977, + "step": 8766 + }, + { + "epoch": 0.7942741953749632, + "grad_norm": 0.7426479458808899, + "learning_rate": 0.0001470669969189875, + "loss": 2.4178, + "step": 8767 + }, + { + "epoch": 0.79436479354941, + "grad_norm": 0.9171054363250732, + "learning_rate": 0.00014706095571799673, + "loss": 2.8102, + "step": 8768 + }, + { + "epoch": 0.7944553917238568, + "grad_norm": 0.7602759003639221, + "learning_rate": 0.000147054914517006, + "loss": 2.6035, + "step": 8769 + }, + { + "epoch": 0.7945459898983035, + "grad_norm": 0.7500836253166199, + "learning_rate": 0.00014704887331601523, + "loss": 3.0488, + "step": 8770 + }, + { + "epoch": 0.7946365880727503, + "grad_norm": 0.7632079720497131, + "learning_rate": 0.0001470428321150245, + "loss": 2.6121, + "step": 8771 + }, + { + "epoch": 0.7947271862471971, + "grad_norm": 0.7462980151176453, + "learning_rate": 0.0001470367909140337, + "loss": 2.6663, + "step": 8772 + }, + { + "epoch": 0.7948177844216439, + "grad_norm": 0.8058220148086548, + "learning_rate": 0.00014703074971304296, + "loss": 2.8987, + "step": 8773 + }, + { + "epoch": 0.7949083825960906, + "grad_norm": 0.7823822498321533, + "learning_rate": 0.00014702470851205222, + "loss": 2.7127, + "step": 8774 + }, + { + "epoch": 0.7949989807705374, + "grad_norm": 0.8208547830581665, + "learning_rate": 0.00014701866731106145, + "loss": 2.8451, + "step": 8775 + }, + { + "epoch": 0.7950895789449842, + "grad_norm": 0.784423291683197, + "learning_rate": 0.00014701262611007068, + "loss": 2.9801, + "step": 8776 + }, + { + "epoch": 0.795180177119431, + "grad_norm": 0.7583513855934143, + "learning_rate": 0.00014700658490907992, + "loss": 2.7417, + "step": 8777 + }, + { + "epoch": 0.7952707752938778, + "grad_norm": 0.8103356957435608, + "learning_rate": 0.00014700054370808918, + "loss": 2.7011, + "step": 8778 + }, + { + "epoch": 0.7953613734683246, + "grad_norm": 0.8044670820236206, + "learning_rate": 0.00014699450250709841, + "loss": 2.9164, + "step": 8779 + }, + { + "epoch": 0.7954519716427714, + "grad_norm": 0.8363944888114929, + "learning_rate": 0.00014698846130610765, + "loss": 3.0563, + "step": 8780 + }, + { + "epoch": 0.7955425698172182, + "grad_norm": 0.7935853600502014, + "learning_rate": 0.0001469824201051169, + "loss": 2.8215, + "step": 8781 + }, + { + "epoch": 0.795633167991665, + "grad_norm": 0.7246319651603699, + "learning_rate": 0.00014697637890412614, + "loss": 2.9715, + "step": 8782 + }, + { + "epoch": 0.7957237661661117, + "grad_norm": 0.8206211924552917, + "learning_rate": 0.0001469703377031354, + "loss": 3.0227, + "step": 8783 + }, + { + "epoch": 0.7958143643405585, + "grad_norm": 0.779560923576355, + "learning_rate": 0.00014696429650214464, + "loss": 2.1353, + "step": 8784 + }, + { + "epoch": 0.7959049625150053, + "grad_norm": 0.7760299444198608, + "learning_rate": 0.00014695825530115387, + "loss": 2.7808, + "step": 8785 + }, + { + "epoch": 0.7959955606894521, + "grad_norm": 0.762109637260437, + "learning_rate": 0.0001469522141001631, + "loss": 2.665, + "step": 8786 + }, + { + "epoch": 0.7960861588638989, + "grad_norm": 0.8537574410438538, + "learning_rate": 0.00014694617289917237, + "loss": 2.7813, + "step": 8787 + }, + { + "epoch": 0.7961767570383457, + "grad_norm": 0.7758901715278625, + "learning_rate": 0.00014694013169818163, + "loss": 2.6203, + "step": 8788 + }, + { + "epoch": 0.7962673552127925, + "grad_norm": 0.825027585029602, + "learning_rate": 0.00014693409049719084, + "loss": 2.6853, + "step": 8789 + }, + { + "epoch": 0.7963579533872392, + "grad_norm": 0.7706109881401062, + "learning_rate": 0.0001469280492962001, + "loss": 2.878, + "step": 8790 + }, + { + "epoch": 0.796448551561686, + "grad_norm": 0.7162430882453918, + "learning_rate": 0.00014692200809520933, + "loss": 2.1474, + "step": 8791 + }, + { + "epoch": 0.7965391497361328, + "grad_norm": 0.8029369711875916, + "learning_rate": 0.0001469159668942186, + "loss": 2.839, + "step": 8792 + }, + { + "epoch": 0.7966297479105796, + "grad_norm": 0.8326616883277893, + "learning_rate": 0.0001469099256932278, + "loss": 2.7407, + "step": 8793 + }, + { + "epoch": 0.7967203460850264, + "grad_norm": 0.8496942520141602, + "learning_rate": 0.00014690388449223706, + "loss": 2.7418, + "step": 8794 + }, + { + "epoch": 0.7968109442594732, + "grad_norm": 0.8399388194084167, + "learning_rate": 0.00014689784329124632, + "loss": 2.8079, + "step": 8795 + }, + { + "epoch": 0.79690154243392, + "grad_norm": 0.7639923691749573, + "learning_rate": 0.00014689180209025556, + "loss": 2.9201, + "step": 8796 + }, + { + "epoch": 0.7969921406083668, + "grad_norm": 0.7412963509559631, + "learning_rate": 0.0001468857608892648, + "loss": 2.612, + "step": 8797 + }, + { + "epoch": 0.7970827387828135, + "grad_norm": 0.7867389917373657, + "learning_rate": 0.00014687971968827402, + "loss": 2.6436, + "step": 8798 + }, + { + "epoch": 0.7971733369572603, + "grad_norm": 1.0749603509902954, + "learning_rate": 0.00014687367848728329, + "loss": 2.9692, + "step": 8799 + }, + { + "epoch": 0.7972639351317071, + "grad_norm": 0.7511017918586731, + "learning_rate": 0.00014686763728629252, + "loss": 2.2757, + "step": 8800 + }, + { + "epoch": 0.7973545333061539, + "grad_norm": 0.7925686240196228, + "learning_rate": 0.00014686159608530178, + "loss": 3.0504, + "step": 8801 + }, + { + "epoch": 0.7974451314806007, + "grad_norm": 0.7061945796012878, + "learning_rate": 0.000146855554884311, + "loss": 2.3001, + "step": 8802 + }, + { + "epoch": 0.7975357296550475, + "grad_norm": 0.8796581625938416, + "learning_rate": 0.00014684951368332025, + "loss": 2.9819, + "step": 8803 + }, + { + "epoch": 0.7976263278294943, + "grad_norm": 0.762453019618988, + "learning_rate": 0.0001468434724823295, + "loss": 2.6996, + "step": 8804 + }, + { + "epoch": 0.7977169260039411, + "grad_norm": 0.8241298794746399, + "learning_rate": 0.00014683743128133874, + "loss": 2.8504, + "step": 8805 + }, + { + "epoch": 0.7978075241783879, + "grad_norm": 0.7380418181419373, + "learning_rate": 0.00014683139008034798, + "loss": 2.8391, + "step": 8806 + }, + { + "epoch": 0.7978981223528346, + "grad_norm": 0.7635532021522522, + "learning_rate": 0.0001468253488793572, + "loss": 2.7973, + "step": 8807 + }, + { + "epoch": 0.7979887205272814, + "grad_norm": 0.9074283838272095, + "learning_rate": 0.00014681930767836647, + "loss": 2.7666, + "step": 8808 + }, + { + "epoch": 0.7980793187017282, + "grad_norm": 0.9554743766784668, + "learning_rate": 0.0001468132664773757, + "loss": 2.6723, + "step": 8809 + }, + { + "epoch": 0.7981699168761749, + "grad_norm": 0.9050778746604919, + "learning_rate": 0.00014680722527638494, + "loss": 2.6505, + "step": 8810 + }, + { + "epoch": 0.7982605150506217, + "grad_norm": 0.779712975025177, + "learning_rate": 0.0001468011840753942, + "loss": 2.5701, + "step": 8811 + }, + { + "epoch": 0.7983511132250685, + "grad_norm": 0.8393440842628479, + "learning_rate": 0.00014679514287440344, + "loss": 2.9061, + "step": 8812 + }, + { + "epoch": 0.7984417113995153, + "grad_norm": 0.7488890886306763, + "learning_rate": 0.0001467891016734127, + "loss": 2.7088, + "step": 8813 + }, + { + "epoch": 0.798532309573962, + "grad_norm": 0.8020590543746948, + "learning_rate": 0.00014678306047242193, + "loss": 2.7355, + "step": 8814 + }, + { + "epoch": 0.7986229077484088, + "grad_norm": 0.6748383641242981, + "learning_rate": 0.00014677701927143117, + "loss": 2.0081, + "step": 8815 + }, + { + "epoch": 0.7987135059228556, + "grad_norm": 0.7635909914970398, + "learning_rate": 0.0001467709780704404, + "loss": 2.8102, + "step": 8816 + }, + { + "epoch": 0.7988041040973024, + "grad_norm": 0.7627056241035461, + "learning_rate": 0.00014676493686944966, + "loss": 2.9055, + "step": 8817 + }, + { + "epoch": 0.7988947022717492, + "grad_norm": 0.7850745916366577, + "learning_rate": 0.0001467588956684589, + "loss": 2.666, + "step": 8818 + }, + { + "epoch": 0.798985300446196, + "grad_norm": 0.8513732552528381, + "learning_rate": 0.00014675285446746813, + "loss": 2.9317, + "step": 8819 + }, + { + "epoch": 0.7990758986206428, + "grad_norm": 0.7699277400970459, + "learning_rate": 0.0001467468132664774, + "loss": 2.78, + "step": 8820 + }, + { + "epoch": 0.7991664967950896, + "grad_norm": 0.7516068816184998, + "learning_rate": 0.00014674077206548662, + "loss": 2.6593, + "step": 8821 + }, + { + "epoch": 0.7992570949695363, + "grad_norm": 0.8197491765022278, + "learning_rate": 0.00014673473086449589, + "loss": 2.6669, + "step": 8822 + }, + { + "epoch": 0.7993476931439831, + "grad_norm": 0.6334033012390137, + "learning_rate": 0.0001467286896635051, + "loss": 1.9414, + "step": 8823 + }, + { + "epoch": 0.7994382913184299, + "grad_norm": 0.768040120601654, + "learning_rate": 0.00014672264846251435, + "loss": 2.8574, + "step": 8824 + }, + { + "epoch": 0.7995288894928767, + "grad_norm": 0.8705130219459534, + "learning_rate": 0.00014671660726152361, + "loss": 2.9567, + "step": 8825 + }, + { + "epoch": 0.7996194876673235, + "grad_norm": 0.9667791724205017, + "learning_rate": 0.00014671056606053285, + "loss": 2.8078, + "step": 8826 + }, + { + "epoch": 0.7997100858417703, + "grad_norm": 0.7694150805473328, + "learning_rate": 0.00014670452485954208, + "loss": 2.961, + "step": 8827 + }, + { + "epoch": 0.7998006840162171, + "grad_norm": 0.8410674929618835, + "learning_rate": 0.00014669848365855132, + "loss": 2.9079, + "step": 8828 + }, + { + "epoch": 0.7998912821906639, + "grad_norm": 0.8011990189552307, + "learning_rate": 0.00014669244245756058, + "loss": 2.7814, + "step": 8829 + }, + { + "epoch": 0.7999818803651106, + "grad_norm": 0.8942069411277771, + "learning_rate": 0.0001466864012565698, + "loss": 3.0363, + "step": 8830 + }, + { + "epoch": 0.8000724785395574, + "grad_norm": 0.8707364797592163, + "learning_rate": 0.00014668036005557905, + "loss": 2.7298, + "step": 8831 + }, + { + "epoch": 0.8001630767140042, + "grad_norm": 0.8107656240463257, + "learning_rate": 0.00014667431885458828, + "loss": 2.7903, + "step": 8832 + }, + { + "epoch": 0.800253674888451, + "grad_norm": 0.9128262400627136, + "learning_rate": 0.00014666827765359754, + "loss": 3.042, + "step": 8833 + }, + { + "epoch": 0.8003442730628978, + "grad_norm": 0.7925093770027161, + "learning_rate": 0.0001466622364526068, + "loss": 2.7526, + "step": 8834 + }, + { + "epoch": 0.8004348712373446, + "grad_norm": 0.7914665937423706, + "learning_rate": 0.00014665619525161604, + "loss": 2.7959, + "step": 8835 + }, + { + "epoch": 0.8005254694117914, + "grad_norm": 0.7647543549537659, + "learning_rate": 0.00014665015405062527, + "loss": 2.5756, + "step": 8836 + }, + { + "epoch": 0.8006160675862382, + "grad_norm": 0.7698851227760315, + "learning_rate": 0.0001466441128496345, + "loss": 2.7029, + "step": 8837 + }, + { + "epoch": 0.8007066657606849, + "grad_norm": 0.8164306879043579, + "learning_rate": 0.00014663807164864377, + "loss": 2.8084, + "step": 8838 + }, + { + "epoch": 0.8007972639351317, + "grad_norm": 0.8192101716995239, + "learning_rate": 0.000146632030447653, + "loss": 2.9961, + "step": 8839 + }, + { + "epoch": 0.8008878621095785, + "grad_norm": 0.8005090355873108, + "learning_rate": 0.00014662598924666223, + "loss": 2.857, + "step": 8840 + }, + { + "epoch": 0.8009784602840253, + "grad_norm": 0.6871324777603149, + "learning_rate": 0.0001466199480456715, + "loss": 2.0252, + "step": 8841 + }, + { + "epoch": 0.8010690584584721, + "grad_norm": 0.7831925749778748, + "learning_rate": 0.00014661390684468073, + "loss": 2.5671, + "step": 8842 + }, + { + "epoch": 0.8011596566329189, + "grad_norm": 0.7941068410873413, + "learning_rate": 0.00014660786564369, + "loss": 2.6482, + "step": 8843 + }, + { + "epoch": 0.8012502548073657, + "grad_norm": 0.7812589406967163, + "learning_rate": 0.0001466018244426992, + "loss": 2.597, + "step": 8844 + }, + { + "epoch": 0.8013408529818125, + "grad_norm": 0.753448486328125, + "learning_rate": 0.00014659578324170846, + "loss": 2.6813, + "step": 8845 + }, + { + "epoch": 0.8014314511562592, + "grad_norm": 0.7835782766342163, + "learning_rate": 0.0001465897420407177, + "loss": 2.8329, + "step": 8846 + }, + { + "epoch": 0.801522049330706, + "grad_norm": 0.8340457081794739, + "learning_rate": 0.00014658370083972695, + "loss": 2.7749, + "step": 8847 + }, + { + "epoch": 0.8016126475051528, + "grad_norm": 0.8552448749542236, + "learning_rate": 0.0001465776596387362, + "loss": 2.862, + "step": 8848 + }, + { + "epoch": 0.8017032456795996, + "grad_norm": 0.8051230311393738, + "learning_rate": 0.00014657161843774542, + "loss": 2.2617, + "step": 8849 + }, + { + "epoch": 0.8017938438540464, + "grad_norm": 0.7829905152320862, + "learning_rate": 0.00014656557723675468, + "loss": 2.8207, + "step": 8850 + }, + { + "epoch": 0.8018844420284931, + "grad_norm": 0.7732526659965515, + "learning_rate": 0.00014655953603576392, + "loss": 2.7291, + "step": 8851 + }, + { + "epoch": 0.8019750402029399, + "grad_norm": 0.7605319619178772, + "learning_rate": 0.00014655349483477315, + "loss": 2.7506, + "step": 8852 + }, + { + "epoch": 0.8020656383773866, + "grad_norm": 0.7821561098098755, + "learning_rate": 0.00014654745363378238, + "loss": 2.9885, + "step": 8853 + }, + { + "epoch": 0.8021562365518334, + "grad_norm": 0.8229318261146545, + "learning_rate": 0.00014654141243279165, + "loss": 2.9368, + "step": 8854 + }, + { + "epoch": 0.8022468347262802, + "grad_norm": 0.7562168836593628, + "learning_rate": 0.0001465353712318009, + "loss": 2.0279, + "step": 8855 + }, + { + "epoch": 0.802337432900727, + "grad_norm": 0.819479763507843, + "learning_rate": 0.00014652933003081014, + "loss": 2.8227, + "step": 8856 + }, + { + "epoch": 0.8024280310751738, + "grad_norm": 0.8077454566955566, + "learning_rate": 0.00014652328882981938, + "loss": 2.4336, + "step": 8857 + }, + { + "epoch": 0.8025186292496206, + "grad_norm": 0.8165393471717834, + "learning_rate": 0.0001465172476288286, + "loss": 2.7277, + "step": 8858 + }, + { + "epoch": 0.8026092274240674, + "grad_norm": 0.6907609105110168, + "learning_rate": 0.00014651120642783787, + "loss": 1.9168, + "step": 8859 + }, + { + "epoch": 0.8026998255985142, + "grad_norm": 0.7356154322624207, + "learning_rate": 0.0001465051652268471, + "loss": 2.7292, + "step": 8860 + }, + { + "epoch": 0.802790423772961, + "grad_norm": 0.6959073543548584, + "learning_rate": 0.00014649912402585634, + "loss": 2.1498, + "step": 8861 + }, + { + "epoch": 0.8028810219474077, + "grad_norm": 0.7221754193305969, + "learning_rate": 0.00014649308282486557, + "loss": 2.3083, + "step": 8862 + }, + { + "epoch": 0.8029716201218545, + "grad_norm": 0.838969886302948, + "learning_rate": 0.00014648704162387483, + "loss": 2.6218, + "step": 8863 + }, + { + "epoch": 0.8030622182963013, + "grad_norm": 0.8362877368927002, + "learning_rate": 0.0001464810004228841, + "loss": 2.6548, + "step": 8864 + }, + { + "epoch": 0.8031528164707481, + "grad_norm": 0.749589741230011, + "learning_rate": 0.00014647495922189333, + "loss": 2.6056, + "step": 8865 + }, + { + "epoch": 0.8032434146451949, + "grad_norm": 0.8238704800605774, + "learning_rate": 0.00014646891802090256, + "loss": 2.9633, + "step": 8866 + }, + { + "epoch": 0.8033340128196417, + "grad_norm": 0.8225491642951965, + "learning_rate": 0.0001464628768199118, + "loss": 2.9049, + "step": 8867 + }, + { + "epoch": 0.8034246109940885, + "grad_norm": 0.7294368743896484, + "learning_rate": 0.00014645683561892106, + "loss": 2.8622, + "step": 8868 + }, + { + "epoch": 0.8035152091685352, + "grad_norm": 0.8024671077728271, + "learning_rate": 0.0001464507944179303, + "loss": 2.8857, + "step": 8869 + }, + { + "epoch": 0.803605807342982, + "grad_norm": 0.6796316504478455, + "learning_rate": 0.00014644475321693953, + "loss": 2.2569, + "step": 8870 + }, + { + "epoch": 0.8036964055174288, + "grad_norm": 0.687089741230011, + "learning_rate": 0.0001464387120159488, + "loss": 1.9892, + "step": 8871 + }, + { + "epoch": 0.8037870036918756, + "grad_norm": 0.8483822345733643, + "learning_rate": 0.00014643267081495802, + "loss": 3.0145, + "step": 8872 + }, + { + "epoch": 0.8038776018663224, + "grad_norm": 0.8147115111351013, + "learning_rate": 0.00014642662961396728, + "loss": 2.8664, + "step": 8873 + }, + { + "epoch": 0.8039682000407692, + "grad_norm": 0.8362317085266113, + "learning_rate": 0.0001464205884129765, + "loss": 2.7503, + "step": 8874 + }, + { + "epoch": 0.804058798215216, + "grad_norm": 0.7332741618156433, + "learning_rate": 0.00014641454721198575, + "loss": 2.6962, + "step": 8875 + }, + { + "epoch": 0.8041493963896628, + "grad_norm": 0.7826964855194092, + "learning_rate": 0.00014640850601099498, + "loss": 2.7732, + "step": 8876 + }, + { + "epoch": 0.8042399945641096, + "grad_norm": 0.7999266982078552, + "learning_rate": 0.00014640246481000425, + "loss": 2.8947, + "step": 8877 + }, + { + "epoch": 0.8043305927385563, + "grad_norm": 0.7687211036682129, + "learning_rate": 0.00014639642360901348, + "loss": 2.5055, + "step": 8878 + }, + { + "epoch": 0.8044211909130031, + "grad_norm": 0.7863785624504089, + "learning_rate": 0.00014639038240802271, + "loss": 2.7051, + "step": 8879 + }, + { + "epoch": 0.8045117890874499, + "grad_norm": 0.7180294990539551, + "learning_rate": 0.00014638434120703198, + "loss": 2.586, + "step": 8880 + }, + { + "epoch": 0.8046023872618967, + "grad_norm": 0.8370947241783142, + "learning_rate": 0.0001463783000060412, + "loss": 2.9035, + "step": 8881 + }, + { + "epoch": 0.8046929854363435, + "grad_norm": 0.7724171876907349, + "learning_rate": 0.00014637225880505044, + "loss": 2.5827, + "step": 8882 + }, + { + "epoch": 0.8047835836107903, + "grad_norm": 0.7992942333221436, + "learning_rate": 0.00014636621760405968, + "loss": 2.8202, + "step": 8883 + }, + { + "epoch": 0.8048741817852371, + "grad_norm": 0.782435953617096, + "learning_rate": 0.00014636017640306894, + "loss": 2.2981, + "step": 8884 + }, + { + "epoch": 0.8049647799596839, + "grad_norm": 0.9030271172523499, + "learning_rate": 0.0001463541352020782, + "loss": 2.6295, + "step": 8885 + }, + { + "epoch": 0.8050553781341306, + "grad_norm": 0.8342363834381104, + "learning_rate": 0.00014634809400108743, + "loss": 2.8419, + "step": 8886 + }, + { + "epoch": 0.8051459763085774, + "grad_norm": 0.8371924757957458, + "learning_rate": 0.00014634205280009667, + "loss": 2.8532, + "step": 8887 + }, + { + "epoch": 0.8052365744830242, + "grad_norm": 0.8643871545791626, + "learning_rate": 0.0001463360115991059, + "loss": 2.689, + "step": 8888 + }, + { + "epoch": 0.805327172657471, + "grad_norm": 0.853111743927002, + "learning_rate": 0.00014632997039811516, + "loss": 2.7161, + "step": 8889 + }, + { + "epoch": 0.8054177708319178, + "grad_norm": 0.8365142345428467, + "learning_rate": 0.0001463239291971244, + "loss": 3.0212, + "step": 8890 + }, + { + "epoch": 0.8055083690063645, + "grad_norm": 0.7981661558151245, + "learning_rate": 0.00014631788799613363, + "loss": 2.65, + "step": 8891 + }, + { + "epoch": 0.8055989671808113, + "grad_norm": 0.8286249041557312, + "learning_rate": 0.00014631184679514287, + "loss": 2.8325, + "step": 8892 + }, + { + "epoch": 0.805689565355258, + "grad_norm": 0.834746778011322, + "learning_rate": 0.00014630580559415213, + "loss": 3.0438, + "step": 8893 + }, + { + "epoch": 0.8057801635297048, + "grad_norm": 0.8038969039916992, + "learning_rate": 0.0001462997643931614, + "loss": 2.792, + "step": 8894 + }, + { + "epoch": 0.8058707617041516, + "grad_norm": 0.8008673787117004, + "learning_rate": 0.0001462937231921706, + "loss": 2.7307, + "step": 8895 + }, + { + "epoch": 0.8059613598785984, + "grad_norm": 0.8001633882522583, + "learning_rate": 0.00014628768199117986, + "loss": 2.9831, + "step": 8896 + }, + { + "epoch": 0.8060519580530452, + "grad_norm": 0.8279359936714172, + "learning_rate": 0.0001462816407901891, + "loss": 2.9031, + "step": 8897 + }, + { + "epoch": 0.806142556227492, + "grad_norm": 0.8419334888458252, + "learning_rate": 0.00014627559958919835, + "loss": 2.8098, + "step": 8898 + }, + { + "epoch": 0.8062331544019388, + "grad_norm": 0.8280171751976013, + "learning_rate": 0.00014626955838820758, + "loss": 2.7346, + "step": 8899 + }, + { + "epoch": 0.8063237525763856, + "grad_norm": 0.8512020111083984, + "learning_rate": 0.00014626351718721682, + "loss": 2.7729, + "step": 8900 + }, + { + "epoch": 0.8064143507508323, + "grad_norm": 0.8564783930778503, + "learning_rate": 0.00014625747598622608, + "loss": 2.8697, + "step": 8901 + }, + { + "epoch": 0.8065049489252791, + "grad_norm": 0.8632746338844299, + "learning_rate": 0.00014625143478523531, + "loss": 2.9541, + "step": 8902 + }, + { + "epoch": 0.8065955470997259, + "grad_norm": 0.7698052525520325, + "learning_rate": 0.00014624539358424455, + "loss": 2.7013, + "step": 8903 + }, + { + "epoch": 0.8066861452741727, + "grad_norm": 0.7862996459007263, + "learning_rate": 0.00014623935238325378, + "loss": 2.7715, + "step": 8904 + }, + { + "epoch": 0.8067767434486195, + "grad_norm": 0.7846841216087341, + "learning_rate": 0.00014623331118226304, + "loss": 2.8769, + "step": 8905 + }, + { + "epoch": 0.8068673416230663, + "grad_norm": 0.8106223344802856, + "learning_rate": 0.00014622726998127228, + "loss": 2.9213, + "step": 8906 + }, + { + "epoch": 0.8069579397975131, + "grad_norm": 0.8359761834144592, + "learning_rate": 0.00014622122878028154, + "loss": 3.1208, + "step": 8907 + }, + { + "epoch": 0.8070485379719599, + "grad_norm": 0.7604934573173523, + "learning_rate": 0.00014621518757929077, + "loss": 2.7934, + "step": 8908 + }, + { + "epoch": 0.8071391361464066, + "grad_norm": 0.8497269749641418, + "learning_rate": 0.0001462091463783, + "loss": 2.8824, + "step": 8909 + }, + { + "epoch": 0.8072297343208534, + "grad_norm": 0.8509673476219177, + "learning_rate": 0.00014620310517730927, + "loss": 2.987, + "step": 8910 + }, + { + "epoch": 0.8073203324953002, + "grad_norm": 0.7938664555549622, + "learning_rate": 0.0001461970639763185, + "loss": 2.7835, + "step": 8911 + }, + { + "epoch": 0.807410930669747, + "grad_norm": 0.7918897271156311, + "learning_rate": 0.00014619102277532774, + "loss": 2.9527, + "step": 8912 + }, + { + "epoch": 0.8075015288441938, + "grad_norm": 0.782969057559967, + "learning_rate": 0.00014618498157433697, + "loss": 2.8858, + "step": 8913 + }, + { + "epoch": 0.8075921270186406, + "grad_norm": 0.8589594960212708, + "learning_rate": 0.00014617894037334623, + "loss": 2.8584, + "step": 8914 + }, + { + "epoch": 0.8076827251930874, + "grad_norm": 0.6968678832054138, + "learning_rate": 0.0001461728991723555, + "loss": 2.3903, + "step": 8915 + }, + { + "epoch": 0.8077733233675342, + "grad_norm": 0.8403642773628235, + "learning_rate": 0.0001461668579713647, + "loss": 2.9924, + "step": 8916 + }, + { + "epoch": 0.807863921541981, + "grad_norm": 0.7774734497070312, + "learning_rate": 0.00014616081677037396, + "loss": 2.6093, + "step": 8917 + }, + { + "epoch": 0.8079545197164277, + "grad_norm": 0.7578040957450867, + "learning_rate": 0.0001461547755693832, + "loss": 2.8663, + "step": 8918 + }, + { + "epoch": 0.8080451178908745, + "grad_norm": 0.6881811618804932, + "learning_rate": 0.00014614873436839246, + "loss": 1.9675, + "step": 8919 + }, + { + "epoch": 0.8081357160653213, + "grad_norm": 0.798086941242218, + "learning_rate": 0.0001461426931674017, + "loss": 2.6496, + "step": 8920 + }, + { + "epoch": 0.8082263142397681, + "grad_norm": 0.8562291860580444, + "learning_rate": 0.00014613665196641092, + "loss": 2.9976, + "step": 8921 + }, + { + "epoch": 0.8083169124142149, + "grad_norm": 0.8903298377990723, + "learning_rate": 0.00014613061076542016, + "loss": 2.6505, + "step": 8922 + }, + { + "epoch": 0.8084075105886617, + "grad_norm": 0.8501774668693542, + "learning_rate": 0.00014612456956442942, + "loss": 2.9362, + "step": 8923 + }, + { + "epoch": 0.8084981087631085, + "grad_norm": 0.8414463996887207, + "learning_rate": 0.00014611852836343868, + "loss": 2.6345, + "step": 8924 + }, + { + "epoch": 0.8085887069375552, + "grad_norm": 0.7513495683670044, + "learning_rate": 0.0001461124871624479, + "loss": 2.519, + "step": 8925 + }, + { + "epoch": 0.808679305112002, + "grad_norm": 0.8378169536590576, + "learning_rate": 0.00014610644596145715, + "loss": 2.6556, + "step": 8926 + }, + { + "epoch": 0.8087699032864488, + "grad_norm": 0.8628169298171997, + "learning_rate": 0.00014610040476046638, + "loss": 3.0045, + "step": 8927 + }, + { + "epoch": 0.8088605014608956, + "grad_norm": 0.7995690703392029, + "learning_rate": 0.00014609436355947564, + "loss": 2.6349, + "step": 8928 + }, + { + "epoch": 0.8089510996353424, + "grad_norm": 0.7817795872688293, + "learning_rate": 0.00014608832235848485, + "loss": 2.5764, + "step": 8929 + }, + { + "epoch": 0.8090416978097892, + "grad_norm": 0.7902078628540039, + "learning_rate": 0.0001460822811574941, + "loss": 2.8171, + "step": 8930 + }, + { + "epoch": 0.809132295984236, + "grad_norm": 0.8299824595451355, + "learning_rate": 0.00014607623995650337, + "loss": 3.2457, + "step": 8931 + }, + { + "epoch": 0.8092228941586826, + "grad_norm": 0.7242081165313721, + "learning_rate": 0.0001460701987555126, + "loss": 2.1914, + "step": 8932 + }, + { + "epoch": 0.8093134923331294, + "grad_norm": 0.8092594146728516, + "learning_rate": 0.00014606415755452184, + "loss": 2.7652, + "step": 8933 + }, + { + "epoch": 0.8094040905075762, + "grad_norm": 0.8389678597450256, + "learning_rate": 0.00014605811635353107, + "loss": 2.8578, + "step": 8934 + }, + { + "epoch": 0.809494688682023, + "grad_norm": 0.7525557279586792, + "learning_rate": 0.00014605207515254034, + "loss": 2.6394, + "step": 8935 + }, + { + "epoch": 0.8095852868564698, + "grad_norm": 0.8376272320747375, + "learning_rate": 0.00014604603395154957, + "loss": 2.6581, + "step": 8936 + }, + { + "epoch": 0.8096758850309166, + "grad_norm": 0.8142749667167664, + "learning_rate": 0.00014603999275055883, + "loss": 2.8697, + "step": 8937 + }, + { + "epoch": 0.8097664832053634, + "grad_norm": 0.8614579439163208, + "learning_rate": 0.00014603395154956807, + "loss": 3.1161, + "step": 8938 + }, + { + "epoch": 0.8098570813798102, + "grad_norm": 0.7865960597991943, + "learning_rate": 0.0001460279103485773, + "loss": 3.0008, + "step": 8939 + }, + { + "epoch": 0.809947679554257, + "grad_norm": 0.787668764591217, + "learning_rate": 0.00014602186914758656, + "loss": 2.7182, + "step": 8940 + }, + { + "epoch": 0.8100382777287037, + "grad_norm": 0.7822709679603577, + "learning_rate": 0.0001460158279465958, + "loss": 2.6169, + "step": 8941 + }, + { + "epoch": 0.8101288759031505, + "grad_norm": 0.7109547257423401, + "learning_rate": 0.00014600978674560503, + "loss": 2.7697, + "step": 8942 + }, + { + "epoch": 0.8102194740775973, + "grad_norm": 0.8561983108520508, + "learning_rate": 0.00014600374554461426, + "loss": 2.9159, + "step": 8943 + }, + { + "epoch": 0.8103100722520441, + "grad_norm": 0.8068404793739319, + "learning_rate": 0.00014599770434362352, + "loss": 2.9106, + "step": 8944 + }, + { + "epoch": 0.8104006704264909, + "grad_norm": 0.8248716592788696, + "learning_rate": 0.00014599166314263278, + "loss": 2.6628, + "step": 8945 + }, + { + "epoch": 0.8104912686009377, + "grad_norm": 0.6933448314666748, + "learning_rate": 0.000145985621941642, + "loss": 2.2661, + "step": 8946 + }, + { + "epoch": 0.8105818667753845, + "grad_norm": 0.8618133664131165, + "learning_rate": 0.00014597958074065125, + "loss": 2.7693, + "step": 8947 + }, + { + "epoch": 0.8106724649498313, + "grad_norm": 0.6582828164100647, + "learning_rate": 0.0001459735395396605, + "loss": 2.0731, + "step": 8948 + }, + { + "epoch": 0.810763063124278, + "grad_norm": 0.759112536907196, + "learning_rate": 0.00014596749833866975, + "loss": 1.9377, + "step": 8949 + }, + { + "epoch": 0.8108536612987248, + "grad_norm": 0.7774397730827332, + "learning_rate": 0.00014596145713767898, + "loss": 2.7358, + "step": 8950 + }, + { + "epoch": 0.8109442594731716, + "grad_norm": 0.8444797992706299, + "learning_rate": 0.00014595541593668822, + "loss": 2.6205, + "step": 8951 + }, + { + "epoch": 0.8110348576476184, + "grad_norm": 0.8057842254638672, + "learning_rate": 0.00014594937473569745, + "loss": 2.6988, + "step": 8952 + }, + { + "epoch": 0.8111254558220652, + "grad_norm": 0.7842429876327515, + "learning_rate": 0.0001459433335347067, + "loss": 2.9541, + "step": 8953 + }, + { + "epoch": 0.811216053996512, + "grad_norm": 0.7432068586349487, + "learning_rate": 0.00014593729233371595, + "loss": 2.6924, + "step": 8954 + }, + { + "epoch": 0.8113066521709588, + "grad_norm": 0.7283487915992737, + "learning_rate": 0.00014593125113272518, + "loss": 2.4864, + "step": 8955 + }, + { + "epoch": 0.8113972503454056, + "grad_norm": 0.7885593175888062, + "learning_rate": 0.00014592520993173444, + "loss": 2.7107, + "step": 8956 + }, + { + "epoch": 0.8114878485198523, + "grad_norm": 0.7473198771476746, + "learning_rate": 0.00014591916873074367, + "loss": 2.822, + "step": 8957 + }, + { + "epoch": 0.8115784466942991, + "grad_norm": 0.8204666972160339, + "learning_rate": 0.00014591312752975294, + "loss": 2.7898, + "step": 8958 + }, + { + "epoch": 0.8116690448687459, + "grad_norm": 0.8657857179641724, + "learning_rate": 0.00014590708632876214, + "loss": 3.0129, + "step": 8959 + }, + { + "epoch": 0.8117596430431927, + "grad_norm": 0.7525634169578552, + "learning_rate": 0.0001459010451277714, + "loss": 2.9357, + "step": 8960 + }, + { + "epoch": 0.8118502412176395, + "grad_norm": 0.7643778920173645, + "learning_rate": 0.00014589500392678067, + "loss": 2.9559, + "step": 8961 + }, + { + "epoch": 0.8119408393920863, + "grad_norm": 0.803992748260498, + "learning_rate": 0.0001458889627257899, + "loss": 2.741, + "step": 8962 + }, + { + "epoch": 0.8120314375665331, + "grad_norm": 0.78992760181427, + "learning_rate": 0.00014588292152479913, + "loss": 2.8635, + "step": 8963 + }, + { + "epoch": 0.8121220357409799, + "grad_norm": 0.8104476928710938, + "learning_rate": 0.00014587688032380837, + "loss": 2.8317, + "step": 8964 + }, + { + "epoch": 0.8122126339154266, + "grad_norm": 0.7811553478240967, + "learning_rate": 0.00014587083912281763, + "loss": 2.7388, + "step": 8965 + }, + { + "epoch": 0.8123032320898734, + "grad_norm": 0.8501136898994446, + "learning_rate": 0.00014586479792182686, + "loss": 2.8006, + "step": 8966 + }, + { + "epoch": 0.8123938302643202, + "grad_norm": 0.835638165473938, + "learning_rate": 0.0001458587567208361, + "loss": 2.6005, + "step": 8967 + }, + { + "epoch": 0.812484428438767, + "grad_norm": 0.8982033729553223, + "learning_rate": 0.00014585271551984536, + "loss": 2.7683, + "step": 8968 + }, + { + "epoch": 0.8125750266132138, + "grad_norm": 0.8843491673469543, + "learning_rate": 0.0001458466743188546, + "loss": 2.9267, + "step": 8969 + }, + { + "epoch": 0.8126656247876606, + "grad_norm": 0.8342661261558533, + "learning_rate": 0.00014584063311786385, + "loss": 2.6628, + "step": 8970 + }, + { + "epoch": 0.8127562229621074, + "grad_norm": 0.7743779420852661, + "learning_rate": 0.0001458345919168731, + "loss": 2.7004, + "step": 8971 + }, + { + "epoch": 0.812846821136554, + "grad_norm": 0.8060936331748962, + "learning_rate": 0.00014582855071588232, + "loss": 2.7964, + "step": 8972 + }, + { + "epoch": 0.8129374193110008, + "grad_norm": 0.7203866243362427, + "learning_rate": 0.00014582250951489156, + "loss": 2.0998, + "step": 8973 + }, + { + "epoch": 0.8130280174854476, + "grad_norm": 0.8507064580917358, + "learning_rate": 0.00014581646831390082, + "loss": 3.0074, + "step": 8974 + }, + { + "epoch": 0.8131186156598944, + "grad_norm": 0.7328072190284729, + "learning_rate": 0.00014581042711291008, + "loss": 2.1887, + "step": 8975 + }, + { + "epoch": 0.8132092138343412, + "grad_norm": 0.8044980764389038, + "learning_rate": 0.00014580438591191928, + "loss": 2.7801, + "step": 8976 + }, + { + "epoch": 0.813299812008788, + "grad_norm": 0.8088290691375732, + "learning_rate": 0.00014579834471092855, + "loss": 2.8193, + "step": 8977 + }, + { + "epoch": 0.8133904101832348, + "grad_norm": 0.7732641696929932, + "learning_rate": 0.00014579230350993778, + "loss": 2.599, + "step": 8978 + }, + { + "epoch": 0.8134810083576816, + "grad_norm": 0.7310886979103088, + "learning_rate": 0.00014578626230894704, + "loss": 2.9459, + "step": 8979 + }, + { + "epoch": 0.8135716065321283, + "grad_norm": 0.8408189415931702, + "learning_rate": 0.00014578022110795625, + "loss": 2.9781, + "step": 8980 + }, + { + "epoch": 0.8136622047065751, + "grad_norm": 0.7717103362083435, + "learning_rate": 0.0001457741799069655, + "loss": 2.8141, + "step": 8981 + }, + { + "epoch": 0.8137528028810219, + "grad_norm": 0.776095986366272, + "learning_rate": 0.00014576813870597474, + "loss": 2.8662, + "step": 8982 + }, + { + "epoch": 0.8138434010554687, + "grad_norm": 0.8165177702903748, + "learning_rate": 0.000145762097504984, + "loss": 2.7048, + "step": 8983 + }, + { + "epoch": 0.8139339992299155, + "grad_norm": 0.821631669998169, + "learning_rate": 0.00014575605630399324, + "loss": 2.7948, + "step": 8984 + }, + { + "epoch": 0.8140245974043623, + "grad_norm": 0.8309692740440369, + "learning_rate": 0.00014575001510300247, + "loss": 2.6302, + "step": 8985 + }, + { + "epoch": 0.8141151955788091, + "grad_norm": 0.7944962382316589, + "learning_rate": 0.00014574397390201173, + "loss": 2.8695, + "step": 8986 + }, + { + "epoch": 0.8142057937532559, + "grad_norm": 0.8352354168891907, + "learning_rate": 0.00014573793270102097, + "loss": 2.7069, + "step": 8987 + }, + { + "epoch": 0.8142963919277026, + "grad_norm": 0.703834056854248, + "learning_rate": 0.00014573189150003023, + "loss": 2.1723, + "step": 8988 + }, + { + "epoch": 0.8143869901021494, + "grad_norm": 0.7989376783370972, + "learning_rate": 0.00014572585029903944, + "loss": 2.6423, + "step": 8989 + }, + { + "epoch": 0.8144775882765962, + "grad_norm": 0.8356828093528748, + "learning_rate": 0.0001457198090980487, + "loss": 2.8732, + "step": 8990 + }, + { + "epoch": 0.814568186451043, + "grad_norm": 0.7778586149215698, + "learning_rate": 0.00014571376789705796, + "loss": 2.5785, + "step": 8991 + }, + { + "epoch": 0.8146587846254898, + "grad_norm": 0.8077204823493958, + "learning_rate": 0.0001457077266960672, + "loss": 2.8081, + "step": 8992 + }, + { + "epoch": 0.8147493827999366, + "grad_norm": 0.8376349210739136, + "learning_rate": 0.00014570168549507643, + "loss": 3.1058, + "step": 8993 + }, + { + "epoch": 0.8148399809743834, + "grad_norm": 0.7820855975151062, + "learning_rate": 0.00014569564429408566, + "loss": 2.7629, + "step": 8994 + }, + { + "epoch": 0.8149305791488302, + "grad_norm": 0.7774994373321533, + "learning_rate": 0.00014568960309309492, + "loss": 2.5633, + "step": 8995 + }, + { + "epoch": 0.815021177323277, + "grad_norm": 0.7835010886192322, + "learning_rate": 0.00014568356189210416, + "loss": 2.8141, + "step": 8996 + }, + { + "epoch": 0.8151117754977237, + "grad_norm": 0.880375862121582, + "learning_rate": 0.0001456775206911134, + "loss": 3.0433, + "step": 8997 + }, + { + "epoch": 0.8152023736721705, + "grad_norm": 0.8082833290100098, + "learning_rate": 0.00014567147949012265, + "loss": 2.5107, + "step": 8998 + }, + { + "epoch": 0.8152929718466173, + "grad_norm": 0.8459770083427429, + "learning_rate": 0.00014566543828913188, + "loss": 2.8015, + "step": 8999 + }, + { + "epoch": 0.8153835700210641, + "grad_norm": 0.8170815110206604, + "learning_rate": 0.00014565939708814115, + "loss": 2.6383, + "step": 9000 + }, + { + "epoch": 0.8154741681955109, + "grad_norm": 0.855270266532898, + "learning_rate": 0.00014565335588715038, + "loss": 2.7428, + "step": 9001 + }, + { + "epoch": 0.8155647663699577, + "grad_norm": 0.8760605454444885, + "learning_rate": 0.00014564731468615961, + "loss": 2.9933, + "step": 9002 + }, + { + "epoch": 0.8156553645444045, + "grad_norm": 0.7774286866188049, + "learning_rate": 0.00014564127348516885, + "loss": 3.1411, + "step": 9003 + }, + { + "epoch": 0.8157459627188512, + "grad_norm": 0.814730703830719, + "learning_rate": 0.0001456352322841781, + "loss": 2.5555, + "step": 9004 + }, + { + "epoch": 0.815836560893298, + "grad_norm": 0.7663499712944031, + "learning_rate": 0.00014562919108318734, + "loss": 2.8172, + "step": 9005 + }, + { + "epoch": 0.8159271590677448, + "grad_norm": 0.8434258699417114, + "learning_rate": 0.00014562314988219658, + "loss": 2.8573, + "step": 9006 + }, + { + "epoch": 0.8160177572421916, + "grad_norm": 0.688930332660675, + "learning_rate": 0.00014561710868120584, + "loss": 2.0349, + "step": 9007 + }, + { + "epoch": 0.8161083554166384, + "grad_norm": 0.8222240805625916, + "learning_rate": 0.00014561106748021507, + "loss": 2.946, + "step": 9008 + }, + { + "epoch": 0.8161989535910852, + "grad_norm": 0.8094297647476196, + "learning_rate": 0.00014560502627922433, + "loss": 3.0169, + "step": 9009 + }, + { + "epoch": 0.816289551765532, + "grad_norm": 0.7774180173873901, + "learning_rate": 0.00014559898507823354, + "loss": 3.0745, + "step": 9010 + }, + { + "epoch": 0.8163801499399788, + "grad_norm": 0.8558351993560791, + "learning_rate": 0.0001455929438772428, + "loss": 2.528, + "step": 9011 + }, + { + "epoch": 0.8164707481144255, + "grad_norm": 0.6716651320457458, + "learning_rate": 0.00014558690267625204, + "loss": 2.0948, + "step": 9012 + }, + { + "epoch": 0.8165613462888722, + "grad_norm": 0.797791063785553, + "learning_rate": 0.0001455808614752613, + "loss": 2.9406, + "step": 9013 + }, + { + "epoch": 0.816651944463319, + "grad_norm": 0.8103511929512024, + "learning_rate": 0.00014557482027427053, + "loss": 2.8255, + "step": 9014 + }, + { + "epoch": 0.8167425426377658, + "grad_norm": 0.827296793460846, + "learning_rate": 0.00014556877907327976, + "loss": 2.7328, + "step": 9015 + }, + { + "epoch": 0.8168331408122126, + "grad_norm": 0.7843112945556641, + "learning_rate": 0.00014556273787228903, + "loss": 2.8429, + "step": 9016 + }, + { + "epoch": 0.8169237389866594, + "grad_norm": 0.7886068224906921, + "learning_rate": 0.00014555669667129826, + "loss": 2.6496, + "step": 9017 + }, + { + "epoch": 0.8170143371611062, + "grad_norm": 0.8368687629699707, + "learning_rate": 0.0001455506554703075, + "loss": 3.1293, + "step": 9018 + }, + { + "epoch": 0.817104935335553, + "grad_norm": 0.7703627347946167, + "learning_rate": 0.00014554461426931673, + "loss": 2.7531, + "step": 9019 + }, + { + "epoch": 0.8171955335099997, + "grad_norm": 0.8075300455093384, + "learning_rate": 0.000145538573068326, + "loss": 2.9895, + "step": 9020 + }, + { + "epoch": 0.8172861316844465, + "grad_norm": 0.8482819199562073, + "learning_rate": 0.00014553253186733525, + "loss": 2.7989, + "step": 9021 + }, + { + "epoch": 0.8173767298588933, + "grad_norm": 0.6607213616371155, + "learning_rate": 0.00014552649066634448, + "loss": 2.2755, + "step": 9022 + }, + { + "epoch": 0.8174673280333401, + "grad_norm": 0.8816236257553101, + "learning_rate": 0.00014552044946535372, + "loss": 2.7277, + "step": 9023 + }, + { + "epoch": 0.8175579262077869, + "grad_norm": 0.7619269490242004, + "learning_rate": 0.00014551440826436295, + "loss": 2.728, + "step": 9024 + }, + { + "epoch": 0.8176485243822337, + "grad_norm": 0.768837034702301, + "learning_rate": 0.00014550836706337221, + "loss": 2.6808, + "step": 9025 + }, + { + "epoch": 0.8177391225566805, + "grad_norm": 0.936543881893158, + "learning_rate": 0.00014550232586238145, + "loss": 2.7449, + "step": 9026 + }, + { + "epoch": 0.8178297207311273, + "grad_norm": 0.793233335018158, + "learning_rate": 0.00014549628466139068, + "loss": 2.8268, + "step": 9027 + }, + { + "epoch": 0.817920318905574, + "grad_norm": 0.7988893985748291, + "learning_rate": 0.00014549024346039994, + "loss": 2.8449, + "step": 9028 + }, + { + "epoch": 0.8180109170800208, + "grad_norm": 0.8822581171989441, + "learning_rate": 0.00014548420225940918, + "loss": 2.7085, + "step": 9029 + }, + { + "epoch": 0.8181015152544676, + "grad_norm": 0.7289165258407593, + "learning_rate": 0.00014547816105841844, + "loss": 2.2493, + "step": 9030 + }, + { + "epoch": 0.8181921134289144, + "grad_norm": 0.82474684715271, + "learning_rate": 0.00014547211985742765, + "loss": 2.6872, + "step": 9031 + }, + { + "epoch": 0.8182827116033612, + "grad_norm": 0.7932078838348389, + "learning_rate": 0.0001454660786564369, + "loss": 2.7923, + "step": 9032 + }, + { + "epoch": 0.818373309777808, + "grad_norm": 0.765186071395874, + "learning_rate": 0.00014546003745544614, + "loss": 2.6405, + "step": 9033 + }, + { + "epoch": 0.8184639079522548, + "grad_norm": 0.7774070501327515, + "learning_rate": 0.0001454539962544554, + "loss": 2.7278, + "step": 9034 + }, + { + "epoch": 0.8185545061267016, + "grad_norm": 0.7531028985977173, + "learning_rate": 0.00014544795505346464, + "loss": 2.452, + "step": 9035 + }, + { + "epoch": 0.8186451043011483, + "grad_norm": 0.7400873899459839, + "learning_rate": 0.00014544191385247387, + "loss": 2.5776, + "step": 9036 + }, + { + "epoch": 0.8187357024755951, + "grad_norm": 0.8103957772254944, + "learning_rate": 0.00014543587265148313, + "loss": 2.6308, + "step": 9037 + }, + { + "epoch": 0.8188263006500419, + "grad_norm": 0.8898227214813232, + "learning_rate": 0.00014542983145049237, + "loss": 3.2689, + "step": 9038 + }, + { + "epoch": 0.8189168988244887, + "grad_norm": 0.812846839427948, + "learning_rate": 0.0001454237902495016, + "loss": 2.8335, + "step": 9039 + }, + { + "epoch": 0.8190074969989355, + "grad_norm": 0.795615553855896, + "learning_rate": 0.00014541774904851083, + "loss": 2.7623, + "step": 9040 + }, + { + "epoch": 0.8190980951733823, + "grad_norm": 0.7254226803779602, + "learning_rate": 0.0001454117078475201, + "loss": 1.9987, + "step": 9041 + }, + { + "epoch": 0.8191886933478291, + "grad_norm": 0.72970050573349, + "learning_rate": 0.00014540566664652933, + "loss": 2.6231, + "step": 9042 + }, + { + "epoch": 0.8192792915222759, + "grad_norm": 0.8098828792572021, + "learning_rate": 0.0001453996254455386, + "loss": 3.0554, + "step": 9043 + }, + { + "epoch": 0.8193698896967226, + "grad_norm": 0.8485209345817566, + "learning_rate": 0.00014539358424454782, + "loss": 2.7563, + "step": 9044 + }, + { + "epoch": 0.8194604878711694, + "grad_norm": 0.8426486253738403, + "learning_rate": 0.00014538754304355706, + "loss": 2.9586, + "step": 9045 + }, + { + "epoch": 0.8195510860456162, + "grad_norm": 0.7777377963066101, + "learning_rate": 0.00014538150184256632, + "loss": 2.1561, + "step": 9046 + }, + { + "epoch": 0.819641684220063, + "grad_norm": 0.8025434017181396, + "learning_rate": 0.00014537546064157555, + "loss": 3.0468, + "step": 9047 + }, + { + "epoch": 0.8197322823945098, + "grad_norm": 0.7825206518173218, + "learning_rate": 0.0001453694194405848, + "loss": 2.7237, + "step": 9048 + }, + { + "epoch": 0.8198228805689566, + "grad_norm": 0.7723462581634521, + "learning_rate": 0.00014536337823959402, + "loss": 2.6335, + "step": 9049 + }, + { + "epoch": 0.8199134787434034, + "grad_norm": 0.8176323771476746, + "learning_rate": 0.00014535733703860328, + "loss": 2.9907, + "step": 9050 + }, + { + "epoch": 0.8200040769178502, + "grad_norm": 0.8044029474258423, + "learning_rate": 0.00014535129583761254, + "loss": 3.1413, + "step": 9051 + }, + { + "epoch": 0.8200946750922969, + "grad_norm": 0.7916985154151917, + "learning_rate": 0.00014534525463662178, + "loss": 2.8099, + "step": 9052 + }, + { + "epoch": 0.8201852732667436, + "grad_norm": 0.8146342635154724, + "learning_rate": 0.000145339213435631, + "loss": 2.9962, + "step": 9053 + }, + { + "epoch": 0.8202758714411904, + "grad_norm": 0.7926376461982727, + "learning_rate": 0.00014533317223464025, + "loss": 2.5945, + "step": 9054 + }, + { + "epoch": 0.8203664696156372, + "grad_norm": 0.8529056310653687, + "learning_rate": 0.0001453271310336495, + "loss": 2.5468, + "step": 9055 + }, + { + "epoch": 0.820457067790084, + "grad_norm": 0.7995223999023438, + "learning_rate": 0.00014532108983265874, + "loss": 2.7929, + "step": 9056 + }, + { + "epoch": 0.8205476659645308, + "grad_norm": 0.7566301822662354, + "learning_rate": 0.00014531504863166797, + "loss": 2.5275, + "step": 9057 + }, + { + "epoch": 0.8206382641389776, + "grad_norm": 0.8363518118858337, + "learning_rate": 0.00014530900743067724, + "loss": 2.896, + "step": 9058 + }, + { + "epoch": 0.8207288623134243, + "grad_norm": 0.8502044677734375, + "learning_rate": 0.00014530296622968647, + "loss": 2.6241, + "step": 9059 + }, + { + "epoch": 0.8208194604878711, + "grad_norm": 0.8090845346450806, + "learning_rate": 0.00014529692502869573, + "loss": 2.839, + "step": 9060 + }, + { + "epoch": 0.8209100586623179, + "grad_norm": 0.6506627798080444, + "learning_rate": 0.00014529088382770494, + "loss": 1.7927, + "step": 9061 + }, + { + "epoch": 0.8210006568367647, + "grad_norm": 0.8635678887367249, + "learning_rate": 0.0001452848426267142, + "loss": 2.955, + "step": 9062 + }, + { + "epoch": 0.8210912550112115, + "grad_norm": 0.8099128603935242, + "learning_rate": 0.00014527880142572343, + "loss": 2.6655, + "step": 9063 + }, + { + "epoch": 0.8211818531856583, + "grad_norm": 0.7887808680534363, + "learning_rate": 0.0001452727602247327, + "loss": 2.6811, + "step": 9064 + }, + { + "epoch": 0.8212724513601051, + "grad_norm": 0.8757814168930054, + "learning_rate": 0.00014526671902374193, + "loss": 2.8108, + "step": 9065 + }, + { + "epoch": 0.8213630495345519, + "grad_norm": 0.9045701622962952, + "learning_rate": 0.00014526067782275116, + "loss": 2.8289, + "step": 9066 + }, + { + "epoch": 0.8214536477089986, + "grad_norm": 0.9090462327003479, + "learning_rate": 0.00014525463662176042, + "loss": 2.6404, + "step": 9067 + }, + { + "epoch": 0.8215442458834454, + "grad_norm": 0.8411532640457153, + "learning_rate": 0.00014524859542076966, + "loss": 2.6817, + "step": 9068 + }, + { + "epoch": 0.8216348440578922, + "grad_norm": 0.8824279308319092, + "learning_rate": 0.0001452425542197789, + "loss": 2.778, + "step": 9069 + }, + { + "epoch": 0.821725442232339, + "grad_norm": 0.7966669797897339, + "learning_rate": 0.00014523651301878813, + "loss": 2.655, + "step": 9070 + }, + { + "epoch": 0.8218160404067858, + "grad_norm": 0.7795517444610596, + "learning_rate": 0.0001452304718177974, + "loss": 2.9883, + "step": 9071 + }, + { + "epoch": 0.8219066385812326, + "grad_norm": 0.8057491183280945, + "learning_rate": 0.00014522443061680662, + "loss": 2.6448, + "step": 9072 + }, + { + "epoch": 0.8219972367556794, + "grad_norm": 0.6904984712600708, + "learning_rate": 0.00014521838941581588, + "loss": 2.1199, + "step": 9073 + }, + { + "epoch": 0.8220878349301262, + "grad_norm": 0.7952621579170227, + "learning_rate": 0.00014521234821482512, + "loss": 2.8876, + "step": 9074 + }, + { + "epoch": 0.822178433104573, + "grad_norm": 0.8083642721176147, + "learning_rate": 0.00014520630701383435, + "loss": 2.6577, + "step": 9075 + }, + { + "epoch": 0.8222690312790197, + "grad_norm": 0.7408735156059265, + "learning_rate": 0.0001452002658128436, + "loss": 2.1875, + "step": 9076 + }, + { + "epoch": 0.8223596294534665, + "grad_norm": 0.854472279548645, + "learning_rate": 0.00014519422461185285, + "loss": 2.8728, + "step": 9077 + }, + { + "epoch": 0.8224502276279133, + "grad_norm": 0.8161584734916687, + "learning_rate": 0.00014518818341086208, + "loss": 2.835, + "step": 9078 + }, + { + "epoch": 0.8225408258023601, + "grad_norm": 0.7921476364135742, + "learning_rate": 0.00014518214220987131, + "loss": 2.8548, + "step": 9079 + }, + { + "epoch": 0.8226314239768069, + "grad_norm": 0.8014017343521118, + "learning_rate": 0.00014517610100888057, + "loss": 2.7247, + "step": 9080 + }, + { + "epoch": 0.8227220221512537, + "grad_norm": 0.7877359986305237, + "learning_rate": 0.00014517005980788984, + "loss": 2.7981, + "step": 9081 + }, + { + "epoch": 0.8228126203257005, + "grad_norm": 0.7903481721878052, + "learning_rate": 0.00014516401860689904, + "loss": 2.8364, + "step": 9082 + }, + { + "epoch": 0.8229032185001472, + "grad_norm": 0.7450007200241089, + "learning_rate": 0.0001451579774059083, + "loss": 2.9352, + "step": 9083 + }, + { + "epoch": 0.822993816674594, + "grad_norm": 0.7846896052360535, + "learning_rate": 0.00014515193620491754, + "loss": 2.8311, + "step": 9084 + }, + { + "epoch": 0.8230844148490408, + "grad_norm": 0.6868115067481995, + "learning_rate": 0.0001451458950039268, + "loss": 1.9859, + "step": 9085 + }, + { + "epoch": 0.8231750130234876, + "grad_norm": 0.7232373952865601, + "learning_rate": 0.00014513985380293603, + "loss": 2.0951, + "step": 9086 + }, + { + "epoch": 0.8232656111979344, + "grad_norm": 0.7346978187561035, + "learning_rate": 0.00014513381260194527, + "loss": 2.681, + "step": 9087 + }, + { + "epoch": 0.8233562093723812, + "grad_norm": 0.7838672399520874, + "learning_rate": 0.00014512777140095453, + "loss": 2.7689, + "step": 9088 + }, + { + "epoch": 0.823446807546828, + "grad_norm": 0.8627725839614868, + "learning_rate": 0.00014512173019996376, + "loss": 2.7277, + "step": 9089 + }, + { + "epoch": 0.8235374057212748, + "grad_norm": 0.7976423501968384, + "learning_rate": 0.000145115688998973, + "loss": 2.4415, + "step": 9090 + }, + { + "epoch": 0.8236280038957215, + "grad_norm": 0.7742733359336853, + "learning_rate": 0.00014510964779798223, + "loss": 2.7559, + "step": 9091 + }, + { + "epoch": 0.8237186020701683, + "grad_norm": 0.7202742099761963, + "learning_rate": 0.0001451036065969915, + "loss": 2.0834, + "step": 9092 + }, + { + "epoch": 0.8238092002446151, + "grad_norm": 0.6479766964912415, + "learning_rate": 0.00014509756539600073, + "loss": 2.3535, + "step": 9093 + }, + { + "epoch": 0.8238997984190618, + "grad_norm": 0.8147363066673279, + "learning_rate": 0.00014509152419501, + "loss": 2.8383, + "step": 9094 + }, + { + "epoch": 0.8239903965935086, + "grad_norm": 0.7876986265182495, + "learning_rate": 0.00014508548299401922, + "loss": 3.117, + "step": 9095 + }, + { + "epoch": 0.8240809947679554, + "grad_norm": 0.7980440855026245, + "learning_rate": 0.00014507944179302846, + "loss": 2.6726, + "step": 9096 + }, + { + "epoch": 0.8241715929424022, + "grad_norm": 0.7908264398574829, + "learning_rate": 0.00014507340059203772, + "loss": 2.8525, + "step": 9097 + }, + { + "epoch": 0.824262191116849, + "grad_norm": 0.8009776473045349, + "learning_rate": 0.00014506735939104695, + "loss": 3.0459, + "step": 9098 + }, + { + "epoch": 0.8243527892912957, + "grad_norm": 0.8685670495033264, + "learning_rate": 0.00014506131819005618, + "loss": 2.6412, + "step": 9099 + }, + { + "epoch": 0.8244433874657425, + "grad_norm": 0.9265697002410889, + "learning_rate": 0.00014505527698906542, + "loss": 3.1241, + "step": 9100 + }, + { + "epoch": 0.8245339856401893, + "grad_norm": 0.7985146641731262, + "learning_rate": 0.00014504923578807468, + "loss": 2.6532, + "step": 9101 + }, + { + "epoch": 0.8246245838146361, + "grad_norm": 0.832085907459259, + "learning_rate": 0.00014504319458708391, + "loss": 2.738, + "step": 9102 + }, + { + "epoch": 0.8247151819890829, + "grad_norm": 0.8584580421447754, + "learning_rate": 0.00014503715338609315, + "loss": 2.851, + "step": 9103 + }, + { + "epoch": 0.8248057801635297, + "grad_norm": 0.7750866413116455, + "learning_rate": 0.0001450311121851024, + "loss": 2.7862, + "step": 9104 + }, + { + "epoch": 0.8248963783379765, + "grad_norm": 0.7816082835197449, + "learning_rate": 0.00014502507098411164, + "loss": 2.5694, + "step": 9105 + }, + { + "epoch": 0.8249869765124233, + "grad_norm": 0.8440439701080322, + "learning_rate": 0.0001450190297831209, + "loss": 3.0644, + "step": 9106 + }, + { + "epoch": 0.82507757468687, + "grad_norm": 1.0090399980545044, + "learning_rate": 0.00014501298858213014, + "loss": 2.6841, + "step": 9107 + }, + { + "epoch": 0.8251681728613168, + "grad_norm": 0.8357836008071899, + "learning_rate": 0.00014500694738113937, + "loss": 2.9651, + "step": 9108 + }, + { + "epoch": 0.8252587710357636, + "grad_norm": 0.7421314716339111, + "learning_rate": 0.0001450009061801486, + "loss": 2.7916, + "step": 9109 + }, + { + "epoch": 0.8253493692102104, + "grad_norm": 0.8323367834091187, + "learning_rate": 0.00014499486497915787, + "loss": 2.6104, + "step": 9110 + }, + { + "epoch": 0.8254399673846572, + "grad_norm": 0.6849014163017273, + "learning_rate": 0.00014498882377816713, + "loss": 2.1447, + "step": 9111 + }, + { + "epoch": 0.825530565559104, + "grad_norm": 0.6890009045600891, + "learning_rate": 0.00014498278257717634, + "loss": 2.0041, + "step": 9112 + }, + { + "epoch": 0.8256211637335508, + "grad_norm": 0.795176088809967, + "learning_rate": 0.0001449767413761856, + "loss": 2.7246, + "step": 9113 + }, + { + "epoch": 0.8257117619079976, + "grad_norm": 0.7511881589889526, + "learning_rate": 0.00014497070017519483, + "loss": 2.7918, + "step": 9114 + }, + { + "epoch": 0.8258023600824443, + "grad_norm": 0.7949076890945435, + "learning_rate": 0.0001449646589742041, + "loss": 2.9548, + "step": 9115 + }, + { + "epoch": 0.8258929582568911, + "grad_norm": 0.8476102352142334, + "learning_rate": 0.0001449586177732133, + "loss": 3.0349, + "step": 9116 + }, + { + "epoch": 0.8259835564313379, + "grad_norm": 0.8345445990562439, + "learning_rate": 0.00014495257657222256, + "loss": 2.8828, + "step": 9117 + }, + { + "epoch": 0.8260741546057847, + "grad_norm": 0.817531406879425, + "learning_rate": 0.00014494653537123182, + "loss": 2.743, + "step": 9118 + }, + { + "epoch": 0.8261647527802315, + "grad_norm": 0.7862212061882019, + "learning_rate": 0.00014494049417024106, + "loss": 2.6914, + "step": 9119 + }, + { + "epoch": 0.8262553509546783, + "grad_norm": 0.781339168548584, + "learning_rate": 0.0001449344529692503, + "loss": 3.0443, + "step": 9120 + }, + { + "epoch": 0.8263459491291251, + "grad_norm": 0.799405574798584, + "learning_rate": 0.00014492841176825952, + "loss": 2.7761, + "step": 9121 + }, + { + "epoch": 0.8264365473035719, + "grad_norm": 0.7942816019058228, + "learning_rate": 0.00014492237056726878, + "loss": 3.1382, + "step": 9122 + }, + { + "epoch": 0.8265271454780186, + "grad_norm": 0.7866047620773315, + "learning_rate": 0.00014491632936627802, + "loss": 2.6582, + "step": 9123 + }, + { + "epoch": 0.8266177436524654, + "grad_norm": 0.7843824028968811, + "learning_rate": 0.00014491028816528728, + "loss": 2.9437, + "step": 9124 + }, + { + "epoch": 0.8267083418269122, + "grad_norm": 0.805297315120697, + "learning_rate": 0.00014490424696429651, + "loss": 2.709, + "step": 9125 + }, + { + "epoch": 0.826798940001359, + "grad_norm": 0.7352823615074158, + "learning_rate": 0.00014489820576330575, + "loss": 2.8218, + "step": 9126 + }, + { + "epoch": 0.8268895381758058, + "grad_norm": 0.8865253925323486, + "learning_rate": 0.000144892164562315, + "loss": 2.8188, + "step": 9127 + }, + { + "epoch": 0.8269801363502526, + "grad_norm": 0.7858036160469055, + "learning_rate": 0.00014488612336132424, + "loss": 2.7128, + "step": 9128 + }, + { + "epoch": 0.8270707345246994, + "grad_norm": 0.6859939098358154, + "learning_rate": 0.00014488008216033348, + "loss": 2.1794, + "step": 9129 + }, + { + "epoch": 0.8271613326991462, + "grad_norm": 0.7968709468841553, + "learning_rate": 0.0001448740409593427, + "loss": 2.7515, + "step": 9130 + }, + { + "epoch": 0.8272519308735929, + "grad_norm": 0.7960574626922607, + "learning_rate": 0.00014486799975835197, + "loss": 2.8305, + "step": 9131 + }, + { + "epoch": 0.8273425290480397, + "grad_norm": 0.7958473563194275, + "learning_rate": 0.0001448619585573612, + "loss": 2.8474, + "step": 9132 + }, + { + "epoch": 0.8274331272224865, + "grad_norm": 0.6790854930877686, + "learning_rate": 0.00014485591735637044, + "loss": 2.1678, + "step": 9133 + }, + { + "epoch": 0.8275237253969332, + "grad_norm": 0.8646283149719238, + "learning_rate": 0.0001448498761553797, + "loss": 2.6763, + "step": 9134 + }, + { + "epoch": 0.82761432357138, + "grad_norm": 0.8082073330879211, + "learning_rate": 0.00014484383495438894, + "loss": 2.8449, + "step": 9135 + }, + { + "epoch": 0.8277049217458268, + "grad_norm": 0.7888658046722412, + "learning_rate": 0.0001448377937533982, + "loss": 2.9924, + "step": 9136 + }, + { + "epoch": 0.8277955199202736, + "grad_norm": 0.843826413154602, + "learning_rate": 0.00014483175255240743, + "loss": 2.9999, + "step": 9137 + }, + { + "epoch": 0.8278861180947203, + "grad_norm": 0.6562382578849792, + "learning_rate": 0.00014482571135141666, + "loss": 2.1915, + "step": 9138 + }, + { + "epoch": 0.8279767162691671, + "grad_norm": 0.7364266514778137, + "learning_rate": 0.0001448196701504259, + "loss": 2.1623, + "step": 9139 + }, + { + "epoch": 0.8280673144436139, + "grad_norm": 0.6730538010597229, + "learning_rate": 0.00014481362894943516, + "loss": 1.9778, + "step": 9140 + }, + { + "epoch": 0.8281579126180607, + "grad_norm": 0.7867077589035034, + "learning_rate": 0.0001448075877484444, + "loss": 2.6695, + "step": 9141 + }, + { + "epoch": 0.8282485107925075, + "grad_norm": 0.8174964785575867, + "learning_rate": 0.00014480154654745363, + "loss": 2.9909, + "step": 9142 + }, + { + "epoch": 0.8283391089669543, + "grad_norm": 0.8106504082679749, + "learning_rate": 0.0001447955053464629, + "loss": 2.6983, + "step": 9143 + }, + { + "epoch": 0.8284297071414011, + "grad_norm": 0.8779634237289429, + "learning_rate": 0.00014478946414547212, + "loss": 2.577, + "step": 9144 + }, + { + "epoch": 0.8285203053158479, + "grad_norm": 0.8355907797813416, + "learning_rate": 0.00014478342294448138, + "loss": 2.675, + "step": 9145 + }, + { + "epoch": 0.8286109034902946, + "grad_norm": 0.950346052646637, + "learning_rate": 0.0001447773817434906, + "loss": 2.9292, + "step": 9146 + }, + { + "epoch": 0.8287015016647414, + "grad_norm": 0.7683727145195007, + "learning_rate": 0.00014477134054249985, + "loss": 2.7889, + "step": 9147 + }, + { + "epoch": 0.8287920998391882, + "grad_norm": 0.7682820558547974, + "learning_rate": 0.00014476529934150911, + "loss": 2.5738, + "step": 9148 + }, + { + "epoch": 0.828882698013635, + "grad_norm": 0.7964195013046265, + "learning_rate": 0.00014475925814051835, + "loss": 2.7611, + "step": 9149 + }, + { + "epoch": 0.8289732961880818, + "grad_norm": 0.8342146873474121, + "learning_rate": 0.00014475321693952758, + "loss": 3.1328, + "step": 9150 + }, + { + "epoch": 0.8290638943625286, + "grad_norm": 0.8193589448928833, + "learning_rate": 0.00014474717573853682, + "loss": 2.6914, + "step": 9151 + }, + { + "epoch": 0.8291544925369754, + "grad_norm": 0.8534883260726929, + "learning_rate": 0.00014474113453754608, + "loss": 3.0578, + "step": 9152 + }, + { + "epoch": 0.8292450907114222, + "grad_norm": 0.8186734914779663, + "learning_rate": 0.0001447350933365553, + "loss": 2.8304, + "step": 9153 + }, + { + "epoch": 0.829335688885869, + "grad_norm": 0.7603661417961121, + "learning_rate": 0.00014472905213556455, + "loss": 2.3495, + "step": 9154 + }, + { + "epoch": 0.8294262870603157, + "grad_norm": 0.823314905166626, + "learning_rate": 0.0001447230109345738, + "loss": 3.0338, + "step": 9155 + }, + { + "epoch": 0.8295168852347625, + "grad_norm": 0.7521035075187683, + "learning_rate": 0.00014471696973358304, + "loss": 2.7565, + "step": 9156 + }, + { + "epoch": 0.8296074834092093, + "grad_norm": 0.8653130531311035, + "learning_rate": 0.0001447109285325923, + "loss": 2.664, + "step": 9157 + }, + { + "epoch": 0.8296980815836561, + "grad_norm": 0.8126941323280334, + "learning_rate": 0.00014470488733160154, + "loss": 2.8094, + "step": 9158 + }, + { + "epoch": 0.8297886797581029, + "grad_norm": 0.7954951524734497, + "learning_rate": 0.00014469884613061077, + "loss": 2.7429, + "step": 9159 + }, + { + "epoch": 0.8298792779325497, + "grad_norm": 0.7927972078323364, + "learning_rate": 0.00014469280492962, + "loss": 2.6873, + "step": 9160 + }, + { + "epoch": 0.8299698761069965, + "grad_norm": 0.8366791009902954, + "learning_rate": 0.00014468676372862926, + "loss": 2.7193, + "step": 9161 + }, + { + "epoch": 0.8300604742814433, + "grad_norm": 0.8249630331993103, + "learning_rate": 0.00014468072252763853, + "loss": 3.0947, + "step": 9162 + }, + { + "epoch": 0.83015107245589, + "grad_norm": 0.7362033128738403, + "learning_rate": 0.00014467468132664773, + "loss": 2.6275, + "step": 9163 + }, + { + "epoch": 0.8302416706303368, + "grad_norm": 0.8191942572593689, + "learning_rate": 0.000144668640125657, + "loss": 3.0147, + "step": 9164 + }, + { + "epoch": 0.8303322688047836, + "grad_norm": 0.825457751750946, + "learning_rate": 0.00014466259892466623, + "loss": 2.911, + "step": 9165 + }, + { + "epoch": 0.8304228669792304, + "grad_norm": 0.8129157423973083, + "learning_rate": 0.0001446565577236755, + "loss": 3.0069, + "step": 9166 + }, + { + "epoch": 0.8305134651536772, + "grad_norm": 0.736576497554779, + "learning_rate": 0.0001446505165226847, + "loss": 2.1482, + "step": 9167 + }, + { + "epoch": 0.830604063328124, + "grad_norm": 0.7243557572364807, + "learning_rate": 0.00014464447532169396, + "loss": 2.7788, + "step": 9168 + }, + { + "epoch": 0.8306946615025708, + "grad_norm": 0.8120021224021912, + "learning_rate": 0.0001446384341207032, + "loss": 2.8251, + "step": 9169 + }, + { + "epoch": 0.8307852596770176, + "grad_norm": 0.7857872247695923, + "learning_rate": 0.00014463239291971245, + "loss": 2.1165, + "step": 9170 + }, + { + "epoch": 0.8308758578514643, + "grad_norm": 0.7984269857406616, + "learning_rate": 0.0001446263517187217, + "loss": 3.1536, + "step": 9171 + }, + { + "epoch": 0.8309664560259111, + "grad_norm": 0.6546210050582886, + "learning_rate": 0.00014462031051773092, + "loss": 2.0234, + "step": 9172 + }, + { + "epoch": 0.8310570542003579, + "grad_norm": 0.8736922740936279, + "learning_rate": 0.00014461426931674018, + "loss": 3.0164, + "step": 9173 + }, + { + "epoch": 0.8311476523748047, + "grad_norm": 0.7022249102592468, + "learning_rate": 0.00014460822811574942, + "loss": 2.0809, + "step": 9174 + }, + { + "epoch": 0.8312382505492514, + "grad_norm": 0.6693162322044373, + "learning_rate": 0.00014460218691475868, + "loss": 2.1598, + "step": 9175 + }, + { + "epoch": 0.8313288487236982, + "grad_norm": 0.7708366513252258, + "learning_rate": 0.00014459614571376788, + "loss": 2.7338, + "step": 9176 + }, + { + "epoch": 0.831419446898145, + "grad_norm": 0.7518146634101868, + "learning_rate": 0.00014459010451277715, + "loss": 2.7405, + "step": 9177 + }, + { + "epoch": 0.8315100450725917, + "grad_norm": 0.7959094047546387, + "learning_rate": 0.0001445840633117864, + "loss": 2.7572, + "step": 9178 + }, + { + "epoch": 0.8316006432470385, + "grad_norm": 0.7882006168365479, + "learning_rate": 0.00014457802211079564, + "loss": 2.8448, + "step": 9179 + }, + { + "epoch": 0.8316912414214853, + "grad_norm": 0.761963427066803, + "learning_rate": 0.00014457198090980487, + "loss": 2.8197, + "step": 9180 + }, + { + "epoch": 0.8317818395959321, + "grad_norm": 0.8519322872161865, + "learning_rate": 0.0001445659397088141, + "loss": 2.8252, + "step": 9181 + }, + { + "epoch": 0.8318724377703789, + "grad_norm": 0.7967319488525391, + "learning_rate": 0.00014455989850782337, + "loss": 2.6034, + "step": 9182 + }, + { + "epoch": 0.8319630359448257, + "grad_norm": 0.8391961455345154, + "learning_rate": 0.0001445538573068326, + "loss": 2.7381, + "step": 9183 + }, + { + "epoch": 0.8320536341192725, + "grad_norm": 0.8037017583847046, + "learning_rate": 0.00014454781610584184, + "loss": 2.7622, + "step": 9184 + }, + { + "epoch": 0.8321442322937193, + "grad_norm": 0.7189391851425171, + "learning_rate": 0.0001445417749048511, + "loss": 2.0398, + "step": 9185 + }, + { + "epoch": 0.832234830468166, + "grad_norm": 0.6840158104896545, + "learning_rate": 0.00014453573370386033, + "loss": 2.1142, + "step": 9186 + }, + { + "epoch": 0.8323254286426128, + "grad_norm": 0.8185265064239502, + "learning_rate": 0.0001445296925028696, + "loss": 2.7177, + "step": 9187 + }, + { + "epoch": 0.8324160268170596, + "grad_norm": 0.8436680436134338, + "learning_rate": 0.00014452365130187883, + "loss": 2.9058, + "step": 9188 + }, + { + "epoch": 0.8325066249915064, + "grad_norm": 0.7808147668838501, + "learning_rate": 0.00014451761010088806, + "loss": 2.7066, + "step": 9189 + }, + { + "epoch": 0.8325972231659532, + "grad_norm": 0.812929093837738, + "learning_rate": 0.0001445115688998973, + "loss": 2.7026, + "step": 9190 + }, + { + "epoch": 0.8326878213404, + "grad_norm": 0.7772384881973267, + "learning_rate": 0.00014450552769890656, + "loss": 2.5727, + "step": 9191 + }, + { + "epoch": 0.8327784195148468, + "grad_norm": 0.7648073434829712, + "learning_rate": 0.0001444994864979158, + "loss": 2.7817, + "step": 9192 + }, + { + "epoch": 0.8328690176892936, + "grad_norm": 0.8262123465538025, + "learning_rate": 0.00014449344529692503, + "loss": 3.1981, + "step": 9193 + }, + { + "epoch": 0.8329596158637403, + "grad_norm": 0.7827180624008179, + "learning_rate": 0.0001444874040959343, + "loss": 2.6357, + "step": 9194 + }, + { + "epoch": 0.8330502140381871, + "grad_norm": 0.8083866834640503, + "learning_rate": 0.00014448136289494352, + "loss": 3.0776, + "step": 9195 + }, + { + "epoch": 0.8331408122126339, + "grad_norm": 0.8562474846839905, + "learning_rate": 0.00014447532169395278, + "loss": 2.5627, + "step": 9196 + }, + { + "epoch": 0.8332314103870807, + "grad_norm": 0.8273684978485107, + "learning_rate": 0.000144469280492962, + "loss": 2.5628, + "step": 9197 + }, + { + "epoch": 0.8333220085615275, + "grad_norm": 0.8003309369087219, + "learning_rate": 0.00014446323929197125, + "loss": 2.7403, + "step": 9198 + }, + { + "epoch": 0.8334126067359743, + "grad_norm": 0.7997527122497559, + "learning_rate": 0.00014445719809098048, + "loss": 2.8017, + "step": 9199 + }, + { + "epoch": 0.8335032049104211, + "grad_norm": 0.8597384691238403, + "learning_rate": 0.00014445115688998975, + "loss": 2.6885, + "step": 9200 + }, + { + "epoch": 0.8335938030848679, + "grad_norm": 0.7197664976119995, + "learning_rate": 0.00014444511568899898, + "loss": 1.9463, + "step": 9201 + }, + { + "epoch": 0.8336844012593146, + "grad_norm": 0.7623018026351929, + "learning_rate": 0.0001444390744880082, + "loss": 2.8407, + "step": 9202 + }, + { + "epoch": 0.8337749994337614, + "grad_norm": 0.8178176283836365, + "learning_rate": 0.00014443303328701747, + "loss": 3.014, + "step": 9203 + }, + { + "epoch": 0.8338655976082082, + "grad_norm": 0.7913644909858704, + "learning_rate": 0.0001444269920860267, + "loss": 2.6937, + "step": 9204 + }, + { + "epoch": 0.833956195782655, + "grad_norm": 0.8097172379493713, + "learning_rate": 0.00014442095088503594, + "loss": 2.9049, + "step": 9205 + }, + { + "epoch": 0.8340467939571018, + "grad_norm": 0.8343501687049866, + "learning_rate": 0.00014441490968404518, + "loss": 3.0919, + "step": 9206 + }, + { + "epoch": 0.8341373921315486, + "grad_norm": 0.7906444668769836, + "learning_rate": 0.00014440886848305444, + "loss": 2.9691, + "step": 9207 + }, + { + "epoch": 0.8342279903059954, + "grad_norm": 0.8271874785423279, + "learning_rate": 0.0001444028272820637, + "loss": 2.9912, + "step": 9208 + }, + { + "epoch": 0.8343185884804422, + "grad_norm": 0.8255289196968079, + "learning_rate": 0.00014439678608107293, + "loss": 2.707, + "step": 9209 + }, + { + "epoch": 0.834409186654889, + "grad_norm": 0.7552477717399597, + "learning_rate": 0.00014439074488008217, + "loss": 2.7123, + "step": 9210 + }, + { + "epoch": 0.8344997848293357, + "grad_norm": 0.8342925906181335, + "learning_rate": 0.0001443847036790914, + "loss": 3.0077, + "step": 9211 + }, + { + "epoch": 0.8345903830037825, + "grad_norm": 0.7140633463859558, + "learning_rate": 0.00014437866247810066, + "loss": 2.2444, + "step": 9212 + }, + { + "epoch": 0.8346809811782293, + "grad_norm": 0.7993923425674438, + "learning_rate": 0.0001443726212771099, + "loss": 2.8895, + "step": 9213 + }, + { + "epoch": 0.8347715793526761, + "grad_norm": 0.8257974982261658, + "learning_rate": 0.00014436658007611913, + "loss": 3.0593, + "step": 9214 + }, + { + "epoch": 0.8348621775271229, + "grad_norm": 0.8521842956542969, + "learning_rate": 0.0001443605388751284, + "loss": 2.7857, + "step": 9215 + }, + { + "epoch": 0.8349527757015696, + "grad_norm": 0.7696899771690369, + "learning_rate": 0.00014435449767413763, + "loss": 2.1085, + "step": 9216 + }, + { + "epoch": 0.8350433738760163, + "grad_norm": 0.7541042566299438, + "learning_rate": 0.0001443484564731469, + "loss": 2.538, + "step": 9217 + }, + { + "epoch": 0.8351339720504631, + "grad_norm": 0.8205408453941345, + "learning_rate": 0.0001443424152721561, + "loss": 2.8103, + "step": 9218 + }, + { + "epoch": 0.8352245702249099, + "grad_norm": 0.8033875823020935, + "learning_rate": 0.00014433637407116535, + "loss": 2.8103, + "step": 9219 + }, + { + "epoch": 0.8353151683993567, + "grad_norm": 0.885503888130188, + "learning_rate": 0.0001443303328701746, + "loss": 3.2305, + "step": 9220 + }, + { + "epoch": 0.8354057665738035, + "grad_norm": 0.8145380020141602, + "learning_rate": 0.00014432429166918385, + "loss": 2.8444, + "step": 9221 + }, + { + "epoch": 0.8354963647482503, + "grad_norm": 0.8719748854637146, + "learning_rate": 0.00014431825046819308, + "loss": 2.9337, + "step": 9222 + }, + { + "epoch": 0.8355869629226971, + "grad_norm": 0.717046320438385, + "learning_rate": 0.00014431220926720232, + "loss": 2.224, + "step": 9223 + }, + { + "epoch": 0.8356775610971439, + "grad_norm": 0.8300582766532898, + "learning_rate": 0.00014430616806621158, + "loss": 2.7647, + "step": 9224 + }, + { + "epoch": 0.8357681592715906, + "grad_norm": 0.8357705473899841, + "learning_rate": 0.0001443001268652208, + "loss": 2.5148, + "step": 9225 + }, + { + "epoch": 0.8358587574460374, + "grad_norm": 0.7972708940505981, + "learning_rate": 0.00014429408566423005, + "loss": 2.6532, + "step": 9226 + }, + { + "epoch": 0.8359493556204842, + "grad_norm": 0.878429651260376, + "learning_rate": 0.00014428804446323928, + "loss": 3.0087, + "step": 9227 + }, + { + "epoch": 0.836039953794931, + "grad_norm": 0.8077180981636047, + "learning_rate": 0.00014428200326224854, + "loss": 2.8233, + "step": 9228 + }, + { + "epoch": 0.8361305519693778, + "grad_norm": 0.7757241129875183, + "learning_rate": 0.00014427596206125778, + "loss": 2.9909, + "step": 9229 + }, + { + "epoch": 0.8362211501438246, + "grad_norm": 0.7686247229576111, + "learning_rate": 0.00014426992086026704, + "loss": 2.938, + "step": 9230 + }, + { + "epoch": 0.8363117483182714, + "grad_norm": 0.8068020343780518, + "learning_rate": 0.00014426387965927627, + "loss": 2.9854, + "step": 9231 + }, + { + "epoch": 0.8364023464927182, + "grad_norm": 0.80671226978302, + "learning_rate": 0.0001442578384582855, + "loss": 2.8276, + "step": 9232 + }, + { + "epoch": 0.836492944667165, + "grad_norm": 0.743682861328125, + "learning_rate": 0.00014425179725729477, + "loss": 2.2018, + "step": 9233 + }, + { + "epoch": 0.8365835428416117, + "grad_norm": 0.7565210461616516, + "learning_rate": 0.000144245756056304, + "loss": 2.1245, + "step": 9234 + }, + { + "epoch": 0.8366741410160585, + "grad_norm": 0.7732259035110474, + "learning_rate": 0.00014423971485531324, + "loss": 2.6642, + "step": 9235 + }, + { + "epoch": 0.8367647391905053, + "grad_norm": 0.778864860534668, + "learning_rate": 0.00014423367365432247, + "loss": 2.7889, + "step": 9236 + }, + { + "epoch": 0.8368553373649521, + "grad_norm": 0.8281701803207397, + "learning_rate": 0.00014422763245333173, + "loss": 2.7902, + "step": 9237 + }, + { + "epoch": 0.8369459355393989, + "grad_norm": 0.8017787933349609, + "learning_rate": 0.000144221591252341, + "loss": 2.7309, + "step": 9238 + }, + { + "epoch": 0.8370365337138457, + "grad_norm": 0.764476478099823, + "learning_rate": 0.0001442155500513502, + "loss": 2.8709, + "step": 9239 + }, + { + "epoch": 0.8371271318882925, + "grad_norm": 0.6770820617675781, + "learning_rate": 0.00014420950885035946, + "loss": 1.9685, + "step": 9240 + }, + { + "epoch": 0.8372177300627393, + "grad_norm": 0.8009704351425171, + "learning_rate": 0.0001442034676493687, + "loss": 2.6685, + "step": 9241 + }, + { + "epoch": 0.837308328237186, + "grad_norm": 0.8003746867179871, + "learning_rate": 0.00014419742644837796, + "loss": 2.7596, + "step": 9242 + }, + { + "epoch": 0.8373989264116328, + "grad_norm": 0.9108754992485046, + "learning_rate": 0.0001441913852473872, + "loss": 3.1916, + "step": 9243 + }, + { + "epoch": 0.8374895245860796, + "grad_norm": 0.8075739741325378, + "learning_rate": 0.00014418534404639642, + "loss": 2.9434, + "step": 9244 + }, + { + "epoch": 0.8375801227605264, + "grad_norm": 0.7317540049552917, + "learning_rate": 0.00014417930284540568, + "loss": 2.0423, + "step": 9245 + }, + { + "epoch": 0.8376707209349732, + "grad_norm": 0.9362093806266785, + "learning_rate": 0.00014417326164441492, + "loss": 3.0604, + "step": 9246 + }, + { + "epoch": 0.83776131910942, + "grad_norm": 0.8223628401756287, + "learning_rate": 0.00014416722044342418, + "loss": 2.7411, + "step": 9247 + }, + { + "epoch": 0.8378519172838668, + "grad_norm": 0.8074533343315125, + "learning_rate": 0.0001441611792424334, + "loss": 2.7916, + "step": 9248 + }, + { + "epoch": 0.8379425154583136, + "grad_norm": 0.9388188123703003, + "learning_rate": 0.00014415513804144265, + "loss": 2.9058, + "step": 9249 + }, + { + "epoch": 0.8380331136327603, + "grad_norm": 0.790765106678009, + "learning_rate": 0.00014414909684045188, + "loss": 2.8851, + "step": 9250 + }, + { + "epoch": 0.8381237118072071, + "grad_norm": 0.796966552734375, + "learning_rate": 0.00014414305563946114, + "loss": 2.8283, + "step": 9251 + }, + { + "epoch": 0.8382143099816539, + "grad_norm": 0.853754997253418, + "learning_rate": 0.00014413701443847038, + "loss": 2.9186, + "step": 9252 + }, + { + "epoch": 0.8383049081561007, + "grad_norm": 0.8494999408721924, + "learning_rate": 0.0001441309732374796, + "loss": 2.9405, + "step": 9253 + }, + { + "epoch": 0.8383955063305475, + "grad_norm": 0.6889272928237915, + "learning_rate": 0.00014412493203648887, + "loss": 1.9939, + "step": 9254 + }, + { + "epoch": 0.8384861045049943, + "grad_norm": 0.8021794557571411, + "learning_rate": 0.0001441188908354981, + "loss": 2.807, + "step": 9255 + }, + { + "epoch": 0.838576702679441, + "grad_norm": 0.7685976624488831, + "learning_rate": 0.00014411284963450734, + "loss": 2.6678, + "step": 9256 + }, + { + "epoch": 0.8386673008538877, + "grad_norm": 0.8089547753334045, + "learning_rate": 0.00014410680843351657, + "loss": 3.0115, + "step": 9257 + }, + { + "epoch": 0.8387578990283345, + "grad_norm": 0.8180484771728516, + "learning_rate": 0.00014410076723252584, + "loss": 2.8556, + "step": 9258 + }, + { + "epoch": 0.8388484972027813, + "grad_norm": 0.6939401030540466, + "learning_rate": 0.00014409472603153507, + "loss": 2.1016, + "step": 9259 + }, + { + "epoch": 0.8389390953772281, + "grad_norm": 0.7660076022148132, + "learning_rate": 0.00014408868483054433, + "loss": 2.4871, + "step": 9260 + }, + { + "epoch": 0.8390296935516749, + "grad_norm": 0.8107670545578003, + "learning_rate": 0.00014408264362955356, + "loss": 2.7135, + "step": 9261 + }, + { + "epoch": 0.8391202917261217, + "grad_norm": 0.7964183688163757, + "learning_rate": 0.0001440766024285628, + "loss": 2.7487, + "step": 9262 + }, + { + "epoch": 0.8392108899005685, + "grad_norm": 0.820298969745636, + "learning_rate": 0.00014407056122757206, + "loss": 3.0945, + "step": 9263 + }, + { + "epoch": 0.8393014880750153, + "grad_norm": 0.860318660736084, + "learning_rate": 0.0001440645200265813, + "loss": 2.7062, + "step": 9264 + }, + { + "epoch": 0.839392086249462, + "grad_norm": 0.8204360008239746, + "learning_rate": 0.00014405847882559053, + "loss": 2.848, + "step": 9265 + }, + { + "epoch": 0.8394826844239088, + "grad_norm": 0.789751410484314, + "learning_rate": 0.00014405243762459976, + "loss": 2.6884, + "step": 9266 + }, + { + "epoch": 0.8395732825983556, + "grad_norm": 0.797418475151062, + "learning_rate": 0.00014404639642360902, + "loss": 2.8524, + "step": 9267 + }, + { + "epoch": 0.8396638807728024, + "grad_norm": 0.8423740267753601, + "learning_rate": 0.00014404035522261828, + "loss": 2.7485, + "step": 9268 + }, + { + "epoch": 0.8397544789472492, + "grad_norm": 0.9299201965332031, + "learning_rate": 0.0001440343140216275, + "loss": 2.1407, + "step": 9269 + }, + { + "epoch": 0.839845077121696, + "grad_norm": 0.8284866809844971, + "learning_rate": 0.00014402827282063675, + "loss": 2.7151, + "step": 9270 + }, + { + "epoch": 0.8399356752961428, + "grad_norm": 0.7891198396682739, + "learning_rate": 0.000144022231619646, + "loss": 2.9569, + "step": 9271 + }, + { + "epoch": 0.8400262734705896, + "grad_norm": 0.8417614698410034, + "learning_rate": 0.00014401619041865525, + "loss": 2.765, + "step": 9272 + }, + { + "epoch": 0.8401168716450363, + "grad_norm": 0.835276186466217, + "learning_rate": 0.00014401014921766448, + "loss": 2.7435, + "step": 9273 + }, + { + "epoch": 0.8402074698194831, + "grad_norm": 0.7834227085113525, + "learning_rate": 0.00014400410801667372, + "loss": 2.9117, + "step": 9274 + }, + { + "epoch": 0.8402980679939299, + "grad_norm": 0.7960100173950195, + "learning_rate": 0.00014399806681568298, + "loss": 2.8998, + "step": 9275 + }, + { + "epoch": 0.8403886661683767, + "grad_norm": 0.8254454731941223, + "learning_rate": 0.0001439920256146922, + "loss": 2.5569, + "step": 9276 + }, + { + "epoch": 0.8404792643428235, + "grad_norm": 0.8137151002883911, + "learning_rate": 0.00014398598441370145, + "loss": 3.0098, + "step": 9277 + }, + { + "epoch": 0.8405698625172703, + "grad_norm": 0.7295335531234741, + "learning_rate": 0.00014397994321271068, + "loss": 2.1097, + "step": 9278 + }, + { + "epoch": 0.8406604606917171, + "grad_norm": 0.769447386264801, + "learning_rate": 0.00014397390201171994, + "loss": 2.9972, + "step": 9279 + }, + { + "epoch": 0.8407510588661639, + "grad_norm": 0.8276401162147522, + "learning_rate": 0.00014396786081072917, + "loss": 3.0249, + "step": 9280 + }, + { + "epoch": 0.8408416570406106, + "grad_norm": 0.8589339852333069, + "learning_rate": 0.00014396181960973844, + "loss": 2.8655, + "step": 9281 + }, + { + "epoch": 0.8409322552150574, + "grad_norm": 0.8292025923728943, + "learning_rate": 0.00014395577840874767, + "loss": 2.6967, + "step": 9282 + }, + { + "epoch": 0.8410228533895042, + "grad_norm": 0.7609112858772278, + "learning_rate": 0.0001439497372077569, + "loss": 2.6786, + "step": 9283 + }, + { + "epoch": 0.841113451563951, + "grad_norm": 0.8161695003509521, + "learning_rate": 0.00014394369600676616, + "loss": 2.9142, + "step": 9284 + }, + { + "epoch": 0.8412040497383978, + "grad_norm": 0.8163471221923828, + "learning_rate": 0.0001439376548057754, + "loss": 2.8185, + "step": 9285 + }, + { + "epoch": 0.8412946479128446, + "grad_norm": 0.8242334127426147, + "learning_rate": 0.00014393161360478463, + "loss": 2.7616, + "step": 9286 + }, + { + "epoch": 0.8413852460872914, + "grad_norm": 0.6650534272193909, + "learning_rate": 0.00014392557240379387, + "loss": 1.7558, + "step": 9287 + }, + { + "epoch": 0.8414758442617382, + "grad_norm": 0.7782494425773621, + "learning_rate": 0.00014391953120280313, + "loss": 2.7844, + "step": 9288 + }, + { + "epoch": 0.841566442436185, + "grad_norm": 0.8110802173614502, + "learning_rate": 0.00014391349000181236, + "loss": 3.0721, + "step": 9289 + }, + { + "epoch": 0.8416570406106317, + "grad_norm": 0.8111177086830139, + "learning_rate": 0.0001439074488008216, + "loss": 2.8711, + "step": 9290 + }, + { + "epoch": 0.8417476387850785, + "grad_norm": 0.8035308718681335, + "learning_rate": 0.00014390140759983086, + "loss": 2.9868, + "step": 9291 + }, + { + "epoch": 0.8418382369595253, + "grad_norm": 0.8460074067115784, + "learning_rate": 0.0001438953663988401, + "loss": 2.8466, + "step": 9292 + }, + { + "epoch": 0.8419288351339721, + "grad_norm": 0.767741322517395, + "learning_rate": 0.00014388932519784935, + "loss": 2.6533, + "step": 9293 + }, + { + "epoch": 0.8420194333084189, + "grad_norm": 0.9045170545578003, + "learning_rate": 0.0001438832839968586, + "loss": 2.7896, + "step": 9294 + }, + { + "epoch": 0.8421100314828657, + "grad_norm": 0.6756736040115356, + "learning_rate": 0.00014387724279586782, + "loss": 2.1551, + "step": 9295 + }, + { + "epoch": 0.8422006296573125, + "grad_norm": 0.6818145513534546, + "learning_rate": 0.00014387120159487705, + "loss": 2.0891, + "step": 9296 + }, + { + "epoch": 0.8422912278317591, + "grad_norm": 1.0118056535720825, + "learning_rate": 0.00014386516039388632, + "loss": 2.8542, + "step": 9297 + }, + { + "epoch": 0.8423818260062059, + "grad_norm": 0.5628468990325928, + "learning_rate": 0.00014385911919289558, + "loss": 1.4538, + "step": 9298 + }, + { + "epoch": 0.8424724241806527, + "grad_norm": 0.84112548828125, + "learning_rate": 0.00014385307799190478, + "loss": 2.7854, + "step": 9299 + }, + { + "epoch": 0.8425630223550995, + "grad_norm": 0.8127212524414062, + "learning_rate": 0.00014384703679091405, + "loss": 2.6429, + "step": 9300 + }, + { + "epoch": 0.8426536205295463, + "grad_norm": 0.916897714138031, + "learning_rate": 0.00014384099558992328, + "loss": 2.7618, + "step": 9301 + }, + { + "epoch": 0.8427442187039931, + "grad_norm": 0.8793201446533203, + "learning_rate": 0.00014383495438893254, + "loss": 3.1773, + "step": 9302 + }, + { + "epoch": 0.8428348168784399, + "grad_norm": 0.823188304901123, + "learning_rate": 0.00014382891318794175, + "loss": 3.2515, + "step": 9303 + }, + { + "epoch": 0.8429254150528867, + "grad_norm": 0.8044430613517761, + "learning_rate": 0.000143822871986951, + "loss": 2.7182, + "step": 9304 + }, + { + "epoch": 0.8430160132273334, + "grad_norm": 0.8280612826347351, + "learning_rate": 0.00014381683078596027, + "loss": 2.9488, + "step": 9305 + }, + { + "epoch": 0.8431066114017802, + "grad_norm": 0.7739838361740112, + "learning_rate": 0.0001438107895849695, + "loss": 3.0678, + "step": 9306 + }, + { + "epoch": 0.843197209576227, + "grad_norm": 0.772138237953186, + "learning_rate": 0.00014380474838397874, + "loss": 2.9125, + "step": 9307 + }, + { + "epoch": 0.8432878077506738, + "grad_norm": 0.8123283386230469, + "learning_rate": 0.00014379870718298797, + "loss": 2.9402, + "step": 9308 + }, + { + "epoch": 0.8433784059251206, + "grad_norm": 0.728820264339447, + "learning_rate": 0.00014379266598199723, + "loss": 2.5417, + "step": 9309 + }, + { + "epoch": 0.8434690040995674, + "grad_norm": 0.774669885635376, + "learning_rate": 0.00014378662478100647, + "loss": 2.6666, + "step": 9310 + }, + { + "epoch": 0.8435596022740142, + "grad_norm": 0.7972124218940735, + "learning_rate": 0.00014378058358001573, + "loss": 2.6716, + "step": 9311 + }, + { + "epoch": 0.843650200448461, + "grad_norm": 0.7455300688743591, + "learning_rate": 0.00014377454237902496, + "loss": 2.0742, + "step": 9312 + }, + { + "epoch": 0.8437407986229077, + "grad_norm": 0.752220630645752, + "learning_rate": 0.0001437685011780342, + "loss": 2.2426, + "step": 9313 + }, + { + "epoch": 0.8438313967973545, + "grad_norm": 0.787564754486084, + "learning_rate": 0.00014376245997704346, + "loss": 2.8568, + "step": 9314 + }, + { + "epoch": 0.8439219949718013, + "grad_norm": 0.8143966197967529, + "learning_rate": 0.0001437564187760527, + "loss": 2.8185, + "step": 9315 + }, + { + "epoch": 0.8440125931462481, + "grad_norm": 0.7583525776863098, + "learning_rate": 0.00014375037757506193, + "loss": 2.8232, + "step": 9316 + }, + { + "epoch": 0.8441031913206949, + "grad_norm": 0.7713313102722168, + "learning_rate": 0.00014374433637407116, + "loss": 2.7398, + "step": 9317 + }, + { + "epoch": 0.8441937894951417, + "grad_norm": 0.7615464925765991, + "learning_rate": 0.00014373829517308042, + "loss": 2.096, + "step": 9318 + }, + { + "epoch": 0.8442843876695885, + "grad_norm": 0.8248372077941895, + "learning_rate": 0.00014373225397208965, + "loss": 2.7932, + "step": 9319 + }, + { + "epoch": 0.8443749858440353, + "grad_norm": 0.852104127407074, + "learning_rate": 0.0001437262127710989, + "loss": 2.8582, + "step": 9320 + }, + { + "epoch": 0.844465584018482, + "grad_norm": 0.6749985218048096, + "learning_rate": 0.00014372017157010815, + "loss": 2.0745, + "step": 9321 + }, + { + "epoch": 0.8445561821929288, + "grad_norm": 0.7775301337242126, + "learning_rate": 0.00014371413036911738, + "loss": 2.84, + "step": 9322 + }, + { + "epoch": 0.8446467803673756, + "grad_norm": 0.685451865196228, + "learning_rate": 0.00014370808916812665, + "loss": 2.1164, + "step": 9323 + }, + { + "epoch": 0.8447373785418224, + "grad_norm": 0.7590100169181824, + "learning_rate": 0.00014370204796713588, + "loss": 2.8342, + "step": 9324 + }, + { + "epoch": 0.8448279767162692, + "grad_norm": 0.8789838552474976, + "learning_rate": 0.0001436960067661451, + "loss": 2.9563, + "step": 9325 + }, + { + "epoch": 0.844918574890716, + "grad_norm": 0.7525656819343567, + "learning_rate": 0.00014368996556515435, + "loss": 2.7404, + "step": 9326 + }, + { + "epoch": 0.8450091730651628, + "grad_norm": 0.833757221698761, + "learning_rate": 0.0001436839243641636, + "loss": 2.8525, + "step": 9327 + }, + { + "epoch": 0.8450997712396096, + "grad_norm": 0.7661402225494385, + "learning_rate": 0.00014367788316317284, + "loss": 2.8749, + "step": 9328 + }, + { + "epoch": 0.8451903694140563, + "grad_norm": 0.8534374237060547, + "learning_rate": 0.00014367184196218208, + "loss": 2.9618, + "step": 9329 + }, + { + "epoch": 0.8452809675885031, + "grad_norm": 0.8632053732872009, + "learning_rate": 0.00014366580076119134, + "loss": 3.0559, + "step": 9330 + }, + { + "epoch": 0.8453715657629499, + "grad_norm": 0.7785208821296692, + "learning_rate": 0.00014365975956020057, + "loss": 2.8104, + "step": 9331 + }, + { + "epoch": 0.8454621639373967, + "grad_norm": 0.8283411264419556, + "learning_rate": 0.00014365371835920983, + "loss": 2.7366, + "step": 9332 + }, + { + "epoch": 0.8455527621118435, + "grad_norm": 0.8567888140678406, + "learning_rate": 0.00014364767715821904, + "loss": 2.8469, + "step": 9333 + }, + { + "epoch": 0.8456433602862903, + "grad_norm": 0.8156989812850952, + "learning_rate": 0.0001436416359572283, + "loss": 2.9134, + "step": 9334 + }, + { + "epoch": 0.8457339584607371, + "grad_norm": 0.7854901552200317, + "learning_rate": 0.00014363559475623756, + "loss": 3.0552, + "step": 9335 + }, + { + "epoch": 0.8458245566351839, + "grad_norm": 0.7625978589057922, + "learning_rate": 0.0001436295535552468, + "loss": 2.6046, + "step": 9336 + }, + { + "epoch": 0.8459151548096305, + "grad_norm": 0.7829076647758484, + "learning_rate": 0.00014362351235425603, + "loss": 2.7591, + "step": 9337 + }, + { + "epoch": 0.8460057529840773, + "grad_norm": 0.7497204542160034, + "learning_rate": 0.00014361747115326526, + "loss": 2.7203, + "step": 9338 + }, + { + "epoch": 0.8460963511585241, + "grad_norm": 0.8219353556632996, + "learning_rate": 0.00014361142995227453, + "loss": 2.8839, + "step": 9339 + }, + { + "epoch": 0.8461869493329709, + "grad_norm": 0.8880892395973206, + "learning_rate": 0.00014360538875128376, + "loss": 3.0565, + "step": 9340 + }, + { + "epoch": 0.8462775475074177, + "grad_norm": 0.8615801334381104, + "learning_rate": 0.000143599347550293, + "loss": 2.7471, + "step": 9341 + }, + { + "epoch": 0.8463681456818645, + "grad_norm": 0.8280979990959167, + "learning_rate": 0.00014359330634930225, + "loss": 3.042, + "step": 9342 + }, + { + "epoch": 0.8464587438563113, + "grad_norm": 0.8428086638450623, + "learning_rate": 0.0001435872651483115, + "loss": 3.0705, + "step": 9343 + }, + { + "epoch": 0.846549342030758, + "grad_norm": 0.8751518726348877, + "learning_rate": 0.00014358122394732075, + "loss": 2.8042, + "step": 9344 + }, + { + "epoch": 0.8466399402052048, + "grad_norm": 0.7908778786659241, + "learning_rate": 0.00014357518274632998, + "loss": 2.6698, + "step": 9345 + }, + { + "epoch": 0.8467305383796516, + "grad_norm": 0.7684834599494934, + "learning_rate": 0.00014356914154533922, + "loss": 2.715, + "step": 9346 + }, + { + "epoch": 0.8468211365540984, + "grad_norm": 0.8695321083068848, + "learning_rate": 0.00014356310034434845, + "loss": 2.8394, + "step": 9347 + }, + { + "epoch": 0.8469117347285452, + "grad_norm": 0.6105039119720459, + "learning_rate": 0.0001435570591433577, + "loss": 1.5329, + "step": 9348 + }, + { + "epoch": 0.847002332902992, + "grad_norm": 0.7616975903511047, + "learning_rate": 0.00014355101794236695, + "loss": 2.7467, + "step": 9349 + }, + { + "epoch": 0.8470929310774388, + "grad_norm": 0.8394117951393127, + "learning_rate": 0.00014354497674137618, + "loss": 2.8931, + "step": 9350 + }, + { + "epoch": 0.8471835292518856, + "grad_norm": 0.6650504469871521, + "learning_rate": 0.00014353893554038544, + "loss": 2.1673, + "step": 9351 + }, + { + "epoch": 0.8472741274263323, + "grad_norm": 0.8543188571929932, + "learning_rate": 0.00014353289433939468, + "loss": 2.8316, + "step": 9352 + }, + { + "epoch": 0.8473647256007791, + "grad_norm": 0.806194007396698, + "learning_rate": 0.00014352685313840394, + "loss": 2.7394, + "step": 9353 + }, + { + "epoch": 0.8474553237752259, + "grad_norm": 0.8558921813964844, + "learning_rate": 0.00014352081193741314, + "loss": 2.6746, + "step": 9354 + }, + { + "epoch": 0.8475459219496727, + "grad_norm": 0.7974327802658081, + "learning_rate": 0.0001435147707364224, + "loss": 2.9082, + "step": 9355 + }, + { + "epoch": 0.8476365201241195, + "grad_norm": 0.85054612159729, + "learning_rate": 0.00014350872953543164, + "loss": 3.0267, + "step": 9356 + }, + { + "epoch": 0.8477271182985663, + "grad_norm": 0.8384100198745728, + "learning_rate": 0.0001435026883344409, + "loss": 2.8742, + "step": 9357 + }, + { + "epoch": 0.8478177164730131, + "grad_norm": 0.7998188734054565, + "learning_rate": 0.00014349664713345014, + "loss": 2.9429, + "step": 9358 + }, + { + "epoch": 0.8479083146474599, + "grad_norm": 0.8080347776412964, + "learning_rate": 0.00014349060593245937, + "loss": 2.8055, + "step": 9359 + }, + { + "epoch": 0.8479989128219066, + "grad_norm": 0.7903016209602356, + "learning_rate": 0.00014348456473146863, + "loss": 2.843, + "step": 9360 + }, + { + "epoch": 0.8480895109963534, + "grad_norm": 0.7989839911460876, + "learning_rate": 0.00014347852353047786, + "loss": 2.9466, + "step": 9361 + }, + { + "epoch": 0.8481801091708002, + "grad_norm": 0.8160963654518127, + "learning_rate": 0.00014347248232948713, + "loss": 3.0852, + "step": 9362 + }, + { + "epoch": 0.848270707345247, + "grad_norm": 0.8259421586990356, + "learning_rate": 0.00014346644112849633, + "loss": 3.0798, + "step": 9363 + }, + { + "epoch": 0.8483613055196938, + "grad_norm": 0.747756838798523, + "learning_rate": 0.0001434603999275056, + "loss": 2.7529, + "step": 9364 + }, + { + "epoch": 0.8484519036941406, + "grad_norm": 0.7953104972839355, + "learning_rate": 0.00014345435872651485, + "loss": 2.9776, + "step": 9365 + }, + { + "epoch": 0.8485425018685874, + "grad_norm": 0.7880508303642273, + "learning_rate": 0.0001434483175255241, + "loss": 2.7617, + "step": 9366 + }, + { + "epoch": 0.8486331000430342, + "grad_norm": 0.8015766739845276, + "learning_rate": 0.00014344227632453332, + "loss": 2.7369, + "step": 9367 + }, + { + "epoch": 0.848723698217481, + "grad_norm": 0.7712604403495789, + "learning_rate": 0.00014343623512354256, + "loss": 2.806, + "step": 9368 + }, + { + "epoch": 0.8488142963919277, + "grad_norm": 0.7914013862609863, + "learning_rate": 0.00014343019392255182, + "loss": 2.9364, + "step": 9369 + }, + { + "epoch": 0.8489048945663745, + "grad_norm": 0.8287401795387268, + "learning_rate": 0.00014342415272156105, + "loss": 2.8977, + "step": 9370 + }, + { + "epoch": 0.8489954927408213, + "grad_norm": 0.790749728679657, + "learning_rate": 0.00014341811152057029, + "loss": 2.6534, + "step": 9371 + }, + { + "epoch": 0.8490860909152681, + "grad_norm": 0.7328025698661804, + "learning_rate": 0.00014341207031957955, + "loss": 2.7096, + "step": 9372 + }, + { + "epoch": 0.8491766890897149, + "grad_norm": 0.8443465828895569, + "learning_rate": 0.00014340602911858878, + "loss": 3.1385, + "step": 9373 + }, + { + "epoch": 0.8492672872641617, + "grad_norm": 0.8756227493286133, + "learning_rate": 0.00014339998791759804, + "loss": 1.9145, + "step": 9374 + }, + { + "epoch": 0.8493578854386085, + "grad_norm": 0.722308874130249, + "learning_rate": 0.00014339394671660728, + "loss": 2.0969, + "step": 9375 + }, + { + "epoch": 0.8494484836130552, + "grad_norm": 0.8037769198417664, + "learning_rate": 0.0001433879055156165, + "loss": 2.7882, + "step": 9376 + }, + { + "epoch": 0.849539081787502, + "grad_norm": 0.8771306872367859, + "learning_rate": 0.00014338186431462574, + "loss": 2.7979, + "step": 9377 + }, + { + "epoch": 0.8496296799619487, + "grad_norm": 0.7788025736808777, + "learning_rate": 0.000143375823113635, + "loss": 2.711, + "step": 9378 + }, + { + "epoch": 0.8497202781363955, + "grad_norm": 0.7379444241523743, + "learning_rate": 0.00014336978191264424, + "loss": 2.6542, + "step": 9379 + }, + { + "epoch": 0.8498108763108423, + "grad_norm": 0.8752089738845825, + "learning_rate": 0.00014336374071165347, + "loss": 2.6166, + "step": 9380 + }, + { + "epoch": 0.8499014744852891, + "grad_norm": 0.8438320159912109, + "learning_rate": 0.00014335769951066274, + "loss": 2.8264, + "step": 9381 + }, + { + "epoch": 0.8499920726597359, + "grad_norm": 0.8129565119743347, + "learning_rate": 0.00014335165830967197, + "loss": 2.809, + "step": 9382 + }, + { + "epoch": 0.8500826708341827, + "grad_norm": 0.8385631442070007, + "learning_rate": 0.00014334561710868123, + "loss": 3.0009, + "step": 9383 + }, + { + "epoch": 0.8501732690086294, + "grad_norm": 0.8058321475982666, + "learning_rate": 0.00014333957590769044, + "loss": 3.0075, + "step": 9384 + }, + { + "epoch": 0.8502638671830762, + "grad_norm": 0.790810763835907, + "learning_rate": 0.0001433335347066997, + "loss": 2.8494, + "step": 9385 + }, + { + "epoch": 0.850354465357523, + "grad_norm": 0.7922213077545166, + "learning_rate": 0.00014332749350570893, + "loss": 2.7322, + "step": 9386 + }, + { + "epoch": 0.8504450635319698, + "grad_norm": 0.8674166798591614, + "learning_rate": 0.0001433214523047182, + "loss": 2.9377, + "step": 9387 + }, + { + "epoch": 0.8505356617064166, + "grad_norm": 0.8130659461021423, + "learning_rate": 0.00014331541110372743, + "loss": 2.6758, + "step": 9388 + }, + { + "epoch": 0.8506262598808634, + "grad_norm": 0.787002444267273, + "learning_rate": 0.00014330936990273666, + "loss": 2.8814, + "step": 9389 + }, + { + "epoch": 0.8507168580553102, + "grad_norm": 0.761346161365509, + "learning_rate": 0.00014330332870174592, + "loss": 2.7615, + "step": 9390 + }, + { + "epoch": 0.850807456229757, + "grad_norm": 0.7878972291946411, + "learning_rate": 0.00014329728750075516, + "loss": 2.5752, + "step": 9391 + }, + { + "epoch": 0.8508980544042037, + "grad_norm": 0.8934152722358704, + "learning_rate": 0.0001432912462997644, + "loss": 2.9401, + "step": 9392 + }, + { + "epoch": 0.8509886525786505, + "grad_norm": 0.8060358762741089, + "learning_rate": 0.00014328520509877363, + "loss": 2.9427, + "step": 9393 + }, + { + "epoch": 0.8510792507530973, + "grad_norm": 0.7734175324440002, + "learning_rate": 0.00014327916389778289, + "loss": 2.6284, + "step": 9394 + }, + { + "epoch": 0.8511698489275441, + "grad_norm": 0.7502239942550659, + "learning_rate": 0.00014327312269679215, + "loss": 2.5694, + "step": 9395 + }, + { + "epoch": 0.8512604471019909, + "grad_norm": 0.8932037949562073, + "learning_rate": 0.00014326708149580138, + "loss": 2.6525, + "step": 9396 + }, + { + "epoch": 0.8513510452764377, + "grad_norm": 0.8687165975570679, + "learning_rate": 0.00014326104029481062, + "loss": 2.5599, + "step": 9397 + }, + { + "epoch": 0.8514416434508845, + "grad_norm": 0.8271833658218384, + "learning_rate": 0.00014325499909381985, + "loss": 2.6752, + "step": 9398 + }, + { + "epoch": 0.8515322416253313, + "grad_norm": 0.7265349626541138, + "learning_rate": 0.0001432489578928291, + "loss": 2.4534, + "step": 9399 + }, + { + "epoch": 0.851622839799778, + "grad_norm": 0.6488290429115295, + "learning_rate": 0.00014324291669183834, + "loss": 1.8972, + "step": 9400 + }, + { + "epoch": 0.8517134379742248, + "grad_norm": 0.7632068395614624, + "learning_rate": 0.00014323687549084758, + "loss": 2.8627, + "step": 9401 + }, + { + "epoch": 0.8518040361486716, + "grad_norm": 0.803461492061615, + "learning_rate": 0.00014323083428985684, + "loss": 2.9057, + "step": 9402 + }, + { + "epoch": 0.8518946343231184, + "grad_norm": 0.7872018218040466, + "learning_rate": 0.00014322479308886607, + "loss": 3.0097, + "step": 9403 + }, + { + "epoch": 0.8519852324975652, + "grad_norm": 0.9044871926307678, + "learning_rate": 0.00014321875188787534, + "loss": 2.75, + "step": 9404 + }, + { + "epoch": 0.852075830672012, + "grad_norm": 0.7880036234855652, + "learning_rate": 0.00014321271068688454, + "loss": 2.8325, + "step": 9405 + }, + { + "epoch": 0.8521664288464588, + "grad_norm": 0.8709837794303894, + "learning_rate": 0.0001432066694858938, + "loss": 2.8753, + "step": 9406 + }, + { + "epoch": 0.8522570270209056, + "grad_norm": 0.6976466178894043, + "learning_rate": 0.00014320062828490304, + "loss": 2.0328, + "step": 9407 + }, + { + "epoch": 0.8523476251953523, + "grad_norm": 0.7677420377731323, + "learning_rate": 0.0001431945870839123, + "loss": 2.4949, + "step": 9408 + }, + { + "epoch": 0.8524382233697991, + "grad_norm": 0.7798047065734863, + "learning_rate": 0.00014318854588292153, + "loss": 2.9075, + "step": 9409 + }, + { + "epoch": 0.8525288215442459, + "grad_norm": 0.8095859289169312, + "learning_rate": 0.00014318250468193077, + "loss": 2.8652, + "step": 9410 + }, + { + "epoch": 0.8526194197186927, + "grad_norm": 0.8407798409461975, + "learning_rate": 0.00014317646348094003, + "loss": 2.6903, + "step": 9411 + }, + { + "epoch": 0.8527100178931395, + "grad_norm": 0.7065296173095703, + "learning_rate": 0.00014317042227994926, + "loss": 2.0702, + "step": 9412 + }, + { + "epoch": 0.8528006160675863, + "grad_norm": 0.7932907938957214, + "learning_rate": 0.0001431643810789585, + "loss": 2.8811, + "step": 9413 + }, + { + "epoch": 0.8528912142420331, + "grad_norm": 0.8749739527702332, + "learning_rate": 0.00014315833987796773, + "loss": 2.8966, + "step": 9414 + }, + { + "epoch": 0.8529818124164799, + "grad_norm": 0.8622158169746399, + "learning_rate": 0.000143152298676977, + "loss": 2.8618, + "step": 9415 + }, + { + "epoch": 0.8530724105909266, + "grad_norm": 0.7706058025360107, + "learning_rate": 0.00014314625747598623, + "loss": 2.5242, + "step": 9416 + }, + { + "epoch": 0.8531630087653734, + "grad_norm": 0.7949343919754028, + "learning_rate": 0.0001431402162749955, + "loss": 2.6326, + "step": 9417 + }, + { + "epoch": 0.8532536069398201, + "grad_norm": 0.8112022876739502, + "learning_rate": 0.00014313417507400472, + "loss": 2.6124, + "step": 9418 + }, + { + "epoch": 0.8533442051142669, + "grad_norm": 0.8575919270515442, + "learning_rate": 0.00014312813387301395, + "loss": 2.6925, + "step": 9419 + }, + { + "epoch": 0.8534348032887137, + "grad_norm": 0.8131237030029297, + "learning_rate": 0.00014312209267202322, + "loss": 2.8164, + "step": 9420 + }, + { + "epoch": 0.8535254014631605, + "grad_norm": 0.7962391972541809, + "learning_rate": 0.00014311605147103245, + "loss": 2.613, + "step": 9421 + }, + { + "epoch": 0.8536159996376073, + "grad_norm": 0.9034964442253113, + "learning_rate": 0.00014311001027004168, + "loss": 2.7447, + "step": 9422 + }, + { + "epoch": 0.853706597812054, + "grad_norm": 0.7121245861053467, + "learning_rate": 0.00014310396906905092, + "loss": 2.3591, + "step": 9423 + }, + { + "epoch": 0.8537971959865008, + "grad_norm": 0.7945100665092468, + "learning_rate": 0.00014309792786806018, + "loss": 2.7229, + "step": 9424 + }, + { + "epoch": 0.8538877941609476, + "grad_norm": 0.5386723279953003, + "learning_rate": 0.00014309188666706944, + "loss": 1.2017, + "step": 9425 + }, + { + "epoch": 0.8539783923353944, + "grad_norm": 0.7783799171447754, + "learning_rate": 0.00014308584546607865, + "loss": 2.9239, + "step": 9426 + }, + { + "epoch": 0.8540689905098412, + "grad_norm": 0.7624185085296631, + "learning_rate": 0.0001430798042650879, + "loss": 2.9178, + "step": 9427 + }, + { + "epoch": 0.854159588684288, + "grad_norm": 0.7592805624008179, + "learning_rate": 0.00014307376306409714, + "loss": 2.7646, + "step": 9428 + }, + { + "epoch": 0.8542501868587348, + "grad_norm": 0.571718156337738, + "learning_rate": 0.0001430677218631064, + "loss": 1.5245, + "step": 9429 + }, + { + "epoch": 0.8543407850331816, + "grad_norm": 0.8422684073448181, + "learning_rate": 0.00014306168066211564, + "loss": 2.9204, + "step": 9430 + }, + { + "epoch": 0.8544313832076283, + "grad_norm": 0.7960715889930725, + "learning_rate": 0.00014305563946112487, + "loss": 2.8292, + "step": 9431 + }, + { + "epoch": 0.8545219813820751, + "grad_norm": 0.8476369976997375, + "learning_rate": 0.00014304959826013413, + "loss": 2.7662, + "step": 9432 + }, + { + "epoch": 0.8546125795565219, + "grad_norm": 0.6774376034736633, + "learning_rate": 0.00014304355705914337, + "loss": 1.984, + "step": 9433 + }, + { + "epoch": 0.8547031777309687, + "grad_norm": 0.8117228746414185, + "learning_rate": 0.00014303751585815263, + "loss": 2.0635, + "step": 9434 + }, + { + "epoch": 0.8547937759054155, + "grad_norm": 0.7860783934593201, + "learning_rate": 0.00014303147465716183, + "loss": 2.9332, + "step": 9435 + }, + { + "epoch": 0.8548843740798623, + "grad_norm": 0.7570704221725464, + "learning_rate": 0.0001430254334561711, + "loss": 2.6646, + "step": 9436 + }, + { + "epoch": 0.8549749722543091, + "grad_norm": 0.8492280840873718, + "learning_rate": 0.00014301939225518033, + "loss": 2.6491, + "step": 9437 + }, + { + "epoch": 0.8550655704287559, + "grad_norm": 0.843291699886322, + "learning_rate": 0.0001430133510541896, + "loss": 2.4697, + "step": 9438 + }, + { + "epoch": 0.8551561686032026, + "grad_norm": 0.8020780682563782, + "learning_rate": 0.00014300730985319883, + "loss": 2.9098, + "step": 9439 + }, + { + "epoch": 0.8552467667776494, + "grad_norm": 0.7866520285606384, + "learning_rate": 0.00014300126865220806, + "loss": 2.5891, + "step": 9440 + }, + { + "epoch": 0.8553373649520962, + "grad_norm": 0.8327063322067261, + "learning_rate": 0.00014299522745121732, + "loss": 2.5825, + "step": 9441 + }, + { + "epoch": 0.855427963126543, + "grad_norm": 0.9145222306251526, + "learning_rate": 0.00014298918625022655, + "loss": 3.0339, + "step": 9442 + }, + { + "epoch": 0.8555185613009898, + "grad_norm": 0.8648324012756348, + "learning_rate": 0.0001429831450492358, + "loss": 2.7615, + "step": 9443 + }, + { + "epoch": 0.8556091594754366, + "grad_norm": 0.8651725053787231, + "learning_rate": 0.00014297710384824502, + "loss": 2.841, + "step": 9444 + }, + { + "epoch": 0.8556997576498834, + "grad_norm": 0.7877336144447327, + "learning_rate": 0.00014297106264725428, + "loss": 2.8556, + "step": 9445 + }, + { + "epoch": 0.8557903558243302, + "grad_norm": 0.8657513856887817, + "learning_rate": 0.00014296502144626352, + "loss": 2.9614, + "step": 9446 + }, + { + "epoch": 0.855880953998777, + "grad_norm": 0.8250658512115479, + "learning_rate": 0.00014295898024527278, + "loss": 2.5256, + "step": 9447 + }, + { + "epoch": 0.8559715521732237, + "grad_norm": 0.9008957743644714, + "learning_rate": 0.000142952939044282, + "loss": 2.642, + "step": 9448 + }, + { + "epoch": 0.8560621503476705, + "grad_norm": 0.8409892320632935, + "learning_rate": 0.00014294689784329125, + "loss": 2.8651, + "step": 9449 + }, + { + "epoch": 0.8561527485221173, + "grad_norm": 0.8324340581893921, + "learning_rate": 0.0001429408566423005, + "loss": 3.1359, + "step": 9450 + }, + { + "epoch": 0.8562433466965641, + "grad_norm": 0.7849398255348206, + "learning_rate": 0.00014293481544130974, + "loss": 2.6729, + "step": 9451 + }, + { + "epoch": 0.8563339448710109, + "grad_norm": 0.7913246750831604, + "learning_rate": 0.00014292877424031898, + "loss": 2.5774, + "step": 9452 + }, + { + "epoch": 0.8564245430454577, + "grad_norm": 0.7942060828208923, + "learning_rate": 0.0001429227330393282, + "loss": 2.8185, + "step": 9453 + }, + { + "epoch": 0.8565151412199045, + "grad_norm": 0.7557292580604553, + "learning_rate": 0.00014291669183833747, + "loss": 2.8569, + "step": 9454 + }, + { + "epoch": 0.8566057393943513, + "grad_norm": 0.7769628167152405, + "learning_rate": 0.00014291065063734673, + "loss": 2.9776, + "step": 9455 + }, + { + "epoch": 0.856696337568798, + "grad_norm": 0.7882164120674133, + "learning_rate": 0.00014290460943635594, + "loss": 2.9367, + "step": 9456 + }, + { + "epoch": 0.8567869357432448, + "grad_norm": 0.729138195514679, + "learning_rate": 0.0001428985682353652, + "loss": 2.2338, + "step": 9457 + }, + { + "epoch": 0.8568775339176916, + "grad_norm": 0.8029696345329285, + "learning_rate": 0.00014289252703437444, + "loss": 2.7347, + "step": 9458 + }, + { + "epoch": 0.8569681320921383, + "grad_norm": 0.7380489110946655, + "learning_rate": 0.0001428864858333837, + "loss": 2.5437, + "step": 9459 + }, + { + "epoch": 0.8570587302665851, + "grad_norm": 0.6599135994911194, + "learning_rate": 0.00014288044463239293, + "loss": 2.0431, + "step": 9460 + }, + { + "epoch": 0.8571493284410319, + "grad_norm": 0.7873519659042358, + "learning_rate": 0.00014287440343140216, + "loss": 2.5873, + "step": 9461 + }, + { + "epoch": 0.8572399266154787, + "grad_norm": 0.7127612829208374, + "learning_rate": 0.00014286836223041143, + "loss": 2.145, + "step": 9462 + }, + { + "epoch": 0.8573305247899254, + "grad_norm": 0.8175622820854187, + "learning_rate": 0.00014286232102942066, + "loss": 2.6543, + "step": 9463 + }, + { + "epoch": 0.8574211229643722, + "grad_norm": 0.7039610743522644, + "learning_rate": 0.0001428562798284299, + "loss": 2.2516, + "step": 9464 + }, + { + "epoch": 0.857511721138819, + "grad_norm": 0.827723503112793, + "learning_rate": 0.00014285023862743913, + "loss": 2.6162, + "step": 9465 + }, + { + "epoch": 0.8576023193132658, + "grad_norm": 0.8571553230285645, + "learning_rate": 0.0001428441974264484, + "loss": 2.7731, + "step": 9466 + }, + { + "epoch": 0.8576929174877126, + "grad_norm": 0.7749573588371277, + "learning_rate": 0.00014283815622545762, + "loss": 2.5808, + "step": 9467 + }, + { + "epoch": 0.8577835156621594, + "grad_norm": 0.7289462685585022, + "learning_rate": 0.00014283211502446688, + "loss": 2.2455, + "step": 9468 + }, + { + "epoch": 0.8578741138366062, + "grad_norm": 0.7837605476379395, + "learning_rate": 0.0001428260738234761, + "loss": 2.8351, + "step": 9469 + }, + { + "epoch": 0.857964712011053, + "grad_norm": 0.85319983959198, + "learning_rate": 0.00014282003262248535, + "loss": 2.9048, + "step": 9470 + }, + { + "epoch": 0.8580553101854997, + "grad_norm": 0.8156236410140991, + "learning_rate": 0.0001428139914214946, + "loss": 2.7154, + "step": 9471 + }, + { + "epoch": 0.8581459083599465, + "grad_norm": 0.7932498455047607, + "learning_rate": 0.00014280795022050385, + "loss": 3.0658, + "step": 9472 + }, + { + "epoch": 0.8582365065343933, + "grad_norm": 0.7772274613380432, + "learning_rate": 0.00014280190901951308, + "loss": 2.5575, + "step": 9473 + }, + { + "epoch": 0.8583271047088401, + "grad_norm": 0.8335005044937134, + "learning_rate": 0.00014279586781852232, + "loss": 2.8371, + "step": 9474 + }, + { + "epoch": 0.8584177028832869, + "grad_norm": 0.8072757720947266, + "learning_rate": 0.00014278982661753158, + "loss": 2.6682, + "step": 9475 + }, + { + "epoch": 0.8585083010577337, + "grad_norm": 0.5771660208702087, + "learning_rate": 0.0001427837854165408, + "loss": 1.4127, + "step": 9476 + }, + { + "epoch": 0.8585988992321805, + "grad_norm": 0.8088504076004028, + "learning_rate": 0.00014277774421555004, + "loss": 2.7707, + "step": 9477 + }, + { + "epoch": 0.8586894974066273, + "grad_norm": 0.8583757281303406, + "learning_rate": 0.0001427717030145593, + "loss": 3.1272, + "step": 9478 + }, + { + "epoch": 0.858780095581074, + "grad_norm": 0.8217593431472778, + "learning_rate": 0.00014276566181356854, + "loss": 3.0396, + "step": 9479 + }, + { + "epoch": 0.8588706937555208, + "grad_norm": 0.8282230496406555, + "learning_rate": 0.0001427596206125778, + "loss": 2.1609, + "step": 9480 + }, + { + "epoch": 0.8589612919299676, + "grad_norm": 0.7986219525337219, + "learning_rate": 0.00014275357941158704, + "loss": 3.0929, + "step": 9481 + }, + { + "epoch": 0.8590518901044144, + "grad_norm": 0.8129732012748718, + "learning_rate": 0.00014274753821059627, + "loss": 2.936, + "step": 9482 + }, + { + "epoch": 0.8591424882788612, + "grad_norm": 0.7954509854316711, + "learning_rate": 0.0001427414970096055, + "loss": 2.9828, + "step": 9483 + }, + { + "epoch": 0.859233086453308, + "grad_norm": 0.8481436967849731, + "learning_rate": 0.00014273545580861476, + "loss": 2.8087, + "step": 9484 + }, + { + "epoch": 0.8593236846277548, + "grad_norm": 0.7950553894042969, + "learning_rate": 0.00014272941460762403, + "loss": 1.9498, + "step": 9485 + }, + { + "epoch": 0.8594142828022016, + "grad_norm": 0.7103852033615112, + "learning_rate": 0.00014272337340663323, + "loss": 1.8771, + "step": 9486 + }, + { + "epoch": 0.8595048809766483, + "grad_norm": 0.8522495031356812, + "learning_rate": 0.0001427173322056425, + "loss": 2.7109, + "step": 9487 + }, + { + "epoch": 0.8595954791510951, + "grad_norm": 0.8253291249275208, + "learning_rate": 0.00014271129100465173, + "loss": 2.8992, + "step": 9488 + }, + { + "epoch": 0.8596860773255419, + "grad_norm": 0.7327092885971069, + "learning_rate": 0.000142705249803661, + "loss": 2.1925, + "step": 9489 + }, + { + "epoch": 0.8597766754999887, + "grad_norm": 0.8035716414451599, + "learning_rate": 0.0001426992086026702, + "loss": 2.5944, + "step": 9490 + }, + { + "epoch": 0.8598672736744355, + "grad_norm": 0.7302771210670471, + "learning_rate": 0.00014269316740167946, + "loss": 2.1304, + "step": 9491 + }, + { + "epoch": 0.8599578718488823, + "grad_norm": 0.7909805178642273, + "learning_rate": 0.00014268712620068872, + "loss": 2.8114, + "step": 9492 + }, + { + "epoch": 0.8600484700233291, + "grad_norm": 0.8464141488075256, + "learning_rate": 0.00014268108499969795, + "loss": 2.5618, + "step": 9493 + }, + { + "epoch": 0.8601390681977759, + "grad_norm": 0.8869946599006653, + "learning_rate": 0.00014267504379870719, + "loss": 2.6005, + "step": 9494 + }, + { + "epoch": 0.8602296663722226, + "grad_norm": 0.8793910145759583, + "learning_rate": 0.00014266900259771642, + "loss": 2.6825, + "step": 9495 + }, + { + "epoch": 0.8603202645466694, + "grad_norm": 0.8671194911003113, + "learning_rate": 0.00014266296139672568, + "loss": 2.6493, + "step": 9496 + }, + { + "epoch": 0.8604108627211162, + "grad_norm": 0.781317949295044, + "learning_rate": 0.00014265692019573492, + "loss": 2.6514, + "step": 9497 + }, + { + "epoch": 0.860501460895563, + "grad_norm": 0.8462486863136292, + "learning_rate": 0.00014265087899474418, + "loss": 2.7503, + "step": 9498 + }, + { + "epoch": 0.8605920590700097, + "grad_norm": 0.8474532961845398, + "learning_rate": 0.00014264483779375338, + "loss": 2.9466, + "step": 9499 + }, + { + "epoch": 0.8606826572444565, + "grad_norm": 0.7727785706520081, + "learning_rate": 0.00014263879659276264, + "loss": 2.761, + "step": 9500 + }, + { + "epoch": 0.8607732554189033, + "grad_norm": 0.8134552240371704, + "learning_rate": 0.0001426327553917719, + "loss": 2.7596, + "step": 9501 + }, + { + "epoch": 0.86086385359335, + "grad_norm": 0.818004310131073, + "learning_rate": 0.00014262671419078114, + "loss": 2.6397, + "step": 9502 + }, + { + "epoch": 0.8609544517677968, + "grad_norm": 0.8901143670082092, + "learning_rate": 0.00014262067298979037, + "loss": 2.9338, + "step": 9503 + }, + { + "epoch": 0.8610450499422436, + "grad_norm": 0.8162528276443481, + "learning_rate": 0.0001426146317887996, + "loss": 3.0277, + "step": 9504 + }, + { + "epoch": 0.8611356481166904, + "grad_norm": 0.7902812957763672, + "learning_rate": 0.00014260859058780887, + "loss": 2.403, + "step": 9505 + }, + { + "epoch": 0.8612262462911372, + "grad_norm": 0.851402223110199, + "learning_rate": 0.0001426025493868181, + "loss": 2.9217, + "step": 9506 + }, + { + "epoch": 0.861316844465584, + "grad_norm": 0.6785312294960022, + "learning_rate": 0.00014259650818582734, + "loss": 2.0712, + "step": 9507 + }, + { + "epoch": 0.8614074426400308, + "grad_norm": 0.8388549089431763, + "learning_rate": 0.0001425904669848366, + "loss": 2.9416, + "step": 9508 + }, + { + "epoch": 0.8614980408144776, + "grad_norm": 0.8435834646224976, + "learning_rate": 0.00014258442578384583, + "loss": 3.0361, + "step": 9509 + }, + { + "epoch": 0.8615886389889243, + "grad_norm": 0.8021736741065979, + "learning_rate": 0.0001425783845828551, + "loss": 2.9141, + "step": 9510 + }, + { + "epoch": 0.8616792371633711, + "grad_norm": 0.7928245663642883, + "learning_rate": 0.00014257234338186433, + "loss": 2.6872, + "step": 9511 + }, + { + "epoch": 0.8617698353378179, + "grad_norm": 0.7738261222839355, + "learning_rate": 0.00014256630218087356, + "loss": 2.7635, + "step": 9512 + }, + { + "epoch": 0.8618604335122647, + "grad_norm": 0.7994831800460815, + "learning_rate": 0.0001425602609798828, + "loss": 3.1866, + "step": 9513 + }, + { + "epoch": 0.8619510316867115, + "grad_norm": 0.7698710560798645, + "learning_rate": 0.00014255421977889206, + "loss": 2.6449, + "step": 9514 + }, + { + "epoch": 0.8620416298611583, + "grad_norm": 0.7163615822792053, + "learning_rate": 0.0001425481785779013, + "loss": 2.6182, + "step": 9515 + }, + { + "epoch": 0.8621322280356051, + "grad_norm": 0.7654884457588196, + "learning_rate": 0.00014254213737691053, + "loss": 2.6673, + "step": 9516 + }, + { + "epoch": 0.8622228262100519, + "grad_norm": 0.8212865591049194, + "learning_rate": 0.00014253609617591979, + "loss": 2.7649, + "step": 9517 + }, + { + "epoch": 0.8623134243844987, + "grad_norm": 0.5577833652496338, + "learning_rate": 0.00014253005497492902, + "loss": 1.3069, + "step": 9518 + }, + { + "epoch": 0.8624040225589454, + "grad_norm": 0.7506940960884094, + "learning_rate": 0.00014252401377393828, + "loss": 2.8542, + "step": 9519 + }, + { + "epoch": 0.8624946207333922, + "grad_norm": 0.8024803400039673, + "learning_rate": 0.0001425179725729475, + "loss": 2.7313, + "step": 9520 + }, + { + "epoch": 0.862585218907839, + "grad_norm": 0.7174893617630005, + "learning_rate": 0.00014251193137195675, + "loss": 2.1742, + "step": 9521 + }, + { + "epoch": 0.8626758170822858, + "grad_norm": 0.7747166156768799, + "learning_rate": 0.000142505890170966, + "loss": 1.982, + "step": 9522 + }, + { + "epoch": 0.8627664152567326, + "grad_norm": 0.908970296382904, + "learning_rate": 0.00014249984896997524, + "loss": 2.8222, + "step": 9523 + }, + { + "epoch": 0.8628570134311794, + "grad_norm": 0.8268623352050781, + "learning_rate": 0.00014249380776898448, + "loss": 2.7688, + "step": 9524 + }, + { + "epoch": 0.8629476116056262, + "grad_norm": 0.7908074259757996, + "learning_rate": 0.0001424877665679937, + "loss": 2.6513, + "step": 9525 + }, + { + "epoch": 0.863038209780073, + "grad_norm": 0.816596508026123, + "learning_rate": 0.00014248172536700297, + "loss": 2.9247, + "step": 9526 + }, + { + "epoch": 0.8631288079545197, + "grad_norm": 0.8682738542556763, + "learning_rate": 0.0001424756841660122, + "loss": 2.7155, + "step": 9527 + }, + { + "epoch": 0.8632194061289665, + "grad_norm": 0.6703278422355652, + "learning_rate": 0.00014246964296502144, + "loss": 2.0152, + "step": 9528 + }, + { + "epoch": 0.8633100043034133, + "grad_norm": 0.8018489480018616, + "learning_rate": 0.00014246360176403068, + "loss": 3.0008, + "step": 9529 + }, + { + "epoch": 0.8634006024778601, + "grad_norm": 0.8123831748962402, + "learning_rate": 0.00014245756056303994, + "loss": 3.1274, + "step": 9530 + }, + { + "epoch": 0.8634912006523069, + "grad_norm": 0.8013458847999573, + "learning_rate": 0.0001424515193620492, + "loss": 2.7795, + "step": 9531 + }, + { + "epoch": 0.8635817988267537, + "grad_norm": 0.8142326474189758, + "learning_rate": 0.00014244547816105843, + "loss": 2.6891, + "step": 9532 + }, + { + "epoch": 0.8636723970012005, + "grad_norm": 0.8245141506195068, + "learning_rate": 0.00014243943696006767, + "loss": 2.8128, + "step": 9533 + }, + { + "epoch": 0.8637629951756473, + "grad_norm": 0.8857367634773254, + "learning_rate": 0.0001424333957590769, + "loss": 2.7707, + "step": 9534 + }, + { + "epoch": 0.863853593350094, + "grad_norm": 0.7721795439720154, + "learning_rate": 0.00014242735455808616, + "loss": 2.6498, + "step": 9535 + }, + { + "epoch": 0.8639441915245408, + "grad_norm": 0.7655408978462219, + "learning_rate": 0.0001424213133570954, + "loss": 2.891, + "step": 9536 + }, + { + "epoch": 0.8640347896989876, + "grad_norm": 0.8212078213691711, + "learning_rate": 0.00014241527215610463, + "loss": 2.8066, + "step": 9537 + }, + { + "epoch": 0.8641253878734344, + "grad_norm": 0.8675088286399841, + "learning_rate": 0.0001424092309551139, + "loss": 2.8005, + "step": 9538 + }, + { + "epoch": 0.8642159860478812, + "grad_norm": 0.7904995679855347, + "learning_rate": 0.00014240318975412313, + "loss": 2.7918, + "step": 9539 + }, + { + "epoch": 0.8643065842223279, + "grad_norm": 0.8158859014511108, + "learning_rate": 0.00014239714855313239, + "loss": 2.9977, + "step": 9540 + }, + { + "epoch": 0.8643971823967747, + "grad_norm": 0.8319217562675476, + "learning_rate": 0.0001423911073521416, + "loss": 2.8791, + "step": 9541 + }, + { + "epoch": 0.8644877805712214, + "grad_norm": 0.7324163913726807, + "learning_rate": 0.00014238506615115085, + "loss": 2.8411, + "step": 9542 + }, + { + "epoch": 0.8645783787456682, + "grad_norm": 0.8522471785545349, + "learning_rate": 0.0001423790249501601, + "loss": 2.866, + "step": 9543 + }, + { + "epoch": 0.864668976920115, + "grad_norm": 0.734438419342041, + "learning_rate": 0.00014237298374916935, + "loss": 2.6628, + "step": 9544 + }, + { + "epoch": 0.8647595750945618, + "grad_norm": 0.8037766814231873, + "learning_rate": 0.00014236694254817858, + "loss": 3.2206, + "step": 9545 + }, + { + "epoch": 0.8648501732690086, + "grad_norm": 0.8552380204200745, + "learning_rate": 0.00014236090134718782, + "loss": 2.7472, + "step": 9546 + }, + { + "epoch": 0.8649407714434554, + "grad_norm": 0.7860184907913208, + "learning_rate": 0.00014235486014619708, + "loss": 2.9066, + "step": 9547 + }, + { + "epoch": 0.8650313696179022, + "grad_norm": 0.7820489406585693, + "learning_rate": 0.0001423488189452063, + "loss": 2.7213, + "step": 9548 + }, + { + "epoch": 0.865121967792349, + "grad_norm": 0.7476001381874084, + "learning_rate": 0.00014234277774421557, + "loss": 2.7397, + "step": 9549 + }, + { + "epoch": 0.8652125659667957, + "grad_norm": 0.9059840440750122, + "learning_rate": 0.00014233673654322478, + "loss": 2.8994, + "step": 9550 + }, + { + "epoch": 0.8653031641412425, + "grad_norm": 0.8015578985214233, + "learning_rate": 0.00014233069534223404, + "loss": 2.7124, + "step": 9551 + }, + { + "epoch": 0.8653937623156893, + "grad_norm": 0.7785775661468506, + "learning_rate": 0.0001423246541412433, + "loss": 2.8111, + "step": 9552 + }, + { + "epoch": 0.8654843604901361, + "grad_norm": 0.7649271488189697, + "learning_rate": 0.00014231861294025254, + "loss": 2.9665, + "step": 9553 + }, + { + "epoch": 0.8655749586645829, + "grad_norm": 0.757064163684845, + "learning_rate": 0.00014231257173926177, + "loss": 2.8083, + "step": 9554 + }, + { + "epoch": 0.8656655568390297, + "grad_norm": 0.7195934653282166, + "learning_rate": 0.000142306530538271, + "loss": 2.0543, + "step": 9555 + }, + { + "epoch": 0.8657561550134765, + "grad_norm": 0.8233635425567627, + "learning_rate": 0.00014230048933728027, + "loss": 3.0589, + "step": 9556 + }, + { + "epoch": 0.8658467531879233, + "grad_norm": 0.805649995803833, + "learning_rate": 0.0001422944481362895, + "loss": 2.7264, + "step": 9557 + }, + { + "epoch": 0.86593735136237, + "grad_norm": 0.9526685476303101, + "learning_rate": 0.00014228840693529873, + "loss": 2.7345, + "step": 9558 + }, + { + "epoch": 0.8660279495368168, + "grad_norm": 0.7037516236305237, + "learning_rate": 0.00014228236573430797, + "loss": 2.1286, + "step": 9559 + }, + { + "epoch": 0.8661185477112636, + "grad_norm": 0.8365904688835144, + "learning_rate": 0.00014227632453331723, + "loss": 2.7148, + "step": 9560 + }, + { + "epoch": 0.8662091458857104, + "grad_norm": 0.8363618850708008, + "learning_rate": 0.0001422702833323265, + "loss": 2.587, + "step": 9561 + }, + { + "epoch": 0.8662997440601572, + "grad_norm": 0.8184176683425903, + "learning_rate": 0.00014226424213133573, + "loss": 2.6393, + "step": 9562 + }, + { + "epoch": 0.866390342234604, + "grad_norm": 0.8814058303833008, + "learning_rate": 0.00014225820093034496, + "loss": 2.5103, + "step": 9563 + }, + { + "epoch": 0.8664809404090508, + "grad_norm": 0.8119888305664062, + "learning_rate": 0.0001422521597293542, + "loss": 2.4726, + "step": 9564 + }, + { + "epoch": 0.8665715385834976, + "grad_norm": 0.6888571381568909, + "learning_rate": 0.00014224611852836345, + "loss": 2.0833, + "step": 9565 + }, + { + "epoch": 0.8666621367579443, + "grad_norm": 0.8415398001670837, + "learning_rate": 0.0001422400773273727, + "loss": 2.7867, + "step": 9566 + }, + { + "epoch": 0.8667527349323911, + "grad_norm": 0.7184413075447083, + "learning_rate": 0.00014223403612638192, + "loss": 2.0841, + "step": 9567 + }, + { + "epoch": 0.8668433331068379, + "grad_norm": 0.7023499011993408, + "learning_rate": 0.00014222799492539118, + "loss": 2.2016, + "step": 9568 + }, + { + "epoch": 0.8669339312812847, + "grad_norm": 0.7881538271903992, + "learning_rate": 0.00014222195372440042, + "loss": 2.8576, + "step": 9569 + }, + { + "epoch": 0.8670245294557315, + "grad_norm": 0.8575051426887512, + "learning_rate": 0.00014221591252340968, + "loss": 2.6787, + "step": 9570 + }, + { + "epoch": 0.8671151276301783, + "grad_norm": 0.8595358729362488, + "learning_rate": 0.00014220987132241889, + "loss": 2.8363, + "step": 9571 + }, + { + "epoch": 0.8672057258046251, + "grad_norm": 0.8397266268730164, + "learning_rate": 0.00014220383012142815, + "loss": 2.8297, + "step": 9572 + }, + { + "epoch": 0.8672963239790719, + "grad_norm": 0.8360785841941833, + "learning_rate": 0.00014219778892043738, + "loss": 2.8849, + "step": 9573 + }, + { + "epoch": 0.8673869221535186, + "grad_norm": 0.8730611205101013, + "learning_rate": 0.00014219174771944664, + "loss": 3.2971, + "step": 9574 + }, + { + "epoch": 0.8674775203279654, + "grad_norm": 0.8388612866401672, + "learning_rate": 0.00014218570651845588, + "loss": 2.6081, + "step": 9575 + }, + { + "epoch": 0.8675681185024122, + "grad_norm": 0.8062427043914795, + "learning_rate": 0.0001421796653174651, + "loss": 2.6794, + "step": 9576 + }, + { + "epoch": 0.867658716676859, + "grad_norm": 0.7990761995315552, + "learning_rate": 0.00014217362411647437, + "loss": 2.7745, + "step": 9577 + }, + { + "epoch": 0.8677493148513058, + "grad_norm": 0.8428146243095398, + "learning_rate": 0.0001421675829154836, + "loss": 2.7894, + "step": 9578 + }, + { + "epoch": 0.8678399130257526, + "grad_norm": 0.7593300342559814, + "learning_rate": 0.00014216154171449284, + "loss": 2.4161, + "step": 9579 + }, + { + "epoch": 0.8679305112001993, + "grad_norm": 0.7533334493637085, + "learning_rate": 0.00014215550051350207, + "loss": 2.5382, + "step": 9580 + }, + { + "epoch": 0.868021109374646, + "grad_norm": 0.7903574705123901, + "learning_rate": 0.00014214945931251133, + "loss": 2.5837, + "step": 9581 + }, + { + "epoch": 0.8681117075490928, + "grad_norm": 0.8275593519210815, + "learning_rate": 0.0001421434181115206, + "loss": 2.7844, + "step": 9582 + }, + { + "epoch": 0.8682023057235396, + "grad_norm": 0.8754494786262512, + "learning_rate": 0.00014213737691052983, + "loss": 2.5039, + "step": 9583 + }, + { + "epoch": 0.8682929038979864, + "grad_norm": 0.799170732498169, + "learning_rate": 0.00014213133570953906, + "loss": 2.7531, + "step": 9584 + }, + { + "epoch": 0.8683835020724332, + "grad_norm": 0.8190925717353821, + "learning_rate": 0.0001421252945085483, + "loss": 2.8544, + "step": 9585 + }, + { + "epoch": 0.86847410024688, + "grad_norm": 0.8364723324775696, + "learning_rate": 0.00014211925330755756, + "loss": 2.5943, + "step": 9586 + }, + { + "epoch": 0.8685646984213268, + "grad_norm": 0.9432142972946167, + "learning_rate": 0.0001421132121065668, + "loss": 2.8027, + "step": 9587 + }, + { + "epoch": 0.8686552965957736, + "grad_norm": 0.7988717555999756, + "learning_rate": 0.00014210717090557603, + "loss": 3.0022, + "step": 9588 + }, + { + "epoch": 0.8687458947702204, + "grad_norm": 0.8677662014961243, + "learning_rate": 0.00014210112970458526, + "loss": 2.8217, + "step": 9589 + }, + { + "epoch": 0.8688364929446671, + "grad_norm": 0.7626563906669617, + "learning_rate": 0.00014209508850359452, + "loss": 2.5034, + "step": 9590 + }, + { + "epoch": 0.8689270911191139, + "grad_norm": 0.8109798431396484, + "learning_rate": 0.00014208904730260378, + "loss": 2.9314, + "step": 9591 + }, + { + "epoch": 0.8690176892935607, + "grad_norm": 0.8136438727378845, + "learning_rate": 0.000142083006101613, + "loss": 2.7514, + "step": 9592 + }, + { + "epoch": 0.8691082874680075, + "grad_norm": 0.6758728623390198, + "learning_rate": 0.00014207696490062225, + "loss": 2.0528, + "step": 9593 + }, + { + "epoch": 0.8691988856424543, + "grad_norm": 0.7869331240653992, + "learning_rate": 0.00014207092369963149, + "loss": 2.9087, + "step": 9594 + }, + { + "epoch": 0.8692894838169011, + "grad_norm": 0.7925480008125305, + "learning_rate": 0.00014206488249864075, + "loss": 2.7012, + "step": 9595 + }, + { + "epoch": 0.8693800819913479, + "grad_norm": 0.8059089183807373, + "learning_rate": 0.00014205884129764998, + "loss": 2.842, + "step": 9596 + }, + { + "epoch": 0.8694706801657947, + "grad_norm": 0.8969958424568176, + "learning_rate": 0.00014205280009665922, + "loss": 2.7956, + "step": 9597 + }, + { + "epoch": 0.8695612783402414, + "grad_norm": 0.7773688435554504, + "learning_rate": 0.00014204675889566848, + "loss": 2.8018, + "step": 9598 + }, + { + "epoch": 0.8696518765146882, + "grad_norm": 0.8061697483062744, + "learning_rate": 0.0001420407176946777, + "loss": 2.997, + "step": 9599 + }, + { + "epoch": 0.869742474689135, + "grad_norm": 0.8793430924415588, + "learning_rate": 0.00014203467649368694, + "loss": 3.2179, + "step": 9600 + }, + { + "epoch": 0.8698330728635818, + "grad_norm": 0.7923629879951477, + "learning_rate": 0.00014202863529269618, + "loss": 2.8332, + "step": 9601 + }, + { + "epoch": 0.8699236710380286, + "grad_norm": 0.7766503691673279, + "learning_rate": 0.00014202259409170544, + "loss": 2.9226, + "step": 9602 + }, + { + "epoch": 0.8700142692124754, + "grad_norm": 0.8376980423927307, + "learning_rate": 0.00014201655289071467, + "loss": 2.7024, + "step": 9603 + }, + { + "epoch": 0.8701048673869222, + "grad_norm": 0.7005520462989807, + "learning_rate": 0.00014201051168972393, + "loss": 2.0719, + "step": 9604 + }, + { + "epoch": 0.870195465561369, + "grad_norm": 0.8905799984931946, + "learning_rate": 0.00014200447048873317, + "loss": 2.8453, + "step": 9605 + }, + { + "epoch": 0.8702860637358157, + "grad_norm": 0.8001392483711243, + "learning_rate": 0.0001419984292877424, + "loss": 2.7229, + "step": 9606 + }, + { + "epoch": 0.8703766619102625, + "grad_norm": 0.7766175866127014, + "learning_rate": 0.00014199238808675166, + "loss": 2.6906, + "step": 9607 + }, + { + "epoch": 0.8704672600847093, + "grad_norm": 0.7631382346153259, + "learning_rate": 0.0001419863468857609, + "loss": 2.7037, + "step": 9608 + }, + { + "epoch": 0.8705578582591561, + "grad_norm": 0.7964637875556946, + "learning_rate": 0.00014198030568477013, + "loss": 2.8397, + "step": 9609 + }, + { + "epoch": 0.8706484564336029, + "grad_norm": 0.7890156507492065, + "learning_rate": 0.00014197426448377937, + "loss": 2.573, + "step": 9610 + }, + { + "epoch": 0.8707390546080497, + "grad_norm": 0.8625876307487488, + "learning_rate": 0.00014196822328278863, + "loss": 2.9655, + "step": 9611 + }, + { + "epoch": 0.8708296527824965, + "grad_norm": 0.8323901891708374, + "learning_rate": 0.0001419621820817979, + "loss": 2.8199, + "step": 9612 + }, + { + "epoch": 0.8709202509569433, + "grad_norm": 0.7479987740516663, + "learning_rate": 0.0001419561408808071, + "loss": 2.2116, + "step": 9613 + }, + { + "epoch": 0.87101084913139, + "grad_norm": 0.8114891648292542, + "learning_rate": 0.00014195009967981636, + "loss": 2.7834, + "step": 9614 + }, + { + "epoch": 0.8711014473058368, + "grad_norm": 0.7707242369651794, + "learning_rate": 0.0001419440584788256, + "loss": 2.8718, + "step": 9615 + }, + { + "epoch": 0.8711920454802836, + "grad_norm": 0.830639123916626, + "learning_rate": 0.00014193801727783485, + "loss": 2.8989, + "step": 9616 + }, + { + "epoch": 0.8712826436547304, + "grad_norm": 0.7857653498649597, + "learning_rate": 0.00014193197607684409, + "loss": 2.7723, + "step": 9617 + }, + { + "epoch": 0.8713732418291772, + "grad_norm": 0.8886507749557495, + "learning_rate": 0.00014192593487585332, + "loss": 3.0791, + "step": 9618 + }, + { + "epoch": 0.871463840003624, + "grad_norm": 0.7993385791778564, + "learning_rate": 0.00014191989367486255, + "loss": 2.9562, + "step": 9619 + }, + { + "epoch": 0.8715544381780708, + "grad_norm": 0.7996087670326233, + "learning_rate": 0.00014191385247387182, + "loss": 2.9251, + "step": 9620 + }, + { + "epoch": 0.8716450363525174, + "grad_norm": 0.7264648079872131, + "learning_rate": 0.00014190781127288108, + "loss": 2.7039, + "step": 9621 + }, + { + "epoch": 0.8717356345269642, + "grad_norm": 0.7738354802131653, + "learning_rate": 0.00014190177007189028, + "loss": 2.6936, + "step": 9622 + }, + { + "epoch": 0.871826232701411, + "grad_norm": 0.7622865438461304, + "learning_rate": 0.00014189572887089954, + "loss": 2.8214, + "step": 9623 + }, + { + "epoch": 0.8719168308758578, + "grad_norm": 0.7964497804641724, + "learning_rate": 0.00014188968766990878, + "loss": 2.94, + "step": 9624 + }, + { + "epoch": 0.8720074290503046, + "grad_norm": 0.7742416858673096, + "learning_rate": 0.00014188364646891804, + "loss": 2.6464, + "step": 9625 + }, + { + "epoch": 0.8720980272247514, + "grad_norm": 0.8390318751335144, + "learning_rate": 0.00014187760526792727, + "loss": 2.5444, + "step": 9626 + }, + { + "epoch": 0.8721886253991982, + "grad_norm": 0.7666927576065063, + "learning_rate": 0.0001418715640669365, + "loss": 2.7097, + "step": 9627 + }, + { + "epoch": 0.872279223573645, + "grad_norm": 0.8693661093711853, + "learning_rate": 0.00014186552286594577, + "loss": 2.4105, + "step": 9628 + }, + { + "epoch": 0.8723698217480917, + "grad_norm": 0.8693742156028748, + "learning_rate": 0.000141859481664955, + "loss": 2.701, + "step": 9629 + }, + { + "epoch": 0.8724604199225385, + "grad_norm": 0.7468125820159912, + "learning_rate": 0.00014185344046396424, + "loss": 2.7316, + "step": 9630 + }, + { + "epoch": 0.8725510180969853, + "grad_norm": 0.8933172821998596, + "learning_rate": 0.00014184739926297347, + "loss": 2.9559, + "step": 9631 + }, + { + "epoch": 0.8726416162714321, + "grad_norm": 0.8727932572364807, + "learning_rate": 0.00014184135806198273, + "loss": 2.9038, + "step": 9632 + }, + { + "epoch": 0.8727322144458789, + "grad_norm": 0.7810438871383667, + "learning_rate": 0.00014183531686099197, + "loss": 2.8984, + "step": 9633 + }, + { + "epoch": 0.8728228126203257, + "grad_norm": 0.7910609841346741, + "learning_rate": 0.00014182927566000123, + "loss": 2.6406, + "step": 9634 + }, + { + "epoch": 0.8729134107947725, + "grad_norm": 0.7387962937355042, + "learning_rate": 0.00014182323445901046, + "loss": 2.2122, + "step": 9635 + }, + { + "epoch": 0.8730040089692193, + "grad_norm": 0.8519683480262756, + "learning_rate": 0.0001418171932580197, + "loss": 2.6827, + "step": 9636 + }, + { + "epoch": 0.873094607143666, + "grad_norm": 0.8583543300628662, + "learning_rate": 0.00014181115205702896, + "loss": 3.1111, + "step": 9637 + }, + { + "epoch": 0.8731852053181128, + "grad_norm": 0.7806146144866943, + "learning_rate": 0.0001418051108560382, + "loss": 2.9089, + "step": 9638 + }, + { + "epoch": 0.8732758034925596, + "grad_norm": 0.8262740969657898, + "learning_rate": 0.00014179906965504742, + "loss": 2.8392, + "step": 9639 + }, + { + "epoch": 0.8733664016670064, + "grad_norm": 0.8051215410232544, + "learning_rate": 0.00014179302845405666, + "loss": 2.9451, + "step": 9640 + }, + { + "epoch": 0.8734569998414532, + "grad_norm": 0.7795888185501099, + "learning_rate": 0.00014178698725306592, + "loss": 2.5521, + "step": 9641 + }, + { + "epoch": 0.8735475980159, + "grad_norm": 0.7975374460220337, + "learning_rate": 0.00014178094605207518, + "loss": 2.8748, + "step": 9642 + }, + { + "epoch": 0.8736381961903468, + "grad_norm": 0.8135979175567627, + "learning_rate": 0.0001417749048510844, + "loss": 2.7077, + "step": 9643 + }, + { + "epoch": 0.8737287943647936, + "grad_norm": 0.7838068604469299, + "learning_rate": 0.00014176886365009365, + "loss": 2.705, + "step": 9644 + }, + { + "epoch": 0.8738193925392403, + "grad_norm": 0.8319929838180542, + "learning_rate": 0.00014176282244910288, + "loss": 2.8516, + "step": 9645 + }, + { + "epoch": 0.8739099907136871, + "grad_norm": 0.7699160575866699, + "learning_rate": 0.00014175678124811214, + "loss": 2.7733, + "step": 9646 + }, + { + "epoch": 0.8740005888881339, + "grad_norm": 0.7466673851013184, + "learning_rate": 0.00014175074004712138, + "loss": 2.8071, + "step": 9647 + }, + { + "epoch": 0.8740911870625807, + "grad_norm": 0.8591762781143188, + "learning_rate": 0.0001417446988461306, + "loss": 2.9426, + "step": 9648 + }, + { + "epoch": 0.8741817852370275, + "grad_norm": 0.8429707884788513, + "learning_rate": 0.00014173865764513985, + "loss": 2.9198, + "step": 9649 + }, + { + "epoch": 0.8742723834114743, + "grad_norm": 0.7589640021324158, + "learning_rate": 0.0001417326164441491, + "loss": 2.7525, + "step": 9650 + }, + { + "epoch": 0.8743629815859211, + "grad_norm": 0.7472631335258484, + "learning_rate": 0.00014172657524315834, + "loss": 2.6971, + "step": 9651 + }, + { + "epoch": 0.8744535797603679, + "grad_norm": 0.7962272763252258, + "learning_rate": 0.00014172053404216758, + "loss": 2.8679, + "step": 9652 + }, + { + "epoch": 0.8745441779348146, + "grad_norm": 0.8406625390052795, + "learning_rate": 0.00014171449284117684, + "loss": 2.6916, + "step": 9653 + }, + { + "epoch": 0.8746347761092614, + "grad_norm": 0.7659062147140503, + "learning_rate": 0.00014170845164018607, + "loss": 2.8201, + "step": 9654 + }, + { + "epoch": 0.8747253742837082, + "grad_norm": 0.708608090877533, + "learning_rate": 0.00014170241043919533, + "loss": 1.9662, + "step": 9655 + }, + { + "epoch": 0.874815972458155, + "grad_norm": 0.8067208528518677, + "learning_rate": 0.00014169636923820454, + "loss": 2.9453, + "step": 9656 + }, + { + "epoch": 0.8749065706326018, + "grad_norm": 0.8217226266860962, + "learning_rate": 0.0001416903280372138, + "loss": 2.9867, + "step": 9657 + }, + { + "epoch": 0.8749971688070486, + "grad_norm": 0.6782970428466797, + "learning_rate": 0.00014168428683622306, + "loss": 2.0675, + "step": 9658 + }, + { + "epoch": 0.8750877669814954, + "grad_norm": 0.8624693155288696, + "learning_rate": 0.0001416782456352323, + "loss": 3.0144, + "step": 9659 + }, + { + "epoch": 0.8751783651559422, + "grad_norm": 0.8324472308158875, + "learning_rate": 0.00014167220443424153, + "loss": 2.8747, + "step": 9660 + }, + { + "epoch": 0.8752689633303888, + "grad_norm": 0.7785320281982422, + "learning_rate": 0.00014166616323325076, + "loss": 2.9486, + "step": 9661 + }, + { + "epoch": 0.8753595615048356, + "grad_norm": 0.832991361618042, + "learning_rate": 0.00014166012203226003, + "loss": 3.0712, + "step": 9662 + }, + { + "epoch": 0.8754501596792824, + "grad_norm": 0.7871313095092773, + "learning_rate": 0.00014165408083126926, + "loss": 2.6158, + "step": 9663 + }, + { + "epoch": 0.8755407578537292, + "grad_norm": 0.8300356268882751, + "learning_rate": 0.0001416480396302785, + "loss": 3.0319, + "step": 9664 + }, + { + "epoch": 0.875631356028176, + "grad_norm": 0.8554440140724182, + "learning_rate": 0.00014164199842928775, + "loss": 2.7214, + "step": 9665 + }, + { + "epoch": 0.8757219542026228, + "grad_norm": 0.7397758960723877, + "learning_rate": 0.000141635957228297, + "loss": 2.8896, + "step": 9666 + }, + { + "epoch": 0.8758125523770696, + "grad_norm": 0.7562670707702637, + "learning_rate": 0.00014162991602730625, + "loss": 2.8309, + "step": 9667 + }, + { + "epoch": 0.8759031505515164, + "grad_norm": 0.8407593965530396, + "learning_rate": 0.00014162387482631548, + "loss": 2.5007, + "step": 9668 + }, + { + "epoch": 0.8759937487259631, + "grad_norm": 0.7607198357582092, + "learning_rate": 0.00014161783362532472, + "loss": 2.522, + "step": 9669 + }, + { + "epoch": 0.8760843469004099, + "grad_norm": 0.7495636940002441, + "learning_rate": 0.00014161179242433395, + "loss": 2.7513, + "step": 9670 + }, + { + "epoch": 0.8761749450748567, + "grad_norm": 0.8490250110626221, + "learning_rate": 0.0001416057512233432, + "loss": 2.9075, + "step": 9671 + }, + { + "epoch": 0.8762655432493035, + "grad_norm": 0.7676758766174316, + "learning_rate": 0.00014159971002235247, + "loss": 2.8204, + "step": 9672 + }, + { + "epoch": 0.8763561414237503, + "grad_norm": 0.7726947665214539, + "learning_rate": 0.00014159366882136168, + "loss": 2.7754, + "step": 9673 + }, + { + "epoch": 0.8764467395981971, + "grad_norm": 0.7923010587692261, + "learning_rate": 0.00014158762762037094, + "loss": 2.6357, + "step": 9674 + }, + { + "epoch": 0.8765373377726439, + "grad_norm": 0.8249114155769348, + "learning_rate": 0.00014158158641938018, + "loss": 2.9445, + "step": 9675 + }, + { + "epoch": 0.8766279359470907, + "grad_norm": 0.8493127822875977, + "learning_rate": 0.00014157554521838944, + "loss": 2.9305, + "step": 9676 + }, + { + "epoch": 0.8767185341215374, + "grad_norm": 0.8105508685112, + "learning_rate": 0.00014156950401739864, + "loss": 2.8791, + "step": 9677 + }, + { + "epoch": 0.8768091322959842, + "grad_norm": 0.7916189432144165, + "learning_rate": 0.0001415634628164079, + "loss": 2.7262, + "step": 9678 + }, + { + "epoch": 0.876899730470431, + "grad_norm": 0.8068888783454895, + "learning_rate": 0.00014155742161541714, + "loss": 2.9545, + "step": 9679 + }, + { + "epoch": 0.8769903286448778, + "grad_norm": 0.8423900008201599, + "learning_rate": 0.0001415513804144264, + "loss": 2.729, + "step": 9680 + }, + { + "epoch": 0.8770809268193246, + "grad_norm": 0.8333113789558411, + "learning_rate": 0.00014154533921343563, + "loss": 2.9993, + "step": 9681 + }, + { + "epoch": 0.8771715249937714, + "grad_norm": 0.8302508592605591, + "learning_rate": 0.00014153929801244487, + "loss": 2.8928, + "step": 9682 + }, + { + "epoch": 0.8772621231682182, + "grad_norm": 0.7919431924819946, + "learning_rate": 0.00014153325681145413, + "loss": 2.5701, + "step": 9683 + }, + { + "epoch": 0.877352721342665, + "grad_norm": 0.8562088012695312, + "learning_rate": 0.00014152721561046336, + "loss": 2.8207, + "step": 9684 + }, + { + "epoch": 0.8774433195171117, + "grad_norm": 0.831650972366333, + "learning_rate": 0.00014152117440947263, + "loss": 2.8199, + "step": 9685 + }, + { + "epoch": 0.8775339176915585, + "grad_norm": 0.8518459796905518, + "learning_rate": 0.00014151513320848183, + "loss": 3.1217, + "step": 9686 + }, + { + "epoch": 0.8776245158660053, + "grad_norm": 0.8243225812911987, + "learning_rate": 0.0001415090920074911, + "loss": 2.8022, + "step": 9687 + }, + { + "epoch": 0.8777151140404521, + "grad_norm": 0.8538419604301453, + "learning_rate": 0.00014150305080650035, + "loss": 2.9199, + "step": 9688 + }, + { + "epoch": 0.8778057122148989, + "grad_norm": 0.7444364428520203, + "learning_rate": 0.0001414970096055096, + "loss": 2.7302, + "step": 9689 + }, + { + "epoch": 0.8778963103893457, + "grad_norm": 0.7697076201438904, + "learning_rate": 0.00014149096840451882, + "loss": 2.8104, + "step": 9690 + }, + { + "epoch": 0.8779869085637925, + "grad_norm": 0.8081661462783813, + "learning_rate": 0.00014148492720352806, + "loss": 2.8833, + "step": 9691 + }, + { + "epoch": 0.8780775067382393, + "grad_norm": 0.7909315824508667, + "learning_rate": 0.00014147888600253732, + "loss": 3.0109, + "step": 9692 + }, + { + "epoch": 0.878168104912686, + "grad_norm": 0.7773188948631287, + "learning_rate": 0.00014147284480154655, + "loss": 2.675, + "step": 9693 + }, + { + "epoch": 0.8782587030871328, + "grad_norm": 0.8261553049087524, + "learning_rate": 0.00014146680360055579, + "loss": 2.7276, + "step": 9694 + }, + { + "epoch": 0.8783493012615796, + "grad_norm": 0.7353509068489075, + "learning_rate": 0.00014146076239956505, + "loss": 2.62, + "step": 9695 + }, + { + "epoch": 0.8784398994360264, + "grad_norm": 0.8173173069953918, + "learning_rate": 0.00014145472119857428, + "loss": 2.8152, + "step": 9696 + }, + { + "epoch": 0.8785304976104732, + "grad_norm": 0.8873192667961121, + "learning_rate": 0.00014144867999758354, + "loss": 2.7812, + "step": 9697 + }, + { + "epoch": 0.87862109578492, + "grad_norm": 0.8132085800170898, + "learning_rate": 0.00014144263879659278, + "loss": 3.0356, + "step": 9698 + }, + { + "epoch": 0.8787116939593668, + "grad_norm": 0.6600408554077148, + "learning_rate": 0.000141436597595602, + "loss": 2.0974, + "step": 9699 + }, + { + "epoch": 0.8788022921338136, + "grad_norm": 0.8063369989395142, + "learning_rate": 0.00014143055639461124, + "loss": 2.7799, + "step": 9700 + }, + { + "epoch": 0.8788928903082603, + "grad_norm": 0.7686138153076172, + "learning_rate": 0.0001414245151936205, + "loss": 2.7827, + "step": 9701 + }, + { + "epoch": 0.878983488482707, + "grad_norm": 0.8052294254302979, + "learning_rate": 0.00014141847399262974, + "loss": 3.0261, + "step": 9702 + }, + { + "epoch": 0.8790740866571538, + "grad_norm": 0.7884116768836975, + "learning_rate": 0.00014141243279163897, + "loss": 2.3011, + "step": 9703 + }, + { + "epoch": 0.8791646848316006, + "grad_norm": 0.776469349861145, + "learning_rate": 0.00014140639159064823, + "loss": 2.8091, + "step": 9704 + }, + { + "epoch": 0.8792552830060474, + "grad_norm": 0.8402460217475891, + "learning_rate": 0.00014140035038965747, + "loss": 2.6636, + "step": 9705 + }, + { + "epoch": 0.8793458811804942, + "grad_norm": 0.7840914130210876, + "learning_rate": 0.00014139430918866673, + "loss": 2.9336, + "step": 9706 + }, + { + "epoch": 0.879436479354941, + "grad_norm": 0.762371838092804, + "learning_rate": 0.00014138826798767594, + "loss": 2.1602, + "step": 9707 + }, + { + "epoch": 0.8795270775293877, + "grad_norm": 0.7985847592353821, + "learning_rate": 0.0001413822267866852, + "loss": 2.9164, + "step": 9708 + }, + { + "epoch": 0.8796176757038345, + "grad_norm": 0.7025455832481384, + "learning_rate": 0.00014137618558569446, + "loss": 2.4218, + "step": 9709 + }, + { + "epoch": 0.8797082738782813, + "grad_norm": 0.8260303139686584, + "learning_rate": 0.0001413701443847037, + "loss": 2.715, + "step": 9710 + }, + { + "epoch": 0.8797988720527281, + "grad_norm": 0.7222891449928284, + "learning_rate": 0.00014136410318371293, + "loss": 2.1478, + "step": 9711 + }, + { + "epoch": 0.8798894702271749, + "grad_norm": 0.8050627112388611, + "learning_rate": 0.00014135806198272216, + "loss": 2.5653, + "step": 9712 + }, + { + "epoch": 0.8799800684016217, + "grad_norm": 0.7915892004966736, + "learning_rate": 0.00014135202078173142, + "loss": 2.5754, + "step": 9713 + }, + { + "epoch": 0.8800706665760685, + "grad_norm": 0.7752402424812317, + "learning_rate": 0.00014134597958074066, + "loss": 2.5779, + "step": 9714 + }, + { + "epoch": 0.8801612647505153, + "grad_norm": 0.7850690484046936, + "learning_rate": 0.0001413399383797499, + "loss": 2.7616, + "step": 9715 + }, + { + "epoch": 0.880251862924962, + "grad_norm": 0.7128453254699707, + "learning_rate": 0.00014133389717875912, + "loss": 2.1174, + "step": 9716 + }, + { + "epoch": 0.8803424610994088, + "grad_norm": 0.7740228772163391, + "learning_rate": 0.00014132785597776839, + "loss": 2.7758, + "step": 9717 + }, + { + "epoch": 0.8804330592738556, + "grad_norm": 0.7970761656761169, + "learning_rate": 0.00014132181477677765, + "loss": 2.7654, + "step": 9718 + }, + { + "epoch": 0.8805236574483024, + "grad_norm": 0.843267560005188, + "learning_rate": 0.00014131577357578688, + "loss": 2.7764, + "step": 9719 + }, + { + "epoch": 0.8806142556227492, + "grad_norm": 0.7667818665504456, + "learning_rate": 0.00014130973237479612, + "loss": 2.7241, + "step": 9720 + }, + { + "epoch": 0.880704853797196, + "grad_norm": 0.9174322485923767, + "learning_rate": 0.00014130369117380535, + "loss": 2.9417, + "step": 9721 + }, + { + "epoch": 0.8807954519716428, + "grad_norm": 0.8143362998962402, + "learning_rate": 0.0001412976499728146, + "loss": 2.8148, + "step": 9722 + }, + { + "epoch": 0.8808860501460896, + "grad_norm": 0.7787153124809265, + "learning_rate": 0.00014129160877182384, + "loss": 2.3812, + "step": 9723 + }, + { + "epoch": 0.8809766483205363, + "grad_norm": 0.8160692453384399, + "learning_rate": 0.00014128556757083308, + "loss": 2.8376, + "step": 9724 + }, + { + "epoch": 0.8810672464949831, + "grad_norm": 0.7616212368011475, + "learning_rate": 0.00014127952636984234, + "loss": 2.5742, + "step": 9725 + }, + { + "epoch": 0.8811578446694299, + "grad_norm": 0.7901712656021118, + "learning_rate": 0.00014127348516885157, + "loss": 2.9221, + "step": 9726 + }, + { + "epoch": 0.8812484428438767, + "grad_norm": 0.8093090653419495, + "learning_rate": 0.00014126744396786083, + "loss": 2.6367, + "step": 9727 + }, + { + "epoch": 0.8813390410183235, + "grad_norm": 0.8012028932571411, + "learning_rate": 0.00014126140276687004, + "loss": 2.6125, + "step": 9728 + }, + { + "epoch": 0.8814296391927703, + "grad_norm": 0.8079915642738342, + "learning_rate": 0.0001412553615658793, + "loss": 2.8831, + "step": 9729 + }, + { + "epoch": 0.8815202373672171, + "grad_norm": 0.8412514925003052, + "learning_rate": 0.00014124932036488854, + "loss": 2.7478, + "step": 9730 + }, + { + "epoch": 0.8816108355416639, + "grad_norm": 0.902840793132782, + "learning_rate": 0.0001412432791638978, + "loss": 2.6471, + "step": 9731 + }, + { + "epoch": 0.8817014337161106, + "grad_norm": 0.8517336249351501, + "learning_rate": 0.00014123723796290703, + "loss": 2.9704, + "step": 9732 + }, + { + "epoch": 0.8817920318905574, + "grad_norm": 0.788766086101532, + "learning_rate": 0.00014123119676191627, + "loss": 2.7352, + "step": 9733 + }, + { + "epoch": 0.8818826300650042, + "grad_norm": 0.8120980858802795, + "learning_rate": 0.00014122515556092553, + "loss": 2.6266, + "step": 9734 + }, + { + "epoch": 0.881973228239451, + "grad_norm": 0.7888132929801941, + "learning_rate": 0.00014121911435993476, + "loss": 2.5502, + "step": 9735 + }, + { + "epoch": 0.8820638264138978, + "grad_norm": 0.8071483373641968, + "learning_rate": 0.00014121307315894402, + "loss": 2.7726, + "step": 9736 + }, + { + "epoch": 0.8821544245883446, + "grad_norm": 0.7891906499862671, + "learning_rate": 0.00014120703195795323, + "loss": 2.6331, + "step": 9737 + }, + { + "epoch": 0.8822450227627914, + "grad_norm": 0.8034350275993347, + "learning_rate": 0.0001412009907569625, + "loss": 2.4963, + "step": 9738 + }, + { + "epoch": 0.8823356209372382, + "grad_norm": 0.7794338464736938, + "learning_rate": 0.00014119494955597175, + "loss": 2.8329, + "step": 9739 + }, + { + "epoch": 0.882426219111685, + "grad_norm": 0.8114681839942932, + "learning_rate": 0.00014118890835498099, + "loss": 2.6819, + "step": 9740 + }, + { + "epoch": 0.8825168172861317, + "grad_norm": 0.8515303730964661, + "learning_rate": 0.00014118286715399022, + "loss": 2.9559, + "step": 9741 + }, + { + "epoch": 0.8826074154605784, + "grad_norm": 0.8760316371917725, + "learning_rate": 0.00014117682595299945, + "loss": 2.9459, + "step": 9742 + }, + { + "epoch": 0.8826980136350252, + "grad_norm": 0.7310296893119812, + "learning_rate": 0.00014117078475200872, + "loss": 2.5141, + "step": 9743 + }, + { + "epoch": 0.882788611809472, + "grad_norm": 0.8243328332901001, + "learning_rate": 0.00014116474355101795, + "loss": 2.86, + "step": 9744 + }, + { + "epoch": 0.8828792099839188, + "grad_norm": 0.8119776248931885, + "learning_rate": 0.00014115870235002718, + "loss": 2.6816, + "step": 9745 + }, + { + "epoch": 0.8829698081583656, + "grad_norm": 0.7896768450737, + "learning_rate": 0.00014115266114903642, + "loss": 2.8043, + "step": 9746 + }, + { + "epoch": 0.8830604063328124, + "grad_norm": 0.7979839444160461, + "learning_rate": 0.00014114661994804568, + "loss": 2.8811, + "step": 9747 + }, + { + "epoch": 0.8831510045072591, + "grad_norm": 0.810397744178772, + "learning_rate": 0.00014114057874705494, + "loss": 2.7314, + "step": 9748 + }, + { + "epoch": 0.8832416026817059, + "grad_norm": 0.7761694192886353, + "learning_rate": 0.00014113453754606417, + "loss": 2.1805, + "step": 9749 + }, + { + "epoch": 0.8833322008561527, + "grad_norm": 0.8328813910484314, + "learning_rate": 0.0001411284963450734, + "loss": 2.8126, + "step": 9750 + }, + { + "epoch": 0.8834227990305995, + "grad_norm": 1.0176329612731934, + "learning_rate": 0.00014112245514408264, + "loss": 2.9375, + "step": 9751 + }, + { + "epoch": 0.8835133972050463, + "grad_norm": 0.8498592972755432, + "learning_rate": 0.0001411164139430919, + "loss": 2.6852, + "step": 9752 + }, + { + "epoch": 0.8836039953794931, + "grad_norm": 0.8162945508956909, + "learning_rate": 0.00014111037274210114, + "loss": 2.6633, + "step": 9753 + }, + { + "epoch": 0.8836945935539399, + "grad_norm": 0.7699756026268005, + "learning_rate": 0.00014110433154111037, + "loss": 2.6781, + "step": 9754 + }, + { + "epoch": 0.8837851917283867, + "grad_norm": 0.803999125957489, + "learning_rate": 0.00014109829034011963, + "loss": 2.6435, + "step": 9755 + }, + { + "epoch": 0.8838757899028334, + "grad_norm": 0.8100943565368652, + "learning_rate": 0.00014109224913912887, + "loss": 2.8147, + "step": 9756 + }, + { + "epoch": 0.8839663880772802, + "grad_norm": 0.7806283235549927, + "learning_rate": 0.00014108620793813813, + "loss": 2.5553, + "step": 9757 + }, + { + "epoch": 0.884056986251727, + "grad_norm": 0.8045893311500549, + "learning_rate": 0.00014108016673714733, + "loss": 2.8002, + "step": 9758 + }, + { + "epoch": 0.8841475844261738, + "grad_norm": 0.7842176556587219, + "learning_rate": 0.0001410741255361566, + "loss": 2.8377, + "step": 9759 + }, + { + "epoch": 0.8842381826006206, + "grad_norm": 0.7632573843002319, + "learning_rate": 0.00014106808433516583, + "loss": 2.3292, + "step": 9760 + }, + { + "epoch": 0.8843287807750674, + "grad_norm": 0.8223870992660522, + "learning_rate": 0.0001410620431341751, + "loss": 2.9936, + "step": 9761 + }, + { + "epoch": 0.8844193789495142, + "grad_norm": 0.6689080595970154, + "learning_rate": 0.00014105600193318432, + "loss": 2.1035, + "step": 9762 + }, + { + "epoch": 0.884509977123961, + "grad_norm": 0.8111570477485657, + "learning_rate": 0.00014104996073219356, + "loss": 2.8389, + "step": 9763 + }, + { + "epoch": 0.8846005752984077, + "grad_norm": 0.853700578212738, + "learning_rate": 0.00014104391953120282, + "loss": 2.3572, + "step": 9764 + }, + { + "epoch": 0.8846911734728545, + "grad_norm": 0.7698506116867065, + "learning_rate": 0.00014103787833021205, + "loss": 2.6821, + "step": 9765 + }, + { + "epoch": 0.8847817716473013, + "grad_norm": 0.8424315452575684, + "learning_rate": 0.0001410318371292213, + "loss": 2.8748, + "step": 9766 + }, + { + "epoch": 0.8848723698217481, + "grad_norm": 0.7955506443977356, + "learning_rate": 0.00014102579592823052, + "loss": 2.7999, + "step": 9767 + }, + { + "epoch": 0.8849629679961949, + "grad_norm": 0.8285724520683289, + "learning_rate": 0.00014101975472723978, + "loss": 2.6759, + "step": 9768 + }, + { + "epoch": 0.8850535661706417, + "grad_norm": 0.8557848930358887, + "learning_rate": 0.00014101371352624904, + "loss": 2.7143, + "step": 9769 + }, + { + "epoch": 0.8851441643450885, + "grad_norm": 0.7338418960571289, + "learning_rate": 0.00014100767232525828, + "loss": 2.5693, + "step": 9770 + }, + { + "epoch": 0.8852347625195353, + "grad_norm": 0.7693625092506409, + "learning_rate": 0.0001410016311242675, + "loss": 2.7721, + "step": 9771 + }, + { + "epoch": 0.885325360693982, + "grad_norm": 0.784203827381134, + "learning_rate": 0.00014099558992327675, + "loss": 2.821, + "step": 9772 + }, + { + "epoch": 0.8854159588684288, + "grad_norm": 0.8048550486564636, + "learning_rate": 0.000140989548722286, + "loss": 2.9094, + "step": 9773 + }, + { + "epoch": 0.8855065570428756, + "grad_norm": 0.8700568079948425, + "learning_rate": 0.00014098350752129524, + "loss": 2.8514, + "step": 9774 + }, + { + "epoch": 0.8855971552173224, + "grad_norm": 0.8230111002922058, + "learning_rate": 0.00014097746632030448, + "loss": 2.6038, + "step": 9775 + }, + { + "epoch": 0.8856877533917692, + "grad_norm": 0.8824648261070251, + "learning_rate": 0.0001409714251193137, + "loss": 2.1502, + "step": 9776 + }, + { + "epoch": 0.885778351566216, + "grad_norm": 0.7976526618003845, + "learning_rate": 0.00014096538391832297, + "loss": 2.638, + "step": 9777 + }, + { + "epoch": 0.8858689497406628, + "grad_norm": 0.776227593421936, + "learning_rate": 0.00014095934271733223, + "loss": 2.8869, + "step": 9778 + }, + { + "epoch": 0.8859595479151096, + "grad_norm": 0.8186699748039246, + "learning_rate": 0.00014095330151634144, + "loss": 2.6606, + "step": 9779 + }, + { + "epoch": 0.8860501460895563, + "grad_norm": 0.8019995093345642, + "learning_rate": 0.0001409472603153507, + "loss": 2.6664, + "step": 9780 + }, + { + "epoch": 0.8861407442640031, + "grad_norm": 0.8129714131355286, + "learning_rate": 0.00014094121911435993, + "loss": 2.7132, + "step": 9781 + }, + { + "epoch": 0.8862313424384499, + "grad_norm": 0.7726123332977295, + "learning_rate": 0.0001409351779133692, + "loss": 2.6485, + "step": 9782 + }, + { + "epoch": 0.8863219406128966, + "grad_norm": 0.8457036018371582, + "learning_rate": 0.00014092913671237843, + "loss": 2.707, + "step": 9783 + }, + { + "epoch": 0.8864125387873434, + "grad_norm": 0.8332813382148743, + "learning_rate": 0.00014092309551138766, + "loss": 2.6374, + "step": 9784 + }, + { + "epoch": 0.8865031369617902, + "grad_norm": 0.7957324385643005, + "learning_rate": 0.00014091705431039692, + "loss": 2.8074, + "step": 9785 + }, + { + "epoch": 0.886593735136237, + "grad_norm": 0.8486992716789246, + "learning_rate": 0.00014091101310940616, + "loss": 2.8392, + "step": 9786 + }, + { + "epoch": 0.8866843333106837, + "grad_norm": 0.7459009289741516, + "learning_rate": 0.0001409049719084154, + "loss": 2.5289, + "step": 9787 + }, + { + "epoch": 0.8867749314851305, + "grad_norm": 0.7450244426727295, + "learning_rate": 0.00014089893070742463, + "loss": 2.611, + "step": 9788 + }, + { + "epoch": 0.8868655296595773, + "grad_norm": 0.810703694820404, + "learning_rate": 0.0001408928895064339, + "loss": 2.1111, + "step": 9789 + }, + { + "epoch": 0.8869561278340241, + "grad_norm": 0.8002599477767944, + "learning_rate": 0.00014088684830544312, + "loss": 2.7812, + "step": 9790 + }, + { + "epoch": 0.8870467260084709, + "grad_norm": 0.8693192601203918, + "learning_rate": 0.00014088080710445238, + "loss": 2.9144, + "step": 9791 + }, + { + "epoch": 0.8871373241829177, + "grad_norm": 0.840619683265686, + "learning_rate": 0.00014087476590346162, + "loss": 2.8211, + "step": 9792 + }, + { + "epoch": 0.8872279223573645, + "grad_norm": 0.9818710684776306, + "learning_rate": 0.00014086872470247085, + "loss": 2.8533, + "step": 9793 + }, + { + "epoch": 0.8873185205318113, + "grad_norm": 0.8130396008491516, + "learning_rate": 0.0001408626835014801, + "loss": 2.7782, + "step": 9794 + }, + { + "epoch": 0.887409118706258, + "grad_norm": 0.7969170212745667, + "learning_rate": 0.00014085664230048935, + "loss": 2.5776, + "step": 9795 + }, + { + "epoch": 0.8874997168807048, + "grad_norm": 0.8229281902313232, + "learning_rate": 0.00014085060109949858, + "loss": 2.5735, + "step": 9796 + }, + { + "epoch": 0.8875903150551516, + "grad_norm": 0.8382211327552795, + "learning_rate": 0.00014084455989850781, + "loss": 2.7136, + "step": 9797 + }, + { + "epoch": 0.8876809132295984, + "grad_norm": 0.7679049968719482, + "learning_rate": 0.00014083851869751708, + "loss": 2.5842, + "step": 9798 + }, + { + "epoch": 0.8877715114040452, + "grad_norm": 0.8327851891517639, + "learning_rate": 0.00014083247749652634, + "loss": 2.7586, + "step": 9799 + }, + { + "epoch": 0.887862109578492, + "grad_norm": 0.8133258819580078, + "learning_rate": 0.00014082643629553554, + "loss": 2.8715, + "step": 9800 + }, + { + "epoch": 0.8879527077529388, + "grad_norm": 0.79029381275177, + "learning_rate": 0.0001408203950945448, + "loss": 2.8568, + "step": 9801 + }, + { + "epoch": 0.8880433059273856, + "grad_norm": 0.7626277804374695, + "learning_rate": 0.00014081435389355404, + "loss": 2.6662, + "step": 9802 + }, + { + "epoch": 0.8881339041018323, + "grad_norm": 0.8000333905220032, + "learning_rate": 0.0001408083126925633, + "loss": 2.7996, + "step": 9803 + }, + { + "epoch": 0.8882245022762791, + "grad_norm": 0.8660078644752502, + "learning_rate": 0.00014080227149157253, + "loss": 2.6116, + "step": 9804 + }, + { + "epoch": 0.8883151004507259, + "grad_norm": 0.7127832770347595, + "learning_rate": 0.00014079623029058177, + "loss": 2.1685, + "step": 9805 + }, + { + "epoch": 0.8884056986251727, + "grad_norm": 0.8204120397567749, + "learning_rate": 0.000140790189089591, + "loss": 2.5742, + "step": 9806 + }, + { + "epoch": 0.8884962967996195, + "grad_norm": 0.8697447776794434, + "learning_rate": 0.00014078414788860026, + "loss": 2.8773, + "step": 9807 + }, + { + "epoch": 0.8885868949740663, + "grad_norm": 0.8658682703971863, + "learning_rate": 0.00014077810668760952, + "loss": 2.64, + "step": 9808 + }, + { + "epoch": 0.8886774931485131, + "grad_norm": 0.7785741686820984, + "learning_rate": 0.00014077206548661873, + "loss": 2.6626, + "step": 9809 + }, + { + "epoch": 0.8887680913229599, + "grad_norm": 0.8166967630386353, + "learning_rate": 0.000140766024285628, + "loss": 2.8846, + "step": 9810 + }, + { + "epoch": 0.8888586894974067, + "grad_norm": 0.8885951638221741, + "learning_rate": 0.00014075998308463723, + "loss": 2.6963, + "step": 9811 + }, + { + "epoch": 0.8889492876718534, + "grad_norm": 0.8094418048858643, + "learning_rate": 0.0001407539418836465, + "loss": 2.795, + "step": 9812 + }, + { + "epoch": 0.8890398858463002, + "grad_norm": 0.7973811626434326, + "learning_rate": 0.00014074790068265572, + "loss": 2.7073, + "step": 9813 + }, + { + "epoch": 0.889130484020747, + "grad_norm": 0.7658677101135254, + "learning_rate": 0.00014074185948166496, + "loss": 2.6795, + "step": 9814 + }, + { + "epoch": 0.8892210821951938, + "grad_norm": 0.9012757539749146, + "learning_rate": 0.00014073581828067422, + "loss": 2.8778, + "step": 9815 + }, + { + "epoch": 0.8893116803696406, + "grad_norm": 0.811065137386322, + "learning_rate": 0.00014072977707968345, + "loss": 2.8203, + "step": 9816 + }, + { + "epoch": 0.8894022785440874, + "grad_norm": 0.8612082600593567, + "learning_rate": 0.00014072373587869269, + "loss": 3.0508, + "step": 9817 + }, + { + "epoch": 0.8894928767185342, + "grad_norm": 0.8099370002746582, + "learning_rate": 0.00014071769467770192, + "loss": 2.6035, + "step": 9818 + }, + { + "epoch": 0.889583474892981, + "grad_norm": 0.7954690456390381, + "learning_rate": 0.00014071165347671118, + "loss": 2.6836, + "step": 9819 + }, + { + "epoch": 0.8896740730674277, + "grad_norm": 0.7998084425926208, + "learning_rate": 0.00014070561227572041, + "loss": 2.8705, + "step": 9820 + }, + { + "epoch": 0.8897646712418745, + "grad_norm": 0.8255806565284729, + "learning_rate": 0.00014069957107472968, + "loss": 3.0292, + "step": 9821 + }, + { + "epoch": 0.8898552694163213, + "grad_norm": 0.8033653497695923, + "learning_rate": 0.0001406935298737389, + "loss": 2.8657, + "step": 9822 + }, + { + "epoch": 0.889945867590768, + "grad_norm": 0.7498171925544739, + "learning_rate": 0.00014068748867274814, + "loss": 2.7173, + "step": 9823 + }, + { + "epoch": 0.8900364657652148, + "grad_norm": 0.7402768731117249, + "learning_rate": 0.0001406814474717574, + "loss": 2.7619, + "step": 9824 + }, + { + "epoch": 0.8901270639396616, + "grad_norm": 0.841932475566864, + "learning_rate": 0.00014067540627076664, + "loss": 3.1311, + "step": 9825 + }, + { + "epoch": 0.8902176621141084, + "grad_norm": 0.7863238453865051, + "learning_rate": 0.00014066936506977587, + "loss": 2.6317, + "step": 9826 + }, + { + "epoch": 0.8903082602885551, + "grad_norm": 0.8550273776054382, + "learning_rate": 0.0001406633238687851, + "loss": 3.0142, + "step": 9827 + }, + { + "epoch": 0.8903988584630019, + "grad_norm": 0.766310453414917, + "learning_rate": 0.00014065728266779437, + "loss": 1.9508, + "step": 9828 + }, + { + "epoch": 0.8904894566374487, + "grad_norm": 0.8098750114440918, + "learning_rate": 0.00014065124146680363, + "loss": 3.0009, + "step": 9829 + }, + { + "epoch": 0.8905800548118955, + "grad_norm": 0.7945453524589539, + "learning_rate": 0.00014064520026581284, + "loss": 2.9395, + "step": 9830 + }, + { + "epoch": 0.8906706529863423, + "grad_norm": 0.7096872925758362, + "learning_rate": 0.0001406391590648221, + "loss": 1.8215, + "step": 9831 + }, + { + "epoch": 0.8907612511607891, + "grad_norm": 0.8620094656944275, + "learning_rate": 0.00014063311786383133, + "loss": 3.2325, + "step": 9832 + }, + { + "epoch": 0.8908518493352359, + "grad_norm": 0.8133580684661865, + "learning_rate": 0.0001406270766628406, + "loss": 2.4765, + "step": 9833 + }, + { + "epoch": 0.8909424475096827, + "grad_norm": 0.8364594578742981, + "learning_rate": 0.00014062103546184983, + "loss": 2.7765, + "step": 9834 + }, + { + "epoch": 0.8910330456841294, + "grad_norm": 0.78807133436203, + "learning_rate": 0.00014061499426085906, + "loss": 2.7647, + "step": 9835 + }, + { + "epoch": 0.8911236438585762, + "grad_norm": 0.7597033977508545, + "learning_rate": 0.0001406089530598683, + "loss": 2.7765, + "step": 9836 + }, + { + "epoch": 0.891214242033023, + "grad_norm": 0.80785071849823, + "learning_rate": 0.00014060291185887756, + "loss": 2.1056, + "step": 9837 + }, + { + "epoch": 0.8913048402074698, + "grad_norm": 0.8208746910095215, + "learning_rate": 0.0001405968706578868, + "loss": 2.7829, + "step": 9838 + }, + { + "epoch": 0.8913954383819166, + "grad_norm": 0.8779385685920715, + "learning_rate": 0.00014059082945689602, + "loss": 2.6969, + "step": 9839 + }, + { + "epoch": 0.8914860365563634, + "grad_norm": 0.7932726740837097, + "learning_rate": 0.00014058478825590529, + "loss": 2.7677, + "step": 9840 + }, + { + "epoch": 0.8915766347308102, + "grad_norm": 0.9112422466278076, + "learning_rate": 0.00014057874705491452, + "loss": 2.7456, + "step": 9841 + }, + { + "epoch": 0.891667232905257, + "grad_norm": 0.7664920687675476, + "learning_rate": 0.00014057270585392378, + "loss": 2.7721, + "step": 9842 + }, + { + "epoch": 0.8917578310797037, + "grad_norm": 0.8425889611244202, + "learning_rate": 0.000140566664652933, + "loss": 2.8923, + "step": 9843 + }, + { + "epoch": 0.8918484292541505, + "grad_norm": 0.795072078704834, + "learning_rate": 0.00014056062345194225, + "loss": 2.8811, + "step": 9844 + }, + { + "epoch": 0.8919390274285973, + "grad_norm": 0.855478048324585, + "learning_rate": 0.0001405545822509515, + "loss": 2.6841, + "step": 9845 + }, + { + "epoch": 0.8920296256030441, + "grad_norm": 0.8194999694824219, + "learning_rate": 0.00014054854104996074, + "loss": 2.9545, + "step": 9846 + }, + { + "epoch": 0.8921202237774909, + "grad_norm": 0.7788867354393005, + "learning_rate": 0.00014054249984896998, + "loss": 2.697, + "step": 9847 + }, + { + "epoch": 0.8922108219519377, + "grad_norm": 0.7471581697463989, + "learning_rate": 0.0001405364586479792, + "loss": 2.5422, + "step": 9848 + }, + { + "epoch": 0.8923014201263845, + "grad_norm": 0.7731968760490417, + "learning_rate": 0.00014053041744698847, + "loss": 2.8421, + "step": 9849 + }, + { + "epoch": 0.8923920183008313, + "grad_norm": 0.867813229560852, + "learning_rate": 0.0001405243762459977, + "loss": 2.8788, + "step": 9850 + }, + { + "epoch": 0.892482616475278, + "grad_norm": 0.8538320064544678, + "learning_rate": 0.00014051833504500694, + "loss": 2.5546, + "step": 9851 + }, + { + "epoch": 0.8925732146497248, + "grad_norm": 0.7824851870536804, + "learning_rate": 0.0001405122938440162, + "loss": 2.5934, + "step": 9852 + }, + { + "epoch": 0.8926638128241716, + "grad_norm": 0.8508328199386597, + "learning_rate": 0.00014050625264302544, + "loss": 3.0313, + "step": 9853 + }, + { + "epoch": 0.8927544109986184, + "grad_norm": 0.7797449231147766, + "learning_rate": 0.0001405002114420347, + "loss": 2.6264, + "step": 9854 + }, + { + "epoch": 0.8928450091730652, + "grad_norm": 0.7721361517906189, + "learning_rate": 0.00014049417024104393, + "loss": 3.0909, + "step": 9855 + }, + { + "epoch": 0.892935607347512, + "grad_norm": 0.7707027792930603, + "learning_rate": 0.00014048812904005317, + "loss": 2.5315, + "step": 9856 + }, + { + "epoch": 0.8930262055219588, + "grad_norm": 0.7249805331230164, + "learning_rate": 0.0001404820878390624, + "loss": 2.2445, + "step": 9857 + }, + { + "epoch": 0.8931168036964056, + "grad_norm": 0.8453980684280396, + "learning_rate": 0.00014047604663807166, + "loss": 2.687, + "step": 9858 + }, + { + "epoch": 0.8932074018708523, + "grad_norm": 0.7967514395713806, + "learning_rate": 0.00014047000543708092, + "loss": 2.6541, + "step": 9859 + }, + { + "epoch": 0.8932980000452991, + "grad_norm": 0.7732813358306885, + "learning_rate": 0.00014046396423609013, + "loss": 2.7822, + "step": 9860 + }, + { + "epoch": 0.8933885982197459, + "grad_norm": 0.7608214616775513, + "learning_rate": 0.0001404579230350994, + "loss": 2.5668, + "step": 9861 + }, + { + "epoch": 0.8934791963941927, + "grad_norm": 0.7745378017425537, + "learning_rate": 0.00014045188183410862, + "loss": 2.9133, + "step": 9862 + }, + { + "epoch": 0.8935697945686395, + "grad_norm": 0.7804350256919861, + "learning_rate": 0.00014044584063311789, + "loss": 2.5899, + "step": 9863 + }, + { + "epoch": 0.8936603927430862, + "grad_norm": 0.8539773225784302, + "learning_rate": 0.0001404397994321271, + "loss": 3.0064, + "step": 9864 + }, + { + "epoch": 0.893750990917533, + "grad_norm": 0.7613780498504639, + "learning_rate": 0.00014043375823113635, + "loss": 2.9737, + "step": 9865 + }, + { + "epoch": 0.8938415890919797, + "grad_norm": 0.7930357456207275, + "learning_rate": 0.0001404277170301456, + "loss": 2.74, + "step": 9866 + }, + { + "epoch": 0.8939321872664265, + "grad_norm": 0.8488595485687256, + "learning_rate": 0.00014042167582915485, + "loss": 2.7956, + "step": 9867 + }, + { + "epoch": 0.8940227854408733, + "grad_norm": 0.8760315179824829, + "learning_rate": 0.00014041563462816408, + "loss": 2.6459, + "step": 9868 + }, + { + "epoch": 0.8941133836153201, + "grad_norm": 0.8037399649620056, + "learning_rate": 0.00014040959342717332, + "loss": 2.8052, + "step": 9869 + }, + { + "epoch": 0.8942039817897669, + "grad_norm": 0.7796853184700012, + "learning_rate": 0.00014040355222618258, + "loss": 2.7024, + "step": 9870 + }, + { + "epoch": 0.8942945799642137, + "grad_norm": 0.7770442962646484, + "learning_rate": 0.0001403975110251918, + "loss": 2.7961, + "step": 9871 + }, + { + "epoch": 0.8943851781386605, + "grad_norm": 0.8146138787269592, + "learning_rate": 0.00014039146982420107, + "loss": 2.832, + "step": 9872 + }, + { + "epoch": 0.8944757763131073, + "grad_norm": 0.7691928744316101, + "learning_rate": 0.00014038542862321028, + "loss": 2.7362, + "step": 9873 + }, + { + "epoch": 0.894566374487554, + "grad_norm": 0.8209981918334961, + "learning_rate": 0.00014037938742221954, + "loss": 2.6475, + "step": 9874 + }, + { + "epoch": 0.8946569726620008, + "grad_norm": 0.8606634140014648, + "learning_rate": 0.0001403733462212288, + "loss": 3.0523, + "step": 9875 + }, + { + "epoch": 0.8947475708364476, + "grad_norm": 0.8618209958076477, + "learning_rate": 0.00014036730502023804, + "loss": 2.8875, + "step": 9876 + }, + { + "epoch": 0.8948381690108944, + "grad_norm": 0.8387846350669861, + "learning_rate": 0.00014036126381924727, + "loss": 2.9054, + "step": 9877 + }, + { + "epoch": 0.8949287671853412, + "grad_norm": 0.7822557687759399, + "learning_rate": 0.0001403552226182565, + "loss": 2.7498, + "step": 9878 + }, + { + "epoch": 0.895019365359788, + "grad_norm": 0.7810636758804321, + "learning_rate": 0.00014034918141726577, + "loss": 2.5696, + "step": 9879 + }, + { + "epoch": 0.8951099635342348, + "grad_norm": 0.8107747435569763, + "learning_rate": 0.000140343140216275, + "loss": 3.038, + "step": 9880 + }, + { + "epoch": 0.8952005617086816, + "grad_norm": 0.7820066809654236, + "learning_rate": 0.00014033709901528423, + "loss": 2.7545, + "step": 9881 + }, + { + "epoch": 0.8952911598831284, + "grad_norm": 0.6406939029693604, + "learning_rate": 0.0001403310578142935, + "loss": 1.3255, + "step": 9882 + }, + { + "epoch": 0.8953817580575751, + "grad_norm": 0.7750917673110962, + "learning_rate": 0.00014032501661330273, + "loss": 2.8373, + "step": 9883 + }, + { + "epoch": 0.8954723562320219, + "grad_norm": 0.7757347822189331, + "learning_rate": 0.000140318975412312, + "loss": 2.6228, + "step": 9884 + }, + { + "epoch": 0.8955629544064687, + "grad_norm": 0.8739996552467346, + "learning_rate": 0.00014031293421132122, + "loss": 2.9304, + "step": 9885 + }, + { + "epoch": 0.8956535525809155, + "grad_norm": 0.6847730278968811, + "learning_rate": 0.00014030689301033046, + "loss": 2.132, + "step": 9886 + }, + { + "epoch": 0.8957441507553623, + "grad_norm": 0.8702399730682373, + "learning_rate": 0.0001403008518093397, + "loss": 2.6914, + "step": 9887 + }, + { + "epoch": 0.8958347489298091, + "grad_norm": 0.8346466422080994, + "learning_rate": 0.00014029481060834895, + "loss": 2.7735, + "step": 9888 + }, + { + "epoch": 0.8959253471042559, + "grad_norm": 0.7706197500228882, + "learning_rate": 0.0001402887694073582, + "loss": 2.6112, + "step": 9889 + }, + { + "epoch": 0.8960159452787027, + "grad_norm": 0.8217064142227173, + "learning_rate": 0.00014028272820636742, + "loss": 2.8071, + "step": 9890 + }, + { + "epoch": 0.8961065434531494, + "grad_norm": 0.764337420463562, + "learning_rate": 0.00014027668700537668, + "loss": 2.4018, + "step": 9891 + }, + { + "epoch": 0.8961971416275962, + "grad_norm": 0.7987123727798462, + "learning_rate": 0.00014027064580438592, + "loss": 2.9608, + "step": 9892 + }, + { + "epoch": 0.896287739802043, + "grad_norm": 0.8429111242294312, + "learning_rate": 0.00014026460460339518, + "loss": 2.7915, + "step": 9893 + }, + { + "epoch": 0.8963783379764898, + "grad_norm": 0.689128041267395, + "learning_rate": 0.00014025856340240439, + "loss": 2.158, + "step": 9894 + }, + { + "epoch": 0.8964689361509366, + "grad_norm": 0.8653247356414795, + "learning_rate": 0.00014025252220141365, + "loss": 2.9897, + "step": 9895 + }, + { + "epoch": 0.8965595343253834, + "grad_norm": 0.784744918346405, + "learning_rate": 0.00014024648100042288, + "loss": 2.9152, + "step": 9896 + }, + { + "epoch": 0.8966501324998302, + "grad_norm": 0.8698143362998962, + "learning_rate": 0.00014024043979943214, + "loss": 2.9185, + "step": 9897 + }, + { + "epoch": 0.896740730674277, + "grad_norm": 0.8147143721580505, + "learning_rate": 0.00014023439859844138, + "loss": 2.811, + "step": 9898 + }, + { + "epoch": 0.8968313288487237, + "grad_norm": 0.8492542505264282, + "learning_rate": 0.0001402283573974506, + "loss": 2.7203, + "step": 9899 + }, + { + "epoch": 0.8969219270231705, + "grad_norm": 0.7746832370758057, + "learning_rate": 0.00014022231619645987, + "loss": 2.5286, + "step": 9900 + }, + { + "epoch": 0.8970125251976173, + "grad_norm": 0.82454514503479, + "learning_rate": 0.0001402162749954691, + "loss": 2.9548, + "step": 9901 + }, + { + "epoch": 0.8971031233720641, + "grad_norm": 0.7578312754631042, + "learning_rate": 0.00014021023379447834, + "loss": 2.6232, + "step": 9902 + }, + { + "epoch": 0.8971937215465109, + "grad_norm": 0.8078757524490356, + "learning_rate": 0.00014020419259348757, + "loss": 2.9737, + "step": 9903 + }, + { + "epoch": 0.8972843197209576, + "grad_norm": 0.8970105051994324, + "learning_rate": 0.00014019815139249683, + "loss": 2.6242, + "step": 9904 + }, + { + "epoch": 0.8973749178954044, + "grad_norm": 0.8030933737754822, + "learning_rate": 0.0001401921101915061, + "loss": 2.7293, + "step": 9905 + }, + { + "epoch": 0.8974655160698511, + "grad_norm": 0.8145158886909485, + "learning_rate": 0.00014018606899051533, + "loss": 2.467, + "step": 9906 + }, + { + "epoch": 0.8975561142442979, + "grad_norm": 0.8089193105697632, + "learning_rate": 0.00014018002778952456, + "loss": 2.986, + "step": 9907 + }, + { + "epoch": 0.8976467124187447, + "grad_norm": 0.8085559606552124, + "learning_rate": 0.0001401739865885338, + "loss": 2.8958, + "step": 9908 + }, + { + "epoch": 0.8977373105931915, + "grad_norm": 0.9149196147918701, + "learning_rate": 0.00014016794538754306, + "loss": 2.9932, + "step": 9909 + }, + { + "epoch": 0.8978279087676383, + "grad_norm": 0.7954002618789673, + "learning_rate": 0.0001401619041865523, + "loss": 2.8826, + "step": 9910 + }, + { + "epoch": 0.8979185069420851, + "grad_norm": 0.8471199870109558, + "learning_rate": 0.00014015586298556153, + "loss": 2.7439, + "step": 9911 + }, + { + "epoch": 0.8980091051165319, + "grad_norm": 0.8194413185119629, + "learning_rate": 0.0001401498217845708, + "loss": 2.7874, + "step": 9912 + }, + { + "epoch": 0.8980997032909787, + "grad_norm": 0.841766357421875, + "learning_rate": 0.00014014378058358002, + "loss": 2.6629, + "step": 9913 + }, + { + "epoch": 0.8981903014654254, + "grad_norm": 0.8807939291000366, + "learning_rate": 0.00014013773938258928, + "loss": 2.8981, + "step": 9914 + }, + { + "epoch": 0.8982808996398722, + "grad_norm": 0.7892494201660156, + "learning_rate": 0.0001401316981815985, + "loss": 2.7186, + "step": 9915 + }, + { + "epoch": 0.898371497814319, + "grad_norm": 0.7040775418281555, + "learning_rate": 0.00014012565698060775, + "loss": 2.3654, + "step": 9916 + }, + { + "epoch": 0.8984620959887658, + "grad_norm": 0.8041091561317444, + "learning_rate": 0.00014011961577961699, + "loss": 2.7079, + "step": 9917 + }, + { + "epoch": 0.8985526941632126, + "grad_norm": 0.8075357675552368, + "learning_rate": 0.00014011357457862625, + "loss": 2.7705, + "step": 9918 + }, + { + "epoch": 0.8986432923376594, + "grad_norm": 0.8068675994873047, + "learning_rate": 0.00014010753337763548, + "loss": 2.8454, + "step": 9919 + }, + { + "epoch": 0.8987338905121062, + "grad_norm": 0.8144665956497192, + "learning_rate": 0.00014010149217664471, + "loss": 2.8047, + "step": 9920 + }, + { + "epoch": 0.898824488686553, + "grad_norm": 0.7385048270225525, + "learning_rate": 0.00014009545097565398, + "loss": 2.0173, + "step": 9921 + }, + { + "epoch": 0.8989150868609997, + "grad_norm": 0.7846535444259644, + "learning_rate": 0.0001400894097746632, + "loss": 2.6685, + "step": 9922 + }, + { + "epoch": 0.8990056850354465, + "grad_norm": 0.7903302907943726, + "learning_rate": 0.00014008336857367247, + "loss": 2.4105, + "step": 9923 + }, + { + "epoch": 0.8990962832098933, + "grad_norm": 0.7933082580566406, + "learning_rate": 0.00014007732737268168, + "loss": 2.6798, + "step": 9924 + }, + { + "epoch": 0.8991868813843401, + "grad_norm": 0.9237964749336243, + "learning_rate": 0.00014007128617169094, + "loss": 2.8198, + "step": 9925 + }, + { + "epoch": 0.8992774795587869, + "grad_norm": 0.7633142471313477, + "learning_rate": 0.00014006524497070017, + "loss": 2.6612, + "step": 9926 + }, + { + "epoch": 0.8993680777332337, + "grad_norm": 0.8446702361106873, + "learning_rate": 0.00014005920376970943, + "loss": 2.8492, + "step": 9927 + }, + { + "epoch": 0.8994586759076805, + "grad_norm": 0.7863032817840576, + "learning_rate": 0.00014005316256871867, + "loss": 2.7586, + "step": 9928 + }, + { + "epoch": 0.8995492740821273, + "grad_norm": 0.6786091923713684, + "learning_rate": 0.0001400471213677279, + "loss": 2.0967, + "step": 9929 + }, + { + "epoch": 0.899639872256574, + "grad_norm": 0.810210108757019, + "learning_rate": 0.00014004108016673716, + "loss": 2.5948, + "step": 9930 + }, + { + "epoch": 0.8997304704310208, + "grad_norm": 0.7332531809806824, + "learning_rate": 0.0001400350389657464, + "loss": 2.6881, + "step": 9931 + }, + { + "epoch": 0.8998210686054676, + "grad_norm": 0.8346866369247437, + "learning_rate": 0.00014002899776475563, + "loss": 3.0493, + "step": 9932 + }, + { + "epoch": 0.8999116667799144, + "grad_norm": 0.7475476264953613, + "learning_rate": 0.00014002295656376487, + "loss": 2.6132, + "step": 9933 + }, + { + "epoch": 0.9000022649543612, + "grad_norm": 0.8674836754798889, + "learning_rate": 0.00014001691536277413, + "loss": 2.7661, + "step": 9934 + }, + { + "epoch": 0.900092863128808, + "grad_norm": 0.8184406757354736, + "learning_rate": 0.0001400108741617834, + "loss": 2.9901, + "step": 9935 + }, + { + "epoch": 0.9001834613032548, + "grad_norm": 0.8109605312347412, + "learning_rate": 0.00014000483296079262, + "loss": 2.7909, + "step": 9936 + }, + { + "epoch": 0.9002740594777016, + "grad_norm": 0.8099241256713867, + "learning_rate": 0.00013999879175980186, + "loss": 2.8001, + "step": 9937 + }, + { + "epoch": 0.9003646576521483, + "grad_norm": 0.8160001039505005, + "learning_rate": 0.0001399927505588111, + "loss": 2.9575, + "step": 9938 + }, + { + "epoch": 0.9004552558265951, + "grad_norm": 0.8547145128250122, + "learning_rate": 0.00013998670935782035, + "loss": 2.9656, + "step": 9939 + }, + { + "epoch": 0.9005458540010419, + "grad_norm": 0.7814831137657166, + "learning_rate": 0.00013998066815682959, + "loss": 3.08, + "step": 9940 + }, + { + "epoch": 0.9006364521754887, + "grad_norm": 0.7712039947509766, + "learning_rate": 0.00013997462695583882, + "loss": 2.1211, + "step": 9941 + }, + { + "epoch": 0.9007270503499355, + "grad_norm": 0.8721023201942444, + "learning_rate": 0.00013996858575484808, + "loss": 2.7393, + "step": 9942 + }, + { + "epoch": 0.9008176485243823, + "grad_norm": 0.8231143951416016, + "learning_rate": 0.00013996254455385731, + "loss": 2.6644, + "step": 9943 + }, + { + "epoch": 0.9009082466988291, + "grad_norm": 0.7768170237541199, + "learning_rate": 0.00013995650335286658, + "loss": 2.7884, + "step": 9944 + }, + { + "epoch": 0.9009988448732758, + "grad_norm": 0.7959057688713074, + "learning_rate": 0.00013995046215187578, + "loss": 2.7835, + "step": 9945 + }, + { + "epoch": 0.9010894430477225, + "grad_norm": 0.7648161053657532, + "learning_rate": 0.00013994442095088504, + "loss": 2.506, + "step": 9946 + }, + { + "epoch": 0.9011800412221693, + "grad_norm": 0.8529520630836487, + "learning_rate": 0.00013993837974989428, + "loss": 2.6561, + "step": 9947 + }, + { + "epoch": 0.9012706393966161, + "grad_norm": 0.8469515442848206, + "learning_rate": 0.00013993233854890354, + "loss": 2.7243, + "step": 9948 + }, + { + "epoch": 0.9013612375710629, + "grad_norm": 0.9787544012069702, + "learning_rate": 0.00013992629734791277, + "loss": 2.8825, + "step": 9949 + }, + { + "epoch": 0.9014518357455097, + "grad_norm": 0.8204187154769897, + "learning_rate": 0.000139920256146922, + "loss": 2.8383, + "step": 9950 + }, + { + "epoch": 0.9015424339199565, + "grad_norm": 0.8786880373954773, + "learning_rate": 0.00013991421494593127, + "loss": 3.1399, + "step": 9951 + }, + { + "epoch": 0.9016330320944033, + "grad_norm": 0.8508517742156982, + "learning_rate": 0.0001399081737449405, + "loss": 3.0209, + "step": 9952 + }, + { + "epoch": 0.90172363026885, + "grad_norm": 0.7983881235122681, + "learning_rate": 0.00013990213254394974, + "loss": 2.749, + "step": 9953 + }, + { + "epoch": 0.9018142284432968, + "grad_norm": 0.7887316942214966, + "learning_rate": 0.00013989609134295897, + "loss": 2.788, + "step": 9954 + }, + { + "epoch": 0.9019048266177436, + "grad_norm": 0.8113560080528259, + "learning_rate": 0.00013989005014196823, + "loss": 2.9977, + "step": 9955 + }, + { + "epoch": 0.9019954247921904, + "grad_norm": 0.7862075567245483, + "learning_rate": 0.00013988400894097747, + "loss": 2.8773, + "step": 9956 + }, + { + "epoch": 0.9020860229666372, + "grad_norm": 0.7713479995727539, + "learning_rate": 0.00013987796773998673, + "loss": 2.6717, + "step": 9957 + }, + { + "epoch": 0.902176621141084, + "grad_norm": 0.7171440720558167, + "learning_rate": 0.00013987192653899596, + "loss": 2.1876, + "step": 9958 + }, + { + "epoch": 0.9022672193155308, + "grad_norm": 0.7932534217834473, + "learning_rate": 0.0001398658853380052, + "loss": 2.2276, + "step": 9959 + }, + { + "epoch": 0.9023578174899776, + "grad_norm": 0.908530056476593, + "learning_rate": 0.00013985984413701446, + "loss": 2.7321, + "step": 9960 + }, + { + "epoch": 0.9024484156644244, + "grad_norm": 0.8243244886398315, + "learning_rate": 0.0001398538029360237, + "loss": 2.8795, + "step": 9961 + }, + { + "epoch": 0.9025390138388711, + "grad_norm": 0.7064749002456665, + "learning_rate": 0.00013984776173503292, + "loss": 1.9619, + "step": 9962 + }, + { + "epoch": 0.9026296120133179, + "grad_norm": 0.8393739461898804, + "learning_rate": 0.00013984172053404216, + "loss": 2.9992, + "step": 9963 + }, + { + "epoch": 0.9027202101877647, + "grad_norm": 0.7721298336982727, + "learning_rate": 0.00013983567933305142, + "loss": 2.712, + "step": 9964 + }, + { + "epoch": 0.9028108083622115, + "grad_norm": 0.7747035026550293, + "learning_rate": 0.00013982963813206068, + "loss": 2.7751, + "step": 9965 + }, + { + "epoch": 0.9029014065366583, + "grad_norm": 0.814801812171936, + "learning_rate": 0.0001398235969310699, + "loss": 2.6407, + "step": 9966 + }, + { + "epoch": 0.9029920047111051, + "grad_norm": 0.7052247524261475, + "learning_rate": 0.00013981755573007915, + "loss": 2.2658, + "step": 9967 + }, + { + "epoch": 0.9030826028855519, + "grad_norm": 0.8145351409912109, + "learning_rate": 0.00013981151452908838, + "loss": 2.9024, + "step": 9968 + }, + { + "epoch": 0.9031732010599987, + "grad_norm": 0.7997913360595703, + "learning_rate": 0.00013980547332809764, + "loss": 2.8007, + "step": 9969 + }, + { + "epoch": 0.9032637992344454, + "grad_norm": 0.8449021577835083, + "learning_rate": 0.00013979943212710688, + "loss": 3.2878, + "step": 9970 + }, + { + "epoch": 0.9033543974088922, + "grad_norm": 0.8051814436912537, + "learning_rate": 0.0001397933909261161, + "loss": 2.8274, + "step": 9971 + }, + { + "epoch": 0.903444995583339, + "grad_norm": 0.7878623604774475, + "learning_rate": 0.00013978734972512537, + "loss": 2.1372, + "step": 9972 + }, + { + "epoch": 0.9035355937577858, + "grad_norm": 0.7984979748725891, + "learning_rate": 0.0001397813085241346, + "loss": 2.7924, + "step": 9973 + }, + { + "epoch": 0.9036261919322326, + "grad_norm": 0.7835785150527954, + "learning_rate": 0.00013977526732314384, + "loss": 2.7991, + "step": 9974 + }, + { + "epoch": 0.9037167901066794, + "grad_norm": 0.8248808979988098, + "learning_rate": 0.00013976922612215308, + "loss": 2.8783, + "step": 9975 + }, + { + "epoch": 0.9038073882811262, + "grad_norm": 0.8339757919311523, + "learning_rate": 0.00013976318492116234, + "loss": 3.1401, + "step": 9976 + }, + { + "epoch": 0.903897986455573, + "grad_norm": 0.7882986664772034, + "learning_rate": 0.00013975714372017157, + "loss": 2.8439, + "step": 9977 + }, + { + "epoch": 0.9039885846300197, + "grad_norm": 0.8176454305648804, + "learning_rate": 0.00013975110251918083, + "loss": 3.0512, + "step": 9978 + }, + { + "epoch": 0.9040791828044665, + "grad_norm": 0.7803297638893127, + "learning_rate": 0.00013974506131819007, + "loss": 2.8439, + "step": 9979 + }, + { + "epoch": 0.9041697809789133, + "grad_norm": 0.7784392833709717, + "learning_rate": 0.0001397390201171993, + "loss": 2.8392, + "step": 9980 + }, + { + "epoch": 0.9042603791533601, + "grad_norm": 0.8116408586502075, + "learning_rate": 0.00013973297891620856, + "loss": 2.7618, + "step": 9981 + }, + { + "epoch": 0.9043509773278069, + "grad_norm": 0.7999128699302673, + "learning_rate": 0.0001397269377152178, + "loss": 2.6796, + "step": 9982 + }, + { + "epoch": 0.9044415755022537, + "grad_norm": 0.7165301442146301, + "learning_rate": 0.00013972089651422703, + "loss": 2.488, + "step": 9983 + }, + { + "epoch": 0.9045321736767005, + "grad_norm": 0.7937242388725281, + "learning_rate": 0.00013971485531323626, + "loss": 2.8854, + "step": 9984 + }, + { + "epoch": 0.9046227718511471, + "grad_norm": 0.782667875289917, + "learning_rate": 0.00013970881411224552, + "loss": 2.8672, + "step": 9985 + }, + { + "epoch": 0.9047133700255939, + "grad_norm": 0.8035340309143066, + "learning_rate": 0.00013970277291125476, + "loss": 2.7598, + "step": 9986 + }, + { + "epoch": 0.9048039682000407, + "grad_norm": 0.7836618423461914, + "learning_rate": 0.000139696731710264, + "loss": 2.7836, + "step": 9987 + }, + { + "epoch": 0.9048945663744875, + "grad_norm": 0.8468023538589478, + "learning_rate": 0.00013969069050927325, + "loss": 3.1643, + "step": 9988 + }, + { + "epoch": 0.9049851645489343, + "grad_norm": 0.7747126221656799, + "learning_rate": 0.0001396846493082825, + "loss": 2.6973, + "step": 9989 + }, + { + "epoch": 0.9050757627233811, + "grad_norm": 0.8397495150566101, + "learning_rate": 0.00013967860810729175, + "loss": 2.5003, + "step": 9990 + }, + { + "epoch": 0.9051663608978279, + "grad_norm": 0.8373984098434448, + "learning_rate": 0.00013967256690630098, + "loss": 2.9234, + "step": 9991 + }, + { + "epoch": 0.9052569590722747, + "grad_norm": 0.8102898001670837, + "learning_rate": 0.00013966652570531022, + "loss": 2.9173, + "step": 9992 + }, + { + "epoch": 0.9053475572467214, + "grad_norm": 0.8149914145469666, + "learning_rate": 0.00013966048450431945, + "loss": 2.8852, + "step": 9993 + }, + { + "epoch": 0.9054381554211682, + "grad_norm": 0.9295089840888977, + "learning_rate": 0.0001396544433033287, + "loss": 2.5786, + "step": 9994 + }, + { + "epoch": 0.905528753595615, + "grad_norm": 0.8031180500984192, + "learning_rate": 0.00013964840210233797, + "loss": 2.6678, + "step": 9995 + }, + { + "epoch": 0.9056193517700618, + "grad_norm": 0.8772322535514832, + "learning_rate": 0.00013964236090134718, + "loss": 2.6633, + "step": 9996 + }, + { + "epoch": 0.9057099499445086, + "grad_norm": 0.7895862460136414, + "learning_rate": 0.00013963631970035644, + "loss": 2.7472, + "step": 9997 + }, + { + "epoch": 0.9058005481189554, + "grad_norm": 0.8358903527259827, + "learning_rate": 0.00013963027849936568, + "loss": 2.5849, + "step": 9998 + }, + { + "epoch": 0.9058911462934022, + "grad_norm": 0.771081805229187, + "learning_rate": 0.00013962423729837494, + "loss": 2.864, + "step": 9999 + }, + { + "epoch": 0.905981744467849, + "grad_norm": 0.8708781003952026, + "learning_rate": 0.00013961819609738417, + "loss": 2.6239, + "step": 10000 + }, + { + "epoch": 0.9060723426422957, + "grad_norm": 0.7421953082084656, + "learning_rate": 0.0001396121548963934, + "loss": 2.6543, + "step": 10001 + }, + { + "epoch": 0.9061629408167425, + "grad_norm": 0.7885909080505371, + "learning_rate": 0.00013960611369540267, + "loss": 2.8019, + "step": 10002 + }, + { + "epoch": 0.9062535389911893, + "grad_norm": 0.8288667798042297, + "learning_rate": 0.0001396000724944119, + "loss": 2.7925, + "step": 10003 + }, + { + "epoch": 0.9063441371656361, + "grad_norm": 0.7948784232139587, + "learning_rate": 0.00013959403129342113, + "loss": 2.5665, + "step": 10004 + }, + { + "epoch": 0.9064347353400829, + "grad_norm": 0.9238951802253723, + "learning_rate": 0.00013958799009243037, + "loss": 2.9007, + "step": 10005 + }, + { + "epoch": 0.9065253335145297, + "grad_norm": 0.8094602227210999, + "learning_rate": 0.00013958194889143963, + "loss": 2.7542, + "step": 10006 + }, + { + "epoch": 0.9066159316889765, + "grad_norm": 0.8581602573394775, + "learning_rate": 0.00013957590769044886, + "loss": 2.6665, + "step": 10007 + }, + { + "epoch": 0.9067065298634233, + "grad_norm": 0.7571961879730225, + "learning_rate": 0.00013956986648945812, + "loss": 2.8187, + "step": 10008 + }, + { + "epoch": 0.90679712803787, + "grad_norm": 0.8384376764297485, + "learning_rate": 0.00013956382528846736, + "loss": 2.5835, + "step": 10009 + }, + { + "epoch": 0.9068877262123168, + "grad_norm": 0.79062819480896, + "learning_rate": 0.0001395577840874766, + "loss": 3.0024, + "step": 10010 + }, + { + "epoch": 0.9069783243867636, + "grad_norm": 0.8009416460990906, + "learning_rate": 0.00013955174288648585, + "loss": 2.9384, + "step": 10011 + }, + { + "epoch": 0.9070689225612104, + "grad_norm": 0.6698444485664368, + "learning_rate": 0.0001395457016854951, + "loss": 1.9346, + "step": 10012 + }, + { + "epoch": 0.9071595207356572, + "grad_norm": 0.8130277991294861, + "learning_rate": 0.00013953966048450432, + "loss": 1.9487, + "step": 10013 + }, + { + "epoch": 0.907250118910104, + "grad_norm": 0.7930156588554382, + "learning_rate": 0.00013953361928351356, + "loss": 2.6737, + "step": 10014 + }, + { + "epoch": 0.9073407170845508, + "grad_norm": 0.7713134288787842, + "learning_rate": 0.00013952757808252282, + "loss": 2.8706, + "step": 10015 + }, + { + "epoch": 0.9074313152589976, + "grad_norm": 0.8694071769714355, + "learning_rate": 0.00013952153688153205, + "loss": 3.1134, + "step": 10016 + }, + { + "epoch": 0.9075219134334443, + "grad_norm": 0.8791508078575134, + "learning_rate": 0.00013951549568054129, + "loss": 2.9396, + "step": 10017 + }, + { + "epoch": 0.9076125116078911, + "grad_norm": 0.8410220146179199, + "learning_rate": 0.00013950945447955055, + "loss": 2.8061, + "step": 10018 + }, + { + "epoch": 0.9077031097823379, + "grad_norm": 0.8264405727386475, + "learning_rate": 0.00013950341327855978, + "loss": 2.6846, + "step": 10019 + }, + { + "epoch": 0.9077937079567847, + "grad_norm": 0.8494986891746521, + "learning_rate": 0.00013949737207756904, + "loss": 2.9105, + "step": 10020 + }, + { + "epoch": 0.9078843061312315, + "grad_norm": 0.6753640174865723, + "learning_rate": 0.00013949133087657828, + "loss": 1.9116, + "step": 10021 + }, + { + "epoch": 0.9079749043056783, + "grad_norm": 0.8400970697402954, + "learning_rate": 0.0001394852896755875, + "loss": 2.8113, + "step": 10022 + }, + { + "epoch": 0.9080655024801251, + "grad_norm": 0.7625826001167297, + "learning_rate": 0.00013947924847459674, + "loss": 2.8301, + "step": 10023 + }, + { + "epoch": 0.9081561006545719, + "grad_norm": 0.6983199715614319, + "learning_rate": 0.000139473207273606, + "loss": 2.2048, + "step": 10024 + }, + { + "epoch": 0.9082466988290187, + "grad_norm": 0.8071367144584656, + "learning_rate": 0.00013946716607261524, + "loss": 2.7449, + "step": 10025 + }, + { + "epoch": 0.9083372970034653, + "grad_norm": 0.8448426127433777, + "learning_rate": 0.00013946112487162447, + "loss": 2.7775, + "step": 10026 + }, + { + "epoch": 0.9084278951779121, + "grad_norm": 0.8749000430107117, + "learning_rate": 0.00013945508367063373, + "loss": 2.7906, + "step": 10027 + }, + { + "epoch": 0.9085184933523589, + "grad_norm": 0.797664999961853, + "learning_rate": 0.00013944904246964297, + "loss": 2.756, + "step": 10028 + }, + { + "epoch": 0.9086090915268057, + "grad_norm": 0.8128458261489868, + "learning_rate": 0.00013944300126865223, + "loss": 2.7082, + "step": 10029 + }, + { + "epoch": 0.9086996897012525, + "grad_norm": 0.8010278940200806, + "learning_rate": 0.00013943696006766144, + "loss": 2.9471, + "step": 10030 + }, + { + "epoch": 0.9087902878756993, + "grad_norm": 0.7377639412879944, + "learning_rate": 0.0001394309188666707, + "loss": 2.2316, + "step": 10031 + }, + { + "epoch": 0.908880886050146, + "grad_norm": 0.8052651882171631, + "learning_rate": 0.00013942487766567996, + "loss": 3.0056, + "step": 10032 + }, + { + "epoch": 0.9089714842245928, + "grad_norm": 0.7628933191299438, + "learning_rate": 0.0001394188364646892, + "loss": 2.0402, + "step": 10033 + }, + { + "epoch": 0.9090620823990396, + "grad_norm": 0.7805528044700623, + "learning_rate": 0.00013941279526369843, + "loss": 2.7024, + "step": 10034 + }, + { + "epoch": 0.9091526805734864, + "grad_norm": 0.7897397875785828, + "learning_rate": 0.00013940675406270766, + "loss": 2.6188, + "step": 10035 + }, + { + "epoch": 0.9092432787479332, + "grad_norm": 0.8025038838386536, + "learning_rate": 0.00013940071286171692, + "loss": 2.8754, + "step": 10036 + }, + { + "epoch": 0.90933387692238, + "grad_norm": 0.782336950302124, + "learning_rate": 0.00013939467166072616, + "loss": 2.7706, + "step": 10037 + }, + { + "epoch": 0.9094244750968268, + "grad_norm": 0.8305665254592896, + "learning_rate": 0.0001393886304597354, + "loss": 2.6441, + "step": 10038 + }, + { + "epoch": 0.9095150732712736, + "grad_norm": 0.7968067526817322, + "learning_rate": 0.00013938258925874465, + "loss": 2.9317, + "step": 10039 + }, + { + "epoch": 0.9096056714457204, + "grad_norm": 0.790835976600647, + "learning_rate": 0.00013937654805775389, + "loss": 2.6584, + "step": 10040 + }, + { + "epoch": 0.9096962696201671, + "grad_norm": 0.7613997459411621, + "learning_rate": 0.00013937050685676315, + "loss": 2.8239, + "step": 10041 + }, + { + "epoch": 0.9097868677946139, + "grad_norm": 0.8517158627510071, + "learning_rate": 0.00013936446565577238, + "loss": 2.8823, + "step": 10042 + }, + { + "epoch": 0.9098774659690607, + "grad_norm": 0.801833987236023, + "learning_rate": 0.00013935842445478161, + "loss": 2.1335, + "step": 10043 + }, + { + "epoch": 0.9099680641435075, + "grad_norm": 0.7651821970939636, + "learning_rate": 0.00013935238325379085, + "loss": 2.6876, + "step": 10044 + }, + { + "epoch": 0.9100586623179543, + "grad_norm": 0.8373934626579285, + "learning_rate": 0.0001393463420528001, + "loss": 2.8778, + "step": 10045 + }, + { + "epoch": 0.9101492604924011, + "grad_norm": 0.7924065589904785, + "learning_rate": 0.00013934030085180934, + "loss": 2.9222, + "step": 10046 + }, + { + "epoch": 0.9102398586668479, + "grad_norm": 0.801953136920929, + "learning_rate": 0.00013933425965081858, + "loss": 2.1158, + "step": 10047 + }, + { + "epoch": 0.9103304568412947, + "grad_norm": 0.8035615086555481, + "learning_rate": 0.00013932821844982784, + "loss": 2.7646, + "step": 10048 + }, + { + "epoch": 0.9104210550157414, + "grad_norm": 0.7946351170539856, + "learning_rate": 0.00013932217724883707, + "loss": 2.5416, + "step": 10049 + }, + { + "epoch": 0.9105116531901882, + "grad_norm": 0.8434789776802063, + "learning_rate": 0.00013931613604784633, + "loss": 2.9143, + "step": 10050 + }, + { + "epoch": 0.910602251364635, + "grad_norm": 0.8825113773345947, + "learning_rate": 0.00013931009484685554, + "loss": 2.4842, + "step": 10051 + }, + { + "epoch": 0.9106928495390818, + "grad_norm": 0.7652816772460938, + "learning_rate": 0.0001393040536458648, + "loss": 2.6458, + "step": 10052 + }, + { + "epoch": 0.9107834477135286, + "grad_norm": 0.7661393284797668, + "learning_rate": 0.00013929801244487404, + "loss": 2.7276, + "step": 10053 + }, + { + "epoch": 0.9108740458879754, + "grad_norm": 0.768488347530365, + "learning_rate": 0.0001392919712438833, + "loss": 2.8375, + "step": 10054 + }, + { + "epoch": 0.9109646440624222, + "grad_norm": 0.7907625436782837, + "learning_rate": 0.00013928593004289253, + "loss": 3.0252, + "step": 10055 + }, + { + "epoch": 0.911055242236869, + "grad_norm": 0.8359084129333496, + "learning_rate": 0.00013927988884190177, + "loss": 2.6339, + "step": 10056 + }, + { + "epoch": 0.9111458404113157, + "grad_norm": 0.7862423062324524, + "learning_rate": 0.00013927384764091103, + "loss": 2.7649, + "step": 10057 + }, + { + "epoch": 0.9112364385857625, + "grad_norm": 0.7870602607727051, + "learning_rate": 0.00013926780643992026, + "loss": 2.7251, + "step": 10058 + }, + { + "epoch": 0.9113270367602093, + "grad_norm": 0.7944641709327698, + "learning_rate": 0.00013926176523892952, + "loss": 2.7377, + "step": 10059 + }, + { + "epoch": 0.9114176349346561, + "grad_norm": 0.7111549377441406, + "learning_rate": 0.00013925572403793873, + "loss": 2.1482, + "step": 10060 + }, + { + "epoch": 0.9115082331091029, + "grad_norm": 0.8522127866744995, + "learning_rate": 0.000139249682836948, + "loss": 2.6612, + "step": 10061 + }, + { + "epoch": 0.9115988312835497, + "grad_norm": 0.803571343421936, + "learning_rate": 0.00013924364163595725, + "loss": 2.7642, + "step": 10062 + }, + { + "epoch": 0.9116894294579965, + "grad_norm": 0.9056190848350525, + "learning_rate": 0.00013923760043496649, + "loss": 2.7242, + "step": 10063 + }, + { + "epoch": 0.9117800276324433, + "grad_norm": 0.712522566318512, + "learning_rate": 0.00013923155923397572, + "loss": 2.3294, + "step": 10064 + }, + { + "epoch": 0.91187062580689, + "grad_norm": 0.7288221120834351, + "learning_rate": 0.00013922551803298495, + "loss": 2.0242, + "step": 10065 + }, + { + "epoch": 0.9119612239813367, + "grad_norm": 0.8106067776679993, + "learning_rate": 0.00013921947683199421, + "loss": 3.1225, + "step": 10066 + }, + { + "epoch": 0.9120518221557835, + "grad_norm": 0.6986391544342041, + "learning_rate": 0.00013921343563100345, + "loss": 2.1512, + "step": 10067 + }, + { + "epoch": 0.9121424203302303, + "grad_norm": 0.7687283158302307, + "learning_rate": 0.00013920739443001268, + "loss": 2.827, + "step": 10068 + }, + { + "epoch": 0.9122330185046771, + "grad_norm": 0.789930522441864, + "learning_rate": 0.00013920135322902194, + "loss": 2.7007, + "step": 10069 + }, + { + "epoch": 0.9123236166791239, + "grad_norm": 0.7826949954032898, + "learning_rate": 0.00013919531202803118, + "loss": 2.7802, + "step": 10070 + }, + { + "epoch": 0.9124142148535707, + "grad_norm": 0.8284690380096436, + "learning_rate": 0.00013918927082704044, + "loss": 2.8139, + "step": 10071 + }, + { + "epoch": 0.9125048130280174, + "grad_norm": 0.5727205276489258, + "learning_rate": 0.00013918322962604967, + "loss": 1.3371, + "step": 10072 + }, + { + "epoch": 0.9125954112024642, + "grad_norm": 0.7603669762611389, + "learning_rate": 0.0001391771884250589, + "loss": 2.7874, + "step": 10073 + }, + { + "epoch": 0.912686009376911, + "grad_norm": 0.7070531845092773, + "learning_rate": 0.00013917114722406814, + "loss": 2.0711, + "step": 10074 + }, + { + "epoch": 0.9127766075513578, + "grad_norm": 0.8187004327774048, + "learning_rate": 0.0001391651060230774, + "loss": 2.7594, + "step": 10075 + }, + { + "epoch": 0.9128672057258046, + "grad_norm": 0.857440173625946, + "learning_rate": 0.00013915906482208664, + "loss": 2.8345, + "step": 10076 + }, + { + "epoch": 0.9129578039002514, + "grad_norm": 0.7571068406105042, + "learning_rate": 0.00013915302362109587, + "loss": 2.7301, + "step": 10077 + }, + { + "epoch": 0.9130484020746982, + "grad_norm": 0.8211199641227722, + "learning_rate": 0.00013914698242010513, + "loss": 2.7602, + "step": 10078 + }, + { + "epoch": 0.913139000249145, + "grad_norm": 0.7020799517631531, + "learning_rate": 0.00013914094121911437, + "loss": 1.8754, + "step": 10079 + }, + { + "epoch": 0.9132295984235917, + "grad_norm": 0.8425829410552979, + "learning_rate": 0.00013913490001812363, + "loss": 2.9474, + "step": 10080 + }, + { + "epoch": 0.9133201965980385, + "grad_norm": 0.8617987036705017, + "learning_rate": 0.00013912885881713283, + "loss": 2.5326, + "step": 10081 + }, + { + "epoch": 0.9134107947724853, + "grad_norm": 0.7330308556556702, + "learning_rate": 0.0001391228176161421, + "loss": 1.9401, + "step": 10082 + }, + { + "epoch": 0.9135013929469321, + "grad_norm": 0.8736897110939026, + "learning_rate": 0.00013911677641515133, + "loss": 2.8548, + "step": 10083 + }, + { + "epoch": 0.9135919911213789, + "grad_norm": 0.7926191687583923, + "learning_rate": 0.0001391107352141606, + "loss": 2.5448, + "step": 10084 + }, + { + "epoch": 0.9136825892958257, + "grad_norm": 0.7156528234481812, + "learning_rate": 0.00013910469401316982, + "loss": 2.1462, + "step": 10085 + }, + { + "epoch": 0.9137731874702725, + "grad_norm": 0.8502962589263916, + "learning_rate": 0.00013909865281217906, + "loss": 2.9809, + "step": 10086 + }, + { + "epoch": 0.9138637856447193, + "grad_norm": 0.8446975946426392, + "learning_rate": 0.00013909261161118832, + "loss": 2.8547, + "step": 10087 + }, + { + "epoch": 0.913954383819166, + "grad_norm": 0.7908600568771362, + "learning_rate": 0.00013908657041019755, + "loss": 2.9372, + "step": 10088 + }, + { + "epoch": 0.9140449819936128, + "grad_norm": 0.8071458339691162, + "learning_rate": 0.0001390805292092068, + "loss": 2.9317, + "step": 10089 + }, + { + "epoch": 0.9141355801680596, + "grad_norm": 0.8059126138687134, + "learning_rate": 0.00013907448800821602, + "loss": 2.6776, + "step": 10090 + }, + { + "epoch": 0.9142261783425064, + "grad_norm": 0.8037248849868774, + "learning_rate": 0.00013906844680722528, + "loss": 2.9491, + "step": 10091 + }, + { + "epoch": 0.9143167765169532, + "grad_norm": 0.7930784821510315, + "learning_rate": 0.00013906240560623454, + "loss": 2.8107, + "step": 10092 + }, + { + "epoch": 0.9144073746914, + "grad_norm": 0.7596151232719421, + "learning_rate": 0.00013905636440524378, + "loss": 2.5058, + "step": 10093 + }, + { + "epoch": 0.9144979728658468, + "grad_norm": 0.7791789770126343, + "learning_rate": 0.000139050323204253, + "loss": 2.9972, + "step": 10094 + }, + { + "epoch": 0.9145885710402936, + "grad_norm": 0.7743590474128723, + "learning_rate": 0.00013904428200326225, + "loss": 2.7361, + "step": 10095 + }, + { + "epoch": 0.9146791692147404, + "grad_norm": 0.8469882011413574, + "learning_rate": 0.0001390382408022715, + "loss": 2.6657, + "step": 10096 + }, + { + "epoch": 0.9147697673891871, + "grad_norm": 0.8078157901763916, + "learning_rate": 0.00013903219960128074, + "loss": 2.6498, + "step": 10097 + }, + { + "epoch": 0.9148603655636339, + "grad_norm": 0.8060004115104675, + "learning_rate": 0.00013902615840028998, + "loss": 2.5905, + "step": 10098 + }, + { + "epoch": 0.9149509637380807, + "grad_norm": 0.8418362140655518, + "learning_rate": 0.00013902011719929924, + "loss": 2.5744, + "step": 10099 + }, + { + "epoch": 0.9150415619125275, + "grad_norm": 0.8670383095741272, + "learning_rate": 0.00013901407599830847, + "loss": 2.6981, + "step": 10100 + }, + { + "epoch": 0.9151321600869743, + "grad_norm": 0.7401648163795471, + "learning_rate": 0.00013900803479731773, + "loss": 2.0274, + "step": 10101 + }, + { + "epoch": 0.9152227582614211, + "grad_norm": 0.7621526122093201, + "learning_rate": 0.00013900199359632694, + "loss": 2.7849, + "step": 10102 + }, + { + "epoch": 0.9153133564358679, + "grad_norm": 0.8221551179885864, + "learning_rate": 0.0001389959523953362, + "loss": 2.7491, + "step": 10103 + }, + { + "epoch": 0.9154039546103147, + "grad_norm": 0.8760179281234741, + "learning_rate": 0.00013898991119434543, + "loss": 2.8603, + "step": 10104 + }, + { + "epoch": 0.9154945527847614, + "grad_norm": 0.8073845505714417, + "learning_rate": 0.0001389838699933547, + "loss": 2.8273, + "step": 10105 + }, + { + "epoch": 0.9155851509592082, + "grad_norm": 0.8057717680931091, + "learning_rate": 0.00013897782879236393, + "loss": 2.8557, + "step": 10106 + }, + { + "epoch": 0.9156757491336549, + "grad_norm": 0.7761959433555603, + "learning_rate": 0.00013897178759137316, + "loss": 2.3884, + "step": 10107 + }, + { + "epoch": 0.9157663473081017, + "grad_norm": 0.7960433959960938, + "learning_rate": 0.00013896574639038242, + "loss": 2.7947, + "step": 10108 + }, + { + "epoch": 0.9158569454825485, + "grad_norm": 0.806701123714447, + "learning_rate": 0.00013895970518939166, + "loss": 2.9108, + "step": 10109 + }, + { + "epoch": 0.9159475436569953, + "grad_norm": 0.8069674968719482, + "learning_rate": 0.00013895366398840092, + "loss": 2.9189, + "step": 10110 + }, + { + "epoch": 0.916038141831442, + "grad_norm": 0.6641845703125, + "learning_rate": 0.00013894762278741013, + "loss": 2.1353, + "step": 10111 + }, + { + "epoch": 0.9161287400058888, + "grad_norm": 0.8083200454711914, + "learning_rate": 0.0001389415815864194, + "loss": 2.7913, + "step": 10112 + }, + { + "epoch": 0.9162193381803356, + "grad_norm": 0.831363320350647, + "learning_rate": 0.00013893554038542862, + "loss": 2.9836, + "step": 10113 + }, + { + "epoch": 0.9163099363547824, + "grad_norm": 0.8372895121574402, + "learning_rate": 0.00013892949918443788, + "loss": 2.8663, + "step": 10114 + }, + { + "epoch": 0.9164005345292292, + "grad_norm": 0.8215112686157227, + "learning_rate": 0.00013892345798344712, + "loss": 2.8601, + "step": 10115 + }, + { + "epoch": 0.916491132703676, + "grad_norm": 0.7434715628623962, + "learning_rate": 0.00013891741678245635, + "loss": 2.4851, + "step": 10116 + }, + { + "epoch": 0.9165817308781228, + "grad_norm": 0.8092532753944397, + "learning_rate": 0.0001389113755814656, + "loss": 2.6964, + "step": 10117 + }, + { + "epoch": 0.9166723290525696, + "grad_norm": 0.8507492542266846, + "learning_rate": 0.00013890533438047485, + "loss": 2.7998, + "step": 10118 + }, + { + "epoch": 0.9167629272270164, + "grad_norm": 0.7781103253364563, + "learning_rate": 0.00013889929317948408, + "loss": 2.6524, + "step": 10119 + }, + { + "epoch": 0.9168535254014631, + "grad_norm": 0.8070328831672668, + "learning_rate": 0.00013889325197849331, + "loss": 2.7343, + "step": 10120 + }, + { + "epoch": 0.9169441235759099, + "grad_norm": 0.818157434463501, + "learning_rate": 0.00013888721077750258, + "loss": 2.6415, + "step": 10121 + }, + { + "epoch": 0.9170347217503567, + "grad_norm": 0.6800169348716736, + "learning_rate": 0.00013888116957651184, + "loss": 2.1153, + "step": 10122 + }, + { + "epoch": 0.9171253199248035, + "grad_norm": 0.8082334995269775, + "learning_rate": 0.00013887512837552107, + "loss": 2.8597, + "step": 10123 + }, + { + "epoch": 0.9172159180992503, + "grad_norm": 0.7769731879234314, + "learning_rate": 0.0001388690871745303, + "loss": 2.8698, + "step": 10124 + }, + { + "epoch": 0.9173065162736971, + "grad_norm": 0.809986412525177, + "learning_rate": 0.00013886304597353954, + "loss": 2.8352, + "step": 10125 + }, + { + "epoch": 0.9173971144481439, + "grad_norm": 0.7579398155212402, + "learning_rate": 0.0001388570047725488, + "loss": 2.8179, + "step": 10126 + }, + { + "epoch": 0.9174877126225907, + "grad_norm": 0.8149374127388, + "learning_rate": 0.00013885096357155803, + "loss": 2.7198, + "step": 10127 + }, + { + "epoch": 0.9175783107970374, + "grad_norm": 0.7155686616897583, + "learning_rate": 0.00013884492237056727, + "loss": 2.0191, + "step": 10128 + }, + { + "epoch": 0.9176689089714842, + "grad_norm": 0.7787701487541199, + "learning_rate": 0.00013883888116957653, + "loss": 2.678, + "step": 10129 + }, + { + "epoch": 0.917759507145931, + "grad_norm": 0.8561106324195862, + "learning_rate": 0.00013883283996858576, + "loss": 2.6626, + "step": 10130 + }, + { + "epoch": 0.9178501053203778, + "grad_norm": 0.7232205867767334, + "learning_rate": 0.00013882679876759502, + "loss": 1.9938, + "step": 10131 + }, + { + "epoch": 0.9179407034948246, + "grad_norm": 0.8521077036857605, + "learning_rate": 0.00013882075756660423, + "loss": 2.579, + "step": 10132 + }, + { + "epoch": 0.9180313016692714, + "grad_norm": 0.6767562627792358, + "learning_rate": 0.0001388147163656135, + "loss": 2.2386, + "step": 10133 + }, + { + "epoch": 0.9181218998437182, + "grad_norm": 0.7466652393341064, + "learning_rate": 0.00013880867516462273, + "loss": 2.1874, + "step": 10134 + }, + { + "epoch": 0.918212498018165, + "grad_norm": 0.7800644040107727, + "learning_rate": 0.000138802633963632, + "loss": 2.7005, + "step": 10135 + }, + { + "epoch": 0.9183030961926117, + "grad_norm": 0.8304532170295715, + "learning_rate": 0.00013879659276264122, + "loss": 3.0212, + "step": 10136 + }, + { + "epoch": 0.9183936943670585, + "grad_norm": 0.8293542861938477, + "learning_rate": 0.00013879055156165046, + "loss": 2.8754, + "step": 10137 + }, + { + "epoch": 0.9184842925415053, + "grad_norm": 0.814867377281189, + "learning_rate": 0.00013878451036065972, + "loss": 2.9474, + "step": 10138 + }, + { + "epoch": 0.9185748907159521, + "grad_norm": 0.8777400255203247, + "learning_rate": 0.00013877846915966895, + "loss": 2.7309, + "step": 10139 + }, + { + "epoch": 0.9186654888903989, + "grad_norm": 0.8640410900115967, + "learning_rate": 0.00013877242795867819, + "loss": 3.2841, + "step": 10140 + }, + { + "epoch": 0.9187560870648457, + "grad_norm": 0.8320043683052063, + "learning_rate": 0.00013876638675768742, + "loss": 3.0508, + "step": 10141 + }, + { + "epoch": 0.9188466852392925, + "grad_norm": 0.8534289002418518, + "learning_rate": 0.00013876034555669668, + "loss": 2.9682, + "step": 10142 + }, + { + "epoch": 0.9189372834137393, + "grad_norm": 0.8647092580795288, + "learning_rate": 0.00013875430435570591, + "loss": 2.7645, + "step": 10143 + }, + { + "epoch": 0.919027881588186, + "grad_norm": 0.8171446323394775, + "learning_rate": 0.00013874826315471518, + "loss": 2.7972, + "step": 10144 + }, + { + "epoch": 0.9191184797626328, + "grad_norm": 0.8115734457969666, + "learning_rate": 0.0001387422219537244, + "loss": 2.6203, + "step": 10145 + }, + { + "epoch": 0.9192090779370796, + "grad_norm": 0.7918534278869629, + "learning_rate": 0.00013873618075273364, + "loss": 2.8738, + "step": 10146 + }, + { + "epoch": 0.9192996761115263, + "grad_norm": 0.8228238224983215, + "learning_rate": 0.0001387301395517429, + "loss": 2.8882, + "step": 10147 + }, + { + "epoch": 0.9193902742859731, + "grad_norm": 0.7478582262992859, + "learning_rate": 0.00013872409835075214, + "loss": 2.6556, + "step": 10148 + }, + { + "epoch": 0.9194808724604199, + "grad_norm": 0.7862219214439392, + "learning_rate": 0.00013871805714976137, + "loss": 2.6415, + "step": 10149 + }, + { + "epoch": 0.9195714706348667, + "grad_norm": 0.823188304901123, + "learning_rate": 0.0001387120159487706, + "loss": 3.1322, + "step": 10150 + }, + { + "epoch": 0.9196620688093134, + "grad_norm": 0.7619822025299072, + "learning_rate": 0.00013870597474777987, + "loss": 2.7925, + "step": 10151 + }, + { + "epoch": 0.9197526669837602, + "grad_norm": 0.7960193157196045, + "learning_rate": 0.00013869993354678913, + "loss": 2.8484, + "step": 10152 + }, + { + "epoch": 0.919843265158207, + "grad_norm": 0.936335027217865, + "learning_rate": 0.00013869389234579834, + "loss": 2.7588, + "step": 10153 + }, + { + "epoch": 0.9199338633326538, + "grad_norm": 0.7492874264717102, + "learning_rate": 0.0001386878511448076, + "loss": 2.7407, + "step": 10154 + }, + { + "epoch": 0.9200244615071006, + "grad_norm": 0.9111508727073669, + "learning_rate": 0.00013868180994381683, + "loss": 2.9464, + "step": 10155 + }, + { + "epoch": 0.9201150596815474, + "grad_norm": 0.829826295375824, + "learning_rate": 0.0001386757687428261, + "loss": 2.8204, + "step": 10156 + }, + { + "epoch": 0.9202056578559942, + "grad_norm": 0.7683019638061523, + "learning_rate": 0.00013866972754183533, + "loss": 2.8229, + "step": 10157 + }, + { + "epoch": 0.920296256030441, + "grad_norm": 0.8241559267044067, + "learning_rate": 0.00013866368634084456, + "loss": 2.8284, + "step": 10158 + }, + { + "epoch": 0.9203868542048877, + "grad_norm": 0.8084510564804077, + "learning_rate": 0.00013865764513985382, + "loss": 2.7539, + "step": 10159 + }, + { + "epoch": 0.9204774523793345, + "grad_norm": 0.7869303226470947, + "learning_rate": 0.00013865160393886306, + "loss": 2.6827, + "step": 10160 + }, + { + "epoch": 0.9205680505537813, + "grad_norm": 0.7984116077423096, + "learning_rate": 0.0001386455627378723, + "loss": 2.5277, + "step": 10161 + }, + { + "epoch": 0.9206586487282281, + "grad_norm": 0.8005961775779724, + "learning_rate": 0.00013863952153688152, + "loss": 3.0314, + "step": 10162 + }, + { + "epoch": 0.9207492469026749, + "grad_norm": 0.7607830166816711, + "learning_rate": 0.00013863348033589079, + "loss": 2.4809, + "step": 10163 + }, + { + "epoch": 0.9208398450771217, + "grad_norm": 0.7034377455711365, + "learning_rate": 0.00013862743913490002, + "loss": 2.094, + "step": 10164 + }, + { + "epoch": 0.9209304432515685, + "grad_norm": 0.8571200966835022, + "learning_rate": 0.00013862139793390928, + "loss": 2.8223, + "step": 10165 + }, + { + "epoch": 0.9210210414260153, + "grad_norm": 0.7830682396888733, + "learning_rate": 0.0001386153567329185, + "loss": 2.6429, + "step": 10166 + }, + { + "epoch": 0.921111639600462, + "grad_norm": 0.7132526636123657, + "learning_rate": 0.00013860931553192775, + "loss": 2.2774, + "step": 10167 + }, + { + "epoch": 0.9212022377749088, + "grad_norm": 0.8480640649795532, + "learning_rate": 0.000138603274330937, + "loss": 2.9553, + "step": 10168 + }, + { + "epoch": 0.9212928359493556, + "grad_norm": 0.8020076155662537, + "learning_rate": 0.00013859723312994624, + "loss": 2.7006, + "step": 10169 + }, + { + "epoch": 0.9213834341238024, + "grad_norm": 0.8783047795295715, + "learning_rate": 0.00013859119192895548, + "loss": 2.8849, + "step": 10170 + }, + { + "epoch": 0.9214740322982492, + "grad_norm": 0.7732100486755371, + "learning_rate": 0.0001385851507279647, + "loss": 2.6785, + "step": 10171 + }, + { + "epoch": 0.921564630472696, + "grad_norm": 0.8377436995506287, + "learning_rate": 0.00013857910952697397, + "loss": 2.8295, + "step": 10172 + }, + { + "epoch": 0.9216552286471428, + "grad_norm": 0.8447519540786743, + "learning_rate": 0.0001385730683259832, + "loss": 3.0161, + "step": 10173 + }, + { + "epoch": 0.9217458268215896, + "grad_norm": 0.8099875450134277, + "learning_rate": 0.00013856702712499244, + "loss": 2.7318, + "step": 10174 + }, + { + "epoch": 0.9218364249960364, + "grad_norm": 0.8766972422599792, + "learning_rate": 0.0001385609859240017, + "loss": 2.8483, + "step": 10175 + }, + { + "epoch": 0.9219270231704831, + "grad_norm": 0.8068873882293701, + "learning_rate": 0.00013855494472301094, + "loss": 2.8536, + "step": 10176 + }, + { + "epoch": 0.9220176213449299, + "grad_norm": 0.6771115660667419, + "learning_rate": 0.0001385489035220202, + "loss": 1.906, + "step": 10177 + }, + { + "epoch": 0.9221082195193767, + "grad_norm": 0.7133148312568665, + "learning_rate": 0.00013854286232102943, + "loss": 2.0339, + "step": 10178 + }, + { + "epoch": 0.9221988176938235, + "grad_norm": 0.8449442982673645, + "learning_rate": 0.00013853682112003867, + "loss": 2.9702, + "step": 10179 + }, + { + "epoch": 0.9222894158682703, + "grad_norm": 0.8203433752059937, + "learning_rate": 0.0001385307799190479, + "loss": 2.8547, + "step": 10180 + }, + { + "epoch": 0.9223800140427171, + "grad_norm": 0.80555260181427, + "learning_rate": 0.00013852473871805716, + "loss": 2.8125, + "step": 10181 + }, + { + "epoch": 0.9224706122171639, + "grad_norm": 0.8052405714988708, + "learning_rate": 0.00013851869751706642, + "loss": 2.898, + "step": 10182 + }, + { + "epoch": 0.9225612103916107, + "grad_norm": 0.7863885760307312, + "learning_rate": 0.00013851265631607563, + "loss": 2.7914, + "step": 10183 + }, + { + "epoch": 0.9226518085660574, + "grad_norm": 0.781383216381073, + "learning_rate": 0.0001385066151150849, + "loss": 3.0441, + "step": 10184 + }, + { + "epoch": 0.9227424067405042, + "grad_norm": 0.7604668140411377, + "learning_rate": 0.00013850057391409412, + "loss": 2.7814, + "step": 10185 + }, + { + "epoch": 0.922833004914951, + "grad_norm": 0.8183280825614929, + "learning_rate": 0.00013849453271310339, + "loss": 2.7583, + "step": 10186 + }, + { + "epoch": 0.9229236030893978, + "grad_norm": 0.8014418482780457, + "learning_rate": 0.00013848849151211262, + "loss": 2.7566, + "step": 10187 + }, + { + "epoch": 0.9230142012638445, + "grad_norm": 0.8692284226417542, + "learning_rate": 0.00013848245031112185, + "loss": 2.9892, + "step": 10188 + }, + { + "epoch": 0.9231047994382913, + "grad_norm": 0.8942543268203735, + "learning_rate": 0.00013847640911013111, + "loss": 2.5159, + "step": 10189 + }, + { + "epoch": 0.9231953976127381, + "grad_norm": 0.8102059364318848, + "learning_rate": 0.00013847036790914035, + "loss": 2.7223, + "step": 10190 + }, + { + "epoch": 0.9232859957871848, + "grad_norm": 0.7639232873916626, + "learning_rate": 0.00013846432670814958, + "loss": 2.6877, + "step": 10191 + }, + { + "epoch": 0.9233765939616316, + "grad_norm": 0.8069093823432922, + "learning_rate": 0.00013845828550715882, + "loss": 2.7959, + "step": 10192 + }, + { + "epoch": 0.9234671921360784, + "grad_norm": 0.7329351902008057, + "learning_rate": 0.00013845224430616808, + "loss": 2.7176, + "step": 10193 + }, + { + "epoch": 0.9235577903105252, + "grad_norm": 0.7845849394798279, + "learning_rate": 0.0001384462031051773, + "loss": 2.7639, + "step": 10194 + }, + { + "epoch": 0.923648388484972, + "grad_norm": 0.8887647390365601, + "learning_rate": 0.00013844016190418657, + "loss": 2.8597, + "step": 10195 + }, + { + "epoch": 0.9237389866594188, + "grad_norm": 0.8778423070907593, + "learning_rate": 0.00013843412070319578, + "loss": 2.9974, + "step": 10196 + }, + { + "epoch": 0.9238295848338656, + "grad_norm": 0.8241698145866394, + "learning_rate": 0.00013842807950220504, + "loss": 2.895, + "step": 10197 + }, + { + "epoch": 0.9239201830083124, + "grad_norm": 0.9401123523712158, + "learning_rate": 0.0001384220383012143, + "loss": 2.7702, + "step": 10198 + }, + { + "epoch": 0.9240107811827591, + "grad_norm": 0.8208226561546326, + "learning_rate": 0.00013841599710022354, + "loss": 2.9437, + "step": 10199 + }, + { + "epoch": 0.9241013793572059, + "grad_norm": 0.815660297870636, + "learning_rate": 0.00013840995589923277, + "loss": 2.5582, + "step": 10200 + }, + { + "epoch": 0.9241919775316527, + "grad_norm": 0.8131288290023804, + "learning_rate": 0.000138403914698242, + "loss": 2.8253, + "step": 10201 + }, + { + "epoch": 0.9242825757060995, + "grad_norm": 0.8157569169998169, + "learning_rate": 0.00013839787349725127, + "loss": 2.7071, + "step": 10202 + }, + { + "epoch": 0.9243731738805463, + "grad_norm": 0.8729638457298279, + "learning_rate": 0.0001383918322962605, + "loss": 2.8254, + "step": 10203 + }, + { + "epoch": 0.9244637720549931, + "grad_norm": 0.6628089547157288, + "learning_rate": 0.00013838579109526973, + "loss": 2.3363, + "step": 10204 + }, + { + "epoch": 0.9245543702294399, + "grad_norm": 0.8466482758522034, + "learning_rate": 0.000138379749894279, + "loss": 2.7627, + "step": 10205 + }, + { + "epoch": 0.9246449684038867, + "grad_norm": 0.8202822208404541, + "learning_rate": 0.00013837370869328823, + "loss": 2.9682, + "step": 10206 + }, + { + "epoch": 0.9247355665783334, + "grad_norm": 0.8008645176887512, + "learning_rate": 0.0001383676674922975, + "loss": 2.959, + "step": 10207 + }, + { + "epoch": 0.9248261647527802, + "grad_norm": 0.8984838128089905, + "learning_rate": 0.00013836162629130672, + "loss": 3.0767, + "step": 10208 + }, + { + "epoch": 0.924916762927227, + "grad_norm": 0.8170086145401001, + "learning_rate": 0.00013835558509031596, + "loss": 2.9701, + "step": 10209 + }, + { + "epoch": 0.9250073611016738, + "grad_norm": 0.7430524230003357, + "learning_rate": 0.0001383495438893252, + "loss": 1.9228, + "step": 10210 + }, + { + "epoch": 0.9250979592761206, + "grad_norm": 0.7834795713424683, + "learning_rate": 0.00013834350268833445, + "loss": 2.6631, + "step": 10211 + }, + { + "epoch": 0.9251885574505674, + "grad_norm": 0.8393563628196716, + "learning_rate": 0.0001383374614873437, + "loss": 2.7011, + "step": 10212 + }, + { + "epoch": 0.9252791556250142, + "grad_norm": 0.8090593814849854, + "learning_rate": 0.00013833142028635292, + "loss": 2.7854, + "step": 10213 + }, + { + "epoch": 0.925369753799461, + "grad_norm": 0.7835372090339661, + "learning_rate": 0.00013832537908536218, + "loss": 2.8599, + "step": 10214 + }, + { + "epoch": 0.9254603519739077, + "grad_norm": 0.8753615617752075, + "learning_rate": 0.00013831933788437142, + "loss": 2.8931, + "step": 10215 + }, + { + "epoch": 0.9255509501483545, + "grad_norm": 0.7561033964157104, + "learning_rate": 0.00013831329668338068, + "loss": 2.579, + "step": 10216 + }, + { + "epoch": 0.9256415483228013, + "grad_norm": 0.8498483300209045, + "learning_rate": 0.00013830725548238988, + "loss": 2.9951, + "step": 10217 + }, + { + "epoch": 0.9257321464972481, + "grad_norm": 0.9447964429855347, + "learning_rate": 0.00013830121428139915, + "loss": 2.5889, + "step": 10218 + }, + { + "epoch": 0.9258227446716949, + "grad_norm": 0.7929452061653137, + "learning_rate": 0.0001382951730804084, + "loss": 1.9057, + "step": 10219 + }, + { + "epoch": 0.9259133428461417, + "grad_norm": 0.7922360897064209, + "learning_rate": 0.00013828913187941764, + "loss": 3.0368, + "step": 10220 + }, + { + "epoch": 0.9260039410205885, + "grad_norm": 2.375145435333252, + "learning_rate": 0.00013828309067842688, + "loss": 2.84, + "step": 10221 + }, + { + "epoch": 0.9260945391950353, + "grad_norm": 0.8260802030563354, + "learning_rate": 0.0001382770494774361, + "loss": 2.5018, + "step": 10222 + }, + { + "epoch": 0.926185137369482, + "grad_norm": 0.8205020427703857, + "learning_rate": 0.00013827100827644537, + "loss": 2.5592, + "step": 10223 + }, + { + "epoch": 0.9262757355439288, + "grad_norm": 0.7738685607910156, + "learning_rate": 0.0001382649670754546, + "loss": 2.7742, + "step": 10224 + }, + { + "epoch": 0.9263663337183756, + "grad_norm": 0.7407816648483276, + "learning_rate": 0.00013825892587446384, + "loss": 2.2637, + "step": 10225 + }, + { + "epoch": 0.9264569318928224, + "grad_norm": 0.8253427147865295, + "learning_rate": 0.00013825288467347307, + "loss": 2.8917, + "step": 10226 + }, + { + "epoch": 0.9265475300672692, + "grad_norm": 0.7394399642944336, + "learning_rate": 0.00013824684347248233, + "loss": 2.1022, + "step": 10227 + }, + { + "epoch": 0.9266381282417159, + "grad_norm": 0.7964169979095459, + "learning_rate": 0.0001382408022714916, + "loss": 2.9522, + "step": 10228 + }, + { + "epoch": 0.9267287264161627, + "grad_norm": 0.8023859262466431, + "learning_rate": 0.00013823476107050083, + "loss": 2.9292, + "step": 10229 + }, + { + "epoch": 0.9268193245906095, + "grad_norm": 0.7420205473899841, + "learning_rate": 0.00013822871986951006, + "loss": 2.8053, + "step": 10230 + }, + { + "epoch": 0.9269099227650562, + "grad_norm": 0.8836199641227722, + "learning_rate": 0.0001382226786685193, + "loss": 2.4761, + "step": 10231 + }, + { + "epoch": 0.927000520939503, + "grad_norm": 0.7789944410324097, + "learning_rate": 0.00013821663746752856, + "loss": 2.6494, + "step": 10232 + }, + { + "epoch": 0.9270911191139498, + "grad_norm": 0.8297148942947388, + "learning_rate": 0.0001382105962665378, + "loss": 2.9405, + "step": 10233 + }, + { + "epoch": 0.9271817172883966, + "grad_norm": 0.8105506896972656, + "learning_rate": 0.00013820455506554703, + "loss": 2.8042, + "step": 10234 + }, + { + "epoch": 0.9272723154628434, + "grad_norm": 0.8360188007354736, + "learning_rate": 0.0001381985138645563, + "loss": 2.9485, + "step": 10235 + }, + { + "epoch": 0.9273629136372902, + "grad_norm": 0.7592721581459045, + "learning_rate": 0.00013819247266356552, + "loss": 2.6213, + "step": 10236 + }, + { + "epoch": 0.927453511811737, + "grad_norm": 0.8214449286460876, + "learning_rate": 0.00013818643146257478, + "loss": 2.8, + "step": 10237 + }, + { + "epoch": 0.9275441099861838, + "grad_norm": 0.8006014823913574, + "learning_rate": 0.000138180390261584, + "loss": 2.9881, + "step": 10238 + }, + { + "epoch": 0.9276347081606305, + "grad_norm": 0.8038879632949829, + "learning_rate": 0.00013817434906059325, + "loss": 2.8842, + "step": 10239 + }, + { + "epoch": 0.9277253063350773, + "grad_norm": 0.7606726288795471, + "learning_rate": 0.00013816830785960248, + "loss": 2.6014, + "step": 10240 + }, + { + "epoch": 0.9278159045095241, + "grad_norm": 0.9455540776252747, + "learning_rate": 0.00013816226665861175, + "loss": 3.118, + "step": 10241 + }, + { + "epoch": 0.9279065026839709, + "grad_norm": 0.7813332080841064, + "learning_rate": 0.00013815622545762098, + "loss": 2.7366, + "step": 10242 + }, + { + "epoch": 0.9279971008584177, + "grad_norm": 0.8190297484397888, + "learning_rate": 0.00013815018425663021, + "loss": 2.7974, + "step": 10243 + }, + { + "epoch": 0.9280876990328645, + "grad_norm": 0.7607611417770386, + "learning_rate": 0.00013814414305563948, + "loss": 2.7075, + "step": 10244 + }, + { + "epoch": 0.9281782972073113, + "grad_norm": 0.9024550318717957, + "learning_rate": 0.0001381381018546487, + "loss": 2.8256, + "step": 10245 + }, + { + "epoch": 0.928268895381758, + "grad_norm": 0.780762255191803, + "learning_rate": 0.00013813206065365797, + "loss": 2.4157, + "step": 10246 + }, + { + "epoch": 0.9283594935562048, + "grad_norm": 0.8180962800979614, + "learning_rate": 0.00013812601945266718, + "loss": 2.7176, + "step": 10247 + }, + { + "epoch": 0.9284500917306516, + "grad_norm": 0.8048275709152222, + "learning_rate": 0.00013811997825167644, + "loss": 2.6501, + "step": 10248 + }, + { + "epoch": 0.9285406899050984, + "grad_norm": 0.8290717005729675, + "learning_rate": 0.0001381139370506857, + "loss": 2.7602, + "step": 10249 + }, + { + "epoch": 0.9286312880795452, + "grad_norm": 0.7677472829818726, + "learning_rate": 0.00013810789584969493, + "loss": 2.3883, + "step": 10250 + }, + { + "epoch": 0.928721886253992, + "grad_norm": 0.7925135493278503, + "learning_rate": 0.00013810185464870417, + "loss": 2.6887, + "step": 10251 + }, + { + "epoch": 0.9288124844284388, + "grad_norm": 0.7920452356338501, + "learning_rate": 0.0001380958134477134, + "loss": 2.2107, + "step": 10252 + }, + { + "epoch": 0.9289030826028856, + "grad_norm": 0.8041022419929504, + "learning_rate": 0.00013808977224672266, + "loss": 2.773, + "step": 10253 + }, + { + "epoch": 0.9289936807773324, + "grad_norm": 0.7654824256896973, + "learning_rate": 0.0001380837310457319, + "loss": 2.9578, + "step": 10254 + }, + { + "epoch": 0.9290842789517791, + "grad_norm": 0.7671637535095215, + "learning_rate": 0.00013807768984474113, + "loss": 2.7975, + "step": 10255 + }, + { + "epoch": 0.9291748771262259, + "grad_norm": 0.823297917842865, + "learning_rate": 0.00013807164864375037, + "loss": 2.7693, + "step": 10256 + }, + { + "epoch": 0.9292654753006727, + "grad_norm": 0.7613077163696289, + "learning_rate": 0.00013806560744275963, + "loss": 2.7738, + "step": 10257 + }, + { + "epoch": 0.9293560734751195, + "grad_norm": 0.7899166345596313, + "learning_rate": 0.0001380595662417689, + "loss": 2.93, + "step": 10258 + }, + { + "epoch": 0.9294466716495663, + "grad_norm": 0.8531551957130432, + "learning_rate": 0.00013805352504077812, + "loss": 2.967, + "step": 10259 + }, + { + "epoch": 0.9295372698240131, + "grad_norm": 0.7564404606819153, + "learning_rate": 0.00013804748383978736, + "loss": 2.8733, + "step": 10260 + }, + { + "epoch": 0.9296278679984599, + "grad_norm": 0.759523332118988, + "learning_rate": 0.0001380414426387966, + "loss": 2.7886, + "step": 10261 + }, + { + "epoch": 0.9297184661729067, + "grad_norm": 0.9471216797828674, + "learning_rate": 0.00013803540143780585, + "loss": 2.7279, + "step": 10262 + }, + { + "epoch": 0.9298090643473534, + "grad_norm": 0.7950829267501831, + "learning_rate": 0.00013802936023681508, + "loss": 2.6911, + "step": 10263 + }, + { + "epoch": 0.9298996625218002, + "grad_norm": 0.8334429264068604, + "learning_rate": 0.00013802331903582432, + "loss": 2.8495, + "step": 10264 + }, + { + "epoch": 0.929990260696247, + "grad_norm": 0.7776434421539307, + "learning_rate": 0.00013801727783483358, + "loss": 2.6323, + "step": 10265 + }, + { + "epoch": 0.9300808588706938, + "grad_norm": 0.8716893792152405, + "learning_rate": 0.00013801123663384281, + "loss": 3.0146, + "step": 10266 + }, + { + "epoch": 0.9301714570451406, + "grad_norm": 0.8196095824241638, + "learning_rate": 0.00013800519543285208, + "loss": 3.1598, + "step": 10267 + }, + { + "epoch": 0.9302620552195874, + "grad_norm": 0.8019415140151978, + "learning_rate": 0.00013799915423186128, + "loss": 3.0307, + "step": 10268 + }, + { + "epoch": 0.9303526533940341, + "grad_norm": 0.8568243384361267, + "learning_rate": 0.00013799311303087054, + "loss": 2.7878, + "step": 10269 + }, + { + "epoch": 0.9304432515684808, + "grad_norm": 0.7707356214523315, + "learning_rate": 0.00013798707182987978, + "loss": 2.8151, + "step": 10270 + }, + { + "epoch": 0.9305338497429276, + "grad_norm": 0.8179143071174622, + "learning_rate": 0.00013798103062888904, + "loss": 2.5894, + "step": 10271 + }, + { + "epoch": 0.9306244479173744, + "grad_norm": 0.7708619236946106, + "learning_rate": 0.00013797498942789827, + "loss": 2.6985, + "step": 10272 + }, + { + "epoch": 0.9307150460918212, + "grad_norm": 0.8064430356025696, + "learning_rate": 0.0001379689482269075, + "loss": 2.8183, + "step": 10273 + }, + { + "epoch": 0.930805644266268, + "grad_norm": 0.7804310321807861, + "learning_rate": 0.00013796290702591677, + "loss": 2.8765, + "step": 10274 + }, + { + "epoch": 0.9308962424407148, + "grad_norm": 0.7972733378410339, + "learning_rate": 0.000137956865824926, + "loss": 2.7768, + "step": 10275 + }, + { + "epoch": 0.9309868406151616, + "grad_norm": 0.8030334711074829, + "learning_rate": 0.00013795082462393524, + "loss": 2.6677, + "step": 10276 + }, + { + "epoch": 0.9310774387896084, + "grad_norm": 0.7103569507598877, + "learning_rate": 0.00013794478342294447, + "loss": 1.784, + "step": 10277 + }, + { + "epoch": 0.9311680369640551, + "grad_norm": 0.7206403613090515, + "learning_rate": 0.00013793874222195373, + "loss": 1.9405, + "step": 10278 + }, + { + "epoch": 0.9312586351385019, + "grad_norm": 0.7959842085838318, + "learning_rate": 0.000137932701020963, + "loss": 2.895, + "step": 10279 + }, + { + "epoch": 0.9313492333129487, + "grad_norm": 0.8149707913398743, + "learning_rate": 0.00013792665981997223, + "loss": 2.7609, + "step": 10280 + }, + { + "epoch": 0.9314398314873955, + "grad_norm": 0.8148518800735474, + "learning_rate": 0.00013792061861898146, + "loss": 2.7243, + "step": 10281 + }, + { + "epoch": 0.9315304296618423, + "grad_norm": 0.780959963798523, + "learning_rate": 0.0001379145774179907, + "loss": 2.8971, + "step": 10282 + }, + { + "epoch": 0.9316210278362891, + "grad_norm": 0.8041626811027527, + "learning_rate": 0.00013790853621699996, + "loss": 2.6027, + "step": 10283 + }, + { + "epoch": 0.9317116260107359, + "grad_norm": 0.7879753708839417, + "learning_rate": 0.0001379024950160092, + "loss": 2.8687, + "step": 10284 + }, + { + "epoch": 0.9318022241851827, + "grad_norm": 0.7889615893363953, + "learning_rate": 0.00013789645381501842, + "loss": 3.1116, + "step": 10285 + }, + { + "epoch": 0.9318928223596294, + "grad_norm": 0.7974286675453186, + "learning_rate": 0.00013789041261402768, + "loss": 2.6957, + "step": 10286 + }, + { + "epoch": 0.9319834205340762, + "grad_norm": 0.7600846290588379, + "learning_rate": 0.00013788437141303692, + "loss": 2.5757, + "step": 10287 + }, + { + "epoch": 0.932074018708523, + "grad_norm": 0.802407443523407, + "learning_rate": 0.00013787833021204618, + "loss": 2.7262, + "step": 10288 + }, + { + "epoch": 0.9321646168829698, + "grad_norm": 0.7317636013031006, + "learning_rate": 0.0001378722890110554, + "loss": 2.9256, + "step": 10289 + }, + { + "epoch": 0.9322552150574166, + "grad_norm": 0.7975431084632874, + "learning_rate": 0.00013786624781006465, + "loss": 2.6637, + "step": 10290 + }, + { + "epoch": 0.9323458132318634, + "grad_norm": 0.847955584526062, + "learning_rate": 0.00013786020660907388, + "loss": 2.8268, + "step": 10291 + }, + { + "epoch": 0.9324364114063102, + "grad_norm": 0.772079348564148, + "learning_rate": 0.00013785416540808314, + "loss": 3.0034, + "step": 10292 + }, + { + "epoch": 0.932527009580757, + "grad_norm": 0.7823781371116638, + "learning_rate": 0.00013784812420709238, + "loss": 2.9001, + "step": 10293 + }, + { + "epoch": 0.9326176077552037, + "grad_norm": 0.7627604603767395, + "learning_rate": 0.0001378420830061016, + "loss": 2.6173, + "step": 10294 + }, + { + "epoch": 0.9327082059296505, + "grad_norm": 0.7650439739227295, + "learning_rate": 0.00013783604180511087, + "loss": 2.8952, + "step": 10295 + }, + { + "epoch": 0.9327988041040973, + "grad_norm": 0.7796351909637451, + "learning_rate": 0.0001378300006041201, + "loss": 2.7281, + "step": 10296 + }, + { + "epoch": 0.9328894022785441, + "grad_norm": 0.8195030093193054, + "learning_rate": 0.00013782395940312937, + "loss": 3.1321, + "step": 10297 + }, + { + "epoch": 0.9329800004529909, + "grad_norm": 0.8519221544265747, + "learning_rate": 0.00013781791820213857, + "loss": 2.947, + "step": 10298 + }, + { + "epoch": 0.9330705986274377, + "grad_norm": 0.8209981322288513, + "learning_rate": 0.00013781187700114784, + "loss": 2.8773, + "step": 10299 + }, + { + "epoch": 0.9331611968018845, + "grad_norm": 0.851378858089447, + "learning_rate": 0.00013780583580015707, + "loss": 2.9155, + "step": 10300 + }, + { + "epoch": 0.9332517949763313, + "grad_norm": 0.8254127502441406, + "learning_rate": 0.00013779979459916633, + "loss": 2.93, + "step": 10301 + }, + { + "epoch": 0.933342393150778, + "grad_norm": 0.8231610059738159, + "learning_rate": 0.00013779375339817557, + "loss": 2.8579, + "step": 10302 + }, + { + "epoch": 0.9334329913252248, + "grad_norm": 0.8214370012283325, + "learning_rate": 0.0001377877121971848, + "loss": 2.8936, + "step": 10303 + }, + { + "epoch": 0.9335235894996716, + "grad_norm": 0.7736647725105286, + "learning_rate": 0.00013778167099619406, + "loss": 2.8092, + "step": 10304 + }, + { + "epoch": 0.9336141876741184, + "grad_norm": 0.7665966153144836, + "learning_rate": 0.0001377756297952033, + "loss": 2.8569, + "step": 10305 + }, + { + "epoch": 0.9337047858485652, + "grad_norm": 0.8169408440589905, + "learning_rate": 0.00013776958859421253, + "loss": 2.6875, + "step": 10306 + }, + { + "epoch": 0.933795384023012, + "grad_norm": 0.7446429133415222, + "learning_rate": 0.00013776354739322176, + "loss": 2.5141, + "step": 10307 + }, + { + "epoch": 0.9338859821974588, + "grad_norm": 0.7892894148826599, + "learning_rate": 0.00013775750619223102, + "loss": 2.8537, + "step": 10308 + }, + { + "epoch": 0.9339765803719055, + "grad_norm": 0.7473850250244141, + "learning_rate": 0.00013775146499124029, + "loss": 2.6108, + "step": 10309 + }, + { + "epoch": 0.9340671785463522, + "grad_norm": 0.7554207444190979, + "learning_rate": 0.00013774542379024952, + "loss": 2.7443, + "step": 10310 + }, + { + "epoch": 0.934157776720799, + "grad_norm": 0.8134192824363708, + "learning_rate": 0.00013773938258925875, + "loss": 2.6496, + "step": 10311 + }, + { + "epoch": 0.9342483748952458, + "grad_norm": 0.7544184923171997, + "learning_rate": 0.000137733341388268, + "loss": 2.6215, + "step": 10312 + }, + { + "epoch": 0.9343389730696926, + "grad_norm": 0.7954362034797668, + "learning_rate": 0.00013772730018727725, + "loss": 2.6496, + "step": 10313 + }, + { + "epoch": 0.9344295712441394, + "grad_norm": 0.8525195121765137, + "learning_rate": 0.00013772125898628648, + "loss": 2.8373, + "step": 10314 + }, + { + "epoch": 0.9345201694185862, + "grad_norm": 0.8639864325523376, + "learning_rate": 0.00013771521778529572, + "loss": 2.4813, + "step": 10315 + }, + { + "epoch": 0.934610767593033, + "grad_norm": 0.8299828767776489, + "learning_rate": 0.00013770917658430498, + "loss": 3.0509, + "step": 10316 + }, + { + "epoch": 0.9347013657674798, + "grad_norm": 0.8326935172080994, + "learning_rate": 0.0001377031353833142, + "loss": 2.6975, + "step": 10317 + }, + { + "epoch": 0.9347919639419265, + "grad_norm": 0.851056694984436, + "learning_rate": 0.00013769709418232347, + "loss": 2.9404, + "step": 10318 + }, + { + "epoch": 0.9348825621163733, + "grad_norm": 0.8773520588874817, + "learning_rate": 0.00013769105298133268, + "loss": 3.0437, + "step": 10319 + }, + { + "epoch": 0.9349731602908201, + "grad_norm": 0.7950474619865417, + "learning_rate": 0.00013768501178034194, + "loss": 2.5661, + "step": 10320 + }, + { + "epoch": 0.9350637584652669, + "grad_norm": 0.827488899230957, + "learning_rate": 0.00013767897057935118, + "loss": 2.5422, + "step": 10321 + }, + { + "epoch": 0.9351543566397137, + "grad_norm": 0.7681262493133545, + "learning_rate": 0.00013767292937836044, + "loss": 2.6093, + "step": 10322 + }, + { + "epoch": 0.9352449548141605, + "grad_norm": 0.8289632201194763, + "learning_rate": 0.00013766688817736967, + "loss": 2.7492, + "step": 10323 + }, + { + "epoch": 0.9353355529886073, + "grad_norm": 0.8855257034301758, + "learning_rate": 0.0001376608469763789, + "loss": 2.7464, + "step": 10324 + }, + { + "epoch": 0.935426151163054, + "grad_norm": 0.7578449249267578, + "learning_rate": 0.00013765480577538817, + "loss": 2.7112, + "step": 10325 + }, + { + "epoch": 0.9355167493375008, + "grad_norm": 0.8309325575828552, + "learning_rate": 0.0001376487645743974, + "loss": 2.8457, + "step": 10326 + }, + { + "epoch": 0.9356073475119476, + "grad_norm": 0.8231379389762878, + "learning_rate": 0.00013764272337340663, + "loss": 2.5459, + "step": 10327 + }, + { + "epoch": 0.9356979456863944, + "grad_norm": 0.8089391589164734, + "learning_rate": 0.00013763668217241587, + "loss": 2.8115, + "step": 10328 + }, + { + "epoch": 0.9357885438608412, + "grad_norm": 0.8182677030563354, + "learning_rate": 0.00013763064097142513, + "loss": 3.1252, + "step": 10329 + }, + { + "epoch": 0.935879142035288, + "grad_norm": 0.8453269004821777, + "learning_rate": 0.00013762459977043436, + "loss": 3.0285, + "step": 10330 + }, + { + "epoch": 0.9359697402097348, + "grad_norm": 0.804523766040802, + "learning_rate": 0.00013761855856944362, + "loss": 2.6502, + "step": 10331 + }, + { + "epoch": 0.9360603383841816, + "grad_norm": 0.8455667495727539, + "learning_rate": 0.00013761251736845286, + "loss": 2.9048, + "step": 10332 + }, + { + "epoch": 0.9361509365586284, + "grad_norm": 0.8366347551345825, + "learning_rate": 0.0001376064761674621, + "loss": 2.8206, + "step": 10333 + }, + { + "epoch": 0.9362415347330751, + "grad_norm": 0.7913139462471008, + "learning_rate": 0.00013760043496647135, + "loss": 2.6168, + "step": 10334 + }, + { + "epoch": 0.9363321329075219, + "grad_norm": 0.7650044560432434, + "learning_rate": 0.0001375943937654806, + "loss": 2.6886, + "step": 10335 + }, + { + "epoch": 0.9364227310819687, + "grad_norm": 0.7698338627815247, + "learning_rate": 0.00013758835256448982, + "loss": 2.7032, + "step": 10336 + }, + { + "epoch": 0.9365133292564155, + "grad_norm": 0.7821192741394043, + "learning_rate": 0.00013758231136349906, + "loss": 3.0145, + "step": 10337 + }, + { + "epoch": 0.9366039274308623, + "grad_norm": 0.6823012232780457, + "learning_rate": 0.00013757627016250832, + "loss": 2.1067, + "step": 10338 + }, + { + "epoch": 0.9366945256053091, + "grad_norm": 0.7824718952178955, + "learning_rate": 0.00013757022896151758, + "loss": 2.8686, + "step": 10339 + }, + { + "epoch": 0.9367851237797559, + "grad_norm": 0.83334881067276, + "learning_rate": 0.00013756418776052678, + "loss": 2.78, + "step": 10340 + }, + { + "epoch": 0.9368757219542027, + "grad_norm": 0.8193923830986023, + "learning_rate": 0.00013755814655953605, + "loss": 2.6067, + "step": 10341 + }, + { + "epoch": 0.9369663201286494, + "grad_norm": 0.7564346194267273, + "learning_rate": 0.00013755210535854528, + "loss": 2.921, + "step": 10342 + }, + { + "epoch": 0.9370569183030962, + "grad_norm": 0.8402191996574402, + "learning_rate": 0.00013754606415755454, + "loss": 1.9589, + "step": 10343 + }, + { + "epoch": 0.937147516477543, + "grad_norm": 0.8113667368888855, + "learning_rate": 0.00013754002295656378, + "loss": 2.8674, + "step": 10344 + }, + { + "epoch": 0.9372381146519898, + "grad_norm": 0.841873049736023, + "learning_rate": 0.000137533981755573, + "loss": 2.7217, + "step": 10345 + }, + { + "epoch": 0.9373287128264366, + "grad_norm": 0.6863391995429993, + "learning_rate": 0.00013752794055458227, + "loss": 2.0949, + "step": 10346 + }, + { + "epoch": 0.9374193110008834, + "grad_norm": 0.8754007816314697, + "learning_rate": 0.0001375218993535915, + "loss": 3.1313, + "step": 10347 + }, + { + "epoch": 0.9375099091753302, + "grad_norm": 0.7679219245910645, + "learning_rate": 0.00013751585815260074, + "loss": 2.673, + "step": 10348 + }, + { + "epoch": 0.937600507349777, + "grad_norm": 0.7201882600784302, + "learning_rate": 0.00013750981695160997, + "loss": 2.6082, + "step": 10349 + }, + { + "epoch": 0.9376911055242236, + "grad_norm": 0.7973514199256897, + "learning_rate": 0.00013750377575061923, + "loss": 2.8915, + "step": 10350 + }, + { + "epoch": 0.9377817036986704, + "grad_norm": 0.8413876891136169, + "learning_rate": 0.00013749773454962847, + "loss": 2.9168, + "step": 10351 + }, + { + "epoch": 0.9378723018731172, + "grad_norm": 0.8103439807891846, + "learning_rate": 0.00013749169334863773, + "loss": 3.0168, + "step": 10352 + }, + { + "epoch": 0.937962900047564, + "grad_norm": 0.812526285648346, + "learning_rate": 0.00013748565214764694, + "loss": 2.91, + "step": 10353 + }, + { + "epoch": 0.9380534982220108, + "grad_norm": 0.8702222108840942, + "learning_rate": 0.0001374796109466562, + "loss": 3.0605, + "step": 10354 + }, + { + "epoch": 0.9381440963964576, + "grad_norm": 0.7096233367919922, + "learning_rate": 0.00013747356974566546, + "loss": 2.043, + "step": 10355 + }, + { + "epoch": 0.9382346945709044, + "grad_norm": 0.886917769908905, + "learning_rate": 0.0001374675285446747, + "loss": 2.9091, + "step": 10356 + }, + { + "epoch": 0.9383252927453511, + "grad_norm": 0.8646842837333679, + "learning_rate": 0.00013746148734368393, + "loss": 2.7613, + "step": 10357 + }, + { + "epoch": 0.9384158909197979, + "grad_norm": 0.79775470495224, + "learning_rate": 0.00013745544614269316, + "loss": 2.4592, + "step": 10358 + }, + { + "epoch": 0.9385064890942447, + "grad_norm": 0.5590262413024902, + "learning_rate": 0.00013744940494170242, + "loss": 1.2409, + "step": 10359 + }, + { + "epoch": 0.9385970872686915, + "grad_norm": 0.7259834408760071, + "learning_rate": 0.00013744336374071166, + "loss": 2.6376, + "step": 10360 + }, + { + "epoch": 0.9386876854431383, + "grad_norm": 0.6707054376602173, + "learning_rate": 0.0001374373225397209, + "loss": 2.0496, + "step": 10361 + }, + { + "epoch": 0.9387782836175851, + "grad_norm": 0.9487971663475037, + "learning_rate": 0.00013743128133873015, + "loss": 2.9925, + "step": 10362 + }, + { + "epoch": 0.9388688817920319, + "grad_norm": 0.8371861577033997, + "learning_rate": 0.00013742524013773938, + "loss": 3.012, + "step": 10363 + }, + { + "epoch": 0.9389594799664787, + "grad_norm": 0.8453049063682556, + "learning_rate": 0.00013741919893674865, + "loss": 2.9415, + "step": 10364 + }, + { + "epoch": 0.9390500781409254, + "grad_norm": 0.6777663230895996, + "learning_rate": 0.00013741315773575788, + "loss": 2.2098, + "step": 10365 + }, + { + "epoch": 0.9391406763153722, + "grad_norm": 0.8175214529037476, + "learning_rate": 0.00013740711653476711, + "loss": 2.8797, + "step": 10366 + }, + { + "epoch": 0.939231274489819, + "grad_norm": 0.7891516089439392, + "learning_rate": 0.00013740107533377635, + "loss": 2.7909, + "step": 10367 + }, + { + "epoch": 0.9393218726642658, + "grad_norm": 0.7890466451644897, + "learning_rate": 0.0001373950341327856, + "loss": 2.8082, + "step": 10368 + }, + { + "epoch": 0.9394124708387126, + "grad_norm": 0.7894207239151001, + "learning_rate": 0.00013738899293179487, + "loss": 2.9981, + "step": 10369 + }, + { + "epoch": 0.9395030690131594, + "grad_norm": 0.7943523526191711, + "learning_rate": 0.00013738295173080408, + "loss": 2.7112, + "step": 10370 + }, + { + "epoch": 0.9395936671876062, + "grad_norm": 0.782502293586731, + "learning_rate": 0.00013737691052981334, + "loss": 2.6963, + "step": 10371 + }, + { + "epoch": 0.939684265362053, + "grad_norm": 0.8196326494216919, + "learning_rate": 0.00013737086932882257, + "loss": 2.9006, + "step": 10372 + }, + { + "epoch": 0.9397748635364997, + "grad_norm": 0.8915473222732544, + "learning_rate": 0.00013736482812783183, + "loss": 2.6614, + "step": 10373 + }, + { + "epoch": 0.9398654617109465, + "grad_norm": 0.7683588862419128, + "learning_rate": 0.00013735878692684107, + "loss": 2.796, + "step": 10374 + }, + { + "epoch": 0.9399560598853933, + "grad_norm": 0.7888836860656738, + "learning_rate": 0.0001373527457258503, + "loss": 2.69, + "step": 10375 + }, + { + "epoch": 0.9400466580598401, + "grad_norm": 0.7745002508163452, + "learning_rate": 0.00013734670452485956, + "loss": 2.825, + "step": 10376 + }, + { + "epoch": 0.9401372562342869, + "grad_norm": 0.7470067739486694, + "learning_rate": 0.0001373406633238688, + "loss": 2.1674, + "step": 10377 + }, + { + "epoch": 0.9402278544087337, + "grad_norm": 0.8644087910652161, + "learning_rate": 0.00013733462212287803, + "loss": 2.812, + "step": 10378 + }, + { + "epoch": 0.9403184525831805, + "grad_norm": 0.872871994972229, + "learning_rate": 0.00013732858092188727, + "loss": 2.6458, + "step": 10379 + }, + { + "epoch": 0.9404090507576273, + "grad_norm": 0.9277419447898865, + "learning_rate": 0.00013732253972089653, + "loss": 2.913, + "step": 10380 + }, + { + "epoch": 0.940499648932074, + "grad_norm": 0.8034663796424866, + "learning_rate": 0.00013731649851990576, + "loss": 2.7611, + "step": 10381 + }, + { + "epoch": 0.9405902471065208, + "grad_norm": 0.783866822719574, + "learning_rate": 0.00013731045731891502, + "loss": 2.7592, + "step": 10382 + }, + { + "epoch": 0.9406808452809676, + "grad_norm": 0.7590939402580261, + "learning_rate": 0.00013730441611792423, + "loss": 2.4679, + "step": 10383 + }, + { + "epoch": 0.9407714434554144, + "grad_norm": 0.8990957736968994, + "learning_rate": 0.0001372983749169335, + "loss": 2.8069, + "step": 10384 + }, + { + "epoch": 0.9408620416298612, + "grad_norm": 0.9510653614997864, + "learning_rate": 0.00013729233371594275, + "loss": 2.6823, + "step": 10385 + }, + { + "epoch": 0.940952639804308, + "grad_norm": 0.7884221076965332, + "learning_rate": 0.00013728629251495198, + "loss": 2.7773, + "step": 10386 + }, + { + "epoch": 0.9410432379787548, + "grad_norm": 0.7839957475662231, + "learning_rate": 0.00013728025131396122, + "loss": 2.8467, + "step": 10387 + }, + { + "epoch": 0.9411338361532016, + "grad_norm": 0.7892153263092041, + "learning_rate": 0.00013727421011297045, + "loss": 2.5711, + "step": 10388 + }, + { + "epoch": 0.9412244343276484, + "grad_norm": 0.8256089687347412, + "learning_rate": 0.00013726816891197971, + "loss": 3.0223, + "step": 10389 + }, + { + "epoch": 0.941315032502095, + "grad_norm": 0.8109915256500244, + "learning_rate": 0.00013726212771098895, + "loss": 2.8508, + "step": 10390 + }, + { + "epoch": 0.9414056306765418, + "grad_norm": 0.8321003913879395, + "learning_rate": 0.00013725608650999818, + "loss": 2.7717, + "step": 10391 + }, + { + "epoch": 0.9414962288509886, + "grad_norm": 0.7687034010887146, + "learning_rate": 0.00013725004530900744, + "loss": 2.7269, + "step": 10392 + }, + { + "epoch": 0.9415868270254354, + "grad_norm": 0.7841367125511169, + "learning_rate": 0.00013724400410801668, + "loss": 2.6396, + "step": 10393 + }, + { + "epoch": 0.9416774251998822, + "grad_norm": 0.8433821797370911, + "learning_rate": 0.00013723796290702594, + "loss": 2.9695, + "step": 10394 + }, + { + "epoch": 0.941768023374329, + "grad_norm": 0.6576271653175354, + "learning_rate": 0.00013723192170603517, + "loss": 2.1145, + "step": 10395 + }, + { + "epoch": 0.9418586215487758, + "grad_norm": 0.8416181802749634, + "learning_rate": 0.0001372258805050444, + "loss": 2.7254, + "step": 10396 + }, + { + "epoch": 0.9419492197232225, + "grad_norm": 0.7508403658866882, + "learning_rate": 0.00013721983930405364, + "loss": 2.7395, + "step": 10397 + }, + { + "epoch": 0.9420398178976693, + "grad_norm": 0.7608640193939209, + "learning_rate": 0.0001372137981030629, + "loss": 2.5713, + "step": 10398 + }, + { + "epoch": 0.9421304160721161, + "grad_norm": 0.8185020089149475, + "learning_rate": 0.00013720775690207214, + "loss": 2.8613, + "step": 10399 + }, + { + "epoch": 0.9422210142465629, + "grad_norm": 0.7712879180908203, + "learning_rate": 0.00013720171570108137, + "loss": 2.8002, + "step": 10400 + }, + { + "epoch": 0.9423116124210097, + "grad_norm": 0.8564422130584717, + "learning_rate": 0.00013719567450009063, + "loss": 2.6989, + "step": 10401 + }, + { + "epoch": 0.9424022105954565, + "grad_norm": 0.7953861951828003, + "learning_rate": 0.00013718963329909987, + "loss": 2.566, + "step": 10402 + }, + { + "epoch": 0.9424928087699033, + "grad_norm": 0.6937279105186462, + "learning_rate": 0.00013718359209810913, + "loss": 2.0983, + "step": 10403 + }, + { + "epoch": 0.94258340694435, + "grad_norm": 0.7956683039665222, + "learning_rate": 0.00013717755089711833, + "loss": 2.193, + "step": 10404 + }, + { + "epoch": 0.9426740051187968, + "grad_norm": 0.7833611965179443, + "learning_rate": 0.0001371715096961276, + "loss": 2.8529, + "step": 10405 + }, + { + "epoch": 0.9427646032932436, + "grad_norm": 0.7152555584907532, + "learning_rate": 0.00013716546849513686, + "loss": 2.1598, + "step": 10406 + }, + { + "epoch": 0.9428552014676904, + "grad_norm": 0.8051909804344177, + "learning_rate": 0.0001371594272941461, + "loss": 2.7154, + "step": 10407 + }, + { + "epoch": 0.9429457996421372, + "grad_norm": 0.9348576664924622, + "learning_rate": 0.00013715338609315532, + "loss": 3.0226, + "step": 10408 + }, + { + "epoch": 0.943036397816584, + "grad_norm": 0.836995542049408, + "learning_rate": 0.00013714734489216456, + "loss": 2.7928, + "step": 10409 + }, + { + "epoch": 0.9431269959910308, + "grad_norm": 0.8769822120666504, + "learning_rate": 0.00013714130369117382, + "loss": 2.6881, + "step": 10410 + }, + { + "epoch": 0.9432175941654776, + "grad_norm": 0.7951906323432922, + "learning_rate": 0.00013713526249018305, + "loss": 2.7927, + "step": 10411 + }, + { + "epoch": 0.9433081923399244, + "grad_norm": 0.864568829536438, + "learning_rate": 0.0001371292212891923, + "loss": 2.8448, + "step": 10412 + }, + { + "epoch": 0.9433987905143711, + "grad_norm": 0.8248215913772583, + "learning_rate": 0.00013712318008820152, + "loss": 2.5718, + "step": 10413 + }, + { + "epoch": 0.9434893886888179, + "grad_norm": 0.7829052805900574, + "learning_rate": 0.00013711713888721078, + "loss": 2.5481, + "step": 10414 + }, + { + "epoch": 0.9435799868632647, + "grad_norm": 0.780785083770752, + "learning_rate": 0.00013711109768622004, + "loss": 2.5532, + "step": 10415 + }, + { + "epoch": 0.9436705850377115, + "grad_norm": 0.842930793762207, + "learning_rate": 0.00013710505648522928, + "loss": 2.9597, + "step": 10416 + }, + { + "epoch": 0.9437611832121583, + "grad_norm": 0.787669837474823, + "learning_rate": 0.0001370990152842385, + "loss": 2.6769, + "step": 10417 + }, + { + "epoch": 0.9438517813866051, + "grad_norm": 0.7783954739570618, + "learning_rate": 0.00013709297408324775, + "loss": 2.7556, + "step": 10418 + }, + { + "epoch": 0.9439423795610519, + "grad_norm": 0.7573541402816772, + "learning_rate": 0.000137086932882257, + "loss": 2.5964, + "step": 10419 + }, + { + "epoch": 0.9440329777354987, + "grad_norm": 0.8118947148323059, + "learning_rate": 0.00013708089168126624, + "loss": 2.7224, + "step": 10420 + }, + { + "epoch": 0.9441235759099454, + "grad_norm": 0.7633313536643982, + "learning_rate": 0.00013707485048027547, + "loss": 2.6083, + "step": 10421 + }, + { + "epoch": 0.9442141740843922, + "grad_norm": 0.8653739094734192, + "learning_rate": 0.00013706880927928474, + "loss": 2.9771, + "step": 10422 + }, + { + "epoch": 0.944304772258839, + "grad_norm": 0.8854790925979614, + "learning_rate": 0.00013706276807829397, + "loss": 2.7831, + "step": 10423 + }, + { + "epoch": 0.9443953704332858, + "grad_norm": 0.8387234210968018, + "learning_rate": 0.00013705672687730323, + "loss": 2.7794, + "step": 10424 + }, + { + "epoch": 0.9444859686077326, + "grad_norm": 0.8061969876289368, + "learning_rate": 0.00013705068567631244, + "loss": 3.0242, + "step": 10425 + }, + { + "epoch": 0.9445765667821794, + "grad_norm": 0.8371800184249878, + "learning_rate": 0.0001370446444753217, + "loss": 2.7171, + "step": 10426 + }, + { + "epoch": 0.9446671649566262, + "grad_norm": 0.8136566281318665, + "learning_rate": 0.00013703860327433093, + "loss": 2.5907, + "step": 10427 + }, + { + "epoch": 0.944757763131073, + "grad_norm": 0.8467051982879639, + "learning_rate": 0.0001370325620733402, + "loss": 2.7223, + "step": 10428 + }, + { + "epoch": 0.9448483613055197, + "grad_norm": 0.8094639778137207, + "learning_rate": 0.00013702652087234943, + "loss": 2.9014, + "step": 10429 + }, + { + "epoch": 0.9449389594799665, + "grad_norm": 0.8291431665420532, + "learning_rate": 0.00013702047967135866, + "loss": 2.6101, + "step": 10430 + }, + { + "epoch": 0.9450295576544132, + "grad_norm": 0.7705879211425781, + "learning_rate": 0.00013701443847036792, + "loss": 2.8261, + "step": 10431 + }, + { + "epoch": 0.94512015582886, + "grad_norm": 0.8075419664382935, + "learning_rate": 0.00013700839726937716, + "loss": 2.7694, + "step": 10432 + }, + { + "epoch": 0.9452107540033068, + "grad_norm": 0.8352501392364502, + "learning_rate": 0.00013700235606838642, + "loss": 2.6528, + "step": 10433 + }, + { + "epoch": 0.9453013521777536, + "grad_norm": 0.782659649848938, + "learning_rate": 0.00013699631486739563, + "loss": 2.8257, + "step": 10434 + }, + { + "epoch": 0.9453919503522004, + "grad_norm": 0.8470736742019653, + "learning_rate": 0.0001369902736664049, + "loss": 2.7133, + "step": 10435 + }, + { + "epoch": 0.9454825485266471, + "grad_norm": 0.8179614543914795, + "learning_rate": 0.00013698423246541415, + "loss": 3.099, + "step": 10436 + }, + { + "epoch": 0.9455731467010939, + "grad_norm": 0.830272376537323, + "learning_rate": 0.00013697819126442338, + "loss": 2.937, + "step": 10437 + }, + { + "epoch": 0.9456637448755407, + "grad_norm": 0.716139018535614, + "learning_rate": 0.00013697215006343262, + "loss": 2.0502, + "step": 10438 + }, + { + "epoch": 0.9457543430499875, + "grad_norm": 0.838762640953064, + "learning_rate": 0.00013696610886244185, + "loss": 2.92, + "step": 10439 + }, + { + "epoch": 0.9458449412244343, + "grad_norm": 1.0977299213409424, + "learning_rate": 0.0001369600676614511, + "loss": 3.0283, + "step": 10440 + }, + { + "epoch": 0.9459355393988811, + "grad_norm": 0.7682933807373047, + "learning_rate": 0.00013695402646046035, + "loss": 2.7218, + "step": 10441 + }, + { + "epoch": 0.9460261375733279, + "grad_norm": 0.7924580574035645, + "learning_rate": 0.00013694798525946958, + "loss": 2.6628, + "step": 10442 + }, + { + "epoch": 0.9461167357477747, + "grad_norm": 0.789049506187439, + "learning_rate": 0.00013694194405847881, + "loss": 2.7429, + "step": 10443 + }, + { + "epoch": 0.9462073339222214, + "grad_norm": 0.818247377872467, + "learning_rate": 0.00013693590285748807, + "loss": 2.7692, + "step": 10444 + }, + { + "epoch": 0.9462979320966682, + "grad_norm": 0.7755863070487976, + "learning_rate": 0.00013692986165649734, + "loss": 2.0902, + "step": 10445 + }, + { + "epoch": 0.946388530271115, + "grad_norm": 0.7845357656478882, + "learning_rate": 0.00013692382045550657, + "loss": 2.8331, + "step": 10446 + }, + { + "epoch": 0.9464791284455618, + "grad_norm": 0.7844014763832092, + "learning_rate": 0.0001369177792545158, + "loss": 3.0754, + "step": 10447 + }, + { + "epoch": 0.9465697266200086, + "grad_norm": 0.7645691633224487, + "learning_rate": 0.00013691173805352504, + "loss": 2.7101, + "step": 10448 + }, + { + "epoch": 0.9466603247944554, + "grad_norm": 0.7749740481376648, + "learning_rate": 0.0001369056968525343, + "loss": 2.3177, + "step": 10449 + }, + { + "epoch": 0.9467509229689022, + "grad_norm": 0.7974615693092346, + "learning_rate": 0.00013689965565154353, + "loss": 2.6728, + "step": 10450 + }, + { + "epoch": 0.946841521143349, + "grad_norm": 0.779606819152832, + "learning_rate": 0.00013689361445055277, + "loss": 2.9181, + "step": 10451 + }, + { + "epoch": 0.9469321193177958, + "grad_norm": 0.7987850904464722, + "learning_rate": 0.00013688757324956203, + "loss": 2.6966, + "step": 10452 + }, + { + "epoch": 0.9470227174922425, + "grad_norm": 0.8482074737548828, + "learning_rate": 0.00013688153204857126, + "loss": 2.9492, + "step": 10453 + }, + { + "epoch": 0.9471133156666893, + "grad_norm": 0.8589617609977722, + "learning_rate": 0.00013687549084758052, + "loss": 2.6813, + "step": 10454 + }, + { + "epoch": 0.9472039138411361, + "grad_norm": 0.7753381729125977, + "learning_rate": 0.00013686944964658973, + "loss": 2.787, + "step": 10455 + }, + { + "epoch": 0.9472945120155829, + "grad_norm": 0.804135799407959, + "learning_rate": 0.000136863408445599, + "loss": 2.9719, + "step": 10456 + }, + { + "epoch": 0.9473851101900297, + "grad_norm": 0.8060266971588135, + "learning_rate": 0.00013685736724460823, + "loss": 2.9005, + "step": 10457 + }, + { + "epoch": 0.9474757083644765, + "grad_norm": 0.8053033947944641, + "learning_rate": 0.0001368513260436175, + "loss": 2.8522, + "step": 10458 + }, + { + "epoch": 0.9475663065389233, + "grad_norm": 0.8805874586105347, + "learning_rate": 0.00013684528484262672, + "loss": 3.1324, + "step": 10459 + }, + { + "epoch": 0.94765690471337, + "grad_norm": 0.7861425876617432, + "learning_rate": 0.00013683924364163596, + "loss": 2.7691, + "step": 10460 + }, + { + "epoch": 0.9477475028878168, + "grad_norm": 0.8137229681015015, + "learning_rate": 0.00013683320244064522, + "loss": 2.5603, + "step": 10461 + }, + { + "epoch": 0.9478381010622636, + "grad_norm": 0.8343732357025146, + "learning_rate": 0.00013682716123965445, + "loss": 2.8755, + "step": 10462 + }, + { + "epoch": 0.9479286992367104, + "grad_norm": 0.7666659951210022, + "learning_rate": 0.00013682112003866368, + "loss": 2.5404, + "step": 10463 + }, + { + "epoch": 0.9480192974111572, + "grad_norm": 0.8774640560150146, + "learning_rate": 0.00013681507883767292, + "loss": 2.6961, + "step": 10464 + }, + { + "epoch": 0.948109895585604, + "grad_norm": 1.0117642879486084, + "learning_rate": 0.00013680903763668218, + "loss": 2.5064, + "step": 10465 + }, + { + "epoch": 0.9482004937600508, + "grad_norm": 0.815909206867218, + "learning_rate": 0.00013680299643569144, + "loss": 2.8922, + "step": 10466 + }, + { + "epoch": 0.9482910919344976, + "grad_norm": 0.8691501021385193, + "learning_rate": 0.00013679695523470067, + "loss": 2.9341, + "step": 10467 + }, + { + "epoch": 0.9483816901089444, + "grad_norm": 0.9030426144599915, + "learning_rate": 0.0001367909140337099, + "loss": 3.029, + "step": 10468 + }, + { + "epoch": 0.9484722882833911, + "grad_norm": 0.8112939596176147, + "learning_rate": 0.00013678487283271914, + "loss": 2.7656, + "step": 10469 + }, + { + "epoch": 0.9485628864578379, + "grad_norm": 0.66029953956604, + "learning_rate": 0.0001367788316317284, + "loss": 1.9066, + "step": 10470 + }, + { + "epoch": 0.9486534846322846, + "grad_norm": 0.7678027153015137, + "learning_rate": 0.00013677279043073764, + "loss": 2.1301, + "step": 10471 + }, + { + "epoch": 0.9487440828067314, + "grad_norm": 0.7546448111534119, + "learning_rate": 0.00013676674922974687, + "loss": 2.6774, + "step": 10472 + }, + { + "epoch": 0.9488346809811782, + "grad_norm": 0.7077309489250183, + "learning_rate": 0.0001367607080287561, + "loss": 2.0213, + "step": 10473 + }, + { + "epoch": 0.948925279155625, + "grad_norm": 0.7126147150993347, + "learning_rate": 0.00013675466682776537, + "loss": 2.7377, + "step": 10474 + }, + { + "epoch": 0.9490158773300718, + "grad_norm": 0.7567920088768005, + "learning_rate": 0.00013674862562677463, + "loss": 2.7845, + "step": 10475 + }, + { + "epoch": 0.9491064755045185, + "grad_norm": 0.7645745873451233, + "learning_rate": 0.00013674258442578384, + "loss": 2.5663, + "step": 10476 + }, + { + "epoch": 0.9491970736789653, + "grad_norm": 0.7149943113327026, + "learning_rate": 0.0001367365432247931, + "loss": 1.93, + "step": 10477 + }, + { + "epoch": 0.9492876718534121, + "grad_norm": 0.808320164680481, + "learning_rate": 0.00013673050202380233, + "loss": 2.5728, + "step": 10478 + }, + { + "epoch": 0.9493782700278589, + "grad_norm": 0.8850331902503967, + "learning_rate": 0.0001367244608228116, + "loss": 2.8129, + "step": 10479 + }, + { + "epoch": 0.9494688682023057, + "grad_norm": 0.8271325826644897, + "learning_rate": 0.00013671841962182083, + "loss": 2.8217, + "step": 10480 + }, + { + "epoch": 0.9495594663767525, + "grad_norm": 0.8479661345481873, + "learning_rate": 0.00013671237842083006, + "loss": 2.7694, + "step": 10481 + }, + { + "epoch": 0.9496500645511993, + "grad_norm": 0.7633600234985352, + "learning_rate": 0.00013670633721983932, + "loss": 2.6464, + "step": 10482 + }, + { + "epoch": 0.9497406627256461, + "grad_norm": 0.7770676016807556, + "learning_rate": 0.00013670029601884856, + "loss": 2.8962, + "step": 10483 + }, + { + "epoch": 0.9498312609000928, + "grad_norm": 0.8073007464408875, + "learning_rate": 0.00013669425481785782, + "loss": 3.0519, + "step": 10484 + }, + { + "epoch": 0.9499218590745396, + "grad_norm": 0.8029835820198059, + "learning_rate": 0.00013668821361686702, + "loss": 2.7227, + "step": 10485 + }, + { + "epoch": 0.9500124572489864, + "grad_norm": 0.8175754547119141, + "learning_rate": 0.00013668217241587628, + "loss": 3.0325, + "step": 10486 + }, + { + "epoch": 0.9501030554234332, + "grad_norm": 0.8598646521568298, + "learning_rate": 0.00013667613121488552, + "loss": 2.8978, + "step": 10487 + }, + { + "epoch": 0.95019365359788, + "grad_norm": 0.8266860842704773, + "learning_rate": 0.00013667009001389478, + "loss": 2.8726, + "step": 10488 + }, + { + "epoch": 0.9502842517723268, + "grad_norm": 0.8203237652778625, + "learning_rate": 0.00013666404881290401, + "loss": 2.9854, + "step": 10489 + }, + { + "epoch": 0.9503748499467736, + "grad_norm": 0.8049030303955078, + "learning_rate": 0.00013665800761191325, + "loss": 3.1041, + "step": 10490 + }, + { + "epoch": 0.9504654481212204, + "grad_norm": 0.7920880913734436, + "learning_rate": 0.0001366519664109225, + "loss": 2.6223, + "step": 10491 + }, + { + "epoch": 0.9505560462956671, + "grad_norm": 0.7089594006538391, + "learning_rate": 0.00013664592520993174, + "loss": 2.0884, + "step": 10492 + }, + { + "epoch": 0.9506466444701139, + "grad_norm": 0.8257433176040649, + "learning_rate": 0.00013663988400894098, + "loss": 2.9058, + "step": 10493 + }, + { + "epoch": 0.9507372426445607, + "grad_norm": 0.8080734014511108, + "learning_rate": 0.0001366338428079502, + "loss": 2.6304, + "step": 10494 + }, + { + "epoch": 0.9508278408190075, + "grad_norm": 0.8538564443588257, + "learning_rate": 0.00013662780160695947, + "loss": 2.8215, + "step": 10495 + }, + { + "epoch": 0.9509184389934543, + "grad_norm": 0.8017534017562866, + "learning_rate": 0.00013662176040596873, + "loss": 2.877, + "step": 10496 + }, + { + "epoch": 0.9510090371679011, + "grad_norm": 0.8427301049232483, + "learning_rate": 0.00013661571920497797, + "loss": 2.9248, + "step": 10497 + }, + { + "epoch": 0.9510996353423479, + "grad_norm": 0.8536628484725952, + "learning_rate": 0.0001366096780039872, + "loss": 2.7975, + "step": 10498 + }, + { + "epoch": 0.9511902335167947, + "grad_norm": 0.6848964095115662, + "learning_rate": 0.00013660363680299644, + "loss": 2.2583, + "step": 10499 + }, + { + "epoch": 0.9512808316912414, + "grad_norm": 0.784696102142334, + "learning_rate": 0.0001365975956020057, + "loss": 2.8217, + "step": 10500 + }, + { + "epoch": 0.9513714298656882, + "grad_norm": 0.8949984908103943, + "learning_rate": 0.00013659155440101493, + "loss": 2.8295, + "step": 10501 + }, + { + "epoch": 0.951462028040135, + "grad_norm": 0.7711756825447083, + "learning_rate": 0.00013658551320002416, + "loss": 2.8692, + "step": 10502 + }, + { + "epoch": 0.9515526262145818, + "grad_norm": 0.8007754683494568, + "learning_rate": 0.0001365794719990334, + "loss": 2.6601, + "step": 10503 + }, + { + "epoch": 0.9516432243890286, + "grad_norm": 0.8547306656837463, + "learning_rate": 0.00013657343079804266, + "loss": 2.7645, + "step": 10504 + }, + { + "epoch": 0.9517338225634754, + "grad_norm": 0.8388234376907349, + "learning_rate": 0.00013656738959705192, + "loss": 2.9429, + "step": 10505 + }, + { + "epoch": 0.9518244207379222, + "grad_norm": 0.7715099453926086, + "learning_rate": 0.00013656134839606113, + "loss": 2.6957, + "step": 10506 + }, + { + "epoch": 0.951915018912369, + "grad_norm": 0.853263258934021, + "learning_rate": 0.0001365553071950704, + "loss": 3.1494, + "step": 10507 + }, + { + "epoch": 0.9520056170868157, + "grad_norm": 0.7820144295692444, + "learning_rate": 0.00013654926599407962, + "loss": 2.7665, + "step": 10508 + }, + { + "epoch": 0.9520962152612625, + "grad_norm": 0.80782151222229, + "learning_rate": 0.00013654322479308888, + "loss": 2.7721, + "step": 10509 + }, + { + "epoch": 0.9521868134357093, + "grad_norm": 0.7841374278068542, + "learning_rate": 0.00013653718359209812, + "loss": 2.8778, + "step": 10510 + }, + { + "epoch": 0.9522774116101561, + "grad_norm": 0.7253906726837158, + "learning_rate": 0.00013653114239110735, + "loss": 2.0385, + "step": 10511 + }, + { + "epoch": 0.9523680097846028, + "grad_norm": 0.7876873016357422, + "learning_rate": 0.00013652510119011661, + "loss": 2.9568, + "step": 10512 + }, + { + "epoch": 0.9524586079590496, + "grad_norm": 0.825602650642395, + "learning_rate": 0.00013651905998912585, + "loss": 2.9698, + "step": 10513 + }, + { + "epoch": 0.9525492061334964, + "grad_norm": 0.7756446599960327, + "learning_rate": 0.00013651301878813508, + "loss": 2.2708, + "step": 10514 + }, + { + "epoch": 0.9526398043079431, + "grad_norm": 0.7763016223907471, + "learning_rate": 0.00013650697758714432, + "loss": 2.8647, + "step": 10515 + }, + { + "epoch": 0.9527304024823899, + "grad_norm": 0.7941663265228271, + "learning_rate": 0.00013650093638615358, + "loss": 2.8401, + "step": 10516 + }, + { + "epoch": 0.9528210006568367, + "grad_norm": 1.262839913368225, + "learning_rate": 0.0001364948951851628, + "loss": 2.8753, + "step": 10517 + }, + { + "epoch": 0.9529115988312835, + "grad_norm": 0.8013337850570679, + "learning_rate": 0.00013648885398417207, + "loss": 2.824, + "step": 10518 + }, + { + "epoch": 0.9530021970057303, + "grad_norm": 0.8211619853973389, + "learning_rate": 0.0001364828127831813, + "loss": 2.7101, + "step": 10519 + }, + { + "epoch": 0.9530927951801771, + "grad_norm": 0.6983921527862549, + "learning_rate": 0.00013647677158219054, + "loss": 2.1754, + "step": 10520 + }, + { + "epoch": 0.9531833933546239, + "grad_norm": 0.8133074045181274, + "learning_rate": 0.0001364707303811998, + "loss": 2.7801, + "step": 10521 + }, + { + "epoch": 0.9532739915290707, + "grad_norm": 0.858967661857605, + "learning_rate": 0.00013646468918020904, + "loss": 2.8631, + "step": 10522 + }, + { + "epoch": 0.9533645897035175, + "grad_norm": 0.8206047415733337, + "learning_rate": 0.00013645864797921827, + "loss": 2.8838, + "step": 10523 + }, + { + "epoch": 0.9534551878779642, + "grad_norm": 0.7510297894477844, + "learning_rate": 0.0001364526067782275, + "loss": 2.8916, + "step": 10524 + }, + { + "epoch": 0.953545786052411, + "grad_norm": 0.7883956432342529, + "learning_rate": 0.00013644656557723677, + "loss": 2.6811, + "step": 10525 + }, + { + "epoch": 0.9536363842268578, + "grad_norm": 0.8853307962417603, + "learning_rate": 0.00013644052437624603, + "loss": 2.8318, + "step": 10526 + }, + { + "epoch": 0.9537269824013046, + "grad_norm": 0.8299819231033325, + "learning_rate": 0.00013643448317525523, + "loss": 2.9319, + "step": 10527 + }, + { + "epoch": 0.9538175805757514, + "grad_norm": 0.8564929366111755, + "learning_rate": 0.0001364284419742645, + "loss": 2.8957, + "step": 10528 + }, + { + "epoch": 0.9539081787501982, + "grad_norm": 0.8481623530387878, + "learning_rate": 0.00013642240077327373, + "loss": 2.4536, + "step": 10529 + }, + { + "epoch": 0.953998776924645, + "grad_norm": 0.8010044693946838, + "learning_rate": 0.000136416359572283, + "loss": 3.0985, + "step": 10530 + }, + { + "epoch": 0.9540893750990918, + "grad_norm": 0.8123490214347839, + "learning_rate": 0.00013641031837129222, + "loss": 2.846, + "step": 10531 + }, + { + "epoch": 0.9541799732735385, + "grad_norm": 0.7482290863990784, + "learning_rate": 0.00013640427717030146, + "loss": 2.6465, + "step": 10532 + }, + { + "epoch": 0.9542705714479853, + "grad_norm": 0.7387469410896301, + "learning_rate": 0.0001363982359693107, + "loss": 2.5772, + "step": 10533 + }, + { + "epoch": 0.9543611696224321, + "grad_norm": 0.8182982206344604, + "learning_rate": 0.00013639219476831995, + "loss": 2.6943, + "step": 10534 + }, + { + "epoch": 0.9544517677968789, + "grad_norm": 0.775985062122345, + "learning_rate": 0.0001363861535673292, + "loss": 2.8635, + "step": 10535 + }, + { + "epoch": 0.9545423659713257, + "grad_norm": 0.8505902886390686, + "learning_rate": 0.00013638011236633842, + "loss": 2.7521, + "step": 10536 + }, + { + "epoch": 0.9546329641457725, + "grad_norm": 0.7844703793525696, + "learning_rate": 0.00013637407116534768, + "loss": 2.8477, + "step": 10537 + }, + { + "epoch": 0.9547235623202193, + "grad_norm": 0.8266549706459045, + "learning_rate": 0.00013636802996435692, + "loss": 3.0243, + "step": 10538 + }, + { + "epoch": 0.954814160494666, + "grad_norm": 0.8272948265075684, + "learning_rate": 0.00013636198876336618, + "loss": 2.6609, + "step": 10539 + }, + { + "epoch": 0.9549047586691128, + "grad_norm": 0.7754067778587341, + "learning_rate": 0.00013635594756237538, + "loss": 2.7776, + "step": 10540 + }, + { + "epoch": 0.9549953568435596, + "grad_norm": 0.8425865173339844, + "learning_rate": 0.00013634990636138465, + "loss": 2.8545, + "step": 10541 + }, + { + "epoch": 0.9550859550180064, + "grad_norm": 0.8010498285293579, + "learning_rate": 0.0001363438651603939, + "loss": 2.8495, + "step": 10542 + }, + { + "epoch": 0.9551765531924532, + "grad_norm": 0.8909648656845093, + "learning_rate": 0.00013633782395940314, + "loss": 2.6806, + "step": 10543 + }, + { + "epoch": 0.9552671513669, + "grad_norm": 0.7409950494766235, + "learning_rate": 0.00013633178275841237, + "loss": 2.5358, + "step": 10544 + }, + { + "epoch": 0.9553577495413468, + "grad_norm": 0.8147101998329163, + "learning_rate": 0.0001363257415574216, + "loss": 2.7973, + "step": 10545 + }, + { + "epoch": 0.9554483477157936, + "grad_norm": 0.7798152565956116, + "learning_rate": 0.00013631970035643087, + "loss": 2.8509, + "step": 10546 + }, + { + "epoch": 0.9555389458902404, + "grad_norm": 0.8344205617904663, + "learning_rate": 0.0001363136591554401, + "loss": 2.8491, + "step": 10547 + }, + { + "epoch": 0.9556295440646871, + "grad_norm": 0.8173433542251587, + "learning_rate": 0.00013630761795444934, + "loss": 2.7533, + "step": 10548 + }, + { + "epoch": 0.9557201422391339, + "grad_norm": 0.7991475462913513, + "learning_rate": 0.0001363015767534586, + "loss": 2.8808, + "step": 10549 + }, + { + "epoch": 0.9558107404135807, + "grad_norm": 0.9022144675254822, + "learning_rate": 0.00013629553555246783, + "loss": 2.6358, + "step": 10550 + }, + { + "epoch": 0.9559013385880275, + "grad_norm": 0.7881503105163574, + "learning_rate": 0.0001362894943514771, + "loss": 2.6183, + "step": 10551 + }, + { + "epoch": 0.9559919367624742, + "grad_norm": 0.6965592503547668, + "learning_rate": 0.00013628345315048633, + "loss": 1.9886, + "step": 10552 + }, + { + "epoch": 0.956082534936921, + "grad_norm": 0.7664571404457092, + "learning_rate": 0.00013627741194949556, + "loss": 2.8796, + "step": 10553 + }, + { + "epoch": 0.9561731331113678, + "grad_norm": 0.9035927653312683, + "learning_rate": 0.0001362713707485048, + "loss": 2.841, + "step": 10554 + }, + { + "epoch": 0.9562637312858145, + "grad_norm": 0.840082585811615, + "learning_rate": 0.00013626532954751406, + "loss": 2.6997, + "step": 10555 + }, + { + "epoch": 0.9563543294602613, + "grad_norm": 0.7836452126502991, + "learning_rate": 0.00013625928834652332, + "loss": 2.8016, + "step": 10556 + }, + { + "epoch": 0.9564449276347081, + "grad_norm": 0.8698344826698303, + "learning_rate": 0.00013625324714553253, + "loss": 3.0357, + "step": 10557 + }, + { + "epoch": 0.9565355258091549, + "grad_norm": 0.8227140307426453, + "learning_rate": 0.0001362472059445418, + "loss": 2.9541, + "step": 10558 + }, + { + "epoch": 0.9566261239836017, + "grad_norm": 0.8550177216529846, + "learning_rate": 0.00013624116474355102, + "loss": 2.4677, + "step": 10559 + }, + { + "epoch": 0.9567167221580485, + "grad_norm": 0.8035456538200378, + "learning_rate": 0.00013623512354256028, + "loss": 2.8345, + "step": 10560 + }, + { + "epoch": 0.9568073203324953, + "grad_norm": 0.8418534398078918, + "learning_rate": 0.00013622908234156952, + "loss": 2.6288, + "step": 10561 + }, + { + "epoch": 0.9568979185069421, + "grad_norm": 0.7986711859703064, + "learning_rate": 0.00013622304114057875, + "loss": 2.7662, + "step": 10562 + }, + { + "epoch": 0.9569885166813888, + "grad_norm": 0.9124842882156372, + "learning_rate": 0.00013621699993958798, + "loss": 2.9504, + "step": 10563 + }, + { + "epoch": 0.9570791148558356, + "grad_norm": 0.851004958152771, + "learning_rate": 0.00013621095873859725, + "loss": 2.8723, + "step": 10564 + }, + { + "epoch": 0.9571697130302824, + "grad_norm": 0.6697123050689697, + "learning_rate": 0.00013620491753760648, + "loss": 2.0417, + "step": 10565 + }, + { + "epoch": 0.9572603112047292, + "grad_norm": 0.9379859566688538, + "learning_rate": 0.00013619887633661571, + "loss": 2.824, + "step": 10566 + }, + { + "epoch": 0.957350909379176, + "grad_norm": 0.7779533863067627, + "learning_rate": 0.00013619283513562497, + "loss": 2.7483, + "step": 10567 + }, + { + "epoch": 0.9574415075536228, + "grad_norm": 0.7857169508934021, + "learning_rate": 0.0001361867939346342, + "loss": 2.7274, + "step": 10568 + }, + { + "epoch": 0.9575321057280696, + "grad_norm": 0.7680959105491638, + "learning_rate": 0.00013618075273364347, + "loss": 2.5939, + "step": 10569 + }, + { + "epoch": 0.9576227039025164, + "grad_norm": 0.7795993685722351, + "learning_rate": 0.00013617471153265268, + "loss": 2.7523, + "step": 10570 + }, + { + "epoch": 0.9577133020769631, + "grad_norm": 0.7649767398834229, + "learning_rate": 0.00013616867033166194, + "loss": 2.842, + "step": 10571 + }, + { + "epoch": 0.9578039002514099, + "grad_norm": 0.8136153221130371, + "learning_rate": 0.0001361626291306712, + "loss": 2.7821, + "step": 10572 + }, + { + "epoch": 0.9578944984258567, + "grad_norm": 0.7716251015663147, + "learning_rate": 0.00013615658792968043, + "loss": 2.6312, + "step": 10573 + }, + { + "epoch": 0.9579850966003035, + "grad_norm": 0.8121680021286011, + "learning_rate": 0.00013615054672868967, + "loss": 3.043, + "step": 10574 + }, + { + "epoch": 0.9580756947747503, + "grad_norm": 0.7811117768287659, + "learning_rate": 0.0001361445055276989, + "loss": 3.1267, + "step": 10575 + }, + { + "epoch": 0.9581662929491971, + "grad_norm": 0.8612077236175537, + "learning_rate": 0.00013613846432670816, + "loss": 2.9017, + "step": 10576 + }, + { + "epoch": 0.9582568911236439, + "grad_norm": 0.8807713389396667, + "learning_rate": 0.0001361324231257174, + "loss": 2.9798, + "step": 10577 + }, + { + "epoch": 0.9583474892980907, + "grad_norm": 0.7313560843467712, + "learning_rate": 0.00013612638192472663, + "loss": 2.2828, + "step": 10578 + }, + { + "epoch": 0.9584380874725374, + "grad_norm": 0.7568538784980774, + "learning_rate": 0.0001361203407237359, + "loss": 2.5983, + "step": 10579 + }, + { + "epoch": 0.9585286856469842, + "grad_norm": 0.753882110118866, + "learning_rate": 0.00013611429952274513, + "loss": 2.8075, + "step": 10580 + }, + { + "epoch": 0.958619283821431, + "grad_norm": 0.8146265149116516, + "learning_rate": 0.0001361082583217544, + "loss": 2.7494, + "step": 10581 + }, + { + "epoch": 0.9587098819958778, + "grad_norm": 0.7539995908737183, + "learning_rate": 0.00013610221712076362, + "loss": 2.8639, + "step": 10582 + }, + { + "epoch": 0.9588004801703246, + "grad_norm": 0.8184040784835815, + "learning_rate": 0.00013609617591977286, + "loss": 2.8307, + "step": 10583 + }, + { + "epoch": 0.9588910783447714, + "grad_norm": 0.8302366137504578, + "learning_rate": 0.0001360901347187821, + "loss": 2.5992, + "step": 10584 + }, + { + "epoch": 0.9589816765192182, + "grad_norm": 0.8394742608070374, + "learning_rate": 0.00013608409351779135, + "loss": 2.4718, + "step": 10585 + }, + { + "epoch": 0.959072274693665, + "grad_norm": 0.7641257047653198, + "learning_rate": 0.00013607805231680058, + "loss": 2.6745, + "step": 10586 + }, + { + "epoch": 0.9591628728681117, + "grad_norm": 0.773763120174408, + "learning_rate": 0.00013607201111580982, + "loss": 2.7259, + "step": 10587 + }, + { + "epoch": 0.9592534710425585, + "grad_norm": 0.8424060344696045, + "learning_rate": 0.00013606596991481908, + "loss": 2.7496, + "step": 10588 + }, + { + "epoch": 0.9593440692170053, + "grad_norm": 0.8175966143608093, + "learning_rate": 0.00013605992871382831, + "loss": 2.9914, + "step": 10589 + }, + { + "epoch": 0.9594346673914521, + "grad_norm": 0.7718377709388733, + "learning_rate": 0.00013605388751283757, + "loss": 2.4952, + "step": 10590 + }, + { + "epoch": 0.9595252655658989, + "grad_norm": 0.7345624566078186, + "learning_rate": 0.00013604784631184678, + "loss": 2.6277, + "step": 10591 + }, + { + "epoch": 0.9596158637403457, + "grad_norm": 0.8089966177940369, + "learning_rate": 0.00013604180511085604, + "loss": 2.895, + "step": 10592 + }, + { + "epoch": 0.9597064619147924, + "grad_norm": 0.7798806428909302, + "learning_rate": 0.00013603576390986528, + "loss": 2.8206, + "step": 10593 + }, + { + "epoch": 0.9597970600892392, + "grad_norm": 0.7846522331237793, + "learning_rate": 0.00013602972270887454, + "loss": 2.8027, + "step": 10594 + }, + { + "epoch": 0.9598876582636859, + "grad_norm": 0.7702997922897339, + "learning_rate": 0.00013602368150788377, + "loss": 2.5903, + "step": 10595 + }, + { + "epoch": 0.9599782564381327, + "grad_norm": 0.8688881993293762, + "learning_rate": 0.000136017640306893, + "loss": 2.7885, + "step": 10596 + }, + { + "epoch": 0.9600688546125795, + "grad_norm": 0.7890874743461609, + "learning_rate": 0.00013601159910590227, + "loss": 2.7031, + "step": 10597 + }, + { + "epoch": 0.9601594527870263, + "grad_norm": 0.8084300756454468, + "learning_rate": 0.0001360055579049115, + "loss": 2.7834, + "step": 10598 + }, + { + "epoch": 0.9602500509614731, + "grad_norm": 0.8501111268997192, + "learning_rate": 0.00013599951670392074, + "loss": 2.8294, + "step": 10599 + }, + { + "epoch": 0.9603406491359199, + "grad_norm": 0.7529709935188293, + "learning_rate": 0.00013599347550292997, + "loss": 2.8419, + "step": 10600 + }, + { + "epoch": 0.9604312473103667, + "grad_norm": 0.7941164970397949, + "learning_rate": 0.00013598743430193923, + "loss": 3.0714, + "step": 10601 + }, + { + "epoch": 0.9605218454848135, + "grad_norm": 0.7577246427536011, + "learning_rate": 0.0001359813931009485, + "loss": 2.5345, + "step": 10602 + }, + { + "epoch": 0.9606124436592602, + "grad_norm": 0.822338879108429, + "learning_rate": 0.00013597535189995773, + "loss": 2.8587, + "step": 10603 + }, + { + "epoch": 0.960703041833707, + "grad_norm": 0.7858595252037048, + "learning_rate": 0.00013596931069896696, + "loss": 2.7032, + "step": 10604 + }, + { + "epoch": 0.9607936400081538, + "grad_norm": 0.7855499982833862, + "learning_rate": 0.0001359632694979762, + "loss": 2.8738, + "step": 10605 + }, + { + "epoch": 0.9608842381826006, + "grad_norm": 0.8614487051963806, + "learning_rate": 0.00013595722829698546, + "loss": 2.8232, + "step": 10606 + }, + { + "epoch": 0.9609748363570474, + "grad_norm": 0.821232259273529, + "learning_rate": 0.0001359511870959947, + "loss": 2.9793, + "step": 10607 + }, + { + "epoch": 0.9610654345314942, + "grad_norm": 0.7503252625465393, + "learning_rate": 0.00013594514589500392, + "loss": 2.7359, + "step": 10608 + }, + { + "epoch": 0.961156032705941, + "grad_norm": 0.7678775787353516, + "learning_rate": 0.00013593910469401318, + "loss": 2.8693, + "step": 10609 + }, + { + "epoch": 0.9612466308803878, + "grad_norm": 0.8294500112533569, + "learning_rate": 0.00013593306349302242, + "loss": 2.8857, + "step": 10610 + }, + { + "epoch": 0.9613372290548345, + "grad_norm": 0.8230102062225342, + "learning_rate": 0.00013592702229203168, + "loss": 2.8111, + "step": 10611 + }, + { + "epoch": 0.9614278272292813, + "grad_norm": 0.7834093570709229, + "learning_rate": 0.0001359209810910409, + "loss": 2.7012, + "step": 10612 + }, + { + "epoch": 0.9615184254037281, + "grad_norm": 0.8529041409492493, + "learning_rate": 0.00013591493989005015, + "loss": 2.9396, + "step": 10613 + }, + { + "epoch": 0.9616090235781749, + "grad_norm": 0.8544533848762512, + "learning_rate": 0.00013590889868905938, + "loss": 2.9385, + "step": 10614 + }, + { + "epoch": 0.9616996217526217, + "grad_norm": 0.7873590588569641, + "learning_rate": 0.00013590285748806864, + "loss": 2.8785, + "step": 10615 + }, + { + "epoch": 0.9617902199270685, + "grad_norm": 0.858144998550415, + "learning_rate": 0.00013589681628707788, + "loss": 3.1564, + "step": 10616 + }, + { + "epoch": 0.9618808181015153, + "grad_norm": 0.8058539032936096, + "learning_rate": 0.0001358907750860871, + "loss": 2.615, + "step": 10617 + }, + { + "epoch": 0.961971416275962, + "grad_norm": 0.8837745189666748, + "learning_rate": 0.00013588473388509637, + "loss": 2.8534, + "step": 10618 + }, + { + "epoch": 0.9620620144504088, + "grad_norm": 0.8451353907585144, + "learning_rate": 0.0001358786926841056, + "loss": 3.0884, + "step": 10619 + }, + { + "epoch": 0.9621526126248556, + "grad_norm": 0.8045987486839294, + "learning_rate": 0.00013587265148311487, + "loss": 2.8812, + "step": 10620 + }, + { + "epoch": 0.9622432107993024, + "grad_norm": 0.8006194829940796, + "learning_rate": 0.00013586661028212407, + "loss": 2.8466, + "step": 10621 + }, + { + "epoch": 0.9623338089737492, + "grad_norm": 0.6680963635444641, + "learning_rate": 0.00013586056908113334, + "loss": 2.063, + "step": 10622 + }, + { + "epoch": 0.962424407148196, + "grad_norm": 0.8038066625595093, + "learning_rate": 0.00013585452788014257, + "loss": 2.8698, + "step": 10623 + }, + { + "epoch": 0.9625150053226428, + "grad_norm": 0.7283053994178772, + "learning_rate": 0.00013584848667915183, + "loss": 2.0286, + "step": 10624 + }, + { + "epoch": 0.9626056034970896, + "grad_norm": 0.8513144254684448, + "learning_rate": 0.00013584244547816106, + "loss": 2.8681, + "step": 10625 + }, + { + "epoch": 0.9626962016715364, + "grad_norm": 0.8648897409439087, + "learning_rate": 0.0001358364042771703, + "loss": 2.9754, + "step": 10626 + }, + { + "epoch": 0.9627867998459831, + "grad_norm": 0.8362101912498474, + "learning_rate": 0.00013583036307617956, + "loss": 2.7989, + "step": 10627 + }, + { + "epoch": 0.9628773980204299, + "grad_norm": 0.6715154051780701, + "learning_rate": 0.0001358243218751888, + "loss": 2.2126, + "step": 10628 + }, + { + "epoch": 0.9629679961948767, + "grad_norm": 0.7543195486068726, + "learning_rate": 0.00013581828067419803, + "loss": 2.6503, + "step": 10629 + }, + { + "epoch": 0.9630585943693235, + "grad_norm": 0.7721061706542969, + "learning_rate": 0.00013581223947320726, + "loss": 2.8842, + "step": 10630 + }, + { + "epoch": 0.9631491925437703, + "grad_norm": 0.7797832489013672, + "learning_rate": 0.00013580619827221652, + "loss": 2.7049, + "step": 10631 + }, + { + "epoch": 0.9632397907182171, + "grad_norm": 0.7916635274887085, + "learning_rate": 0.00013580015707122578, + "loss": 2.7639, + "step": 10632 + }, + { + "epoch": 0.9633303888926638, + "grad_norm": 0.8168932795524597, + "learning_rate": 0.00013579411587023502, + "loss": 2.5907, + "step": 10633 + }, + { + "epoch": 0.9634209870671105, + "grad_norm": 1.0246726274490356, + "learning_rate": 0.00013578807466924425, + "loss": 2.8208, + "step": 10634 + }, + { + "epoch": 0.9635115852415573, + "grad_norm": 0.7984297871589661, + "learning_rate": 0.0001357820334682535, + "loss": 3.0036, + "step": 10635 + }, + { + "epoch": 0.9636021834160041, + "grad_norm": 0.7972297072410583, + "learning_rate": 0.00013577599226726275, + "loss": 2.6737, + "step": 10636 + }, + { + "epoch": 0.9636927815904509, + "grad_norm": 0.8541191220283508, + "learning_rate": 0.00013576995106627198, + "loss": 3.0255, + "step": 10637 + }, + { + "epoch": 0.9637833797648977, + "grad_norm": 0.8243719935417175, + "learning_rate": 0.00013576390986528122, + "loss": 2.8337, + "step": 10638 + }, + { + "epoch": 0.9638739779393445, + "grad_norm": 0.8218807578086853, + "learning_rate": 0.00013575786866429048, + "loss": 2.7607, + "step": 10639 + }, + { + "epoch": 0.9639645761137913, + "grad_norm": 0.8609697222709656, + "learning_rate": 0.0001357518274632997, + "loss": 2.9684, + "step": 10640 + }, + { + "epoch": 0.9640551742882381, + "grad_norm": 0.7890011072158813, + "learning_rate": 0.00013574578626230897, + "loss": 2.968, + "step": 10641 + }, + { + "epoch": 0.9641457724626848, + "grad_norm": 0.7891564965248108, + "learning_rate": 0.00013573974506131818, + "loss": 2.1211, + "step": 10642 + }, + { + "epoch": 0.9642363706371316, + "grad_norm": 0.8286845088005066, + "learning_rate": 0.00013573370386032744, + "loss": 2.7398, + "step": 10643 + }, + { + "epoch": 0.9643269688115784, + "grad_norm": 0.8866640329360962, + "learning_rate": 0.00013572766265933667, + "loss": 3.0172, + "step": 10644 + }, + { + "epoch": 0.9644175669860252, + "grad_norm": 0.8386523723602295, + "learning_rate": 0.00013572162145834594, + "loss": 2.7619, + "step": 10645 + }, + { + "epoch": 0.964508165160472, + "grad_norm": 1.2725696563720703, + "learning_rate": 0.00013571558025735517, + "loss": 1.9399, + "step": 10646 + }, + { + "epoch": 0.9645987633349188, + "grad_norm": 0.7907176613807678, + "learning_rate": 0.0001357095390563644, + "loss": 2.8143, + "step": 10647 + }, + { + "epoch": 0.9646893615093656, + "grad_norm": 0.8365755677223206, + "learning_rate": 0.00013570349785537366, + "loss": 3.1389, + "step": 10648 + }, + { + "epoch": 0.9647799596838124, + "grad_norm": 0.8149949312210083, + "learning_rate": 0.0001356974566543829, + "loss": 2.8952, + "step": 10649 + }, + { + "epoch": 0.9648705578582591, + "grad_norm": 0.8183088302612305, + "learning_rate": 0.00013569141545339213, + "loss": 2.7754, + "step": 10650 + }, + { + "epoch": 0.9649611560327059, + "grad_norm": 0.8194260597229004, + "learning_rate": 0.00013568537425240137, + "loss": 2.745, + "step": 10651 + }, + { + "epoch": 0.9650517542071527, + "grad_norm": 0.7552707195281982, + "learning_rate": 0.00013567933305141063, + "loss": 2.5075, + "step": 10652 + }, + { + "epoch": 0.9651423523815995, + "grad_norm": 0.8043773174285889, + "learning_rate": 0.00013567329185041986, + "loss": 2.8878, + "step": 10653 + }, + { + "epoch": 0.9652329505560463, + "grad_norm": 1.4434564113616943, + "learning_rate": 0.00013566725064942912, + "loss": 2.8859, + "step": 10654 + }, + { + "epoch": 0.9653235487304931, + "grad_norm": 0.8558198809623718, + "learning_rate": 0.00013566120944843836, + "loss": 2.9833, + "step": 10655 + }, + { + "epoch": 0.9654141469049399, + "grad_norm": 0.761171817779541, + "learning_rate": 0.0001356551682474476, + "loss": 2.6684, + "step": 10656 + }, + { + "epoch": 0.9655047450793867, + "grad_norm": 0.8829056620597839, + "learning_rate": 0.00013564912704645685, + "loss": 2.6239, + "step": 10657 + }, + { + "epoch": 0.9655953432538334, + "grad_norm": 0.727140486240387, + "learning_rate": 0.0001356430858454661, + "loss": 2.1103, + "step": 10658 + }, + { + "epoch": 0.9656859414282802, + "grad_norm": 0.7388594150543213, + "learning_rate": 0.00013563704464447532, + "loss": 2.1383, + "step": 10659 + }, + { + "epoch": 0.965776539602727, + "grad_norm": 0.761554479598999, + "learning_rate": 0.00013563100344348455, + "loss": 2.7786, + "step": 10660 + }, + { + "epoch": 0.9658671377771738, + "grad_norm": 0.8144588470458984, + "learning_rate": 0.00013562496224249382, + "loss": 2.8548, + "step": 10661 + }, + { + "epoch": 0.9659577359516206, + "grad_norm": 0.8478509783744812, + "learning_rate": 0.00013561892104150308, + "loss": 3.0869, + "step": 10662 + }, + { + "epoch": 0.9660483341260674, + "grad_norm": 0.9089096188545227, + "learning_rate": 0.00013561287984051228, + "loss": 2.0644, + "step": 10663 + }, + { + "epoch": 0.9661389323005142, + "grad_norm": 0.7955164909362793, + "learning_rate": 0.00013560683863952155, + "loss": 2.6949, + "step": 10664 + }, + { + "epoch": 0.966229530474961, + "grad_norm": 0.7566693425178528, + "learning_rate": 0.00013560079743853078, + "loss": 2.6725, + "step": 10665 + }, + { + "epoch": 0.9663201286494078, + "grad_norm": 0.8594494462013245, + "learning_rate": 0.00013559475623754004, + "loss": 2.7665, + "step": 10666 + }, + { + "epoch": 0.9664107268238545, + "grad_norm": 0.7998448610305786, + "learning_rate": 0.00013558871503654927, + "loss": 2.8481, + "step": 10667 + }, + { + "epoch": 0.9665013249983013, + "grad_norm": 0.8054465055465698, + "learning_rate": 0.0001355826738355585, + "loss": 2.7679, + "step": 10668 + }, + { + "epoch": 0.9665919231727481, + "grad_norm": 0.8366721272468567, + "learning_rate": 0.00013557663263456777, + "loss": 2.5702, + "step": 10669 + }, + { + "epoch": 0.9666825213471949, + "grad_norm": 0.8235066533088684, + "learning_rate": 0.000135570591433577, + "loss": 2.6935, + "step": 10670 + }, + { + "epoch": 0.9667731195216417, + "grad_norm": 0.7695505619049072, + "learning_rate": 0.00013556455023258626, + "loss": 2.7605, + "step": 10671 + }, + { + "epoch": 0.9668637176960885, + "grad_norm": 0.8218868970870972, + "learning_rate": 0.00013555850903159547, + "loss": 2.7028, + "step": 10672 + }, + { + "epoch": 0.9669543158705353, + "grad_norm": 0.857913613319397, + "learning_rate": 0.00013555246783060473, + "loss": 2.9784, + "step": 10673 + }, + { + "epoch": 0.9670449140449819, + "grad_norm": 0.8233358263969421, + "learning_rate": 0.00013554642662961397, + "loss": 2.5949, + "step": 10674 + }, + { + "epoch": 0.9671355122194287, + "grad_norm": 0.8356831669807434, + "learning_rate": 0.00013554038542862323, + "loss": 2.927, + "step": 10675 + }, + { + "epoch": 0.9672261103938755, + "grad_norm": 0.8254451751708984, + "learning_rate": 0.00013553434422763246, + "loss": 2.8966, + "step": 10676 + }, + { + "epoch": 0.9673167085683223, + "grad_norm": 0.8645581007003784, + "learning_rate": 0.0001355283030266417, + "loss": 2.9243, + "step": 10677 + }, + { + "epoch": 0.9674073067427691, + "grad_norm": 0.8239887952804565, + "learning_rate": 0.00013552226182565096, + "loss": 2.7986, + "step": 10678 + }, + { + "epoch": 0.9674979049172159, + "grad_norm": 0.8215185403823853, + "learning_rate": 0.0001355162206246602, + "loss": 2.6696, + "step": 10679 + }, + { + "epoch": 0.9675885030916627, + "grad_norm": 0.829302191734314, + "learning_rate": 0.00013551017942366943, + "loss": 2.7303, + "step": 10680 + }, + { + "epoch": 0.9676791012661095, + "grad_norm": 0.7664878964424133, + "learning_rate": 0.00013550413822267866, + "loss": 2.5784, + "step": 10681 + }, + { + "epoch": 0.9677696994405562, + "grad_norm": 0.7955071926116943, + "learning_rate": 0.00013549809702168792, + "loss": 2.7244, + "step": 10682 + }, + { + "epoch": 0.967860297615003, + "grad_norm": 0.7927817702293396, + "learning_rate": 0.00013549205582069715, + "loss": 2.6751, + "step": 10683 + }, + { + "epoch": 0.9679508957894498, + "grad_norm": 0.81999671459198, + "learning_rate": 0.00013548601461970642, + "loss": 2.7912, + "step": 10684 + }, + { + "epoch": 0.9680414939638966, + "grad_norm": 0.7840282320976257, + "learning_rate": 0.00013547997341871565, + "loss": 2.5733, + "step": 10685 + }, + { + "epoch": 0.9681320921383434, + "grad_norm": 0.795745849609375, + "learning_rate": 0.00013547393221772488, + "loss": 2.556, + "step": 10686 + }, + { + "epoch": 0.9682226903127902, + "grad_norm": 0.8361626267433167, + "learning_rate": 0.00013546789101673415, + "loss": 3.0113, + "step": 10687 + }, + { + "epoch": 0.968313288487237, + "grad_norm": 0.8020575642585754, + "learning_rate": 0.00013546184981574338, + "loss": 2.9366, + "step": 10688 + }, + { + "epoch": 0.9684038866616838, + "grad_norm": 0.767593502998352, + "learning_rate": 0.0001354558086147526, + "loss": 2.8255, + "step": 10689 + }, + { + "epoch": 0.9684944848361305, + "grad_norm": 0.7796858549118042, + "learning_rate": 0.00013544976741376185, + "loss": 2.8156, + "step": 10690 + }, + { + "epoch": 0.9685850830105773, + "grad_norm": 0.8851258158683777, + "learning_rate": 0.0001354437262127711, + "loss": 2.2391, + "step": 10691 + }, + { + "epoch": 0.9686756811850241, + "grad_norm": 0.7933969497680664, + "learning_rate": 0.00013543768501178037, + "loss": 2.7201, + "step": 10692 + }, + { + "epoch": 0.9687662793594709, + "grad_norm": 0.8390265703201294, + "learning_rate": 0.00013543164381078958, + "loss": 2.7625, + "step": 10693 + }, + { + "epoch": 0.9688568775339177, + "grad_norm": 0.7780367136001587, + "learning_rate": 0.00013542560260979884, + "loss": 3.0802, + "step": 10694 + }, + { + "epoch": 0.9689474757083645, + "grad_norm": 0.9414607286453247, + "learning_rate": 0.00013541956140880807, + "loss": 2.4192, + "step": 10695 + }, + { + "epoch": 0.9690380738828113, + "grad_norm": 0.8682844638824463, + "learning_rate": 0.00013541352020781733, + "loss": 2.7699, + "step": 10696 + }, + { + "epoch": 0.9691286720572581, + "grad_norm": 0.7974702715873718, + "learning_rate": 0.00013540747900682657, + "loss": 2.9503, + "step": 10697 + }, + { + "epoch": 0.9692192702317048, + "grad_norm": 0.7918549180030823, + "learning_rate": 0.0001354014378058358, + "loss": 2.7164, + "step": 10698 + }, + { + "epoch": 0.9693098684061516, + "grad_norm": 0.7883690595626831, + "learning_rate": 0.00013539539660484506, + "loss": 3.0004, + "step": 10699 + }, + { + "epoch": 0.9694004665805984, + "grad_norm": 0.8424616456031799, + "learning_rate": 0.0001353893554038543, + "loss": 2.9546, + "step": 10700 + }, + { + "epoch": 0.9694910647550452, + "grad_norm": 0.7825626134872437, + "learning_rate": 0.00013538331420286353, + "loss": 2.5164, + "step": 10701 + }, + { + "epoch": 0.969581662929492, + "grad_norm": 0.8312402367591858, + "learning_rate": 0.00013537727300187276, + "loss": 2.7186, + "step": 10702 + }, + { + "epoch": 0.9696722611039388, + "grad_norm": 0.7571629881858826, + "learning_rate": 0.00013537123180088203, + "loss": 2.0557, + "step": 10703 + }, + { + "epoch": 0.9697628592783856, + "grad_norm": 0.7815642952919006, + "learning_rate": 0.00013536519059989126, + "loss": 2.6911, + "step": 10704 + }, + { + "epoch": 0.9698534574528324, + "grad_norm": 0.8435609936714172, + "learning_rate": 0.00013535914939890052, + "loss": 2.8795, + "step": 10705 + }, + { + "epoch": 0.9699440556272791, + "grad_norm": 0.8724271059036255, + "learning_rate": 0.00013535310819790975, + "loss": 2.5158, + "step": 10706 + }, + { + "epoch": 0.9700346538017259, + "grad_norm": 0.7667621374130249, + "learning_rate": 0.000135347066996919, + "loss": 2.8656, + "step": 10707 + }, + { + "epoch": 0.9701252519761727, + "grad_norm": 0.8094162940979004, + "learning_rate": 0.00013534102579592825, + "loss": 2.8179, + "step": 10708 + }, + { + "epoch": 0.9702158501506195, + "grad_norm": 0.8206813335418701, + "learning_rate": 0.00013533498459493748, + "loss": 2.5653, + "step": 10709 + }, + { + "epoch": 0.9703064483250663, + "grad_norm": 0.7954540848731995, + "learning_rate": 0.00013532894339394672, + "loss": 2.6995, + "step": 10710 + }, + { + "epoch": 0.9703970464995131, + "grad_norm": 0.8100320100784302, + "learning_rate": 0.00013532290219295595, + "loss": 2.6446, + "step": 10711 + }, + { + "epoch": 0.9704876446739599, + "grad_norm": 0.8092798590660095, + "learning_rate": 0.0001353168609919652, + "loss": 2.9422, + "step": 10712 + }, + { + "epoch": 0.9705782428484067, + "grad_norm": 0.7875932455062866, + "learning_rate": 0.00013531081979097445, + "loss": 2.6236, + "step": 10713 + }, + { + "epoch": 0.9706688410228533, + "grad_norm": 0.8403133153915405, + "learning_rate": 0.00013530477858998368, + "loss": 2.8843, + "step": 10714 + }, + { + "epoch": 0.9707594391973001, + "grad_norm": 0.8074411749839783, + "learning_rate": 0.00013529873738899294, + "loss": 2.4926, + "step": 10715 + }, + { + "epoch": 0.9708500373717469, + "grad_norm": 0.7799168229103088, + "learning_rate": 0.00013529269618800218, + "loss": 2.9563, + "step": 10716 + }, + { + "epoch": 0.9709406355461937, + "grad_norm": 0.9047964811325073, + "learning_rate": 0.00013528665498701144, + "loss": 2.8799, + "step": 10717 + }, + { + "epoch": 0.9710312337206405, + "grad_norm": 0.8145598769187927, + "learning_rate": 0.00013528061378602067, + "loss": 2.715, + "step": 10718 + }, + { + "epoch": 0.9711218318950873, + "grad_norm": 0.8336505889892578, + "learning_rate": 0.0001352745725850299, + "loss": 2.905, + "step": 10719 + }, + { + "epoch": 0.9712124300695341, + "grad_norm": 0.76915043592453, + "learning_rate": 0.00013526853138403914, + "loss": 2.8595, + "step": 10720 + }, + { + "epoch": 0.9713030282439808, + "grad_norm": 0.8145914077758789, + "learning_rate": 0.0001352624901830484, + "loss": 2.7945, + "step": 10721 + }, + { + "epoch": 0.9713936264184276, + "grad_norm": 0.8174611926078796, + "learning_rate": 0.00013525644898205764, + "loss": 2.6644, + "step": 10722 + }, + { + "epoch": 0.9714842245928744, + "grad_norm": 0.7583951354026794, + "learning_rate": 0.00013525040778106687, + "loss": 2.5528, + "step": 10723 + }, + { + "epoch": 0.9715748227673212, + "grad_norm": 0.7672412395477295, + "learning_rate": 0.00013524436658007613, + "loss": 2.6941, + "step": 10724 + }, + { + "epoch": 0.971665420941768, + "grad_norm": 0.8215516805648804, + "learning_rate": 0.00013523832537908536, + "loss": 2.5763, + "step": 10725 + }, + { + "epoch": 0.9717560191162148, + "grad_norm": 0.8424509167671204, + "learning_rate": 0.00013523228417809463, + "loss": 3.1243, + "step": 10726 + }, + { + "epoch": 0.9718466172906616, + "grad_norm": 0.8076397180557251, + "learning_rate": 0.00013522624297710383, + "loss": 2.938, + "step": 10727 + }, + { + "epoch": 0.9719372154651084, + "grad_norm": 0.7972416281700134, + "learning_rate": 0.0001352202017761131, + "loss": 2.9584, + "step": 10728 + }, + { + "epoch": 0.9720278136395551, + "grad_norm": 0.7570191025733948, + "learning_rate": 0.00013521416057512236, + "loss": 2.7377, + "step": 10729 + }, + { + "epoch": 0.9721184118140019, + "grad_norm": 0.7760381698608398, + "learning_rate": 0.0001352081193741316, + "loss": 2.8155, + "step": 10730 + }, + { + "epoch": 0.9722090099884487, + "grad_norm": 0.7820430397987366, + "learning_rate": 0.00013520207817314082, + "loss": 2.6989, + "step": 10731 + }, + { + "epoch": 0.9722996081628955, + "grad_norm": 0.8642070293426514, + "learning_rate": 0.00013519603697215006, + "loss": 2.6945, + "step": 10732 + }, + { + "epoch": 0.9723902063373423, + "grad_norm": 0.9126107692718506, + "learning_rate": 0.00013518999577115932, + "loss": 2.985, + "step": 10733 + }, + { + "epoch": 0.9724808045117891, + "grad_norm": 0.8333151936531067, + "learning_rate": 0.00013518395457016855, + "loss": 2.1461, + "step": 10734 + }, + { + "epoch": 0.9725714026862359, + "grad_norm": 0.7883483171463013, + "learning_rate": 0.0001351779133691778, + "loss": 2.025, + "step": 10735 + }, + { + "epoch": 0.9726620008606827, + "grad_norm": 0.6575998067855835, + "learning_rate": 0.00013517187216818705, + "loss": 1.3462, + "step": 10736 + }, + { + "epoch": 0.9727525990351295, + "grad_norm": 0.7248581051826477, + "learning_rate": 0.00013516583096719628, + "loss": 2.6635, + "step": 10737 + }, + { + "epoch": 0.9728431972095762, + "grad_norm": 0.8630764484405518, + "learning_rate": 0.00013515978976620554, + "loss": 3.047, + "step": 10738 + }, + { + "epoch": 0.972933795384023, + "grad_norm": 0.7584630846977234, + "learning_rate": 0.00013515374856521478, + "loss": 2.8157, + "step": 10739 + }, + { + "epoch": 0.9730243935584698, + "grad_norm": 0.9597374796867371, + "learning_rate": 0.000135147707364224, + "loss": 2.9302, + "step": 10740 + }, + { + "epoch": 0.9731149917329166, + "grad_norm": 0.8342373967170715, + "learning_rate": 0.00013514166616323324, + "loss": 2.8532, + "step": 10741 + }, + { + "epoch": 0.9732055899073634, + "grad_norm": 0.8174713850021362, + "learning_rate": 0.0001351356249622425, + "loss": 2.5723, + "step": 10742 + }, + { + "epoch": 0.9732961880818102, + "grad_norm": 0.7888997793197632, + "learning_rate": 0.00013512958376125174, + "loss": 2.9412, + "step": 10743 + }, + { + "epoch": 0.973386786256257, + "grad_norm": 0.8120766878128052, + "learning_rate": 0.00013512354256026097, + "loss": 2.7859, + "step": 10744 + }, + { + "epoch": 0.9734773844307038, + "grad_norm": 0.7570154070854187, + "learning_rate": 0.00013511750135927024, + "loss": 2.6262, + "step": 10745 + }, + { + "epoch": 0.9735679826051505, + "grad_norm": 0.8097013831138611, + "learning_rate": 0.00013511146015827947, + "loss": 2.8473, + "step": 10746 + }, + { + "epoch": 0.9736585807795973, + "grad_norm": 0.7811283469200134, + "learning_rate": 0.00013510541895728873, + "loss": 2.7793, + "step": 10747 + }, + { + "epoch": 0.9737491789540441, + "grad_norm": 0.8473230600357056, + "learning_rate": 0.00013509937775629796, + "loss": 2.7359, + "step": 10748 + }, + { + "epoch": 0.9738397771284909, + "grad_norm": 0.9302940368652344, + "learning_rate": 0.0001350933365553072, + "loss": 2.8562, + "step": 10749 + }, + { + "epoch": 0.9739303753029377, + "grad_norm": 0.799817681312561, + "learning_rate": 0.00013508729535431643, + "loss": 2.7992, + "step": 10750 + }, + { + "epoch": 0.9740209734773845, + "grad_norm": 0.7480702996253967, + "learning_rate": 0.0001350812541533257, + "loss": 2.6641, + "step": 10751 + }, + { + "epoch": 0.9741115716518313, + "grad_norm": 0.755404531955719, + "learning_rate": 0.00013507521295233493, + "loss": 2.2306, + "step": 10752 + }, + { + "epoch": 0.974202169826278, + "grad_norm": 0.7169764041900635, + "learning_rate": 0.00013506917175134416, + "loss": 2.0133, + "step": 10753 + }, + { + "epoch": 0.9742927680007248, + "grad_norm": 0.7826345562934875, + "learning_rate": 0.00013506313055035342, + "loss": 2.8708, + "step": 10754 + }, + { + "epoch": 0.9743833661751715, + "grad_norm": 0.7889255285263062, + "learning_rate": 0.00013505708934936266, + "loss": 2.8172, + "step": 10755 + }, + { + "epoch": 0.9744739643496183, + "grad_norm": 0.8282690644264221, + "learning_rate": 0.00013505104814837192, + "loss": 2.698, + "step": 10756 + }, + { + "epoch": 0.9745645625240651, + "grad_norm": 0.852963924407959, + "learning_rate": 0.00013504500694738113, + "loss": 2.7162, + "step": 10757 + }, + { + "epoch": 0.9746551606985119, + "grad_norm": 0.8544152975082397, + "learning_rate": 0.0001350389657463904, + "loss": 2.7899, + "step": 10758 + }, + { + "epoch": 0.9747457588729587, + "grad_norm": 0.8897741436958313, + "learning_rate": 0.00013503292454539965, + "loss": 2.6618, + "step": 10759 + }, + { + "epoch": 0.9748363570474055, + "grad_norm": 0.8302481174468994, + "learning_rate": 0.00013502688334440888, + "loss": 2.91, + "step": 10760 + }, + { + "epoch": 0.9749269552218522, + "grad_norm": 0.7934239506721497, + "learning_rate": 0.00013502084214341812, + "loss": 2.5814, + "step": 10761 + }, + { + "epoch": 0.975017553396299, + "grad_norm": 0.6657084226608276, + "learning_rate": 0.00013501480094242735, + "loss": 2.1459, + "step": 10762 + }, + { + "epoch": 0.9751081515707458, + "grad_norm": 0.8733190894126892, + "learning_rate": 0.0001350087597414366, + "loss": 2.9211, + "step": 10763 + }, + { + "epoch": 0.9751987497451926, + "grad_norm": 0.8287232518196106, + "learning_rate": 0.00013500271854044585, + "loss": 2.9996, + "step": 10764 + }, + { + "epoch": 0.9752893479196394, + "grad_norm": 0.8685379028320312, + "learning_rate": 0.00013499667733945508, + "loss": 2.6867, + "step": 10765 + }, + { + "epoch": 0.9753799460940862, + "grad_norm": 0.744125247001648, + "learning_rate": 0.00013499063613846434, + "loss": 2.6503, + "step": 10766 + }, + { + "epoch": 0.975470544268533, + "grad_norm": 0.8099753856658936, + "learning_rate": 0.00013498459493747357, + "loss": 2.8784, + "step": 10767 + }, + { + "epoch": 0.9755611424429798, + "grad_norm": 0.7941974997520447, + "learning_rate": 0.00013497855373648284, + "loss": 2.8313, + "step": 10768 + }, + { + "epoch": 0.9756517406174265, + "grad_norm": 0.8422639966011047, + "learning_rate": 0.00013497251253549207, + "loss": 3.0009, + "step": 10769 + }, + { + "epoch": 0.9757423387918733, + "grad_norm": 0.7099463939666748, + "learning_rate": 0.0001349664713345013, + "loss": 2.098, + "step": 10770 + }, + { + "epoch": 0.9758329369663201, + "grad_norm": 0.824740469455719, + "learning_rate": 0.00013496043013351054, + "loss": 2.7336, + "step": 10771 + }, + { + "epoch": 0.9759235351407669, + "grad_norm": 0.6959747672080994, + "learning_rate": 0.0001349543889325198, + "loss": 2.1756, + "step": 10772 + }, + { + "epoch": 0.9760141333152137, + "grad_norm": 0.8660708665847778, + "learning_rate": 0.00013494834773152903, + "loss": 2.8467, + "step": 10773 + }, + { + "epoch": 0.9761047314896605, + "grad_norm": 0.8299806118011475, + "learning_rate": 0.00013494230653053827, + "loss": 2.8514, + "step": 10774 + }, + { + "epoch": 0.9761953296641073, + "grad_norm": 0.7817371487617493, + "learning_rate": 0.00013493626532954753, + "loss": 2.8241, + "step": 10775 + }, + { + "epoch": 0.9762859278385541, + "grad_norm": 0.7618198394775391, + "learning_rate": 0.00013493022412855676, + "loss": 2.8588, + "step": 10776 + }, + { + "epoch": 0.9763765260130008, + "grad_norm": 0.8146346807479858, + "learning_rate": 0.00013492418292756602, + "loss": 2.8007, + "step": 10777 + }, + { + "epoch": 0.9764671241874476, + "grad_norm": 0.7848421335220337, + "learning_rate": 0.00013491814172657523, + "loss": 3.1117, + "step": 10778 + }, + { + "epoch": 0.9765577223618944, + "grad_norm": 0.7813141345977783, + "learning_rate": 0.0001349121005255845, + "loss": 2.8716, + "step": 10779 + }, + { + "epoch": 0.9766483205363412, + "grad_norm": 0.7822166085243225, + "learning_rate": 0.00013490605932459373, + "loss": 2.7936, + "step": 10780 + }, + { + "epoch": 0.976738918710788, + "grad_norm": 0.8490806818008423, + "learning_rate": 0.000134900018123603, + "loss": 2.8316, + "step": 10781 + }, + { + "epoch": 0.9768295168852348, + "grad_norm": 0.8270947933197021, + "learning_rate": 0.00013489397692261222, + "loss": 2.9333, + "step": 10782 + }, + { + "epoch": 0.9769201150596816, + "grad_norm": 0.8036820292472839, + "learning_rate": 0.00013488793572162145, + "loss": 2.7051, + "step": 10783 + }, + { + "epoch": 0.9770107132341284, + "grad_norm": 0.7551915049552917, + "learning_rate": 0.00013488189452063072, + "loss": 2.7409, + "step": 10784 + }, + { + "epoch": 0.9771013114085751, + "grad_norm": 0.7649247050285339, + "learning_rate": 0.00013487585331963995, + "loss": 2.8924, + "step": 10785 + }, + { + "epoch": 0.9771919095830219, + "grad_norm": 0.8184694051742554, + "learning_rate": 0.00013486981211864918, + "loss": 2.8976, + "step": 10786 + }, + { + "epoch": 0.9772825077574687, + "grad_norm": 0.774953305721283, + "learning_rate": 0.00013486377091765842, + "loss": 2.6085, + "step": 10787 + }, + { + "epoch": 0.9773731059319155, + "grad_norm": 0.7383402585983276, + "learning_rate": 0.00013485772971666768, + "loss": 2.7163, + "step": 10788 + }, + { + "epoch": 0.9774637041063623, + "grad_norm": 0.8442283272743225, + "learning_rate": 0.00013485168851567694, + "loss": 2.9086, + "step": 10789 + }, + { + "epoch": 0.9775543022808091, + "grad_norm": 0.7946393489837646, + "learning_rate": 0.00013484564731468617, + "loss": 2.6876, + "step": 10790 + }, + { + "epoch": 0.9776449004552559, + "grad_norm": 0.8085775971412659, + "learning_rate": 0.0001348396061136954, + "loss": 2.7489, + "step": 10791 + }, + { + "epoch": 0.9777354986297027, + "grad_norm": 0.6908823251724243, + "learning_rate": 0.00013483356491270464, + "loss": 1.9953, + "step": 10792 + }, + { + "epoch": 0.9778260968041494, + "grad_norm": 0.7900034785270691, + "learning_rate": 0.0001348275237117139, + "loss": 3.0667, + "step": 10793 + }, + { + "epoch": 0.9779166949785962, + "grad_norm": 0.7758607268333435, + "learning_rate": 0.00013482148251072314, + "loss": 2.7825, + "step": 10794 + }, + { + "epoch": 0.9780072931530429, + "grad_norm": 0.772080659866333, + "learning_rate": 0.00013481544130973237, + "loss": 2.715, + "step": 10795 + }, + { + "epoch": 0.9780978913274897, + "grad_norm": 0.7340586185455322, + "learning_rate": 0.00013480940010874163, + "loss": 2.3395, + "step": 10796 + }, + { + "epoch": 0.9781884895019365, + "grad_norm": 0.8239163160324097, + "learning_rate": 0.00013480335890775087, + "loss": 2.6902, + "step": 10797 + }, + { + "epoch": 0.9782790876763833, + "grad_norm": 0.8520792722702026, + "learning_rate": 0.00013479731770676013, + "loss": 2.8178, + "step": 10798 + }, + { + "epoch": 0.9783696858508301, + "grad_norm": 0.8296450972557068, + "learning_rate": 0.00013479127650576934, + "loss": 2.8722, + "step": 10799 + }, + { + "epoch": 0.9784602840252768, + "grad_norm": 0.6999891996383667, + "learning_rate": 0.0001347852353047786, + "loss": 2.0342, + "step": 10800 + }, + { + "epoch": 0.9785508821997236, + "grad_norm": 0.7498747110366821, + "learning_rate": 0.00013477919410378783, + "loss": 2.2125, + "step": 10801 + }, + { + "epoch": 0.9786414803741704, + "grad_norm": 0.8169413805007935, + "learning_rate": 0.0001347731529027971, + "loss": 3.2609, + "step": 10802 + }, + { + "epoch": 0.9787320785486172, + "grad_norm": 0.8479570150375366, + "learning_rate": 0.00013476711170180633, + "loss": 2.9459, + "step": 10803 + }, + { + "epoch": 0.978822676723064, + "grad_norm": 0.6837416291236877, + "learning_rate": 0.00013476107050081556, + "loss": 1.775, + "step": 10804 + }, + { + "epoch": 0.9789132748975108, + "grad_norm": 0.9848020076751709, + "learning_rate": 0.00013475502929982482, + "loss": 2.9825, + "step": 10805 + }, + { + "epoch": 0.9790038730719576, + "grad_norm": 0.8582839369773865, + "learning_rate": 0.00013474898809883405, + "loss": 3.1079, + "step": 10806 + }, + { + "epoch": 0.9790944712464044, + "grad_norm": 0.7824413180351257, + "learning_rate": 0.00013474294689784332, + "loss": 3.0824, + "step": 10807 + }, + { + "epoch": 0.9791850694208512, + "grad_norm": 0.8416684865951538, + "learning_rate": 0.00013473690569685252, + "loss": 2.9193, + "step": 10808 + }, + { + "epoch": 0.9792756675952979, + "grad_norm": 0.7924050092697144, + "learning_rate": 0.00013473086449586178, + "loss": 2.8074, + "step": 10809 + }, + { + "epoch": 0.9793662657697447, + "grad_norm": 0.8073720335960388, + "learning_rate": 0.00013472482329487102, + "loss": 2.9329, + "step": 10810 + }, + { + "epoch": 0.9794568639441915, + "grad_norm": 0.8026302456855774, + "learning_rate": 0.00013471878209388028, + "loss": 2.9758, + "step": 10811 + }, + { + "epoch": 0.9795474621186383, + "grad_norm": 0.7094435691833496, + "learning_rate": 0.0001347127408928895, + "loss": 2.2062, + "step": 10812 + }, + { + "epoch": 0.9796380602930851, + "grad_norm": 0.7893811464309692, + "learning_rate": 0.00013470669969189875, + "loss": 2.9789, + "step": 10813 + }, + { + "epoch": 0.9797286584675319, + "grad_norm": 0.652748703956604, + "learning_rate": 0.000134700658490908, + "loss": 2.3026, + "step": 10814 + }, + { + "epoch": 0.9798192566419787, + "grad_norm": 0.8064255714416504, + "learning_rate": 0.00013469461728991724, + "loss": 2.652, + "step": 10815 + }, + { + "epoch": 0.9799098548164255, + "grad_norm": 0.7989699840545654, + "learning_rate": 0.00013468857608892648, + "loss": 2.9548, + "step": 10816 + }, + { + "epoch": 0.9800004529908722, + "grad_norm": 0.8272461891174316, + "learning_rate": 0.0001346825348879357, + "loss": 2.9467, + "step": 10817 + }, + { + "epoch": 0.980091051165319, + "grad_norm": 0.7038021087646484, + "learning_rate": 0.00013467649368694497, + "loss": 2.057, + "step": 10818 + }, + { + "epoch": 0.9801816493397658, + "grad_norm": 0.7973771095275879, + "learning_rate": 0.00013467045248595423, + "loss": 2.6404, + "step": 10819 + }, + { + "epoch": 0.9802722475142126, + "grad_norm": 0.8284409046173096, + "learning_rate": 0.00013466441128496347, + "loss": 2.7709, + "step": 10820 + }, + { + "epoch": 0.9803628456886594, + "grad_norm": 0.8071178197860718, + "learning_rate": 0.0001346583700839727, + "loss": 2.8698, + "step": 10821 + }, + { + "epoch": 0.9804534438631062, + "grad_norm": 0.8673900961875916, + "learning_rate": 0.00013465232888298194, + "loss": 2.7154, + "step": 10822 + }, + { + "epoch": 0.980544042037553, + "grad_norm": 0.8477303385734558, + "learning_rate": 0.0001346462876819912, + "loss": 2.9626, + "step": 10823 + }, + { + "epoch": 0.9806346402119998, + "grad_norm": 0.8243212103843689, + "learning_rate": 0.00013464024648100043, + "loss": 2.7485, + "step": 10824 + }, + { + "epoch": 0.9807252383864465, + "grad_norm": 0.7658624053001404, + "learning_rate": 0.00013463420528000966, + "loss": 2.6862, + "step": 10825 + }, + { + "epoch": 0.9808158365608933, + "grad_norm": 0.7711549997329712, + "learning_rate": 0.00013462816407901893, + "loss": 2.8152, + "step": 10826 + }, + { + "epoch": 0.9809064347353401, + "grad_norm": 0.6879411339759827, + "learning_rate": 0.00013462212287802816, + "loss": 2.1157, + "step": 10827 + }, + { + "epoch": 0.9809970329097869, + "grad_norm": 0.7724767327308655, + "learning_rate": 0.00013461608167703742, + "loss": 2.8327, + "step": 10828 + }, + { + "epoch": 0.9810876310842337, + "grad_norm": 0.8834150433540344, + "learning_rate": 0.00013461004047604663, + "loss": 2.7616, + "step": 10829 + }, + { + "epoch": 0.9811782292586805, + "grad_norm": 0.8977527022361755, + "learning_rate": 0.0001346039992750559, + "loss": 2.817, + "step": 10830 + }, + { + "epoch": 0.9812688274331273, + "grad_norm": 0.7860261797904968, + "learning_rate": 0.00013459795807406512, + "loss": 2.771, + "step": 10831 + }, + { + "epoch": 0.981359425607574, + "grad_norm": 0.940514862537384, + "learning_rate": 0.00013459191687307438, + "loss": 2.6623, + "step": 10832 + }, + { + "epoch": 0.9814500237820208, + "grad_norm": 0.8420348763465881, + "learning_rate": 0.00013458587567208362, + "loss": 2.6544, + "step": 10833 + }, + { + "epoch": 0.9815406219564676, + "grad_norm": 0.8554883003234863, + "learning_rate": 0.00013457983447109285, + "loss": 3.0797, + "step": 10834 + }, + { + "epoch": 0.9816312201309144, + "grad_norm": 0.8301708698272705, + "learning_rate": 0.0001345737932701021, + "loss": 2.9476, + "step": 10835 + }, + { + "epoch": 0.9817218183053611, + "grad_norm": 0.8216462731361389, + "learning_rate": 0.00013456775206911135, + "loss": 2.7491, + "step": 10836 + }, + { + "epoch": 0.9818124164798079, + "grad_norm": 0.845390796661377, + "learning_rate": 0.00013456171086812058, + "loss": 2.8668, + "step": 10837 + }, + { + "epoch": 0.9819030146542547, + "grad_norm": 0.776185929775238, + "learning_rate": 0.00013455566966712982, + "loss": 2.8723, + "step": 10838 + }, + { + "epoch": 0.9819936128287015, + "grad_norm": 0.7769374847412109, + "learning_rate": 0.00013454962846613908, + "loss": 2.7364, + "step": 10839 + }, + { + "epoch": 0.9820842110031482, + "grad_norm": 0.8439967036247253, + "learning_rate": 0.0001345435872651483, + "loss": 2.9567, + "step": 10840 + }, + { + "epoch": 0.982174809177595, + "grad_norm": 0.8217287659645081, + "learning_rate": 0.00013453754606415757, + "loss": 2.9167, + "step": 10841 + }, + { + "epoch": 0.9822654073520418, + "grad_norm": 0.8112394213676453, + "learning_rate": 0.0001345315048631668, + "loss": 2.7827, + "step": 10842 + }, + { + "epoch": 0.9823560055264886, + "grad_norm": 0.7919735312461853, + "learning_rate": 0.00013452546366217604, + "loss": 2.5062, + "step": 10843 + }, + { + "epoch": 0.9824466037009354, + "grad_norm": 0.8191226124763489, + "learning_rate": 0.0001345194224611853, + "loss": 2.9449, + "step": 10844 + }, + { + "epoch": 0.9825372018753822, + "grad_norm": 0.7899011373519897, + "learning_rate": 0.00013451338126019454, + "loss": 2.6482, + "step": 10845 + }, + { + "epoch": 0.982627800049829, + "grad_norm": 0.7861994504928589, + "learning_rate": 0.00013450734005920377, + "loss": 2.8464, + "step": 10846 + }, + { + "epoch": 0.9827183982242758, + "grad_norm": 0.8736829161643982, + "learning_rate": 0.000134501298858213, + "loss": 3.041, + "step": 10847 + }, + { + "epoch": 0.9828089963987225, + "grad_norm": 0.8595653176307678, + "learning_rate": 0.00013449525765722226, + "loss": 2.6947, + "step": 10848 + }, + { + "epoch": 0.9828995945731693, + "grad_norm": 0.876106321811676, + "learning_rate": 0.00013448921645623153, + "loss": 2.9214, + "step": 10849 + }, + { + "epoch": 0.9829901927476161, + "grad_norm": 0.7973750233650208, + "learning_rate": 0.00013448317525524073, + "loss": 2.7036, + "step": 10850 + }, + { + "epoch": 0.9830807909220629, + "grad_norm": 0.7875123620033264, + "learning_rate": 0.00013447713405425, + "loss": 2.5677, + "step": 10851 + }, + { + "epoch": 0.9831713890965097, + "grad_norm": 0.7864854335784912, + "learning_rate": 0.00013447109285325923, + "loss": 2.7933, + "step": 10852 + }, + { + "epoch": 0.9832619872709565, + "grad_norm": 0.7851114869117737, + "learning_rate": 0.0001344650516522685, + "loss": 2.8463, + "step": 10853 + }, + { + "epoch": 0.9833525854454033, + "grad_norm": 0.8423100709915161, + "learning_rate": 0.00013445901045127772, + "loss": 2.8207, + "step": 10854 + }, + { + "epoch": 0.9834431836198501, + "grad_norm": 0.8716627955436707, + "learning_rate": 0.00013445296925028696, + "loss": 3.0117, + "step": 10855 + }, + { + "epoch": 0.9835337817942968, + "grad_norm": 0.9028264284133911, + "learning_rate": 0.00013444692804929622, + "loss": 2.978, + "step": 10856 + }, + { + "epoch": 0.9836243799687436, + "grad_norm": 0.8002797961235046, + "learning_rate": 0.00013444088684830545, + "loss": 2.9106, + "step": 10857 + }, + { + "epoch": 0.9837149781431904, + "grad_norm": 0.7610489130020142, + "learning_rate": 0.0001344348456473147, + "loss": 2.7393, + "step": 10858 + }, + { + "epoch": 0.9838055763176372, + "grad_norm": 0.866127610206604, + "learning_rate": 0.00013442880444632392, + "loss": 2.5952, + "step": 10859 + }, + { + "epoch": 0.983896174492084, + "grad_norm": 0.8111260533332825, + "learning_rate": 0.00013442276324533318, + "loss": 2.8199, + "step": 10860 + }, + { + "epoch": 0.9839867726665308, + "grad_norm": 0.8551108837127686, + "learning_rate": 0.00013441672204434242, + "loss": 2.9453, + "step": 10861 + }, + { + "epoch": 0.9840773708409776, + "grad_norm": 0.8442111611366272, + "learning_rate": 0.00013441068084335168, + "loss": 2.7413, + "step": 10862 + }, + { + "epoch": 0.9841679690154244, + "grad_norm": 0.8230049014091492, + "learning_rate": 0.0001344046396423609, + "loss": 3.0619, + "step": 10863 + }, + { + "epoch": 0.9842585671898711, + "grad_norm": 0.8802434802055359, + "learning_rate": 0.00013439859844137014, + "loss": 3.0231, + "step": 10864 + }, + { + "epoch": 0.9843491653643179, + "grad_norm": 0.8215543627738953, + "learning_rate": 0.0001343925572403794, + "loss": 2.6711, + "step": 10865 + }, + { + "epoch": 0.9844397635387647, + "grad_norm": 0.7733488082885742, + "learning_rate": 0.00013438651603938864, + "loss": 2.0144, + "step": 10866 + }, + { + "epoch": 0.9845303617132115, + "grad_norm": 0.7978000044822693, + "learning_rate": 0.00013438047483839787, + "loss": 2.7187, + "step": 10867 + }, + { + "epoch": 0.9846209598876583, + "grad_norm": 1.020568609237671, + "learning_rate": 0.0001343744336374071, + "loss": 2.9319, + "step": 10868 + }, + { + "epoch": 0.9847115580621051, + "grad_norm": 0.7708040475845337, + "learning_rate": 0.00013436839243641637, + "loss": 2.7329, + "step": 10869 + }, + { + "epoch": 0.9848021562365519, + "grad_norm": 0.7355670928955078, + "learning_rate": 0.0001343623512354256, + "loss": 1.9626, + "step": 10870 + }, + { + "epoch": 0.9848927544109987, + "grad_norm": 0.9089875817298889, + "learning_rate": 0.00013435631003443486, + "loss": 2.7848, + "step": 10871 + }, + { + "epoch": 0.9849833525854454, + "grad_norm": 0.7075557708740234, + "learning_rate": 0.0001343502688334441, + "loss": 2.2181, + "step": 10872 + }, + { + "epoch": 0.9850739507598922, + "grad_norm": 0.7839050889015198, + "learning_rate": 0.00013434422763245333, + "loss": 2.864, + "step": 10873 + }, + { + "epoch": 0.985164548934339, + "grad_norm": 0.7882186770439148, + "learning_rate": 0.0001343381864314626, + "loss": 2.9075, + "step": 10874 + }, + { + "epoch": 0.9852551471087858, + "grad_norm": 0.7898355722427368, + "learning_rate": 0.00013433214523047183, + "loss": 2.6523, + "step": 10875 + }, + { + "epoch": 0.9853457452832325, + "grad_norm": 0.7839875817298889, + "learning_rate": 0.00013432610402948106, + "loss": 2.8178, + "step": 10876 + }, + { + "epoch": 0.9854363434576793, + "grad_norm": 0.8232185244560242, + "learning_rate": 0.0001343200628284903, + "loss": 3.0281, + "step": 10877 + }, + { + "epoch": 0.9855269416321261, + "grad_norm": 0.7924366593360901, + "learning_rate": 0.00013431402162749956, + "loss": 2.8191, + "step": 10878 + }, + { + "epoch": 0.9856175398065729, + "grad_norm": 0.8003842830657959, + "learning_rate": 0.00013430798042650882, + "loss": 2.3275, + "step": 10879 + }, + { + "epoch": 0.9857081379810196, + "grad_norm": 0.8090684413909912, + "learning_rate": 0.00013430193922551803, + "loss": 2.5245, + "step": 10880 + }, + { + "epoch": 0.9857987361554664, + "grad_norm": 0.6687867045402527, + "learning_rate": 0.00013429589802452729, + "loss": 2.0423, + "step": 10881 + }, + { + "epoch": 0.9858893343299132, + "grad_norm": 0.8460585474967957, + "learning_rate": 0.00013428985682353652, + "loss": 2.7291, + "step": 10882 + }, + { + "epoch": 0.98597993250436, + "grad_norm": 0.8456172943115234, + "learning_rate": 0.00013428381562254578, + "loss": 3.0688, + "step": 10883 + }, + { + "epoch": 0.9860705306788068, + "grad_norm": 0.7858349084854126, + "learning_rate": 0.00013427777442155502, + "loss": 2.6098, + "step": 10884 + }, + { + "epoch": 0.9861611288532536, + "grad_norm": 0.7051410675048828, + "learning_rate": 0.00013427173322056425, + "loss": 2.0292, + "step": 10885 + }, + { + "epoch": 0.9862517270277004, + "grad_norm": 0.8482012152671814, + "learning_rate": 0.0001342656920195735, + "loss": 2.7853, + "step": 10886 + }, + { + "epoch": 0.9863423252021472, + "grad_norm": 0.7942967414855957, + "learning_rate": 0.00013425965081858274, + "loss": 3.1012, + "step": 10887 + }, + { + "epoch": 0.9864329233765939, + "grad_norm": 0.719994068145752, + "learning_rate": 0.00013425360961759198, + "loss": 2.1739, + "step": 10888 + }, + { + "epoch": 0.9865235215510407, + "grad_norm": 0.7667325735092163, + "learning_rate": 0.0001342475684166012, + "loss": 2.6471, + "step": 10889 + }, + { + "epoch": 0.9866141197254875, + "grad_norm": 0.6955845355987549, + "learning_rate": 0.00013424152721561047, + "loss": 2.0554, + "step": 10890 + }, + { + "epoch": 0.9867047178999343, + "grad_norm": 1.1477584838867188, + "learning_rate": 0.0001342354860146197, + "loss": 2.8608, + "step": 10891 + }, + { + "epoch": 0.9867953160743811, + "grad_norm": 0.9570803642272949, + "learning_rate": 0.00013422944481362897, + "loss": 3.0943, + "step": 10892 + }, + { + "epoch": 0.9868859142488279, + "grad_norm": 0.7832409739494324, + "learning_rate": 0.0001342234036126382, + "loss": 2.6172, + "step": 10893 + }, + { + "epoch": 0.9869765124232747, + "grad_norm": 0.8698916435241699, + "learning_rate": 0.00013421736241164744, + "loss": 2.9215, + "step": 10894 + }, + { + "epoch": 0.9870671105977215, + "grad_norm": 0.8625399470329285, + "learning_rate": 0.0001342113212106567, + "loss": 2.8233, + "step": 10895 + }, + { + "epoch": 0.9871577087721682, + "grad_norm": 0.8305600881576538, + "learning_rate": 0.00013420528000966593, + "loss": 2.8922, + "step": 10896 + }, + { + "epoch": 0.987248306946615, + "grad_norm": 0.8852500915527344, + "learning_rate": 0.00013419923880867517, + "loss": 2.5717, + "step": 10897 + }, + { + "epoch": 0.9873389051210618, + "grad_norm": 0.8397268652915955, + "learning_rate": 0.0001341931976076844, + "loss": 2.7817, + "step": 10898 + }, + { + "epoch": 0.9874295032955086, + "grad_norm": 0.853269100189209, + "learning_rate": 0.00013418715640669366, + "loss": 2.8919, + "step": 10899 + }, + { + "epoch": 0.9875201014699554, + "grad_norm": 0.8279159069061279, + "learning_rate": 0.0001341811152057029, + "loss": 3.0561, + "step": 10900 + }, + { + "epoch": 0.9876106996444022, + "grad_norm": 0.8144961595535278, + "learning_rate": 0.00013417507400471213, + "loss": 2.9815, + "step": 10901 + }, + { + "epoch": 0.987701297818849, + "grad_norm": 0.7361277341842651, + "learning_rate": 0.0001341690328037214, + "loss": 2.6882, + "step": 10902 + }, + { + "epoch": 0.9877918959932958, + "grad_norm": 0.807045578956604, + "learning_rate": 0.00013416299160273063, + "loss": 2.5866, + "step": 10903 + }, + { + "epoch": 0.9878824941677425, + "grad_norm": 0.9127631187438965, + "learning_rate": 0.00013415695040173989, + "loss": 3.0322, + "step": 10904 + }, + { + "epoch": 0.9879730923421893, + "grad_norm": 0.8130974769592285, + "learning_rate": 0.00013415090920074912, + "loss": 2.7229, + "step": 10905 + }, + { + "epoch": 0.9880636905166361, + "grad_norm": 0.8095946907997131, + "learning_rate": 0.00013414486799975835, + "loss": 2.9054, + "step": 10906 + }, + { + "epoch": 0.9881542886910829, + "grad_norm": 0.858174741268158, + "learning_rate": 0.0001341388267987676, + "loss": 2.6738, + "step": 10907 + }, + { + "epoch": 0.9882448868655297, + "grad_norm": 0.8499566316604614, + "learning_rate": 0.00013413278559777685, + "loss": 3.1093, + "step": 10908 + }, + { + "epoch": 0.9883354850399765, + "grad_norm": 0.8828730583190918, + "learning_rate": 0.00013412674439678608, + "loss": 2.842, + "step": 10909 + }, + { + "epoch": 0.9884260832144233, + "grad_norm": 0.7737197279930115, + "learning_rate": 0.00013412070319579532, + "loss": 2.5367, + "step": 10910 + }, + { + "epoch": 0.98851668138887, + "grad_norm": 0.8159345984458923, + "learning_rate": 0.00013411466199480458, + "loss": 2.8897, + "step": 10911 + }, + { + "epoch": 0.9886072795633168, + "grad_norm": 0.7954857349395752, + "learning_rate": 0.0001341086207938138, + "loss": 2.8019, + "step": 10912 + }, + { + "epoch": 0.9886978777377636, + "grad_norm": 0.6940935850143433, + "learning_rate": 0.00013410257959282307, + "loss": 1.8535, + "step": 10913 + }, + { + "epoch": 0.9887884759122104, + "grad_norm": 0.8698012232780457, + "learning_rate": 0.00013409653839183228, + "loss": 2.783, + "step": 10914 + }, + { + "epoch": 0.9888790740866572, + "grad_norm": 0.8181320428848267, + "learning_rate": 0.00013409049719084154, + "loss": 2.7714, + "step": 10915 + }, + { + "epoch": 0.988969672261104, + "grad_norm": 0.8093773126602173, + "learning_rate": 0.0001340844559898508, + "loss": 2.6231, + "step": 10916 + }, + { + "epoch": 0.9890602704355507, + "grad_norm": 0.8562283515930176, + "learning_rate": 0.00013407841478886004, + "loss": 2.6433, + "step": 10917 + }, + { + "epoch": 0.9891508686099975, + "grad_norm": 0.9112843871116638, + "learning_rate": 0.00013407237358786927, + "loss": 2.8318, + "step": 10918 + }, + { + "epoch": 0.9892414667844442, + "grad_norm": 0.8146413564682007, + "learning_rate": 0.0001340663323868785, + "loss": 2.7092, + "step": 10919 + }, + { + "epoch": 0.989332064958891, + "grad_norm": 0.8476353883743286, + "learning_rate": 0.00013406029118588777, + "loss": 2.944, + "step": 10920 + }, + { + "epoch": 0.9894226631333378, + "grad_norm": 0.771661639213562, + "learning_rate": 0.000134054249984897, + "loss": 2.837, + "step": 10921 + }, + { + "epoch": 0.9895132613077846, + "grad_norm": 0.8499038219451904, + "learning_rate": 0.00013404820878390623, + "loss": 2.8421, + "step": 10922 + }, + { + "epoch": 0.9896038594822314, + "grad_norm": 0.7771455645561218, + "learning_rate": 0.0001340421675829155, + "loss": 2.8395, + "step": 10923 + }, + { + "epoch": 0.9896944576566782, + "grad_norm": 0.8821293711662292, + "learning_rate": 0.00013403612638192473, + "loss": 2.3474, + "step": 10924 + }, + { + "epoch": 0.989785055831125, + "grad_norm": 0.7850678563117981, + "learning_rate": 0.000134030085180934, + "loss": 2.8831, + "step": 10925 + }, + { + "epoch": 0.9898756540055718, + "grad_norm": 0.7820662260055542, + "learning_rate": 0.00013402404397994323, + "loss": 2.8257, + "step": 10926 + }, + { + "epoch": 0.9899662521800185, + "grad_norm": 0.8344864845275879, + "learning_rate": 0.00013401800277895246, + "loss": 2.8257, + "step": 10927 + }, + { + "epoch": 0.9900568503544653, + "grad_norm": 0.7881389856338501, + "learning_rate": 0.0001340119615779617, + "loss": 2.8107, + "step": 10928 + }, + { + "epoch": 0.9901474485289121, + "grad_norm": 0.7955217361450195, + "learning_rate": 0.00013400592037697095, + "loss": 2.5191, + "step": 10929 + }, + { + "epoch": 0.9902380467033589, + "grad_norm": 0.7686125636100769, + "learning_rate": 0.0001339998791759802, + "loss": 2.798, + "step": 10930 + }, + { + "epoch": 0.9903286448778057, + "grad_norm": 0.6900812387466431, + "learning_rate": 0.00013399383797498942, + "loss": 2.0291, + "step": 10931 + }, + { + "epoch": 0.9904192430522525, + "grad_norm": 0.8075054883956909, + "learning_rate": 0.00013398779677399868, + "loss": 2.9428, + "step": 10932 + }, + { + "epoch": 0.9905098412266993, + "grad_norm": 0.8115695118904114, + "learning_rate": 0.00013398175557300792, + "loss": 2.8166, + "step": 10933 + }, + { + "epoch": 0.9906004394011461, + "grad_norm": 0.8414000272750854, + "learning_rate": 0.00013397571437201718, + "loss": 2.8374, + "step": 10934 + }, + { + "epoch": 0.9906910375755928, + "grad_norm": 0.763521134853363, + "learning_rate": 0.0001339696731710264, + "loss": 2.8227, + "step": 10935 + }, + { + "epoch": 0.9907816357500396, + "grad_norm": 0.8402549028396606, + "learning_rate": 0.00013396363197003565, + "loss": 3.0341, + "step": 10936 + }, + { + "epoch": 0.9908722339244864, + "grad_norm": 0.831228494644165, + "learning_rate": 0.00013395759076904488, + "loss": 2.8918, + "step": 10937 + }, + { + "epoch": 0.9909628320989332, + "grad_norm": 0.7666506171226501, + "learning_rate": 0.00013395154956805414, + "loss": 2.5528, + "step": 10938 + }, + { + "epoch": 0.99105343027338, + "grad_norm": 0.9370864033699036, + "learning_rate": 0.00013394550836706338, + "loss": 2.9624, + "step": 10939 + }, + { + "epoch": 0.9911440284478268, + "grad_norm": 0.7454035878181458, + "learning_rate": 0.0001339394671660726, + "loss": 3.0021, + "step": 10940 + }, + { + "epoch": 0.9912346266222736, + "grad_norm": 0.7789361476898193, + "learning_rate": 0.00013393342596508187, + "loss": 2.777, + "step": 10941 + }, + { + "epoch": 0.9913252247967204, + "grad_norm": 0.7990679740905762, + "learning_rate": 0.0001339273847640911, + "loss": 2.6321, + "step": 10942 + }, + { + "epoch": 0.9914158229711671, + "grad_norm": 0.8742187023162842, + "learning_rate": 0.00013392134356310037, + "loss": 2.9756, + "step": 10943 + }, + { + "epoch": 0.9915064211456139, + "grad_norm": 0.857029914855957, + "learning_rate": 0.00013391530236210957, + "loss": 2.8244, + "step": 10944 + }, + { + "epoch": 0.9915970193200607, + "grad_norm": 0.7928543090820312, + "learning_rate": 0.00013390926116111883, + "loss": 2.2852, + "step": 10945 + }, + { + "epoch": 0.9916876174945075, + "grad_norm": 0.8672637343406677, + "learning_rate": 0.0001339032199601281, + "loss": 3.1095, + "step": 10946 + }, + { + "epoch": 0.9917782156689543, + "grad_norm": 0.8070001602172852, + "learning_rate": 0.00013389717875913733, + "loss": 2.9116, + "step": 10947 + }, + { + "epoch": 0.9918688138434011, + "grad_norm": 0.7543050050735474, + "learning_rate": 0.00013389113755814656, + "loss": 3.0267, + "step": 10948 + }, + { + "epoch": 0.9919594120178479, + "grad_norm": 0.7894711494445801, + "learning_rate": 0.0001338850963571558, + "loss": 2.6974, + "step": 10949 + }, + { + "epoch": 0.9920500101922947, + "grad_norm": 0.78280109167099, + "learning_rate": 0.00013387905515616506, + "loss": 3.0896, + "step": 10950 + }, + { + "epoch": 0.9921406083667414, + "grad_norm": 0.8692906498908997, + "learning_rate": 0.0001338730139551743, + "loss": 2.8931, + "step": 10951 + }, + { + "epoch": 0.9922312065411882, + "grad_norm": 0.8009922504425049, + "learning_rate": 0.00013386697275418353, + "loss": 2.8095, + "step": 10952 + }, + { + "epoch": 0.992321804715635, + "grad_norm": 0.8004471063613892, + "learning_rate": 0.0001338609315531928, + "loss": 2.1296, + "step": 10953 + }, + { + "epoch": 0.9924124028900818, + "grad_norm": 0.6956804394721985, + "learning_rate": 0.00013385489035220202, + "loss": 2.1014, + "step": 10954 + }, + { + "epoch": 0.9925030010645286, + "grad_norm": 0.8135072588920593, + "learning_rate": 0.00013384884915121128, + "loss": 2.9639, + "step": 10955 + }, + { + "epoch": 0.9925935992389754, + "grad_norm": 0.7828360795974731, + "learning_rate": 0.00013384280795022052, + "loss": 2.9726, + "step": 10956 + }, + { + "epoch": 0.9926841974134221, + "grad_norm": 0.7805309295654297, + "learning_rate": 0.00013383676674922975, + "loss": 3.0462, + "step": 10957 + }, + { + "epoch": 0.9927747955878689, + "grad_norm": 0.792011559009552, + "learning_rate": 0.00013383072554823899, + "loss": 2.9779, + "step": 10958 + }, + { + "epoch": 0.9928653937623156, + "grad_norm": 0.7913088202476501, + "learning_rate": 0.00013382468434724825, + "loss": 2.834, + "step": 10959 + }, + { + "epoch": 0.9929559919367624, + "grad_norm": 0.7800768613815308, + "learning_rate": 0.00013381864314625748, + "loss": 2.8069, + "step": 10960 + }, + { + "epoch": 0.9930465901112092, + "grad_norm": 0.7335860729217529, + "learning_rate": 0.00013381260194526672, + "loss": 2.7967, + "step": 10961 + }, + { + "epoch": 0.993137188285656, + "grad_norm": 0.8053755760192871, + "learning_rate": 0.00013380656074427598, + "loss": 2.8745, + "step": 10962 + }, + { + "epoch": 0.9932277864601028, + "grad_norm": 1.7181907892227173, + "learning_rate": 0.0001338005195432852, + "loss": 2.7402, + "step": 10963 + }, + { + "epoch": 0.9933183846345496, + "grad_norm": 0.838829755783081, + "learning_rate": 0.00013379447834229447, + "loss": 2.8139, + "step": 10964 + }, + { + "epoch": 0.9934089828089964, + "grad_norm": 0.777891218662262, + "learning_rate": 0.00013378843714130368, + "loss": 2.8743, + "step": 10965 + }, + { + "epoch": 0.9934995809834432, + "grad_norm": 0.7098841071128845, + "learning_rate": 0.00013378239594031294, + "loss": 2.135, + "step": 10966 + }, + { + "epoch": 0.9935901791578899, + "grad_norm": 0.7272146344184875, + "learning_rate": 0.00013377635473932217, + "loss": 1.9466, + "step": 10967 + }, + { + "epoch": 0.9936807773323367, + "grad_norm": 0.851897656917572, + "learning_rate": 0.00013377031353833144, + "loss": 2.994, + "step": 10968 + }, + { + "epoch": 0.9937713755067835, + "grad_norm": 0.8390393257141113, + "learning_rate": 0.00013376427233734067, + "loss": 2.9137, + "step": 10969 + }, + { + "epoch": 0.9938619736812303, + "grad_norm": 0.7615281939506531, + "learning_rate": 0.0001337582311363499, + "loss": 2.794, + "step": 10970 + }, + { + "epoch": 0.9939525718556771, + "grad_norm": 0.8262609839439392, + "learning_rate": 0.00013375218993535916, + "loss": 2.66, + "step": 10971 + }, + { + "epoch": 0.9940431700301239, + "grad_norm": 0.7791025638580322, + "learning_rate": 0.0001337461487343684, + "loss": 2.1075, + "step": 10972 + }, + { + "epoch": 0.9941337682045707, + "grad_norm": 0.7873169779777527, + "learning_rate": 0.00013374010753337763, + "loss": 2.7578, + "step": 10973 + }, + { + "epoch": 0.9942243663790175, + "grad_norm": 0.9079036116600037, + "learning_rate": 0.00013373406633238687, + "loss": 2.874, + "step": 10974 + }, + { + "epoch": 0.9943149645534642, + "grad_norm": 0.8533504605293274, + "learning_rate": 0.00013372802513139613, + "loss": 2.8107, + "step": 10975 + }, + { + "epoch": 0.994405562727911, + "grad_norm": 0.7884867787361145, + "learning_rate": 0.0001337219839304054, + "loss": 2.745, + "step": 10976 + }, + { + "epoch": 0.9944961609023578, + "grad_norm": 0.7593388557434082, + "learning_rate": 0.00013371594272941462, + "loss": 2.153, + "step": 10977 + }, + { + "epoch": 0.9945867590768046, + "grad_norm": 0.7860540151596069, + "learning_rate": 0.00013370990152842386, + "loss": 2.9085, + "step": 10978 + }, + { + "epoch": 0.9946773572512514, + "grad_norm": 0.7888593673706055, + "learning_rate": 0.0001337038603274331, + "loss": 2.9453, + "step": 10979 + }, + { + "epoch": 0.9947679554256982, + "grad_norm": 0.7612860202789307, + "learning_rate": 0.00013369781912644235, + "loss": 2.6088, + "step": 10980 + }, + { + "epoch": 0.994858553600145, + "grad_norm": 0.8403357267379761, + "learning_rate": 0.00013369177792545159, + "loss": 2.7201, + "step": 10981 + }, + { + "epoch": 0.9949491517745918, + "grad_norm": 0.7825067043304443, + "learning_rate": 0.00013368573672446082, + "loss": 2.5604, + "step": 10982 + }, + { + "epoch": 0.9950397499490385, + "grad_norm": 0.7962688207626343, + "learning_rate": 0.00013367969552347008, + "loss": 2.7357, + "step": 10983 + }, + { + "epoch": 0.9951303481234853, + "grad_norm": 0.7874683737754822, + "learning_rate": 0.00013367365432247932, + "loss": 2.8043, + "step": 10984 + }, + { + "epoch": 0.9952209462979321, + "grad_norm": 0.8253872990608215, + "learning_rate": 0.00013366761312148858, + "loss": 2.7713, + "step": 10985 + }, + { + "epoch": 0.9953115444723789, + "grad_norm": 0.8342469334602356, + "learning_rate": 0.00013366157192049778, + "loss": 3.0973, + "step": 10986 + }, + { + "epoch": 0.9954021426468257, + "grad_norm": 0.9046341180801392, + "learning_rate": 0.00013365553071950704, + "loss": 3.1228, + "step": 10987 + }, + { + "epoch": 0.9954927408212725, + "grad_norm": 0.7523354887962341, + "learning_rate": 0.00013364948951851628, + "loss": 2.7863, + "step": 10988 + }, + { + "epoch": 0.9955833389957193, + "grad_norm": 0.795161247253418, + "learning_rate": 0.00013364344831752554, + "loss": 2.9049, + "step": 10989 + }, + { + "epoch": 0.9956739371701661, + "grad_norm": 0.8031451106071472, + "learning_rate": 0.00013363740711653477, + "loss": 2.6422, + "step": 10990 + }, + { + "epoch": 0.9957645353446128, + "grad_norm": 0.7560983300209045, + "learning_rate": 0.000133631365915544, + "loss": 3.0308, + "step": 10991 + }, + { + "epoch": 0.9958551335190596, + "grad_norm": 0.7926454544067383, + "learning_rate": 0.00013362532471455327, + "loss": 2.9827, + "step": 10992 + }, + { + "epoch": 0.9959457316935064, + "grad_norm": 0.7957165241241455, + "learning_rate": 0.0001336192835135625, + "loss": 2.8491, + "step": 10993 + }, + { + "epoch": 0.9960363298679532, + "grad_norm": 0.8058078289031982, + "learning_rate": 0.00013361324231257176, + "loss": 2.7724, + "step": 10994 + }, + { + "epoch": 0.9961269280424, + "grad_norm": 0.7702135443687439, + "learning_rate": 0.00013360720111158097, + "loss": 2.6978, + "step": 10995 + }, + { + "epoch": 0.9962175262168468, + "grad_norm": 0.8671699166297913, + "learning_rate": 0.00013360115991059023, + "loss": 2.7594, + "step": 10996 + }, + { + "epoch": 0.9963081243912936, + "grad_norm": 0.7841046452522278, + "learning_rate": 0.00013359511870959947, + "loss": 2.6644, + "step": 10997 + }, + { + "epoch": 0.9963987225657402, + "grad_norm": 0.8184312582015991, + "learning_rate": 0.00013358907750860873, + "loss": 2.6503, + "step": 10998 + }, + { + "epoch": 0.996489320740187, + "grad_norm": 0.8118552565574646, + "learning_rate": 0.00013358303630761796, + "loss": 3.0475, + "step": 10999 + }, + { + "epoch": 0.9965799189146338, + "grad_norm": 0.8527470231056213, + "learning_rate": 0.0001335769951066272, + "loss": 2.7211, + "step": 11000 + }, + { + "epoch": 0.9966705170890806, + "grad_norm": 0.8036944270133972, + "learning_rate": 0.00013357095390563646, + "loss": 2.6493, + "step": 11001 + }, + { + "epoch": 0.9967611152635274, + "grad_norm": 0.8049510717391968, + "learning_rate": 0.0001335649127046457, + "loss": 2.7961, + "step": 11002 + }, + { + "epoch": 0.9968517134379742, + "grad_norm": 0.8720367550849915, + "learning_rate": 0.00013355887150365493, + "loss": 2.7865, + "step": 11003 + }, + { + "epoch": 0.996942311612421, + "grad_norm": 0.8009216785430908, + "learning_rate": 0.00013355283030266416, + "loss": 2.8389, + "step": 11004 + }, + { + "epoch": 0.9970329097868678, + "grad_norm": 0.7496457695960999, + "learning_rate": 0.00013354678910167342, + "loss": 2.6247, + "step": 11005 + }, + { + "epoch": 0.9971235079613145, + "grad_norm": 0.8580725789070129, + "learning_rate": 0.00013354074790068268, + "loss": 2.8518, + "step": 11006 + }, + { + "epoch": 0.9972141061357613, + "grad_norm": 0.8212641477584839, + "learning_rate": 0.00013353470669969192, + "loss": 2.8556, + "step": 11007 + }, + { + "epoch": 0.9973047043102081, + "grad_norm": 0.7644590139389038, + "learning_rate": 0.00013352866549870115, + "loss": 2.7112, + "step": 11008 + }, + { + "epoch": 0.9973953024846549, + "grad_norm": 0.8247709274291992, + "learning_rate": 0.00013352262429771038, + "loss": 2.6739, + "step": 11009 + }, + { + "epoch": 0.9974859006591017, + "grad_norm": 0.9328873157501221, + "learning_rate": 0.00013351658309671964, + "loss": 2.5133, + "step": 11010 + }, + { + "epoch": 0.9975764988335485, + "grad_norm": 0.7962944507598877, + "learning_rate": 0.00013351054189572888, + "loss": 2.8526, + "step": 11011 + }, + { + "epoch": 0.9976670970079953, + "grad_norm": 0.8044102787971497, + "learning_rate": 0.0001335045006947381, + "loss": 2.8061, + "step": 11012 + }, + { + "epoch": 0.9977576951824421, + "grad_norm": 0.82625412940979, + "learning_rate": 0.00013349845949374737, + "loss": 2.7275, + "step": 11013 + }, + { + "epoch": 0.9978482933568888, + "grad_norm": 0.8121049404144287, + "learning_rate": 0.0001334924182927566, + "loss": 2.6598, + "step": 11014 + }, + { + "epoch": 0.9979388915313356, + "grad_norm": 0.6867625713348389, + "learning_rate": 0.00013348637709176587, + "loss": 2.0853, + "step": 11015 + }, + { + "epoch": 0.9980294897057824, + "grad_norm": 0.8835938572883606, + "learning_rate": 0.00013348033589077508, + "loss": 2.7006, + "step": 11016 + }, + { + "epoch": 0.9981200878802292, + "grad_norm": 0.8258145451545715, + "learning_rate": 0.00013347429468978434, + "loss": 2.6358, + "step": 11017 + }, + { + "epoch": 0.998210686054676, + "grad_norm": 0.765067994594574, + "learning_rate": 0.00013346825348879357, + "loss": 2.5719, + "step": 11018 + }, + { + "epoch": 0.9983012842291228, + "grad_norm": 0.8649446368217468, + "learning_rate": 0.00013346221228780283, + "loss": 2.8954, + "step": 11019 + }, + { + "epoch": 0.9983918824035696, + "grad_norm": 0.8765762448310852, + "learning_rate": 0.00013345617108681207, + "loss": 3.0705, + "step": 11020 + }, + { + "epoch": 0.9984824805780164, + "grad_norm": 0.755993127822876, + "learning_rate": 0.0001334501298858213, + "loss": 2.432, + "step": 11021 + }, + { + "epoch": 0.9985730787524632, + "grad_norm": 0.8185970783233643, + "learning_rate": 0.00013344408868483056, + "loss": 2.8038, + "step": 11022 + }, + { + "epoch": 0.9986636769269099, + "grad_norm": 0.8168573379516602, + "learning_rate": 0.0001334380474838398, + "loss": 3.0909, + "step": 11023 + }, + { + "epoch": 0.9987542751013567, + "grad_norm": 0.719412624835968, + "learning_rate": 0.00013343200628284903, + "loss": 2.2395, + "step": 11024 + }, + { + "epoch": 0.9988448732758035, + "grad_norm": 0.767345130443573, + "learning_rate": 0.00013342596508185826, + "loss": 2.751, + "step": 11025 + }, + { + "epoch": 0.9989354714502503, + "grad_norm": 0.867748498916626, + "learning_rate": 0.00013341992388086753, + "loss": 2.8072, + "step": 11026 + }, + { + "epoch": 0.9990260696246971, + "grad_norm": 0.8184195160865784, + "learning_rate": 0.00013341388267987676, + "loss": 2.8431, + "step": 11027 + }, + { + "epoch": 0.9991166677991439, + "grad_norm": 0.852786123752594, + "learning_rate": 0.00013340784147888602, + "loss": 2.6665, + "step": 11028 + }, + { + "epoch": 0.9992072659735907, + "grad_norm": 0.7571428418159485, + "learning_rate": 0.00013340180027789525, + "loss": 2.6311, + "step": 11029 + }, + { + "epoch": 0.9992978641480375, + "grad_norm": 0.8731504082679749, + "learning_rate": 0.0001333957590769045, + "loss": 2.6282, + "step": 11030 + }, + { + "epoch": 0.9993884623224842, + "grad_norm": 0.8179746270179749, + "learning_rate": 0.00013338971787591375, + "loss": 2.6193, + "step": 11031 + }, + { + "epoch": 0.999479060496931, + "grad_norm": 0.7926318049430847, + "learning_rate": 0.00013338367667492298, + "loss": 2.5766, + "step": 11032 + }, + { + "epoch": 0.9995696586713778, + "grad_norm": 0.7904252409934998, + "learning_rate": 0.00013337763547393222, + "loss": 2.5887, + "step": 11033 + }, + { + "epoch": 0.9996602568458246, + "grad_norm": 0.8363568186759949, + "learning_rate": 0.00013337159427294145, + "loss": 2.6137, + "step": 11034 + }, + { + "epoch": 0.9997508550202714, + "grad_norm": 0.7733702063560486, + "learning_rate": 0.0001333655530719507, + "loss": 3.0008, + "step": 11035 + }, + { + "epoch": 0.9998414531947182, + "grad_norm": 0.8798292279243469, + "learning_rate": 0.00013335951187095997, + "loss": 2.5739, + "step": 11036 + }, + { + "epoch": 0.999932051369165, + "grad_norm": 0.8215324282646179, + "learning_rate": 0.00013335347066996918, + "loss": 2.8692, + "step": 11037 + }, + { + "epoch": 1.0000226495436118, + "grad_norm": 0.7774962186813354, + "learning_rate": 0.00013334742946897844, + "loss": 2.8123, + "step": 11038 + }, + { + "epoch": 1.0001132477180585, + "grad_norm": 0.777630627155304, + "learning_rate": 0.00013334138826798768, + "loss": 2.7915, + "step": 11039 + }, + { + "epoch": 1.0002038458925053, + "grad_norm": 0.6789016127586365, + "learning_rate": 0.00013333534706699694, + "loss": 2.0528, + "step": 11040 + }, + { + "epoch": 1.0002944440669521, + "grad_norm": 0.7567192316055298, + "learning_rate": 0.00013332930586600617, + "loss": 2.6133, + "step": 11041 + }, + { + "epoch": 1.000385042241399, + "grad_norm": 0.752346396446228, + "learning_rate": 0.0001333232646650154, + "loss": 2.5351, + "step": 11042 + }, + { + "epoch": 1.0004756404158457, + "grad_norm": 0.7610706686973572, + "learning_rate": 0.00013331722346402467, + "loss": 2.8621, + "step": 11043 + }, + { + "epoch": 1.0005662385902925, + "grad_norm": 0.8163691163063049, + "learning_rate": 0.0001333111822630339, + "loss": 2.9265, + "step": 11044 + }, + { + "epoch": 1.0006568367647393, + "grad_norm": 0.7739965319633484, + "learning_rate": 0.00013330514106204316, + "loss": 2.5865, + "step": 11045 + }, + { + "epoch": 1.000747434939186, + "grad_norm": 0.6576281785964966, + "learning_rate": 0.00013329909986105237, + "loss": 1.9174, + "step": 11046 + }, + { + "epoch": 1.0008380331136328, + "grad_norm": 0.779090166091919, + "learning_rate": 0.00013329305866006163, + "loss": 2.6896, + "step": 11047 + }, + { + "epoch": 1.0009286312880796, + "grad_norm": 0.8121617436408997, + "learning_rate": 0.00013328701745907086, + "loss": 2.9135, + "step": 11048 + }, + { + "epoch": 1.0010192294625264, + "grad_norm": 0.8341018557548523, + "learning_rate": 0.00013328097625808013, + "loss": 2.5994, + "step": 11049 + }, + { + "epoch": 1.0011098276369732, + "grad_norm": 0.8049536943435669, + "learning_rate": 0.00013327493505708933, + "loss": 2.6564, + "step": 11050 + }, + { + "epoch": 1.00120042581142, + "grad_norm": 0.795446515083313, + "learning_rate": 0.0001332688938560986, + "loss": 2.6034, + "step": 11051 + }, + { + "epoch": 1.0012910239858668, + "grad_norm": 0.7833463549613953, + "learning_rate": 0.00013326285265510785, + "loss": 2.4421, + "step": 11052 + }, + { + "epoch": 1.0013816221603136, + "grad_norm": 0.7859344482421875, + "learning_rate": 0.0001332568114541171, + "loss": 2.7315, + "step": 11053 + }, + { + "epoch": 1.0014722203347604, + "grad_norm": 0.9195829629898071, + "learning_rate": 0.00013325077025312632, + "loss": 2.6551, + "step": 11054 + }, + { + "epoch": 1.0015628185092071, + "grad_norm": 0.7963623404502869, + "learning_rate": 0.00013324472905213556, + "loss": 2.6886, + "step": 11055 + }, + { + "epoch": 1.001653416683654, + "grad_norm": 0.8039566874504089, + "learning_rate": 0.00013323868785114482, + "loss": 2.7912, + "step": 11056 + }, + { + "epoch": 1.0017440148581007, + "grad_norm": 0.9434640407562256, + "learning_rate": 0.00013323264665015405, + "loss": 2.8928, + "step": 11057 + }, + { + "epoch": 1.0018346130325475, + "grad_norm": 0.8436272740364075, + "learning_rate": 0.0001332266054491633, + "loss": 2.4887, + "step": 11058 + }, + { + "epoch": 1.001925211206994, + "grad_norm": 0.8191894888877869, + "learning_rate": 0.00013322056424817255, + "loss": 2.5706, + "step": 11059 + }, + { + "epoch": 1.0020158093814409, + "grad_norm": 0.8440008759498596, + "learning_rate": 0.00013321452304718178, + "loss": 2.7752, + "step": 11060 + }, + { + "epoch": 1.0021064075558876, + "grad_norm": 0.855593204498291, + "learning_rate": 0.00013320848184619104, + "loss": 2.5236, + "step": 11061 + }, + { + "epoch": 1.0021970057303344, + "grad_norm": 0.8948823809623718, + "learning_rate": 0.00013320244064520028, + "loss": 2.7627, + "step": 11062 + }, + { + "epoch": 1.0022876039047812, + "grad_norm": 0.8899322748184204, + "learning_rate": 0.0001331963994442095, + "loss": 2.6563, + "step": 11063 + }, + { + "epoch": 1.002378202079228, + "grad_norm": 0.840539276599884, + "learning_rate": 0.00013319035824321874, + "loss": 2.6516, + "step": 11064 + }, + { + "epoch": 1.0024688002536748, + "grad_norm": 0.8329349160194397, + "learning_rate": 0.000133184317042228, + "loss": 2.8859, + "step": 11065 + }, + { + "epoch": 1.0025593984281216, + "grad_norm": 0.8866831064224243, + "learning_rate": 0.00013317827584123727, + "loss": 2.5604, + "step": 11066 + }, + { + "epoch": 1.0026499966025684, + "grad_norm": 0.8191143870353699, + "learning_rate": 0.00013317223464024647, + "loss": 2.9132, + "step": 11067 + }, + { + "epoch": 1.0027405947770152, + "grad_norm": 0.8933125138282776, + "learning_rate": 0.00013316619343925573, + "loss": 2.7222, + "step": 11068 + }, + { + "epoch": 1.002831192951462, + "grad_norm": 0.8916510343551636, + "learning_rate": 0.00013316015223826497, + "loss": 2.8323, + "step": 11069 + }, + { + "epoch": 1.0029217911259087, + "grad_norm": 0.8440781831741333, + "learning_rate": 0.00013315411103727423, + "loss": 2.6566, + "step": 11070 + }, + { + "epoch": 1.0030123893003555, + "grad_norm": 0.8600988984107971, + "learning_rate": 0.00013314806983628346, + "loss": 2.7713, + "step": 11071 + }, + { + "epoch": 1.0031029874748023, + "grad_norm": 0.7788486480712891, + "learning_rate": 0.0001331420286352927, + "loss": 1.956, + "step": 11072 + }, + { + "epoch": 1.003193585649249, + "grad_norm": 0.8838188648223877, + "learning_rate": 0.00013313598743430196, + "loss": 2.5285, + "step": 11073 + }, + { + "epoch": 1.0032841838236959, + "grad_norm": 0.8263453841209412, + "learning_rate": 0.0001331299462333112, + "loss": 2.7682, + "step": 11074 + }, + { + "epoch": 1.0033747819981427, + "grad_norm": 0.8242684602737427, + "learning_rate": 0.00013312390503232043, + "loss": 2.6899, + "step": 11075 + }, + { + "epoch": 1.0034653801725895, + "grad_norm": 0.856952428817749, + "learning_rate": 0.00013311786383132966, + "loss": 2.7007, + "step": 11076 + }, + { + "epoch": 1.0035559783470362, + "grad_norm": 0.8725153803825378, + "learning_rate": 0.00013311182263033892, + "loss": 2.9723, + "step": 11077 + }, + { + "epoch": 1.003646576521483, + "grad_norm": 0.834966242313385, + "learning_rate": 0.00013310578142934816, + "loss": 2.9291, + "step": 11078 + }, + { + "epoch": 1.0037371746959298, + "grad_norm": 0.8296695351600647, + "learning_rate": 0.00013309974022835742, + "loss": 2.6898, + "step": 11079 + }, + { + "epoch": 1.0038277728703766, + "grad_norm": 0.8207578659057617, + "learning_rate": 0.00013309369902736662, + "loss": 2.9607, + "step": 11080 + }, + { + "epoch": 1.0039183710448234, + "grad_norm": 0.7963082194328308, + "learning_rate": 0.00013308765782637589, + "loss": 2.8292, + "step": 11081 + }, + { + "epoch": 1.0040089692192702, + "grad_norm": 0.7877174019813538, + "learning_rate": 0.00013308161662538515, + "loss": 2.5931, + "step": 11082 + }, + { + "epoch": 1.004099567393717, + "grad_norm": 0.8615460395812988, + "learning_rate": 0.00013307557542439438, + "loss": 2.6321, + "step": 11083 + }, + { + "epoch": 1.0041901655681638, + "grad_norm": 0.8173784017562866, + "learning_rate": 0.00013306953422340362, + "loss": 2.8526, + "step": 11084 + }, + { + "epoch": 1.0042807637426105, + "grad_norm": 0.8774644732475281, + "learning_rate": 0.00013306349302241285, + "loss": 2.7145, + "step": 11085 + }, + { + "epoch": 1.0043713619170573, + "grad_norm": 0.8850494623184204, + "learning_rate": 0.0001330574518214221, + "loss": 2.6355, + "step": 11086 + }, + { + "epoch": 1.0044619600915041, + "grad_norm": 0.8330889940261841, + "learning_rate": 0.00013305141062043134, + "loss": 2.7981, + "step": 11087 + }, + { + "epoch": 1.004552558265951, + "grad_norm": 0.9105362892150879, + "learning_rate": 0.00013304536941944058, + "loss": 2.6596, + "step": 11088 + }, + { + "epoch": 1.0046431564403977, + "grad_norm": 0.813674807548523, + "learning_rate": 0.00013303932821844984, + "loss": 2.6701, + "step": 11089 + }, + { + "epoch": 1.0047337546148445, + "grad_norm": 0.8319053053855896, + "learning_rate": 0.00013303328701745907, + "loss": 2.7878, + "step": 11090 + }, + { + "epoch": 1.0048243527892913, + "grad_norm": 0.8121641874313354, + "learning_rate": 0.00013302724581646833, + "loss": 2.6336, + "step": 11091 + }, + { + "epoch": 1.004914950963738, + "grad_norm": 0.8840779066085815, + "learning_rate": 0.00013302120461547757, + "loss": 2.7627, + "step": 11092 + }, + { + "epoch": 1.0050055491381849, + "grad_norm": 0.8392292261123657, + "learning_rate": 0.0001330151634144868, + "loss": 2.6591, + "step": 11093 + }, + { + "epoch": 1.0050961473126316, + "grad_norm": 0.9166077375411987, + "learning_rate": 0.00013300912221349604, + "loss": 2.8337, + "step": 11094 + }, + { + "epoch": 1.0051867454870784, + "grad_norm": 0.8141034841537476, + "learning_rate": 0.0001330030810125053, + "loss": 2.7912, + "step": 11095 + }, + { + "epoch": 1.0052773436615252, + "grad_norm": 0.8379542231559753, + "learning_rate": 0.00013299703981151453, + "loss": 2.5973, + "step": 11096 + }, + { + "epoch": 1.005367941835972, + "grad_norm": 0.8205088973045349, + "learning_rate": 0.00013299099861052377, + "loss": 2.7387, + "step": 11097 + }, + { + "epoch": 1.0054585400104188, + "grad_norm": 0.8988860845565796, + "learning_rate": 0.00013298495740953303, + "loss": 2.6579, + "step": 11098 + }, + { + "epoch": 1.0055491381848656, + "grad_norm": 0.8623374700546265, + "learning_rate": 0.00013297891620854226, + "loss": 2.6554, + "step": 11099 + }, + { + "epoch": 1.0056397363593124, + "grad_norm": 0.9296642541885376, + "learning_rate": 0.00013297287500755152, + "loss": 2.6542, + "step": 11100 + }, + { + "epoch": 1.0057303345337592, + "grad_norm": 0.8319603204727173, + "learning_rate": 0.00013296683380656073, + "loss": 2.8251, + "step": 11101 + }, + { + "epoch": 1.005820932708206, + "grad_norm": 0.9204514622688293, + "learning_rate": 0.00013296079260557, + "loss": 2.7363, + "step": 11102 + }, + { + "epoch": 1.0059115308826527, + "grad_norm": 0.8488034009933472, + "learning_rate": 0.00013295475140457925, + "loss": 2.6648, + "step": 11103 + }, + { + "epoch": 1.0060021290570995, + "grad_norm": 0.8543798327445984, + "learning_rate": 0.00013294871020358849, + "loss": 2.6581, + "step": 11104 + }, + { + "epoch": 1.0060927272315463, + "grad_norm": 0.8438416123390198, + "learning_rate": 0.00013294266900259772, + "loss": 2.5471, + "step": 11105 + }, + { + "epoch": 1.006183325405993, + "grad_norm": 0.9284725189208984, + "learning_rate": 0.00013293662780160695, + "loss": 2.8073, + "step": 11106 + }, + { + "epoch": 1.0062739235804399, + "grad_norm": 0.8645551204681396, + "learning_rate": 0.00013293058660061622, + "loss": 2.7416, + "step": 11107 + }, + { + "epoch": 1.0063645217548867, + "grad_norm": 0.9358887076377869, + "learning_rate": 0.00013292454539962545, + "loss": 2.7727, + "step": 11108 + }, + { + "epoch": 1.0064551199293335, + "grad_norm": 0.8171511888504028, + "learning_rate": 0.00013291850419863468, + "loss": 2.004, + "step": 11109 + }, + { + "epoch": 1.0065457181037802, + "grad_norm": 0.7134958505630493, + "learning_rate": 0.00013291246299764392, + "loss": 1.9777, + "step": 11110 + }, + { + "epoch": 1.006636316278227, + "grad_norm": 0.9634122848510742, + "learning_rate": 0.00013290642179665318, + "loss": 2.3999, + "step": 11111 + }, + { + "epoch": 1.0067269144526738, + "grad_norm": 0.9075639843940735, + "learning_rate": 0.00013290038059566244, + "loss": 2.7056, + "step": 11112 + }, + { + "epoch": 1.0068175126271206, + "grad_norm": 0.8722299337387085, + "learning_rate": 0.00013289433939467167, + "loss": 2.75, + "step": 11113 + }, + { + "epoch": 1.0069081108015674, + "grad_norm": 0.808335542678833, + "learning_rate": 0.0001328882981936809, + "loss": 2.7972, + "step": 11114 + }, + { + "epoch": 1.0069987089760142, + "grad_norm": 0.8054851293563843, + "learning_rate": 0.00013288225699269014, + "loss": 2.5979, + "step": 11115 + }, + { + "epoch": 1.007089307150461, + "grad_norm": 0.8883678317070007, + "learning_rate": 0.0001328762157916994, + "loss": 2.7841, + "step": 11116 + }, + { + "epoch": 1.0071799053249078, + "grad_norm": 0.8139682412147522, + "learning_rate": 0.00013287017459070864, + "loss": 2.8191, + "step": 11117 + }, + { + "epoch": 1.0072705034993545, + "grad_norm": 0.7641851305961609, + "learning_rate": 0.00013286413338971787, + "loss": 2.5314, + "step": 11118 + }, + { + "epoch": 1.0073611016738013, + "grad_norm": 0.9438263773918152, + "learning_rate": 0.00013285809218872713, + "loss": 2.6784, + "step": 11119 + }, + { + "epoch": 1.0074516998482481, + "grad_norm": 0.8453559279441833, + "learning_rate": 0.00013285205098773637, + "loss": 2.6015, + "step": 11120 + }, + { + "epoch": 1.007542298022695, + "grad_norm": 0.8783933520317078, + "learning_rate": 0.00013284600978674563, + "loss": 3.0008, + "step": 11121 + }, + { + "epoch": 1.0076328961971417, + "grad_norm": 0.7980436086654663, + "learning_rate": 0.00013283996858575486, + "loss": 2.528, + "step": 11122 + }, + { + "epoch": 1.0077234943715885, + "grad_norm": 1.0662646293640137, + "learning_rate": 0.0001328339273847641, + "loss": 2.2146, + "step": 11123 + }, + { + "epoch": 1.0078140925460353, + "grad_norm": 0.8764304518699646, + "learning_rate": 0.00013282788618377333, + "loss": 2.4018, + "step": 11124 + }, + { + "epoch": 1.007904690720482, + "grad_norm": 0.864826500415802, + "learning_rate": 0.0001328218449827826, + "loss": 2.7168, + "step": 11125 + }, + { + "epoch": 1.0079952888949288, + "grad_norm": 0.7988182902336121, + "learning_rate": 0.00013281580378179182, + "loss": 2.1063, + "step": 11126 + }, + { + "epoch": 1.0080858870693756, + "grad_norm": 0.853724479675293, + "learning_rate": 0.00013280976258080106, + "loss": 2.7494, + "step": 11127 + }, + { + "epoch": 1.0081764852438224, + "grad_norm": 0.7691208720207214, + "learning_rate": 0.00013280372137981032, + "loss": 2.502, + "step": 11128 + }, + { + "epoch": 1.0082670834182692, + "grad_norm": 0.8020975589752197, + "learning_rate": 0.00013279768017881955, + "loss": 2.5131, + "step": 11129 + }, + { + "epoch": 1.008357681592716, + "grad_norm": 0.8641566634178162, + "learning_rate": 0.00013279163897782882, + "loss": 2.6679, + "step": 11130 + }, + { + "epoch": 1.0084482797671628, + "grad_norm": 0.8664197325706482, + "learning_rate": 0.00013278559777683802, + "loss": 2.7823, + "step": 11131 + }, + { + "epoch": 1.0085388779416096, + "grad_norm": 0.8038976788520813, + "learning_rate": 0.00013277955657584728, + "loss": 2.6424, + "step": 11132 + }, + { + "epoch": 1.0086294761160564, + "grad_norm": 0.9117175340652466, + "learning_rate": 0.00013277351537485654, + "loss": 2.744, + "step": 11133 + }, + { + "epoch": 1.0087200742905031, + "grad_norm": 0.8401806354522705, + "learning_rate": 0.00013276747417386578, + "loss": 2.7818, + "step": 11134 + }, + { + "epoch": 1.00881067246495, + "grad_norm": 0.842602014541626, + "learning_rate": 0.000132761432972875, + "loss": 2.7249, + "step": 11135 + }, + { + "epoch": 1.0089012706393967, + "grad_norm": 0.8010328412055969, + "learning_rate": 0.00013275539177188425, + "loss": 2.5656, + "step": 11136 + }, + { + "epoch": 1.0089918688138435, + "grad_norm": 0.8012117743492126, + "learning_rate": 0.0001327493505708935, + "loss": 2.7897, + "step": 11137 + }, + { + "epoch": 1.0090824669882903, + "grad_norm": 0.8182348012924194, + "learning_rate": 0.00013274330936990274, + "loss": 2.5892, + "step": 11138 + }, + { + "epoch": 1.009173065162737, + "grad_norm": 0.845319390296936, + "learning_rate": 0.00013273726816891198, + "loss": 2.3693, + "step": 11139 + }, + { + "epoch": 1.0092636633371836, + "grad_norm": 0.8690118789672852, + "learning_rate": 0.0001327312269679212, + "loss": 2.814, + "step": 11140 + }, + { + "epoch": 1.0093542615116304, + "grad_norm": 0.8291383981704712, + "learning_rate": 0.00013272518576693047, + "loss": 2.8071, + "step": 11141 + }, + { + "epoch": 1.0094448596860772, + "grad_norm": 0.8373326659202576, + "learning_rate": 0.00013271914456593973, + "loss": 2.6079, + "step": 11142 + }, + { + "epoch": 1.009535457860524, + "grad_norm": 0.889747679233551, + "learning_rate": 0.00013271310336494897, + "loss": 2.6693, + "step": 11143 + }, + { + "epoch": 1.0096260560349708, + "grad_norm": 0.8240498304367065, + "learning_rate": 0.0001327070621639582, + "loss": 2.5889, + "step": 11144 + }, + { + "epoch": 1.0097166542094176, + "grad_norm": 0.8768743276596069, + "learning_rate": 0.00013270102096296743, + "loss": 2.7423, + "step": 11145 + }, + { + "epoch": 1.0098072523838644, + "grad_norm": 0.8680695295333862, + "learning_rate": 0.0001326949797619767, + "loss": 2.6443, + "step": 11146 + }, + { + "epoch": 1.0098978505583112, + "grad_norm": 0.8310193419456482, + "learning_rate": 0.00013268893856098593, + "loss": 2.5793, + "step": 11147 + }, + { + "epoch": 1.009988448732758, + "grad_norm": 0.9236509799957275, + "learning_rate": 0.00013268289735999516, + "loss": 2.8863, + "step": 11148 + }, + { + "epoch": 1.0100790469072047, + "grad_norm": 0.7743667364120483, + "learning_rate": 0.00013267685615900442, + "loss": 2.5929, + "step": 11149 + }, + { + "epoch": 1.0101696450816515, + "grad_norm": 0.8399999737739563, + "learning_rate": 0.00013267081495801366, + "loss": 2.7074, + "step": 11150 + }, + { + "epoch": 1.0102602432560983, + "grad_norm": 0.7376807928085327, + "learning_rate": 0.00013266477375702292, + "loss": 1.8494, + "step": 11151 + }, + { + "epoch": 1.010350841430545, + "grad_norm": 0.8513144850730896, + "learning_rate": 0.00013265873255603213, + "loss": 2.5329, + "step": 11152 + }, + { + "epoch": 1.0104414396049919, + "grad_norm": 0.8611559867858887, + "learning_rate": 0.0001326526913550414, + "loss": 2.8056, + "step": 11153 + }, + { + "epoch": 1.0105320377794387, + "grad_norm": 0.8838302493095398, + "learning_rate": 0.00013264665015405062, + "loss": 2.6053, + "step": 11154 + }, + { + "epoch": 1.0106226359538855, + "grad_norm": 0.8108101487159729, + "learning_rate": 0.00013264060895305988, + "loss": 2.5579, + "step": 11155 + }, + { + "epoch": 1.0107132341283322, + "grad_norm": 0.8812084197998047, + "learning_rate": 0.00013263456775206912, + "loss": 2.7076, + "step": 11156 + }, + { + "epoch": 1.010803832302779, + "grad_norm": 0.8338202834129333, + "learning_rate": 0.00013262852655107835, + "loss": 2.5641, + "step": 11157 + }, + { + "epoch": 1.0108944304772258, + "grad_norm": 0.8764447569847107, + "learning_rate": 0.0001326224853500876, + "loss": 2.8527, + "step": 11158 + }, + { + "epoch": 1.0109850286516726, + "grad_norm": 0.8268841505050659, + "learning_rate": 0.00013261644414909685, + "loss": 2.7585, + "step": 11159 + }, + { + "epoch": 1.0110756268261194, + "grad_norm": 0.7907134890556335, + "learning_rate": 0.00013261040294810608, + "loss": 2.5376, + "step": 11160 + }, + { + "epoch": 1.0111662250005662, + "grad_norm": 0.7601253390312195, + "learning_rate": 0.00013260436174711531, + "loss": 1.8961, + "step": 11161 + }, + { + "epoch": 1.011256823175013, + "grad_norm": 0.8191161751747131, + "learning_rate": 0.00013259832054612458, + "loss": 2.5988, + "step": 11162 + }, + { + "epoch": 1.0113474213494598, + "grad_norm": 0.8304821252822876, + "learning_rate": 0.00013259227934513384, + "loss": 2.8304, + "step": 11163 + }, + { + "epoch": 1.0114380195239066, + "grad_norm": 0.8377571702003479, + "learning_rate": 0.00013258623814414307, + "loss": 2.7908, + "step": 11164 + }, + { + "epoch": 1.0115286176983533, + "grad_norm": 0.8326588869094849, + "learning_rate": 0.0001325801969431523, + "loss": 2.8804, + "step": 11165 + }, + { + "epoch": 1.0116192158728001, + "grad_norm": 0.8489895462989807, + "learning_rate": 0.00013257415574216154, + "loss": 2.9014, + "step": 11166 + }, + { + "epoch": 1.011709814047247, + "grad_norm": 0.9541758894920349, + "learning_rate": 0.0001325681145411708, + "loss": 2.7259, + "step": 11167 + }, + { + "epoch": 1.0118004122216937, + "grad_norm": 0.8032077550888062, + "learning_rate": 0.00013256207334018003, + "loss": 2.6899, + "step": 11168 + }, + { + "epoch": 1.0118910103961405, + "grad_norm": 0.8566052317619324, + "learning_rate": 0.00013255603213918927, + "loss": 2.7261, + "step": 11169 + }, + { + "epoch": 1.0119816085705873, + "grad_norm": 0.8566197156906128, + "learning_rate": 0.0001325499909381985, + "loss": 2.6599, + "step": 11170 + }, + { + "epoch": 1.012072206745034, + "grad_norm": 0.8731631636619568, + "learning_rate": 0.00013254394973720776, + "loss": 2.4608, + "step": 11171 + }, + { + "epoch": 1.0121628049194809, + "grad_norm": 0.7978683710098267, + "learning_rate": 0.00013253790853621703, + "loss": 2.1426, + "step": 11172 + }, + { + "epoch": 1.0122534030939276, + "grad_norm": 0.8358251452445984, + "learning_rate": 0.00013253186733522623, + "loss": 2.7395, + "step": 11173 + }, + { + "epoch": 1.0123440012683744, + "grad_norm": 0.8976254463195801, + "learning_rate": 0.0001325258261342355, + "loss": 2.7828, + "step": 11174 + }, + { + "epoch": 1.0124345994428212, + "grad_norm": 0.7692650556564331, + "learning_rate": 0.00013251978493324473, + "loss": 2.092, + "step": 11175 + }, + { + "epoch": 1.012525197617268, + "grad_norm": 0.9184789061546326, + "learning_rate": 0.000132513743732254, + "loss": 2.7947, + "step": 11176 + }, + { + "epoch": 1.0126157957917148, + "grad_norm": 0.7438969016075134, + "learning_rate": 0.00013250770253126322, + "loss": 2.0575, + "step": 11177 + }, + { + "epoch": 1.0127063939661616, + "grad_norm": 0.932817280292511, + "learning_rate": 0.00013250166133027246, + "loss": 2.7213, + "step": 11178 + }, + { + "epoch": 1.0127969921406084, + "grad_norm": 0.7374956607818604, + "learning_rate": 0.00013249562012928172, + "loss": 2.2049, + "step": 11179 + }, + { + "epoch": 1.0128875903150552, + "grad_norm": 0.8131324052810669, + "learning_rate": 0.00013248957892829095, + "loss": 2.6152, + "step": 11180 + }, + { + "epoch": 1.012978188489502, + "grad_norm": 0.8354853987693787, + "learning_rate": 0.0001324835377273002, + "loss": 2.8084, + "step": 11181 + }, + { + "epoch": 1.0130687866639487, + "grad_norm": 0.863311767578125, + "learning_rate": 0.00013247749652630942, + "loss": 2.5965, + "step": 11182 + }, + { + "epoch": 1.0131593848383955, + "grad_norm": 0.9661327600479126, + "learning_rate": 0.00013247145532531868, + "loss": 2.8586, + "step": 11183 + }, + { + "epoch": 1.0132499830128423, + "grad_norm": 0.8362953662872314, + "learning_rate": 0.00013246541412432792, + "loss": 1.9862, + "step": 11184 + }, + { + "epoch": 1.013340581187289, + "grad_norm": 0.8429222106933594, + "learning_rate": 0.00013245937292333718, + "loss": 2.7173, + "step": 11185 + }, + { + "epoch": 1.0134311793617359, + "grad_norm": 0.8131267428398132, + "learning_rate": 0.0001324533317223464, + "loss": 2.3885, + "step": 11186 + }, + { + "epoch": 1.0135217775361827, + "grad_norm": 0.8402718305587769, + "learning_rate": 0.00013244729052135564, + "loss": 2.7252, + "step": 11187 + }, + { + "epoch": 1.0136123757106295, + "grad_norm": 0.6928055286407471, + "learning_rate": 0.0001324412493203649, + "loss": 2.131, + "step": 11188 + }, + { + "epoch": 1.0137029738850762, + "grad_norm": 0.8516420722007751, + "learning_rate": 0.00013243520811937414, + "loss": 2.5136, + "step": 11189 + }, + { + "epoch": 1.013793572059523, + "grad_norm": 0.776531994342804, + "learning_rate": 0.00013242916691838337, + "loss": 1.3041, + "step": 11190 + }, + { + "epoch": 1.0138841702339698, + "grad_norm": 0.8662886619567871, + "learning_rate": 0.0001324231257173926, + "loss": 2.7656, + "step": 11191 + }, + { + "epoch": 1.0139747684084166, + "grad_norm": 0.876906156539917, + "learning_rate": 0.00013241708451640187, + "loss": 2.6119, + "step": 11192 + }, + { + "epoch": 1.0140653665828634, + "grad_norm": 0.817001223564148, + "learning_rate": 0.00013241104331541113, + "loss": 2.6919, + "step": 11193 + }, + { + "epoch": 1.0141559647573102, + "grad_norm": 0.8281209468841553, + "learning_rate": 0.00013240500211442036, + "loss": 2.7315, + "step": 11194 + }, + { + "epoch": 1.014246562931757, + "grad_norm": 0.8671538829803467, + "learning_rate": 0.0001323989609134296, + "loss": 2.5527, + "step": 11195 + }, + { + "epoch": 1.0143371611062038, + "grad_norm": 0.8357703685760498, + "learning_rate": 0.00013239291971243883, + "loss": 2.77, + "step": 11196 + }, + { + "epoch": 1.0144277592806505, + "grad_norm": 0.8301588892936707, + "learning_rate": 0.0001323868785114481, + "loss": 2.8979, + "step": 11197 + }, + { + "epoch": 1.0145183574550973, + "grad_norm": 0.8553801774978638, + "learning_rate": 0.00013238083731045733, + "loss": 2.6202, + "step": 11198 + }, + { + "epoch": 1.0146089556295441, + "grad_norm": 0.8092402815818787, + "learning_rate": 0.00013237479610946656, + "loss": 2.8445, + "step": 11199 + }, + { + "epoch": 1.014699553803991, + "grad_norm": 0.8197951912879944, + "learning_rate": 0.0001323687549084758, + "loss": 2.5998, + "step": 11200 + }, + { + "epoch": 1.0147901519784377, + "grad_norm": 0.7120721936225891, + "learning_rate": 0.00013236271370748506, + "loss": 2.199, + "step": 11201 + }, + { + "epoch": 1.0148807501528845, + "grad_norm": 0.8998697996139526, + "learning_rate": 0.00013235667250649432, + "loss": 2.5817, + "step": 11202 + }, + { + "epoch": 1.0149713483273313, + "grad_norm": 0.8351618647575378, + "learning_rate": 0.00013235063130550352, + "loss": 2.8636, + "step": 11203 + }, + { + "epoch": 1.015061946501778, + "grad_norm": 0.843907356262207, + "learning_rate": 0.00013234459010451279, + "loss": 2.4978, + "step": 11204 + }, + { + "epoch": 1.0151525446762248, + "grad_norm": 0.828815758228302, + "learning_rate": 0.00013233854890352202, + "loss": 2.5693, + "step": 11205 + }, + { + "epoch": 1.0152431428506716, + "grad_norm": 0.9104110598564148, + "learning_rate": 0.00013233250770253128, + "loss": 2.879, + "step": 11206 + }, + { + "epoch": 1.0153337410251184, + "grad_norm": 0.8296048045158386, + "learning_rate": 0.00013232646650154052, + "loss": 2.8214, + "step": 11207 + }, + { + "epoch": 1.0154243391995652, + "grad_norm": 0.9200468063354492, + "learning_rate": 0.00013232042530054975, + "loss": 2.587, + "step": 11208 + }, + { + "epoch": 1.015514937374012, + "grad_norm": 0.8021771311759949, + "learning_rate": 0.000132314384099559, + "loss": 2.5749, + "step": 11209 + }, + { + "epoch": 1.0156055355484588, + "grad_norm": 0.8173088431358337, + "learning_rate": 0.00013230834289856824, + "loss": 2.7494, + "step": 11210 + }, + { + "epoch": 1.0156961337229056, + "grad_norm": 0.7068215012550354, + "learning_rate": 0.00013230230169757748, + "loss": 1.8832, + "step": 11211 + }, + { + "epoch": 1.0157867318973524, + "grad_norm": 0.8418647646903992, + "learning_rate": 0.0001322962604965867, + "loss": 2.0573, + "step": 11212 + }, + { + "epoch": 1.0158773300717991, + "grad_norm": 0.8804832696914673, + "learning_rate": 0.00013229021929559597, + "loss": 2.7347, + "step": 11213 + }, + { + "epoch": 1.015967928246246, + "grad_norm": 0.8844782114028931, + "learning_rate": 0.0001322841780946052, + "loss": 2.5603, + "step": 11214 + }, + { + "epoch": 1.0160585264206927, + "grad_norm": 0.9078071713447571, + "learning_rate": 0.00013227813689361447, + "loss": 2.5472, + "step": 11215 + }, + { + "epoch": 1.0161491245951395, + "grad_norm": 1.0537937879562378, + "learning_rate": 0.0001322720956926237, + "loss": 2.5221, + "step": 11216 + }, + { + "epoch": 1.0162397227695863, + "grad_norm": 0.8947159647941589, + "learning_rate": 0.00013226605449163294, + "loss": 2.3476, + "step": 11217 + }, + { + "epoch": 1.016330320944033, + "grad_norm": 0.9365119934082031, + "learning_rate": 0.0001322600132906422, + "loss": 2.9082, + "step": 11218 + }, + { + "epoch": 1.0164209191184799, + "grad_norm": 0.8749282360076904, + "learning_rate": 0.00013225397208965143, + "loss": 2.6217, + "step": 11219 + }, + { + "epoch": 1.0165115172929267, + "grad_norm": 0.8578024506568909, + "learning_rate": 0.00013224793088866067, + "loss": 2.5099, + "step": 11220 + }, + { + "epoch": 1.0166021154673732, + "grad_norm": 0.7350583076477051, + "learning_rate": 0.0001322418896876699, + "loss": 2.0962, + "step": 11221 + }, + { + "epoch": 1.01669271364182, + "grad_norm": 0.8498398065567017, + "learning_rate": 0.00013223584848667916, + "loss": 2.6353, + "step": 11222 + }, + { + "epoch": 1.0167833118162668, + "grad_norm": 0.7442131042480469, + "learning_rate": 0.00013222980728568842, + "loss": 2.0359, + "step": 11223 + }, + { + "epoch": 1.0168739099907136, + "grad_norm": 0.8401215672492981, + "learning_rate": 0.00013222376608469763, + "loss": 2.4866, + "step": 11224 + }, + { + "epoch": 1.0169645081651604, + "grad_norm": 0.8230358958244324, + "learning_rate": 0.0001322177248837069, + "loss": 2.774, + "step": 11225 + }, + { + "epoch": 1.0170551063396072, + "grad_norm": 0.7330735325813293, + "learning_rate": 0.00013221168368271612, + "loss": 2.0376, + "step": 11226 + }, + { + "epoch": 1.017145704514054, + "grad_norm": 0.8988582491874695, + "learning_rate": 0.00013220564248172539, + "loss": 2.5867, + "step": 11227 + }, + { + "epoch": 1.0172363026885007, + "grad_norm": 0.8419578075408936, + "learning_rate": 0.00013219960128073462, + "loss": 2.8081, + "step": 11228 + }, + { + "epoch": 1.0173269008629475, + "grad_norm": 0.8631223440170288, + "learning_rate": 0.00013219356007974385, + "loss": 2.6252, + "step": 11229 + }, + { + "epoch": 1.0174174990373943, + "grad_norm": 0.8537087440490723, + "learning_rate": 0.0001321875188787531, + "loss": 2.4901, + "step": 11230 + }, + { + "epoch": 1.017508097211841, + "grad_norm": 0.9475165009498596, + "learning_rate": 0.00013218147767776235, + "loss": 2.6518, + "step": 11231 + }, + { + "epoch": 1.0175986953862879, + "grad_norm": 0.8273070454597473, + "learning_rate": 0.0001321754364767716, + "loss": 2.7752, + "step": 11232 + }, + { + "epoch": 1.0176892935607347, + "grad_norm": 0.7373884916305542, + "learning_rate": 0.00013216939527578082, + "loss": 2.0143, + "step": 11233 + }, + { + "epoch": 1.0177798917351815, + "grad_norm": 0.8587778210639954, + "learning_rate": 0.00013216335407479008, + "loss": 2.9034, + "step": 11234 + }, + { + "epoch": 1.0178704899096283, + "grad_norm": 0.8481143712997437, + "learning_rate": 0.0001321573128737993, + "loss": 2.7274, + "step": 11235 + }, + { + "epoch": 1.017961088084075, + "grad_norm": 0.8584009408950806, + "learning_rate": 0.00013215127167280857, + "loss": 2.6771, + "step": 11236 + }, + { + "epoch": 1.0180516862585218, + "grad_norm": 0.8461078405380249, + "learning_rate": 0.00013214523047181778, + "loss": 2.732, + "step": 11237 + }, + { + "epoch": 1.0181422844329686, + "grad_norm": 0.8662376403808594, + "learning_rate": 0.00013213918927082704, + "loss": 2.7583, + "step": 11238 + }, + { + "epoch": 1.0182328826074154, + "grad_norm": 0.8722410798072815, + "learning_rate": 0.0001321331480698363, + "loss": 2.6319, + "step": 11239 + }, + { + "epoch": 1.0183234807818622, + "grad_norm": 0.8309972286224365, + "learning_rate": 0.00013212710686884554, + "loss": 2.9428, + "step": 11240 + }, + { + "epoch": 1.018414078956309, + "grad_norm": 0.8618354201316833, + "learning_rate": 0.00013212106566785477, + "loss": 2.5855, + "step": 11241 + }, + { + "epoch": 1.0185046771307558, + "grad_norm": 0.7852728962898254, + "learning_rate": 0.000132115024466864, + "loss": 2.6712, + "step": 11242 + }, + { + "epoch": 1.0185952753052026, + "grad_norm": 0.9430863261222839, + "learning_rate": 0.00013210898326587327, + "loss": 2.8215, + "step": 11243 + }, + { + "epoch": 1.0186858734796493, + "grad_norm": 0.851552426815033, + "learning_rate": 0.0001321029420648825, + "loss": 2.5932, + "step": 11244 + }, + { + "epoch": 1.0187764716540961, + "grad_norm": 0.8635781407356262, + "learning_rate": 0.00013209690086389176, + "loss": 2.6955, + "step": 11245 + }, + { + "epoch": 1.018867069828543, + "grad_norm": 0.8592686057090759, + "learning_rate": 0.000132090859662901, + "loss": 2.9685, + "step": 11246 + }, + { + "epoch": 1.0189576680029897, + "grad_norm": 0.8267068266868591, + "learning_rate": 0.00013208481846191023, + "loss": 2.6138, + "step": 11247 + }, + { + "epoch": 1.0190482661774365, + "grad_norm": 0.8556285500526428, + "learning_rate": 0.0001320787772609195, + "loss": 2.9193, + "step": 11248 + }, + { + "epoch": 1.0191388643518833, + "grad_norm": 0.8744221925735474, + "learning_rate": 0.00013207273605992872, + "loss": 2.7358, + "step": 11249 + }, + { + "epoch": 1.01922946252633, + "grad_norm": 0.8531243205070496, + "learning_rate": 0.00013206669485893796, + "loss": 2.7031, + "step": 11250 + }, + { + "epoch": 1.0193200607007769, + "grad_norm": 0.800440788269043, + "learning_rate": 0.0001320606536579472, + "loss": 2.6044, + "step": 11251 + }, + { + "epoch": 1.0194106588752236, + "grad_norm": 0.854494035243988, + "learning_rate": 0.00013205461245695645, + "loss": 2.84, + "step": 11252 + }, + { + "epoch": 1.0195012570496704, + "grad_norm": 0.9366505146026611, + "learning_rate": 0.00013204857125596572, + "loss": 2.834, + "step": 11253 + }, + { + "epoch": 1.0195918552241172, + "grad_norm": 0.8302942514419556, + "learning_rate": 0.00013204253005497492, + "loss": 2.781, + "step": 11254 + }, + { + "epoch": 1.019682453398564, + "grad_norm": 0.7317270040512085, + "learning_rate": 0.00013203648885398418, + "loss": 2.1397, + "step": 11255 + }, + { + "epoch": 1.0197730515730108, + "grad_norm": 0.8212071657180786, + "learning_rate": 0.00013203044765299342, + "loss": 2.6311, + "step": 11256 + }, + { + "epoch": 1.0198636497474576, + "grad_norm": 0.8037013411521912, + "learning_rate": 0.00013202440645200268, + "loss": 2.6117, + "step": 11257 + }, + { + "epoch": 1.0199542479219044, + "grad_norm": 0.841126561164856, + "learning_rate": 0.0001320183652510119, + "loss": 2.7066, + "step": 11258 + }, + { + "epoch": 1.0200448460963512, + "grad_norm": 0.8716331124305725, + "learning_rate": 0.00013201232405002115, + "loss": 2.6374, + "step": 11259 + }, + { + "epoch": 1.020135444270798, + "grad_norm": 0.8546915054321289, + "learning_rate": 0.00013200628284903038, + "loss": 2.7275, + "step": 11260 + }, + { + "epoch": 1.0202260424452447, + "grad_norm": 0.8954454660415649, + "learning_rate": 0.00013200024164803964, + "loss": 2.5554, + "step": 11261 + }, + { + "epoch": 1.0203166406196915, + "grad_norm": 0.8529139161109924, + "learning_rate": 0.00013199420044704888, + "loss": 2.8603, + "step": 11262 + }, + { + "epoch": 1.0204072387941383, + "grad_norm": 0.7943106293678284, + "learning_rate": 0.0001319881592460581, + "loss": 2.5739, + "step": 11263 + }, + { + "epoch": 1.020497836968585, + "grad_norm": 0.9251561164855957, + "learning_rate": 0.00013198211804506737, + "loss": 2.5269, + "step": 11264 + }, + { + "epoch": 1.0205884351430319, + "grad_norm": 0.9351811408996582, + "learning_rate": 0.0001319760768440766, + "loss": 2.8119, + "step": 11265 + }, + { + "epoch": 1.0206790333174787, + "grad_norm": 0.6174471378326416, + "learning_rate": 0.00013197003564308587, + "loss": 1.4453, + "step": 11266 + }, + { + "epoch": 1.0207696314919255, + "grad_norm": 0.7301484942436218, + "learning_rate": 0.00013196399444209507, + "loss": 1.844, + "step": 11267 + }, + { + "epoch": 1.0208602296663722, + "grad_norm": 0.8713781237602234, + "learning_rate": 0.00013195795324110433, + "loss": 2.7731, + "step": 11268 + }, + { + "epoch": 1.020950827840819, + "grad_norm": 0.8480002284049988, + "learning_rate": 0.0001319519120401136, + "loss": 2.6619, + "step": 11269 + }, + { + "epoch": 1.0210414260152658, + "grad_norm": 0.8748977184295654, + "learning_rate": 0.00013194587083912283, + "loss": 2.7501, + "step": 11270 + }, + { + "epoch": 1.0211320241897126, + "grad_norm": 0.7521603107452393, + "learning_rate": 0.00013193982963813206, + "loss": 2.0447, + "step": 11271 + }, + { + "epoch": 1.0212226223641594, + "grad_norm": 0.8781550526618958, + "learning_rate": 0.0001319337884371413, + "loss": 2.6509, + "step": 11272 + }, + { + "epoch": 1.0213132205386062, + "grad_norm": 0.6258354783058167, + "learning_rate": 0.00013192774723615056, + "loss": 1.315, + "step": 11273 + }, + { + "epoch": 1.021403818713053, + "grad_norm": 0.8593977689743042, + "learning_rate": 0.0001319217060351598, + "loss": 2.8169, + "step": 11274 + }, + { + "epoch": 1.0214944168874998, + "grad_norm": 0.8032324314117432, + "learning_rate": 0.00013191566483416903, + "loss": 2.7089, + "step": 11275 + }, + { + "epoch": 1.0215850150619465, + "grad_norm": 0.7823426127433777, + "learning_rate": 0.0001319096236331783, + "loss": 2.0324, + "step": 11276 + }, + { + "epoch": 1.0216756132363933, + "grad_norm": 0.8446296453475952, + "learning_rate": 0.00013190358243218752, + "loss": 2.7419, + "step": 11277 + }, + { + "epoch": 1.0217662114108401, + "grad_norm": 0.8735248446464539, + "learning_rate": 0.00013189754123119678, + "loss": 2.5389, + "step": 11278 + }, + { + "epoch": 1.021856809585287, + "grad_norm": 0.8692997694015503, + "learning_rate": 0.00013189150003020602, + "loss": 2.7955, + "step": 11279 + }, + { + "epoch": 1.0219474077597337, + "grad_norm": 0.8976773023605347, + "learning_rate": 0.00013188545882921525, + "loss": 2.7522, + "step": 11280 + }, + { + "epoch": 1.0220380059341805, + "grad_norm": 0.8437100052833557, + "learning_rate": 0.00013187941762822449, + "loss": 2.5674, + "step": 11281 + }, + { + "epoch": 1.0221286041086273, + "grad_norm": 0.8734058141708374, + "learning_rate": 0.00013187337642723375, + "loss": 2.7276, + "step": 11282 + }, + { + "epoch": 1.022219202283074, + "grad_norm": 0.86916583776474, + "learning_rate": 0.00013186733522624298, + "loss": 2.823, + "step": 11283 + }, + { + "epoch": 1.0223098004575208, + "grad_norm": 0.8623618483543396, + "learning_rate": 0.00013186129402525221, + "loss": 2.6036, + "step": 11284 + }, + { + "epoch": 1.0224003986319676, + "grad_norm": 0.8602996468544006, + "learning_rate": 0.00013185525282426148, + "loss": 2.5872, + "step": 11285 + }, + { + "epoch": 1.0224909968064144, + "grad_norm": 0.8557217717170715, + "learning_rate": 0.0001318492116232707, + "loss": 2.399, + "step": 11286 + }, + { + "epoch": 1.0225815949808612, + "grad_norm": 0.9576001167297363, + "learning_rate": 0.00013184317042227997, + "loss": 2.983, + "step": 11287 + }, + { + "epoch": 1.022672193155308, + "grad_norm": 0.8505628108978271, + "learning_rate": 0.00013183712922128918, + "loss": 2.5638, + "step": 11288 + }, + { + "epoch": 1.0227627913297548, + "grad_norm": 0.9112201929092407, + "learning_rate": 0.00013183108802029844, + "loss": 2.6887, + "step": 11289 + }, + { + "epoch": 1.0228533895042016, + "grad_norm": 0.8996481895446777, + "learning_rate": 0.00013182504681930767, + "loss": 2.8686, + "step": 11290 + }, + { + "epoch": 1.0229439876786484, + "grad_norm": 0.9604005813598633, + "learning_rate": 0.00013181900561831693, + "loss": 2.8818, + "step": 11291 + }, + { + "epoch": 1.0230345858530951, + "grad_norm": 0.8432559370994568, + "learning_rate": 0.00013181296441732617, + "loss": 2.3148, + "step": 11292 + }, + { + "epoch": 1.023125184027542, + "grad_norm": 0.8721452951431274, + "learning_rate": 0.0001318069232163354, + "loss": 2.6829, + "step": 11293 + }, + { + "epoch": 1.0232157822019887, + "grad_norm": 0.8898346424102783, + "learning_rate": 0.00013180088201534466, + "loss": 2.6305, + "step": 11294 + }, + { + "epoch": 1.0233063803764355, + "grad_norm": 0.9093887805938721, + "learning_rate": 0.0001317948408143539, + "loss": 2.5601, + "step": 11295 + }, + { + "epoch": 1.0233969785508823, + "grad_norm": 0.8715314269065857, + "learning_rate": 0.00013178879961336313, + "loss": 2.8454, + "step": 11296 + }, + { + "epoch": 1.023487576725329, + "grad_norm": 0.8318759799003601, + "learning_rate": 0.00013178275841237237, + "loss": 2.7391, + "step": 11297 + }, + { + "epoch": 1.0235781748997759, + "grad_norm": 0.8765479922294617, + "learning_rate": 0.00013177671721138163, + "loss": 2.7254, + "step": 11298 + }, + { + "epoch": 1.0236687730742227, + "grad_norm": 0.8716441988945007, + "learning_rate": 0.0001317706760103909, + "loss": 2.7669, + "step": 11299 + }, + { + "epoch": 1.0237593712486694, + "grad_norm": 0.9591617584228516, + "learning_rate": 0.00013176463480940012, + "loss": 2.398, + "step": 11300 + }, + { + "epoch": 1.0238499694231162, + "grad_norm": 0.9823881983757019, + "learning_rate": 0.00013175859360840936, + "loss": 2.9129, + "step": 11301 + }, + { + "epoch": 1.0239405675975628, + "grad_norm": 0.8648741245269775, + "learning_rate": 0.0001317525524074186, + "loss": 2.6559, + "step": 11302 + }, + { + "epoch": 1.0240311657720096, + "grad_norm": 0.9205857515335083, + "learning_rate": 0.00013174651120642785, + "loss": 2.5422, + "step": 11303 + }, + { + "epoch": 1.0241217639464564, + "grad_norm": 0.8468566536903381, + "learning_rate": 0.00013174047000543709, + "loss": 2.8006, + "step": 11304 + }, + { + "epoch": 1.0242123621209032, + "grad_norm": 0.8535242080688477, + "learning_rate": 0.00013173442880444632, + "loss": 2.536, + "step": 11305 + }, + { + "epoch": 1.02430296029535, + "grad_norm": 0.8155398368835449, + "learning_rate": 0.00013172838760345558, + "loss": 2.8381, + "step": 11306 + }, + { + "epoch": 1.0243935584697967, + "grad_norm": 0.9256640076637268, + "learning_rate": 0.00013172234640246481, + "loss": 2.7432, + "step": 11307 + }, + { + "epoch": 1.0244841566442435, + "grad_norm": 0.9024507999420166, + "learning_rate": 0.00013171630520147408, + "loss": 2.6625, + "step": 11308 + }, + { + "epoch": 1.0245747548186903, + "grad_norm": 0.8889920711517334, + "learning_rate": 0.0001317102640004833, + "loss": 2.8714, + "step": 11309 + }, + { + "epoch": 1.024665352993137, + "grad_norm": 0.8384226560592651, + "learning_rate": 0.00013170422279949254, + "loss": 2.255, + "step": 11310 + }, + { + "epoch": 1.024755951167584, + "grad_norm": 0.9444588422775269, + "learning_rate": 0.00013169818159850178, + "loss": 2.7935, + "step": 11311 + }, + { + "epoch": 1.0248465493420307, + "grad_norm": 0.8799281716346741, + "learning_rate": 0.00013169214039751104, + "loss": 2.8113, + "step": 11312 + }, + { + "epoch": 1.0249371475164775, + "grad_norm": 0.9903099536895752, + "learning_rate": 0.00013168609919652027, + "loss": 2.7604, + "step": 11313 + }, + { + "epoch": 1.0250277456909243, + "grad_norm": 0.9410387277603149, + "learning_rate": 0.0001316800579955295, + "loss": 2.6105, + "step": 11314 + }, + { + "epoch": 1.025118343865371, + "grad_norm": 0.84328693151474, + "learning_rate": 0.00013167401679453877, + "loss": 3.0689, + "step": 11315 + }, + { + "epoch": 1.0252089420398178, + "grad_norm": 0.8647865653038025, + "learning_rate": 0.000131667975593548, + "loss": 2.5428, + "step": 11316 + }, + { + "epoch": 1.0252995402142646, + "grad_norm": 0.8775978088378906, + "learning_rate": 0.00013166193439255726, + "loss": 2.6374, + "step": 11317 + }, + { + "epoch": 1.0253901383887114, + "grad_norm": 0.8681612610816956, + "learning_rate": 0.00013165589319156647, + "loss": 2.7788, + "step": 11318 + }, + { + "epoch": 1.0254807365631582, + "grad_norm": 0.8772042393684387, + "learning_rate": 0.00013164985199057573, + "loss": 2.5045, + "step": 11319 + }, + { + "epoch": 1.025571334737605, + "grad_norm": 0.8606233596801758, + "learning_rate": 0.00013164381078958497, + "loss": 2.786, + "step": 11320 + }, + { + "epoch": 1.0256619329120518, + "grad_norm": 0.8669321537017822, + "learning_rate": 0.00013163776958859423, + "loss": 2.4826, + "step": 11321 + }, + { + "epoch": 1.0257525310864986, + "grad_norm": 0.8052210807800293, + "learning_rate": 0.00013163172838760346, + "loss": 2.7257, + "step": 11322 + }, + { + "epoch": 1.0258431292609453, + "grad_norm": 0.8502984642982483, + "learning_rate": 0.0001316256871866127, + "loss": 2.6482, + "step": 11323 + }, + { + "epoch": 1.0259337274353921, + "grad_norm": 0.8527174592018127, + "learning_rate": 0.00013161964598562196, + "loss": 2.6727, + "step": 11324 + }, + { + "epoch": 1.026024325609839, + "grad_norm": 0.8103166818618774, + "learning_rate": 0.0001316136047846312, + "loss": 2.5995, + "step": 11325 + }, + { + "epoch": 1.0261149237842857, + "grad_norm": 0.813851535320282, + "learning_rate": 0.00013160756358364042, + "loss": 2.6223, + "step": 11326 + }, + { + "epoch": 1.0262055219587325, + "grad_norm": 0.721291720867157, + "learning_rate": 0.00013160152238264966, + "loss": 1.9026, + "step": 11327 + }, + { + "epoch": 1.0262961201331793, + "grad_norm": 0.8744772672653198, + "learning_rate": 0.00013159548118165892, + "loss": 2.5967, + "step": 11328 + }, + { + "epoch": 1.026386718307626, + "grad_norm": 0.8897041082382202, + "learning_rate": 0.00013158943998066818, + "loss": 2.5719, + "step": 11329 + }, + { + "epoch": 1.0264773164820729, + "grad_norm": 0.8942477107048035, + "learning_rate": 0.00013158339877967741, + "loss": 2.6767, + "step": 11330 + }, + { + "epoch": 1.0265679146565196, + "grad_norm": 0.8264179229736328, + "learning_rate": 0.00013157735757868665, + "loss": 2.6941, + "step": 11331 + }, + { + "epoch": 1.0266585128309664, + "grad_norm": 0.8606237769126892, + "learning_rate": 0.00013157131637769588, + "loss": 2.6817, + "step": 11332 + }, + { + "epoch": 1.0267491110054132, + "grad_norm": 0.8674498796463013, + "learning_rate": 0.00013156527517670514, + "loss": 2.4494, + "step": 11333 + }, + { + "epoch": 1.02683970917986, + "grad_norm": 0.9009765386581421, + "learning_rate": 0.00013155923397571438, + "loss": 2.9327, + "step": 11334 + }, + { + "epoch": 1.0269303073543068, + "grad_norm": 0.9035921692848206, + "learning_rate": 0.0001315531927747236, + "loss": 2.913, + "step": 11335 + }, + { + "epoch": 1.0270209055287536, + "grad_norm": 0.832223117351532, + "learning_rate": 0.00013154715157373287, + "loss": 2.7173, + "step": 11336 + }, + { + "epoch": 1.0271115037032004, + "grad_norm": 0.8985092043876648, + "learning_rate": 0.0001315411103727421, + "loss": 2.6443, + "step": 11337 + }, + { + "epoch": 1.0272021018776472, + "grad_norm": 0.8729944825172424, + "learning_rate": 0.00013153506917175137, + "loss": 2.6759, + "step": 11338 + }, + { + "epoch": 1.027292700052094, + "grad_norm": 0.8232083320617676, + "learning_rate": 0.00013152902797076058, + "loss": 2.6329, + "step": 11339 + }, + { + "epoch": 1.0273832982265407, + "grad_norm": 0.8698650002479553, + "learning_rate": 0.00013152298676976984, + "loss": 2.5346, + "step": 11340 + }, + { + "epoch": 1.0274738964009875, + "grad_norm": 0.9255873560905457, + "learning_rate": 0.00013151694556877907, + "loss": 2.6916, + "step": 11341 + }, + { + "epoch": 1.0275644945754343, + "grad_norm": 0.749904990196228, + "learning_rate": 0.00013151090436778833, + "loss": 1.923, + "step": 11342 + }, + { + "epoch": 1.027655092749881, + "grad_norm": 0.8408960700035095, + "learning_rate": 0.00013150486316679757, + "loss": 2.4345, + "step": 11343 + }, + { + "epoch": 1.0277456909243279, + "grad_norm": 0.8748396039009094, + "learning_rate": 0.0001314988219658068, + "loss": 2.4371, + "step": 11344 + }, + { + "epoch": 1.0278362890987747, + "grad_norm": 0.8143633604049683, + "learning_rate": 0.00013149278076481606, + "loss": 2.7022, + "step": 11345 + }, + { + "epoch": 1.0279268872732215, + "grad_norm": 0.8881786465644836, + "learning_rate": 0.0001314867395638253, + "loss": 2.4842, + "step": 11346 + }, + { + "epoch": 1.0280174854476682, + "grad_norm": 0.8918449282646179, + "learning_rate": 0.00013148069836283453, + "loss": 2.7856, + "step": 11347 + }, + { + "epoch": 1.028108083622115, + "grad_norm": 0.892601490020752, + "learning_rate": 0.00013147465716184376, + "loss": 3.0169, + "step": 11348 + }, + { + "epoch": 1.0281986817965618, + "grad_norm": 0.9578560590744019, + "learning_rate": 0.00013146861596085302, + "loss": 2.7926, + "step": 11349 + }, + { + "epoch": 1.0282892799710086, + "grad_norm": 0.88067227602005, + "learning_rate": 0.00013146257475986226, + "loss": 2.6768, + "step": 11350 + }, + { + "epoch": 1.0283798781454554, + "grad_norm": 0.850734531879425, + "learning_rate": 0.00013145653355887152, + "loss": 2.5966, + "step": 11351 + }, + { + "epoch": 1.0284704763199022, + "grad_norm": 0.8139296174049377, + "learning_rate": 0.00013145049235788075, + "loss": 2.692, + "step": 11352 + }, + { + "epoch": 1.028561074494349, + "grad_norm": 0.8358063101768494, + "learning_rate": 0.00013144445115689, + "loss": 2.5532, + "step": 11353 + }, + { + "epoch": 1.0286516726687958, + "grad_norm": 0.8375602960586548, + "learning_rate": 0.00013143840995589925, + "loss": 2.7, + "step": 11354 + }, + { + "epoch": 1.0287422708432425, + "grad_norm": 0.8822147250175476, + "learning_rate": 0.00013143236875490848, + "loss": 2.706, + "step": 11355 + }, + { + "epoch": 1.0288328690176893, + "grad_norm": 0.9315916299819946, + "learning_rate": 0.00013142632755391772, + "loss": 2.7793, + "step": 11356 + }, + { + "epoch": 1.0289234671921361, + "grad_norm": 0.9465680122375488, + "learning_rate": 0.00013142028635292695, + "loss": 2.8174, + "step": 11357 + }, + { + "epoch": 1.029014065366583, + "grad_norm": 0.9065900444984436, + "learning_rate": 0.0001314142451519362, + "loss": 2.7282, + "step": 11358 + }, + { + "epoch": 1.0291046635410297, + "grad_norm": 0.9016030430793762, + "learning_rate": 0.00013140820395094547, + "loss": 2.4984, + "step": 11359 + }, + { + "epoch": 1.0291952617154765, + "grad_norm": 0.8225070238113403, + "learning_rate": 0.00013140216274995468, + "loss": 2.7232, + "step": 11360 + }, + { + "epoch": 1.0292858598899233, + "grad_norm": 0.8372030258178711, + "learning_rate": 0.00013139612154896394, + "loss": 2.5837, + "step": 11361 + }, + { + "epoch": 1.02937645806437, + "grad_norm": 0.855431318283081, + "learning_rate": 0.00013139008034797318, + "loss": 2.6643, + "step": 11362 + }, + { + "epoch": 1.0294670562388168, + "grad_norm": 0.8493560552597046, + "learning_rate": 0.00013138403914698244, + "loss": 2.7318, + "step": 11363 + }, + { + "epoch": 1.0295576544132636, + "grad_norm": 0.8032834529876709, + "learning_rate": 0.00013137799794599167, + "loss": 2.5787, + "step": 11364 + }, + { + "epoch": 1.0296482525877104, + "grad_norm": 0.893166720867157, + "learning_rate": 0.0001313719567450009, + "loss": 2.9286, + "step": 11365 + }, + { + "epoch": 1.0297388507621572, + "grad_norm": 0.8769408464431763, + "learning_rate": 0.00013136591554401017, + "loss": 2.8046, + "step": 11366 + }, + { + "epoch": 1.029829448936604, + "grad_norm": 0.9251463413238525, + "learning_rate": 0.0001313598743430194, + "loss": 2.6228, + "step": 11367 + }, + { + "epoch": 1.0299200471110508, + "grad_norm": 0.7563031315803528, + "learning_rate": 0.00013135383314202866, + "loss": 1.9452, + "step": 11368 + }, + { + "epoch": 1.0300106452854976, + "grad_norm": 0.8371793627738953, + "learning_rate": 0.00013134779194103787, + "loss": 2.787, + "step": 11369 + }, + { + "epoch": 1.0301012434599444, + "grad_norm": 0.94198077917099, + "learning_rate": 0.00013134175074004713, + "loss": 2.8486, + "step": 11370 + }, + { + "epoch": 1.0301918416343911, + "grad_norm": 0.9124158620834351, + "learning_rate": 0.00013133570953905636, + "loss": 2.739, + "step": 11371 + }, + { + "epoch": 1.030282439808838, + "grad_norm": 0.9339147806167603, + "learning_rate": 0.00013132966833806562, + "loss": 2.5845, + "step": 11372 + }, + { + "epoch": 1.0303730379832847, + "grad_norm": 0.8431546688079834, + "learning_rate": 0.00013132362713707486, + "loss": 2.3775, + "step": 11373 + }, + { + "epoch": 1.0304636361577315, + "grad_norm": 0.8818604946136475, + "learning_rate": 0.0001313175859360841, + "loss": 2.7195, + "step": 11374 + }, + { + "epoch": 1.0305542343321783, + "grad_norm": 0.8893022537231445, + "learning_rate": 0.00013131154473509335, + "loss": 2.6784, + "step": 11375 + }, + { + "epoch": 1.030644832506625, + "grad_norm": 0.8955414891242981, + "learning_rate": 0.0001313055035341026, + "loss": 2.8624, + "step": 11376 + }, + { + "epoch": 1.0307354306810719, + "grad_norm": 0.8219074010848999, + "learning_rate": 0.00013129946233311182, + "loss": 2.495, + "step": 11377 + }, + { + "epoch": 1.0308260288555187, + "grad_norm": 0.9406517148017883, + "learning_rate": 0.00013129342113212106, + "loss": 2.9913, + "step": 11378 + }, + { + "epoch": 1.0309166270299654, + "grad_norm": 0.8743398785591125, + "learning_rate": 0.00013128737993113032, + "loss": 2.8689, + "step": 11379 + }, + { + "epoch": 1.0310072252044122, + "grad_norm": 1.168220043182373, + "learning_rate": 0.00013128133873013955, + "loss": 2.8127, + "step": 11380 + }, + { + "epoch": 1.031097823378859, + "grad_norm": 0.8718997240066528, + "learning_rate": 0.0001312752975291488, + "loss": 2.7598, + "step": 11381 + }, + { + "epoch": 1.0311884215533058, + "grad_norm": 0.9430450201034546, + "learning_rate": 0.00013126925632815805, + "loss": 2.9398, + "step": 11382 + }, + { + "epoch": 1.0312790197277524, + "grad_norm": 0.9079588055610657, + "learning_rate": 0.00013126321512716728, + "loss": 2.6687, + "step": 11383 + }, + { + "epoch": 1.0313696179021992, + "grad_norm": 0.8400775194168091, + "learning_rate": 0.00013125717392617654, + "loss": 2.4583, + "step": 11384 + }, + { + "epoch": 1.031460216076646, + "grad_norm": 0.7890886068344116, + "learning_rate": 0.00013125113272518578, + "loss": 2.7107, + "step": 11385 + }, + { + "epoch": 1.0315508142510927, + "grad_norm": 0.635364830493927, + "learning_rate": 0.000131245091524195, + "loss": 1.465, + "step": 11386 + }, + { + "epoch": 1.0316414124255395, + "grad_norm": 0.8205000758171082, + "learning_rate": 0.00013123905032320424, + "loss": 2.6347, + "step": 11387 + }, + { + "epoch": 1.0317320105999863, + "grad_norm": 1.050934076309204, + "learning_rate": 0.0001312330091222135, + "loss": 2.7216, + "step": 11388 + }, + { + "epoch": 1.031822608774433, + "grad_norm": 0.8331598043441772, + "learning_rate": 0.00013122696792122277, + "loss": 2.5927, + "step": 11389 + }, + { + "epoch": 1.03191320694888, + "grad_norm": 0.9218451380729675, + "learning_rate": 0.00013122092672023197, + "loss": 2.912, + "step": 11390 + }, + { + "epoch": 1.0320038051233267, + "grad_norm": 0.8475827574729919, + "learning_rate": 0.00013121488551924123, + "loss": 2.7842, + "step": 11391 + }, + { + "epoch": 1.0320944032977735, + "grad_norm": 0.7640538215637207, + "learning_rate": 0.00013120884431825047, + "loss": 1.9068, + "step": 11392 + }, + { + "epoch": 1.0321850014722203, + "grad_norm": 0.7646887898445129, + "learning_rate": 0.00013120280311725973, + "loss": 2.1797, + "step": 11393 + }, + { + "epoch": 1.032275599646667, + "grad_norm": 0.8195066452026367, + "learning_rate": 0.00013119676191626896, + "loss": 2.6909, + "step": 11394 + }, + { + "epoch": 1.0323661978211138, + "grad_norm": 0.8052653670310974, + "learning_rate": 0.0001311907207152782, + "loss": 2.578, + "step": 11395 + }, + { + "epoch": 1.0324567959955606, + "grad_norm": 0.8464866876602173, + "learning_rate": 0.00013118467951428746, + "loss": 2.7261, + "step": 11396 + }, + { + "epoch": 1.0325473941700074, + "grad_norm": 0.8069810271263123, + "learning_rate": 0.0001311786383132967, + "loss": 2.5296, + "step": 11397 + }, + { + "epoch": 1.0326379923444542, + "grad_norm": 0.7436041235923767, + "learning_rate": 0.00013117259711230593, + "loss": 1.918, + "step": 11398 + }, + { + "epoch": 1.032728590518901, + "grad_norm": 0.8147885799407959, + "learning_rate": 0.00013116655591131516, + "loss": 2.6319, + "step": 11399 + }, + { + "epoch": 1.0328191886933478, + "grad_norm": 0.8704424500465393, + "learning_rate": 0.00013116051471032442, + "loss": 2.7954, + "step": 11400 + }, + { + "epoch": 1.0329097868677946, + "grad_norm": 0.9470900893211365, + "learning_rate": 0.00013115447350933366, + "loss": 2.5814, + "step": 11401 + }, + { + "epoch": 1.0330003850422413, + "grad_norm": 0.8852320313453674, + "learning_rate": 0.00013114843230834292, + "loss": 2.8029, + "step": 11402 + }, + { + "epoch": 1.0330909832166881, + "grad_norm": 0.7666469812393188, + "learning_rate": 0.00013114239110735215, + "loss": 2.0888, + "step": 11403 + }, + { + "epoch": 1.033181581391135, + "grad_norm": 0.8461340665817261, + "learning_rate": 0.00013113634990636139, + "loss": 2.4328, + "step": 11404 + }, + { + "epoch": 1.0332721795655817, + "grad_norm": 0.9465137720108032, + "learning_rate": 0.00013113030870537065, + "loss": 2.6715, + "step": 11405 + }, + { + "epoch": 1.0333627777400285, + "grad_norm": 0.9385329484939575, + "learning_rate": 0.00013112426750437988, + "loss": 2.5568, + "step": 11406 + }, + { + "epoch": 1.0334533759144753, + "grad_norm": 0.9059379696846008, + "learning_rate": 0.00013111822630338911, + "loss": 2.6063, + "step": 11407 + }, + { + "epoch": 1.033543974088922, + "grad_norm": 0.866209089756012, + "learning_rate": 0.00013111218510239835, + "loss": 2.5303, + "step": 11408 + }, + { + "epoch": 1.0336345722633689, + "grad_norm": 0.8320260047912598, + "learning_rate": 0.0001311061439014076, + "loss": 2.5784, + "step": 11409 + }, + { + "epoch": 1.0337251704378156, + "grad_norm": 0.887982189655304, + "learning_rate": 0.00013110010270041684, + "loss": 2.7685, + "step": 11410 + }, + { + "epoch": 1.0338157686122624, + "grad_norm": 0.9076502323150635, + "learning_rate": 0.00013109406149942608, + "loss": 2.6824, + "step": 11411 + }, + { + "epoch": 1.0339063667867092, + "grad_norm": 0.8263260126113892, + "learning_rate": 0.00013108802029843534, + "loss": 2.7076, + "step": 11412 + }, + { + "epoch": 1.033996964961156, + "grad_norm": 0.8795676827430725, + "learning_rate": 0.00013108197909744457, + "loss": 2.6968, + "step": 11413 + }, + { + "epoch": 1.0340875631356028, + "grad_norm": 0.8250941038131714, + "learning_rate": 0.00013107593789645383, + "loss": 2.2079, + "step": 11414 + }, + { + "epoch": 1.0341781613100496, + "grad_norm": 0.8861020803451538, + "learning_rate": 0.00013106989669546307, + "loss": 2.4699, + "step": 11415 + }, + { + "epoch": 1.0342687594844964, + "grad_norm": 0.823660135269165, + "learning_rate": 0.0001310638554944723, + "loss": 2.7271, + "step": 11416 + }, + { + "epoch": 1.0343593576589432, + "grad_norm": 1.0334450006484985, + "learning_rate": 0.00013105781429348154, + "loss": 2.803, + "step": 11417 + }, + { + "epoch": 1.03444995583339, + "grad_norm": 0.7073245644569397, + "learning_rate": 0.0001310517730924908, + "loss": 1.9127, + "step": 11418 + }, + { + "epoch": 1.0345405540078367, + "grad_norm": 0.8619771599769592, + "learning_rate": 0.00013104573189150006, + "loss": 2.7967, + "step": 11419 + }, + { + "epoch": 1.0346311521822835, + "grad_norm": 0.848743736743927, + "learning_rate": 0.00013103969069050927, + "loss": 2.7472, + "step": 11420 + }, + { + "epoch": 1.0347217503567303, + "grad_norm": 0.8613580465316772, + "learning_rate": 0.00013103364948951853, + "loss": 2.4107, + "step": 11421 + }, + { + "epoch": 1.034812348531177, + "grad_norm": 0.8671005368232727, + "learning_rate": 0.00013102760828852776, + "loss": 2.572, + "step": 11422 + }, + { + "epoch": 1.0349029467056239, + "grad_norm": 0.8178399205207825, + "learning_rate": 0.00013102156708753702, + "loss": 2.5844, + "step": 11423 + }, + { + "epoch": 1.0349935448800707, + "grad_norm": 0.8936229348182678, + "learning_rate": 0.00013101552588654623, + "loss": 2.6859, + "step": 11424 + }, + { + "epoch": 1.0350841430545175, + "grad_norm": 0.8684024214744568, + "learning_rate": 0.0001310094846855555, + "loss": 2.765, + "step": 11425 + }, + { + "epoch": 1.0351747412289642, + "grad_norm": 0.8934696912765503, + "learning_rate": 0.00013100344348456475, + "loss": 2.442, + "step": 11426 + }, + { + "epoch": 1.035265339403411, + "grad_norm": 0.8185001015663147, + "learning_rate": 0.00013099740228357399, + "loss": 2.4052, + "step": 11427 + }, + { + "epoch": 1.0353559375778578, + "grad_norm": 0.8902931809425354, + "learning_rate": 0.00013099136108258322, + "loss": 2.6155, + "step": 11428 + }, + { + "epoch": 1.0354465357523046, + "grad_norm": 0.865828275680542, + "learning_rate": 0.00013098531988159245, + "loss": 2.4828, + "step": 11429 + }, + { + "epoch": 1.0355371339267514, + "grad_norm": 0.8499835729598999, + "learning_rate": 0.00013097927868060171, + "loss": 2.6559, + "step": 11430 + }, + { + "epoch": 1.0356277321011982, + "grad_norm": 0.8735024929046631, + "learning_rate": 0.00013097323747961095, + "loss": 2.9907, + "step": 11431 + }, + { + "epoch": 1.035718330275645, + "grad_norm": 0.8762850761413574, + "learning_rate": 0.0001309671962786202, + "loss": 2.7679, + "step": 11432 + }, + { + "epoch": 1.0358089284500918, + "grad_norm": 0.8594050407409668, + "learning_rate": 0.00013096115507762944, + "loss": 2.6553, + "step": 11433 + }, + { + "epoch": 1.0358995266245385, + "grad_norm": 0.7317241430282593, + "learning_rate": 0.00013095511387663868, + "loss": 1.9176, + "step": 11434 + }, + { + "epoch": 1.0359901247989853, + "grad_norm": 0.8906294703483582, + "learning_rate": 0.00013094907267564794, + "loss": 2.6862, + "step": 11435 + }, + { + "epoch": 1.0360807229734321, + "grad_norm": 0.8582792282104492, + "learning_rate": 0.00013094303147465717, + "loss": 2.6168, + "step": 11436 + }, + { + "epoch": 1.036171321147879, + "grad_norm": 0.959373950958252, + "learning_rate": 0.0001309369902736664, + "loss": 2.6199, + "step": 11437 + }, + { + "epoch": 1.0362619193223257, + "grad_norm": 0.849859356880188, + "learning_rate": 0.00013093094907267564, + "loss": 2.6658, + "step": 11438 + }, + { + "epoch": 1.0363525174967725, + "grad_norm": 1.003104329109192, + "learning_rate": 0.0001309249078716849, + "loss": 2.5327, + "step": 11439 + }, + { + "epoch": 1.0364431156712193, + "grad_norm": 0.8837713003158569, + "learning_rate": 0.00013091886667069416, + "loss": 2.4716, + "step": 11440 + }, + { + "epoch": 1.036533713845666, + "grad_norm": 0.8613578677177429, + "learning_rate": 0.00013091282546970337, + "loss": 2.8286, + "step": 11441 + }, + { + "epoch": 1.0366243120201128, + "grad_norm": 0.8898236751556396, + "learning_rate": 0.00013090678426871263, + "loss": 2.511, + "step": 11442 + }, + { + "epoch": 1.0367149101945596, + "grad_norm": 0.6311081647872925, + "learning_rate": 0.00013090074306772187, + "loss": 1.2925, + "step": 11443 + }, + { + "epoch": 1.0368055083690064, + "grad_norm": 0.8861545920372009, + "learning_rate": 0.00013089470186673113, + "loss": 2.8994, + "step": 11444 + }, + { + "epoch": 1.0368961065434532, + "grad_norm": 0.9366837739944458, + "learning_rate": 0.00013088866066574036, + "loss": 2.8204, + "step": 11445 + }, + { + "epoch": 1.0369867047179, + "grad_norm": 0.9060474038124084, + "learning_rate": 0.0001308826194647496, + "loss": 2.8706, + "step": 11446 + }, + { + "epoch": 1.0370773028923468, + "grad_norm": 0.8377496004104614, + "learning_rate": 0.00013087657826375883, + "loss": 2.7521, + "step": 11447 + }, + { + "epoch": 1.0371679010667936, + "grad_norm": 0.8981912136077881, + "learning_rate": 0.0001308705370627681, + "loss": 2.5408, + "step": 11448 + }, + { + "epoch": 1.0372584992412404, + "grad_norm": 0.9802030920982361, + "learning_rate": 0.00013086449586177732, + "loss": 2.6034, + "step": 11449 + }, + { + "epoch": 1.0373490974156871, + "grad_norm": 0.8622349500656128, + "learning_rate": 0.00013085845466078656, + "loss": 2.5774, + "step": 11450 + }, + { + "epoch": 1.037439695590134, + "grad_norm": 0.8883572220802307, + "learning_rate": 0.00013085241345979582, + "loss": 2.7332, + "step": 11451 + }, + { + "epoch": 1.0375302937645807, + "grad_norm": 0.8933494091033936, + "learning_rate": 0.00013084637225880505, + "loss": 2.5344, + "step": 11452 + }, + { + "epoch": 1.0376208919390275, + "grad_norm": 0.9516342282295227, + "learning_rate": 0.00013084033105781431, + "loss": 2.4325, + "step": 11453 + }, + { + "epoch": 1.0377114901134743, + "grad_norm": 0.8476973176002502, + "learning_rate": 0.00013083428985682352, + "loss": 2.8271, + "step": 11454 + }, + { + "epoch": 1.037802088287921, + "grad_norm": 0.9212738871574402, + "learning_rate": 0.00013082824865583278, + "loss": 2.6981, + "step": 11455 + }, + { + "epoch": 1.0378926864623679, + "grad_norm": 0.9713603854179382, + "learning_rate": 0.00013082220745484204, + "loss": 2.6106, + "step": 11456 + }, + { + "epoch": 1.0379832846368147, + "grad_norm": 0.8489305377006531, + "learning_rate": 0.00013081616625385128, + "loss": 2.8065, + "step": 11457 + }, + { + "epoch": 1.0380738828112615, + "grad_norm": 0.7685877680778503, + "learning_rate": 0.0001308101250528605, + "loss": 1.9068, + "step": 11458 + }, + { + "epoch": 1.0381644809857082, + "grad_norm": 0.9338744282722473, + "learning_rate": 0.00013080408385186975, + "loss": 2.6448, + "step": 11459 + }, + { + "epoch": 1.038255079160155, + "grad_norm": 0.8527113199234009, + "learning_rate": 0.000130798042650879, + "loss": 2.7393, + "step": 11460 + }, + { + "epoch": 1.0383456773346018, + "grad_norm": 0.8793390393257141, + "learning_rate": 0.00013079200144988824, + "loss": 2.8434, + "step": 11461 + }, + { + "epoch": 1.0384362755090484, + "grad_norm": 0.8239098787307739, + "learning_rate": 0.00013078596024889748, + "loss": 2.1547, + "step": 11462 + }, + { + "epoch": 1.0385268736834954, + "grad_norm": 0.818260908126831, + "learning_rate": 0.00013077991904790674, + "loss": 2.6935, + "step": 11463 + }, + { + "epoch": 1.038617471857942, + "grad_norm": 0.8725025653839111, + "learning_rate": 0.00013077387784691597, + "loss": 2.6623, + "step": 11464 + }, + { + "epoch": 1.0387080700323887, + "grad_norm": 0.8350239992141724, + "learning_rate": 0.00013076783664592523, + "loss": 2.4652, + "step": 11465 + }, + { + "epoch": 1.0387986682068355, + "grad_norm": 0.8419706225395203, + "learning_rate": 0.00013076179544493447, + "loss": 2.5801, + "step": 11466 + }, + { + "epoch": 1.0388892663812823, + "grad_norm": 0.8796727061271667, + "learning_rate": 0.0001307557542439437, + "loss": 2.8853, + "step": 11467 + }, + { + "epoch": 1.038979864555729, + "grad_norm": 0.9114300608634949, + "learning_rate": 0.00013074971304295293, + "loss": 2.9075, + "step": 11468 + }, + { + "epoch": 1.039070462730176, + "grad_norm": 0.888637900352478, + "learning_rate": 0.0001307436718419622, + "loss": 2.6732, + "step": 11469 + }, + { + "epoch": 1.0391610609046227, + "grad_norm": 0.8556389212608337, + "learning_rate": 0.00013073763064097143, + "loss": 3.0426, + "step": 11470 + }, + { + "epoch": 1.0392516590790695, + "grad_norm": 0.775099515914917, + "learning_rate": 0.00013073158943998066, + "loss": 2.0588, + "step": 11471 + }, + { + "epoch": 1.0393422572535163, + "grad_norm": 0.8742817640304565, + "learning_rate": 0.00013072554823898992, + "loss": 2.4556, + "step": 11472 + }, + { + "epoch": 1.039432855427963, + "grad_norm": 0.867897093296051, + "learning_rate": 0.00013071950703799916, + "loss": 2.8234, + "step": 11473 + }, + { + "epoch": 1.0395234536024098, + "grad_norm": 0.8527962565422058, + "learning_rate": 0.00013071346583700842, + "loss": 2.6953, + "step": 11474 + }, + { + "epoch": 1.0396140517768566, + "grad_norm": 0.8982307314872742, + "learning_rate": 0.00013070742463601763, + "loss": 2.6364, + "step": 11475 + }, + { + "epoch": 1.0397046499513034, + "grad_norm": 0.9078572392463684, + "learning_rate": 0.0001307013834350269, + "loss": 2.8511, + "step": 11476 + }, + { + "epoch": 1.0397952481257502, + "grad_norm": 0.9130671620368958, + "learning_rate": 0.00013069534223403612, + "loss": 2.722, + "step": 11477 + }, + { + "epoch": 1.039885846300197, + "grad_norm": 0.9188193678855896, + "learning_rate": 0.00013068930103304538, + "loss": 2.7009, + "step": 11478 + }, + { + "epoch": 1.0399764444746438, + "grad_norm": 0.9356524348258972, + "learning_rate": 0.00013068325983205462, + "loss": 2.7946, + "step": 11479 + }, + { + "epoch": 1.0400670426490906, + "grad_norm": 0.8782621026039124, + "learning_rate": 0.00013067721863106385, + "loss": 2.6284, + "step": 11480 + }, + { + "epoch": 1.0401576408235373, + "grad_norm": 0.7293270230293274, + "learning_rate": 0.0001306711774300731, + "loss": 2.0954, + "step": 11481 + }, + { + "epoch": 1.0402482389979841, + "grad_norm": 0.8757433891296387, + "learning_rate": 0.00013066513622908235, + "loss": 2.5348, + "step": 11482 + }, + { + "epoch": 1.040338837172431, + "grad_norm": 0.908167839050293, + "learning_rate": 0.00013065909502809158, + "loss": 2.8655, + "step": 11483 + }, + { + "epoch": 1.0404294353468777, + "grad_norm": 0.8337087631225586, + "learning_rate": 0.00013065305382710081, + "loss": 2.7057, + "step": 11484 + }, + { + "epoch": 1.0405200335213245, + "grad_norm": 0.8310267329216003, + "learning_rate": 0.00013064701262611008, + "loss": 2.9021, + "step": 11485 + }, + { + "epoch": 1.0406106316957713, + "grad_norm": 0.8711749315261841, + "learning_rate": 0.00013064097142511934, + "loss": 2.67, + "step": 11486 + }, + { + "epoch": 1.040701229870218, + "grad_norm": 0.8331395387649536, + "learning_rate": 0.00013063493022412857, + "loss": 2.585, + "step": 11487 + }, + { + "epoch": 1.0407918280446649, + "grad_norm": 0.898405134677887, + "learning_rate": 0.0001306288890231378, + "loss": 2.5273, + "step": 11488 + }, + { + "epoch": 1.0408824262191116, + "grad_norm": 0.8912107348442078, + "learning_rate": 0.00013062284782214704, + "loss": 2.8246, + "step": 11489 + }, + { + "epoch": 1.0409730243935584, + "grad_norm": 0.8138939738273621, + "learning_rate": 0.0001306168066211563, + "loss": 2.0487, + "step": 11490 + }, + { + "epoch": 1.0410636225680052, + "grad_norm": 0.8091341257095337, + "learning_rate": 0.00013061076542016553, + "loss": 2.7567, + "step": 11491 + }, + { + "epoch": 1.041154220742452, + "grad_norm": 0.9591588377952576, + "learning_rate": 0.00013060472421917477, + "loss": 2.902, + "step": 11492 + }, + { + "epoch": 1.0412448189168988, + "grad_norm": 0.8775354027748108, + "learning_rate": 0.00013059868301818403, + "loss": 2.8543, + "step": 11493 + }, + { + "epoch": 1.0413354170913456, + "grad_norm": 0.9574953317642212, + "learning_rate": 0.00013059264181719326, + "loss": 3.0762, + "step": 11494 + }, + { + "epoch": 1.0414260152657924, + "grad_norm": 0.8032853007316589, + "learning_rate": 0.00013058660061620252, + "loss": 2.5346, + "step": 11495 + }, + { + "epoch": 1.0415166134402392, + "grad_norm": 0.8972880840301514, + "learning_rate": 0.00013058055941521176, + "loss": 2.5826, + "step": 11496 + }, + { + "epoch": 1.041607211614686, + "grad_norm": 0.8293015956878662, + "learning_rate": 0.000130574518214221, + "loss": 2.5968, + "step": 11497 + }, + { + "epoch": 1.0416978097891327, + "grad_norm": 0.903153121471405, + "learning_rate": 0.00013056847701323023, + "loss": 2.5267, + "step": 11498 + }, + { + "epoch": 1.0417884079635795, + "grad_norm": 0.8050899505615234, + "learning_rate": 0.0001305624358122395, + "loss": 2.0491, + "step": 11499 + }, + { + "epoch": 1.0418790061380263, + "grad_norm": 0.8574737310409546, + "learning_rate": 0.00013055639461124872, + "loss": 2.8261, + "step": 11500 + }, + { + "epoch": 1.041969604312473, + "grad_norm": 1.067325472831726, + "learning_rate": 0.00013055035341025796, + "loss": 2.6985, + "step": 11501 + }, + { + "epoch": 1.0420602024869199, + "grad_norm": 0.8881846070289612, + "learning_rate": 0.00013054431220926722, + "loss": 2.5509, + "step": 11502 + }, + { + "epoch": 1.0421508006613667, + "grad_norm": 0.894355297088623, + "learning_rate": 0.00013053827100827645, + "loss": 2.7061, + "step": 11503 + }, + { + "epoch": 1.0422413988358135, + "grad_norm": 0.8653911352157593, + "learning_rate": 0.0001305322298072857, + "loss": 2.83, + "step": 11504 + }, + { + "epoch": 1.0423319970102602, + "grad_norm": 0.8975962400436401, + "learning_rate": 0.00013052618860629492, + "loss": 2.9036, + "step": 11505 + }, + { + "epoch": 1.042422595184707, + "grad_norm": 0.7554945945739746, + "learning_rate": 0.00013052014740530418, + "loss": 2.1068, + "step": 11506 + }, + { + "epoch": 1.0425131933591538, + "grad_norm": 0.82966148853302, + "learning_rate": 0.00013051410620431341, + "loss": 2.7356, + "step": 11507 + }, + { + "epoch": 1.0426037915336006, + "grad_norm": 0.9025017023086548, + "learning_rate": 0.00013050806500332268, + "loss": 2.6877, + "step": 11508 + }, + { + "epoch": 1.0426943897080474, + "grad_norm": 0.9066196084022522, + "learning_rate": 0.0001305020238023319, + "loss": 2.8583, + "step": 11509 + }, + { + "epoch": 1.0427849878824942, + "grad_norm": 0.875683069229126, + "learning_rate": 0.00013049598260134114, + "loss": 2.7162, + "step": 11510 + }, + { + "epoch": 1.042875586056941, + "grad_norm": 0.838878333568573, + "learning_rate": 0.0001304899414003504, + "loss": 2.5213, + "step": 11511 + }, + { + "epoch": 1.0429661842313878, + "grad_norm": 0.8506826162338257, + "learning_rate": 0.00013048390019935964, + "loss": 2.8036, + "step": 11512 + }, + { + "epoch": 1.0430567824058345, + "grad_norm": 0.8658949732780457, + "learning_rate": 0.00013047785899836887, + "loss": 2.6833, + "step": 11513 + }, + { + "epoch": 1.0431473805802813, + "grad_norm": 0.8547382354736328, + "learning_rate": 0.0001304718177973781, + "loss": 2.5438, + "step": 11514 + }, + { + "epoch": 1.0432379787547281, + "grad_norm": 0.8522206544876099, + "learning_rate": 0.00013046577659638737, + "loss": 2.6029, + "step": 11515 + }, + { + "epoch": 1.043328576929175, + "grad_norm": 0.840431272983551, + "learning_rate": 0.00013045973539539663, + "loss": 2.6317, + "step": 11516 + }, + { + "epoch": 1.0434191751036217, + "grad_norm": 0.8292078375816345, + "learning_rate": 0.00013045369419440586, + "loss": 2.7625, + "step": 11517 + }, + { + "epoch": 1.0435097732780685, + "grad_norm": 0.8355238437652588, + "learning_rate": 0.0001304476529934151, + "loss": 2.5512, + "step": 11518 + }, + { + "epoch": 1.0436003714525153, + "grad_norm": 0.761737585067749, + "learning_rate": 0.00013044161179242433, + "loss": 2.3634, + "step": 11519 + }, + { + "epoch": 1.043690969626962, + "grad_norm": 0.8432960510253906, + "learning_rate": 0.0001304355705914336, + "loss": 2.8086, + "step": 11520 + }, + { + "epoch": 1.0437815678014088, + "grad_norm": 1.077613115310669, + "learning_rate": 0.00013042952939044283, + "loss": 2.5149, + "step": 11521 + }, + { + "epoch": 1.0438721659758556, + "grad_norm": 0.8090775609016418, + "learning_rate": 0.00013042348818945206, + "loss": 2.5302, + "step": 11522 + }, + { + "epoch": 1.0439627641503024, + "grad_norm": 0.8613194823265076, + "learning_rate": 0.00013041744698846132, + "loss": 2.5146, + "step": 11523 + }, + { + "epoch": 1.0440533623247492, + "grad_norm": 0.7655290961265564, + "learning_rate": 0.00013041140578747056, + "loss": 2.2155, + "step": 11524 + }, + { + "epoch": 1.044143960499196, + "grad_norm": 0.7684474587440491, + "learning_rate": 0.00013040536458647982, + "loss": 2.0694, + "step": 11525 + }, + { + "epoch": 1.0442345586736428, + "grad_norm": 0.8687455058097839, + "learning_rate": 0.00013039932338548902, + "loss": 2.9205, + "step": 11526 + }, + { + "epoch": 1.0443251568480896, + "grad_norm": 1.0759378671646118, + "learning_rate": 0.00013039328218449829, + "loss": 2.6193, + "step": 11527 + }, + { + "epoch": 1.0444157550225364, + "grad_norm": 0.8425675630569458, + "learning_rate": 0.00013038724098350752, + "loss": 2.6281, + "step": 11528 + }, + { + "epoch": 1.0445063531969832, + "grad_norm": 0.7954870462417603, + "learning_rate": 0.00013038119978251678, + "loss": 1.9683, + "step": 11529 + }, + { + "epoch": 1.04459695137143, + "grad_norm": 0.8685157895088196, + "learning_rate": 0.00013037515858152601, + "loss": 2.592, + "step": 11530 + }, + { + "epoch": 1.0446875495458767, + "grad_norm": 0.8546211123466492, + "learning_rate": 0.00013036911738053525, + "loss": 2.8506, + "step": 11531 + }, + { + "epoch": 1.0447781477203235, + "grad_norm": 0.9079587459564209, + "learning_rate": 0.0001303630761795445, + "loss": 2.7477, + "step": 11532 + }, + { + "epoch": 1.0448687458947703, + "grad_norm": 0.8734347224235535, + "learning_rate": 0.00013035703497855374, + "loss": 2.6196, + "step": 11533 + }, + { + "epoch": 1.044959344069217, + "grad_norm": 0.9381318688392639, + "learning_rate": 0.00013035099377756298, + "loss": 2.5212, + "step": 11534 + }, + { + "epoch": 1.0450499422436639, + "grad_norm": 0.9607807397842407, + "learning_rate": 0.0001303449525765722, + "loss": 2.5709, + "step": 11535 + }, + { + "epoch": 1.0451405404181107, + "grad_norm": 0.8712701201438904, + "learning_rate": 0.00013033891137558147, + "loss": 2.6859, + "step": 11536 + }, + { + "epoch": 1.0452311385925575, + "grad_norm": 0.9106449484825134, + "learning_rate": 0.0001303328701745907, + "loss": 2.7691, + "step": 11537 + }, + { + "epoch": 1.0453217367670042, + "grad_norm": 0.7565534114837646, + "learning_rate": 0.00013032682897359997, + "loss": 1.9999, + "step": 11538 + }, + { + "epoch": 1.045412334941451, + "grad_norm": 0.9080372452735901, + "learning_rate": 0.0001303207877726092, + "loss": 2.6341, + "step": 11539 + }, + { + "epoch": 1.0455029331158978, + "grad_norm": 0.9076554775238037, + "learning_rate": 0.00013031474657161844, + "loss": 2.6712, + "step": 11540 + }, + { + "epoch": 1.0455935312903446, + "grad_norm": 0.8689590692520142, + "learning_rate": 0.0001303087053706277, + "loss": 2.7438, + "step": 11541 + }, + { + "epoch": 1.0456841294647914, + "grad_norm": 0.6728610396385193, + "learning_rate": 0.00013030266416963693, + "loss": 1.2863, + "step": 11542 + }, + { + "epoch": 1.045774727639238, + "grad_norm": 0.8527954816818237, + "learning_rate": 0.00013029662296864617, + "loss": 2.4838, + "step": 11543 + }, + { + "epoch": 1.045865325813685, + "grad_norm": 0.9737771153450012, + "learning_rate": 0.0001302905817676554, + "loss": 2.719, + "step": 11544 + }, + { + "epoch": 1.0459559239881315, + "grad_norm": 0.8966338038444519, + "learning_rate": 0.00013028454056666466, + "loss": 2.8433, + "step": 11545 + }, + { + "epoch": 1.0460465221625783, + "grad_norm": 0.9338751435279846, + "learning_rate": 0.00013027849936567392, + "loss": 2.3909, + "step": 11546 + }, + { + "epoch": 1.046137120337025, + "grad_norm": 0.7671496272087097, + "learning_rate": 0.00013027245816468313, + "loss": 2.0047, + "step": 11547 + }, + { + "epoch": 1.046227718511472, + "grad_norm": 0.8316596746444702, + "learning_rate": 0.0001302664169636924, + "loss": 2.7751, + "step": 11548 + }, + { + "epoch": 1.0463183166859187, + "grad_norm": 0.883520245552063, + "learning_rate": 0.00013026037576270162, + "loss": 2.6009, + "step": 11549 + }, + { + "epoch": 1.0464089148603655, + "grad_norm": 0.8402029871940613, + "learning_rate": 0.00013025433456171089, + "loss": 2.5856, + "step": 11550 + }, + { + "epoch": 1.0464995130348123, + "grad_norm": 0.8129808306694031, + "learning_rate": 0.00013024829336072012, + "loss": 2.5432, + "step": 11551 + }, + { + "epoch": 1.046590111209259, + "grad_norm": 0.8529233336448669, + "learning_rate": 0.00013024225215972935, + "loss": 2.1217, + "step": 11552 + }, + { + "epoch": 1.0466807093837058, + "grad_norm": 0.8219574093818665, + "learning_rate": 0.00013023621095873861, + "loss": 2.4987, + "step": 11553 + }, + { + "epoch": 1.0467713075581526, + "grad_norm": 0.8789244294166565, + "learning_rate": 0.00013023016975774785, + "loss": 2.7518, + "step": 11554 + }, + { + "epoch": 1.0468619057325994, + "grad_norm": 0.937574028968811, + "learning_rate": 0.0001302241285567571, + "loss": 3.093, + "step": 11555 + }, + { + "epoch": 1.0469525039070462, + "grad_norm": 0.7877078652381897, + "learning_rate": 0.00013021808735576632, + "loss": 2.5599, + "step": 11556 + }, + { + "epoch": 1.047043102081493, + "grad_norm": 0.9150423407554626, + "learning_rate": 0.00013021204615477558, + "loss": 2.7301, + "step": 11557 + }, + { + "epoch": 1.0471337002559398, + "grad_norm": 0.8484898209571838, + "learning_rate": 0.0001302060049537848, + "loss": 2.7239, + "step": 11558 + }, + { + "epoch": 1.0472242984303866, + "grad_norm": 0.8756781816482544, + "learning_rate": 0.00013019996375279407, + "loss": 2.8102, + "step": 11559 + }, + { + "epoch": 1.0473148966048333, + "grad_norm": 0.9119872450828552, + "learning_rate": 0.0001301939225518033, + "loss": 2.1779, + "step": 11560 + }, + { + "epoch": 1.0474054947792801, + "grad_norm": 0.8659595847129822, + "learning_rate": 0.00013018788135081254, + "loss": 3.1573, + "step": 11561 + }, + { + "epoch": 1.047496092953727, + "grad_norm": 0.8112178444862366, + "learning_rate": 0.0001301818401498218, + "loss": 2.4358, + "step": 11562 + }, + { + "epoch": 1.0475866911281737, + "grad_norm": 0.8881739377975464, + "learning_rate": 0.00013017579894883104, + "loss": 3.1318, + "step": 11563 + }, + { + "epoch": 1.0476772893026205, + "grad_norm": 0.8918806314468384, + "learning_rate": 0.00013016975774784027, + "loss": 2.7363, + "step": 11564 + }, + { + "epoch": 1.0477678874770673, + "grad_norm": 0.9084779620170593, + "learning_rate": 0.0001301637165468495, + "loss": 2.7782, + "step": 11565 + }, + { + "epoch": 1.047858485651514, + "grad_norm": 0.8669302463531494, + "learning_rate": 0.00013015767534585877, + "loss": 2.8975, + "step": 11566 + }, + { + "epoch": 1.0479490838259609, + "grad_norm": 0.9234402179718018, + "learning_rate": 0.000130151634144868, + "loss": 2.947, + "step": 11567 + }, + { + "epoch": 1.0480396820004076, + "grad_norm": 0.8966816067695618, + "learning_rate": 0.00013014559294387726, + "loss": 2.6878, + "step": 11568 + }, + { + "epoch": 1.0481302801748544, + "grad_norm": 0.9010366797447205, + "learning_rate": 0.0001301395517428865, + "loss": 2.6338, + "step": 11569 + }, + { + "epoch": 1.0482208783493012, + "grad_norm": 0.8834646940231323, + "learning_rate": 0.00013013351054189573, + "loss": 2.7453, + "step": 11570 + }, + { + "epoch": 1.048311476523748, + "grad_norm": 0.863472044467926, + "learning_rate": 0.000130127469340905, + "loss": 2.7546, + "step": 11571 + }, + { + "epoch": 1.0484020746981948, + "grad_norm": 0.9473832845687866, + "learning_rate": 0.00013012142813991422, + "loss": 2.5243, + "step": 11572 + }, + { + "epoch": 1.0484926728726416, + "grad_norm": 0.8516150116920471, + "learning_rate": 0.00013011538693892346, + "loss": 2.7916, + "step": 11573 + }, + { + "epoch": 1.0485832710470884, + "grad_norm": 0.903005063533783, + "learning_rate": 0.0001301093457379327, + "loss": 2.8226, + "step": 11574 + }, + { + "epoch": 1.0486738692215352, + "grad_norm": 0.8919252753257751, + "learning_rate": 0.00013010330453694195, + "loss": 2.6982, + "step": 11575 + }, + { + "epoch": 1.048764467395982, + "grad_norm": 0.8815717101097107, + "learning_rate": 0.00013009726333595121, + "loss": 2.7071, + "step": 11576 + }, + { + "epoch": 1.0488550655704287, + "grad_norm": 0.7399741411209106, + "learning_rate": 0.00013009122213496042, + "loss": 2.0774, + "step": 11577 + }, + { + "epoch": 1.0489456637448755, + "grad_norm": 0.9052515029907227, + "learning_rate": 0.00013008518093396968, + "loss": 2.8802, + "step": 11578 + }, + { + "epoch": 1.0490362619193223, + "grad_norm": 0.8045221567153931, + "learning_rate": 0.00013007913973297892, + "loss": 2.7352, + "step": 11579 + }, + { + "epoch": 1.049126860093769, + "grad_norm": 0.870939314365387, + "learning_rate": 0.00013007309853198818, + "loss": 2.6462, + "step": 11580 + }, + { + "epoch": 1.0492174582682159, + "grad_norm": 0.8758457899093628, + "learning_rate": 0.0001300670573309974, + "loss": 2.5852, + "step": 11581 + }, + { + "epoch": 1.0493080564426627, + "grad_norm": 0.7555779814720154, + "learning_rate": 0.00013006101613000665, + "loss": 2.0222, + "step": 11582 + }, + { + "epoch": 1.0493986546171095, + "grad_norm": 0.9305556416511536, + "learning_rate": 0.0001300549749290159, + "loss": 2.5905, + "step": 11583 + }, + { + "epoch": 1.0494892527915562, + "grad_norm": 0.8661393523216248, + "learning_rate": 0.00013004893372802514, + "loss": 2.6482, + "step": 11584 + }, + { + "epoch": 1.049579850966003, + "grad_norm": 0.8719338774681091, + "learning_rate": 0.00013004289252703438, + "loss": 2.5321, + "step": 11585 + }, + { + "epoch": 1.0496704491404498, + "grad_norm": 0.7288873195648193, + "learning_rate": 0.0001300368513260436, + "loss": 2.0163, + "step": 11586 + }, + { + "epoch": 1.0497610473148966, + "grad_norm": 0.8439109325408936, + "learning_rate": 0.00013003081012505287, + "loss": 2.5885, + "step": 11587 + }, + { + "epoch": 1.0498516454893434, + "grad_norm": 0.8682496547698975, + "learning_rate": 0.0001300247689240621, + "loss": 2.7417, + "step": 11588 + }, + { + "epoch": 1.0499422436637902, + "grad_norm": 0.7125054001808167, + "learning_rate": 0.00013001872772307137, + "loss": 1.9022, + "step": 11589 + }, + { + "epoch": 1.050032841838237, + "grad_norm": 0.986255407333374, + "learning_rate": 0.0001300126865220806, + "loss": 2.549, + "step": 11590 + }, + { + "epoch": 1.0501234400126838, + "grad_norm": 0.8651114106178284, + "learning_rate": 0.00013000664532108983, + "loss": 2.4509, + "step": 11591 + }, + { + "epoch": 1.0502140381871305, + "grad_norm": 0.9055672883987427, + "learning_rate": 0.0001300006041200991, + "loss": 2.7996, + "step": 11592 + }, + { + "epoch": 1.0503046363615773, + "grad_norm": 0.8188491463661194, + "learning_rate": 0.00012999456291910833, + "loss": 2.2642, + "step": 11593 + }, + { + "epoch": 1.0503952345360241, + "grad_norm": 0.8791861534118652, + "learning_rate": 0.00012998852171811756, + "loss": 2.6947, + "step": 11594 + }, + { + "epoch": 1.050485832710471, + "grad_norm": 0.8458858728408813, + "learning_rate": 0.0001299824805171268, + "loss": 2.6151, + "step": 11595 + }, + { + "epoch": 1.0505764308849177, + "grad_norm": 0.840779185295105, + "learning_rate": 0.00012997643931613606, + "loss": 2.6683, + "step": 11596 + }, + { + "epoch": 1.0506670290593645, + "grad_norm": 0.8628423810005188, + "learning_rate": 0.0001299703981151453, + "loss": 2.7849, + "step": 11597 + }, + { + "epoch": 1.0507576272338113, + "grad_norm": 0.8531506061553955, + "learning_rate": 0.00012996435691415453, + "loss": 2.6862, + "step": 11598 + }, + { + "epoch": 1.050848225408258, + "grad_norm": 0.8746371269226074, + "learning_rate": 0.0001299583157131638, + "loss": 2.7392, + "step": 11599 + }, + { + "epoch": 1.0509388235827049, + "grad_norm": 0.8567238450050354, + "learning_rate": 0.00012995227451217302, + "loss": 2.6469, + "step": 11600 + }, + { + "epoch": 1.0510294217571516, + "grad_norm": 0.8320102691650391, + "learning_rate": 0.00012994623331118228, + "loss": 2.7231, + "step": 11601 + }, + { + "epoch": 1.0511200199315984, + "grad_norm": 0.9078424572944641, + "learning_rate": 0.00012994019211019152, + "loss": 2.7492, + "step": 11602 + }, + { + "epoch": 1.0512106181060452, + "grad_norm": 0.9015796780586243, + "learning_rate": 0.00012993415090920075, + "loss": 2.5744, + "step": 11603 + }, + { + "epoch": 1.051301216280492, + "grad_norm": 0.883476972579956, + "learning_rate": 0.00012992810970820998, + "loss": 2.9154, + "step": 11604 + }, + { + "epoch": 1.0513918144549388, + "grad_norm": 0.935836911201477, + "learning_rate": 0.00012992206850721925, + "loss": 2.8489, + "step": 11605 + }, + { + "epoch": 1.0514824126293856, + "grad_norm": 0.7769103050231934, + "learning_rate": 0.0001299160273062285, + "loss": 2.0809, + "step": 11606 + }, + { + "epoch": 1.0515730108038324, + "grad_norm": 0.8778992295265198, + "learning_rate": 0.00012990998610523771, + "loss": 2.655, + "step": 11607 + }, + { + "epoch": 1.0516636089782792, + "grad_norm": 0.8735196590423584, + "learning_rate": 0.00012990394490424698, + "loss": 2.5642, + "step": 11608 + }, + { + "epoch": 1.051754207152726, + "grad_norm": 0.8900754451751709, + "learning_rate": 0.0001298979037032562, + "loss": 2.6958, + "step": 11609 + }, + { + "epoch": 1.0518448053271727, + "grad_norm": 0.934294581413269, + "learning_rate": 0.00012989186250226547, + "loss": 2.7049, + "step": 11610 + }, + { + "epoch": 1.0519354035016195, + "grad_norm": 0.8152914643287659, + "learning_rate": 0.00012988582130127468, + "loss": 2.6702, + "step": 11611 + }, + { + "epoch": 1.0520260016760663, + "grad_norm": 0.9114857316017151, + "learning_rate": 0.00012987978010028394, + "loss": 2.6485, + "step": 11612 + }, + { + "epoch": 1.052116599850513, + "grad_norm": 0.8674474954605103, + "learning_rate": 0.0001298737388992932, + "loss": 2.5299, + "step": 11613 + }, + { + "epoch": 1.0522071980249599, + "grad_norm": 0.8525881171226501, + "learning_rate": 0.00012986769769830243, + "loss": 2.8105, + "step": 11614 + }, + { + "epoch": 1.0522977961994067, + "grad_norm": 0.8584138751029968, + "learning_rate": 0.00012986165649731167, + "loss": 2.844, + "step": 11615 + }, + { + "epoch": 1.0523883943738535, + "grad_norm": 0.7734537124633789, + "learning_rate": 0.0001298556152963209, + "loss": 2.0485, + "step": 11616 + }, + { + "epoch": 1.0524789925483002, + "grad_norm": 0.8050795197486877, + "learning_rate": 0.00012984957409533016, + "loss": 2.0886, + "step": 11617 + }, + { + "epoch": 1.052569590722747, + "grad_norm": 0.9256535172462463, + "learning_rate": 0.0001298435328943394, + "loss": 2.5602, + "step": 11618 + }, + { + "epoch": 1.0526601888971938, + "grad_norm": 0.8122329115867615, + "learning_rate": 0.00012983749169334866, + "loss": 2.7172, + "step": 11619 + }, + { + "epoch": 1.0527507870716406, + "grad_norm": 0.8579346537590027, + "learning_rate": 0.0001298314504923579, + "loss": 2.6376, + "step": 11620 + }, + { + "epoch": 1.0528413852460874, + "grad_norm": 0.8287264704704285, + "learning_rate": 0.00012982540929136713, + "loss": 2.8924, + "step": 11621 + }, + { + "epoch": 1.0529319834205342, + "grad_norm": 0.9432464241981506, + "learning_rate": 0.0001298193680903764, + "loss": 2.6715, + "step": 11622 + }, + { + "epoch": 1.053022581594981, + "grad_norm": 0.8306782841682434, + "learning_rate": 0.00012981332688938562, + "loss": 2.6255, + "step": 11623 + }, + { + "epoch": 1.0531131797694275, + "grad_norm": 0.8774224519729614, + "learning_rate": 0.00012980728568839486, + "loss": 2.734, + "step": 11624 + }, + { + "epoch": 1.0532037779438745, + "grad_norm": 0.929307222366333, + "learning_rate": 0.0001298012444874041, + "loss": 2.9388, + "step": 11625 + }, + { + "epoch": 1.053294376118321, + "grad_norm": 0.8730800747871399, + "learning_rate": 0.00012979520328641335, + "loss": 2.9372, + "step": 11626 + }, + { + "epoch": 1.053384974292768, + "grad_norm": 0.7945969104766846, + "learning_rate": 0.00012978916208542259, + "loss": 1.9155, + "step": 11627 + }, + { + "epoch": 1.0534755724672147, + "grad_norm": 0.891274094581604, + "learning_rate": 0.00012978312088443182, + "loss": 2.6967, + "step": 11628 + }, + { + "epoch": 1.0535661706416615, + "grad_norm": 0.9418442249298096, + "learning_rate": 0.00012977707968344108, + "loss": 2.958, + "step": 11629 + }, + { + "epoch": 1.0536567688161083, + "grad_norm": 0.8649837374687195, + "learning_rate": 0.00012977103848245031, + "loss": 2.6151, + "step": 11630 + }, + { + "epoch": 1.053747366990555, + "grad_norm": 0.9177277684211731, + "learning_rate": 0.00012976499728145958, + "loss": 2.5165, + "step": 11631 + }, + { + "epoch": 1.0538379651650018, + "grad_norm": 0.7624941468238831, + "learning_rate": 0.0001297589560804688, + "loss": 1.9491, + "step": 11632 + }, + { + "epoch": 1.0539285633394486, + "grad_norm": 0.8579923510551453, + "learning_rate": 0.00012975291487947804, + "loss": 1.9956, + "step": 11633 + }, + { + "epoch": 1.0540191615138954, + "grad_norm": 1.0102359056472778, + "learning_rate": 0.00012974687367848728, + "loss": 2.7403, + "step": 11634 + }, + { + "epoch": 1.0541097596883422, + "grad_norm": 0.8910506963729858, + "learning_rate": 0.00012974083247749654, + "loss": 2.8132, + "step": 11635 + }, + { + "epoch": 1.054200357862789, + "grad_norm": 0.8426573276519775, + "learning_rate": 0.00012973479127650577, + "loss": 2.708, + "step": 11636 + }, + { + "epoch": 1.0542909560372358, + "grad_norm": 0.9067173004150391, + "learning_rate": 0.000129728750075515, + "loss": 2.8097, + "step": 11637 + }, + { + "epoch": 1.0543815542116826, + "grad_norm": 0.9162536859512329, + "learning_rate": 0.00012972270887452427, + "loss": 2.5848, + "step": 11638 + }, + { + "epoch": 1.0544721523861293, + "grad_norm": 0.854803204536438, + "learning_rate": 0.0001297166676735335, + "loss": 3.0892, + "step": 11639 + }, + { + "epoch": 1.0545627505605761, + "grad_norm": 0.8810678124427795, + "learning_rate": 0.00012971062647254276, + "loss": 2.6877, + "step": 11640 + }, + { + "epoch": 1.054653348735023, + "grad_norm": 0.855836808681488, + "learning_rate": 0.00012970458527155197, + "loss": 2.4705, + "step": 11641 + }, + { + "epoch": 1.0547439469094697, + "grad_norm": 0.8123157024383545, + "learning_rate": 0.00012969854407056123, + "loss": 2.4088, + "step": 11642 + }, + { + "epoch": 1.0548345450839165, + "grad_norm": 0.8669589757919312, + "learning_rate": 0.0001296925028695705, + "loss": 2.847, + "step": 11643 + }, + { + "epoch": 1.0549251432583633, + "grad_norm": 0.8613848090171814, + "learning_rate": 0.00012968646166857973, + "loss": 2.8177, + "step": 11644 + }, + { + "epoch": 1.05501574143281, + "grad_norm": 0.8222469687461853, + "learning_rate": 0.00012968042046758896, + "loss": 2.7746, + "step": 11645 + }, + { + "epoch": 1.0551063396072569, + "grad_norm": 0.9166885018348694, + "learning_rate": 0.0001296743792665982, + "loss": 2.7443, + "step": 11646 + }, + { + "epoch": 1.0551969377817036, + "grad_norm": 0.8373569250106812, + "learning_rate": 0.00012966833806560746, + "loss": 2.7207, + "step": 11647 + }, + { + "epoch": 1.0552875359561504, + "grad_norm": 0.8999388217926025, + "learning_rate": 0.0001296622968646167, + "loss": 2.9072, + "step": 11648 + }, + { + "epoch": 1.0553781341305972, + "grad_norm": 0.9008916020393372, + "learning_rate": 0.00012965625566362592, + "loss": 2.4641, + "step": 11649 + }, + { + "epoch": 1.055468732305044, + "grad_norm": 0.8550629615783691, + "learning_rate": 0.00012965021446263519, + "loss": 2.8593, + "step": 11650 + }, + { + "epoch": 1.0555593304794908, + "grad_norm": 0.8490543365478516, + "learning_rate": 0.00012964417326164442, + "loss": 2.66, + "step": 11651 + }, + { + "epoch": 1.0556499286539376, + "grad_norm": 0.8096756339073181, + "learning_rate": 0.00012963813206065368, + "loss": 2.548, + "step": 11652 + }, + { + "epoch": 1.0557405268283844, + "grad_norm": 0.8016831874847412, + "learning_rate": 0.00012963209085966291, + "loss": 2.4767, + "step": 11653 + }, + { + "epoch": 1.0558311250028312, + "grad_norm": 0.8272764682769775, + "learning_rate": 0.00012962604965867215, + "loss": 2.6796, + "step": 11654 + }, + { + "epoch": 1.055921723177278, + "grad_norm": 0.8663323521614075, + "learning_rate": 0.00012962000845768138, + "loss": 2.3887, + "step": 11655 + }, + { + "epoch": 1.0560123213517247, + "grad_norm": 0.7579489350318909, + "learning_rate": 0.00012961396725669064, + "loss": 2.005, + "step": 11656 + }, + { + "epoch": 1.0561029195261715, + "grad_norm": 0.8579424023628235, + "learning_rate": 0.00012960792605569988, + "loss": 2.2024, + "step": 11657 + }, + { + "epoch": 1.0561935177006183, + "grad_norm": 0.8987650871276855, + "learning_rate": 0.0001296018848547091, + "loss": 2.6861, + "step": 11658 + }, + { + "epoch": 1.056284115875065, + "grad_norm": 0.9296181201934814, + "learning_rate": 0.00012959584365371837, + "loss": 2.7289, + "step": 11659 + }, + { + "epoch": 1.0563747140495119, + "grad_norm": 0.8535952568054199, + "learning_rate": 0.0001295898024527276, + "loss": 2.7246, + "step": 11660 + }, + { + "epoch": 1.0564653122239587, + "grad_norm": 0.8717053532600403, + "learning_rate": 0.00012958376125173687, + "loss": 2.6632, + "step": 11661 + }, + { + "epoch": 1.0565559103984055, + "grad_norm": 0.8653382658958435, + "learning_rate": 0.00012957772005074608, + "loss": 2.7286, + "step": 11662 + }, + { + "epoch": 1.0566465085728522, + "grad_norm": 0.8842229843139648, + "learning_rate": 0.00012957167884975534, + "loss": 2.6008, + "step": 11663 + }, + { + "epoch": 1.056737106747299, + "grad_norm": 0.9670765399932861, + "learning_rate": 0.00012956563764876457, + "loss": 2.3122, + "step": 11664 + }, + { + "epoch": 1.0568277049217458, + "grad_norm": 0.7596291899681091, + "learning_rate": 0.00012955959644777383, + "loss": 2.025, + "step": 11665 + }, + { + "epoch": 1.0569183030961926, + "grad_norm": 0.8654265999794006, + "learning_rate": 0.00012955355524678307, + "loss": 2.6681, + "step": 11666 + }, + { + "epoch": 1.0570089012706394, + "grad_norm": 0.8626565337181091, + "learning_rate": 0.0001295475140457923, + "loss": 2.5919, + "step": 11667 + }, + { + "epoch": 1.0570994994450862, + "grad_norm": 0.8508695960044861, + "learning_rate": 0.00012954147284480156, + "loss": 2.7079, + "step": 11668 + }, + { + "epoch": 1.057190097619533, + "grad_norm": 0.896554708480835, + "learning_rate": 0.0001295354316438108, + "loss": 2.515, + "step": 11669 + }, + { + "epoch": 1.0572806957939798, + "grad_norm": 0.8662987947463989, + "learning_rate": 0.00012952939044282003, + "loss": 2.806, + "step": 11670 + }, + { + "epoch": 1.0573712939684266, + "grad_norm": 0.8227165341377258, + "learning_rate": 0.00012952334924182926, + "loss": 2.4427, + "step": 11671 + }, + { + "epoch": 1.0574618921428733, + "grad_norm": 0.9016183018684387, + "learning_rate": 0.00012951730804083852, + "loss": 2.7829, + "step": 11672 + }, + { + "epoch": 1.0575524903173201, + "grad_norm": 0.848949670791626, + "learning_rate": 0.00012951126683984779, + "loss": 2.9844, + "step": 11673 + }, + { + "epoch": 1.057643088491767, + "grad_norm": 0.850865364074707, + "learning_rate": 0.00012950522563885702, + "loss": 2.5229, + "step": 11674 + }, + { + "epoch": 1.0577336866662137, + "grad_norm": 0.8799096345901489, + "learning_rate": 0.00012949918443786625, + "loss": 2.6669, + "step": 11675 + }, + { + "epoch": 1.0578242848406605, + "grad_norm": 1.0794122219085693, + "learning_rate": 0.0001294931432368755, + "loss": 2.7101, + "step": 11676 + }, + { + "epoch": 1.0579148830151073, + "grad_norm": 0.9113321304321289, + "learning_rate": 0.00012948710203588475, + "loss": 2.6217, + "step": 11677 + }, + { + "epoch": 1.058005481189554, + "grad_norm": 0.873823344707489, + "learning_rate": 0.00012948106083489398, + "loss": 2.5209, + "step": 11678 + }, + { + "epoch": 1.0580960793640009, + "grad_norm": 0.9934367537498474, + "learning_rate": 0.00012947501963390322, + "loss": 2.6243, + "step": 11679 + }, + { + "epoch": 1.0581866775384476, + "grad_norm": 0.8592094779014587, + "learning_rate": 0.00012946897843291248, + "loss": 2.6217, + "step": 11680 + }, + { + "epoch": 1.0582772757128944, + "grad_norm": 0.9382907748222351, + "learning_rate": 0.0001294629372319217, + "loss": 2.8755, + "step": 11681 + }, + { + "epoch": 1.0583678738873412, + "grad_norm": 0.7801971435546875, + "learning_rate": 0.00012945689603093097, + "loss": 1.9454, + "step": 11682 + }, + { + "epoch": 1.058458472061788, + "grad_norm": 0.8638409376144409, + "learning_rate": 0.0001294508548299402, + "loss": 2.8966, + "step": 11683 + }, + { + "epoch": 1.0585490702362348, + "grad_norm": 0.8605515956878662, + "learning_rate": 0.00012944481362894944, + "loss": 2.6603, + "step": 11684 + }, + { + "epoch": 1.0586396684106816, + "grad_norm": 0.8887109756469727, + "learning_rate": 0.00012943877242795868, + "loss": 2.8533, + "step": 11685 + }, + { + "epoch": 1.0587302665851284, + "grad_norm": 0.9572227001190186, + "learning_rate": 0.00012943273122696794, + "loss": 2.6488, + "step": 11686 + }, + { + "epoch": 1.0588208647595752, + "grad_norm": 0.829487681388855, + "learning_rate": 0.00012942669002597717, + "loss": 2.869, + "step": 11687 + }, + { + "epoch": 1.058911462934022, + "grad_norm": 0.6952860951423645, + "learning_rate": 0.0001294206488249864, + "loss": 1.8173, + "step": 11688 + }, + { + "epoch": 1.0590020611084687, + "grad_norm": 0.8666000366210938, + "learning_rate": 0.00012941460762399567, + "loss": 2.8859, + "step": 11689 + }, + { + "epoch": 1.0590926592829155, + "grad_norm": 0.8811712861061096, + "learning_rate": 0.0001294085664230049, + "loss": 2.4953, + "step": 11690 + }, + { + "epoch": 1.0591832574573623, + "grad_norm": 0.8418065905570984, + "learning_rate": 0.00012940252522201416, + "loss": 2.6512, + "step": 11691 + }, + { + "epoch": 1.059273855631809, + "grad_norm": 0.8730806708335876, + "learning_rate": 0.00012939648402102337, + "loss": 2.7824, + "step": 11692 + }, + { + "epoch": 1.0593644538062559, + "grad_norm": 0.903144359588623, + "learning_rate": 0.00012939044282003263, + "loss": 2.8204, + "step": 11693 + }, + { + "epoch": 1.0594550519807027, + "grad_norm": 0.8304439783096313, + "learning_rate": 0.00012938440161904186, + "loss": 2.9807, + "step": 11694 + }, + { + "epoch": 1.0595456501551495, + "grad_norm": 0.876997172832489, + "learning_rate": 0.00012937836041805112, + "loss": 2.603, + "step": 11695 + }, + { + "epoch": 1.0596362483295962, + "grad_norm": 0.8958041667938232, + "learning_rate": 0.00012937231921706036, + "loss": 3.0879, + "step": 11696 + }, + { + "epoch": 1.059726846504043, + "grad_norm": 0.836338996887207, + "learning_rate": 0.0001293662780160696, + "loss": 3.0347, + "step": 11697 + }, + { + "epoch": 1.0598174446784898, + "grad_norm": 0.9963533282279968, + "learning_rate": 0.00012936023681507885, + "loss": 2.7011, + "step": 11698 + }, + { + "epoch": 1.0599080428529366, + "grad_norm": 0.9032695293426514, + "learning_rate": 0.0001293541956140881, + "loss": 2.5496, + "step": 11699 + }, + { + "epoch": 1.0599986410273834, + "grad_norm": 0.9552670121192932, + "learning_rate": 0.00012934815441309732, + "loss": 2.6678, + "step": 11700 + }, + { + "epoch": 1.0600892392018302, + "grad_norm": 0.844016432762146, + "learning_rate": 0.00012934211321210656, + "loss": 2.482, + "step": 11701 + }, + { + "epoch": 1.060179837376277, + "grad_norm": 0.8163989782333374, + "learning_rate": 0.00012933607201111582, + "loss": 2.7448, + "step": 11702 + }, + { + "epoch": 1.0602704355507238, + "grad_norm": 0.8979951739311218, + "learning_rate": 0.00012933003081012508, + "loss": 2.5429, + "step": 11703 + }, + { + "epoch": 1.0603610337251705, + "grad_norm": 0.9064918160438538, + "learning_rate": 0.0001293239896091343, + "loss": 2.67, + "step": 11704 + }, + { + "epoch": 1.060451631899617, + "grad_norm": 0.8895378708839417, + "learning_rate": 0.00012931794840814355, + "loss": 2.974, + "step": 11705 + }, + { + "epoch": 1.0605422300740641, + "grad_norm": 0.8427445292472839, + "learning_rate": 0.00012931190720715278, + "loss": 2.5897, + "step": 11706 + }, + { + "epoch": 1.0606328282485107, + "grad_norm": 1.0542972087860107, + "learning_rate": 0.00012930586600616204, + "loss": 2.8116, + "step": 11707 + }, + { + "epoch": 1.0607234264229575, + "grad_norm": 0.7497166395187378, + "learning_rate": 0.00012929982480517128, + "loss": 2.1562, + "step": 11708 + }, + { + "epoch": 1.0608140245974043, + "grad_norm": 0.9341096878051758, + "learning_rate": 0.0001292937836041805, + "loss": 2.8131, + "step": 11709 + }, + { + "epoch": 1.060904622771851, + "grad_norm": 0.8417386412620544, + "learning_rate": 0.00012928774240318977, + "loss": 2.9029, + "step": 11710 + }, + { + "epoch": 1.0609952209462978, + "grad_norm": 0.8407676219940186, + "learning_rate": 0.000129281701202199, + "loss": 2.609, + "step": 11711 + }, + { + "epoch": 1.0610858191207446, + "grad_norm": 0.809920608997345, + "learning_rate": 0.00012927566000120827, + "loss": 2.4898, + "step": 11712 + }, + { + "epoch": 1.0611764172951914, + "grad_norm": 0.9592170715332031, + "learning_rate": 0.00012926961880021747, + "loss": 3.0317, + "step": 11713 + }, + { + "epoch": 1.0612670154696382, + "grad_norm": 0.9067894816398621, + "learning_rate": 0.00012926357759922673, + "loss": 2.8837, + "step": 11714 + }, + { + "epoch": 1.061357613644085, + "grad_norm": 0.8505570888519287, + "learning_rate": 0.00012925753639823597, + "loss": 2.7348, + "step": 11715 + }, + { + "epoch": 1.0614482118185318, + "grad_norm": 0.8790844678878784, + "learning_rate": 0.00012925149519724523, + "loss": 2.8969, + "step": 11716 + }, + { + "epoch": 1.0615388099929786, + "grad_norm": 0.9133590459823608, + "learning_rate": 0.00012924545399625446, + "loss": 2.7608, + "step": 11717 + }, + { + "epoch": 1.0616294081674253, + "grad_norm": 0.732200026512146, + "learning_rate": 0.0001292394127952637, + "loss": 2.0615, + "step": 11718 + }, + { + "epoch": 1.0617200063418721, + "grad_norm": 0.8273134827613831, + "learning_rate": 0.00012923337159427296, + "loss": 2.6287, + "step": 11719 + }, + { + "epoch": 1.061810604516319, + "grad_norm": 0.8349559903144836, + "learning_rate": 0.0001292273303932822, + "loss": 2.5109, + "step": 11720 + }, + { + "epoch": 1.0619012026907657, + "grad_norm": 0.9088574647903442, + "learning_rate": 0.00012922128919229143, + "loss": 2.9056, + "step": 11721 + }, + { + "epoch": 1.0619918008652125, + "grad_norm": 0.8350821733474731, + "learning_rate": 0.00012921524799130066, + "loss": 2.7768, + "step": 11722 + }, + { + "epoch": 1.0620823990396593, + "grad_norm": 0.8499355912208557, + "learning_rate": 0.00012920920679030992, + "loss": 2.8832, + "step": 11723 + }, + { + "epoch": 1.062172997214106, + "grad_norm": 0.8081416487693787, + "learning_rate": 0.00012920316558931916, + "loss": 2.6279, + "step": 11724 + }, + { + "epoch": 1.0622635953885529, + "grad_norm": 0.8985013961791992, + "learning_rate": 0.00012919712438832842, + "loss": 2.8642, + "step": 11725 + }, + { + "epoch": 1.0623541935629996, + "grad_norm": 0.9121807217597961, + "learning_rate": 0.00012919108318733765, + "loss": 2.8844, + "step": 11726 + }, + { + "epoch": 1.0624447917374464, + "grad_norm": 0.8657879829406738, + "learning_rate": 0.00012918504198634688, + "loss": 2.5991, + "step": 11727 + }, + { + "epoch": 1.0625353899118932, + "grad_norm": 0.8811066150665283, + "learning_rate": 0.00012917900078535615, + "loss": 2.9838, + "step": 11728 + }, + { + "epoch": 1.06262598808634, + "grad_norm": 0.7824976444244385, + "learning_rate": 0.00012917295958436538, + "loss": 2.0369, + "step": 11729 + }, + { + "epoch": 1.0627165862607868, + "grad_norm": 0.9152143597602844, + "learning_rate": 0.00012916691838337461, + "loss": 2.8276, + "step": 11730 + }, + { + "epoch": 1.0628071844352336, + "grad_norm": 0.8390578031539917, + "learning_rate": 0.00012916087718238385, + "loss": 2.5818, + "step": 11731 + }, + { + "epoch": 1.0628977826096804, + "grad_norm": 0.8418065309524536, + "learning_rate": 0.0001291548359813931, + "loss": 2.6533, + "step": 11732 + }, + { + "epoch": 1.0629883807841272, + "grad_norm": 0.8840615153312683, + "learning_rate": 0.00012914879478040237, + "loss": 2.755, + "step": 11733 + }, + { + "epoch": 1.063078978958574, + "grad_norm": 0.9872623682022095, + "learning_rate": 0.00012914275357941158, + "loss": 2.6913, + "step": 11734 + }, + { + "epoch": 1.0631695771330207, + "grad_norm": 0.8885285258293152, + "learning_rate": 0.00012913671237842084, + "loss": 2.5935, + "step": 11735 + }, + { + "epoch": 1.0632601753074675, + "grad_norm": 0.8467036485671997, + "learning_rate": 0.00012913067117743007, + "loss": 2.576, + "step": 11736 + }, + { + "epoch": 1.0633507734819143, + "grad_norm": 0.8836548924446106, + "learning_rate": 0.00012912462997643933, + "loss": 2.4582, + "step": 11737 + }, + { + "epoch": 1.063441371656361, + "grad_norm": 0.9283885359764099, + "learning_rate": 0.00012911858877544857, + "loss": 2.7877, + "step": 11738 + }, + { + "epoch": 1.0635319698308079, + "grad_norm": 0.8763395547866821, + "learning_rate": 0.0001291125475744578, + "loss": 2.8749, + "step": 11739 + }, + { + "epoch": 1.0636225680052547, + "grad_norm": 0.8879559636116028, + "learning_rate": 0.00012910650637346706, + "loss": 2.6899, + "step": 11740 + }, + { + "epoch": 1.0637131661797015, + "grad_norm": 0.8565288782119751, + "learning_rate": 0.0001291004651724763, + "loss": 2.8414, + "step": 11741 + }, + { + "epoch": 1.0638037643541483, + "grad_norm": 0.8538507223129272, + "learning_rate": 0.00012909442397148556, + "loss": 2.7168, + "step": 11742 + }, + { + "epoch": 1.063894362528595, + "grad_norm": 0.8195162415504456, + "learning_rate": 0.00012908838277049477, + "loss": 2.0543, + "step": 11743 + }, + { + "epoch": 1.0639849607030418, + "grad_norm": 0.8726834058761597, + "learning_rate": 0.00012908234156950403, + "loss": 2.8967, + "step": 11744 + }, + { + "epoch": 1.0640755588774886, + "grad_norm": 0.8946143984794617, + "learning_rate": 0.00012907630036851326, + "loss": 2.5516, + "step": 11745 + }, + { + "epoch": 1.0641661570519354, + "grad_norm": 0.868773877620697, + "learning_rate": 0.00012907025916752252, + "loss": 2.8129, + "step": 11746 + }, + { + "epoch": 1.0642567552263822, + "grad_norm": 0.951991617679596, + "learning_rate": 0.00012906421796653173, + "loss": 2.7712, + "step": 11747 + }, + { + "epoch": 1.064347353400829, + "grad_norm": 0.8728275299072266, + "learning_rate": 0.000129058176765541, + "loss": 2.6349, + "step": 11748 + }, + { + "epoch": 1.0644379515752758, + "grad_norm": 0.8526418209075928, + "learning_rate": 0.00012905213556455025, + "loss": 2.7498, + "step": 11749 + }, + { + "epoch": 1.0645285497497226, + "grad_norm": 0.8226845860481262, + "learning_rate": 0.00012904609436355948, + "loss": 2.746, + "step": 11750 + }, + { + "epoch": 1.0646191479241693, + "grad_norm": 0.8733077645301819, + "learning_rate": 0.00012904005316256872, + "loss": 2.843, + "step": 11751 + }, + { + "epoch": 1.0647097460986161, + "grad_norm": 0.8756088614463806, + "learning_rate": 0.00012903401196157795, + "loss": 2.611, + "step": 11752 + }, + { + "epoch": 1.064800344273063, + "grad_norm": 0.8210937976837158, + "learning_rate": 0.00012902797076058721, + "loss": 2.451, + "step": 11753 + }, + { + "epoch": 1.0648909424475097, + "grad_norm": 0.900534987449646, + "learning_rate": 0.00012902192955959645, + "loss": 2.8578, + "step": 11754 + }, + { + "epoch": 1.0649815406219565, + "grad_norm": 0.8325604200363159, + "learning_rate": 0.0001290158883586057, + "loss": 2.6043, + "step": 11755 + }, + { + "epoch": 1.0650721387964033, + "grad_norm": 0.9048058986663818, + "learning_rate": 0.00012900984715761494, + "loss": 2.725, + "step": 11756 + }, + { + "epoch": 1.06516273697085, + "grad_norm": 0.8885524272918701, + "learning_rate": 0.00012900380595662418, + "loss": 2.9137, + "step": 11757 + }, + { + "epoch": 1.0652533351452969, + "grad_norm": 1.051479697227478, + "learning_rate": 0.00012899776475563344, + "loss": 2.6538, + "step": 11758 + }, + { + "epoch": 1.0653439333197436, + "grad_norm": 0.8635042905807495, + "learning_rate": 0.00012899172355464267, + "loss": 2.798, + "step": 11759 + }, + { + "epoch": 1.0654345314941904, + "grad_norm": 1.0564429759979248, + "learning_rate": 0.0001289856823536519, + "loss": 2.5908, + "step": 11760 + }, + { + "epoch": 1.0655251296686372, + "grad_norm": 0.9091120958328247, + "learning_rate": 0.00012897964115266114, + "loss": 2.7407, + "step": 11761 + }, + { + "epoch": 1.065615727843084, + "grad_norm": 0.9328336119651794, + "learning_rate": 0.0001289735999516704, + "loss": 2.7, + "step": 11762 + }, + { + "epoch": 1.0657063260175308, + "grad_norm": 0.7929255366325378, + "learning_rate": 0.00012896755875067966, + "loss": 2.0875, + "step": 11763 + }, + { + "epoch": 1.0657969241919776, + "grad_norm": 0.8895715475082397, + "learning_rate": 0.00012896151754968887, + "loss": 2.7966, + "step": 11764 + }, + { + "epoch": 1.0658875223664244, + "grad_norm": 0.8720507025718689, + "learning_rate": 0.00012895547634869813, + "loss": 2.8136, + "step": 11765 + }, + { + "epoch": 1.0659781205408712, + "grad_norm": 0.8612650036811829, + "learning_rate": 0.00012894943514770737, + "loss": 2.4933, + "step": 11766 + }, + { + "epoch": 1.066068718715318, + "grad_norm": 0.9343265891075134, + "learning_rate": 0.00012894339394671663, + "loss": 2.9438, + "step": 11767 + }, + { + "epoch": 1.0661593168897647, + "grad_norm": 0.7662745714187622, + "learning_rate": 0.00012893735274572586, + "loss": 2.055, + "step": 11768 + }, + { + "epoch": 1.0662499150642115, + "grad_norm": 0.8518401384353638, + "learning_rate": 0.0001289313115447351, + "loss": 2.8548, + "step": 11769 + }, + { + "epoch": 1.0663405132386583, + "grad_norm": 0.8979755640029907, + "learning_rate": 0.00012892527034374436, + "loss": 2.8075, + "step": 11770 + }, + { + "epoch": 1.066431111413105, + "grad_norm": 0.803612232208252, + "learning_rate": 0.0001289192291427536, + "loss": 2.6833, + "step": 11771 + }, + { + "epoch": 1.0665217095875519, + "grad_norm": 0.7509040236473083, + "learning_rate": 0.00012891318794176282, + "loss": 2.0924, + "step": 11772 + }, + { + "epoch": 1.0666123077619987, + "grad_norm": 0.8575763702392578, + "learning_rate": 0.00012890714674077206, + "loss": 2.3643, + "step": 11773 + }, + { + "epoch": 1.0667029059364455, + "grad_norm": 0.9068167805671692, + "learning_rate": 0.00012890110553978132, + "loss": 2.7391, + "step": 11774 + }, + { + "epoch": 1.0667935041108922, + "grad_norm": 0.8964870572090149, + "learning_rate": 0.00012889506433879055, + "loss": 2.7069, + "step": 11775 + }, + { + "epoch": 1.066884102285339, + "grad_norm": 0.9236223697662354, + "learning_rate": 0.00012888902313779981, + "loss": 2.8898, + "step": 11776 + }, + { + "epoch": 1.0669747004597858, + "grad_norm": 0.8080416917800903, + "learning_rate": 0.00012888298193680902, + "loss": 2.5177, + "step": 11777 + }, + { + "epoch": 1.0670652986342326, + "grad_norm": 0.8630039691925049, + "learning_rate": 0.00012887694073581828, + "loss": 2.6936, + "step": 11778 + }, + { + "epoch": 1.0671558968086794, + "grad_norm": 0.5984112620353699, + "learning_rate": 0.00012887089953482754, + "loss": 1.2443, + "step": 11779 + }, + { + "epoch": 1.0672464949831262, + "grad_norm": 0.8946981430053711, + "learning_rate": 0.00012886485833383678, + "loss": 2.5798, + "step": 11780 + }, + { + "epoch": 1.067337093157573, + "grad_norm": 0.8943532705307007, + "learning_rate": 0.000128858817132846, + "loss": 2.8461, + "step": 11781 + }, + { + "epoch": 1.0674276913320198, + "grad_norm": 0.8532477617263794, + "learning_rate": 0.00012885277593185525, + "loss": 2.7659, + "step": 11782 + }, + { + "epoch": 1.0675182895064665, + "grad_norm": 0.9259388446807861, + "learning_rate": 0.0001288467347308645, + "loss": 2.7205, + "step": 11783 + }, + { + "epoch": 1.0676088876809133, + "grad_norm": 1.0531439781188965, + "learning_rate": 0.00012884069352987374, + "loss": 2.6828, + "step": 11784 + }, + { + "epoch": 1.0676994858553601, + "grad_norm": 0.8542975783348083, + "learning_rate": 0.00012883465232888297, + "loss": 2.4945, + "step": 11785 + }, + { + "epoch": 1.0677900840298067, + "grad_norm": 0.8199741840362549, + "learning_rate": 0.00012882861112789224, + "loss": 2.6124, + "step": 11786 + }, + { + "epoch": 1.0678806822042537, + "grad_norm": 0.8453740477561951, + "learning_rate": 0.00012882256992690147, + "loss": 2.8041, + "step": 11787 + }, + { + "epoch": 1.0679712803787003, + "grad_norm": 0.8984143733978271, + "learning_rate": 0.00012881652872591073, + "loss": 2.6358, + "step": 11788 + }, + { + "epoch": 1.0680618785531473, + "grad_norm": 0.8824708461761475, + "learning_rate": 0.00012881048752491997, + "loss": 2.6944, + "step": 11789 + }, + { + "epoch": 1.0681524767275938, + "grad_norm": 0.7981746196746826, + "learning_rate": 0.0001288044463239292, + "loss": 2.3761, + "step": 11790 + }, + { + "epoch": 1.0682430749020406, + "grad_norm": 0.7751597166061401, + "learning_rate": 0.00012879840512293843, + "loss": 2.0459, + "step": 11791 + }, + { + "epoch": 1.0683336730764874, + "grad_norm": 1.0008225440979004, + "learning_rate": 0.0001287923639219477, + "loss": 2.8595, + "step": 11792 + }, + { + "epoch": 1.0684242712509342, + "grad_norm": 0.9535751938819885, + "learning_rate": 0.00012878632272095696, + "loss": 2.5423, + "step": 11793 + }, + { + "epoch": 1.068514869425381, + "grad_norm": 0.7887536287307739, + "learning_rate": 0.00012878028151996616, + "loss": 2.1918, + "step": 11794 + }, + { + "epoch": 1.0686054675998278, + "grad_norm": 0.8673492074012756, + "learning_rate": 0.00012877424031897542, + "loss": 2.7597, + "step": 11795 + }, + { + "epoch": 1.0686960657742746, + "grad_norm": 0.8494321703910828, + "learning_rate": 0.00012876819911798466, + "loss": 2.5795, + "step": 11796 + }, + { + "epoch": 1.0687866639487213, + "grad_norm": 0.9189808368682861, + "learning_rate": 0.00012876215791699392, + "loss": 2.7618, + "step": 11797 + }, + { + "epoch": 1.0688772621231681, + "grad_norm": 0.8865576982498169, + "learning_rate": 0.00012875611671600313, + "loss": 2.5991, + "step": 11798 + }, + { + "epoch": 1.068967860297615, + "grad_norm": 0.8676732778549194, + "learning_rate": 0.0001287500755150124, + "loss": 2.6654, + "step": 11799 + }, + { + "epoch": 1.0690584584720617, + "grad_norm": 0.8921017050743103, + "learning_rate": 0.00012874403431402165, + "loss": 2.6939, + "step": 11800 + }, + { + "epoch": 1.0691490566465085, + "grad_norm": 0.8530886173248291, + "learning_rate": 0.00012873799311303088, + "loss": 2.6316, + "step": 11801 + }, + { + "epoch": 1.0692396548209553, + "grad_norm": 0.8662992119789124, + "learning_rate": 0.00012873195191204012, + "loss": 2.5522, + "step": 11802 + }, + { + "epoch": 1.069330252995402, + "grad_norm": 0.8875656127929688, + "learning_rate": 0.00012872591071104935, + "loss": 2.6169, + "step": 11803 + }, + { + "epoch": 1.0694208511698489, + "grad_norm": 0.9148658514022827, + "learning_rate": 0.0001287198695100586, + "loss": 2.5637, + "step": 11804 + }, + { + "epoch": 1.0695114493442957, + "grad_norm": 0.8114504218101501, + "learning_rate": 0.00012871382830906785, + "loss": 2.2408, + "step": 11805 + }, + { + "epoch": 1.0696020475187424, + "grad_norm": 0.8795102834701538, + "learning_rate": 0.0001287077871080771, + "loss": 2.7053, + "step": 11806 + }, + { + "epoch": 1.0696926456931892, + "grad_norm": 0.9476529359817505, + "learning_rate": 0.00012870174590708631, + "loss": 2.8302, + "step": 11807 + }, + { + "epoch": 1.069783243867636, + "grad_norm": 0.9200076460838318, + "learning_rate": 0.00012869570470609557, + "loss": 2.5939, + "step": 11808 + }, + { + "epoch": 1.0698738420420828, + "grad_norm": 0.9327757954597473, + "learning_rate": 0.00012868966350510484, + "loss": 3.0246, + "step": 11809 + }, + { + "epoch": 1.0699644402165296, + "grad_norm": 0.9557564854621887, + "learning_rate": 0.00012868362230411407, + "loss": 2.5081, + "step": 11810 + }, + { + "epoch": 1.0700550383909764, + "grad_norm": 0.8962368965148926, + "learning_rate": 0.0001286775811031233, + "loss": 2.8216, + "step": 11811 + }, + { + "epoch": 1.0701456365654232, + "grad_norm": 0.9638885259628296, + "learning_rate": 0.00012867153990213254, + "loss": 2.9828, + "step": 11812 + }, + { + "epoch": 1.07023623473987, + "grad_norm": 0.8667038679122925, + "learning_rate": 0.0001286654987011418, + "loss": 2.6778, + "step": 11813 + }, + { + "epoch": 1.0703268329143167, + "grad_norm": 0.938968300819397, + "learning_rate": 0.00012865945750015103, + "loss": 2.7742, + "step": 11814 + }, + { + "epoch": 1.0704174310887635, + "grad_norm": 0.9015342593193054, + "learning_rate": 0.00012865341629916027, + "loss": 2.7335, + "step": 11815 + }, + { + "epoch": 1.0705080292632103, + "grad_norm": 0.7699699401855469, + "learning_rate": 0.00012864737509816953, + "loss": 2.1404, + "step": 11816 + }, + { + "epoch": 1.070598627437657, + "grad_norm": 0.8845568895339966, + "learning_rate": 0.00012864133389717876, + "loss": 2.4474, + "step": 11817 + }, + { + "epoch": 1.070689225612104, + "grad_norm": 0.8780108094215393, + "learning_rate": 0.00012863529269618802, + "loss": 2.824, + "step": 11818 + }, + { + "epoch": 1.0707798237865507, + "grad_norm": 0.867621660232544, + "learning_rate": 0.00012862925149519726, + "loss": 2.9124, + "step": 11819 + }, + { + "epoch": 1.0708704219609975, + "grad_norm": 0.8491523861885071, + "learning_rate": 0.0001286232102942065, + "loss": 2.5645, + "step": 11820 + }, + { + "epoch": 1.0709610201354443, + "grad_norm": 0.9328508377075195, + "learning_rate": 0.00012861716909321573, + "loss": 2.7016, + "step": 11821 + }, + { + "epoch": 1.071051618309891, + "grad_norm": 0.9482867121696472, + "learning_rate": 0.000128611127892225, + "loss": 2.692, + "step": 11822 + }, + { + "epoch": 1.0711422164843378, + "grad_norm": 0.9373480081558228, + "learning_rate": 0.00012860508669123422, + "loss": 2.6465, + "step": 11823 + }, + { + "epoch": 1.0712328146587846, + "grad_norm": 0.8594893217086792, + "learning_rate": 0.00012859904549024346, + "loss": 2.7285, + "step": 11824 + }, + { + "epoch": 1.0713234128332314, + "grad_norm": 0.7143989205360413, + "learning_rate": 0.00012859300428925272, + "loss": 1.8631, + "step": 11825 + }, + { + "epoch": 1.0714140110076782, + "grad_norm": 0.862885594367981, + "learning_rate": 0.00012858696308826195, + "loss": 2.776, + "step": 11826 + }, + { + "epoch": 1.071504609182125, + "grad_norm": 0.8397030234336853, + "learning_rate": 0.0001285809218872712, + "loss": 2.5334, + "step": 11827 + }, + { + "epoch": 1.0715952073565718, + "grad_norm": 0.9180647134780884, + "learning_rate": 0.00012857488068628042, + "loss": 2.7911, + "step": 11828 + }, + { + "epoch": 1.0716858055310186, + "grad_norm": 0.9477965235710144, + "learning_rate": 0.00012856883948528968, + "loss": 2.9983, + "step": 11829 + }, + { + "epoch": 1.0717764037054653, + "grad_norm": 0.9566825032234192, + "learning_rate": 0.00012856279828429894, + "loss": 2.4821, + "step": 11830 + }, + { + "epoch": 1.0718670018799121, + "grad_norm": 0.8899033069610596, + "learning_rate": 0.00012855675708330818, + "loss": 2.7652, + "step": 11831 + }, + { + "epoch": 1.071957600054359, + "grad_norm": 0.8937165141105652, + "learning_rate": 0.0001285507158823174, + "loss": 2.691, + "step": 11832 + }, + { + "epoch": 1.0720481982288057, + "grad_norm": 0.8971630930900574, + "learning_rate": 0.00012854467468132664, + "loss": 2.6353, + "step": 11833 + }, + { + "epoch": 1.0721387964032525, + "grad_norm": 0.88340163230896, + "learning_rate": 0.0001285386334803359, + "loss": 2.6746, + "step": 11834 + }, + { + "epoch": 1.0722293945776993, + "grad_norm": 0.9314581751823425, + "learning_rate": 0.00012853259227934514, + "loss": 2.5881, + "step": 11835 + }, + { + "epoch": 1.072319992752146, + "grad_norm": 0.9354996681213379, + "learning_rate": 0.00012852655107835437, + "loss": 2.7987, + "step": 11836 + }, + { + "epoch": 1.0724105909265929, + "grad_norm": 0.8503134250640869, + "learning_rate": 0.0001285205098773636, + "loss": 2.5156, + "step": 11837 + }, + { + "epoch": 1.0725011891010396, + "grad_norm": 0.8720304369926453, + "learning_rate": 0.00012851446867637287, + "loss": 2.4627, + "step": 11838 + }, + { + "epoch": 1.0725917872754864, + "grad_norm": 0.8529791235923767, + "learning_rate": 0.00012850842747538213, + "loss": 2.6722, + "step": 11839 + }, + { + "epoch": 1.0726823854499332, + "grad_norm": 0.9803540110588074, + "learning_rate": 0.00012850238627439136, + "loss": 2.6449, + "step": 11840 + }, + { + "epoch": 1.07277298362438, + "grad_norm": 0.8729098439216614, + "learning_rate": 0.0001284963450734006, + "loss": 2.6145, + "step": 11841 + }, + { + "epoch": 1.0728635817988268, + "grad_norm": 0.8973535895347595, + "learning_rate": 0.00012849030387240983, + "loss": 2.8397, + "step": 11842 + }, + { + "epoch": 1.0729541799732736, + "grad_norm": 0.8562145233154297, + "learning_rate": 0.0001284842626714191, + "loss": 2.771, + "step": 11843 + }, + { + "epoch": 1.0730447781477204, + "grad_norm": 0.7996718883514404, + "learning_rate": 0.00012847822147042833, + "loss": 2.6585, + "step": 11844 + }, + { + "epoch": 1.0731353763221672, + "grad_norm": 0.8815382719039917, + "learning_rate": 0.00012847218026943756, + "loss": 2.4756, + "step": 11845 + }, + { + "epoch": 1.073225974496614, + "grad_norm": 0.815762996673584, + "learning_rate": 0.00012846613906844682, + "loss": 2.5686, + "step": 11846 + }, + { + "epoch": 1.0733165726710607, + "grad_norm": 0.8711830973625183, + "learning_rate": 0.00012846009786745606, + "loss": 2.9009, + "step": 11847 + }, + { + "epoch": 1.0734071708455075, + "grad_norm": 0.8619061708450317, + "learning_rate": 0.00012845405666646532, + "loss": 2.4403, + "step": 11848 + }, + { + "epoch": 1.0734977690199543, + "grad_norm": 0.8925482630729675, + "learning_rate": 0.00012844801546547452, + "loss": 2.8944, + "step": 11849 + }, + { + "epoch": 1.073588367194401, + "grad_norm": 0.8771618008613586, + "learning_rate": 0.00012844197426448378, + "loss": 2.5817, + "step": 11850 + }, + { + "epoch": 1.0736789653688479, + "grad_norm": 0.9145684838294983, + "learning_rate": 0.00012843593306349302, + "loss": 2.7833, + "step": 11851 + }, + { + "epoch": 1.0737695635432947, + "grad_norm": 0.889226496219635, + "learning_rate": 0.00012842989186250228, + "loss": 2.7999, + "step": 11852 + }, + { + "epoch": 1.0738601617177415, + "grad_norm": 0.8501346111297607, + "learning_rate": 0.00012842385066151151, + "loss": 2.5571, + "step": 11853 + }, + { + "epoch": 1.0739507598921882, + "grad_norm": 0.8091066479682922, + "learning_rate": 0.00012841780946052075, + "loss": 2.0808, + "step": 11854 + }, + { + "epoch": 1.074041358066635, + "grad_norm": 0.9069276452064514, + "learning_rate": 0.00012841176825953, + "loss": 2.7974, + "step": 11855 + }, + { + "epoch": 1.0741319562410818, + "grad_norm": 0.7629657983779907, + "learning_rate": 0.00012840572705853924, + "loss": 1.968, + "step": 11856 + }, + { + "epoch": 1.0742225544155286, + "grad_norm": 0.8832945227622986, + "learning_rate": 0.00012839968585754848, + "loss": 2.6241, + "step": 11857 + }, + { + "epoch": 1.0743131525899754, + "grad_norm": 0.9419539570808411, + "learning_rate": 0.0001283936446565577, + "loss": 2.6909, + "step": 11858 + }, + { + "epoch": 1.0744037507644222, + "grad_norm": 0.9642343521118164, + "learning_rate": 0.00012838760345556697, + "loss": 2.8906, + "step": 11859 + }, + { + "epoch": 1.074494348938869, + "grad_norm": 0.9008277654647827, + "learning_rate": 0.00012838156225457623, + "loss": 2.9769, + "step": 11860 + }, + { + "epoch": 1.0745849471133158, + "grad_norm": 0.8739026188850403, + "learning_rate": 0.00012837552105358547, + "loss": 2.761, + "step": 11861 + }, + { + "epoch": 1.0746755452877625, + "grad_norm": 0.8532060384750366, + "learning_rate": 0.0001283694798525947, + "loss": 2.6945, + "step": 11862 + }, + { + "epoch": 1.0747661434622093, + "grad_norm": 0.7950068712234497, + "learning_rate": 0.00012836343865160394, + "loss": 2.2117, + "step": 11863 + }, + { + "epoch": 1.0748567416366561, + "grad_norm": 0.8447049260139465, + "learning_rate": 0.0001283573974506132, + "loss": 2.6785, + "step": 11864 + }, + { + "epoch": 1.074947339811103, + "grad_norm": 0.8905567526817322, + "learning_rate": 0.00012835135624962243, + "loss": 2.4146, + "step": 11865 + }, + { + "epoch": 1.0750379379855497, + "grad_norm": 0.8665145039558411, + "learning_rate": 0.00012834531504863167, + "loss": 2.5143, + "step": 11866 + }, + { + "epoch": 1.0751285361599963, + "grad_norm": 0.9754670262336731, + "learning_rate": 0.0001283392738476409, + "loss": 2.9948, + "step": 11867 + }, + { + "epoch": 1.0752191343344433, + "grad_norm": 0.958648145198822, + "learning_rate": 0.00012833323264665016, + "loss": 2.8925, + "step": 11868 + }, + { + "epoch": 1.0753097325088898, + "grad_norm": 0.8568214774131775, + "learning_rate": 0.00012832719144565942, + "loss": 2.8484, + "step": 11869 + }, + { + "epoch": 1.0754003306833368, + "grad_norm": 0.8722485303878784, + "learning_rate": 0.00012832115024466866, + "loss": 2.7284, + "step": 11870 + }, + { + "epoch": 1.0754909288577834, + "grad_norm": 0.8813199400901794, + "learning_rate": 0.0001283151090436779, + "loss": 2.6699, + "step": 11871 + }, + { + "epoch": 1.0755815270322302, + "grad_norm": 0.8627406358718872, + "learning_rate": 0.00012830906784268712, + "loss": 2.7832, + "step": 11872 + }, + { + "epoch": 1.075672125206677, + "grad_norm": 0.7776669263839722, + "learning_rate": 0.00012830302664169638, + "loss": 2.248, + "step": 11873 + }, + { + "epoch": 1.0757627233811238, + "grad_norm": 0.8841536641120911, + "learning_rate": 0.00012829698544070562, + "loss": 2.6015, + "step": 11874 + }, + { + "epoch": 1.0758533215555706, + "grad_norm": 0.8137585520744324, + "learning_rate": 0.00012829094423971485, + "loss": 2.3473, + "step": 11875 + }, + { + "epoch": 1.0759439197300174, + "grad_norm": 0.9083623886108398, + "learning_rate": 0.00012828490303872411, + "loss": 2.7292, + "step": 11876 + }, + { + "epoch": 1.0760345179044641, + "grad_norm": 0.9397425651550293, + "learning_rate": 0.00012827886183773335, + "loss": 2.5598, + "step": 11877 + }, + { + "epoch": 1.076125116078911, + "grad_norm": 0.8925959467887878, + "learning_rate": 0.0001282728206367426, + "loss": 3.1444, + "step": 11878 + }, + { + "epoch": 1.0762157142533577, + "grad_norm": 0.832234263420105, + "learning_rate": 0.00012826677943575182, + "loss": 2.4748, + "step": 11879 + }, + { + "epoch": 1.0763063124278045, + "grad_norm": 0.9223424792289734, + "learning_rate": 0.00012826073823476108, + "loss": 2.5146, + "step": 11880 + }, + { + "epoch": 1.0763969106022513, + "grad_norm": 0.8699928522109985, + "learning_rate": 0.0001282546970337703, + "loss": 2.7963, + "step": 11881 + }, + { + "epoch": 1.076487508776698, + "grad_norm": 0.852371096611023, + "learning_rate": 0.00012824865583277957, + "loss": 2.5018, + "step": 11882 + }, + { + "epoch": 1.0765781069511449, + "grad_norm": 1.024376392364502, + "learning_rate": 0.0001282426146317888, + "loss": 2.7305, + "step": 11883 + }, + { + "epoch": 1.0766687051255917, + "grad_norm": 0.9091702103614807, + "learning_rate": 0.00012823657343079804, + "loss": 2.628, + "step": 11884 + }, + { + "epoch": 1.0767593033000384, + "grad_norm": 0.8661814332008362, + "learning_rate": 0.0001282305322298073, + "loss": 2.754, + "step": 11885 + }, + { + "epoch": 1.0768499014744852, + "grad_norm": 0.9683201313018799, + "learning_rate": 0.00012822449102881654, + "loss": 2.6754, + "step": 11886 + }, + { + "epoch": 1.076940499648932, + "grad_norm": 0.8739655017852783, + "learning_rate": 0.00012821844982782577, + "loss": 2.7703, + "step": 11887 + }, + { + "epoch": 1.0770310978233788, + "grad_norm": 0.8821905851364136, + "learning_rate": 0.000128212408626835, + "loss": 2.5461, + "step": 11888 + }, + { + "epoch": 1.0771216959978256, + "grad_norm": 0.9057590365409851, + "learning_rate": 0.00012820636742584427, + "loss": 2.6637, + "step": 11889 + }, + { + "epoch": 1.0772122941722724, + "grad_norm": 0.8229437470436096, + "learning_rate": 0.00012820032622485353, + "loss": 2.4613, + "step": 11890 + }, + { + "epoch": 1.0773028923467192, + "grad_norm": 0.8478895425796509, + "learning_rate": 0.00012819428502386276, + "loss": 2.6167, + "step": 11891 + }, + { + "epoch": 1.077393490521166, + "grad_norm": 1.0037941932678223, + "learning_rate": 0.000128188243822872, + "loss": 2.946, + "step": 11892 + }, + { + "epoch": 1.0774840886956127, + "grad_norm": 0.8422245979309082, + "learning_rate": 0.00012818220262188123, + "loss": 2.6286, + "step": 11893 + }, + { + "epoch": 1.0775746868700595, + "grad_norm": 0.7683339715003967, + "learning_rate": 0.0001281761614208905, + "loss": 2.1579, + "step": 11894 + }, + { + "epoch": 1.0776652850445063, + "grad_norm": 0.8703526258468628, + "learning_rate": 0.00012817012021989972, + "loss": 2.5601, + "step": 11895 + }, + { + "epoch": 1.077755883218953, + "grad_norm": 0.9241020679473877, + "learning_rate": 0.00012816407901890896, + "loss": 2.9329, + "step": 11896 + }, + { + "epoch": 1.0778464813934, + "grad_norm": 0.93199622631073, + "learning_rate": 0.0001281580378179182, + "loss": 2.5965, + "step": 11897 + }, + { + "epoch": 1.0779370795678467, + "grad_norm": 0.8630263209342957, + "learning_rate": 0.00012815199661692745, + "loss": 2.4576, + "step": 11898 + }, + { + "epoch": 1.0780276777422935, + "grad_norm": 0.9033961892127991, + "learning_rate": 0.00012814595541593671, + "loss": 2.4744, + "step": 11899 + }, + { + "epoch": 1.0781182759167403, + "grad_norm": 0.8311471343040466, + "learning_rate": 0.00012813991421494592, + "loss": 2.7392, + "step": 11900 + }, + { + "epoch": 1.078208874091187, + "grad_norm": 0.8729906678199768, + "learning_rate": 0.00012813387301395518, + "loss": 2.7433, + "step": 11901 + }, + { + "epoch": 1.0782994722656338, + "grad_norm": 0.9253895282745361, + "learning_rate": 0.00012812783181296442, + "loss": 2.8465, + "step": 11902 + }, + { + "epoch": 1.0783900704400806, + "grad_norm": 0.9314908385276794, + "learning_rate": 0.00012812179061197368, + "loss": 2.5327, + "step": 11903 + }, + { + "epoch": 1.0784806686145274, + "grad_norm": 0.8926653265953064, + "learning_rate": 0.0001281157494109829, + "loss": 2.6998, + "step": 11904 + }, + { + "epoch": 1.0785712667889742, + "grad_norm": 0.866193950176239, + "learning_rate": 0.00012810970820999215, + "loss": 2.6476, + "step": 11905 + }, + { + "epoch": 1.078661864963421, + "grad_norm": 0.8494387269020081, + "learning_rate": 0.0001281036670090014, + "loss": 2.663, + "step": 11906 + }, + { + "epoch": 1.0787524631378678, + "grad_norm": 0.9402913451194763, + "learning_rate": 0.00012809762580801064, + "loss": 2.6949, + "step": 11907 + }, + { + "epoch": 1.0788430613123146, + "grad_norm": 0.9108999371528625, + "learning_rate": 0.00012809158460701987, + "loss": 2.6969, + "step": 11908 + }, + { + "epoch": 1.0789336594867613, + "grad_norm": 0.8893679976463318, + "learning_rate": 0.0001280855434060291, + "loss": 2.6915, + "step": 11909 + }, + { + "epoch": 1.0790242576612081, + "grad_norm": 0.8886106610298157, + "learning_rate": 0.00012807950220503837, + "loss": 2.4788, + "step": 11910 + }, + { + "epoch": 1.079114855835655, + "grad_norm": 0.8996275067329407, + "learning_rate": 0.0001280734610040476, + "loss": 2.8259, + "step": 11911 + }, + { + "epoch": 1.0792054540101017, + "grad_norm": 0.902982234954834, + "learning_rate": 0.00012806741980305687, + "loss": 2.6249, + "step": 11912 + }, + { + "epoch": 1.0792960521845485, + "grad_norm": 1.116231918334961, + "learning_rate": 0.0001280613786020661, + "loss": 2.65, + "step": 11913 + }, + { + "epoch": 1.0793866503589953, + "grad_norm": 0.8636390566825867, + "learning_rate": 0.00012805533740107533, + "loss": 2.732, + "step": 11914 + }, + { + "epoch": 1.079477248533442, + "grad_norm": 0.9410573840141296, + "learning_rate": 0.0001280492962000846, + "loss": 2.4826, + "step": 11915 + }, + { + "epoch": 1.0795678467078889, + "grad_norm": 0.8527320027351379, + "learning_rate": 0.00012804325499909383, + "loss": 2.6044, + "step": 11916 + }, + { + "epoch": 1.0796584448823356, + "grad_norm": 0.6122452020645142, + "learning_rate": 0.00012803721379810306, + "loss": 1.5293, + "step": 11917 + }, + { + "epoch": 1.0797490430567824, + "grad_norm": 0.8396362662315369, + "learning_rate": 0.0001280311725971123, + "loss": 2.7453, + "step": 11918 + }, + { + "epoch": 1.0798396412312292, + "grad_norm": 0.8494639992713928, + "learning_rate": 0.00012802513139612156, + "loss": 2.637, + "step": 11919 + }, + { + "epoch": 1.079930239405676, + "grad_norm": 0.8605418801307678, + "learning_rate": 0.00012801909019513082, + "loss": 2.5171, + "step": 11920 + }, + { + "epoch": 1.0800208375801228, + "grad_norm": 0.8740498423576355, + "learning_rate": 0.00012801304899414003, + "loss": 2.714, + "step": 11921 + }, + { + "epoch": 1.0801114357545696, + "grad_norm": 0.9541677832603455, + "learning_rate": 0.0001280070077931493, + "loss": 2.5425, + "step": 11922 + }, + { + "epoch": 1.0802020339290164, + "grad_norm": 0.8652496933937073, + "learning_rate": 0.00012800096659215852, + "loss": 2.8014, + "step": 11923 + }, + { + "epoch": 1.0802926321034632, + "grad_norm": 0.8788346648216248, + "learning_rate": 0.00012799492539116778, + "loss": 2.755, + "step": 11924 + }, + { + "epoch": 1.08038323027791, + "grad_norm": 0.8809582591056824, + "learning_rate": 0.00012798888419017702, + "loss": 2.7935, + "step": 11925 + }, + { + "epoch": 1.0804738284523567, + "grad_norm": 0.8879252076148987, + "learning_rate": 0.00012798284298918625, + "loss": 2.7872, + "step": 11926 + }, + { + "epoch": 1.0805644266268035, + "grad_norm": 0.7457955479621887, + "learning_rate": 0.00012797680178819548, + "loss": 1.9304, + "step": 11927 + }, + { + "epoch": 1.0806550248012503, + "grad_norm": 0.9182634949684143, + "learning_rate": 0.00012797076058720475, + "loss": 3.0592, + "step": 11928 + }, + { + "epoch": 1.080745622975697, + "grad_norm": 0.9068487882614136, + "learning_rate": 0.000127964719386214, + "loss": 2.5994, + "step": 11929 + }, + { + "epoch": 1.0808362211501439, + "grad_norm": 0.9387731552124023, + "learning_rate": 0.00012795867818522321, + "loss": 2.8072, + "step": 11930 + }, + { + "epoch": 1.0809268193245907, + "grad_norm": 0.843265175819397, + "learning_rate": 0.00012795263698423247, + "loss": 2.5736, + "step": 11931 + }, + { + "epoch": 1.0810174174990375, + "grad_norm": 0.8943771719932556, + "learning_rate": 0.0001279465957832417, + "loss": 2.7743, + "step": 11932 + }, + { + "epoch": 1.0811080156734842, + "grad_norm": 0.9131454825401306, + "learning_rate": 0.00012794055458225097, + "loss": 2.7605, + "step": 11933 + }, + { + "epoch": 1.081198613847931, + "grad_norm": 0.7306744456291199, + "learning_rate": 0.00012793451338126018, + "loss": 2.0812, + "step": 11934 + }, + { + "epoch": 1.0812892120223778, + "grad_norm": 0.8941847085952759, + "learning_rate": 0.00012792847218026944, + "loss": 3.0764, + "step": 11935 + }, + { + "epoch": 1.0813798101968246, + "grad_norm": 0.8813527226448059, + "learning_rate": 0.0001279224309792787, + "loss": 2.6185, + "step": 11936 + }, + { + "epoch": 1.0814704083712714, + "grad_norm": 1.0314940214157104, + "learning_rate": 0.00012791638977828793, + "loss": 2.6031, + "step": 11937 + }, + { + "epoch": 1.0815610065457182, + "grad_norm": 0.8892644643783569, + "learning_rate": 0.00012791034857729717, + "loss": 2.637, + "step": 11938 + }, + { + "epoch": 1.081651604720165, + "grad_norm": 0.8755074143409729, + "learning_rate": 0.0001279043073763064, + "loss": 2.8169, + "step": 11939 + }, + { + "epoch": 1.0817422028946118, + "grad_norm": 0.8997645378112793, + "learning_rate": 0.00012789826617531566, + "loss": 2.6471, + "step": 11940 + }, + { + "epoch": 1.0818328010690585, + "grad_norm": 0.937565267086029, + "learning_rate": 0.0001278922249743249, + "loss": 2.4814, + "step": 11941 + }, + { + "epoch": 1.0819233992435053, + "grad_norm": 0.9019392132759094, + "learning_rate": 0.00012788618377333416, + "loss": 2.6541, + "step": 11942 + }, + { + "epoch": 1.0820139974179521, + "grad_norm": 0.8700132966041565, + "learning_rate": 0.0001278801425723434, + "loss": 2.6505, + "step": 11943 + }, + { + "epoch": 1.082104595592399, + "grad_norm": 0.8727045655250549, + "learning_rate": 0.00012787410137135263, + "loss": 2.4897, + "step": 11944 + }, + { + "epoch": 1.0821951937668457, + "grad_norm": 0.8476653695106506, + "learning_rate": 0.0001278680601703619, + "loss": 2.8092, + "step": 11945 + }, + { + "epoch": 1.0822857919412925, + "grad_norm": 0.8333265781402588, + "learning_rate": 0.00012786201896937112, + "loss": 2.5148, + "step": 11946 + }, + { + "epoch": 1.0823763901157393, + "grad_norm": 0.8527387380599976, + "learning_rate": 0.00012785597776838036, + "loss": 2.4915, + "step": 11947 + }, + { + "epoch": 1.0824669882901858, + "grad_norm": 0.8703212141990662, + "learning_rate": 0.0001278499365673896, + "loss": 2.5978, + "step": 11948 + }, + { + "epoch": 1.0825575864646328, + "grad_norm": 0.754301905632019, + "learning_rate": 0.00012784389536639885, + "loss": 1.9512, + "step": 11949 + }, + { + "epoch": 1.0826481846390794, + "grad_norm": 0.8477096557617188, + "learning_rate": 0.0001278378541654081, + "loss": 2.5863, + "step": 11950 + }, + { + "epoch": 1.0827387828135264, + "grad_norm": 0.8629987835884094, + "learning_rate": 0.00012783181296441732, + "loss": 2.6859, + "step": 11951 + }, + { + "epoch": 1.082829380987973, + "grad_norm": 0.9346020817756653, + "learning_rate": 0.00012782577176342658, + "loss": 2.7, + "step": 11952 + }, + { + "epoch": 1.0829199791624198, + "grad_norm": 0.8910892009735107, + "learning_rate": 0.00012781973056243581, + "loss": 2.7834, + "step": 11953 + }, + { + "epoch": 1.0830105773368666, + "grad_norm": 0.8603152632713318, + "learning_rate": 0.00012781368936144507, + "loss": 2.7585, + "step": 11954 + }, + { + "epoch": 1.0831011755113134, + "grad_norm": 0.921596109867096, + "learning_rate": 0.0001278076481604543, + "loss": 2.7203, + "step": 11955 + }, + { + "epoch": 1.0831917736857601, + "grad_norm": 0.8175771236419678, + "learning_rate": 0.00012780160695946354, + "loss": 2.7907, + "step": 11956 + }, + { + "epoch": 1.083282371860207, + "grad_norm": 0.7343394756317139, + "learning_rate": 0.00012779556575847278, + "loss": 2.0716, + "step": 11957 + }, + { + "epoch": 1.0833729700346537, + "grad_norm": 0.868976354598999, + "learning_rate": 0.00012778952455748204, + "loss": 2.9563, + "step": 11958 + }, + { + "epoch": 1.0834635682091005, + "grad_norm": 0.9853094816207886, + "learning_rate": 0.00012778348335649127, + "loss": 2.5404, + "step": 11959 + }, + { + "epoch": 1.0835541663835473, + "grad_norm": 0.8874332904815674, + "learning_rate": 0.0001277774421555005, + "loss": 2.6382, + "step": 11960 + }, + { + "epoch": 1.083644764557994, + "grad_norm": 0.9213668704032898, + "learning_rate": 0.00012777140095450977, + "loss": 2.9962, + "step": 11961 + }, + { + "epoch": 1.0837353627324409, + "grad_norm": 0.9358189702033997, + "learning_rate": 0.000127765359753519, + "loss": 2.408, + "step": 11962 + }, + { + "epoch": 1.0838259609068877, + "grad_norm": 0.9284489750862122, + "learning_rate": 0.00012775931855252826, + "loss": 2.6388, + "step": 11963 + }, + { + "epoch": 1.0839165590813344, + "grad_norm": 0.9428950548171997, + "learning_rate": 0.00012775327735153747, + "loss": 2.7297, + "step": 11964 + }, + { + "epoch": 1.0840071572557812, + "grad_norm": 0.899553120136261, + "learning_rate": 0.00012774723615054673, + "loss": 2.8772, + "step": 11965 + }, + { + "epoch": 1.084097755430228, + "grad_norm": 0.8799108862876892, + "learning_rate": 0.000127741194949556, + "loss": 2.8761, + "step": 11966 + }, + { + "epoch": 1.0841883536046748, + "grad_norm": 0.8929187059402466, + "learning_rate": 0.00012773515374856523, + "loss": 2.5287, + "step": 11967 + }, + { + "epoch": 1.0842789517791216, + "grad_norm": 0.8355387449264526, + "learning_rate": 0.00012772911254757446, + "loss": 2.6452, + "step": 11968 + }, + { + "epoch": 1.0843695499535684, + "grad_norm": 0.9750003218650818, + "learning_rate": 0.0001277230713465837, + "loss": 2.8265, + "step": 11969 + }, + { + "epoch": 1.0844601481280152, + "grad_norm": 0.868371844291687, + "learning_rate": 0.00012771703014559296, + "loss": 2.811, + "step": 11970 + }, + { + "epoch": 1.084550746302462, + "grad_norm": 0.9189143776893616, + "learning_rate": 0.0001277109889446022, + "loss": 2.8176, + "step": 11971 + }, + { + "epoch": 1.0846413444769087, + "grad_norm": 0.8862923383712769, + "learning_rate": 0.00012770494774361142, + "loss": 2.7279, + "step": 11972 + }, + { + "epoch": 1.0847319426513555, + "grad_norm": 0.8894627094268799, + "learning_rate": 0.00012769890654262068, + "loss": 2.4419, + "step": 11973 + }, + { + "epoch": 1.0848225408258023, + "grad_norm": 0.8938934803009033, + "learning_rate": 0.00012769286534162992, + "loss": 2.5875, + "step": 11974 + }, + { + "epoch": 1.084913139000249, + "grad_norm": 0.8673591017723083, + "learning_rate": 0.00012768682414063918, + "loss": 2.6474, + "step": 11975 + }, + { + "epoch": 1.085003737174696, + "grad_norm": 0.9074490070343018, + "learning_rate": 0.00012768078293964841, + "loss": 2.8423, + "step": 11976 + }, + { + "epoch": 1.0850943353491427, + "grad_norm": 0.9117990136146545, + "learning_rate": 0.00012767474173865765, + "loss": 2.8107, + "step": 11977 + }, + { + "epoch": 1.0851849335235895, + "grad_norm": 0.8736864328384399, + "learning_rate": 0.00012766870053766688, + "loss": 2.8654, + "step": 11978 + }, + { + "epoch": 1.0852755316980363, + "grad_norm": 0.7444821000099182, + "learning_rate": 0.00012766265933667614, + "loss": 1.9982, + "step": 11979 + }, + { + "epoch": 1.085366129872483, + "grad_norm": 0.8777018189430237, + "learning_rate": 0.0001276566181356854, + "loss": 2.6184, + "step": 11980 + }, + { + "epoch": 1.0854567280469298, + "grad_norm": 0.9078912734985352, + "learning_rate": 0.0001276505769346946, + "loss": 2.8826, + "step": 11981 + }, + { + "epoch": 1.0855473262213766, + "grad_norm": 0.8714157938957214, + "learning_rate": 0.00012764453573370387, + "loss": 2.6968, + "step": 11982 + }, + { + "epoch": 1.0856379243958234, + "grad_norm": 0.9504134654998779, + "learning_rate": 0.0001276384945327131, + "loss": 2.7006, + "step": 11983 + }, + { + "epoch": 1.0857285225702702, + "grad_norm": 0.9638644456863403, + "learning_rate": 0.00012763245333172237, + "loss": 2.9373, + "step": 11984 + }, + { + "epoch": 1.085819120744717, + "grad_norm": 0.9927895665168762, + "learning_rate": 0.00012762641213073157, + "loss": 2.6582, + "step": 11985 + }, + { + "epoch": 1.0859097189191638, + "grad_norm": 0.8947007656097412, + "learning_rate": 0.00012762037092974084, + "loss": 2.5745, + "step": 11986 + }, + { + "epoch": 1.0860003170936106, + "grad_norm": 0.8479291796684265, + "learning_rate": 0.00012761432972875007, + "loss": 2.8165, + "step": 11987 + }, + { + "epoch": 1.0860909152680573, + "grad_norm": 0.8792785406112671, + "learning_rate": 0.00012760828852775933, + "loss": 2.7756, + "step": 11988 + }, + { + "epoch": 1.0861815134425041, + "grad_norm": 0.8974823951721191, + "learning_rate": 0.00012760224732676856, + "loss": 2.5603, + "step": 11989 + }, + { + "epoch": 1.086272111616951, + "grad_norm": 0.8902912735939026, + "learning_rate": 0.0001275962061257778, + "loss": 2.7069, + "step": 11990 + }, + { + "epoch": 1.0863627097913977, + "grad_norm": 0.9210607409477234, + "learning_rate": 0.00012759016492478706, + "loss": 2.9418, + "step": 11991 + }, + { + "epoch": 1.0864533079658445, + "grad_norm": 0.8251295685768127, + "learning_rate": 0.0001275841237237963, + "loss": 1.9601, + "step": 11992 + }, + { + "epoch": 1.0865439061402913, + "grad_norm": 0.8595241904258728, + "learning_rate": 0.00012757808252280556, + "loss": 2.707, + "step": 11993 + }, + { + "epoch": 1.086634504314738, + "grad_norm": 0.8552426695823669, + "learning_rate": 0.00012757204132181476, + "loss": 2.6097, + "step": 11994 + }, + { + "epoch": 1.0867251024891849, + "grad_norm": 0.9209457039833069, + "learning_rate": 0.00012756600012082402, + "loss": 2.6822, + "step": 11995 + }, + { + "epoch": 1.0868157006636316, + "grad_norm": 0.920080840587616, + "learning_rate": 0.00012755995891983328, + "loss": 2.4969, + "step": 11996 + }, + { + "epoch": 1.0869062988380784, + "grad_norm": 0.9296924471855164, + "learning_rate": 0.00012755391771884252, + "loss": 2.6166, + "step": 11997 + }, + { + "epoch": 1.0869968970125252, + "grad_norm": 0.9072161316871643, + "learning_rate": 0.00012754787651785175, + "loss": 2.5354, + "step": 11998 + }, + { + "epoch": 1.087087495186972, + "grad_norm": 0.8559398055076599, + "learning_rate": 0.000127541835316861, + "loss": 2.8097, + "step": 11999 + }, + { + "epoch": 1.0871780933614188, + "grad_norm": 0.8582165837287903, + "learning_rate": 0.00012753579411587025, + "loss": 2.673, + "step": 12000 + }, + { + "epoch": 1.0872686915358656, + "grad_norm": 0.9387485980987549, + "learning_rate": 0.00012752975291487948, + "loss": 2.7712, + "step": 12001 + }, + { + "epoch": 1.0873592897103124, + "grad_norm": 0.8605996966362, + "learning_rate": 0.00012752371171388872, + "loss": 2.6974, + "step": 12002 + }, + { + "epoch": 1.0874498878847592, + "grad_norm": 0.8986877799034119, + "learning_rate": 0.00012751767051289798, + "loss": 3.1215, + "step": 12003 + }, + { + "epoch": 1.087540486059206, + "grad_norm": 0.8747183084487915, + "learning_rate": 0.0001275116293119072, + "loss": 2.6358, + "step": 12004 + }, + { + "epoch": 1.0876310842336527, + "grad_norm": 0.8509436845779419, + "learning_rate": 0.00012750558811091647, + "loss": 2.5982, + "step": 12005 + }, + { + "epoch": 1.0877216824080995, + "grad_norm": 0.9207550883293152, + "learning_rate": 0.0001274995469099257, + "loss": 2.8349, + "step": 12006 + }, + { + "epoch": 1.0878122805825463, + "grad_norm": 0.9396120309829712, + "learning_rate": 0.00012749350570893494, + "loss": 2.7631, + "step": 12007 + }, + { + "epoch": 1.087902878756993, + "grad_norm": 0.8967615962028503, + "learning_rate": 0.00012748746450794417, + "loss": 2.8511, + "step": 12008 + }, + { + "epoch": 1.0879934769314399, + "grad_norm": 0.7810590863227844, + "learning_rate": 0.00012748142330695344, + "loss": 2.0785, + "step": 12009 + }, + { + "epoch": 1.0880840751058867, + "grad_norm": 0.8440366983413696, + "learning_rate": 0.00012747538210596267, + "loss": 2.7303, + "step": 12010 + }, + { + "epoch": 1.0881746732803335, + "grad_norm": 0.9074844121932983, + "learning_rate": 0.0001274693409049719, + "loss": 2.7736, + "step": 12011 + }, + { + "epoch": 1.0882652714547802, + "grad_norm": 0.8330135345458984, + "learning_rate": 0.00012746329970398116, + "loss": 2.7442, + "step": 12012 + }, + { + "epoch": 1.088355869629227, + "grad_norm": 0.8617720007896423, + "learning_rate": 0.0001274572585029904, + "loss": 2.6167, + "step": 12013 + }, + { + "epoch": 1.0884464678036738, + "grad_norm": 0.9508650898933411, + "learning_rate": 0.00012745121730199966, + "loss": 2.7272, + "step": 12014 + }, + { + "epoch": 1.0885370659781206, + "grad_norm": 0.9249529242515564, + "learning_rate": 0.00012744517610100887, + "loss": 2.4895, + "step": 12015 + }, + { + "epoch": 1.0886276641525674, + "grad_norm": 0.8603651523590088, + "learning_rate": 0.00012743913490001813, + "loss": 2.769, + "step": 12016 + }, + { + "epoch": 1.0887182623270142, + "grad_norm": 0.9036220908164978, + "learning_rate": 0.0001274330936990274, + "loss": 2.9336, + "step": 12017 + }, + { + "epoch": 1.088808860501461, + "grad_norm": 0.9169854521751404, + "learning_rate": 0.00012742705249803662, + "loss": 2.4117, + "step": 12018 + }, + { + "epoch": 1.0888994586759078, + "grad_norm": 0.920372486114502, + "learning_rate": 0.00012742101129704586, + "loss": 2.5758, + "step": 12019 + }, + { + "epoch": 1.0889900568503545, + "grad_norm": 0.8440607190132141, + "learning_rate": 0.0001274149700960551, + "loss": 2.7992, + "step": 12020 + }, + { + "epoch": 1.0890806550248013, + "grad_norm": 0.9519025087356567, + "learning_rate": 0.00012740892889506435, + "loss": 2.7368, + "step": 12021 + }, + { + "epoch": 1.0891712531992481, + "grad_norm": 0.998894989490509, + "learning_rate": 0.0001274028876940736, + "loss": 2.5662, + "step": 12022 + }, + { + "epoch": 1.089261851373695, + "grad_norm": 0.8835020065307617, + "learning_rate": 0.00012739684649308282, + "loss": 2.6575, + "step": 12023 + }, + { + "epoch": 1.0893524495481417, + "grad_norm": 0.9180740118026733, + "learning_rate": 0.00012739080529209205, + "loss": 2.5555, + "step": 12024 + }, + { + "epoch": 1.0894430477225885, + "grad_norm": 0.8666627407073975, + "learning_rate": 0.00012738476409110132, + "loss": 2.7189, + "step": 12025 + }, + { + "epoch": 1.0895336458970353, + "grad_norm": 0.8783131837844849, + "learning_rate": 0.00012737872289011058, + "loss": 2.567, + "step": 12026 + }, + { + "epoch": 1.089624244071482, + "grad_norm": 0.852007269859314, + "learning_rate": 0.0001273726816891198, + "loss": 2.9503, + "step": 12027 + }, + { + "epoch": 1.0897148422459288, + "grad_norm": 0.8407766819000244, + "learning_rate": 0.00012736664048812905, + "loss": 2.6858, + "step": 12028 + }, + { + "epoch": 1.0898054404203754, + "grad_norm": 0.8795477151870728, + "learning_rate": 0.00012736059928713828, + "loss": 2.5611, + "step": 12029 + }, + { + "epoch": 1.0898960385948224, + "grad_norm": 0.854814350605011, + "learning_rate": 0.00012735455808614754, + "loss": 2.7839, + "step": 12030 + }, + { + "epoch": 1.089986636769269, + "grad_norm": 0.8872402906417847, + "learning_rate": 0.00012734851688515677, + "loss": 2.8337, + "step": 12031 + }, + { + "epoch": 1.090077234943716, + "grad_norm": 0.9446734189987183, + "learning_rate": 0.000127342475684166, + "loss": 2.6026, + "step": 12032 + }, + { + "epoch": 1.0901678331181626, + "grad_norm": 0.8521183729171753, + "learning_rate": 0.00012733643448317527, + "loss": 2.8183, + "step": 12033 + }, + { + "epoch": 1.0902584312926094, + "grad_norm": 0.9232314229011536, + "learning_rate": 0.0001273303932821845, + "loss": 2.9073, + "step": 12034 + }, + { + "epoch": 1.0903490294670561, + "grad_norm": 0.9039350152015686, + "learning_rate": 0.00012732435208119377, + "loss": 2.9862, + "step": 12035 + }, + { + "epoch": 1.090439627641503, + "grad_norm": 0.919264554977417, + "learning_rate": 0.00012731831088020297, + "loss": 2.7469, + "step": 12036 + }, + { + "epoch": 1.0905302258159497, + "grad_norm": 0.8929714560508728, + "learning_rate": 0.00012731226967921223, + "loss": 2.5146, + "step": 12037 + }, + { + "epoch": 1.0906208239903965, + "grad_norm": 0.9256357550621033, + "learning_rate": 0.00012730622847822147, + "loss": 2.6959, + "step": 12038 + }, + { + "epoch": 1.0907114221648433, + "grad_norm": 0.8406950235366821, + "learning_rate": 0.00012730018727723073, + "loss": 2.5139, + "step": 12039 + }, + { + "epoch": 1.09080202033929, + "grad_norm": 0.76935875415802, + "learning_rate": 0.00012729414607623996, + "loss": 2.0413, + "step": 12040 + }, + { + "epoch": 1.0908926185137369, + "grad_norm": 0.8517096042633057, + "learning_rate": 0.0001272881048752492, + "loss": 2.5326, + "step": 12041 + }, + { + "epoch": 1.0909832166881837, + "grad_norm": 0.8829935789108276, + "learning_rate": 0.00012728206367425846, + "loss": 2.4926, + "step": 12042 + }, + { + "epoch": 1.0910738148626304, + "grad_norm": 0.8542391657829285, + "learning_rate": 0.0001272760224732677, + "loss": 2.6139, + "step": 12043 + }, + { + "epoch": 1.0911644130370772, + "grad_norm": 1.0461390018463135, + "learning_rate": 0.00012726998127227693, + "loss": 2.7764, + "step": 12044 + }, + { + "epoch": 1.091255011211524, + "grad_norm": 0.8734033703804016, + "learning_rate": 0.00012726394007128616, + "loss": 2.799, + "step": 12045 + }, + { + "epoch": 1.0913456093859708, + "grad_norm": 0.8029977083206177, + "learning_rate": 0.00012725789887029542, + "loss": 2.6936, + "step": 12046 + }, + { + "epoch": 1.0914362075604176, + "grad_norm": 0.8795615434646606, + "learning_rate": 0.00012725185766930468, + "loss": 2.7193, + "step": 12047 + }, + { + "epoch": 1.0915268057348644, + "grad_norm": 0.9081958532333374, + "learning_rate": 0.00012724581646831392, + "loss": 2.7113, + "step": 12048 + }, + { + "epoch": 1.0916174039093112, + "grad_norm": 0.9102417826652527, + "learning_rate": 0.00012723977526732315, + "loss": 2.7463, + "step": 12049 + }, + { + "epoch": 1.091708002083758, + "grad_norm": 0.8076284527778625, + "learning_rate": 0.00012723373406633238, + "loss": 2.5422, + "step": 12050 + }, + { + "epoch": 1.0917986002582047, + "grad_norm": 0.9218136668205261, + "learning_rate": 0.00012722769286534165, + "loss": 2.7156, + "step": 12051 + }, + { + "epoch": 1.0918891984326515, + "grad_norm": 0.8699004054069519, + "learning_rate": 0.00012722165166435088, + "loss": 2.7055, + "step": 12052 + }, + { + "epoch": 1.0919797966070983, + "grad_norm": 0.9165749549865723, + "learning_rate": 0.0001272156104633601, + "loss": 2.7867, + "step": 12053 + }, + { + "epoch": 1.092070394781545, + "grad_norm": 0.8574435710906982, + "learning_rate": 0.00012720956926236935, + "loss": 2.4797, + "step": 12054 + }, + { + "epoch": 1.092160992955992, + "grad_norm": 0.8838058710098267, + "learning_rate": 0.0001272035280613786, + "loss": 2.7328, + "step": 12055 + }, + { + "epoch": 1.0922515911304387, + "grad_norm": 0.8965235352516174, + "learning_rate": 0.00012719748686038787, + "loss": 2.65, + "step": 12056 + }, + { + "epoch": 1.0923421893048855, + "grad_norm": 0.9873544573783875, + "learning_rate": 0.0001271914456593971, + "loss": 2.5617, + "step": 12057 + }, + { + "epoch": 1.0924327874793323, + "grad_norm": 0.9427242875099182, + "learning_rate": 0.00012718540445840634, + "loss": 2.8533, + "step": 12058 + }, + { + "epoch": 1.092523385653779, + "grad_norm": 0.8137660026550293, + "learning_rate": 0.00012717936325741557, + "loss": 2.4571, + "step": 12059 + }, + { + "epoch": 1.0926139838282258, + "grad_norm": 0.8292232751846313, + "learning_rate": 0.00012717332205642483, + "loss": 2.0131, + "step": 12060 + }, + { + "epoch": 1.0927045820026726, + "grad_norm": 0.9050813317298889, + "learning_rate": 0.00012716728085543407, + "loss": 2.7166, + "step": 12061 + }, + { + "epoch": 1.0927951801771194, + "grad_norm": 0.8812723755836487, + "learning_rate": 0.0001271612396544433, + "loss": 2.4852, + "step": 12062 + }, + { + "epoch": 1.0928857783515662, + "grad_norm": 0.9030970335006714, + "learning_rate": 0.00012715519845345256, + "loss": 2.7023, + "step": 12063 + }, + { + "epoch": 1.092976376526013, + "grad_norm": 0.7852323651313782, + "learning_rate": 0.0001271491572524618, + "loss": 2.1134, + "step": 12064 + }, + { + "epoch": 1.0930669747004598, + "grad_norm": 0.8801143765449524, + "learning_rate": 0.00012714311605147106, + "loss": 2.6434, + "step": 12065 + }, + { + "epoch": 1.0931575728749066, + "grad_norm": 0.9028752446174622, + "learning_rate": 0.00012713707485048026, + "loss": 2.7657, + "step": 12066 + }, + { + "epoch": 1.0932481710493533, + "grad_norm": 0.8551328778266907, + "learning_rate": 0.00012713103364948953, + "loss": 2.5876, + "step": 12067 + }, + { + "epoch": 1.0933387692238001, + "grad_norm": 0.8742027878761292, + "learning_rate": 0.00012712499244849876, + "loss": 2.5839, + "step": 12068 + }, + { + "epoch": 1.093429367398247, + "grad_norm": 0.8427885174751282, + "learning_rate": 0.00012711895124750802, + "loss": 2.7829, + "step": 12069 + }, + { + "epoch": 1.0935199655726937, + "grad_norm": 0.9505897164344788, + "learning_rate": 0.00012711291004651726, + "loss": 2.8127, + "step": 12070 + }, + { + "epoch": 1.0936105637471405, + "grad_norm": 0.9222820997238159, + "learning_rate": 0.0001271068688455265, + "loss": 2.6847, + "step": 12071 + }, + { + "epoch": 1.0937011619215873, + "grad_norm": 0.920946478843689, + "learning_rate": 0.00012710082764453575, + "loss": 2.6033, + "step": 12072 + }, + { + "epoch": 1.093791760096034, + "grad_norm": 0.9323870539665222, + "learning_rate": 0.00012709478644354498, + "loss": 2.6602, + "step": 12073 + }, + { + "epoch": 1.0938823582704809, + "grad_norm": 0.9414363503456116, + "learning_rate": 0.00012708874524255422, + "loss": 2.9109, + "step": 12074 + }, + { + "epoch": 1.0939729564449276, + "grad_norm": 0.8748739957809448, + "learning_rate": 0.00012708270404156345, + "loss": 2.3761, + "step": 12075 + }, + { + "epoch": 1.0940635546193744, + "grad_norm": 0.8991499543190002, + "learning_rate": 0.00012707666284057271, + "loss": 2.6123, + "step": 12076 + }, + { + "epoch": 1.0941541527938212, + "grad_norm": 0.9105426669120789, + "learning_rate": 0.00012707062163958197, + "loss": 2.8742, + "step": 12077 + }, + { + "epoch": 1.094244750968268, + "grad_norm": 0.9155826568603516, + "learning_rate": 0.0001270645804385912, + "loss": 2.9844, + "step": 12078 + }, + { + "epoch": 1.0943353491427148, + "grad_norm": 0.8631749153137207, + "learning_rate": 0.00012705853923760044, + "loss": 2.6475, + "step": 12079 + }, + { + "epoch": 1.0944259473171616, + "grad_norm": 0.9046613574028015, + "learning_rate": 0.00012705249803660968, + "loss": 2.5044, + "step": 12080 + }, + { + "epoch": 1.0945165454916084, + "grad_norm": 0.906186580657959, + "learning_rate": 0.00012704645683561894, + "loss": 2.5953, + "step": 12081 + }, + { + "epoch": 1.0946071436660552, + "grad_norm": 0.8623731732368469, + "learning_rate": 0.00012704041563462817, + "loss": 3.0617, + "step": 12082 + }, + { + "epoch": 1.094697741840502, + "grad_norm": 0.8620173335075378, + "learning_rate": 0.0001270343744336374, + "loss": 2.6695, + "step": 12083 + }, + { + "epoch": 1.0947883400149487, + "grad_norm": 0.8946167826652527, + "learning_rate": 0.00012702833323264664, + "loss": 2.5659, + "step": 12084 + }, + { + "epoch": 1.0948789381893955, + "grad_norm": 0.8545267581939697, + "learning_rate": 0.0001270222920316559, + "loss": 2.7104, + "step": 12085 + }, + { + "epoch": 1.0949695363638423, + "grad_norm": 0.9874743819236755, + "learning_rate": 0.00012701625083066516, + "loss": 2.8763, + "step": 12086 + }, + { + "epoch": 1.095060134538289, + "grad_norm": 0.8798359632492065, + "learning_rate": 0.00012701020962967437, + "loss": 2.408, + "step": 12087 + }, + { + "epoch": 1.0951507327127359, + "grad_norm": 0.7444956302642822, + "learning_rate": 0.00012700416842868363, + "loss": 2.0154, + "step": 12088 + }, + { + "epoch": 1.0952413308871827, + "grad_norm": 0.8322364687919617, + "learning_rate": 0.00012699812722769286, + "loss": 2.7024, + "step": 12089 + }, + { + "epoch": 1.0953319290616295, + "grad_norm": 0.8445176482200623, + "learning_rate": 0.00012699208602670213, + "loss": 2.5866, + "step": 12090 + }, + { + "epoch": 1.0954225272360762, + "grad_norm": 0.8422711491584778, + "learning_rate": 0.00012698604482571136, + "loss": 2.8223, + "step": 12091 + }, + { + "epoch": 1.095513125410523, + "grad_norm": 0.9333989024162292, + "learning_rate": 0.0001269800036247206, + "loss": 2.5679, + "step": 12092 + }, + { + "epoch": 1.0956037235849698, + "grad_norm": 0.898607611656189, + "learning_rate": 0.00012697396242372986, + "loss": 2.9425, + "step": 12093 + }, + { + "epoch": 1.0956943217594166, + "grad_norm": 0.807849645614624, + "learning_rate": 0.0001269679212227391, + "loss": 2.1766, + "step": 12094 + }, + { + "epoch": 1.0957849199338634, + "grad_norm": 0.7394058108329773, + "learning_rate": 0.00012696188002174832, + "loss": 1.9432, + "step": 12095 + }, + { + "epoch": 1.0958755181083102, + "grad_norm": 0.8713030219078064, + "learning_rate": 0.00012695583882075756, + "loss": 2.7218, + "step": 12096 + }, + { + "epoch": 1.095966116282757, + "grad_norm": 0.9325006604194641, + "learning_rate": 0.00012694979761976682, + "loss": 2.5, + "step": 12097 + }, + { + "epoch": 1.0960567144572038, + "grad_norm": 0.8216304779052734, + "learning_rate": 0.00012694375641877605, + "loss": 2.2569, + "step": 12098 + }, + { + "epoch": 1.0961473126316505, + "grad_norm": 0.7264118790626526, + "learning_rate": 0.00012693771521778531, + "loss": 1.9562, + "step": 12099 + }, + { + "epoch": 1.0962379108060973, + "grad_norm": 0.9667306542396545, + "learning_rate": 0.00012693167401679455, + "loss": 2.6777, + "step": 12100 + }, + { + "epoch": 1.0963285089805441, + "grad_norm": 0.8768927454948425, + "learning_rate": 0.00012692563281580378, + "loss": 2.8493, + "step": 12101 + }, + { + "epoch": 1.096419107154991, + "grad_norm": 0.8722929954528809, + "learning_rate": 0.00012691959161481304, + "loss": 2.7213, + "step": 12102 + }, + { + "epoch": 1.0965097053294377, + "grad_norm": 0.9409510493278503, + "learning_rate": 0.00012691355041382228, + "loss": 2.7909, + "step": 12103 + }, + { + "epoch": 1.0966003035038845, + "grad_norm": 0.7725728750228882, + "learning_rate": 0.0001269075092128315, + "loss": 2.1249, + "step": 12104 + }, + { + "epoch": 1.0966909016783313, + "grad_norm": 0.9314269423484802, + "learning_rate": 0.00012690146801184075, + "loss": 3.1657, + "step": 12105 + }, + { + "epoch": 1.096781499852778, + "grad_norm": 0.8724786043167114, + "learning_rate": 0.00012689542681085, + "loss": 2.6311, + "step": 12106 + }, + { + "epoch": 1.0968720980272249, + "grad_norm": 0.9163286089897156, + "learning_rate": 0.00012688938560985927, + "loss": 2.5162, + "step": 12107 + }, + { + "epoch": 1.0969626962016716, + "grad_norm": 0.8784648776054382, + "learning_rate": 0.00012688334440886847, + "loss": 2.9247, + "step": 12108 + }, + { + "epoch": 1.0970532943761184, + "grad_norm": 0.8692125082015991, + "learning_rate": 0.00012687730320787774, + "loss": 2.4647, + "step": 12109 + }, + { + "epoch": 1.097143892550565, + "grad_norm": 0.9604447484016418, + "learning_rate": 0.00012687126200688697, + "loss": 2.9581, + "step": 12110 + }, + { + "epoch": 1.097234490725012, + "grad_norm": 0.8977569937705994, + "learning_rate": 0.00012686522080589623, + "loss": 2.5605, + "step": 12111 + }, + { + "epoch": 1.0973250888994586, + "grad_norm": 0.875369131565094, + "learning_rate": 0.00012685917960490546, + "loss": 2.7786, + "step": 12112 + }, + { + "epoch": 1.0974156870739056, + "grad_norm": 0.8183794617652893, + "learning_rate": 0.0001268531384039147, + "loss": 2.5176, + "step": 12113 + }, + { + "epoch": 1.0975062852483521, + "grad_norm": 0.8495648503303528, + "learning_rate": 0.00012684709720292393, + "loss": 2.4696, + "step": 12114 + }, + { + "epoch": 1.097596883422799, + "grad_norm": 0.8625321984291077, + "learning_rate": 0.0001268410560019332, + "loss": 2.6506, + "step": 12115 + }, + { + "epoch": 1.0976874815972457, + "grad_norm": 0.834189236164093, + "learning_rate": 0.00012683501480094246, + "loss": 2.5305, + "step": 12116 + }, + { + "epoch": 1.0977780797716925, + "grad_norm": 0.8901584148406982, + "learning_rate": 0.00012682897359995166, + "loss": 2.6307, + "step": 12117 + }, + { + "epoch": 1.0978686779461393, + "grad_norm": 0.9007748365402222, + "learning_rate": 0.00012682293239896092, + "loss": 2.7798, + "step": 12118 + }, + { + "epoch": 1.097959276120586, + "grad_norm": 0.8373364210128784, + "learning_rate": 0.00012681689119797016, + "loss": 2.6619, + "step": 12119 + }, + { + "epoch": 1.0980498742950329, + "grad_norm": 0.8627564311027527, + "learning_rate": 0.00012681084999697942, + "loss": 2.3539, + "step": 12120 + }, + { + "epoch": 1.0981404724694797, + "grad_norm": 0.9388042092323303, + "learning_rate": 0.00012680480879598863, + "loss": 2.93, + "step": 12121 + }, + { + "epoch": 1.0982310706439264, + "grad_norm": 0.9636463522911072, + "learning_rate": 0.0001267987675949979, + "loss": 3.0795, + "step": 12122 + }, + { + "epoch": 1.0983216688183732, + "grad_norm": 0.8755039572715759, + "learning_rate": 0.00012679272639400715, + "loss": 2.8311, + "step": 12123 + }, + { + "epoch": 1.09841226699282, + "grad_norm": 0.9486544728279114, + "learning_rate": 0.00012678668519301638, + "loss": 2.6148, + "step": 12124 + }, + { + "epoch": 1.0985028651672668, + "grad_norm": 0.9063060283660889, + "learning_rate": 0.00012678064399202562, + "loss": 2.6119, + "step": 12125 + }, + { + "epoch": 1.0985934633417136, + "grad_norm": 0.8600910902023315, + "learning_rate": 0.00012677460279103485, + "loss": 2.6903, + "step": 12126 + }, + { + "epoch": 1.0986840615161604, + "grad_norm": 0.8954198956489563, + "learning_rate": 0.0001267685615900441, + "loss": 2.7163, + "step": 12127 + }, + { + "epoch": 1.0987746596906072, + "grad_norm": 0.8812419176101685, + "learning_rate": 0.00012676252038905335, + "loss": 2.702, + "step": 12128 + }, + { + "epoch": 1.098865257865054, + "grad_norm": 0.938623309135437, + "learning_rate": 0.0001267564791880626, + "loss": 2.6306, + "step": 12129 + }, + { + "epoch": 1.0989558560395007, + "grad_norm": 0.8743863701820374, + "learning_rate": 0.00012675043798707184, + "loss": 2.7174, + "step": 12130 + }, + { + "epoch": 1.0990464542139475, + "grad_norm": 0.899109423160553, + "learning_rate": 0.00012674439678608107, + "loss": 2.6189, + "step": 12131 + }, + { + "epoch": 1.0991370523883943, + "grad_norm": 0.88308185338974, + "learning_rate": 0.00012673835558509034, + "loss": 2.5874, + "step": 12132 + }, + { + "epoch": 1.099227650562841, + "grad_norm": 0.8540655970573425, + "learning_rate": 0.00012673231438409957, + "loss": 2.6306, + "step": 12133 + }, + { + "epoch": 1.099318248737288, + "grad_norm": 0.9147428870201111, + "learning_rate": 0.0001267262731831088, + "loss": 2.5576, + "step": 12134 + }, + { + "epoch": 1.0994088469117347, + "grad_norm": 0.9086984992027283, + "learning_rate": 0.00012672023198211804, + "loss": 2.6883, + "step": 12135 + }, + { + "epoch": 1.0994994450861815, + "grad_norm": 0.7701290249824524, + "learning_rate": 0.0001267141907811273, + "loss": 2.1937, + "step": 12136 + }, + { + "epoch": 1.0995900432606283, + "grad_norm": 0.889127254486084, + "learning_rate": 0.00012670814958013656, + "loss": 2.6398, + "step": 12137 + }, + { + "epoch": 1.099680641435075, + "grad_norm": 0.8889652490615845, + "learning_rate": 0.00012670210837914577, + "loss": 2.7846, + "step": 12138 + }, + { + "epoch": 1.0997712396095218, + "grad_norm": 0.9392780661582947, + "learning_rate": 0.00012669606717815503, + "loss": 2.9257, + "step": 12139 + }, + { + "epoch": 1.0998618377839686, + "grad_norm": 0.9058282375335693, + "learning_rate": 0.00012669002597716426, + "loss": 2.0861, + "step": 12140 + }, + { + "epoch": 1.0999524359584154, + "grad_norm": 0.8813919425010681, + "learning_rate": 0.00012668398477617352, + "loss": 2.4516, + "step": 12141 + }, + { + "epoch": 1.1000430341328622, + "grad_norm": 1.045911431312561, + "learning_rate": 0.00012667794357518276, + "loss": 2.948, + "step": 12142 + }, + { + "epoch": 1.100133632307309, + "grad_norm": 0.9688557386398315, + "learning_rate": 0.000126671902374192, + "loss": 2.6074, + "step": 12143 + }, + { + "epoch": 1.1002242304817558, + "grad_norm": 0.7498278617858887, + "learning_rate": 0.00012666586117320123, + "loss": 2.2531, + "step": 12144 + }, + { + "epoch": 1.1003148286562026, + "grad_norm": 0.850744903087616, + "learning_rate": 0.0001266598199722105, + "loss": 2.8445, + "step": 12145 + }, + { + "epoch": 1.1004054268306493, + "grad_norm": 0.9344937801361084, + "learning_rate": 0.00012665377877121972, + "loss": 2.7781, + "step": 12146 + }, + { + "epoch": 1.1004960250050961, + "grad_norm": 0.8876581788063049, + "learning_rate": 0.00012664773757022895, + "loss": 2.7589, + "step": 12147 + }, + { + "epoch": 1.100586623179543, + "grad_norm": 0.9367260932922363, + "learning_rate": 0.00012664169636923822, + "loss": 2.6846, + "step": 12148 + }, + { + "epoch": 1.1006772213539897, + "grad_norm": 0.9676839113235474, + "learning_rate": 0.00012663565516824745, + "loss": 2.6437, + "step": 12149 + }, + { + "epoch": 1.1007678195284365, + "grad_norm": 1.0324511528015137, + "learning_rate": 0.0001266296139672567, + "loss": 2.8852, + "step": 12150 + }, + { + "epoch": 1.1008584177028833, + "grad_norm": 0.8781235218048096, + "learning_rate": 0.00012662357276626592, + "loss": 2.7826, + "step": 12151 + }, + { + "epoch": 1.10094901587733, + "grad_norm": 0.8803731203079224, + "learning_rate": 0.00012661753156527518, + "loss": 2.8579, + "step": 12152 + }, + { + "epoch": 1.1010396140517769, + "grad_norm": 0.8740930557250977, + "learning_rate": 0.00012661149036428444, + "loss": 3.0261, + "step": 12153 + }, + { + "epoch": 1.1011302122262236, + "grad_norm": 0.8447583913803101, + "learning_rate": 0.00012660544916329367, + "loss": 2.7947, + "step": 12154 + }, + { + "epoch": 1.1012208104006704, + "grad_norm": 0.7917587161064148, + "learning_rate": 0.0001265994079623029, + "loss": 2.4391, + "step": 12155 + }, + { + "epoch": 1.1013114085751172, + "grad_norm": 0.9508464932441711, + "learning_rate": 0.00012659336676131214, + "loss": 2.6034, + "step": 12156 + }, + { + "epoch": 1.101402006749564, + "grad_norm": 0.8670632839202881, + "learning_rate": 0.0001265873255603214, + "loss": 2.8754, + "step": 12157 + }, + { + "epoch": 1.1014926049240108, + "grad_norm": 0.9055042266845703, + "learning_rate": 0.00012658128435933064, + "loss": 2.6386, + "step": 12158 + }, + { + "epoch": 1.1015832030984576, + "grad_norm": 0.8561586141586304, + "learning_rate": 0.00012657524315833987, + "loss": 2.4264, + "step": 12159 + }, + { + "epoch": 1.1016738012729044, + "grad_norm": 0.9049524068832397, + "learning_rate": 0.00012656920195734913, + "loss": 2.5955, + "step": 12160 + }, + { + "epoch": 1.1017643994473512, + "grad_norm": 0.8632776737213135, + "learning_rate": 0.00012656316075635837, + "loss": 2.503, + "step": 12161 + }, + { + "epoch": 1.101854997621798, + "grad_norm": 0.8742277026176453, + "learning_rate": 0.00012655711955536763, + "loss": 2.8665, + "step": 12162 + }, + { + "epoch": 1.1019455957962447, + "grad_norm": 0.9319092035293579, + "learning_rate": 0.00012655107835437686, + "loss": 2.6568, + "step": 12163 + }, + { + "epoch": 1.1020361939706915, + "grad_norm": 0.9250596165657043, + "learning_rate": 0.0001265450371533861, + "loss": 2.765, + "step": 12164 + }, + { + "epoch": 1.1021267921451383, + "grad_norm": 0.8866397142410278, + "learning_rate": 0.00012653899595239533, + "loss": 2.6217, + "step": 12165 + }, + { + "epoch": 1.102217390319585, + "grad_norm": 0.8791565895080566, + "learning_rate": 0.0001265329547514046, + "loss": 2.7963, + "step": 12166 + }, + { + "epoch": 1.1023079884940319, + "grad_norm": 0.8297994136810303, + "learning_rate": 0.00012652691355041385, + "loss": 2.8177, + "step": 12167 + }, + { + "epoch": 1.1023985866684787, + "grad_norm": 0.7951502203941345, + "learning_rate": 0.00012652087234942306, + "loss": 2.1863, + "step": 12168 + }, + { + "epoch": 1.1024891848429255, + "grad_norm": 0.8282173871994019, + "learning_rate": 0.00012651483114843232, + "loss": 2.6043, + "step": 12169 + }, + { + "epoch": 1.1025797830173723, + "grad_norm": 0.9098772406578064, + "learning_rate": 0.00012650878994744155, + "loss": 2.7335, + "step": 12170 + }, + { + "epoch": 1.102670381191819, + "grad_norm": 0.8681011199951172, + "learning_rate": 0.00012650274874645082, + "loss": 2.594, + "step": 12171 + }, + { + "epoch": 1.1027609793662658, + "grad_norm": 0.8902115225791931, + "learning_rate": 0.00012649670754546002, + "loss": 2.5782, + "step": 12172 + }, + { + "epoch": 1.1028515775407126, + "grad_norm": 0.8284293413162231, + "learning_rate": 0.00012649066634446928, + "loss": 2.7492, + "step": 12173 + }, + { + "epoch": 1.1029421757151594, + "grad_norm": 0.9166033267974854, + "learning_rate": 0.00012648462514347852, + "loss": 2.6759, + "step": 12174 + }, + { + "epoch": 1.1030327738896062, + "grad_norm": 0.903100848197937, + "learning_rate": 0.00012647858394248778, + "loss": 2.8071, + "step": 12175 + }, + { + "epoch": 1.103123372064053, + "grad_norm": 0.8659055829048157, + "learning_rate": 0.000126472542741497, + "loss": 2.6691, + "step": 12176 + }, + { + "epoch": 1.1032139702384998, + "grad_norm": 0.9006064534187317, + "learning_rate": 0.00012646650154050625, + "loss": 2.6631, + "step": 12177 + }, + { + "epoch": 1.1033045684129466, + "grad_norm": 0.9025154709815979, + "learning_rate": 0.0001264604603395155, + "loss": 2.5599, + "step": 12178 + }, + { + "epoch": 1.1033951665873933, + "grad_norm": 0.8629404902458191, + "learning_rate": 0.00012645441913852474, + "loss": 2.5997, + "step": 12179 + }, + { + "epoch": 1.1034857647618401, + "grad_norm": 0.8894405364990234, + "learning_rate": 0.000126448377937534, + "loss": 2.6598, + "step": 12180 + }, + { + "epoch": 1.103576362936287, + "grad_norm": 0.893703043460846, + "learning_rate": 0.0001264423367365432, + "loss": 2.5104, + "step": 12181 + }, + { + "epoch": 1.1036669611107337, + "grad_norm": 0.8766022324562073, + "learning_rate": 0.00012643629553555247, + "loss": 2.5635, + "step": 12182 + }, + { + "epoch": 1.1037575592851805, + "grad_norm": 0.8733001947402954, + "learning_rate": 0.00012643025433456173, + "loss": 2.7335, + "step": 12183 + }, + { + "epoch": 1.1038481574596273, + "grad_norm": 0.8750893473625183, + "learning_rate": 0.00012642421313357097, + "loss": 2.7819, + "step": 12184 + }, + { + "epoch": 1.103938755634074, + "grad_norm": 0.8979333639144897, + "learning_rate": 0.0001264181719325802, + "loss": 2.7989, + "step": 12185 + }, + { + "epoch": 1.1040293538085209, + "grad_norm": 0.9247048497200012, + "learning_rate": 0.00012641213073158944, + "loss": 2.7988, + "step": 12186 + }, + { + "epoch": 1.1041199519829676, + "grad_norm": 0.8392958641052246, + "learning_rate": 0.0001264060895305987, + "loss": 2.5519, + "step": 12187 + }, + { + "epoch": 1.1042105501574144, + "grad_norm": 0.8607615232467651, + "learning_rate": 0.00012640004832960793, + "loss": 2.5478, + "step": 12188 + }, + { + "epoch": 1.1043011483318612, + "grad_norm": 0.9015560746192932, + "learning_rate": 0.00012639400712861716, + "loss": 2.7367, + "step": 12189 + }, + { + "epoch": 1.104391746506308, + "grad_norm": 0.8850727081298828, + "learning_rate": 0.00012638796592762643, + "loss": 2.5883, + "step": 12190 + }, + { + "epoch": 1.1044823446807546, + "grad_norm": 0.8696529865264893, + "learning_rate": 0.00012638192472663566, + "loss": 2.7817, + "step": 12191 + }, + { + "epoch": 1.1045729428552016, + "grad_norm": 0.949021577835083, + "learning_rate": 0.00012637588352564492, + "loss": 2.9817, + "step": 12192 + }, + { + "epoch": 1.1046635410296481, + "grad_norm": 0.9174584746360779, + "learning_rate": 0.00012636984232465415, + "loss": 2.6628, + "step": 12193 + }, + { + "epoch": 1.1047541392040952, + "grad_norm": 0.891609251499176, + "learning_rate": 0.0001263638011236634, + "loss": 2.8341, + "step": 12194 + }, + { + "epoch": 1.1048447373785417, + "grad_norm": 0.8852039575576782, + "learning_rate": 0.00012635775992267262, + "loss": 2.6856, + "step": 12195 + }, + { + "epoch": 1.1049353355529885, + "grad_norm": 0.8499743938446045, + "learning_rate": 0.00012635171872168188, + "loss": 2.5733, + "step": 12196 + }, + { + "epoch": 1.1050259337274353, + "grad_norm": 1.0064083337783813, + "learning_rate": 0.00012634567752069112, + "loss": 2.8114, + "step": 12197 + }, + { + "epoch": 1.105116531901882, + "grad_norm": 0.8878710865974426, + "learning_rate": 0.00012633963631970035, + "loss": 2.6266, + "step": 12198 + }, + { + "epoch": 1.1052071300763289, + "grad_norm": 0.8552231192588806, + "learning_rate": 0.0001263335951187096, + "loss": 2.6966, + "step": 12199 + }, + { + "epoch": 1.1052977282507757, + "grad_norm": 0.8745222091674805, + "learning_rate": 0.00012632755391771885, + "loss": 2.7603, + "step": 12200 + }, + { + "epoch": 1.1053883264252224, + "grad_norm": 0.849274218082428, + "learning_rate": 0.0001263215127167281, + "loss": 2.5362, + "step": 12201 + }, + { + "epoch": 1.1054789245996692, + "grad_norm": 0.9294918775558472, + "learning_rate": 0.00012631547151573732, + "loss": 2.6954, + "step": 12202 + }, + { + "epoch": 1.105569522774116, + "grad_norm": 0.8909995555877686, + "learning_rate": 0.00012630943031474658, + "loss": 2.83, + "step": 12203 + }, + { + "epoch": 1.1056601209485628, + "grad_norm": 0.9111427068710327, + "learning_rate": 0.0001263033891137558, + "loss": 2.8067, + "step": 12204 + }, + { + "epoch": 1.1057507191230096, + "grad_norm": 0.7619307637214661, + "learning_rate": 0.00012629734791276507, + "loss": 2.117, + "step": 12205 + }, + { + "epoch": 1.1058413172974564, + "grad_norm": 0.9020026326179504, + "learning_rate": 0.0001262913067117743, + "loss": 2.7166, + "step": 12206 + }, + { + "epoch": 1.1059319154719032, + "grad_norm": 0.8880966901779175, + "learning_rate": 0.00012628526551078354, + "loss": 2.8787, + "step": 12207 + }, + { + "epoch": 1.10602251364635, + "grad_norm": 0.9589238166809082, + "learning_rate": 0.0001262792243097928, + "loss": 2.5914, + "step": 12208 + }, + { + "epoch": 1.1061131118207967, + "grad_norm": 0.8642987608909607, + "learning_rate": 0.00012627318310880204, + "loss": 2.9015, + "step": 12209 + }, + { + "epoch": 1.1062037099952435, + "grad_norm": 0.9091670513153076, + "learning_rate": 0.00012626714190781127, + "loss": 2.6383, + "step": 12210 + }, + { + "epoch": 1.1062943081696903, + "grad_norm": 0.7460150718688965, + "learning_rate": 0.0001262611007068205, + "loss": 1.9588, + "step": 12211 + }, + { + "epoch": 1.106384906344137, + "grad_norm": 0.8481383919715881, + "learning_rate": 0.00012625505950582976, + "loss": 2.5179, + "step": 12212 + }, + { + "epoch": 1.106475504518584, + "grad_norm": 0.8348844647407532, + "learning_rate": 0.00012624901830483903, + "loss": 1.985, + "step": 12213 + }, + { + "epoch": 1.1065661026930307, + "grad_norm": 0.8540094494819641, + "learning_rate": 0.00012624297710384826, + "loss": 2.6169, + "step": 12214 + }, + { + "epoch": 1.1066567008674775, + "grad_norm": 0.8975708484649658, + "learning_rate": 0.0001262369359028575, + "loss": 2.7063, + "step": 12215 + }, + { + "epoch": 1.1067472990419243, + "grad_norm": 0.9146438241004944, + "learning_rate": 0.00012623089470186673, + "loss": 2.6652, + "step": 12216 + }, + { + "epoch": 1.106837897216371, + "grad_norm": 1.021020531654358, + "learning_rate": 0.000126224853500876, + "loss": 2.5631, + "step": 12217 + }, + { + "epoch": 1.1069284953908178, + "grad_norm": 0.8785321712493896, + "learning_rate": 0.00012621881229988522, + "loss": 2.6238, + "step": 12218 + }, + { + "epoch": 1.1070190935652646, + "grad_norm": 0.8472843766212463, + "learning_rate": 0.00012621277109889446, + "loss": 2.5647, + "step": 12219 + }, + { + "epoch": 1.1071096917397114, + "grad_norm": 0.9192529320716858, + "learning_rate": 0.00012620672989790372, + "loss": 2.7033, + "step": 12220 + }, + { + "epoch": 1.1072002899141582, + "grad_norm": 0.9165173768997192, + "learning_rate": 0.00012620068869691295, + "loss": 2.5737, + "step": 12221 + }, + { + "epoch": 1.107290888088605, + "grad_norm": 0.9083285331726074, + "learning_rate": 0.0001261946474959222, + "loss": 2.76, + "step": 12222 + }, + { + "epoch": 1.1073814862630518, + "grad_norm": 0.8218736052513123, + "learning_rate": 0.00012618860629493142, + "loss": 2.5868, + "step": 12223 + }, + { + "epoch": 1.1074720844374986, + "grad_norm": 0.8691990375518799, + "learning_rate": 0.00012618256509394068, + "loss": 2.6084, + "step": 12224 + }, + { + "epoch": 1.1075626826119453, + "grad_norm": 0.8612378835678101, + "learning_rate": 0.00012617652389294992, + "loss": 2.7634, + "step": 12225 + }, + { + "epoch": 1.1076532807863921, + "grad_norm": 0.9834172129631042, + "learning_rate": 0.00012617048269195918, + "loss": 2.7259, + "step": 12226 + }, + { + "epoch": 1.107743878960839, + "grad_norm": 0.8670669794082642, + "learning_rate": 0.0001261644414909684, + "loss": 2.8848, + "step": 12227 + }, + { + "epoch": 1.1078344771352857, + "grad_norm": 0.645124614238739, + "learning_rate": 0.00012615840028997764, + "loss": 1.3724, + "step": 12228 + }, + { + "epoch": 1.1079250753097325, + "grad_norm": 0.909604549407959, + "learning_rate": 0.0001261523590889869, + "loss": 2.584, + "step": 12229 + }, + { + "epoch": 1.1080156734841793, + "grad_norm": 0.8758166432380676, + "learning_rate": 0.00012614631788799614, + "loss": 2.6463, + "step": 12230 + }, + { + "epoch": 1.108106271658626, + "grad_norm": 0.8629305958747864, + "learning_rate": 0.00012614027668700537, + "loss": 2.5903, + "step": 12231 + }, + { + "epoch": 1.1081968698330729, + "grad_norm": 0.932653546333313, + "learning_rate": 0.0001261342354860146, + "loss": 3.0353, + "step": 12232 + }, + { + "epoch": 1.1082874680075196, + "grad_norm": 0.8360341191291809, + "learning_rate": 0.00012612819428502387, + "loss": 2.7078, + "step": 12233 + }, + { + "epoch": 1.1083780661819664, + "grad_norm": 0.882629930973053, + "learning_rate": 0.0001261221530840331, + "loss": 2.7719, + "step": 12234 + }, + { + "epoch": 1.1084686643564132, + "grad_norm": 0.8875755071640015, + "learning_rate": 0.00012611611188304236, + "loss": 2.6878, + "step": 12235 + }, + { + "epoch": 1.10855926253086, + "grad_norm": 0.8482925891876221, + "learning_rate": 0.0001261100706820516, + "loss": 2.5021, + "step": 12236 + }, + { + "epoch": 1.1086498607053068, + "grad_norm": 0.9016217589378357, + "learning_rate": 0.00012610402948106083, + "loss": 2.6243, + "step": 12237 + }, + { + "epoch": 1.1087404588797536, + "grad_norm": 0.807524561882019, + "learning_rate": 0.0001260979882800701, + "loss": 2.0026, + "step": 12238 + }, + { + "epoch": 1.1088310570542004, + "grad_norm": 0.9041067361831665, + "learning_rate": 0.00012609194707907933, + "loss": 2.7801, + "step": 12239 + }, + { + "epoch": 1.1089216552286472, + "grad_norm": 0.884629487991333, + "learning_rate": 0.00012608590587808856, + "loss": 2.6817, + "step": 12240 + }, + { + "epoch": 1.109012253403094, + "grad_norm": 0.8715211749076843, + "learning_rate": 0.0001260798646770978, + "loss": 2.54, + "step": 12241 + }, + { + "epoch": 1.1091028515775407, + "grad_norm": 0.8610383868217468, + "learning_rate": 0.00012607382347610706, + "loss": 2.7563, + "step": 12242 + }, + { + "epoch": 1.1091934497519875, + "grad_norm": 0.8860046863555908, + "learning_rate": 0.00012606778227511632, + "loss": 2.5952, + "step": 12243 + }, + { + "epoch": 1.1092840479264343, + "grad_norm": 0.9181177020072937, + "learning_rate": 0.00012606174107412555, + "loss": 2.6595, + "step": 12244 + }, + { + "epoch": 1.109374646100881, + "grad_norm": 0.8516542911529541, + "learning_rate": 0.0001260556998731348, + "loss": 2.6102, + "step": 12245 + }, + { + "epoch": 1.1094652442753279, + "grad_norm": 0.8847334384918213, + "learning_rate": 0.00012604965867214402, + "loss": 2.6989, + "step": 12246 + }, + { + "epoch": 1.1095558424497747, + "grad_norm": 0.9196171164512634, + "learning_rate": 0.00012604361747115328, + "loss": 2.6922, + "step": 12247 + }, + { + "epoch": 1.1096464406242215, + "grad_norm": 0.8803741335868835, + "learning_rate": 0.00012603757627016252, + "loss": 2.7494, + "step": 12248 + }, + { + "epoch": 1.1097370387986683, + "grad_norm": 0.9206012487411499, + "learning_rate": 0.00012603153506917175, + "loss": 2.7178, + "step": 12249 + }, + { + "epoch": 1.109827636973115, + "grad_norm": 0.8424805998802185, + "learning_rate": 0.000126025493868181, + "loss": 2.4098, + "step": 12250 + }, + { + "epoch": 1.1099182351475618, + "grad_norm": 0.9468032121658325, + "learning_rate": 0.00012601945266719025, + "loss": 2.5032, + "step": 12251 + }, + { + "epoch": 1.1100088333220086, + "grad_norm": 0.8492690324783325, + "learning_rate": 0.0001260134114661995, + "loss": 2.1548, + "step": 12252 + }, + { + "epoch": 1.1100994314964554, + "grad_norm": 0.8903581500053406, + "learning_rate": 0.0001260073702652087, + "loss": 2.5351, + "step": 12253 + }, + { + "epoch": 1.1101900296709022, + "grad_norm": 0.9118524789810181, + "learning_rate": 0.00012600132906421797, + "loss": 2.9563, + "step": 12254 + }, + { + "epoch": 1.110280627845349, + "grad_norm": 0.8904409408569336, + "learning_rate": 0.0001259952878632272, + "loss": 3.0217, + "step": 12255 + }, + { + "epoch": 1.1103712260197958, + "grad_norm": 0.8975630402565002, + "learning_rate": 0.00012598924666223647, + "loss": 2.8084, + "step": 12256 + }, + { + "epoch": 1.1104618241942426, + "grad_norm": 0.89814692735672, + "learning_rate": 0.0001259832054612457, + "loss": 2.8702, + "step": 12257 + }, + { + "epoch": 1.1105524223686893, + "grad_norm": 0.9021351933479309, + "learning_rate": 0.00012597716426025494, + "loss": 2.6769, + "step": 12258 + }, + { + "epoch": 1.1106430205431361, + "grad_norm": 0.8778466582298279, + "learning_rate": 0.0001259711230592642, + "loss": 2.594, + "step": 12259 + }, + { + "epoch": 1.110733618717583, + "grad_norm": 0.9199927449226379, + "learning_rate": 0.00012596508185827343, + "loss": 2.7721, + "step": 12260 + }, + { + "epoch": 1.1108242168920297, + "grad_norm": 0.8670250177383423, + "learning_rate": 0.00012595904065728267, + "loss": 2.8072, + "step": 12261 + }, + { + "epoch": 1.1109148150664765, + "grad_norm": 0.8702759146690369, + "learning_rate": 0.0001259529994562919, + "loss": 2.3964, + "step": 12262 + }, + { + "epoch": 1.1110054132409233, + "grad_norm": 0.8535661101341248, + "learning_rate": 0.00012594695825530116, + "loss": 2.7584, + "step": 12263 + }, + { + "epoch": 1.11109601141537, + "grad_norm": 0.9071521162986755, + "learning_rate": 0.0001259409170543104, + "loss": 2.7622, + "step": 12264 + }, + { + "epoch": 1.1111866095898169, + "grad_norm": 0.8876155018806458, + "learning_rate": 0.00012593487585331966, + "loss": 2.7062, + "step": 12265 + }, + { + "epoch": 1.1112772077642636, + "grad_norm": 0.8856075406074524, + "learning_rate": 0.0001259288346523289, + "loss": 2.8622, + "step": 12266 + }, + { + "epoch": 1.1113678059387104, + "grad_norm": 0.8693503141403198, + "learning_rate": 0.00012592279345133813, + "loss": 2.5375, + "step": 12267 + }, + { + "epoch": 1.1114584041131572, + "grad_norm": 0.8671167492866516, + "learning_rate": 0.0001259167522503474, + "loss": 2.8224, + "step": 12268 + }, + { + "epoch": 1.111549002287604, + "grad_norm": 0.8831426501274109, + "learning_rate": 0.00012591071104935662, + "loss": 2.651, + "step": 12269 + }, + { + "epoch": 1.1116396004620508, + "grad_norm": 0.9673616290092468, + "learning_rate": 0.00012590466984836585, + "loss": 2.8899, + "step": 12270 + }, + { + "epoch": 1.1117301986364976, + "grad_norm": 0.946540355682373, + "learning_rate": 0.0001258986286473751, + "loss": 2.582, + "step": 12271 + }, + { + "epoch": 1.1118207968109441, + "grad_norm": 0.7658612132072449, + "learning_rate": 0.00012589258744638435, + "loss": 2.1761, + "step": 12272 + }, + { + "epoch": 1.1119113949853912, + "grad_norm": 0.8542041778564453, + "learning_rate": 0.0001258865462453936, + "loss": 2.6753, + "step": 12273 + }, + { + "epoch": 1.1120019931598377, + "grad_norm": 0.8996936082839966, + "learning_rate": 0.00012588050504440282, + "loss": 2.9457, + "step": 12274 + }, + { + "epoch": 1.1120925913342847, + "grad_norm": 0.9494107961654663, + "learning_rate": 0.00012587446384341208, + "loss": 2.7515, + "step": 12275 + }, + { + "epoch": 1.1121831895087313, + "grad_norm": 0.8764507174491882, + "learning_rate": 0.0001258684226424213, + "loss": 2.8331, + "step": 12276 + }, + { + "epoch": 1.112273787683178, + "grad_norm": 0.7482072710990906, + "learning_rate": 0.00012586238144143057, + "loss": 1.8532, + "step": 12277 + }, + { + "epoch": 1.1123643858576249, + "grad_norm": 0.8727028369903564, + "learning_rate": 0.0001258563402404398, + "loss": 2.6254, + "step": 12278 + }, + { + "epoch": 1.1124549840320717, + "grad_norm": 0.8839901089668274, + "learning_rate": 0.00012585029903944904, + "loss": 2.8773, + "step": 12279 + }, + { + "epoch": 1.1125455822065184, + "grad_norm": 0.8021629452705383, + "learning_rate": 0.0001258442578384583, + "loss": 2.0637, + "step": 12280 + }, + { + "epoch": 1.1126361803809652, + "grad_norm": 0.8182372450828552, + "learning_rate": 0.00012583821663746754, + "loss": 2.1514, + "step": 12281 + }, + { + "epoch": 1.112726778555412, + "grad_norm": 0.8608352541923523, + "learning_rate": 0.00012583217543647677, + "loss": 2.8058, + "step": 12282 + }, + { + "epoch": 1.1128173767298588, + "grad_norm": 0.8786629438400269, + "learning_rate": 0.000125826134235486, + "loss": 2.6526, + "step": 12283 + }, + { + "epoch": 1.1129079749043056, + "grad_norm": 0.7530134320259094, + "learning_rate": 0.00012582009303449527, + "loss": 2.1289, + "step": 12284 + }, + { + "epoch": 1.1129985730787524, + "grad_norm": 0.8927790522575378, + "learning_rate": 0.0001258140518335045, + "loss": 2.8312, + "step": 12285 + }, + { + "epoch": 1.1130891712531992, + "grad_norm": 0.8588663339614868, + "learning_rate": 0.00012580801063251376, + "loss": 2.8425, + "step": 12286 + }, + { + "epoch": 1.113179769427646, + "grad_norm": 0.8856900930404663, + "learning_rate": 0.000125801969431523, + "loss": 2.6124, + "step": 12287 + }, + { + "epoch": 1.1132703676020927, + "grad_norm": 0.8997082710266113, + "learning_rate": 0.00012579592823053223, + "loss": 2.6624, + "step": 12288 + }, + { + "epoch": 1.1133609657765395, + "grad_norm": 0.9047713279724121, + "learning_rate": 0.0001257898870295415, + "loss": 2.6147, + "step": 12289 + }, + { + "epoch": 1.1134515639509863, + "grad_norm": 0.8889066576957703, + "learning_rate": 0.00012578384582855073, + "loss": 2.8041, + "step": 12290 + }, + { + "epoch": 1.113542162125433, + "grad_norm": 0.9641087055206299, + "learning_rate": 0.00012577780462755996, + "loss": 2.8515, + "step": 12291 + }, + { + "epoch": 1.11363276029988, + "grad_norm": 0.770459771156311, + "learning_rate": 0.0001257717634265692, + "loss": 2.1666, + "step": 12292 + }, + { + "epoch": 1.1137233584743267, + "grad_norm": 0.7894304394721985, + "learning_rate": 0.00012576572222557845, + "loss": 2.0471, + "step": 12293 + }, + { + "epoch": 1.1138139566487735, + "grad_norm": 0.8550871014595032, + "learning_rate": 0.0001257596810245877, + "loss": 2.8269, + "step": 12294 + }, + { + "epoch": 1.1139045548232203, + "grad_norm": 0.919032871723175, + "learning_rate": 0.00012575363982359692, + "loss": 2.5211, + "step": 12295 + }, + { + "epoch": 1.113995152997667, + "grad_norm": 0.8550638556480408, + "learning_rate": 0.00012574759862260618, + "loss": 2.7326, + "step": 12296 + }, + { + "epoch": 1.1140857511721138, + "grad_norm": 0.9064528942108154, + "learning_rate": 0.00012574155742161542, + "loss": 2.7097, + "step": 12297 + }, + { + "epoch": 1.1141763493465606, + "grad_norm": 0.9007940888404846, + "learning_rate": 0.00012573551622062468, + "loss": 2.8418, + "step": 12298 + }, + { + "epoch": 1.1142669475210074, + "grad_norm": 0.9083051085472107, + "learning_rate": 0.0001257294750196339, + "loss": 2.4625, + "step": 12299 + }, + { + "epoch": 1.1143575456954542, + "grad_norm": 0.9416430592536926, + "learning_rate": 0.00012572343381864315, + "loss": 2.6198, + "step": 12300 + }, + { + "epoch": 1.114448143869901, + "grad_norm": 0.8608568906784058, + "learning_rate": 0.00012571739261765238, + "loss": 2.7708, + "step": 12301 + }, + { + "epoch": 1.1145387420443478, + "grad_norm": 0.8842402100563049, + "learning_rate": 0.00012571135141666164, + "loss": 2.8157, + "step": 12302 + }, + { + "epoch": 1.1146293402187946, + "grad_norm": 0.8789511322975159, + "learning_rate": 0.0001257053102156709, + "loss": 2.949, + "step": 12303 + }, + { + "epoch": 1.1147199383932413, + "grad_norm": 0.7814358472824097, + "learning_rate": 0.0001256992690146801, + "loss": 2.0578, + "step": 12304 + }, + { + "epoch": 1.1148105365676881, + "grad_norm": 0.9175446629524231, + "learning_rate": 0.00012569322781368937, + "loss": 2.7375, + "step": 12305 + }, + { + "epoch": 1.114901134742135, + "grad_norm": 0.910976231098175, + "learning_rate": 0.0001256871866126986, + "loss": 2.9253, + "step": 12306 + }, + { + "epoch": 1.1149917329165817, + "grad_norm": 0.8876489400863647, + "learning_rate": 0.00012568114541170787, + "loss": 2.7742, + "step": 12307 + }, + { + "epoch": 1.1150823310910285, + "grad_norm": 0.8832192420959473, + "learning_rate": 0.00012567510421071707, + "loss": 2.7666, + "step": 12308 + }, + { + "epoch": 1.1151729292654753, + "grad_norm": 0.9079288244247437, + "learning_rate": 0.00012566906300972634, + "loss": 2.527, + "step": 12309 + }, + { + "epoch": 1.115263527439922, + "grad_norm": 0.928666889667511, + "learning_rate": 0.0001256630218087356, + "loss": 2.7683, + "step": 12310 + }, + { + "epoch": 1.1153541256143689, + "grad_norm": 0.871611475944519, + "learning_rate": 0.00012565698060774483, + "loss": 2.5623, + "step": 12311 + }, + { + "epoch": 1.1154447237888157, + "grad_norm": 0.9157540202140808, + "learning_rate": 0.00012565093940675406, + "loss": 2.7815, + "step": 12312 + }, + { + "epoch": 1.1155353219632624, + "grad_norm": 0.9711700677871704, + "learning_rate": 0.0001256448982057633, + "loss": 2.7949, + "step": 12313 + }, + { + "epoch": 1.1156259201377092, + "grad_norm": 0.8688304424285889, + "learning_rate": 0.00012563885700477256, + "loss": 2.8783, + "step": 12314 + }, + { + "epoch": 1.115716518312156, + "grad_norm": 0.8600854277610779, + "learning_rate": 0.0001256328158037818, + "loss": 2.8026, + "step": 12315 + }, + { + "epoch": 1.1158071164866028, + "grad_norm": 0.8376374840736389, + "learning_rate": 0.00012562677460279105, + "loss": 2.6761, + "step": 12316 + }, + { + "epoch": 1.1158977146610496, + "grad_norm": 0.8687117695808411, + "learning_rate": 0.0001256207334018003, + "loss": 2.611, + "step": 12317 + }, + { + "epoch": 1.1159883128354964, + "grad_norm": 0.8836192488670349, + "learning_rate": 0.00012561469220080952, + "loss": 2.9109, + "step": 12318 + }, + { + "epoch": 1.1160789110099432, + "grad_norm": 0.8649888634681702, + "learning_rate": 0.00012560865099981878, + "loss": 2.8763, + "step": 12319 + }, + { + "epoch": 1.11616950918439, + "grad_norm": 0.5700135827064514, + "learning_rate": 0.00012560260979882802, + "loss": 1.355, + "step": 12320 + }, + { + "epoch": 1.1162601073588367, + "grad_norm": 0.7602939009666443, + "learning_rate": 0.00012559656859783725, + "loss": 2.0581, + "step": 12321 + }, + { + "epoch": 1.1163507055332835, + "grad_norm": 0.9548502564430237, + "learning_rate": 0.00012559052739684649, + "loss": 2.6194, + "step": 12322 + }, + { + "epoch": 1.1164413037077303, + "grad_norm": 0.9640852808952332, + "learning_rate": 0.00012558448619585575, + "loss": 2.5274, + "step": 12323 + }, + { + "epoch": 1.116531901882177, + "grad_norm": 0.9162721037864685, + "learning_rate": 0.00012557844499486498, + "loss": 2.6643, + "step": 12324 + }, + { + "epoch": 1.116622500056624, + "grad_norm": 0.8756603598594666, + "learning_rate": 0.00012557240379387422, + "loss": 2.8194, + "step": 12325 + }, + { + "epoch": 1.1167130982310707, + "grad_norm": 0.9029278755187988, + "learning_rate": 0.00012556636259288348, + "loss": 2.5851, + "step": 12326 + }, + { + "epoch": 1.1168036964055175, + "grad_norm": 0.9279401898384094, + "learning_rate": 0.0001255603213918927, + "loss": 2.5819, + "step": 12327 + }, + { + "epoch": 1.1168942945799643, + "grad_norm": 0.8781318068504333, + "learning_rate": 0.00012555428019090197, + "loss": 2.6544, + "step": 12328 + }, + { + "epoch": 1.116984892754411, + "grad_norm": 0.8645539283752441, + "learning_rate": 0.0001255482389899112, + "loss": 2.551, + "step": 12329 + }, + { + "epoch": 1.1170754909288578, + "grad_norm": 0.9205479621887207, + "learning_rate": 0.00012554219778892044, + "loss": 2.8865, + "step": 12330 + }, + { + "epoch": 1.1171660891033046, + "grad_norm": 0.8828120827674866, + "learning_rate": 0.00012553615658792967, + "loss": 2.6537, + "step": 12331 + }, + { + "epoch": 1.1172566872777514, + "grad_norm": 0.9316605925559998, + "learning_rate": 0.00012553011538693894, + "loss": 2.8134, + "step": 12332 + }, + { + "epoch": 1.1173472854521982, + "grad_norm": 0.8170668482780457, + "learning_rate": 0.00012552407418594817, + "loss": 2.0951, + "step": 12333 + }, + { + "epoch": 1.117437883626645, + "grad_norm": 0.8915174007415771, + "learning_rate": 0.0001255180329849574, + "loss": 2.6306, + "step": 12334 + }, + { + "epoch": 1.1175284818010918, + "grad_norm": 1.1557881832122803, + "learning_rate": 0.00012551199178396666, + "loss": 2.7551, + "step": 12335 + }, + { + "epoch": 1.1176190799755386, + "grad_norm": 0.9172582030296326, + "learning_rate": 0.0001255059505829759, + "loss": 2.8745, + "step": 12336 + }, + { + "epoch": 1.1177096781499853, + "grad_norm": 0.8655633926391602, + "learning_rate": 0.00012549990938198516, + "loss": 2.7314, + "step": 12337 + }, + { + "epoch": 1.1178002763244321, + "grad_norm": 0.8877886533737183, + "learning_rate": 0.00012549386818099437, + "loss": 2.7359, + "step": 12338 + }, + { + "epoch": 1.117890874498879, + "grad_norm": 0.9542905688285828, + "learning_rate": 0.00012548782698000363, + "loss": 2.6203, + "step": 12339 + }, + { + "epoch": 1.1179814726733257, + "grad_norm": 0.9303706288337708, + "learning_rate": 0.0001254817857790129, + "loss": 2.7677, + "step": 12340 + }, + { + "epoch": 1.1180720708477725, + "grad_norm": 0.8609593510627747, + "learning_rate": 0.00012547574457802212, + "loss": 2.6478, + "step": 12341 + }, + { + "epoch": 1.1181626690222193, + "grad_norm": 0.8766762018203735, + "learning_rate": 0.00012546970337703136, + "loss": 2.7435, + "step": 12342 + }, + { + "epoch": 1.118253267196666, + "grad_norm": 0.8085798025131226, + "learning_rate": 0.0001254636621760406, + "loss": 2.0301, + "step": 12343 + }, + { + "epoch": 1.1183438653711129, + "grad_norm": 0.9181734919548035, + "learning_rate": 0.00012545762097504985, + "loss": 2.6872, + "step": 12344 + }, + { + "epoch": 1.1184344635455596, + "grad_norm": 0.759969174861908, + "learning_rate": 0.00012545157977405909, + "loss": 1.941, + "step": 12345 + }, + { + "epoch": 1.1185250617200064, + "grad_norm": 0.9535549879074097, + "learning_rate": 0.00012544553857306832, + "loss": 2.2362, + "step": 12346 + }, + { + "epoch": 1.1186156598944532, + "grad_norm": 0.9334588646888733, + "learning_rate": 0.00012543949737207758, + "loss": 2.9745, + "step": 12347 + }, + { + "epoch": 1.1187062580689, + "grad_norm": 0.8907454013824463, + "learning_rate": 0.00012543345617108682, + "loss": 3.0549, + "step": 12348 + }, + { + "epoch": 1.1187968562433468, + "grad_norm": 0.9304269552230835, + "learning_rate": 0.00012542741497009608, + "loss": 2.511, + "step": 12349 + }, + { + "epoch": 1.1188874544177936, + "grad_norm": 0.7591465711593628, + "learning_rate": 0.0001254213737691053, + "loss": 2.0293, + "step": 12350 + }, + { + "epoch": 1.1189780525922404, + "grad_norm": 0.8879565596580505, + "learning_rate": 0.00012541533256811454, + "loss": 2.7649, + "step": 12351 + }, + { + "epoch": 1.1190686507666872, + "grad_norm": 0.9881350994110107, + "learning_rate": 0.00012540929136712378, + "loss": 2.6145, + "step": 12352 + }, + { + "epoch": 1.1191592489411337, + "grad_norm": 0.9511057734489441, + "learning_rate": 0.00012540325016613304, + "loss": 2.6928, + "step": 12353 + }, + { + "epoch": 1.1192498471155807, + "grad_norm": 0.8881760239601135, + "learning_rate": 0.00012539720896514227, + "loss": 2.5923, + "step": 12354 + }, + { + "epoch": 1.1193404452900273, + "grad_norm": 0.896500825881958, + "learning_rate": 0.0001253911677641515, + "loss": 2.9323, + "step": 12355 + }, + { + "epoch": 1.1194310434644743, + "grad_norm": 0.7026172280311584, + "learning_rate": 0.00012538512656316077, + "loss": 1.8967, + "step": 12356 + }, + { + "epoch": 1.1195216416389209, + "grad_norm": 0.8732031583786011, + "learning_rate": 0.00012537908536217, + "loss": 2.719, + "step": 12357 + }, + { + "epoch": 1.1196122398133677, + "grad_norm": 0.9879806041717529, + "learning_rate": 0.00012537304416117926, + "loss": 2.4569, + "step": 12358 + }, + { + "epoch": 1.1197028379878144, + "grad_norm": 0.9091539978981018, + "learning_rate": 0.00012536700296018847, + "loss": 2.7172, + "step": 12359 + }, + { + "epoch": 1.1197934361622612, + "grad_norm": 0.7368266582489014, + "learning_rate": 0.00012536096175919773, + "loss": 1.9646, + "step": 12360 + }, + { + "epoch": 1.119884034336708, + "grad_norm": 0.8873229622840881, + "learning_rate": 0.00012535492055820697, + "loss": 2.9263, + "step": 12361 + }, + { + "epoch": 1.1199746325111548, + "grad_norm": 0.83602374792099, + "learning_rate": 0.00012534887935721623, + "loss": 2.7889, + "step": 12362 + }, + { + "epoch": 1.1200652306856016, + "grad_norm": 0.9746618270874023, + "learning_rate": 0.00012534283815622546, + "loss": 3.0377, + "step": 12363 + }, + { + "epoch": 1.1201558288600484, + "grad_norm": 0.9332643747329712, + "learning_rate": 0.0001253367969552347, + "loss": 3.0265, + "step": 12364 + }, + { + "epoch": 1.1202464270344952, + "grad_norm": 1.0021055936813354, + "learning_rate": 0.00012533075575424396, + "loss": 2.7654, + "step": 12365 + }, + { + "epoch": 1.120337025208942, + "grad_norm": 0.8607325553894043, + "learning_rate": 0.0001253247145532532, + "loss": 2.8926, + "step": 12366 + }, + { + "epoch": 1.1204276233833887, + "grad_norm": 0.9022758603096008, + "learning_rate": 0.00012531867335226245, + "loss": 2.7784, + "step": 12367 + }, + { + "epoch": 1.1205182215578355, + "grad_norm": 0.8764415383338928, + "learning_rate": 0.00012531263215127166, + "loss": 2.4568, + "step": 12368 + }, + { + "epoch": 1.1206088197322823, + "grad_norm": 0.8639925122261047, + "learning_rate": 0.00012530659095028092, + "loss": 2.5839, + "step": 12369 + }, + { + "epoch": 1.1206994179067291, + "grad_norm": 0.8855189681053162, + "learning_rate": 0.00012530054974929018, + "loss": 2.6845, + "step": 12370 + }, + { + "epoch": 1.120790016081176, + "grad_norm": 0.878806471824646, + "learning_rate": 0.00012529450854829942, + "loss": 2.6331, + "step": 12371 + }, + { + "epoch": 1.1208806142556227, + "grad_norm": 0.8745403289794922, + "learning_rate": 0.00012528846734730865, + "loss": 2.4785, + "step": 12372 + }, + { + "epoch": 1.1209712124300695, + "grad_norm": 0.8616892099380493, + "learning_rate": 0.00012528242614631788, + "loss": 2.7647, + "step": 12373 + }, + { + "epoch": 1.1210618106045163, + "grad_norm": 0.8929305076599121, + "learning_rate": 0.00012527638494532714, + "loss": 2.6703, + "step": 12374 + }, + { + "epoch": 1.121152408778963, + "grad_norm": 0.917395830154419, + "learning_rate": 0.00012527034374433638, + "loss": 2.5754, + "step": 12375 + }, + { + "epoch": 1.1212430069534098, + "grad_norm": 0.8912478089332581, + "learning_rate": 0.0001252643025433456, + "loss": 2.7124, + "step": 12376 + }, + { + "epoch": 1.1213336051278566, + "grad_norm": 0.8655701279640198, + "learning_rate": 0.00012525826134235487, + "loss": 2.7597, + "step": 12377 + }, + { + "epoch": 1.1214242033023034, + "grad_norm": 0.8263322710990906, + "learning_rate": 0.0001252522201413641, + "loss": 2.7642, + "step": 12378 + }, + { + "epoch": 1.1215148014767502, + "grad_norm": 0.9008363485336304, + "learning_rate": 0.00012524617894037337, + "loss": 2.7472, + "step": 12379 + }, + { + "epoch": 1.121605399651197, + "grad_norm": 0.8565436005592346, + "learning_rate": 0.0001252401377393826, + "loss": 2.7596, + "step": 12380 + }, + { + "epoch": 1.1216959978256438, + "grad_norm": 0.8564646244049072, + "learning_rate": 0.00012523409653839184, + "loss": 2.596, + "step": 12381 + }, + { + "epoch": 1.1217865960000906, + "grad_norm": 0.9321374297142029, + "learning_rate": 0.00012522805533740107, + "loss": 2.6028, + "step": 12382 + }, + { + "epoch": 1.1218771941745374, + "grad_norm": 0.8512706160545349, + "learning_rate": 0.00012522201413641033, + "loss": 2.4808, + "step": 12383 + }, + { + "epoch": 1.1219677923489841, + "grad_norm": 1.0914182662963867, + "learning_rate": 0.00012521597293541957, + "loss": 2.3851, + "step": 12384 + }, + { + "epoch": 1.122058390523431, + "grad_norm": 0.8733231425285339, + "learning_rate": 0.0001252099317344288, + "loss": 2.6658, + "step": 12385 + }, + { + "epoch": 1.1221489886978777, + "grad_norm": 0.9427415728569031, + "learning_rate": 0.00012520389053343806, + "loss": 2.4354, + "step": 12386 + }, + { + "epoch": 1.1222395868723245, + "grad_norm": 0.8294773101806641, + "learning_rate": 0.0001251978493324473, + "loss": 2.724, + "step": 12387 + }, + { + "epoch": 1.1223301850467713, + "grad_norm": 0.9405412673950195, + "learning_rate": 0.00012519180813145656, + "loss": 2.6485, + "step": 12388 + }, + { + "epoch": 1.122420783221218, + "grad_norm": 0.853580892086029, + "learning_rate": 0.00012518576693046576, + "loss": 2.653, + "step": 12389 + }, + { + "epoch": 1.1225113813956649, + "grad_norm": 0.8554474711418152, + "learning_rate": 0.00012517972572947503, + "loss": 2.7356, + "step": 12390 + }, + { + "epoch": 1.1226019795701117, + "grad_norm": 0.8835422396659851, + "learning_rate": 0.00012517368452848426, + "loss": 2.8976, + "step": 12391 + }, + { + "epoch": 1.1226925777445584, + "grad_norm": 0.8466115593910217, + "learning_rate": 0.00012516764332749352, + "loss": 2.7166, + "step": 12392 + }, + { + "epoch": 1.1227831759190052, + "grad_norm": 0.9588918089866638, + "learning_rate": 0.00012516160212650275, + "loss": 2.4708, + "step": 12393 + }, + { + "epoch": 1.122873774093452, + "grad_norm": 0.9604206681251526, + "learning_rate": 0.000125155560925512, + "loss": 2.7204, + "step": 12394 + }, + { + "epoch": 1.1229643722678988, + "grad_norm": 0.9593615531921387, + "learning_rate": 0.00012514951972452125, + "loss": 2.8493, + "step": 12395 + }, + { + "epoch": 1.1230549704423456, + "grad_norm": 0.8779234290122986, + "learning_rate": 0.00012514347852353048, + "loss": 2.7226, + "step": 12396 + }, + { + "epoch": 1.1231455686167924, + "grad_norm": 0.7881258726119995, + "learning_rate": 0.00012513743732253972, + "loss": 2.0574, + "step": 12397 + }, + { + "epoch": 1.1232361667912392, + "grad_norm": 0.8656380772590637, + "learning_rate": 0.00012513139612154895, + "loss": 2.7679, + "step": 12398 + }, + { + "epoch": 1.123326764965686, + "grad_norm": 0.784137487411499, + "learning_rate": 0.0001251253549205582, + "loss": 2.1764, + "step": 12399 + }, + { + "epoch": 1.1234173631401327, + "grad_norm": 0.8972407579421997, + "learning_rate": 0.00012511931371956747, + "loss": 2.571, + "step": 12400 + }, + { + "epoch": 1.1235079613145795, + "grad_norm": 0.8251272439956665, + "learning_rate": 0.0001251132725185767, + "loss": 2.6347, + "step": 12401 + }, + { + "epoch": 1.1235985594890263, + "grad_norm": 0.9118900299072266, + "learning_rate": 0.00012510723131758594, + "loss": 2.7134, + "step": 12402 + }, + { + "epoch": 1.123689157663473, + "grad_norm": 0.928795576095581, + "learning_rate": 0.00012510119011659518, + "loss": 2.7691, + "step": 12403 + }, + { + "epoch": 1.12377975583792, + "grad_norm": 0.9434114098548889, + "learning_rate": 0.00012509514891560444, + "loss": 2.3227, + "step": 12404 + }, + { + "epoch": 1.1238703540123667, + "grad_norm": 0.8355557322502136, + "learning_rate": 0.00012508910771461367, + "loss": 2.6144, + "step": 12405 + }, + { + "epoch": 1.1239609521868135, + "grad_norm": 0.8352215886116028, + "learning_rate": 0.0001250830665136229, + "loss": 2.7149, + "step": 12406 + }, + { + "epoch": 1.1240515503612603, + "grad_norm": 0.8223391175270081, + "learning_rate": 0.00012507702531263217, + "loss": 2.1123, + "step": 12407 + }, + { + "epoch": 1.124142148535707, + "grad_norm": 0.9925066232681274, + "learning_rate": 0.0001250709841116414, + "loss": 3.0062, + "step": 12408 + }, + { + "epoch": 1.1242327467101538, + "grad_norm": 0.9036552906036377, + "learning_rate": 0.00012506494291065066, + "loss": 2.6241, + "step": 12409 + }, + { + "epoch": 1.1243233448846006, + "grad_norm": 0.8351688385009766, + "learning_rate": 0.00012505890170965987, + "loss": 2.6311, + "step": 12410 + }, + { + "epoch": 1.1244139430590474, + "grad_norm": 0.8289351463317871, + "learning_rate": 0.00012505286050866913, + "loss": 2.6952, + "step": 12411 + }, + { + "epoch": 1.1245045412334942, + "grad_norm": 0.8374469876289368, + "learning_rate": 0.00012504681930767836, + "loss": 2.0099, + "step": 12412 + }, + { + "epoch": 1.124595139407941, + "grad_norm": 0.9251470565795898, + "learning_rate": 0.00012504077810668763, + "loss": 2.8337, + "step": 12413 + }, + { + "epoch": 1.1246857375823878, + "grad_norm": 1.079645037651062, + "learning_rate": 0.00012503473690569686, + "loss": 2.8704, + "step": 12414 + }, + { + "epoch": 1.1247763357568346, + "grad_norm": 0.8720921874046326, + "learning_rate": 0.0001250286957047061, + "loss": 2.5223, + "step": 12415 + }, + { + "epoch": 1.1248669339312813, + "grad_norm": 0.8316871523857117, + "learning_rate": 0.00012502265450371535, + "loss": 2.6595, + "step": 12416 + }, + { + "epoch": 1.1249575321057281, + "grad_norm": 0.9384201765060425, + "learning_rate": 0.0001250166133027246, + "loss": 2.6688, + "step": 12417 + }, + { + "epoch": 1.125048130280175, + "grad_norm": 0.8903402090072632, + "learning_rate": 0.00012501057210173382, + "loss": 2.6244, + "step": 12418 + }, + { + "epoch": 1.1251387284546217, + "grad_norm": 0.8480218052864075, + "learning_rate": 0.00012500453090074306, + "loss": 2.644, + "step": 12419 + }, + { + "epoch": 1.1252293266290685, + "grad_norm": 0.8714718818664551, + "learning_rate": 0.00012499848969975232, + "loss": 2.7307, + "step": 12420 + }, + { + "epoch": 1.1253199248035153, + "grad_norm": 0.861638605594635, + "learning_rate": 0.00012499244849876155, + "loss": 2.8773, + "step": 12421 + }, + { + "epoch": 1.125410522977962, + "grad_norm": 0.952224612236023, + "learning_rate": 0.0001249864072977708, + "loss": 2.7436, + "step": 12422 + }, + { + "epoch": 1.1255011211524089, + "grad_norm": 0.8934169411659241, + "learning_rate": 0.00012498036609678005, + "loss": 2.6038, + "step": 12423 + }, + { + "epoch": 1.1255917193268556, + "grad_norm": 0.8751727342605591, + "learning_rate": 0.00012497432489578928, + "loss": 2.9066, + "step": 12424 + }, + { + "epoch": 1.1256823175013024, + "grad_norm": 0.7903333306312561, + "learning_rate": 0.00012496828369479854, + "loss": 2.3097, + "step": 12425 + }, + { + "epoch": 1.1257729156757492, + "grad_norm": 0.9607675671577454, + "learning_rate": 0.00012496224249380778, + "loss": 2.8391, + "step": 12426 + }, + { + "epoch": 1.125863513850196, + "grad_norm": 0.9269137382507324, + "learning_rate": 0.000124956201292817, + "loss": 2.9232, + "step": 12427 + }, + { + "epoch": 1.1259541120246428, + "grad_norm": 0.9089055061340332, + "learning_rate": 0.00012495016009182624, + "loss": 2.5057, + "step": 12428 + }, + { + "epoch": 1.1260447101990896, + "grad_norm": 0.8786005973815918, + "learning_rate": 0.0001249441188908355, + "loss": 2.8307, + "step": 12429 + }, + { + "epoch": 1.1261353083735364, + "grad_norm": 0.841463565826416, + "learning_rate": 0.00012493807768984477, + "loss": 2.6352, + "step": 12430 + }, + { + "epoch": 1.1262259065479832, + "grad_norm": 0.9217904210090637, + "learning_rate": 0.000124932036488854, + "loss": 2.6234, + "step": 12431 + }, + { + "epoch": 1.1263165047224297, + "grad_norm": 0.8548765778541565, + "learning_rate": 0.00012492599528786323, + "loss": 2.2376, + "step": 12432 + }, + { + "epoch": 1.1264071028968767, + "grad_norm": 0.9809646010398865, + "learning_rate": 0.00012491995408687247, + "loss": 2.8194, + "step": 12433 + }, + { + "epoch": 1.1264977010713233, + "grad_norm": 0.8547075390815735, + "learning_rate": 0.00012491391288588173, + "loss": 2.5373, + "step": 12434 + }, + { + "epoch": 1.1265882992457703, + "grad_norm": 0.8945194482803345, + "learning_rate": 0.00012490787168489096, + "loss": 2.9573, + "step": 12435 + }, + { + "epoch": 1.1266788974202169, + "grad_norm": 0.8982054591178894, + "learning_rate": 0.0001249018304839002, + "loss": 2.7076, + "step": 12436 + }, + { + "epoch": 1.1267694955946639, + "grad_norm": 0.9793790578842163, + "learning_rate": 0.00012489578928290946, + "loss": 2.4821, + "step": 12437 + }, + { + "epoch": 1.1268600937691104, + "grad_norm": 0.7260423898696899, + "learning_rate": 0.0001248897480819187, + "loss": 2.1552, + "step": 12438 + }, + { + "epoch": 1.1269506919435575, + "grad_norm": 0.9141126275062561, + "learning_rate": 0.00012488370688092795, + "loss": 2.5284, + "step": 12439 + }, + { + "epoch": 1.127041290118004, + "grad_norm": 0.9425950646400452, + "learning_rate": 0.00012487766567993716, + "loss": 2.8099, + "step": 12440 + }, + { + "epoch": 1.1271318882924508, + "grad_norm": 0.8307409286499023, + "learning_rate": 0.00012487162447894642, + "loss": 2.5435, + "step": 12441 + }, + { + "epoch": 1.1272224864668976, + "grad_norm": 0.8597913980484009, + "learning_rate": 0.00012486558327795566, + "loss": 2.7155, + "step": 12442 + }, + { + "epoch": 1.1273130846413444, + "grad_norm": 0.9395120143890381, + "learning_rate": 0.00012485954207696492, + "loss": 2.6479, + "step": 12443 + }, + { + "epoch": 1.1274036828157912, + "grad_norm": 0.9215790033340454, + "learning_rate": 0.00012485350087597415, + "loss": 2.6581, + "step": 12444 + }, + { + "epoch": 1.127494280990238, + "grad_norm": 0.8994401693344116, + "learning_rate": 0.00012484745967498339, + "loss": 2.8052, + "step": 12445 + }, + { + "epoch": 1.1275848791646848, + "grad_norm": 0.9101608395576477, + "learning_rate": 0.00012484141847399265, + "loss": 2.7967, + "step": 12446 + }, + { + "epoch": 1.1276754773391315, + "grad_norm": 0.8991286754608154, + "learning_rate": 0.00012483537727300188, + "loss": 2.8472, + "step": 12447 + }, + { + "epoch": 1.1277660755135783, + "grad_norm": 0.8762823343276978, + "learning_rate": 0.00012482933607201112, + "loss": 2.597, + "step": 12448 + }, + { + "epoch": 1.1278566736880251, + "grad_norm": 0.7605729103088379, + "learning_rate": 0.00012482329487102035, + "loss": 2.0995, + "step": 12449 + }, + { + "epoch": 1.127947271862472, + "grad_norm": 0.836691677570343, + "learning_rate": 0.0001248172536700296, + "loss": 2.5557, + "step": 12450 + }, + { + "epoch": 1.1280378700369187, + "grad_norm": 0.9119102358818054, + "learning_rate": 0.00012481121246903884, + "loss": 2.5332, + "step": 12451 + }, + { + "epoch": 1.1281284682113655, + "grad_norm": 0.9502992033958435, + "learning_rate": 0.0001248051712680481, + "loss": 2.6244, + "step": 12452 + }, + { + "epoch": 1.1282190663858123, + "grad_norm": 0.9591110944747925, + "learning_rate": 0.00012479913006705734, + "loss": 2.7345, + "step": 12453 + }, + { + "epoch": 1.128309664560259, + "grad_norm": 0.9310910701751709, + "learning_rate": 0.00012479308886606657, + "loss": 2.5511, + "step": 12454 + }, + { + "epoch": 1.1284002627347058, + "grad_norm": 0.9040321707725525, + "learning_rate": 0.00012478704766507584, + "loss": 2.7304, + "step": 12455 + }, + { + "epoch": 1.1284908609091526, + "grad_norm": 0.8839100003242493, + "learning_rate": 0.00012478100646408507, + "loss": 2.6528, + "step": 12456 + }, + { + "epoch": 1.1285814590835994, + "grad_norm": 0.8836674690246582, + "learning_rate": 0.0001247749652630943, + "loss": 2.7674, + "step": 12457 + }, + { + "epoch": 1.1286720572580462, + "grad_norm": 0.8744121789932251, + "learning_rate": 0.00012476892406210354, + "loss": 2.6508, + "step": 12458 + }, + { + "epoch": 1.128762655432493, + "grad_norm": 0.9737843871116638, + "learning_rate": 0.0001247628828611128, + "loss": 2.9388, + "step": 12459 + }, + { + "epoch": 1.1288532536069398, + "grad_norm": 0.850720226764679, + "learning_rate": 0.00012475684166012206, + "loss": 2.5634, + "step": 12460 + }, + { + "epoch": 1.1289438517813866, + "grad_norm": 0.8730328679084778, + "learning_rate": 0.00012475080045913127, + "loss": 2.7337, + "step": 12461 + }, + { + "epoch": 1.1290344499558334, + "grad_norm": 0.8623390197753906, + "learning_rate": 0.00012474475925814053, + "loss": 2.6226, + "step": 12462 + }, + { + "epoch": 1.1291250481302801, + "grad_norm": 0.9333146810531616, + "learning_rate": 0.00012473871805714976, + "loss": 2.8223, + "step": 12463 + }, + { + "epoch": 1.129215646304727, + "grad_norm": 0.9289748072624207, + "learning_rate": 0.00012473267685615902, + "loss": 2.5847, + "step": 12464 + }, + { + "epoch": 1.1293062444791737, + "grad_norm": 0.857419490814209, + "learning_rate": 0.00012472663565516826, + "loss": 2.4915, + "step": 12465 + }, + { + "epoch": 1.1293968426536205, + "grad_norm": 0.9241555333137512, + "learning_rate": 0.0001247205944541775, + "loss": 2.5829, + "step": 12466 + }, + { + "epoch": 1.1294874408280673, + "grad_norm": 0.8204213380813599, + "learning_rate": 0.00012471455325318675, + "loss": 2.5586, + "step": 12467 + }, + { + "epoch": 1.129578039002514, + "grad_norm": 0.9045491814613342, + "learning_rate": 0.00012470851205219599, + "loss": 2.7518, + "step": 12468 + }, + { + "epoch": 1.1296686371769609, + "grad_norm": 0.882338285446167, + "learning_rate": 0.00012470247085120522, + "loss": 2.6326, + "step": 12469 + }, + { + "epoch": 1.1297592353514077, + "grad_norm": 0.9635729789733887, + "learning_rate": 0.00012469642965021445, + "loss": 2.6927, + "step": 12470 + }, + { + "epoch": 1.1298498335258544, + "grad_norm": 0.858590841293335, + "learning_rate": 0.00012469038844922372, + "loss": 2.6326, + "step": 12471 + }, + { + "epoch": 1.1299404317003012, + "grad_norm": 0.8369994759559631, + "learning_rate": 0.00012468434724823295, + "loss": 2.7369, + "step": 12472 + }, + { + "epoch": 1.130031029874748, + "grad_norm": 0.8742069602012634, + "learning_rate": 0.0001246783060472422, + "loss": 2.7834, + "step": 12473 + }, + { + "epoch": 1.1301216280491948, + "grad_norm": 0.7550502419471741, + "learning_rate": 0.00012467226484625142, + "loss": 1.8821, + "step": 12474 + }, + { + "epoch": 1.1302122262236416, + "grad_norm": 0.9505907893180847, + "learning_rate": 0.00012466622364526068, + "loss": 2.4576, + "step": 12475 + }, + { + "epoch": 1.1303028243980884, + "grad_norm": 0.9316603541374207, + "learning_rate": 0.00012466018244426994, + "loss": 2.8974, + "step": 12476 + }, + { + "epoch": 1.1303934225725352, + "grad_norm": 0.8846188187599182, + "learning_rate": 0.00012465414124327917, + "loss": 2.8271, + "step": 12477 + }, + { + "epoch": 1.130484020746982, + "grad_norm": 0.8115814328193665, + "learning_rate": 0.0001246481000422884, + "loss": 1.9311, + "step": 12478 + }, + { + "epoch": 1.1305746189214287, + "grad_norm": 0.8783538341522217, + "learning_rate": 0.00012464205884129764, + "loss": 2.8467, + "step": 12479 + }, + { + "epoch": 1.1306652170958755, + "grad_norm": 0.9314901828765869, + "learning_rate": 0.0001246360176403069, + "loss": 2.8885, + "step": 12480 + }, + { + "epoch": 1.1307558152703223, + "grad_norm": 0.9262660145759583, + "learning_rate": 0.00012462997643931614, + "loss": 2.8088, + "step": 12481 + }, + { + "epoch": 1.130846413444769, + "grad_norm": 0.9534698724746704, + "learning_rate": 0.00012462393523832537, + "loss": 2.8823, + "step": 12482 + }, + { + "epoch": 1.130937011619216, + "grad_norm": 0.9115458726882935, + "learning_rate": 0.00012461789403733463, + "loss": 3.2303, + "step": 12483 + }, + { + "epoch": 1.1310276097936627, + "grad_norm": 0.8464102745056152, + "learning_rate": 0.00012461185283634387, + "loss": 2.4952, + "step": 12484 + }, + { + "epoch": 1.1311182079681095, + "grad_norm": 0.9618716239929199, + "learning_rate": 0.00012460581163535313, + "loss": 2.7154, + "step": 12485 + }, + { + "epoch": 1.1312088061425563, + "grad_norm": 0.8952136635780334, + "learning_rate": 0.00012459977043436236, + "loss": 2.5808, + "step": 12486 + }, + { + "epoch": 1.131299404317003, + "grad_norm": 0.9132945537567139, + "learning_rate": 0.0001245937292333716, + "loss": 2.8824, + "step": 12487 + }, + { + "epoch": 1.1313900024914498, + "grad_norm": 0.8745244145393372, + "learning_rate": 0.00012458768803238083, + "loss": 2.558, + "step": 12488 + }, + { + "epoch": 1.1314806006658966, + "grad_norm": 0.9341776371002197, + "learning_rate": 0.0001245816468313901, + "loss": 2.461, + "step": 12489 + }, + { + "epoch": 1.1315711988403434, + "grad_norm": 0.9012507796287537, + "learning_rate": 0.00012457560563039935, + "loss": 2.5558, + "step": 12490 + }, + { + "epoch": 1.1316617970147902, + "grad_norm": 0.8860464692115784, + "learning_rate": 0.00012456956442940856, + "loss": 2.7051, + "step": 12491 + }, + { + "epoch": 1.131752395189237, + "grad_norm": 0.9191615581512451, + "learning_rate": 0.00012456352322841782, + "loss": 2.7963, + "step": 12492 + }, + { + "epoch": 1.1318429933636838, + "grad_norm": 0.9395555257797241, + "learning_rate": 0.00012455748202742705, + "loss": 2.8112, + "step": 12493 + }, + { + "epoch": 1.1319335915381306, + "grad_norm": 0.8229537010192871, + "learning_rate": 0.00012455144082643632, + "loss": 2.5167, + "step": 12494 + }, + { + "epoch": 1.1320241897125773, + "grad_norm": 0.8709476590156555, + "learning_rate": 0.00012454539962544552, + "loss": 2.9062, + "step": 12495 + }, + { + "epoch": 1.1321147878870241, + "grad_norm": 0.8748621940612793, + "learning_rate": 0.00012453935842445478, + "loss": 2.5754, + "step": 12496 + }, + { + "epoch": 1.132205386061471, + "grad_norm": 0.9209664463996887, + "learning_rate": 0.00012453331722346404, + "loss": 2.8008, + "step": 12497 + }, + { + "epoch": 1.1322959842359177, + "grad_norm": 0.9878742694854736, + "learning_rate": 0.00012452727602247328, + "loss": 2.9514, + "step": 12498 + }, + { + "epoch": 1.1323865824103645, + "grad_norm": 0.8614344596862793, + "learning_rate": 0.0001245212348214825, + "loss": 2.6049, + "step": 12499 + }, + { + "epoch": 1.1324771805848113, + "grad_norm": 0.817273736000061, + "learning_rate": 0.00012451519362049175, + "loss": 2.1675, + "step": 12500 + }, + { + "epoch": 1.132567778759258, + "grad_norm": 0.9424204230308533, + "learning_rate": 0.000124509152419501, + "loss": 2.5738, + "step": 12501 + }, + { + "epoch": 1.1326583769337049, + "grad_norm": 0.8429452180862427, + "learning_rate": 0.00012450311121851024, + "loss": 2.7064, + "step": 12502 + }, + { + "epoch": 1.1327489751081516, + "grad_norm": 0.952907145023346, + "learning_rate": 0.0001244970700175195, + "loss": 2.4355, + "step": 12503 + }, + { + "epoch": 1.1328395732825984, + "grad_norm": 0.8537648916244507, + "learning_rate": 0.0001244910288165287, + "loss": 2.8076, + "step": 12504 + }, + { + "epoch": 1.1329301714570452, + "grad_norm": 0.8682045936584473, + "learning_rate": 0.00012448498761553797, + "loss": 2.6475, + "step": 12505 + }, + { + "epoch": 1.133020769631492, + "grad_norm": 0.7932912707328796, + "learning_rate": 0.00012447894641454723, + "loss": 2.0844, + "step": 12506 + }, + { + "epoch": 1.1331113678059388, + "grad_norm": 0.856326162815094, + "learning_rate": 0.00012447290521355647, + "loss": 2.6446, + "step": 12507 + }, + { + "epoch": 1.1332019659803856, + "grad_norm": 0.8271143436431885, + "learning_rate": 0.0001244668640125657, + "loss": 2.5397, + "step": 12508 + }, + { + "epoch": 1.1332925641548324, + "grad_norm": 0.848690390586853, + "learning_rate": 0.00012446082281157493, + "loss": 2.56, + "step": 12509 + }, + { + "epoch": 1.1333831623292792, + "grad_norm": 0.8335612416267395, + "learning_rate": 0.0001244547816105842, + "loss": 2.6682, + "step": 12510 + }, + { + "epoch": 1.133473760503726, + "grad_norm": 0.45319873094558716, + "learning_rate": 0.00012444874040959343, + "loss": 0.5843, + "step": 12511 + }, + { + "epoch": 1.1335643586781727, + "grad_norm": 0.9139971137046814, + "learning_rate": 0.00012444269920860266, + "loss": 2.7136, + "step": 12512 + }, + { + "epoch": 1.1336549568526193, + "grad_norm": 0.999017596244812, + "learning_rate": 0.00012443665800761193, + "loss": 3.1079, + "step": 12513 + }, + { + "epoch": 1.1337455550270663, + "grad_norm": 0.8611716628074646, + "learning_rate": 0.00012443061680662116, + "loss": 2.7785, + "step": 12514 + }, + { + "epoch": 1.1338361532015129, + "grad_norm": 0.8351605534553528, + "learning_rate": 0.00012442457560563042, + "loss": 2.6352, + "step": 12515 + }, + { + "epoch": 1.1339267513759599, + "grad_norm": 0.9115939736366272, + "learning_rate": 0.00012441853440463965, + "loss": 2.6719, + "step": 12516 + }, + { + "epoch": 1.1340173495504065, + "grad_norm": 0.9026110172271729, + "learning_rate": 0.0001244124932036489, + "loss": 2.5994, + "step": 12517 + }, + { + "epoch": 1.1341079477248535, + "grad_norm": 0.873033344745636, + "learning_rate": 0.00012440645200265812, + "loss": 2.6588, + "step": 12518 + }, + { + "epoch": 1.1341985458993, + "grad_norm": 0.8379390239715576, + "learning_rate": 0.00012440041080166738, + "loss": 2.1002, + "step": 12519 + }, + { + "epoch": 1.134289144073747, + "grad_norm": 0.9209325313568115, + "learning_rate": 0.00012439436960067662, + "loss": 2.6277, + "step": 12520 + }, + { + "epoch": 1.1343797422481936, + "grad_norm": 0.9201090335845947, + "learning_rate": 0.00012438832839968585, + "loss": 2.6738, + "step": 12521 + }, + { + "epoch": 1.1344703404226404, + "grad_norm": 0.8789618611335754, + "learning_rate": 0.0001243822871986951, + "loss": 2.7814, + "step": 12522 + }, + { + "epoch": 1.1345609385970872, + "grad_norm": 0.9195781946182251, + "learning_rate": 0.00012437624599770435, + "loss": 3.0224, + "step": 12523 + }, + { + "epoch": 1.134651536771534, + "grad_norm": 0.868891179561615, + "learning_rate": 0.0001243702047967136, + "loss": 2.9263, + "step": 12524 + }, + { + "epoch": 1.1347421349459808, + "grad_norm": 0.8692139387130737, + "learning_rate": 0.00012436416359572282, + "loss": 2.9296, + "step": 12525 + }, + { + "epoch": 1.1348327331204275, + "grad_norm": 0.9612644910812378, + "learning_rate": 0.00012435812239473208, + "loss": 2.5694, + "step": 12526 + }, + { + "epoch": 1.1349233312948743, + "grad_norm": 0.905731737613678, + "learning_rate": 0.00012435208119374134, + "loss": 3.0009, + "step": 12527 + }, + { + "epoch": 1.1350139294693211, + "grad_norm": 0.8387446403503418, + "learning_rate": 0.00012434603999275057, + "loss": 2.4126, + "step": 12528 + }, + { + "epoch": 1.135104527643768, + "grad_norm": 0.8558259010314941, + "learning_rate": 0.0001243399987917598, + "loss": 2.6965, + "step": 12529 + }, + { + "epoch": 1.1351951258182147, + "grad_norm": 0.8861306309700012, + "learning_rate": 0.00012433395759076904, + "loss": 2.6409, + "step": 12530 + }, + { + "epoch": 1.1352857239926615, + "grad_norm": 0.9987301230430603, + "learning_rate": 0.0001243279163897783, + "loss": 2.9409, + "step": 12531 + }, + { + "epoch": 1.1353763221671083, + "grad_norm": 0.8811972141265869, + "learning_rate": 0.00012432187518878753, + "loss": 2.728, + "step": 12532 + }, + { + "epoch": 1.135466920341555, + "grad_norm": 0.9120472073554993, + "learning_rate": 0.00012431583398779677, + "loss": 2.7463, + "step": 12533 + }, + { + "epoch": 1.1355575185160018, + "grad_norm": 0.8372697234153748, + "learning_rate": 0.000124309792786806, + "loss": 2.5523, + "step": 12534 + }, + { + "epoch": 1.1356481166904486, + "grad_norm": 0.889817476272583, + "learning_rate": 0.00012430375158581526, + "loss": 2.6222, + "step": 12535 + }, + { + "epoch": 1.1357387148648954, + "grad_norm": 0.9311615824699402, + "learning_rate": 0.00012429771038482453, + "loss": 2.6989, + "step": 12536 + }, + { + "epoch": 1.1358293130393422, + "grad_norm": 0.8342102766036987, + "learning_rate": 0.00012429166918383376, + "loss": 2.7451, + "step": 12537 + }, + { + "epoch": 1.135919911213789, + "grad_norm": 0.8691669702529907, + "learning_rate": 0.000124285627982843, + "loss": 2.7651, + "step": 12538 + }, + { + "epoch": 1.1360105093882358, + "grad_norm": 0.8205663561820984, + "learning_rate": 0.00012427958678185223, + "loss": 2.0057, + "step": 12539 + }, + { + "epoch": 1.1361011075626826, + "grad_norm": 0.7481348514556885, + "learning_rate": 0.0001242735455808615, + "loss": 2.0299, + "step": 12540 + }, + { + "epoch": 1.1361917057371294, + "grad_norm": 0.8649997711181641, + "learning_rate": 0.00012426750437987072, + "loss": 2.7001, + "step": 12541 + }, + { + "epoch": 1.1362823039115761, + "grad_norm": 0.7466415166854858, + "learning_rate": 0.00012426146317887996, + "loss": 1.2479, + "step": 12542 + }, + { + "epoch": 1.136372902086023, + "grad_norm": 0.9074328541755676, + "learning_rate": 0.00012425542197788922, + "loss": 2.9391, + "step": 12543 + }, + { + "epoch": 1.1364635002604697, + "grad_norm": 0.8571893572807312, + "learning_rate": 0.00012424938077689845, + "loss": 2.6698, + "step": 12544 + }, + { + "epoch": 1.1365540984349165, + "grad_norm": 0.9072585105895996, + "learning_rate": 0.0001242433395759077, + "loss": 2.6985, + "step": 12545 + }, + { + "epoch": 1.1366446966093633, + "grad_norm": 0.8300860524177551, + "learning_rate": 0.00012423729837491692, + "loss": 2.6744, + "step": 12546 + }, + { + "epoch": 1.13673529478381, + "grad_norm": 0.9228166341781616, + "learning_rate": 0.00012423125717392618, + "loss": 2.6752, + "step": 12547 + }, + { + "epoch": 1.1368258929582569, + "grad_norm": 0.8661195039749146, + "learning_rate": 0.00012422521597293542, + "loss": 2.8747, + "step": 12548 + }, + { + "epoch": 1.1369164911327037, + "grad_norm": 0.9035853147506714, + "learning_rate": 0.00012421917477194468, + "loss": 2.934, + "step": 12549 + }, + { + "epoch": 1.1370070893071504, + "grad_norm": 0.8278263211250305, + "learning_rate": 0.0001242131335709539, + "loss": 2.8091, + "step": 12550 + }, + { + "epoch": 1.1370976874815972, + "grad_norm": 0.9854498505592346, + "learning_rate": 0.00012420709236996314, + "loss": 2.9765, + "step": 12551 + }, + { + "epoch": 1.137188285656044, + "grad_norm": 0.9161737561225891, + "learning_rate": 0.0001242010511689724, + "loss": 2.9924, + "step": 12552 + }, + { + "epoch": 1.1372788838304908, + "grad_norm": 0.8490214943885803, + "learning_rate": 0.00012419500996798164, + "loss": 2.6072, + "step": 12553 + }, + { + "epoch": 1.1373694820049376, + "grad_norm": 0.9275083541870117, + "learning_rate": 0.0001241889687669909, + "loss": 2.4315, + "step": 12554 + }, + { + "epoch": 1.1374600801793844, + "grad_norm": 0.9628244638442993, + "learning_rate": 0.0001241829275660001, + "loss": 2.6201, + "step": 12555 + }, + { + "epoch": 1.1375506783538312, + "grad_norm": 0.904163658618927, + "learning_rate": 0.00012417688636500937, + "loss": 3.1569, + "step": 12556 + }, + { + "epoch": 1.137641276528278, + "grad_norm": 0.9412540793418884, + "learning_rate": 0.00012417084516401863, + "loss": 2.5567, + "step": 12557 + }, + { + "epoch": 1.1377318747027247, + "grad_norm": 0.9086275696754456, + "learning_rate": 0.00012416480396302786, + "loss": 2.7586, + "step": 12558 + }, + { + "epoch": 1.1378224728771715, + "grad_norm": 0.8686631321907043, + "learning_rate": 0.0001241587627620371, + "loss": 2.644, + "step": 12559 + }, + { + "epoch": 1.1379130710516183, + "grad_norm": 0.8306268453598022, + "learning_rate": 0.00012415272156104633, + "loss": 2.6818, + "step": 12560 + }, + { + "epoch": 1.138003669226065, + "grad_norm": 0.8819279074668884, + "learning_rate": 0.0001241466803600556, + "loss": 2.6455, + "step": 12561 + }, + { + "epoch": 1.138094267400512, + "grad_norm": 0.8191506862640381, + "learning_rate": 0.00012414063915906483, + "loss": 2.792, + "step": 12562 + }, + { + "epoch": 1.1381848655749587, + "grad_norm": 0.9013175368309021, + "learning_rate": 0.00012413459795807406, + "loss": 2.8014, + "step": 12563 + }, + { + "epoch": 1.1382754637494055, + "grad_norm": 0.9574426412582397, + "learning_rate": 0.00012412855675708332, + "loss": 2.7357, + "step": 12564 + }, + { + "epoch": 1.1383660619238523, + "grad_norm": 0.9493021368980408, + "learning_rate": 0.00012412251555609256, + "loss": 2.86, + "step": 12565 + }, + { + "epoch": 1.138456660098299, + "grad_norm": 0.8927658200263977, + "learning_rate": 0.00012411647435510182, + "loss": 2.4868, + "step": 12566 + }, + { + "epoch": 1.1385472582727458, + "grad_norm": 0.8843849897384644, + "learning_rate": 0.00012411043315411105, + "loss": 2.7022, + "step": 12567 + }, + { + "epoch": 1.1386378564471926, + "grad_norm": 0.8544094562530518, + "learning_rate": 0.00012410439195312029, + "loss": 2.7224, + "step": 12568 + }, + { + "epoch": 1.1387284546216394, + "grad_norm": 0.8099551200866699, + "learning_rate": 0.00012409835075212952, + "loss": 2.7278, + "step": 12569 + }, + { + "epoch": 1.1388190527960862, + "grad_norm": 0.9135647416114807, + "learning_rate": 0.00012409230955113878, + "loss": 2.8578, + "step": 12570 + }, + { + "epoch": 1.138909650970533, + "grad_norm": 0.8263831734657288, + "learning_rate": 0.00012408626835014802, + "loss": 2.2918, + "step": 12571 + }, + { + "epoch": 1.1390002491449798, + "grad_norm": 0.9261571168899536, + "learning_rate": 0.00012408022714915725, + "loss": 2.8164, + "step": 12572 + }, + { + "epoch": 1.1390908473194266, + "grad_norm": 0.8531845211982727, + "learning_rate": 0.0001240741859481665, + "loss": 2.8194, + "step": 12573 + }, + { + "epoch": 1.1391814454938733, + "grad_norm": 0.91274493932724, + "learning_rate": 0.00012406814474717574, + "loss": 2.8336, + "step": 12574 + }, + { + "epoch": 1.1392720436683201, + "grad_norm": 0.8694708347320557, + "learning_rate": 0.000124062103546185, + "loss": 2.7566, + "step": 12575 + }, + { + "epoch": 1.139362641842767, + "grad_norm": 0.8598187565803528, + "learning_rate": 0.0001240560623451942, + "loss": 2.6983, + "step": 12576 + }, + { + "epoch": 1.1394532400172137, + "grad_norm": 0.8914247155189514, + "learning_rate": 0.00012405002114420347, + "loss": 2.7688, + "step": 12577 + }, + { + "epoch": 1.1395438381916605, + "grad_norm": 0.8882326483726501, + "learning_rate": 0.0001240439799432127, + "loss": 2.7084, + "step": 12578 + }, + { + "epoch": 1.1396344363661073, + "grad_norm": 0.9224625825881958, + "learning_rate": 0.00012403793874222197, + "loss": 2.5967, + "step": 12579 + }, + { + "epoch": 1.139725034540554, + "grad_norm": 0.9090381264686584, + "learning_rate": 0.0001240318975412312, + "loss": 3.2018, + "step": 12580 + }, + { + "epoch": 1.1398156327150009, + "grad_norm": 0.9374440312385559, + "learning_rate": 0.00012402585634024044, + "loss": 2.6314, + "step": 12581 + }, + { + "epoch": 1.1399062308894476, + "grad_norm": 0.9458099007606506, + "learning_rate": 0.0001240198151392497, + "loss": 2.6904, + "step": 12582 + }, + { + "epoch": 1.1399968290638944, + "grad_norm": 0.9204431772232056, + "learning_rate": 0.00012401377393825893, + "loss": 2.6175, + "step": 12583 + }, + { + "epoch": 1.1400874272383412, + "grad_norm": 0.8947415351867676, + "learning_rate": 0.00012400773273726817, + "loss": 2.4671, + "step": 12584 + }, + { + "epoch": 1.140178025412788, + "grad_norm": 0.8868973255157471, + "learning_rate": 0.0001240016915362774, + "loss": 2.533, + "step": 12585 + }, + { + "epoch": 1.1402686235872348, + "grad_norm": 0.925072968006134, + "learning_rate": 0.00012399565033528666, + "loss": 2.625, + "step": 12586 + }, + { + "epoch": 1.1403592217616816, + "grad_norm": 0.8684674501419067, + "learning_rate": 0.00012398960913429592, + "loss": 2.5045, + "step": 12587 + }, + { + "epoch": 1.1404498199361284, + "grad_norm": 0.7577322125434875, + "learning_rate": 0.00012398356793330516, + "loss": 2.1097, + "step": 12588 + }, + { + "epoch": 1.1405404181105752, + "grad_norm": 0.8965619206428528, + "learning_rate": 0.0001239775267323144, + "loss": 2.6239, + "step": 12589 + }, + { + "epoch": 1.140631016285022, + "grad_norm": 0.9516655802726746, + "learning_rate": 0.00012397148553132362, + "loss": 2.696, + "step": 12590 + }, + { + "epoch": 1.1407216144594687, + "grad_norm": 0.8821690678596497, + "learning_rate": 0.00012396544433033289, + "loss": 2.8101, + "step": 12591 + }, + { + "epoch": 1.1408122126339155, + "grad_norm": 0.9441303014755249, + "learning_rate": 0.00012395940312934212, + "loss": 2.5512, + "step": 12592 + }, + { + "epoch": 1.1409028108083623, + "grad_norm": 0.8874218463897705, + "learning_rate": 0.00012395336192835135, + "loss": 2.8443, + "step": 12593 + }, + { + "epoch": 1.1409934089828089, + "grad_norm": 0.9217434525489807, + "learning_rate": 0.00012394732072736062, + "loss": 2.6665, + "step": 12594 + }, + { + "epoch": 1.1410840071572559, + "grad_norm": 0.8466724157333374, + "learning_rate": 0.00012394127952636985, + "loss": 2.6977, + "step": 12595 + }, + { + "epoch": 1.1411746053317025, + "grad_norm": 0.9079104661941528, + "learning_rate": 0.0001239352383253791, + "loss": 2.7912, + "step": 12596 + }, + { + "epoch": 1.1412652035061495, + "grad_norm": 0.8193514943122864, + "learning_rate": 0.00012392919712438832, + "loss": 2.8091, + "step": 12597 + }, + { + "epoch": 1.141355801680596, + "grad_norm": 0.9054383039474487, + "learning_rate": 0.00012392315592339758, + "loss": 2.6628, + "step": 12598 + }, + { + "epoch": 1.141446399855043, + "grad_norm": 0.8874165415763855, + "learning_rate": 0.0001239171147224068, + "loss": 2.6131, + "step": 12599 + }, + { + "epoch": 1.1415369980294896, + "grad_norm": 0.888998806476593, + "learning_rate": 0.00012391107352141607, + "loss": 2.6472, + "step": 12600 + }, + { + "epoch": 1.1416275962039366, + "grad_norm": 0.9279017448425293, + "learning_rate": 0.0001239050323204253, + "loss": 2.7223, + "step": 12601 + }, + { + "epoch": 1.1417181943783832, + "grad_norm": 0.892245352268219, + "learning_rate": 0.00012389899111943454, + "loss": 2.7152, + "step": 12602 + }, + { + "epoch": 1.14180879255283, + "grad_norm": 0.9216935038566589, + "learning_rate": 0.0001238929499184438, + "loss": 2.6883, + "step": 12603 + }, + { + "epoch": 1.1418993907272768, + "grad_norm": 0.935867190361023, + "learning_rate": 0.00012388690871745304, + "loss": 2.8497, + "step": 12604 + }, + { + "epoch": 1.1419899889017235, + "grad_norm": 0.7865311503410339, + "learning_rate": 0.00012388086751646227, + "loss": 1.9145, + "step": 12605 + }, + { + "epoch": 1.1420805870761703, + "grad_norm": 0.9305097460746765, + "learning_rate": 0.0001238748263154715, + "loss": 2.7802, + "step": 12606 + }, + { + "epoch": 1.1421711852506171, + "grad_norm": 0.9530693292617798, + "learning_rate": 0.00012386878511448077, + "loss": 2.9885, + "step": 12607 + }, + { + "epoch": 1.142261783425064, + "grad_norm": 0.9192634224891663, + "learning_rate": 0.00012386274391349, + "loss": 2.6344, + "step": 12608 + }, + { + "epoch": 1.1423523815995107, + "grad_norm": 0.790593147277832, + "learning_rate": 0.00012385670271249926, + "loss": 2.1576, + "step": 12609 + }, + { + "epoch": 1.1424429797739575, + "grad_norm": 0.8887783885002136, + "learning_rate": 0.0001238506615115085, + "loss": 2.8623, + "step": 12610 + }, + { + "epoch": 1.1425335779484043, + "grad_norm": 0.8496550917625427, + "learning_rate": 0.00012384462031051773, + "loss": 2.6567, + "step": 12611 + }, + { + "epoch": 1.142624176122851, + "grad_norm": 0.8377938866615295, + "learning_rate": 0.000123838579109527, + "loss": 2.7084, + "step": 12612 + }, + { + "epoch": 1.1427147742972978, + "grad_norm": 0.944718062877655, + "learning_rate": 0.00012383253790853622, + "loss": 2.7851, + "step": 12613 + }, + { + "epoch": 1.1428053724717446, + "grad_norm": 0.9481290578842163, + "learning_rate": 0.00012382649670754546, + "loss": 2.8411, + "step": 12614 + }, + { + "epoch": 1.1428959706461914, + "grad_norm": 0.8497417569160461, + "learning_rate": 0.0001238204555065547, + "loss": 2.4907, + "step": 12615 + }, + { + "epoch": 1.1429865688206382, + "grad_norm": 0.885187566280365, + "learning_rate": 0.00012381441430556395, + "loss": 2.857, + "step": 12616 + }, + { + "epoch": 1.143077166995085, + "grad_norm": 0.8568141460418701, + "learning_rate": 0.00012380837310457322, + "loss": 2.9148, + "step": 12617 + }, + { + "epoch": 1.1431677651695318, + "grad_norm": 0.856342077255249, + "learning_rate": 0.00012380233190358245, + "loss": 2.722, + "step": 12618 + }, + { + "epoch": 1.1432583633439786, + "grad_norm": 0.7928364276885986, + "learning_rate": 0.00012379629070259168, + "loss": 2.0026, + "step": 12619 + }, + { + "epoch": 1.1433489615184254, + "grad_norm": 0.8296191692352295, + "learning_rate": 0.00012379024950160092, + "loss": 2.4103, + "step": 12620 + }, + { + "epoch": 1.1434395596928721, + "grad_norm": 0.8929241299629211, + "learning_rate": 0.00012378420830061018, + "loss": 2.6084, + "step": 12621 + }, + { + "epoch": 1.143530157867319, + "grad_norm": 0.7960309982299805, + "learning_rate": 0.0001237781670996194, + "loss": 2.6425, + "step": 12622 + }, + { + "epoch": 1.1436207560417657, + "grad_norm": 0.8907264471054077, + "learning_rate": 0.00012377212589862865, + "loss": 2.0686, + "step": 12623 + }, + { + "epoch": 1.1437113542162125, + "grad_norm": 0.9200472831726074, + "learning_rate": 0.0001237660846976379, + "loss": 2.5061, + "step": 12624 + }, + { + "epoch": 1.1438019523906593, + "grad_norm": 1.0277918577194214, + "learning_rate": 0.00012376004349664714, + "loss": 2.664, + "step": 12625 + }, + { + "epoch": 1.143892550565106, + "grad_norm": 0.8710311651229858, + "learning_rate": 0.0001237540022956564, + "loss": 2.4743, + "step": 12626 + }, + { + "epoch": 1.1439831487395529, + "grad_norm": 0.9343557953834534, + "learning_rate": 0.0001237479610946656, + "loss": 2.5722, + "step": 12627 + }, + { + "epoch": 1.1440737469139997, + "grad_norm": 0.9089561700820923, + "learning_rate": 0.00012374191989367487, + "loss": 2.795, + "step": 12628 + }, + { + "epoch": 1.1441643450884464, + "grad_norm": 0.9531729817390442, + "learning_rate": 0.0001237358786926841, + "loss": 2.9953, + "step": 12629 + }, + { + "epoch": 1.1442549432628932, + "grad_norm": 0.9043949842453003, + "learning_rate": 0.00012372983749169337, + "loss": 2.8183, + "step": 12630 + }, + { + "epoch": 1.14434554143734, + "grad_norm": 0.8785759806632996, + "learning_rate": 0.0001237237962907026, + "loss": 3.0188, + "step": 12631 + }, + { + "epoch": 1.1444361396117868, + "grad_norm": 0.9033112525939941, + "learning_rate": 0.00012371775508971183, + "loss": 2.4328, + "step": 12632 + }, + { + "epoch": 1.1445267377862336, + "grad_norm": 0.8798127174377441, + "learning_rate": 0.0001237117138887211, + "loss": 2.9459, + "step": 12633 + }, + { + "epoch": 1.1446173359606804, + "grad_norm": 0.873836874961853, + "learning_rate": 0.00012370567268773033, + "loss": 2.9124, + "step": 12634 + }, + { + "epoch": 1.1447079341351272, + "grad_norm": 0.957368791103363, + "learning_rate": 0.00012369963148673956, + "loss": 2.6575, + "step": 12635 + }, + { + "epoch": 1.144798532309574, + "grad_norm": 0.9047587513923645, + "learning_rate": 0.0001236935902857488, + "loss": 2.78, + "step": 12636 + }, + { + "epoch": 1.1448891304840207, + "grad_norm": 0.982448160648346, + "learning_rate": 0.00012368754908475806, + "loss": 2.5131, + "step": 12637 + }, + { + "epoch": 1.1449797286584675, + "grad_norm": 0.8622865676879883, + "learning_rate": 0.0001236815078837673, + "loss": 2.6791, + "step": 12638 + }, + { + "epoch": 1.1450703268329143, + "grad_norm": 0.8994156122207642, + "learning_rate": 0.00012367546668277655, + "loss": 2.5968, + "step": 12639 + }, + { + "epoch": 1.145160925007361, + "grad_norm": 0.8917995691299438, + "learning_rate": 0.0001236694254817858, + "loss": 2.6011, + "step": 12640 + }, + { + "epoch": 1.145251523181808, + "grad_norm": 0.8551607131958008, + "learning_rate": 0.00012366338428079502, + "loss": 2.6961, + "step": 12641 + }, + { + "epoch": 1.1453421213562547, + "grad_norm": 0.756533145904541, + "learning_rate": 0.00012365734307980428, + "loss": 1.9062, + "step": 12642 + }, + { + "epoch": 1.1454327195307015, + "grad_norm": 0.8312620520591736, + "learning_rate": 0.00012365130187881352, + "loss": 2.1668, + "step": 12643 + }, + { + "epoch": 1.1455233177051483, + "grad_norm": 0.9015713930130005, + "learning_rate": 0.00012364526067782275, + "loss": 2.6831, + "step": 12644 + }, + { + "epoch": 1.145613915879595, + "grad_norm": 0.8391938805580139, + "learning_rate": 0.00012363921947683199, + "loss": 2.844, + "step": 12645 + }, + { + "epoch": 1.1457045140540418, + "grad_norm": 0.923026442527771, + "learning_rate": 0.00012363317827584125, + "loss": 2.5985, + "step": 12646 + }, + { + "epoch": 1.1457951122284886, + "grad_norm": 0.7671570777893066, + "learning_rate": 0.0001236271370748505, + "loss": 1.9726, + "step": 12647 + }, + { + "epoch": 1.1458857104029354, + "grad_norm": 0.8598315119743347, + "learning_rate": 0.00012362109587385971, + "loss": 2.6415, + "step": 12648 + }, + { + "epoch": 1.1459763085773822, + "grad_norm": 0.8607006669044495, + "learning_rate": 0.00012361505467286898, + "loss": 2.8927, + "step": 12649 + }, + { + "epoch": 1.146066906751829, + "grad_norm": 0.8212757706642151, + "learning_rate": 0.0001236090134718782, + "loss": 2.648, + "step": 12650 + }, + { + "epoch": 1.1461575049262758, + "grad_norm": 0.9056439995765686, + "learning_rate": 0.00012360297227088747, + "loss": 2.7304, + "step": 12651 + }, + { + "epoch": 1.1462481031007226, + "grad_norm": 0.8284466862678528, + "learning_rate": 0.0001235969310698967, + "loss": 2.5198, + "step": 12652 + }, + { + "epoch": 1.1463387012751693, + "grad_norm": 0.9324164390563965, + "learning_rate": 0.00012359088986890594, + "loss": 2.9631, + "step": 12653 + }, + { + "epoch": 1.1464292994496161, + "grad_norm": 0.8820091485977173, + "learning_rate": 0.0001235848486679152, + "loss": 2.7064, + "step": 12654 + }, + { + "epoch": 1.146519897624063, + "grad_norm": 0.9427751302719116, + "learning_rate": 0.00012357880746692443, + "loss": 2.7817, + "step": 12655 + }, + { + "epoch": 1.1466104957985097, + "grad_norm": 0.7714593410491943, + "learning_rate": 0.00012357276626593367, + "loss": 2.0054, + "step": 12656 + }, + { + "epoch": 1.1467010939729565, + "grad_norm": 0.8724849224090576, + "learning_rate": 0.0001235667250649429, + "loss": 2.7302, + "step": 12657 + }, + { + "epoch": 1.1467916921474033, + "grad_norm": 0.9109467267990112, + "learning_rate": 0.00012356068386395216, + "loss": 2.8096, + "step": 12658 + }, + { + "epoch": 1.14688229032185, + "grad_norm": 0.9971233010292053, + "learning_rate": 0.0001235546426629614, + "loss": 2.5761, + "step": 12659 + }, + { + "epoch": 1.1469728884962969, + "grad_norm": 0.838896632194519, + "learning_rate": 0.00012354860146197066, + "loss": 2.6366, + "step": 12660 + }, + { + "epoch": 1.1470634866707436, + "grad_norm": 0.7939265370368958, + "learning_rate": 0.00012354256026097987, + "loss": 1.9767, + "step": 12661 + }, + { + "epoch": 1.1471540848451904, + "grad_norm": 0.9368454813957214, + "learning_rate": 0.00012353651905998913, + "loss": 2.4295, + "step": 12662 + }, + { + "epoch": 1.1472446830196372, + "grad_norm": 1.0140446424484253, + "learning_rate": 0.0001235304778589984, + "loss": 2.6119, + "step": 12663 + }, + { + "epoch": 1.147335281194084, + "grad_norm": 0.8930496573448181, + "learning_rate": 0.00012352443665800762, + "loss": 2.748, + "step": 12664 + }, + { + "epoch": 1.1474258793685308, + "grad_norm": 0.8554906845092773, + "learning_rate": 0.00012351839545701686, + "loss": 2.9386, + "step": 12665 + }, + { + "epoch": 1.1475164775429776, + "grad_norm": 0.9326232671737671, + "learning_rate": 0.0001235123542560261, + "loss": 2.6878, + "step": 12666 + }, + { + "epoch": 1.1476070757174244, + "grad_norm": 0.914734423160553, + "learning_rate": 0.00012350631305503535, + "loss": 2.7778, + "step": 12667 + }, + { + "epoch": 1.1476976738918712, + "grad_norm": 0.9721859097480774, + "learning_rate": 0.00012350027185404459, + "loss": 2.5083, + "step": 12668 + }, + { + "epoch": 1.147788272066318, + "grad_norm": 0.8851805329322815, + "learning_rate": 0.00012349423065305382, + "loss": 1.9016, + "step": 12669 + }, + { + "epoch": 1.1478788702407647, + "grad_norm": 0.8918609023094177, + "learning_rate": 0.00012348818945206308, + "loss": 2.6374, + "step": 12670 + }, + { + "epoch": 1.1479694684152115, + "grad_norm": 0.7363106608390808, + "learning_rate": 0.00012348214825107231, + "loss": 2.2422, + "step": 12671 + }, + { + "epoch": 1.1480600665896583, + "grad_norm": 0.8791525959968567, + "learning_rate": 0.00012347610705008158, + "loss": 2.7672, + "step": 12672 + }, + { + "epoch": 1.148150664764105, + "grad_norm": 0.8955627083778381, + "learning_rate": 0.0001234700658490908, + "loss": 2.7758, + "step": 12673 + }, + { + "epoch": 1.1482412629385519, + "grad_norm": 0.8753529787063599, + "learning_rate": 0.00012346402464810004, + "loss": 2.9542, + "step": 12674 + }, + { + "epoch": 1.1483318611129985, + "grad_norm": 0.9552964568138123, + "learning_rate": 0.00012345798344710928, + "loss": 2.501, + "step": 12675 + }, + { + "epoch": 1.1484224592874455, + "grad_norm": 0.742247998714447, + "learning_rate": 0.00012345194224611854, + "loss": 1.87, + "step": 12676 + }, + { + "epoch": 1.148513057461892, + "grad_norm": 0.9378374814987183, + "learning_rate": 0.0001234459010451278, + "loss": 2.6366, + "step": 12677 + }, + { + "epoch": 1.148603655636339, + "grad_norm": 0.8708953857421875, + "learning_rate": 0.000123439859844137, + "loss": 2.7263, + "step": 12678 + }, + { + "epoch": 1.1486942538107856, + "grad_norm": 0.866639256477356, + "learning_rate": 0.00012343381864314627, + "loss": 2.491, + "step": 12679 + }, + { + "epoch": 1.1487848519852326, + "grad_norm": 0.7372201681137085, + "learning_rate": 0.0001234277774421555, + "loss": 1.918, + "step": 12680 + }, + { + "epoch": 1.1488754501596792, + "grad_norm": 0.8343491554260254, + "learning_rate": 0.00012342173624116476, + "loss": 2.0206, + "step": 12681 + }, + { + "epoch": 1.1489660483341262, + "grad_norm": 0.8863969445228577, + "learning_rate": 0.00012341569504017397, + "loss": 2.7695, + "step": 12682 + }, + { + "epoch": 1.1490566465085728, + "grad_norm": 0.8663601875305176, + "learning_rate": 0.00012340965383918323, + "loss": 2.8335, + "step": 12683 + }, + { + "epoch": 1.1491472446830195, + "grad_norm": 0.8480104804039001, + "learning_rate": 0.0001234036126381925, + "loss": 2.7391, + "step": 12684 + }, + { + "epoch": 1.1492378428574663, + "grad_norm": 0.9045876264572144, + "learning_rate": 0.00012339757143720173, + "loss": 2.5341, + "step": 12685 + }, + { + "epoch": 1.1493284410319131, + "grad_norm": 0.8566009998321533, + "learning_rate": 0.00012339153023621096, + "loss": 2.7019, + "step": 12686 + }, + { + "epoch": 1.14941903920636, + "grad_norm": 0.8834752440452576, + "learning_rate": 0.0001233854890352202, + "loss": 2.8973, + "step": 12687 + }, + { + "epoch": 1.1495096373808067, + "grad_norm": 0.957255482673645, + "learning_rate": 0.00012337944783422946, + "loss": 2.7076, + "step": 12688 + }, + { + "epoch": 1.1496002355552535, + "grad_norm": 0.9000287055969238, + "learning_rate": 0.0001233734066332387, + "loss": 2.7489, + "step": 12689 + }, + { + "epoch": 1.1496908337297003, + "grad_norm": 0.9287349581718445, + "learning_rate": 0.00012336736543224795, + "loss": 3.0046, + "step": 12690 + }, + { + "epoch": 1.149781431904147, + "grad_norm": 0.9644018411636353, + "learning_rate": 0.00012336132423125716, + "loss": 2.5234, + "step": 12691 + }, + { + "epoch": 1.1498720300785938, + "grad_norm": 0.8267658948898315, + "learning_rate": 0.00012335528303026642, + "loss": 2.5169, + "step": 12692 + }, + { + "epoch": 1.1499626282530406, + "grad_norm": 0.884752631187439, + "learning_rate": 0.00012334924182927568, + "loss": 2.9918, + "step": 12693 + }, + { + "epoch": 1.1500532264274874, + "grad_norm": 0.6455947160720825, + "learning_rate": 0.00012334320062828492, + "loss": 1.4014, + "step": 12694 + }, + { + "epoch": 1.1501438246019342, + "grad_norm": 0.8822214603424072, + "learning_rate": 0.00012333715942729415, + "loss": 2.677, + "step": 12695 + }, + { + "epoch": 1.150234422776381, + "grad_norm": 0.8402459621429443, + "learning_rate": 0.00012333111822630338, + "loss": 2.8277, + "step": 12696 + }, + { + "epoch": 1.1503250209508278, + "grad_norm": 0.9334512948989868, + "learning_rate": 0.00012332507702531264, + "loss": 2.7927, + "step": 12697 + }, + { + "epoch": 1.1504156191252746, + "grad_norm": 0.889146089553833, + "learning_rate": 0.00012331903582432188, + "loss": 2.6854, + "step": 12698 + }, + { + "epoch": 1.1505062172997214, + "grad_norm": 0.8534746766090393, + "learning_rate": 0.0001233129946233311, + "loss": 2.7208, + "step": 12699 + }, + { + "epoch": 1.1505968154741681, + "grad_norm": 0.8902968168258667, + "learning_rate": 0.00012330695342234037, + "loss": 2.5706, + "step": 12700 + }, + { + "epoch": 1.150687413648615, + "grad_norm": 0.8969182372093201, + "learning_rate": 0.0001233009122213496, + "loss": 2.4518, + "step": 12701 + }, + { + "epoch": 1.1507780118230617, + "grad_norm": 0.9149287939071655, + "learning_rate": 0.00012329487102035887, + "loss": 2.6644, + "step": 12702 + }, + { + "epoch": 1.1508686099975085, + "grad_norm": 1.0077625513076782, + "learning_rate": 0.0001232888298193681, + "loss": 2.7599, + "step": 12703 + }, + { + "epoch": 1.1509592081719553, + "grad_norm": 0.8643971681594849, + "learning_rate": 0.00012328278861837734, + "loss": 2.8324, + "step": 12704 + }, + { + "epoch": 1.151049806346402, + "grad_norm": 0.9616618752479553, + "learning_rate": 0.00012327674741738657, + "loss": 2.9378, + "step": 12705 + }, + { + "epoch": 1.1511404045208489, + "grad_norm": 0.8648359179496765, + "learning_rate": 0.00012327070621639583, + "loss": 2.5416, + "step": 12706 + }, + { + "epoch": 1.1512310026952957, + "grad_norm": 0.8871970772743225, + "learning_rate": 0.00012326466501540507, + "loss": 2.5859, + "step": 12707 + }, + { + "epoch": 1.1513216008697424, + "grad_norm": 0.8557793498039246, + "learning_rate": 0.0001232586238144143, + "loss": 2.7853, + "step": 12708 + }, + { + "epoch": 1.1514121990441892, + "grad_norm": 0.9313181638717651, + "learning_rate": 0.00012325258261342356, + "loss": 2.6883, + "step": 12709 + }, + { + "epoch": 1.151502797218636, + "grad_norm": 0.9017868041992188, + "learning_rate": 0.0001232465414124328, + "loss": 2.5236, + "step": 12710 + }, + { + "epoch": 1.1515933953930828, + "grad_norm": 0.8478432297706604, + "learning_rate": 0.00012324050021144206, + "loss": 2.5718, + "step": 12711 + }, + { + "epoch": 1.1516839935675296, + "grad_norm": 0.9508458375930786, + "learning_rate": 0.00012323445901045126, + "loss": 2.5908, + "step": 12712 + }, + { + "epoch": 1.1517745917419764, + "grad_norm": 0.8687713146209717, + "learning_rate": 0.00012322841780946052, + "loss": 2.5556, + "step": 12713 + }, + { + "epoch": 1.1518651899164232, + "grad_norm": 0.9696719646453857, + "learning_rate": 0.00012322237660846979, + "loss": 2.6391, + "step": 12714 + }, + { + "epoch": 1.15195578809087, + "grad_norm": 0.8843938112258911, + "learning_rate": 0.00012321633540747902, + "loss": 2.7938, + "step": 12715 + }, + { + "epoch": 1.1520463862653167, + "grad_norm": 0.896827220916748, + "learning_rate": 0.00012321029420648825, + "loss": 2.6907, + "step": 12716 + }, + { + "epoch": 1.1521369844397635, + "grad_norm": 1.2597718238830566, + "learning_rate": 0.0001232042530054975, + "loss": 2.7109, + "step": 12717 + }, + { + "epoch": 1.1522275826142103, + "grad_norm": 0.8701784610748291, + "learning_rate": 0.00012319821180450675, + "loss": 2.6438, + "step": 12718 + }, + { + "epoch": 1.152318180788657, + "grad_norm": 0.9631979465484619, + "learning_rate": 0.00012319217060351598, + "loss": 2.5558, + "step": 12719 + }, + { + "epoch": 1.152408778963104, + "grad_norm": 0.8805240988731384, + "learning_rate": 0.00012318612940252522, + "loss": 2.6401, + "step": 12720 + }, + { + "epoch": 1.1524993771375507, + "grad_norm": 0.8279509544372559, + "learning_rate": 0.00012318008820153445, + "loss": 2.2027, + "step": 12721 + }, + { + "epoch": 1.1525899753119975, + "grad_norm": 0.8941972255706787, + "learning_rate": 0.0001231740470005437, + "loss": 1.981, + "step": 12722 + }, + { + "epoch": 1.1526805734864443, + "grad_norm": 0.9124099612236023, + "learning_rate": 0.00012316800579955297, + "loss": 2.4277, + "step": 12723 + }, + { + "epoch": 1.152771171660891, + "grad_norm": 0.8872604966163635, + "learning_rate": 0.0001231619645985622, + "loss": 2.7313, + "step": 12724 + }, + { + "epoch": 1.1528617698353378, + "grad_norm": 0.9861807823181152, + "learning_rate": 0.00012315592339757144, + "loss": 2.7351, + "step": 12725 + }, + { + "epoch": 1.1529523680097846, + "grad_norm": 0.9879185557365417, + "learning_rate": 0.00012314988219658068, + "loss": 2.8645, + "step": 12726 + }, + { + "epoch": 1.1530429661842314, + "grad_norm": 0.8463585376739502, + "learning_rate": 0.00012314384099558994, + "loss": 2.5867, + "step": 12727 + }, + { + "epoch": 1.1531335643586782, + "grad_norm": 0.897118091583252, + "learning_rate": 0.00012313779979459917, + "loss": 2.7442, + "step": 12728 + }, + { + "epoch": 1.153224162533125, + "grad_norm": 0.9997624754905701, + "learning_rate": 0.0001231317585936084, + "loss": 2.5331, + "step": 12729 + }, + { + "epoch": 1.1533147607075718, + "grad_norm": 0.8528890013694763, + "learning_rate": 0.00012312571739261767, + "loss": 2.8548, + "step": 12730 + }, + { + "epoch": 1.1534053588820186, + "grad_norm": 0.9020950794219971, + "learning_rate": 0.0001231196761916269, + "loss": 2.7733, + "step": 12731 + }, + { + "epoch": 1.1534959570564653, + "grad_norm": 0.936498761177063, + "learning_rate": 0.00012311363499063616, + "loss": 2.7326, + "step": 12732 + }, + { + "epoch": 1.1535865552309121, + "grad_norm": 1.028444766998291, + "learning_rate": 0.00012310759378964537, + "loss": 2.6529, + "step": 12733 + }, + { + "epoch": 1.153677153405359, + "grad_norm": 0.8775612711906433, + "learning_rate": 0.00012310155258865463, + "loss": 2.7845, + "step": 12734 + }, + { + "epoch": 1.1537677515798057, + "grad_norm": 0.8768860101699829, + "learning_rate": 0.00012309551138766386, + "loss": 2.804, + "step": 12735 + }, + { + "epoch": 1.1538583497542525, + "grad_norm": 0.9093248844146729, + "learning_rate": 0.00012308947018667312, + "loss": 2.7684, + "step": 12736 + }, + { + "epoch": 1.1539489479286993, + "grad_norm": 0.8093129396438599, + "learning_rate": 0.00012308342898568236, + "loss": 2.5717, + "step": 12737 + }, + { + "epoch": 1.154039546103146, + "grad_norm": 0.7540708184242249, + "learning_rate": 0.0001230773877846916, + "loss": 1.9338, + "step": 12738 + }, + { + "epoch": 1.1541301442775929, + "grad_norm": 0.861798882484436, + "learning_rate": 0.00012307134658370085, + "loss": 2.7828, + "step": 12739 + }, + { + "epoch": 1.1542207424520396, + "grad_norm": 0.9526515603065491, + "learning_rate": 0.0001230653053827101, + "loss": 2.7412, + "step": 12740 + }, + { + "epoch": 1.1543113406264864, + "grad_norm": 0.8367854952812195, + "learning_rate": 0.00012305926418171935, + "loss": 2.7241, + "step": 12741 + }, + { + "epoch": 1.1544019388009332, + "grad_norm": 0.8614333271980286, + "learning_rate": 0.00012305322298072856, + "loss": 2.5699, + "step": 12742 + }, + { + "epoch": 1.15449253697538, + "grad_norm": 0.9172155857086182, + "learning_rate": 0.00012304718177973782, + "loss": 2.9839, + "step": 12743 + }, + { + "epoch": 1.1545831351498268, + "grad_norm": 0.890377938747406, + "learning_rate": 0.00012304114057874708, + "loss": 2.5706, + "step": 12744 + }, + { + "epoch": 1.1546737333242736, + "grad_norm": 0.9320825338363647, + "learning_rate": 0.0001230350993777563, + "loss": 3.1311, + "step": 12745 + }, + { + "epoch": 1.1547643314987204, + "grad_norm": 0.8643141984939575, + "learning_rate": 0.00012302905817676555, + "loss": 2.7009, + "step": 12746 + }, + { + "epoch": 1.1548549296731672, + "grad_norm": 0.8506277799606323, + "learning_rate": 0.00012302301697577478, + "loss": 2.4169, + "step": 12747 + }, + { + "epoch": 1.154945527847614, + "grad_norm": 0.8992213010787964, + "learning_rate": 0.00012301697577478404, + "loss": 2.6996, + "step": 12748 + }, + { + "epoch": 1.1550361260220607, + "grad_norm": 0.9389604926109314, + "learning_rate": 0.00012301093457379328, + "loss": 2.7764, + "step": 12749 + }, + { + "epoch": 1.1551267241965075, + "grad_norm": 0.8812352418899536, + "learning_rate": 0.0001230048933728025, + "loss": 2.7484, + "step": 12750 + }, + { + "epoch": 1.1552173223709543, + "grad_norm": 0.9477089047431946, + "learning_rate": 0.00012299885217181174, + "loss": 2.8576, + "step": 12751 + }, + { + "epoch": 1.155307920545401, + "grad_norm": 0.9167368412017822, + "learning_rate": 0.000122992810970821, + "loss": 2.9011, + "step": 12752 + }, + { + "epoch": 1.155398518719848, + "grad_norm": 0.802085280418396, + "learning_rate": 0.00012298676976983027, + "loss": 2.1166, + "step": 12753 + }, + { + "epoch": 1.1554891168942947, + "grad_norm": 0.9319215416908264, + "learning_rate": 0.0001229807285688395, + "loss": 2.8916, + "step": 12754 + }, + { + "epoch": 1.1555797150687415, + "grad_norm": 0.8834481239318848, + "learning_rate": 0.00012297468736784873, + "loss": 2.7177, + "step": 12755 + }, + { + "epoch": 1.155670313243188, + "grad_norm": 0.8705990314483643, + "learning_rate": 0.00012296864616685797, + "loss": 2.6384, + "step": 12756 + }, + { + "epoch": 1.155760911417635, + "grad_norm": 0.9975316524505615, + "learning_rate": 0.00012296260496586723, + "loss": 2.5685, + "step": 12757 + }, + { + "epoch": 1.1558515095920816, + "grad_norm": 0.8961487412452698, + "learning_rate": 0.00012295656376487646, + "loss": 2.8602, + "step": 12758 + }, + { + "epoch": 1.1559421077665286, + "grad_norm": 0.7811280488967896, + "learning_rate": 0.0001229505225638857, + "loss": 2.0222, + "step": 12759 + }, + { + "epoch": 1.1560327059409752, + "grad_norm": 0.8908206820487976, + "learning_rate": 0.00012294448136289496, + "loss": 2.679, + "step": 12760 + }, + { + "epoch": 1.1561233041154222, + "grad_norm": 0.9588161110877991, + "learning_rate": 0.0001229384401619042, + "loss": 2.8686, + "step": 12761 + }, + { + "epoch": 1.1562139022898688, + "grad_norm": 0.771096408367157, + "learning_rate": 0.00012293239896091345, + "loss": 1.9041, + "step": 12762 + }, + { + "epoch": 1.1563045004643158, + "grad_norm": 0.8823387026786804, + "learning_rate": 0.00012292635775992266, + "loss": 2.8935, + "step": 12763 + }, + { + "epoch": 1.1563950986387623, + "grad_norm": 0.8898478150367737, + "learning_rate": 0.00012292031655893192, + "loss": 2.6823, + "step": 12764 + }, + { + "epoch": 1.1564856968132091, + "grad_norm": 1.002761960029602, + "learning_rate": 0.00012291427535794116, + "loss": 2.7322, + "step": 12765 + }, + { + "epoch": 1.156576294987656, + "grad_norm": 0.8079270720481873, + "learning_rate": 0.00012290823415695042, + "loss": 1.8738, + "step": 12766 + }, + { + "epoch": 1.1566668931621027, + "grad_norm": 0.9379352927207947, + "learning_rate": 0.00012290219295595965, + "loss": 2.7709, + "step": 12767 + }, + { + "epoch": 1.1567574913365495, + "grad_norm": 0.8655785918235779, + "learning_rate": 0.00012289615175496889, + "loss": 2.7826, + "step": 12768 + }, + { + "epoch": 1.1568480895109963, + "grad_norm": 0.862501859664917, + "learning_rate": 0.00012289011055397815, + "loss": 2.7632, + "step": 12769 + }, + { + "epoch": 1.156938687685443, + "grad_norm": 0.9274907112121582, + "learning_rate": 0.00012288406935298738, + "loss": 2.7691, + "step": 12770 + }, + { + "epoch": 1.1570292858598898, + "grad_norm": 0.8393060564994812, + "learning_rate": 0.00012287802815199661, + "loss": 2.5955, + "step": 12771 + }, + { + "epoch": 1.1571198840343366, + "grad_norm": 0.8868077993392944, + "learning_rate": 0.00012287198695100585, + "loss": 2.8241, + "step": 12772 + }, + { + "epoch": 1.1572104822087834, + "grad_norm": 1.022544264793396, + "learning_rate": 0.0001228659457500151, + "loss": 2.9369, + "step": 12773 + }, + { + "epoch": 1.1573010803832302, + "grad_norm": 0.8347573280334473, + "learning_rate": 0.00012285990454902437, + "loss": 2.6433, + "step": 12774 + }, + { + "epoch": 1.157391678557677, + "grad_norm": 0.8622145056724548, + "learning_rate": 0.0001228538633480336, + "loss": 2.8126, + "step": 12775 + }, + { + "epoch": 1.1574822767321238, + "grad_norm": 0.9604334831237793, + "learning_rate": 0.00012284782214704284, + "loss": 2.9999, + "step": 12776 + }, + { + "epoch": 1.1575728749065706, + "grad_norm": 0.8643031120300293, + "learning_rate": 0.00012284178094605207, + "loss": 2.6226, + "step": 12777 + }, + { + "epoch": 1.1576634730810174, + "grad_norm": 0.8577869534492493, + "learning_rate": 0.00012283573974506133, + "loss": 2.6276, + "step": 12778 + }, + { + "epoch": 1.1577540712554641, + "grad_norm": 0.7670577764511108, + "learning_rate": 0.00012282969854407057, + "loss": 2.3419, + "step": 12779 + }, + { + "epoch": 1.157844669429911, + "grad_norm": 0.8362120389938354, + "learning_rate": 0.0001228236573430798, + "loss": 2.45, + "step": 12780 + }, + { + "epoch": 1.1579352676043577, + "grad_norm": 0.8468507528305054, + "learning_rate": 0.00012281761614208904, + "loss": 2.6827, + "step": 12781 + }, + { + "epoch": 1.1580258657788045, + "grad_norm": 0.8881421685218811, + "learning_rate": 0.0001228115749410983, + "loss": 2.8461, + "step": 12782 + }, + { + "epoch": 1.1581164639532513, + "grad_norm": 0.8778554201126099, + "learning_rate": 0.00012280553374010756, + "loss": 2.781, + "step": 12783 + }, + { + "epoch": 1.158207062127698, + "grad_norm": 0.8973779082298279, + "learning_rate": 0.00012279949253911677, + "loss": 2.8711, + "step": 12784 + }, + { + "epoch": 1.1582976603021449, + "grad_norm": 0.9958505630493164, + "learning_rate": 0.00012279345133812603, + "loss": 2.666, + "step": 12785 + }, + { + "epoch": 1.1583882584765917, + "grad_norm": 0.8767656683921814, + "learning_rate": 0.00012278741013713526, + "loss": 2.4562, + "step": 12786 + }, + { + "epoch": 1.1584788566510384, + "grad_norm": 0.8730684518814087, + "learning_rate": 0.00012278136893614452, + "loss": 3.0796, + "step": 12787 + }, + { + "epoch": 1.1585694548254852, + "grad_norm": 0.8373352289199829, + "learning_rate": 0.00012277532773515376, + "loss": 2.6619, + "step": 12788 + }, + { + "epoch": 1.158660052999932, + "grad_norm": 0.9299280047416687, + "learning_rate": 0.000122769286534163, + "loss": 2.7105, + "step": 12789 + }, + { + "epoch": 1.1587506511743788, + "grad_norm": 0.9271764159202576, + "learning_rate": 0.00012276324533317225, + "loss": 2.6954, + "step": 12790 + }, + { + "epoch": 1.1588412493488256, + "grad_norm": 0.858744740486145, + "learning_rate": 0.00012275720413218149, + "loss": 2.9763, + "step": 12791 + }, + { + "epoch": 1.1589318475232724, + "grad_norm": 0.8900680541992188, + "learning_rate": 0.00012275116293119072, + "loss": 2.7528, + "step": 12792 + }, + { + "epoch": 1.1590224456977192, + "grad_norm": 0.873640775680542, + "learning_rate": 0.00012274512173019995, + "loss": 2.9395, + "step": 12793 + }, + { + "epoch": 1.159113043872166, + "grad_norm": 0.9455875158309937, + "learning_rate": 0.00012273908052920921, + "loss": 2.6011, + "step": 12794 + }, + { + "epoch": 1.1592036420466127, + "grad_norm": 0.9144165515899658, + "learning_rate": 0.00012273303932821845, + "loss": 2.7437, + "step": 12795 + }, + { + "epoch": 1.1592942402210595, + "grad_norm": 0.8622444868087769, + "learning_rate": 0.0001227269981272277, + "loss": 2.5598, + "step": 12796 + }, + { + "epoch": 1.1593848383955063, + "grad_norm": 0.9321821928024292, + "learning_rate": 0.00012272095692623694, + "loss": 2.4822, + "step": 12797 + }, + { + "epoch": 1.159475436569953, + "grad_norm": 0.9232690334320068, + "learning_rate": 0.00012271491572524618, + "loss": 2.9171, + "step": 12798 + }, + { + "epoch": 1.1595660347444, + "grad_norm": 0.911091685295105, + "learning_rate": 0.00012270887452425544, + "loss": 2.5793, + "step": 12799 + }, + { + "epoch": 1.1596566329188467, + "grad_norm": 0.8132466077804565, + "learning_rate": 0.00012270283332326467, + "loss": 2.1536, + "step": 12800 + }, + { + "epoch": 1.1597472310932935, + "grad_norm": 0.8950716257095337, + "learning_rate": 0.0001226967921222739, + "loss": 2.8323, + "step": 12801 + }, + { + "epoch": 1.1598378292677403, + "grad_norm": 1.0213109254837036, + "learning_rate": 0.00012269075092128314, + "loss": 2.3884, + "step": 12802 + }, + { + "epoch": 1.159928427442187, + "grad_norm": 0.8711835741996765, + "learning_rate": 0.0001226847097202924, + "loss": 2.8193, + "step": 12803 + }, + { + "epoch": 1.1600190256166338, + "grad_norm": 0.9701612591743469, + "learning_rate": 0.00012267866851930166, + "loss": 2.6252, + "step": 12804 + }, + { + "epoch": 1.1601096237910806, + "grad_norm": 0.8472191095352173, + "learning_rate": 0.0001226726273183109, + "loss": 2.6309, + "step": 12805 + }, + { + "epoch": 1.1602002219655274, + "grad_norm": 0.8711873292922974, + "learning_rate": 0.00012266658611732013, + "loss": 2.0346, + "step": 12806 + }, + { + "epoch": 1.1602908201399742, + "grad_norm": 0.9937304854393005, + "learning_rate": 0.00012266054491632937, + "loss": 2.5541, + "step": 12807 + }, + { + "epoch": 1.160381418314421, + "grad_norm": 0.769597589969635, + "learning_rate": 0.00012265450371533863, + "loss": 1.944, + "step": 12808 + }, + { + "epoch": 1.1604720164888678, + "grad_norm": 0.9567649364471436, + "learning_rate": 0.00012264846251434786, + "loss": 2.7203, + "step": 12809 + }, + { + "epoch": 1.1605626146633146, + "grad_norm": 0.8679679036140442, + "learning_rate": 0.0001226424213133571, + "loss": 2.8661, + "step": 12810 + }, + { + "epoch": 1.1606532128377613, + "grad_norm": 0.9255204796791077, + "learning_rate": 0.00012263638011236633, + "loss": 2.739, + "step": 12811 + }, + { + "epoch": 1.1607438110122081, + "grad_norm": 0.8347921371459961, + "learning_rate": 0.0001226303389113756, + "loss": 2.5432, + "step": 12812 + }, + { + "epoch": 1.160834409186655, + "grad_norm": 0.9158215522766113, + "learning_rate": 0.00012262429771038485, + "loss": 2.6302, + "step": 12813 + }, + { + "epoch": 1.1609250073611017, + "grad_norm": 0.8900094032287598, + "learning_rate": 0.00012261825650939406, + "loss": 2.6021, + "step": 12814 + }, + { + "epoch": 1.1610156055355485, + "grad_norm": 0.9213624596595764, + "learning_rate": 0.00012261221530840332, + "loss": 2.7691, + "step": 12815 + }, + { + "epoch": 1.1611062037099953, + "grad_norm": 0.9127732515335083, + "learning_rate": 0.00012260617410741255, + "loss": 2.1851, + "step": 12816 + }, + { + "epoch": 1.161196801884442, + "grad_norm": 0.8030425310134888, + "learning_rate": 0.00012260013290642181, + "loss": 2.2365, + "step": 12817 + }, + { + "epoch": 1.1612874000588889, + "grad_norm": 0.8596526384353638, + "learning_rate": 0.00012259409170543105, + "loss": 2.4951, + "step": 12818 + }, + { + "epoch": 1.1613779982333357, + "grad_norm": 0.9314450621604919, + "learning_rate": 0.00012258805050444028, + "loss": 2.4968, + "step": 12819 + }, + { + "epoch": 1.1614685964077824, + "grad_norm": 0.8484921455383301, + "learning_rate": 0.00012258200930344954, + "loss": 2.952, + "step": 12820 + }, + { + "epoch": 1.1615591945822292, + "grad_norm": 0.8565245866775513, + "learning_rate": 0.00012257596810245878, + "loss": 2.4689, + "step": 12821 + }, + { + "epoch": 1.161649792756676, + "grad_norm": 0.8459891676902771, + "learning_rate": 0.000122569926901468, + "loss": 2.5927, + "step": 12822 + }, + { + "epoch": 1.1617403909311228, + "grad_norm": 0.8907766342163086, + "learning_rate": 0.00012256388570047725, + "loss": 2.897, + "step": 12823 + }, + { + "epoch": 1.1618309891055696, + "grad_norm": 0.7980573773384094, + "learning_rate": 0.0001225578444994865, + "loss": 2.0528, + "step": 12824 + }, + { + "epoch": 1.1619215872800164, + "grad_norm": 0.9142286777496338, + "learning_rate": 0.00012255180329849574, + "loss": 3.1628, + "step": 12825 + }, + { + "epoch": 1.1620121854544632, + "grad_norm": 0.8531206846237183, + "learning_rate": 0.000122545762097505, + "loss": 2.7401, + "step": 12826 + }, + { + "epoch": 1.16210278362891, + "grad_norm": 0.849297046661377, + "learning_rate": 0.00012253972089651424, + "loss": 2.5975, + "step": 12827 + }, + { + "epoch": 1.1621933818033567, + "grad_norm": 0.8599570989608765, + "learning_rate": 0.00012253367969552347, + "loss": 2.6499, + "step": 12828 + }, + { + "epoch": 1.1622839799778035, + "grad_norm": 0.9090127348899841, + "learning_rate": 0.00012252763849453273, + "loss": 2.6503, + "step": 12829 + }, + { + "epoch": 1.1623745781522503, + "grad_norm": 0.8751391172409058, + "learning_rate": 0.00012252159729354197, + "loss": 2.9242, + "step": 12830 + }, + { + "epoch": 1.162465176326697, + "grad_norm": 0.8864240646362305, + "learning_rate": 0.0001225155560925512, + "loss": 2.7455, + "step": 12831 + }, + { + "epoch": 1.162555774501144, + "grad_norm": 0.9185150265693665, + "learning_rate": 0.00012250951489156043, + "loss": 2.7207, + "step": 12832 + }, + { + "epoch": 1.1626463726755907, + "grad_norm": 0.843814492225647, + "learning_rate": 0.0001225034736905697, + "loss": 2.6967, + "step": 12833 + }, + { + "epoch": 1.1627369708500375, + "grad_norm": 0.9412364363670349, + "learning_rate": 0.00012249743248957896, + "loss": 2.7655, + "step": 12834 + }, + { + "epoch": 1.1628275690244843, + "grad_norm": 0.968952476978302, + "learning_rate": 0.00012249139128858816, + "loss": 3.1, + "step": 12835 + }, + { + "epoch": 1.162918167198931, + "grad_norm": 0.8609442710876465, + "learning_rate": 0.00012248535008759742, + "loss": 2.5665, + "step": 12836 + }, + { + "epoch": 1.1630087653733776, + "grad_norm": 0.9024821519851685, + "learning_rate": 0.00012247930888660666, + "loss": 2.7884, + "step": 12837 + }, + { + "epoch": 1.1630993635478246, + "grad_norm": 0.9620121717453003, + "learning_rate": 0.00012247326768561592, + "loss": 2.6832, + "step": 12838 + }, + { + "epoch": 1.1631899617222712, + "grad_norm": 0.9081199169158936, + "learning_rate": 0.00012246722648462515, + "loss": 2.8622, + "step": 12839 + }, + { + "epoch": 1.1632805598967182, + "grad_norm": 0.9312058687210083, + "learning_rate": 0.0001224611852836344, + "loss": 2.6271, + "step": 12840 + }, + { + "epoch": 1.1633711580711648, + "grad_norm": 0.8962268829345703, + "learning_rate": 0.00012245514408264362, + "loss": 3.0194, + "step": 12841 + }, + { + "epoch": 1.1634617562456118, + "grad_norm": 0.9227458834648132, + "learning_rate": 0.00012244910288165288, + "loss": 2.6214, + "step": 12842 + }, + { + "epoch": 1.1635523544200583, + "grad_norm": 0.7361331582069397, + "learning_rate": 0.00012244306168066212, + "loss": 2.1238, + "step": 12843 + }, + { + "epoch": 1.1636429525945053, + "grad_norm": 0.8402937054634094, + "learning_rate": 0.00012243702047967135, + "loss": 2.8323, + "step": 12844 + }, + { + "epoch": 1.163733550768952, + "grad_norm": 0.8551919460296631, + "learning_rate": 0.0001224309792786806, + "loss": 2.8146, + "step": 12845 + }, + { + "epoch": 1.1638241489433987, + "grad_norm": 0.9112148284912109, + "learning_rate": 0.00012242493807768985, + "loss": 2.4091, + "step": 12846 + }, + { + "epoch": 1.1639147471178455, + "grad_norm": 1.0284186601638794, + "learning_rate": 0.0001224188968766991, + "loss": 2.8095, + "step": 12847 + }, + { + "epoch": 1.1640053452922923, + "grad_norm": 0.9072021842002869, + "learning_rate": 0.00012241285567570831, + "loss": 3.004, + "step": 12848 + }, + { + "epoch": 1.164095943466739, + "grad_norm": 0.855212926864624, + "learning_rate": 0.00012240681447471758, + "loss": 2.4993, + "step": 12849 + }, + { + "epoch": 1.1641865416411858, + "grad_norm": 0.9710696339607239, + "learning_rate": 0.00012240077327372684, + "loss": 2.8517, + "step": 12850 + }, + { + "epoch": 1.1642771398156326, + "grad_norm": 0.8792162537574768, + "learning_rate": 0.00012239473207273607, + "loss": 2.8609, + "step": 12851 + }, + { + "epoch": 1.1643677379900794, + "grad_norm": 0.8770850896835327, + "learning_rate": 0.0001223886908717453, + "loss": 2.7463, + "step": 12852 + }, + { + "epoch": 1.1644583361645262, + "grad_norm": 0.8892356753349304, + "learning_rate": 0.00012238264967075454, + "loss": 2.8173, + "step": 12853 + }, + { + "epoch": 1.164548934338973, + "grad_norm": 0.896256685256958, + "learning_rate": 0.0001223766084697638, + "loss": 2.7725, + "step": 12854 + }, + { + "epoch": 1.1646395325134198, + "grad_norm": 0.8747243285179138, + "learning_rate": 0.00012237056726877303, + "loss": 2.6376, + "step": 12855 + }, + { + "epoch": 1.1647301306878666, + "grad_norm": 0.8558664917945862, + "learning_rate": 0.00012236452606778227, + "loss": 2.6917, + "step": 12856 + }, + { + "epoch": 1.1648207288623134, + "grad_norm": 0.9142611622810364, + "learning_rate": 0.00012235848486679153, + "loss": 2.6773, + "step": 12857 + }, + { + "epoch": 1.1649113270367601, + "grad_norm": 0.9398519396781921, + "learning_rate": 0.00012235244366580076, + "loss": 2.7003, + "step": 12858 + }, + { + "epoch": 1.165001925211207, + "grad_norm": 0.8646072149276733, + "learning_rate": 0.00012234640246481002, + "loss": 2.5771, + "step": 12859 + }, + { + "epoch": 1.1650925233856537, + "grad_norm": 0.9028806090354919, + "learning_rate": 0.00012234036126381926, + "loss": 2.724, + "step": 12860 + }, + { + "epoch": 1.1651831215601005, + "grad_norm": 0.9132232666015625, + "learning_rate": 0.0001223343200628285, + "loss": 2.8019, + "step": 12861 + }, + { + "epoch": 1.1652737197345473, + "grad_norm": 0.9532009363174438, + "learning_rate": 0.00012232827886183773, + "loss": 2.9474, + "step": 12862 + }, + { + "epoch": 1.165364317908994, + "grad_norm": 0.8897185325622559, + "learning_rate": 0.000122322237660847, + "loss": 2.8066, + "step": 12863 + }, + { + "epoch": 1.1654549160834409, + "grad_norm": 0.9398787617683411, + "learning_rate": 0.00012231619645985625, + "loss": 2.6165, + "step": 12864 + }, + { + "epoch": 1.1655455142578877, + "grad_norm": 0.8578335046768188, + "learning_rate": 0.00012231015525886546, + "loss": 2.8087, + "step": 12865 + }, + { + "epoch": 1.1656361124323344, + "grad_norm": 0.8237757682800293, + "learning_rate": 0.00012230411405787472, + "loss": 2.5924, + "step": 12866 + }, + { + "epoch": 1.1657267106067812, + "grad_norm": 1.2362008094787598, + "learning_rate": 0.00012229807285688395, + "loss": 2.7132, + "step": 12867 + }, + { + "epoch": 1.165817308781228, + "grad_norm": 0.9220950603485107, + "learning_rate": 0.0001222920316558932, + "loss": 2.5839, + "step": 12868 + }, + { + "epoch": 1.1659079069556748, + "grad_norm": 0.8298407196998596, + "learning_rate": 0.00012228599045490242, + "loss": 2.619, + "step": 12869 + }, + { + "epoch": 1.1659985051301216, + "grad_norm": 0.7353097200393677, + "learning_rate": 0.00012227994925391168, + "loss": 2.0954, + "step": 12870 + }, + { + "epoch": 1.1660891033045684, + "grad_norm": 0.9698166847229004, + "learning_rate": 0.00012227390805292091, + "loss": 2.928, + "step": 12871 + }, + { + "epoch": 1.1661797014790152, + "grad_norm": 1.033394694328308, + "learning_rate": 0.00012226786685193018, + "loss": 2.6629, + "step": 12872 + }, + { + "epoch": 1.166270299653462, + "grad_norm": 0.9094239473342896, + "learning_rate": 0.0001222618256509394, + "loss": 2.7132, + "step": 12873 + }, + { + "epoch": 1.1663608978279087, + "grad_norm": 0.8128738403320312, + "learning_rate": 0.00012225578444994864, + "loss": 2.2095, + "step": 12874 + }, + { + "epoch": 1.1664514960023555, + "grad_norm": 1.0119949579238892, + "learning_rate": 0.0001222497432489579, + "loss": 2.6412, + "step": 12875 + }, + { + "epoch": 1.1665420941768023, + "grad_norm": 0.9446651935577393, + "learning_rate": 0.00012224370204796714, + "loss": 2.7938, + "step": 12876 + }, + { + "epoch": 1.1666326923512491, + "grad_norm": 0.905019998550415, + "learning_rate": 0.0001222376608469764, + "loss": 2.8921, + "step": 12877 + }, + { + "epoch": 1.166723290525696, + "grad_norm": 0.9107064008712769, + "learning_rate": 0.0001222316196459856, + "loss": 2.7029, + "step": 12878 + }, + { + "epoch": 1.1668138887001427, + "grad_norm": 0.9093142151832581, + "learning_rate": 0.00012222557844499487, + "loss": 2.8037, + "step": 12879 + }, + { + "epoch": 1.1669044868745895, + "grad_norm": 0.8606407642364502, + "learning_rate": 0.00012221953724400413, + "loss": 2.7073, + "step": 12880 + }, + { + "epoch": 1.1669950850490363, + "grad_norm": 0.9115155339241028, + "learning_rate": 0.00012221349604301336, + "loss": 2.7597, + "step": 12881 + }, + { + "epoch": 1.167085683223483, + "grad_norm": 0.8542153835296631, + "learning_rate": 0.0001222074548420226, + "loss": 2.5609, + "step": 12882 + }, + { + "epoch": 1.1671762813979298, + "grad_norm": 1.0093681812286377, + "learning_rate": 0.00012220141364103183, + "loss": 2.8625, + "step": 12883 + }, + { + "epoch": 1.1672668795723766, + "grad_norm": 0.9002559185028076, + "learning_rate": 0.0001221953724400411, + "loss": 2.8916, + "step": 12884 + }, + { + "epoch": 1.1673574777468234, + "grad_norm": 0.9413056969642639, + "learning_rate": 0.00012218933123905033, + "loss": 2.5761, + "step": 12885 + }, + { + "epoch": 1.1674480759212702, + "grad_norm": 0.902449905872345, + "learning_rate": 0.00012218329003805956, + "loss": 2.8062, + "step": 12886 + }, + { + "epoch": 1.167538674095717, + "grad_norm": 0.8735504150390625, + "learning_rate": 0.00012217724883706882, + "loss": 2.6671, + "step": 12887 + }, + { + "epoch": 1.1676292722701638, + "grad_norm": 0.9967604875564575, + "learning_rate": 0.00012217120763607806, + "loss": 2.9666, + "step": 12888 + }, + { + "epoch": 1.1677198704446106, + "grad_norm": 0.8294869065284729, + "learning_rate": 0.00012216516643508732, + "loss": 2.2953, + "step": 12889 + }, + { + "epoch": 1.1678104686190574, + "grad_norm": 0.8434504270553589, + "learning_rate": 0.00012215912523409655, + "loss": 2.7328, + "step": 12890 + }, + { + "epoch": 1.1679010667935041, + "grad_norm": 0.8965601325035095, + "learning_rate": 0.00012215308403310579, + "loss": 2.2288, + "step": 12891 + }, + { + "epoch": 1.167991664967951, + "grad_norm": 0.8963363170623779, + "learning_rate": 0.00012214704283211502, + "loss": 2.6121, + "step": 12892 + }, + { + "epoch": 1.1680822631423977, + "grad_norm": 0.8747273683547974, + "learning_rate": 0.00012214100163112428, + "loss": 2.8991, + "step": 12893 + }, + { + "epoch": 1.1681728613168445, + "grad_norm": 0.962726354598999, + "learning_rate": 0.00012213496043013351, + "loss": 2.5064, + "step": 12894 + }, + { + "epoch": 1.1682634594912913, + "grad_norm": 0.9826059937477112, + "learning_rate": 0.00012212891922914275, + "loss": 2.6382, + "step": 12895 + }, + { + "epoch": 1.168354057665738, + "grad_norm": 1.1204895973205566, + "learning_rate": 0.000122122878028152, + "loss": 2.7152, + "step": 12896 + }, + { + "epoch": 1.1684446558401849, + "grad_norm": 0.9132033586502075, + "learning_rate": 0.00012211683682716124, + "loss": 2.5308, + "step": 12897 + }, + { + "epoch": 1.1685352540146317, + "grad_norm": 0.8421321511268616, + "learning_rate": 0.0001221107956261705, + "loss": 2.6782, + "step": 12898 + }, + { + "epoch": 1.1686258521890784, + "grad_norm": 1.114262580871582, + "learning_rate": 0.0001221047544251797, + "loss": 2.9221, + "step": 12899 + }, + { + "epoch": 1.1687164503635252, + "grad_norm": 0.8554218411445618, + "learning_rate": 0.00012209871322418897, + "loss": 2.5353, + "step": 12900 + }, + { + "epoch": 1.168807048537972, + "grad_norm": 0.7703717947006226, + "learning_rate": 0.0001220926720231982, + "loss": 1.3434, + "step": 12901 + }, + { + "epoch": 1.1688976467124188, + "grad_norm": 0.7988015413284302, + "learning_rate": 0.00012208663082220747, + "loss": 2.5273, + "step": 12902 + }, + { + "epoch": 1.1689882448868656, + "grad_norm": 0.9733070731163025, + "learning_rate": 0.0001220805896212167, + "loss": 2.5333, + "step": 12903 + }, + { + "epoch": 1.1690788430613124, + "grad_norm": 0.9082719087600708, + "learning_rate": 0.00012207454842022594, + "loss": 2.8796, + "step": 12904 + }, + { + "epoch": 1.1691694412357592, + "grad_norm": 0.9239591956138611, + "learning_rate": 0.0001220685072192352, + "loss": 2.9089, + "step": 12905 + }, + { + "epoch": 1.169260039410206, + "grad_norm": 0.8351925611495972, + "learning_rate": 0.00012206246601824443, + "loss": 2.579, + "step": 12906 + }, + { + "epoch": 1.1693506375846527, + "grad_norm": 0.8524648547172546, + "learning_rate": 0.00012205642481725368, + "loss": 2.5786, + "step": 12907 + }, + { + "epoch": 1.1694412357590995, + "grad_norm": 0.8659688830375671, + "learning_rate": 0.00012205038361626291, + "loss": 2.5156, + "step": 12908 + }, + { + "epoch": 1.1695318339335463, + "grad_norm": 0.9635900259017944, + "learning_rate": 0.00012204434241527216, + "loss": 2.8615, + "step": 12909 + }, + { + "epoch": 1.169622432107993, + "grad_norm": 0.8079934120178223, + "learning_rate": 0.00012203830121428141, + "loss": 2.0184, + "step": 12910 + }, + { + "epoch": 1.16971303028244, + "grad_norm": 0.8835522532463074, + "learning_rate": 0.00012203226001329064, + "loss": 2.6083, + "step": 12911 + }, + { + "epoch": 1.1698036284568867, + "grad_norm": 0.9623753428459167, + "learning_rate": 0.0001220262188122999, + "loss": 2.6509, + "step": 12912 + }, + { + "epoch": 1.1698942266313335, + "grad_norm": 0.8704296946525574, + "learning_rate": 0.00012202017761130912, + "loss": 2.4791, + "step": 12913 + }, + { + "epoch": 1.1699848248057803, + "grad_norm": 0.8497985601425171, + "learning_rate": 0.00012201413641031839, + "loss": 2.0049, + "step": 12914 + }, + { + "epoch": 1.170075422980227, + "grad_norm": 0.8156812191009521, + "learning_rate": 0.0001220080952093276, + "loss": 1.9678, + "step": 12915 + }, + { + "epoch": 1.1701660211546738, + "grad_norm": 0.8458603024482727, + "learning_rate": 0.00012200205400833687, + "loss": 2.6159, + "step": 12916 + }, + { + "epoch": 1.1702566193291206, + "grad_norm": 0.9220245480537415, + "learning_rate": 0.00012199601280734611, + "loss": 2.7464, + "step": 12917 + }, + { + "epoch": 1.1703472175035672, + "grad_norm": 0.8529415130615234, + "learning_rate": 0.00012198997160635535, + "loss": 2.6752, + "step": 12918 + }, + { + "epoch": 1.1704378156780142, + "grad_norm": 0.9349474906921387, + "learning_rate": 0.0001219839304053646, + "loss": 2.6563, + "step": 12919 + }, + { + "epoch": 1.1705284138524608, + "grad_norm": 0.8528047800064087, + "learning_rate": 0.00012197788920437383, + "loss": 2.734, + "step": 12920 + }, + { + "epoch": 1.1706190120269078, + "grad_norm": 0.8682864904403687, + "learning_rate": 0.00012197184800338308, + "loss": 2.7406, + "step": 12921 + }, + { + "epoch": 1.1707096102013543, + "grad_norm": 0.9291223883628845, + "learning_rate": 0.00012196580680239231, + "loss": 2.9352, + "step": 12922 + }, + { + "epoch": 1.1708002083758013, + "grad_norm": 0.8791137337684631, + "learning_rate": 0.00012195976560140156, + "loss": 2.492, + "step": 12923 + }, + { + "epoch": 1.170890806550248, + "grad_norm": 0.778171181678772, + "learning_rate": 0.00012195372440041082, + "loss": 1.9941, + "step": 12924 + }, + { + "epoch": 1.170981404724695, + "grad_norm": 0.8866336941719055, + "learning_rate": 0.00012194768319942005, + "loss": 2.9115, + "step": 12925 + }, + { + "epoch": 1.1710720028991415, + "grad_norm": 0.9414125084877014, + "learning_rate": 0.0001219416419984293, + "loss": 2.9537, + "step": 12926 + }, + { + "epoch": 1.1711626010735883, + "grad_norm": 0.9407989382743835, + "learning_rate": 0.00012193560079743854, + "loss": 2.6477, + "step": 12927 + }, + { + "epoch": 1.171253199248035, + "grad_norm": 0.9099850058555603, + "learning_rate": 0.00012192955959644778, + "loss": 2.6917, + "step": 12928 + }, + { + "epoch": 1.1713437974224818, + "grad_norm": 0.9899085760116577, + "learning_rate": 0.00012192351839545702, + "loss": 2.619, + "step": 12929 + }, + { + "epoch": 1.1714343955969286, + "grad_norm": 0.9202999472618103, + "learning_rate": 0.00012191747719446627, + "loss": 2.4493, + "step": 12930 + }, + { + "epoch": 1.1715249937713754, + "grad_norm": 0.8485143780708313, + "learning_rate": 0.0001219114359934755, + "loss": 2.0373, + "step": 12931 + }, + { + "epoch": 1.1716155919458222, + "grad_norm": 0.8925278186798096, + "learning_rate": 0.00012190539479248475, + "loss": 2.6642, + "step": 12932 + }, + { + "epoch": 1.171706190120269, + "grad_norm": 0.8585709929466248, + "learning_rate": 0.00012189935359149401, + "loss": 2.7645, + "step": 12933 + }, + { + "epoch": 1.1717967882947158, + "grad_norm": 0.9225877523422241, + "learning_rate": 0.00012189331239050323, + "loss": 2.6159, + "step": 12934 + }, + { + "epoch": 1.1718873864691626, + "grad_norm": 0.8520552515983582, + "learning_rate": 0.00012188727118951249, + "loss": 2.8428, + "step": 12935 + }, + { + "epoch": 1.1719779846436094, + "grad_norm": 0.8777601718902588, + "learning_rate": 0.00012188122998852171, + "loss": 2.6525, + "step": 12936 + }, + { + "epoch": 1.1720685828180561, + "grad_norm": 1.0955818891525269, + "learning_rate": 0.00012187518878753097, + "loss": 2.7821, + "step": 12937 + }, + { + "epoch": 1.172159180992503, + "grad_norm": 0.8786496520042419, + "learning_rate": 0.0001218691475865402, + "loss": 2.5957, + "step": 12938 + }, + { + "epoch": 1.1722497791669497, + "grad_norm": 0.8910553455352783, + "learning_rate": 0.00012186310638554945, + "loss": 2.9602, + "step": 12939 + }, + { + "epoch": 1.1723403773413965, + "grad_norm": 0.9512487053871155, + "learning_rate": 0.0001218570651845587, + "loss": 2.7338, + "step": 12940 + }, + { + "epoch": 1.1724309755158433, + "grad_norm": 0.8507731556892395, + "learning_rate": 0.00012185102398356794, + "loss": 2.6798, + "step": 12941 + }, + { + "epoch": 1.17252157369029, + "grad_norm": 0.8914662003517151, + "learning_rate": 0.00012184498278257718, + "loss": 2.8849, + "step": 12942 + }, + { + "epoch": 1.1726121718647369, + "grad_norm": 0.8280551433563232, + "learning_rate": 0.00012183894158158642, + "loss": 2.74, + "step": 12943 + }, + { + "epoch": 1.1727027700391837, + "grad_norm": 0.9061474204063416, + "learning_rate": 0.00012183290038059568, + "loss": 2.7925, + "step": 12944 + }, + { + "epoch": 1.1727933682136304, + "grad_norm": 0.8761815428733826, + "learning_rate": 0.0001218268591796049, + "loss": 2.9366, + "step": 12945 + }, + { + "epoch": 1.1728839663880772, + "grad_norm": 0.8440141677856445, + "learning_rate": 0.00012182081797861416, + "loss": 2.8981, + "step": 12946 + }, + { + "epoch": 1.172974564562524, + "grad_norm": 0.8858604431152344, + "learning_rate": 0.00012181477677762341, + "loss": 2.7934, + "step": 12947 + }, + { + "epoch": 1.1730651627369708, + "grad_norm": 0.9299778342247009, + "learning_rate": 0.00012180873557663264, + "loss": 2.6835, + "step": 12948 + }, + { + "epoch": 1.1731557609114176, + "grad_norm": 0.9106560945510864, + "learning_rate": 0.00012180269437564189, + "loss": 2.47, + "step": 12949 + }, + { + "epoch": 1.1732463590858644, + "grad_norm": 0.8296729922294617, + "learning_rate": 0.00012179665317465112, + "loss": 2.4698, + "step": 12950 + }, + { + "epoch": 1.1733369572603112, + "grad_norm": 0.8757796287536621, + "learning_rate": 0.00012179061197366037, + "loss": 2.6739, + "step": 12951 + }, + { + "epoch": 1.173427555434758, + "grad_norm": 0.9807283878326416, + "learning_rate": 0.0001217845707726696, + "loss": 2.7613, + "step": 12952 + }, + { + "epoch": 1.1735181536092048, + "grad_norm": 0.8798565864562988, + "learning_rate": 0.00012177852957167885, + "loss": 2.7276, + "step": 12953 + }, + { + "epoch": 1.1736087517836515, + "grad_norm": 0.9378028512001038, + "learning_rate": 0.00012177248837068811, + "loss": 2.6741, + "step": 12954 + }, + { + "epoch": 1.1736993499580983, + "grad_norm": 0.8643040060997009, + "learning_rate": 0.00012176644716969733, + "loss": 2.4358, + "step": 12955 + }, + { + "epoch": 1.1737899481325451, + "grad_norm": 0.9166018962860107, + "learning_rate": 0.0001217604059687066, + "loss": 2.7741, + "step": 12956 + }, + { + "epoch": 1.173880546306992, + "grad_norm": 0.9435948133468628, + "learning_rate": 0.00012175436476771583, + "loss": 2.793, + "step": 12957 + }, + { + "epoch": 1.1739711444814387, + "grad_norm": 0.8669350147247314, + "learning_rate": 0.00012174832356672508, + "loss": 2.575, + "step": 12958 + }, + { + "epoch": 1.1740617426558855, + "grad_norm": 0.8728914260864258, + "learning_rate": 0.00012174228236573431, + "loss": 2.7424, + "step": 12959 + }, + { + "epoch": 1.1741523408303323, + "grad_norm": 0.8533288240432739, + "learning_rate": 0.00012173624116474356, + "loss": 2.5926, + "step": 12960 + }, + { + "epoch": 1.174242939004779, + "grad_norm": 0.8871995210647583, + "learning_rate": 0.00012173019996375279, + "loss": 2.5944, + "step": 12961 + }, + { + "epoch": 1.1743335371792258, + "grad_norm": 0.8414826393127441, + "learning_rate": 0.00012172415876276204, + "loss": 2.8319, + "step": 12962 + }, + { + "epoch": 1.1744241353536726, + "grad_norm": 0.8634635210037231, + "learning_rate": 0.00012171811756177129, + "loss": 2.8503, + "step": 12963 + }, + { + "epoch": 1.1745147335281194, + "grad_norm": 0.8495129346847534, + "learning_rate": 0.00012171207636078052, + "loss": 2.7346, + "step": 12964 + }, + { + "epoch": 1.1746053317025662, + "grad_norm": 0.8973274827003479, + "learning_rate": 0.00012170603515978978, + "loss": 2.8619, + "step": 12965 + }, + { + "epoch": 1.174695929877013, + "grad_norm": 0.8673251867294312, + "learning_rate": 0.000121699993958799, + "loss": 2.6399, + "step": 12966 + }, + { + "epoch": 1.1747865280514598, + "grad_norm": 0.8516775369644165, + "learning_rate": 0.00012169395275780826, + "loss": 2.5491, + "step": 12967 + }, + { + "epoch": 1.1748771262259066, + "grad_norm": 0.9082303643226624, + "learning_rate": 0.00012168791155681749, + "loss": 2.7886, + "step": 12968 + }, + { + "epoch": 1.1749677244003534, + "grad_norm": 0.9047560095787048, + "learning_rate": 0.00012168187035582675, + "loss": 2.7971, + "step": 12969 + }, + { + "epoch": 1.1750583225748001, + "grad_norm": 0.8284288644790649, + "learning_rate": 0.000121675829154836, + "loss": 2.786, + "step": 12970 + }, + { + "epoch": 1.175148920749247, + "grad_norm": 0.9011580944061279, + "learning_rate": 0.00012166978795384523, + "loss": 2.8552, + "step": 12971 + }, + { + "epoch": 1.1752395189236937, + "grad_norm": 0.7708240151405334, + "learning_rate": 0.00012166374675285448, + "loss": 2.149, + "step": 12972 + }, + { + "epoch": 1.1753301170981405, + "grad_norm": 0.9286472201347351, + "learning_rate": 0.00012165770555186371, + "loss": 2.7344, + "step": 12973 + }, + { + "epoch": 1.1754207152725873, + "grad_norm": 0.9913938641548157, + "learning_rate": 0.00012165166435087296, + "loss": 2.632, + "step": 12974 + }, + { + "epoch": 1.175511313447034, + "grad_norm": 0.9286721348762512, + "learning_rate": 0.00012164562314988219, + "loss": 2.37, + "step": 12975 + }, + { + "epoch": 1.1756019116214809, + "grad_norm": 0.8519781231880188, + "learning_rate": 0.00012163958194889145, + "loss": 2.3937, + "step": 12976 + }, + { + "epoch": 1.1756925097959277, + "grad_norm": 0.8638420104980469, + "learning_rate": 0.0001216335407479007, + "loss": 2.8253, + "step": 12977 + }, + { + "epoch": 1.1757831079703744, + "grad_norm": 0.838223934173584, + "learning_rate": 0.00012162749954690993, + "loss": 2.5624, + "step": 12978 + }, + { + "epoch": 1.1758737061448212, + "grad_norm": 0.9232915639877319, + "learning_rate": 0.00012162145834591918, + "loss": 3.0401, + "step": 12979 + }, + { + "epoch": 1.175964304319268, + "grad_norm": 0.9886789917945862, + "learning_rate": 0.00012161541714492842, + "loss": 3.0967, + "step": 12980 + }, + { + "epoch": 1.1760549024937148, + "grad_norm": 0.923711359500885, + "learning_rate": 0.00012160937594393766, + "loss": 2.5719, + "step": 12981 + }, + { + "epoch": 1.1761455006681616, + "grad_norm": 0.9208632111549377, + "learning_rate": 0.0001216033347429469, + "loss": 2.5824, + "step": 12982 + }, + { + "epoch": 1.1762360988426084, + "grad_norm": 0.8902470469474792, + "learning_rate": 0.00012159729354195614, + "loss": 2.6259, + "step": 12983 + }, + { + "epoch": 1.1763266970170552, + "grad_norm": 0.889758825302124, + "learning_rate": 0.0001215912523409654, + "loss": 2.6378, + "step": 12984 + }, + { + "epoch": 1.176417295191502, + "grad_norm": 0.9675484895706177, + "learning_rate": 0.00012158521113997463, + "loss": 2.7932, + "step": 12985 + }, + { + "epoch": 1.1765078933659487, + "grad_norm": 0.902069091796875, + "learning_rate": 0.00012157916993898389, + "loss": 2.6723, + "step": 12986 + }, + { + "epoch": 1.1765984915403955, + "grad_norm": 0.9504603147506714, + "learning_rate": 0.00012157312873799311, + "loss": 2.8354, + "step": 12987 + }, + { + "epoch": 1.1766890897148423, + "grad_norm": 0.855600893497467, + "learning_rate": 0.00012156708753700237, + "loss": 2.6418, + "step": 12988 + }, + { + "epoch": 1.176779687889289, + "grad_norm": 0.8285062909126282, + "learning_rate": 0.0001215610463360116, + "loss": 2.7276, + "step": 12989 + }, + { + "epoch": 1.176870286063736, + "grad_norm": 0.9031375646591187, + "learning_rate": 0.00012155500513502085, + "loss": 2.8514, + "step": 12990 + }, + { + "epoch": 1.1769608842381827, + "grad_norm": 0.930875837802887, + "learning_rate": 0.00012154896393403009, + "loss": 2.849, + "step": 12991 + }, + { + "epoch": 1.1770514824126295, + "grad_norm": 0.7757595777511597, + "learning_rate": 0.00012154292273303933, + "loss": 2.104, + "step": 12992 + }, + { + "epoch": 1.1771420805870763, + "grad_norm": 0.8314148187637329, + "learning_rate": 0.00012153688153204858, + "loss": 2.7999, + "step": 12993 + }, + { + "epoch": 1.177232678761523, + "grad_norm": 0.9372549057006836, + "learning_rate": 0.00012153084033105781, + "loss": 2.7654, + "step": 12994 + }, + { + "epoch": 1.1773232769359698, + "grad_norm": 0.8878065943717957, + "learning_rate": 0.00012152479913006706, + "loss": 2.7444, + "step": 12995 + }, + { + "epoch": 1.1774138751104166, + "grad_norm": 0.9117012619972229, + "learning_rate": 0.0001215187579290763, + "loss": 2.7076, + "step": 12996 + }, + { + "epoch": 1.1775044732848634, + "grad_norm": 0.9174711108207703, + "learning_rate": 0.00012151271672808556, + "loss": 2.9469, + "step": 12997 + }, + { + "epoch": 1.1775950714593102, + "grad_norm": 0.9361754655838013, + "learning_rate": 0.00012150667552709478, + "loss": 2.8116, + "step": 12998 + }, + { + "epoch": 1.1776856696337568, + "grad_norm": 0.9338895678520203, + "learning_rate": 0.00012150063432610404, + "loss": 2.8322, + "step": 12999 + }, + { + "epoch": 1.1777762678082038, + "grad_norm": 0.8952615857124329, + "learning_rate": 0.00012149459312511329, + "loss": 2.6783, + "step": 13000 + }, + { + "epoch": 1.1778668659826503, + "grad_norm": 0.8673512935638428, + "learning_rate": 0.00012148855192412252, + "loss": 2.7329, + "step": 13001 + }, + { + "epoch": 1.1779574641570973, + "grad_norm": 0.8569673299789429, + "learning_rate": 0.00012148251072313177, + "loss": 2.6963, + "step": 13002 + }, + { + "epoch": 1.178048062331544, + "grad_norm": 0.9151321053504944, + "learning_rate": 0.000121476469522141, + "loss": 2.8046, + "step": 13003 + }, + { + "epoch": 1.178138660505991, + "grad_norm": 0.8900312781333923, + "learning_rate": 0.00012147042832115025, + "loss": 2.9135, + "step": 13004 + }, + { + "epoch": 1.1782292586804375, + "grad_norm": 0.8362628817558289, + "learning_rate": 0.00012146438712015948, + "loss": 2.5449, + "step": 13005 + }, + { + "epoch": 1.1783198568548845, + "grad_norm": 0.9082969427108765, + "learning_rate": 0.00012145834591916873, + "loss": 2.778, + "step": 13006 + }, + { + "epoch": 1.178410455029331, + "grad_norm": 0.8465043902397156, + "learning_rate": 0.00012145230471817799, + "loss": 2.6231, + "step": 13007 + }, + { + "epoch": 1.1785010532037778, + "grad_norm": 0.8592329621315002, + "learning_rate": 0.00012144626351718721, + "loss": 2.764, + "step": 13008 + }, + { + "epoch": 1.1785916513782246, + "grad_norm": 0.9546691179275513, + "learning_rate": 0.00012144022231619647, + "loss": 2.8324, + "step": 13009 + }, + { + "epoch": 1.1786822495526714, + "grad_norm": 0.8990256190299988, + "learning_rate": 0.00012143418111520571, + "loss": 2.9645, + "step": 13010 + }, + { + "epoch": 1.1787728477271182, + "grad_norm": 0.9046444296836853, + "learning_rate": 0.00012142813991421496, + "loss": 2.8723, + "step": 13011 + }, + { + "epoch": 1.178863445901565, + "grad_norm": 0.8696375489234924, + "learning_rate": 0.00012142209871322419, + "loss": 2.7193, + "step": 13012 + }, + { + "epoch": 1.1789540440760118, + "grad_norm": 0.8721904754638672, + "learning_rate": 0.00012141605751223344, + "loss": 2.6154, + "step": 13013 + }, + { + "epoch": 1.1790446422504586, + "grad_norm": 0.8132858276367188, + "learning_rate": 0.00012141001631124269, + "loss": 2.5228, + "step": 13014 + }, + { + "epoch": 1.1791352404249054, + "grad_norm": 0.8955384492874146, + "learning_rate": 0.00012140397511025192, + "loss": 2.5883, + "step": 13015 + }, + { + "epoch": 1.1792258385993521, + "grad_norm": 0.8615983128547668, + "learning_rate": 0.00012139793390926118, + "loss": 2.6449, + "step": 13016 + }, + { + "epoch": 1.179316436773799, + "grad_norm": 0.928687572479248, + "learning_rate": 0.0001213918927082704, + "loss": 2.6579, + "step": 13017 + }, + { + "epoch": 1.1794070349482457, + "grad_norm": 0.8876420855522156, + "learning_rate": 0.00012138585150727966, + "loss": 2.4978, + "step": 13018 + }, + { + "epoch": 1.1794976331226925, + "grad_norm": 0.9173671007156372, + "learning_rate": 0.00012137981030628888, + "loss": 2.8309, + "step": 13019 + }, + { + "epoch": 1.1795882312971393, + "grad_norm": 0.9089097380638123, + "learning_rate": 0.00012137376910529814, + "loss": 2.6593, + "step": 13020 + }, + { + "epoch": 1.179678829471586, + "grad_norm": 0.8793743848800659, + "learning_rate": 0.00012136772790430738, + "loss": 2.9045, + "step": 13021 + }, + { + "epoch": 1.1797694276460329, + "grad_norm": 0.8691624402999878, + "learning_rate": 0.00012136168670331663, + "loss": 2.7155, + "step": 13022 + }, + { + "epoch": 1.1798600258204797, + "grad_norm": 0.895402193069458, + "learning_rate": 0.00012135564550232587, + "loss": 2.7658, + "step": 13023 + }, + { + "epoch": 1.1799506239949265, + "grad_norm": 0.8725312352180481, + "learning_rate": 0.00012134960430133511, + "loss": 2.5655, + "step": 13024 + }, + { + "epoch": 1.1800412221693732, + "grad_norm": 0.8904321193695068, + "learning_rate": 0.00012134356310034435, + "loss": 2.8853, + "step": 13025 + }, + { + "epoch": 1.18013182034382, + "grad_norm": 0.9181836247444153, + "learning_rate": 0.00012133752189935359, + "loss": 2.7352, + "step": 13026 + }, + { + "epoch": 1.1802224185182668, + "grad_norm": 0.8415603637695312, + "learning_rate": 0.00012133148069836284, + "loss": 2.4138, + "step": 13027 + }, + { + "epoch": 1.1803130166927136, + "grad_norm": 0.7232913374900818, + "learning_rate": 0.00012132543949737207, + "loss": 1.7364, + "step": 13028 + }, + { + "epoch": 1.1804036148671604, + "grad_norm": 0.8706855177879333, + "learning_rate": 0.00012131939829638133, + "loss": 2.5317, + "step": 13029 + }, + { + "epoch": 1.1804942130416072, + "grad_norm": 0.9294773936271667, + "learning_rate": 0.00012131335709539058, + "loss": 2.5548, + "step": 13030 + }, + { + "epoch": 1.180584811216054, + "grad_norm": 0.8837763071060181, + "learning_rate": 0.00012130731589439981, + "loss": 2.5495, + "step": 13031 + }, + { + "epoch": 1.1806754093905008, + "grad_norm": 0.9413143396377563, + "learning_rate": 0.00012130127469340906, + "loss": 2.8094, + "step": 13032 + }, + { + "epoch": 1.1807660075649475, + "grad_norm": 0.9078770279884338, + "learning_rate": 0.0001212952334924183, + "loss": 2.4035, + "step": 13033 + }, + { + "epoch": 1.1808566057393943, + "grad_norm": 0.9479053616523743, + "learning_rate": 0.00012128919229142754, + "loss": 2.899, + "step": 13034 + }, + { + "epoch": 1.1809472039138411, + "grad_norm": 1.0373680591583252, + "learning_rate": 0.00012128315109043678, + "loss": 3.0107, + "step": 13035 + }, + { + "epoch": 1.181037802088288, + "grad_norm": 0.9329968094825745, + "learning_rate": 0.00012127710988944602, + "loss": 2.7435, + "step": 13036 + }, + { + "epoch": 1.1811284002627347, + "grad_norm": 0.8967359662055969, + "learning_rate": 0.00012127106868845529, + "loss": 2.6023, + "step": 13037 + }, + { + "epoch": 1.1812189984371815, + "grad_norm": 0.9297956824302673, + "learning_rate": 0.0001212650274874645, + "loss": 3.0013, + "step": 13038 + }, + { + "epoch": 1.1813095966116283, + "grad_norm": 0.9054364562034607, + "learning_rate": 0.00012125898628647377, + "loss": 2.7395, + "step": 13039 + }, + { + "epoch": 1.181400194786075, + "grad_norm": 0.8728466033935547, + "learning_rate": 0.00012125294508548299, + "loss": 2.6094, + "step": 13040 + }, + { + "epoch": 1.1814907929605218, + "grad_norm": 0.8594852089881897, + "learning_rate": 0.00012124690388449225, + "loss": 2.5991, + "step": 13041 + }, + { + "epoch": 1.1815813911349686, + "grad_norm": 0.9187384247779846, + "learning_rate": 0.00012124086268350148, + "loss": 2.7982, + "step": 13042 + }, + { + "epoch": 1.1816719893094154, + "grad_norm": 0.9060741066932678, + "learning_rate": 0.00012123482148251073, + "loss": 3.0926, + "step": 13043 + }, + { + "epoch": 1.1817625874838622, + "grad_norm": 0.8222415447235107, + "learning_rate": 0.00012122878028151998, + "loss": 2.4763, + "step": 13044 + }, + { + "epoch": 1.181853185658309, + "grad_norm": 0.9188259840011597, + "learning_rate": 0.00012122273908052921, + "loss": 2.8254, + "step": 13045 + }, + { + "epoch": 1.1819437838327558, + "grad_norm": 0.9507157802581787, + "learning_rate": 0.00012121669787953846, + "loss": 2.9021, + "step": 13046 + }, + { + "epoch": 1.1820343820072026, + "grad_norm": 0.9075884222984314, + "learning_rate": 0.0001212106566785477, + "loss": 2.9187, + "step": 13047 + }, + { + "epoch": 1.1821249801816494, + "grad_norm": 0.9183154106140137, + "learning_rate": 0.00012120461547755695, + "loss": 3.1047, + "step": 13048 + }, + { + "epoch": 1.1822155783560961, + "grad_norm": 0.8586899042129517, + "learning_rate": 0.00012119857427656618, + "loss": 2.5619, + "step": 13049 + }, + { + "epoch": 1.182306176530543, + "grad_norm": 0.9560842514038086, + "learning_rate": 0.00012119253307557544, + "loss": 2.6691, + "step": 13050 + }, + { + "epoch": 1.1823967747049897, + "grad_norm": 1.1048201322555542, + "learning_rate": 0.00012118649187458466, + "loss": 2.6881, + "step": 13051 + }, + { + "epoch": 1.1824873728794365, + "grad_norm": 0.97100430727005, + "learning_rate": 0.00012118045067359392, + "loss": 2.7832, + "step": 13052 + }, + { + "epoch": 1.1825779710538833, + "grad_norm": 0.9508724808692932, + "learning_rate": 0.00012117440947260317, + "loss": 2.7183, + "step": 13053 + }, + { + "epoch": 1.18266856922833, + "grad_norm": 0.8817297220230103, + "learning_rate": 0.0001211683682716124, + "loss": 2.73, + "step": 13054 + }, + { + "epoch": 1.1827591674027769, + "grad_norm": 0.8796977400779724, + "learning_rate": 0.00012116232707062165, + "loss": 2.713, + "step": 13055 + }, + { + "epoch": 1.1828497655772237, + "grad_norm": 0.8379998803138733, + "learning_rate": 0.00012115628586963088, + "loss": 2.6016, + "step": 13056 + }, + { + "epoch": 1.1829403637516704, + "grad_norm": 0.935024082660675, + "learning_rate": 0.00012115024466864013, + "loss": 2.6679, + "step": 13057 + }, + { + "epoch": 1.1830309619261172, + "grad_norm": 0.8762075304985046, + "learning_rate": 0.00012114420346764936, + "loss": 2.4176, + "step": 13058 + }, + { + "epoch": 1.183121560100564, + "grad_norm": 0.9427459239959717, + "learning_rate": 0.00012113816226665861, + "loss": 2.6289, + "step": 13059 + }, + { + "epoch": 1.1832121582750108, + "grad_norm": 0.8625820875167847, + "learning_rate": 0.00012113212106566787, + "loss": 2.6615, + "step": 13060 + }, + { + "epoch": 1.1833027564494576, + "grad_norm": 0.8979569673538208, + "learning_rate": 0.0001211260798646771, + "loss": 2.8347, + "step": 13061 + }, + { + "epoch": 1.1833933546239044, + "grad_norm": 0.8613782525062561, + "learning_rate": 0.00012112003866368635, + "loss": 2.74, + "step": 13062 + }, + { + "epoch": 1.1834839527983512, + "grad_norm": 0.8840733766555786, + "learning_rate": 0.00012111399746269559, + "loss": 2.8336, + "step": 13063 + }, + { + "epoch": 1.183574550972798, + "grad_norm": 0.8571304678916931, + "learning_rate": 0.00012110795626170484, + "loss": 2.7173, + "step": 13064 + }, + { + "epoch": 1.1836651491472447, + "grad_norm": 0.9227386713027954, + "learning_rate": 0.00012110191506071407, + "loss": 2.7194, + "step": 13065 + }, + { + "epoch": 1.1837557473216915, + "grad_norm": 0.996884822845459, + "learning_rate": 0.00012109587385972332, + "loss": 2.5152, + "step": 13066 + }, + { + "epoch": 1.1838463454961383, + "grad_norm": 0.9023815393447876, + "learning_rate": 0.00012108983265873258, + "loss": 2.9761, + "step": 13067 + }, + { + "epoch": 1.183936943670585, + "grad_norm": 0.9184219837188721, + "learning_rate": 0.0001210837914577418, + "loss": 2.6072, + "step": 13068 + }, + { + "epoch": 1.184027541845032, + "grad_norm": 0.9152746200561523, + "learning_rate": 0.00012107775025675106, + "loss": 2.8743, + "step": 13069 + }, + { + "epoch": 1.1841181400194787, + "grad_norm": 0.9169911742210388, + "learning_rate": 0.00012107170905576028, + "loss": 2.6698, + "step": 13070 + }, + { + "epoch": 1.1842087381939255, + "grad_norm": 0.852645754814148, + "learning_rate": 0.00012106566785476954, + "loss": 2.687, + "step": 13071 + }, + { + "epoch": 1.1842993363683723, + "grad_norm": 0.8563351631164551, + "learning_rate": 0.00012105962665377876, + "loss": 2.7968, + "step": 13072 + }, + { + "epoch": 1.184389934542819, + "grad_norm": 0.8576003909111023, + "learning_rate": 0.00012105358545278802, + "loss": 2.5691, + "step": 13073 + }, + { + "epoch": 1.1844805327172658, + "grad_norm": 0.9336625337600708, + "learning_rate": 0.00012104754425179727, + "loss": 2.8129, + "step": 13074 + }, + { + "epoch": 1.1845711308917126, + "grad_norm": 0.7407578825950623, + "learning_rate": 0.0001210415030508065, + "loss": 2.037, + "step": 13075 + }, + { + "epoch": 1.1846617290661594, + "grad_norm": 0.891586184501648, + "learning_rate": 0.00012103546184981575, + "loss": 2.6473, + "step": 13076 + }, + { + "epoch": 1.1847523272406062, + "grad_norm": 0.9018651843070984, + "learning_rate": 0.00012102942064882499, + "loss": 2.5896, + "step": 13077 + }, + { + "epoch": 1.184842925415053, + "grad_norm": 0.9044234156608582, + "learning_rate": 0.00012102337944783423, + "loss": 2.7377, + "step": 13078 + }, + { + "epoch": 1.1849335235894998, + "grad_norm": 0.8962730169296265, + "learning_rate": 0.00012101733824684347, + "loss": 2.6379, + "step": 13079 + }, + { + "epoch": 1.1850241217639463, + "grad_norm": 0.8980580568313599, + "learning_rate": 0.00012101129704585273, + "loss": 2.701, + "step": 13080 + }, + { + "epoch": 1.1851147199383933, + "grad_norm": 0.9494738578796387, + "learning_rate": 0.00012100525584486195, + "loss": 2.9355, + "step": 13081 + }, + { + "epoch": 1.18520531811284, + "grad_norm": 0.8630883097648621, + "learning_rate": 0.00012099921464387121, + "loss": 2.5675, + "step": 13082 + }, + { + "epoch": 1.185295916287287, + "grad_norm": 0.9678050875663757, + "learning_rate": 0.00012099317344288046, + "loss": 2.5534, + "step": 13083 + }, + { + "epoch": 1.1853865144617335, + "grad_norm": 0.8885693550109863, + "learning_rate": 0.00012098713224188969, + "loss": 2.8408, + "step": 13084 + }, + { + "epoch": 1.1854771126361805, + "grad_norm": 0.8670777082443237, + "learning_rate": 0.00012098109104089894, + "loss": 2.829, + "step": 13085 + }, + { + "epoch": 1.185567710810627, + "grad_norm": 0.9350824356079102, + "learning_rate": 0.00012097504983990817, + "loss": 2.7412, + "step": 13086 + }, + { + "epoch": 1.185658308985074, + "grad_norm": 0.9906069040298462, + "learning_rate": 0.00012096900863891742, + "loss": 2.8879, + "step": 13087 + }, + { + "epoch": 1.1857489071595206, + "grad_norm": 0.9011228680610657, + "learning_rate": 0.00012096296743792666, + "loss": 2.8363, + "step": 13088 + }, + { + "epoch": 1.1858395053339674, + "grad_norm": 0.9087693691253662, + "learning_rate": 0.0001209569262369359, + "loss": 3.0549, + "step": 13089 + }, + { + "epoch": 1.1859301035084142, + "grad_norm": 0.8944625854492188, + "learning_rate": 0.00012095088503594516, + "loss": 2.8705, + "step": 13090 + }, + { + "epoch": 1.186020701682861, + "grad_norm": 0.9054609537124634, + "learning_rate": 0.00012094484383495438, + "loss": 2.8153, + "step": 13091 + }, + { + "epoch": 1.1861112998573078, + "grad_norm": 0.8724716901779175, + "learning_rate": 0.00012093880263396365, + "loss": 2.5382, + "step": 13092 + }, + { + "epoch": 1.1862018980317546, + "grad_norm": 0.8993306756019592, + "learning_rate": 0.00012093276143297288, + "loss": 2.7285, + "step": 13093 + }, + { + "epoch": 1.1862924962062014, + "grad_norm": 0.8582207560539246, + "learning_rate": 0.00012092672023198213, + "loss": 2.5879, + "step": 13094 + }, + { + "epoch": 1.1863830943806482, + "grad_norm": 0.8844720125198364, + "learning_rate": 0.00012092067903099136, + "loss": 2.5537, + "step": 13095 + }, + { + "epoch": 1.186473692555095, + "grad_norm": 0.9459013938903809, + "learning_rate": 0.00012091463783000061, + "loss": 2.8644, + "step": 13096 + }, + { + "epoch": 1.1865642907295417, + "grad_norm": 0.9552396535873413, + "learning_rate": 0.00012090859662900986, + "loss": 2.7414, + "step": 13097 + }, + { + "epoch": 1.1866548889039885, + "grad_norm": 0.8905479907989502, + "learning_rate": 0.00012090255542801909, + "loss": 2.8302, + "step": 13098 + }, + { + "epoch": 1.1867454870784353, + "grad_norm": 0.8989125490188599, + "learning_rate": 0.00012089651422702835, + "loss": 2.9438, + "step": 13099 + }, + { + "epoch": 1.186836085252882, + "grad_norm": 0.8722662329673767, + "learning_rate": 0.00012089047302603757, + "loss": 2.7234, + "step": 13100 + }, + { + "epoch": 1.1869266834273289, + "grad_norm": 0.8795772194862366, + "learning_rate": 0.00012088443182504683, + "loss": 2.7121, + "step": 13101 + }, + { + "epoch": 1.1870172816017757, + "grad_norm": 0.8383500576019287, + "learning_rate": 0.00012087839062405605, + "loss": 2.5439, + "step": 13102 + }, + { + "epoch": 1.1871078797762225, + "grad_norm": 0.8528857827186584, + "learning_rate": 0.00012087234942306532, + "loss": 2.783, + "step": 13103 + }, + { + "epoch": 1.1871984779506692, + "grad_norm": 0.8642762899398804, + "learning_rate": 0.00012086630822207456, + "loss": 2.3835, + "step": 13104 + }, + { + "epoch": 1.187289076125116, + "grad_norm": 0.8559863567352295, + "learning_rate": 0.0001208602670210838, + "loss": 2.7015, + "step": 13105 + }, + { + "epoch": 1.1873796742995628, + "grad_norm": 0.9095205664634705, + "learning_rate": 0.00012085422582009304, + "loss": 2.7338, + "step": 13106 + }, + { + "epoch": 1.1874702724740096, + "grad_norm": 0.8853405117988586, + "learning_rate": 0.00012084818461910228, + "loss": 2.6577, + "step": 13107 + }, + { + "epoch": 1.1875608706484564, + "grad_norm": 0.9100493788719177, + "learning_rate": 0.00012084214341811153, + "loss": 3.0784, + "step": 13108 + }, + { + "epoch": 1.1876514688229032, + "grad_norm": 0.8554806709289551, + "learning_rate": 0.00012083610221712076, + "loss": 2.606, + "step": 13109 + }, + { + "epoch": 1.18774206699735, + "grad_norm": 0.9573628902435303, + "learning_rate": 0.00012083006101613001, + "loss": 2.7028, + "step": 13110 + }, + { + "epoch": 1.1878326651717968, + "grad_norm": 0.9081097841262817, + "learning_rate": 0.00012082401981513924, + "loss": 2.5231, + "step": 13111 + }, + { + "epoch": 1.1879232633462435, + "grad_norm": 0.9411100149154663, + "learning_rate": 0.0001208179786141485, + "loss": 2.7594, + "step": 13112 + }, + { + "epoch": 1.1880138615206903, + "grad_norm": 0.8269672989845276, + "learning_rate": 0.00012081193741315775, + "loss": 2.5916, + "step": 13113 + }, + { + "epoch": 1.1881044596951371, + "grad_norm": 0.8668387532234192, + "learning_rate": 0.00012080589621216699, + "loss": 2.9112, + "step": 13114 + }, + { + "epoch": 1.188195057869584, + "grad_norm": 0.8624171018600464, + "learning_rate": 0.00012079985501117623, + "loss": 2.0368, + "step": 13115 + }, + { + "epoch": 1.1882856560440307, + "grad_norm": 0.930588960647583, + "learning_rate": 0.00012079381381018547, + "loss": 2.7542, + "step": 13116 + }, + { + "epoch": 1.1883762542184775, + "grad_norm": 0.8707219958305359, + "learning_rate": 0.00012078777260919471, + "loss": 2.5327, + "step": 13117 + }, + { + "epoch": 1.1884668523929243, + "grad_norm": 0.8959616422653198, + "learning_rate": 0.00012078173140820395, + "loss": 2.414, + "step": 13118 + }, + { + "epoch": 1.188557450567371, + "grad_norm": 0.9144120812416077, + "learning_rate": 0.0001207756902072132, + "loss": 2.6869, + "step": 13119 + }, + { + "epoch": 1.1886480487418178, + "grad_norm": 0.9427526593208313, + "learning_rate": 0.00012076964900622246, + "loss": 2.6591, + "step": 13120 + }, + { + "epoch": 1.1887386469162646, + "grad_norm": 0.9417350888252258, + "learning_rate": 0.00012076360780523168, + "loss": 2.8405, + "step": 13121 + }, + { + "epoch": 1.1888292450907114, + "grad_norm": 0.9298784136772156, + "learning_rate": 0.00012075756660424094, + "loss": 2.749, + "step": 13122 + }, + { + "epoch": 1.1889198432651582, + "grad_norm": 0.7387490272521973, + "learning_rate": 0.00012075152540325016, + "loss": 1.914, + "step": 13123 + }, + { + "epoch": 1.189010441439605, + "grad_norm": 0.8776398301124573, + "learning_rate": 0.00012074548420225942, + "loss": 2.6019, + "step": 13124 + }, + { + "epoch": 1.1891010396140518, + "grad_norm": 1.0399186611175537, + "learning_rate": 0.00012073944300126865, + "loss": 2.5572, + "step": 13125 + }, + { + "epoch": 1.1891916377884986, + "grad_norm": 0.8793044686317444, + "learning_rate": 0.0001207334018002779, + "loss": 2.4681, + "step": 13126 + }, + { + "epoch": 1.1892822359629454, + "grad_norm": 0.8867766261100769, + "learning_rate": 0.00012072736059928715, + "loss": 2.8242, + "step": 13127 + }, + { + "epoch": 1.1893728341373921, + "grad_norm": 0.9476644992828369, + "learning_rate": 0.00012072131939829638, + "loss": 2.69, + "step": 13128 + }, + { + "epoch": 1.189463432311839, + "grad_norm": 0.898973822593689, + "learning_rate": 0.00012071527819730563, + "loss": 2.6776, + "step": 13129 + }, + { + "epoch": 1.1895540304862857, + "grad_norm": 0.9053632020950317, + "learning_rate": 0.00012070923699631487, + "loss": 2.5752, + "step": 13130 + }, + { + "epoch": 1.1896446286607325, + "grad_norm": 0.8702847957611084, + "learning_rate": 0.00012070319579532413, + "loss": 2.8202, + "step": 13131 + }, + { + "epoch": 1.1897352268351793, + "grad_norm": 1.0691485404968262, + "learning_rate": 0.00012069715459433335, + "loss": 2.8631, + "step": 13132 + }, + { + "epoch": 1.189825825009626, + "grad_norm": 0.8875281810760498, + "learning_rate": 0.00012069111339334261, + "loss": 2.5896, + "step": 13133 + }, + { + "epoch": 1.1899164231840729, + "grad_norm": 0.898270308971405, + "learning_rate": 0.00012068507219235186, + "loss": 2.7295, + "step": 13134 + }, + { + "epoch": 1.1900070213585197, + "grad_norm": 0.9628542065620422, + "learning_rate": 0.00012067903099136109, + "loss": 2.4949, + "step": 13135 + }, + { + "epoch": 1.1900976195329664, + "grad_norm": 0.8992372155189514, + "learning_rate": 0.00012067298979037034, + "loss": 2.7718, + "step": 13136 + }, + { + "epoch": 1.1901882177074132, + "grad_norm": 0.9434299468994141, + "learning_rate": 0.00012066694858937957, + "loss": 2.714, + "step": 13137 + }, + { + "epoch": 1.19027881588186, + "grad_norm": 0.9443076848983765, + "learning_rate": 0.00012066090738838882, + "loss": 2.894, + "step": 13138 + }, + { + "epoch": 1.1903694140563068, + "grad_norm": 0.8726010322570801, + "learning_rate": 0.00012065486618739805, + "loss": 2.8251, + "step": 13139 + }, + { + "epoch": 1.1904600122307536, + "grad_norm": 1.0367841720581055, + "learning_rate": 0.0001206488249864073, + "loss": 2.7108, + "step": 13140 + }, + { + "epoch": 1.1905506104052004, + "grad_norm": 0.8951546549797058, + "learning_rate": 0.00012064278378541656, + "loss": 2.744, + "step": 13141 + }, + { + "epoch": 1.1906412085796472, + "grad_norm": 0.8764649033546448, + "learning_rate": 0.00012063674258442578, + "loss": 2.7447, + "step": 13142 + }, + { + "epoch": 1.190731806754094, + "grad_norm": 0.9860553741455078, + "learning_rate": 0.00012063070138343504, + "loss": 2.9267, + "step": 13143 + }, + { + "epoch": 1.1908224049285407, + "grad_norm": 0.945306658744812, + "learning_rate": 0.00012062466018244428, + "loss": 2.5237, + "step": 13144 + }, + { + "epoch": 1.1909130031029875, + "grad_norm": 0.9743179678916931, + "learning_rate": 0.00012061861898145353, + "loss": 2.8083, + "step": 13145 + }, + { + "epoch": 1.1910036012774343, + "grad_norm": 0.8627443909645081, + "learning_rate": 0.00012061257778046276, + "loss": 2.702, + "step": 13146 + }, + { + "epoch": 1.191094199451881, + "grad_norm": 1.0000134706497192, + "learning_rate": 0.00012060653657947201, + "loss": 2.5441, + "step": 13147 + }, + { + "epoch": 1.191184797626328, + "grad_norm": 0.6119873523712158, + "learning_rate": 0.00012060049537848124, + "loss": 1.2659, + "step": 13148 + }, + { + "epoch": 1.1912753958007747, + "grad_norm": 0.9044544696807861, + "learning_rate": 0.00012059445417749049, + "loss": 2.9565, + "step": 13149 + }, + { + "epoch": 1.1913659939752215, + "grad_norm": 0.8563584089279175, + "learning_rate": 0.00012058841297649974, + "loss": 2.5894, + "step": 13150 + }, + { + "epoch": 1.1914565921496683, + "grad_norm": 0.8768911957740784, + "learning_rate": 0.00012058237177550897, + "loss": 2.785, + "step": 13151 + }, + { + "epoch": 1.191547190324115, + "grad_norm": 0.8793584108352661, + "learning_rate": 0.00012057633057451823, + "loss": 2.8931, + "step": 13152 + }, + { + "epoch": 1.1916377884985618, + "grad_norm": 0.7964051365852356, + "learning_rate": 0.00012057028937352745, + "loss": 2.2284, + "step": 13153 + }, + { + "epoch": 1.1917283866730086, + "grad_norm": 0.9177857041358948, + "learning_rate": 0.00012056424817253671, + "loss": 2.7698, + "step": 13154 + }, + { + "epoch": 1.1918189848474554, + "grad_norm": 0.8626103401184082, + "learning_rate": 0.00012055820697154593, + "loss": 2.5773, + "step": 13155 + }, + { + "epoch": 1.1919095830219022, + "grad_norm": 0.8813077807426453, + "learning_rate": 0.0001205521657705552, + "loss": 2.6573, + "step": 13156 + }, + { + "epoch": 1.192000181196349, + "grad_norm": 0.8933619260787964, + "learning_rate": 0.00012054612456956444, + "loss": 2.7129, + "step": 13157 + }, + { + "epoch": 1.1920907793707958, + "grad_norm": 0.9490291476249695, + "learning_rate": 0.00012054008336857368, + "loss": 2.7129, + "step": 13158 + }, + { + "epoch": 1.1921813775452426, + "grad_norm": 0.8676515817642212, + "learning_rate": 0.00012053404216758292, + "loss": 2.7488, + "step": 13159 + }, + { + "epoch": 1.1922719757196893, + "grad_norm": 0.9698210954666138, + "learning_rate": 0.00012052800096659216, + "loss": 2.5328, + "step": 13160 + }, + { + "epoch": 1.192362573894136, + "grad_norm": 0.822591245174408, + "learning_rate": 0.0001205219597656014, + "loss": 2.6003, + "step": 13161 + }, + { + "epoch": 1.192453172068583, + "grad_norm": 1.010772943496704, + "learning_rate": 0.00012051591856461064, + "loss": 2.6689, + "step": 13162 + }, + { + "epoch": 1.1925437702430295, + "grad_norm": 0.856155276298523, + "learning_rate": 0.0001205098773636199, + "loss": 2.4802, + "step": 13163 + }, + { + "epoch": 1.1926343684174765, + "grad_norm": 0.9797654747962952, + "learning_rate": 0.00012050383616262915, + "loss": 3.018, + "step": 13164 + }, + { + "epoch": 1.192724966591923, + "grad_norm": 0.9312832355499268, + "learning_rate": 0.00012049779496163838, + "loss": 2.649, + "step": 13165 + }, + { + "epoch": 1.19281556476637, + "grad_norm": 0.8472509980201721, + "learning_rate": 0.00012049175376064763, + "loss": 2.529, + "step": 13166 + }, + { + "epoch": 1.1929061629408166, + "grad_norm": 0.8400976657867432, + "learning_rate": 0.00012048571255965686, + "loss": 2.6723, + "step": 13167 + }, + { + "epoch": 1.1929967611152636, + "grad_norm": 0.8581972122192383, + "learning_rate": 0.00012047967135866611, + "loss": 2.756, + "step": 13168 + }, + { + "epoch": 1.1930873592897102, + "grad_norm": 0.9170317649841309, + "learning_rate": 0.00012047363015767535, + "loss": 2.6888, + "step": 13169 + }, + { + "epoch": 1.193177957464157, + "grad_norm": 1.0304089784622192, + "learning_rate": 0.0001204675889566846, + "loss": 2.6886, + "step": 13170 + }, + { + "epoch": 1.1932685556386038, + "grad_norm": 0.9249605536460876, + "learning_rate": 0.00012046154775569385, + "loss": 2.5508, + "step": 13171 + }, + { + "epoch": 1.1933591538130506, + "grad_norm": 1.004319429397583, + "learning_rate": 0.00012045550655470308, + "loss": 2.6995, + "step": 13172 + }, + { + "epoch": 1.1934497519874974, + "grad_norm": 0.9835572838783264, + "learning_rate": 0.00012044946535371234, + "loss": 2.7907, + "step": 13173 + }, + { + "epoch": 1.1935403501619442, + "grad_norm": 0.8878253102302551, + "learning_rate": 0.00012044342415272156, + "loss": 2.4719, + "step": 13174 + }, + { + "epoch": 1.193630948336391, + "grad_norm": 0.9520058035850525, + "learning_rate": 0.00012043738295173082, + "loss": 2.8345, + "step": 13175 + }, + { + "epoch": 1.1937215465108377, + "grad_norm": 0.9747240543365479, + "learning_rate": 0.00012043134175074005, + "loss": 2.6168, + "step": 13176 + }, + { + "epoch": 1.1938121446852845, + "grad_norm": 0.9090903401374817, + "learning_rate": 0.0001204253005497493, + "loss": 2.9906, + "step": 13177 + }, + { + "epoch": 1.1939027428597313, + "grad_norm": 0.8646488785743713, + "learning_rate": 0.00012041925934875853, + "loss": 2.5818, + "step": 13178 + }, + { + "epoch": 1.193993341034178, + "grad_norm": 0.9580179452896118, + "learning_rate": 0.00012041321814776778, + "loss": 2.5977, + "step": 13179 + }, + { + "epoch": 1.1940839392086249, + "grad_norm": 0.9579797983169556, + "learning_rate": 0.00012040717694677703, + "loss": 2.9285, + "step": 13180 + }, + { + "epoch": 1.1941745373830717, + "grad_norm": 0.9405766129493713, + "learning_rate": 0.00012040113574578626, + "loss": 2.6713, + "step": 13181 + }, + { + "epoch": 1.1942651355575185, + "grad_norm": 0.8981817364692688, + "learning_rate": 0.00012039509454479551, + "loss": 2.5961, + "step": 13182 + }, + { + "epoch": 1.1943557337319652, + "grad_norm": 0.8610329031944275, + "learning_rate": 0.00012038905334380474, + "loss": 2.5697, + "step": 13183 + }, + { + "epoch": 1.194446331906412, + "grad_norm": 0.9035407304763794, + "learning_rate": 0.000120383012142814, + "loss": 2.8047, + "step": 13184 + }, + { + "epoch": 1.1945369300808588, + "grad_norm": 0.8730714917182922, + "learning_rate": 0.00012037697094182323, + "loss": 2.8254, + "step": 13185 + }, + { + "epoch": 1.1946275282553056, + "grad_norm": 0.8842551112174988, + "learning_rate": 0.00012037092974083249, + "loss": 2.707, + "step": 13186 + }, + { + "epoch": 1.1947181264297524, + "grad_norm": 0.8886706829071045, + "learning_rate": 0.00012036488853984173, + "loss": 2.725, + "step": 13187 + }, + { + "epoch": 1.1948087246041992, + "grad_norm": 0.9626285433769226, + "learning_rate": 0.00012035884733885097, + "loss": 2.6002, + "step": 13188 + }, + { + "epoch": 1.194899322778646, + "grad_norm": 0.9214462041854858, + "learning_rate": 0.00012035280613786022, + "loss": 2.7027, + "step": 13189 + }, + { + "epoch": 1.1949899209530928, + "grad_norm": 0.8614215850830078, + "learning_rate": 0.00012034676493686945, + "loss": 2.6288, + "step": 13190 + }, + { + "epoch": 1.1950805191275395, + "grad_norm": 0.9341640472412109, + "learning_rate": 0.0001203407237358787, + "loss": 2.6711, + "step": 13191 + }, + { + "epoch": 1.1951711173019863, + "grad_norm": 0.9343839883804321, + "learning_rate": 0.00012033468253488793, + "loss": 3.0869, + "step": 13192 + }, + { + "epoch": 1.1952617154764331, + "grad_norm": 0.8898777961730957, + "learning_rate": 0.00012032864133389718, + "loss": 2.841, + "step": 13193 + }, + { + "epoch": 1.19535231365088, + "grad_norm": 0.8380894064903259, + "learning_rate": 0.00012032260013290644, + "loss": 2.5139, + "step": 13194 + }, + { + "epoch": 1.1954429118253267, + "grad_norm": 0.9271271824836731, + "learning_rate": 0.00012031655893191566, + "loss": 2.5048, + "step": 13195 + }, + { + "epoch": 1.1955335099997735, + "grad_norm": 0.8487986326217651, + "learning_rate": 0.00012031051773092492, + "loss": 2.7715, + "step": 13196 + }, + { + "epoch": 1.1956241081742203, + "grad_norm": 0.9830788969993591, + "learning_rate": 0.00012030447652993416, + "loss": 2.7357, + "step": 13197 + }, + { + "epoch": 1.195714706348667, + "grad_norm": 0.9984167218208313, + "learning_rate": 0.0001202984353289434, + "loss": 2.7329, + "step": 13198 + }, + { + "epoch": 1.1958053045231138, + "grad_norm": 0.9417247772216797, + "learning_rate": 0.00012029239412795264, + "loss": 2.7756, + "step": 13199 + }, + { + "epoch": 1.1958959026975606, + "grad_norm": 0.8510646820068359, + "learning_rate": 0.00012028635292696189, + "loss": 2.8492, + "step": 13200 + }, + { + "epoch": 1.1959865008720074, + "grad_norm": 0.9139812588691711, + "learning_rate": 0.00012028031172597113, + "loss": 3.0073, + "step": 13201 + }, + { + "epoch": 1.1960770990464542, + "grad_norm": 0.9550654292106628, + "learning_rate": 0.00012027427052498037, + "loss": 2.8461, + "step": 13202 + }, + { + "epoch": 1.196167697220901, + "grad_norm": 0.9353850483894348, + "learning_rate": 0.00012026822932398963, + "loss": 2.5849, + "step": 13203 + }, + { + "epoch": 1.1962582953953478, + "grad_norm": 0.8713497519493103, + "learning_rate": 0.00012026218812299885, + "loss": 2.6248, + "step": 13204 + }, + { + "epoch": 1.1963488935697946, + "grad_norm": 0.8819947838783264, + "learning_rate": 0.00012025614692200811, + "loss": 2.5939, + "step": 13205 + }, + { + "epoch": 1.1964394917442414, + "grad_norm": 0.9178290367126465, + "learning_rate": 0.00012025010572101733, + "loss": 2.3423, + "step": 13206 + }, + { + "epoch": 1.1965300899186881, + "grad_norm": 0.86623215675354, + "learning_rate": 0.00012024406452002659, + "loss": 2.5964, + "step": 13207 + }, + { + "epoch": 1.196620688093135, + "grad_norm": 0.913720965385437, + "learning_rate": 0.00012023802331903583, + "loss": 2.7385, + "step": 13208 + }, + { + "epoch": 1.1967112862675817, + "grad_norm": 0.7771788239479065, + "learning_rate": 0.00012023198211804507, + "loss": 1.7449, + "step": 13209 + }, + { + "epoch": 1.1968018844420285, + "grad_norm": 0.866535484790802, + "learning_rate": 0.00012022594091705432, + "loss": 2.6794, + "step": 13210 + }, + { + "epoch": 1.1968924826164753, + "grad_norm": 0.8877447247505188, + "learning_rate": 0.00012021989971606356, + "loss": 2.7733, + "step": 13211 + }, + { + "epoch": 1.196983080790922, + "grad_norm": 1.0095254182815552, + "learning_rate": 0.0001202138585150728, + "loss": 2.8591, + "step": 13212 + }, + { + "epoch": 1.1970736789653689, + "grad_norm": 0.9081249833106995, + "learning_rate": 0.00012020781731408204, + "loss": 2.7724, + "step": 13213 + }, + { + "epoch": 1.1971642771398157, + "grad_norm": 0.7460049986839294, + "learning_rate": 0.00012020177611309128, + "loss": 1.8115, + "step": 13214 + }, + { + "epoch": 1.1972548753142624, + "grad_norm": 0.7711483836174011, + "learning_rate": 0.00012019573491210052, + "loss": 2.0935, + "step": 13215 + }, + { + "epoch": 1.1973454734887092, + "grad_norm": 0.9349619150161743, + "learning_rate": 0.00012018969371110978, + "loss": 2.7079, + "step": 13216 + }, + { + "epoch": 1.197436071663156, + "grad_norm": 0.8669865131378174, + "learning_rate": 0.00012018365251011903, + "loss": 2.4769, + "step": 13217 + }, + { + "epoch": 1.1975266698376028, + "grad_norm": 0.9116100072860718, + "learning_rate": 0.00012017761130912826, + "loss": 2.8353, + "step": 13218 + }, + { + "epoch": 1.1976172680120496, + "grad_norm": 0.7901685237884521, + "learning_rate": 0.00012017157010813751, + "loss": 1.8444, + "step": 13219 + }, + { + "epoch": 1.1977078661864964, + "grad_norm": 0.8668563365936279, + "learning_rate": 0.00012016552890714674, + "loss": 2.6732, + "step": 13220 + }, + { + "epoch": 1.1977984643609432, + "grad_norm": 0.9242969751358032, + "learning_rate": 0.00012015948770615599, + "loss": 2.8942, + "step": 13221 + }, + { + "epoch": 1.19788906253539, + "grad_norm": 0.9304991960525513, + "learning_rate": 0.00012015344650516522, + "loss": 3.0093, + "step": 13222 + }, + { + "epoch": 1.1979796607098367, + "grad_norm": 0.9733287692070007, + "learning_rate": 0.00012014740530417447, + "loss": 2.7151, + "step": 13223 + }, + { + "epoch": 1.1980702588842835, + "grad_norm": 0.8022044897079468, + "learning_rate": 0.00012014136410318373, + "loss": 2.0915, + "step": 13224 + }, + { + "epoch": 1.1981608570587303, + "grad_norm": 0.9218568801879883, + "learning_rate": 0.00012013532290219295, + "loss": 2.8667, + "step": 13225 + }, + { + "epoch": 1.198251455233177, + "grad_norm": 0.8645904660224915, + "learning_rate": 0.00012012928170120222, + "loss": 2.7609, + "step": 13226 + }, + { + "epoch": 1.198342053407624, + "grad_norm": 0.8654434680938721, + "learning_rate": 0.00012012324050021144, + "loss": 2.7588, + "step": 13227 + }, + { + "epoch": 1.1984326515820707, + "grad_norm": 0.869622528553009, + "learning_rate": 0.0001201171992992207, + "loss": 2.6507, + "step": 13228 + }, + { + "epoch": 1.1985232497565175, + "grad_norm": 0.890637993812561, + "learning_rate": 0.00012011115809822993, + "loss": 2.7191, + "step": 13229 + }, + { + "epoch": 1.1986138479309643, + "grad_norm": 0.8373474478721619, + "learning_rate": 0.00012010511689723918, + "loss": 2.5945, + "step": 13230 + }, + { + "epoch": 1.198704446105411, + "grad_norm": 0.8360206484794617, + "learning_rate": 0.00012009907569624843, + "loss": 2.6809, + "step": 13231 + }, + { + "epoch": 1.1987950442798578, + "grad_norm": 0.7983487844467163, + "learning_rate": 0.00012009303449525766, + "loss": 2.0849, + "step": 13232 + }, + { + "epoch": 1.1988856424543046, + "grad_norm": 0.8168964982032776, + "learning_rate": 0.00012008699329426691, + "loss": 2.5728, + "step": 13233 + }, + { + "epoch": 1.1989762406287514, + "grad_norm": 0.9799750447273254, + "learning_rate": 0.00012008095209327614, + "loss": 2.7865, + "step": 13234 + }, + { + "epoch": 1.1990668388031982, + "grad_norm": 0.8853220343589783, + "learning_rate": 0.0001200749108922854, + "loss": 2.6012, + "step": 13235 + }, + { + "epoch": 1.199157436977645, + "grad_norm": 0.8950837850570679, + "learning_rate": 0.00012006886969129462, + "loss": 2.7822, + "step": 13236 + }, + { + "epoch": 1.1992480351520918, + "grad_norm": 0.9014605283737183, + "learning_rate": 0.00012006282849030388, + "loss": 2.844, + "step": 13237 + }, + { + "epoch": 1.1993386333265386, + "grad_norm": 0.8796595335006714, + "learning_rate": 0.0001200567872893131, + "loss": 2.6367, + "step": 13238 + }, + { + "epoch": 1.1994292315009853, + "grad_norm": 0.9183904528617859, + "learning_rate": 0.00012005074608832237, + "loss": 2.8557, + "step": 13239 + }, + { + "epoch": 1.1995198296754321, + "grad_norm": 0.9172179698944092, + "learning_rate": 0.00012004470488733161, + "loss": 2.7656, + "step": 13240 + }, + { + "epoch": 1.199610427849879, + "grad_norm": 0.9369338154792786, + "learning_rate": 0.00012003866368634085, + "loss": 2.6773, + "step": 13241 + }, + { + "epoch": 1.1997010260243255, + "grad_norm": 1.008125901222229, + "learning_rate": 0.0001200326224853501, + "loss": 2.7764, + "step": 13242 + }, + { + "epoch": 1.1997916241987725, + "grad_norm": 0.902376651763916, + "learning_rate": 0.00012002658128435933, + "loss": 2.6566, + "step": 13243 + }, + { + "epoch": 1.199882222373219, + "grad_norm": 0.9029714465141296, + "learning_rate": 0.00012002054008336858, + "loss": 2.6633, + "step": 13244 + }, + { + "epoch": 1.199972820547666, + "grad_norm": 0.951615571975708, + "learning_rate": 0.00012001449888237781, + "loss": 2.7159, + "step": 13245 + }, + { + "epoch": 1.2000634187221126, + "grad_norm": 0.9182167053222656, + "learning_rate": 0.00012000845768138706, + "loss": 2.5462, + "step": 13246 + }, + { + "epoch": 1.2001540168965597, + "grad_norm": 0.9255519509315491, + "learning_rate": 0.00012000241648039632, + "loss": 2.8024, + "step": 13247 + }, + { + "epoch": 1.2002446150710062, + "grad_norm": 0.8840047121047974, + "learning_rate": 0.00011999637527940555, + "loss": 2.6223, + "step": 13248 + }, + { + "epoch": 1.2003352132454532, + "grad_norm": 0.8601930141448975, + "learning_rate": 0.0001199903340784148, + "loss": 2.8615, + "step": 13249 + }, + { + "epoch": 1.2004258114198998, + "grad_norm": 0.8518264293670654, + "learning_rate": 0.00011998429287742404, + "loss": 2.8011, + "step": 13250 + }, + { + "epoch": 1.2005164095943466, + "grad_norm": 0.9869011044502258, + "learning_rate": 0.00011997825167643328, + "loss": 2.6482, + "step": 13251 + }, + { + "epoch": 1.2006070077687934, + "grad_norm": 0.7477778792381287, + "learning_rate": 0.00011997221047544252, + "loss": 2.1672, + "step": 13252 + }, + { + "epoch": 1.2006976059432402, + "grad_norm": 0.9290754795074463, + "learning_rate": 0.00011996616927445177, + "loss": 2.6548, + "step": 13253 + }, + { + "epoch": 1.200788204117687, + "grad_norm": 0.9173887372016907, + "learning_rate": 0.00011996012807346103, + "loss": 2.5627, + "step": 13254 + }, + { + "epoch": 1.2008788022921337, + "grad_norm": 0.9304068088531494, + "learning_rate": 0.00011995408687247025, + "loss": 2.7593, + "step": 13255 + }, + { + "epoch": 1.2009694004665805, + "grad_norm": 0.9253214001655579, + "learning_rate": 0.00011994804567147951, + "loss": 2.7963, + "step": 13256 + }, + { + "epoch": 1.2010599986410273, + "grad_norm": 0.9114800095558167, + "learning_rate": 0.00011994200447048873, + "loss": 2.8711, + "step": 13257 + }, + { + "epoch": 1.201150596815474, + "grad_norm": 0.8657671809196472, + "learning_rate": 0.00011993596326949799, + "loss": 2.5755, + "step": 13258 + }, + { + "epoch": 1.2012411949899209, + "grad_norm": 0.8711355924606323, + "learning_rate": 0.00011992992206850721, + "loss": 2.8752, + "step": 13259 + }, + { + "epoch": 1.2013317931643677, + "grad_norm": 0.956447184085846, + "learning_rate": 0.00011992388086751647, + "loss": 2.9287, + "step": 13260 + }, + { + "epoch": 1.2014223913388145, + "grad_norm": 0.8635048270225525, + "learning_rate": 0.00011991783966652572, + "loss": 2.3689, + "step": 13261 + }, + { + "epoch": 1.2015129895132612, + "grad_norm": 0.8848374485969543, + "learning_rate": 0.00011991179846553495, + "loss": 2.6841, + "step": 13262 + }, + { + "epoch": 1.201603587687708, + "grad_norm": 0.9211754202842712, + "learning_rate": 0.0001199057572645442, + "loss": 2.922, + "step": 13263 + }, + { + "epoch": 1.2016941858621548, + "grad_norm": 0.8627405166625977, + "learning_rate": 0.00011989971606355343, + "loss": 2.5564, + "step": 13264 + }, + { + "epoch": 1.2017847840366016, + "grad_norm": 0.919350266456604, + "learning_rate": 0.00011989367486256268, + "loss": 2.5963, + "step": 13265 + }, + { + "epoch": 1.2018753822110484, + "grad_norm": 0.8787184953689575, + "learning_rate": 0.00011988763366157192, + "loss": 2.6436, + "step": 13266 + }, + { + "epoch": 1.2019659803854952, + "grad_norm": 0.9923052191734314, + "learning_rate": 0.00011988159246058118, + "loss": 3.0624, + "step": 13267 + }, + { + "epoch": 1.202056578559942, + "grad_norm": 0.9767078161239624, + "learning_rate": 0.0001198755512595904, + "loss": 2.6199, + "step": 13268 + }, + { + "epoch": 1.2021471767343888, + "grad_norm": 0.9027463793754578, + "learning_rate": 0.00011986951005859966, + "loss": 2.8865, + "step": 13269 + }, + { + "epoch": 1.2022377749088355, + "grad_norm": 0.9193617105484009, + "learning_rate": 0.0001198634688576089, + "loss": 2.7818, + "step": 13270 + }, + { + "epoch": 1.2023283730832823, + "grad_norm": 0.9132376909255981, + "learning_rate": 0.00011985742765661814, + "loss": 2.5578, + "step": 13271 + }, + { + "epoch": 1.2024189712577291, + "grad_norm": 0.7682933211326599, + "learning_rate": 0.00011985138645562739, + "loss": 2.0536, + "step": 13272 + }, + { + "epoch": 1.202509569432176, + "grad_norm": 0.7699368000030518, + "learning_rate": 0.00011984534525463662, + "loss": 2.3177, + "step": 13273 + }, + { + "epoch": 1.2026001676066227, + "grad_norm": 1.0234408378601074, + "learning_rate": 0.00011983930405364587, + "loss": 2.694, + "step": 13274 + }, + { + "epoch": 1.2026907657810695, + "grad_norm": 0.8899630904197693, + "learning_rate": 0.0001198332628526551, + "loss": 2.7555, + "step": 13275 + }, + { + "epoch": 1.2027813639555163, + "grad_norm": 0.9818927645683289, + "learning_rate": 0.00011982722165166435, + "loss": 2.688, + "step": 13276 + }, + { + "epoch": 1.202871962129963, + "grad_norm": 0.9615073800086975, + "learning_rate": 0.00011982118045067361, + "loss": 2.6538, + "step": 13277 + }, + { + "epoch": 1.2029625603044098, + "grad_norm": 0.9282393455505371, + "learning_rate": 0.00011981513924968283, + "loss": 2.5094, + "step": 13278 + }, + { + "epoch": 1.2030531584788566, + "grad_norm": 0.9140923023223877, + "learning_rate": 0.0001198090980486921, + "loss": 2.4698, + "step": 13279 + }, + { + "epoch": 1.2031437566533034, + "grad_norm": 0.8772634267807007, + "learning_rate": 0.00011980305684770133, + "loss": 2.6277, + "step": 13280 + }, + { + "epoch": 1.2032343548277502, + "grad_norm": 0.934677243232727, + "learning_rate": 0.00011979701564671058, + "loss": 2.648, + "step": 13281 + }, + { + "epoch": 1.203324953002197, + "grad_norm": 0.8575340509414673, + "learning_rate": 0.00011979097444571981, + "loss": 2.6521, + "step": 13282 + }, + { + "epoch": 1.2034155511766438, + "grad_norm": 0.9221498966217041, + "learning_rate": 0.00011978493324472906, + "loss": 2.6949, + "step": 13283 + }, + { + "epoch": 1.2035061493510906, + "grad_norm": 0.9172356724739075, + "learning_rate": 0.0001197788920437383, + "loss": 2.6806, + "step": 13284 + }, + { + "epoch": 1.2035967475255374, + "grad_norm": 0.8415682315826416, + "learning_rate": 0.00011977285084274754, + "loss": 2.4933, + "step": 13285 + }, + { + "epoch": 1.2036873456999841, + "grad_norm": 0.9921873211860657, + "learning_rate": 0.0001197668096417568, + "loss": 2.7045, + "step": 13286 + }, + { + "epoch": 1.203777943874431, + "grad_norm": 0.9288135766983032, + "learning_rate": 0.00011976076844076602, + "loss": 2.8017, + "step": 13287 + }, + { + "epoch": 1.2038685420488777, + "grad_norm": 0.6895476579666138, + "learning_rate": 0.00011975472723977528, + "loss": 1.2755, + "step": 13288 + }, + { + "epoch": 1.2039591402233245, + "grad_norm": 1.1049129962921143, + "learning_rate": 0.0001197486860387845, + "loss": 2.6766, + "step": 13289 + }, + { + "epoch": 1.2040497383977713, + "grad_norm": 0.9086504578590393, + "learning_rate": 0.00011974264483779376, + "loss": 2.5487, + "step": 13290 + }, + { + "epoch": 1.204140336572218, + "grad_norm": 0.8859380483627319, + "learning_rate": 0.00011973660363680301, + "loss": 2.6413, + "step": 13291 + }, + { + "epoch": 1.2042309347466649, + "grad_norm": 0.9562059640884399, + "learning_rate": 0.00011973056243581225, + "loss": 2.6208, + "step": 13292 + }, + { + "epoch": 1.2043215329211117, + "grad_norm": 0.8609703779220581, + "learning_rate": 0.0001197245212348215, + "loss": 2.7199, + "step": 13293 + }, + { + "epoch": 1.2044121310955584, + "grad_norm": 0.8983001112937927, + "learning_rate": 0.00011971848003383073, + "loss": 2.7118, + "step": 13294 + }, + { + "epoch": 1.2045027292700052, + "grad_norm": 0.7124919295310974, + "learning_rate": 0.00011971243883283997, + "loss": 1.9782, + "step": 13295 + }, + { + "epoch": 1.204593327444452, + "grad_norm": 0.7935409545898438, + "learning_rate": 0.00011970639763184921, + "loss": 2.0234, + "step": 13296 + }, + { + "epoch": 1.2046839256188988, + "grad_norm": 0.9782229661941528, + "learning_rate": 0.00011970035643085846, + "loss": 2.8152, + "step": 13297 + }, + { + "epoch": 1.2047745237933456, + "grad_norm": 0.8715781569480896, + "learning_rate": 0.00011969431522986769, + "loss": 2.6662, + "step": 13298 + }, + { + "epoch": 1.2048651219677924, + "grad_norm": 0.932564914226532, + "learning_rate": 0.00011968827402887695, + "loss": 2.6932, + "step": 13299 + }, + { + "epoch": 1.2049557201422392, + "grad_norm": 0.9063611030578613, + "learning_rate": 0.0001196822328278862, + "loss": 2.7753, + "step": 13300 + }, + { + "epoch": 1.205046318316686, + "grad_norm": 0.8797300457954407, + "learning_rate": 0.00011967619162689543, + "loss": 2.9461, + "step": 13301 + }, + { + "epoch": 1.2051369164911327, + "grad_norm": 0.918034017086029, + "learning_rate": 0.00011967015042590468, + "loss": 2.8284, + "step": 13302 + }, + { + "epoch": 1.2052275146655795, + "grad_norm": 0.8903288245201111, + "learning_rate": 0.00011966410922491392, + "loss": 2.8333, + "step": 13303 + }, + { + "epoch": 1.2053181128400263, + "grad_norm": 0.886682391166687, + "learning_rate": 0.00011965806802392316, + "loss": 2.6576, + "step": 13304 + }, + { + "epoch": 1.205408711014473, + "grad_norm": 0.8517394065856934, + "learning_rate": 0.0001196520268229324, + "loss": 2.2746, + "step": 13305 + }, + { + "epoch": 1.20549930918892, + "grad_norm": 0.8665563464164734, + "learning_rate": 0.00011964598562194164, + "loss": 2.5765, + "step": 13306 + }, + { + "epoch": 1.2055899073633667, + "grad_norm": 0.8748055100440979, + "learning_rate": 0.0001196399444209509, + "loss": 2.5674, + "step": 13307 + }, + { + "epoch": 1.2056805055378135, + "grad_norm": 0.8471503257751465, + "learning_rate": 0.00011963390321996013, + "loss": 2.7961, + "step": 13308 + }, + { + "epoch": 1.2057711037122603, + "grad_norm": 0.9093150496482849, + "learning_rate": 0.00011962786201896939, + "loss": 2.6406, + "step": 13309 + }, + { + "epoch": 1.205861701886707, + "grad_norm": 0.8393995761871338, + "learning_rate": 0.00011962182081797861, + "loss": 2.0305, + "step": 13310 + }, + { + "epoch": 1.2059523000611538, + "grad_norm": 0.8328712582588196, + "learning_rate": 0.00011961577961698787, + "loss": 2.5859, + "step": 13311 + }, + { + "epoch": 1.2060428982356006, + "grad_norm": 0.9585071206092834, + "learning_rate": 0.0001196097384159971, + "loss": 2.7243, + "step": 13312 + }, + { + "epoch": 1.2061334964100474, + "grad_norm": 0.8927338123321533, + "learning_rate": 0.00011960369721500635, + "loss": 2.6831, + "step": 13313 + }, + { + "epoch": 1.2062240945844942, + "grad_norm": 0.9259606599807739, + "learning_rate": 0.0001195976560140156, + "loss": 2.7392, + "step": 13314 + }, + { + "epoch": 1.206314692758941, + "grad_norm": 0.9590035080909729, + "learning_rate": 0.00011959161481302483, + "loss": 2.7477, + "step": 13315 + }, + { + "epoch": 1.2064052909333878, + "grad_norm": 0.8745469450950623, + "learning_rate": 0.00011958557361203408, + "loss": 2.7025, + "step": 13316 + }, + { + "epoch": 1.2064958891078346, + "grad_norm": 0.9102069735527039, + "learning_rate": 0.00011957953241104331, + "loss": 2.8401, + "step": 13317 + }, + { + "epoch": 1.2065864872822814, + "grad_norm": 0.9744749069213867, + "learning_rate": 0.00011957349121005258, + "loss": 2.6908, + "step": 13318 + }, + { + "epoch": 1.2066770854567281, + "grad_norm": 0.9070314168930054, + "learning_rate": 0.0001195674500090618, + "loss": 2.7177, + "step": 13319 + }, + { + "epoch": 1.206767683631175, + "grad_norm": 0.8376643657684326, + "learning_rate": 0.00011956140880807106, + "loss": 2.5195, + "step": 13320 + }, + { + "epoch": 1.2068582818056217, + "grad_norm": 0.8630726933479309, + "learning_rate": 0.0001195553676070803, + "loss": 2.4435, + "step": 13321 + }, + { + "epoch": 1.2069488799800685, + "grad_norm": 0.9089094996452332, + "learning_rate": 0.00011954932640608954, + "loss": 2.864, + "step": 13322 + }, + { + "epoch": 1.207039478154515, + "grad_norm": 0.9418652057647705, + "learning_rate": 0.00011954328520509879, + "loss": 2.7803, + "step": 13323 + }, + { + "epoch": 1.207130076328962, + "grad_norm": 0.9881170392036438, + "learning_rate": 0.00011953724400410802, + "loss": 2.5504, + "step": 13324 + }, + { + "epoch": 1.2072206745034086, + "grad_norm": 0.9695313572883606, + "learning_rate": 0.00011953120280311727, + "loss": 2.7702, + "step": 13325 + }, + { + "epoch": 1.2073112726778557, + "grad_norm": 0.8770482540130615, + "learning_rate": 0.0001195251616021265, + "loss": 2.4655, + "step": 13326 + }, + { + "epoch": 1.2074018708523022, + "grad_norm": 0.8266772627830505, + "learning_rate": 0.00011951912040113575, + "loss": 1.9725, + "step": 13327 + }, + { + "epoch": 1.2074924690267492, + "grad_norm": 0.9064305424690247, + "learning_rate": 0.00011951307920014498, + "loss": 2.9884, + "step": 13328 + }, + { + "epoch": 1.2075830672011958, + "grad_norm": 0.8777974247932434, + "learning_rate": 0.00011950703799915423, + "loss": 2.5731, + "step": 13329 + }, + { + "epoch": 1.2076736653756428, + "grad_norm": 0.8461089134216309, + "learning_rate": 0.00011950099679816349, + "loss": 2.7998, + "step": 13330 + }, + { + "epoch": 1.2077642635500894, + "grad_norm": 0.944796621799469, + "learning_rate": 0.00011949495559717273, + "loss": 3.0239, + "step": 13331 + }, + { + "epoch": 1.2078548617245362, + "grad_norm": 0.8593956232070923, + "learning_rate": 0.00011948891439618197, + "loss": 2.4973, + "step": 13332 + }, + { + "epoch": 1.207945459898983, + "grad_norm": 0.8772201538085938, + "learning_rate": 0.00011948287319519121, + "loss": 2.7993, + "step": 13333 + }, + { + "epoch": 1.2080360580734297, + "grad_norm": 0.8839553594589233, + "learning_rate": 0.00011947683199420046, + "loss": 2.7594, + "step": 13334 + }, + { + "epoch": 1.2081266562478765, + "grad_norm": 0.9341968894004822, + "learning_rate": 0.00011947079079320969, + "loss": 2.5279, + "step": 13335 + }, + { + "epoch": 1.2082172544223233, + "grad_norm": 0.8715872764587402, + "learning_rate": 0.00011946474959221894, + "loss": 2.6049, + "step": 13336 + }, + { + "epoch": 1.20830785259677, + "grad_norm": 0.7361478209495544, + "learning_rate": 0.00011945870839122818, + "loss": 2.0422, + "step": 13337 + }, + { + "epoch": 1.2083984507712169, + "grad_norm": 0.8662170767784119, + "learning_rate": 0.00011945266719023742, + "loss": 2.9048, + "step": 13338 + }, + { + "epoch": 1.2084890489456637, + "grad_norm": 0.9405286908149719, + "learning_rate": 0.00011944662598924668, + "loss": 2.7031, + "step": 13339 + }, + { + "epoch": 1.2085796471201105, + "grad_norm": 0.9404118061065674, + "learning_rate": 0.0001194405847882559, + "loss": 2.9459, + "step": 13340 + }, + { + "epoch": 1.2086702452945572, + "grad_norm": 0.8695924878120422, + "learning_rate": 0.00011943454358726516, + "loss": 2.83, + "step": 13341 + }, + { + "epoch": 1.208760843469004, + "grad_norm": 0.9108064770698547, + "learning_rate": 0.00011942850238627438, + "loss": 2.9784, + "step": 13342 + }, + { + "epoch": 1.2088514416434508, + "grad_norm": 0.7938379645347595, + "learning_rate": 0.00011942246118528364, + "loss": 2.0017, + "step": 13343 + }, + { + "epoch": 1.2089420398178976, + "grad_norm": 0.9407994747161865, + "learning_rate": 0.00011941641998429289, + "loss": 2.6423, + "step": 13344 + }, + { + "epoch": 1.2090326379923444, + "grad_norm": 1.0125030279159546, + "learning_rate": 0.00011941037878330212, + "loss": 2.725, + "step": 13345 + }, + { + "epoch": 1.2091232361667912, + "grad_norm": 0.9541229009628296, + "learning_rate": 0.00011940433758231137, + "loss": 2.4988, + "step": 13346 + }, + { + "epoch": 1.209213834341238, + "grad_norm": 0.8711185455322266, + "learning_rate": 0.0001193982963813206, + "loss": 2.6448, + "step": 13347 + }, + { + "epoch": 1.2093044325156848, + "grad_norm": 0.8643447160720825, + "learning_rate": 0.00011939225518032985, + "loss": 2.595, + "step": 13348 + }, + { + "epoch": 1.2093950306901315, + "grad_norm": 0.9465389251708984, + "learning_rate": 0.00011938621397933909, + "loss": 2.8472, + "step": 13349 + }, + { + "epoch": 1.2094856288645783, + "grad_norm": 0.8529083728790283, + "learning_rate": 0.00011938017277834835, + "loss": 2.735, + "step": 13350 + }, + { + "epoch": 1.2095762270390251, + "grad_norm": 0.8580809235572815, + "learning_rate": 0.0001193741315773576, + "loss": 2.5913, + "step": 13351 + }, + { + "epoch": 1.209666825213472, + "grad_norm": 0.9263976216316223, + "learning_rate": 0.00011936809037636683, + "loss": 2.6609, + "step": 13352 + }, + { + "epoch": 1.2097574233879187, + "grad_norm": 0.8536936640739441, + "learning_rate": 0.00011936204917537608, + "loss": 2.7093, + "step": 13353 + }, + { + "epoch": 1.2098480215623655, + "grad_norm": 0.8738409280776978, + "learning_rate": 0.00011935600797438531, + "loss": 2.7181, + "step": 13354 + }, + { + "epoch": 1.2099386197368123, + "grad_norm": 0.9216057062149048, + "learning_rate": 0.00011934996677339456, + "loss": 2.7747, + "step": 13355 + }, + { + "epoch": 1.210029217911259, + "grad_norm": 0.7660821080207825, + "learning_rate": 0.0001193439255724038, + "loss": 2.1397, + "step": 13356 + }, + { + "epoch": 1.2101198160857058, + "grad_norm": 0.9140440225601196, + "learning_rate": 0.00011933788437141304, + "loss": 2.5635, + "step": 13357 + }, + { + "epoch": 1.2102104142601526, + "grad_norm": 0.9005065560340881, + "learning_rate": 0.00011933184317042228, + "loss": 2.5696, + "step": 13358 + }, + { + "epoch": 1.2103010124345994, + "grad_norm": 0.9070847034454346, + "learning_rate": 0.00011932580196943152, + "loss": 2.6433, + "step": 13359 + }, + { + "epoch": 1.2103916106090462, + "grad_norm": 0.9698418378829956, + "learning_rate": 0.00011931976076844078, + "loss": 2.8535, + "step": 13360 + }, + { + "epoch": 1.210482208783493, + "grad_norm": 0.8985375165939331, + "learning_rate": 0.00011931371956745, + "loss": 2.6921, + "step": 13361 + }, + { + "epoch": 1.2105728069579398, + "grad_norm": 0.8826504349708557, + "learning_rate": 0.00011930767836645927, + "loss": 2.7565, + "step": 13362 + }, + { + "epoch": 1.2106634051323866, + "grad_norm": 0.9924156665802002, + "learning_rate": 0.0001193016371654685, + "loss": 2.8333, + "step": 13363 + }, + { + "epoch": 1.2107540033068334, + "grad_norm": 0.9101721048355103, + "learning_rate": 0.00011929559596447775, + "loss": 2.5067, + "step": 13364 + }, + { + "epoch": 1.2108446014812801, + "grad_norm": 0.9527926445007324, + "learning_rate": 0.00011928955476348698, + "loss": 2.6178, + "step": 13365 + }, + { + "epoch": 1.210935199655727, + "grad_norm": 0.8880937099456787, + "learning_rate": 0.00011928351356249623, + "loss": 2.9579, + "step": 13366 + }, + { + "epoch": 1.2110257978301737, + "grad_norm": 0.9080172181129456, + "learning_rate": 0.00011927747236150548, + "loss": 2.6753, + "step": 13367 + }, + { + "epoch": 1.2111163960046205, + "grad_norm": 0.910563051700592, + "learning_rate": 0.00011927143116051471, + "loss": 2.696, + "step": 13368 + }, + { + "epoch": 1.2112069941790673, + "grad_norm": 0.860895574092865, + "learning_rate": 0.00011926538995952396, + "loss": 2.6698, + "step": 13369 + }, + { + "epoch": 1.211297592353514, + "grad_norm": 0.8563354015350342, + "learning_rate": 0.00011925934875853319, + "loss": 2.714, + "step": 13370 + }, + { + "epoch": 1.2113881905279609, + "grad_norm": 0.8069356083869934, + "learning_rate": 0.00011925330755754245, + "loss": 1.9244, + "step": 13371 + }, + { + "epoch": 1.2114787887024077, + "grad_norm": 0.8766778707504272, + "learning_rate": 0.00011924726635655167, + "loss": 2.5372, + "step": 13372 + }, + { + "epoch": 1.2115693868768544, + "grad_norm": 0.8956578373908997, + "learning_rate": 0.00011924122515556094, + "loss": 2.5258, + "step": 13373 + }, + { + "epoch": 1.2116599850513012, + "grad_norm": 0.9781584143638611, + "learning_rate": 0.00011923518395457018, + "loss": 2.5865, + "step": 13374 + }, + { + "epoch": 1.211750583225748, + "grad_norm": 0.8262963891029358, + "learning_rate": 0.00011922914275357942, + "loss": 2.0991, + "step": 13375 + }, + { + "epoch": 1.2118411814001948, + "grad_norm": 0.910077691078186, + "learning_rate": 0.00011922310155258867, + "loss": 2.6706, + "step": 13376 + }, + { + "epoch": 1.2119317795746416, + "grad_norm": 0.9146504998207092, + "learning_rate": 0.0001192170603515979, + "loss": 2.4533, + "step": 13377 + }, + { + "epoch": 1.2120223777490884, + "grad_norm": 0.9335542917251587, + "learning_rate": 0.00011921101915060715, + "loss": 2.6256, + "step": 13378 + }, + { + "epoch": 1.2121129759235352, + "grad_norm": 0.9193528890609741, + "learning_rate": 0.00011920497794961638, + "loss": 2.7367, + "step": 13379 + }, + { + "epoch": 1.212203574097982, + "grad_norm": 0.8836948275566101, + "learning_rate": 0.00011919893674862563, + "loss": 2.6027, + "step": 13380 + }, + { + "epoch": 1.2122941722724287, + "grad_norm": 0.9431951642036438, + "learning_rate": 0.00011919289554763489, + "loss": 2.6902, + "step": 13381 + }, + { + "epoch": 1.2123847704468755, + "grad_norm": 0.8969326019287109, + "learning_rate": 0.00011918685434664411, + "loss": 2.7547, + "step": 13382 + }, + { + "epoch": 1.2124753686213223, + "grad_norm": 0.9070160388946533, + "learning_rate": 0.00011918081314565337, + "loss": 2.6737, + "step": 13383 + }, + { + "epoch": 1.2125659667957691, + "grad_norm": 0.9540169835090637, + "learning_rate": 0.0001191747719446626, + "loss": 2.9563, + "step": 13384 + }, + { + "epoch": 1.212656564970216, + "grad_norm": 0.8842712640762329, + "learning_rate": 0.00011916873074367185, + "loss": 2.6542, + "step": 13385 + }, + { + "epoch": 1.2127471631446627, + "grad_norm": 0.8828530311584473, + "learning_rate": 0.00011916268954268109, + "loss": 2.7923, + "step": 13386 + }, + { + "epoch": 1.2128377613191095, + "grad_norm": 0.9280295968055725, + "learning_rate": 0.00011915664834169033, + "loss": 2.6731, + "step": 13387 + }, + { + "epoch": 1.2129283594935563, + "grad_norm": 0.8635680079460144, + "learning_rate": 0.00011915060714069957, + "loss": 2.6834, + "step": 13388 + }, + { + "epoch": 1.213018957668003, + "grad_norm": 0.7472730875015259, + "learning_rate": 0.00011914456593970882, + "loss": 2.2401, + "step": 13389 + }, + { + "epoch": 1.2131095558424498, + "grad_norm": 0.8841162323951721, + "learning_rate": 0.00011913852473871808, + "loss": 2.7603, + "step": 13390 + }, + { + "epoch": 1.2132001540168966, + "grad_norm": 0.8425207734107971, + "learning_rate": 0.0001191324835377273, + "loss": 2.0976, + "step": 13391 + }, + { + "epoch": 1.2132907521913434, + "grad_norm": 0.9187888503074646, + "learning_rate": 0.00011912644233673656, + "loss": 2.7993, + "step": 13392 + }, + { + "epoch": 1.2133813503657902, + "grad_norm": 0.9312507510185242, + "learning_rate": 0.00011912040113574578, + "loss": 2.8055, + "step": 13393 + }, + { + "epoch": 1.213471948540237, + "grad_norm": 0.9255154132843018, + "learning_rate": 0.00011911435993475504, + "loss": 2.8722, + "step": 13394 + }, + { + "epoch": 1.2135625467146838, + "grad_norm": 0.8880317211151123, + "learning_rate": 0.00011910831873376427, + "loss": 2.6173, + "step": 13395 + }, + { + "epoch": 1.2136531448891306, + "grad_norm": 0.8476423025131226, + "learning_rate": 0.00011910227753277352, + "loss": 2.6958, + "step": 13396 + }, + { + "epoch": 1.2137437430635774, + "grad_norm": 0.9103541374206543, + "learning_rate": 0.00011909623633178277, + "loss": 2.9105, + "step": 13397 + }, + { + "epoch": 1.2138343412380241, + "grad_norm": 0.8770169615745544, + "learning_rate": 0.000119090195130792, + "loss": 2.625, + "step": 13398 + }, + { + "epoch": 1.213924939412471, + "grad_norm": 1.0482922792434692, + "learning_rate": 0.00011908415392980125, + "loss": 2.8687, + "step": 13399 + }, + { + "epoch": 1.2140155375869177, + "grad_norm": 0.8862916231155396, + "learning_rate": 0.00011907811272881049, + "loss": 2.7971, + "step": 13400 + }, + { + "epoch": 1.2141061357613645, + "grad_norm": 0.9265488386154175, + "learning_rate": 0.00011907207152781973, + "loss": 2.8218, + "step": 13401 + }, + { + "epoch": 1.2141967339358113, + "grad_norm": 0.9199996590614319, + "learning_rate": 0.00011906603032682897, + "loss": 2.6293, + "step": 13402 + }, + { + "epoch": 1.214287332110258, + "grad_norm": 0.8026936054229736, + "learning_rate": 0.00011905998912583823, + "loss": 1.9567, + "step": 13403 + }, + { + "epoch": 1.2143779302847046, + "grad_norm": 0.8605182766914368, + "learning_rate": 0.00011905394792484748, + "loss": 2.7879, + "step": 13404 + }, + { + "epoch": 1.2144685284591517, + "grad_norm": 0.9906163811683655, + "learning_rate": 0.00011904790672385671, + "loss": 2.9713, + "step": 13405 + }, + { + "epoch": 1.2145591266335982, + "grad_norm": 0.7556615471839905, + "learning_rate": 0.00011904186552286596, + "loss": 1.9442, + "step": 13406 + }, + { + "epoch": 1.2146497248080452, + "grad_norm": 0.7737604975700378, + "learning_rate": 0.00011903582432187519, + "loss": 2.059, + "step": 13407 + }, + { + "epoch": 1.2147403229824918, + "grad_norm": 0.888115644454956, + "learning_rate": 0.00011902978312088444, + "loss": 2.8042, + "step": 13408 + }, + { + "epoch": 1.2148309211569388, + "grad_norm": 0.8920609951019287, + "learning_rate": 0.00011902374191989367, + "loss": 2.5094, + "step": 13409 + }, + { + "epoch": 1.2149215193313854, + "grad_norm": 0.9123407006263733, + "learning_rate": 0.00011901770071890292, + "loss": 2.993, + "step": 13410 + }, + { + "epoch": 1.2150121175058324, + "grad_norm": 0.9296349883079529, + "learning_rate": 0.00011901165951791218, + "loss": 2.6822, + "step": 13411 + }, + { + "epoch": 1.215102715680279, + "grad_norm": 0.899649977684021, + "learning_rate": 0.0001190056183169214, + "loss": 2.8242, + "step": 13412 + }, + { + "epoch": 1.2151933138547257, + "grad_norm": 0.7826650738716125, + "learning_rate": 0.00011899957711593066, + "loss": 2.0464, + "step": 13413 + }, + { + "epoch": 1.2152839120291725, + "grad_norm": 0.9012027382850647, + "learning_rate": 0.00011899353591493988, + "loss": 2.6292, + "step": 13414 + }, + { + "epoch": 1.2153745102036193, + "grad_norm": 0.9222517609596252, + "learning_rate": 0.00011898749471394915, + "loss": 2.7149, + "step": 13415 + }, + { + "epoch": 1.215465108378066, + "grad_norm": 0.8506391048431396, + "learning_rate": 0.00011898145351295838, + "loss": 2.7796, + "step": 13416 + }, + { + "epoch": 1.2155557065525129, + "grad_norm": 0.8255226016044617, + "learning_rate": 0.00011897541231196763, + "loss": 2.7832, + "step": 13417 + }, + { + "epoch": 1.2156463047269597, + "grad_norm": 0.8842960000038147, + "learning_rate": 0.00011896937111097686, + "loss": 2.6827, + "step": 13418 + }, + { + "epoch": 1.2157369029014065, + "grad_norm": 0.9028264284133911, + "learning_rate": 0.00011896332990998611, + "loss": 2.7181, + "step": 13419 + }, + { + "epoch": 1.2158275010758532, + "grad_norm": 0.7465330958366394, + "learning_rate": 0.00011895728870899536, + "loss": 1.826, + "step": 13420 + }, + { + "epoch": 1.2159180992503, + "grad_norm": 0.874526858329773, + "learning_rate": 0.00011895124750800459, + "loss": 2.9373, + "step": 13421 + }, + { + "epoch": 1.2160086974247468, + "grad_norm": 0.9481098055839539, + "learning_rate": 0.00011894520630701385, + "loss": 2.6546, + "step": 13422 + }, + { + "epoch": 1.2160992955991936, + "grad_norm": 0.9519539475440979, + "learning_rate": 0.00011893916510602307, + "loss": 2.7804, + "step": 13423 + }, + { + "epoch": 1.2161898937736404, + "grad_norm": 0.9592075943946838, + "learning_rate": 0.00011893312390503233, + "loss": 2.7304, + "step": 13424 + }, + { + "epoch": 1.2162804919480872, + "grad_norm": 0.8844003081321716, + "learning_rate": 0.00011892708270404155, + "loss": 2.5988, + "step": 13425 + }, + { + "epoch": 1.216371090122534, + "grad_norm": 0.902490496635437, + "learning_rate": 0.00011892104150305081, + "loss": 2.578, + "step": 13426 + }, + { + "epoch": 1.2164616882969808, + "grad_norm": 0.897646427154541, + "learning_rate": 0.00011891500030206006, + "loss": 2.8185, + "step": 13427 + }, + { + "epoch": 1.2165522864714275, + "grad_norm": 0.915958046913147, + "learning_rate": 0.0001189089591010693, + "loss": 2.7567, + "step": 13428 + }, + { + "epoch": 1.2166428846458743, + "grad_norm": 0.9112728238105774, + "learning_rate": 0.00011890291790007854, + "loss": 2.6934, + "step": 13429 + }, + { + "epoch": 1.2167334828203211, + "grad_norm": 0.837419867515564, + "learning_rate": 0.00011889687669908778, + "loss": 2.7472, + "step": 13430 + }, + { + "epoch": 1.216824080994768, + "grad_norm": 0.963064968585968, + "learning_rate": 0.00011889083549809703, + "loss": 2.7547, + "step": 13431 + }, + { + "epoch": 1.2169146791692147, + "grad_norm": 0.8240452408790588, + "learning_rate": 0.00011888479429710626, + "loss": 2.6226, + "step": 13432 + }, + { + "epoch": 1.2170052773436615, + "grad_norm": 0.8898818492889404, + "learning_rate": 0.00011887875309611551, + "loss": 2.7131, + "step": 13433 + }, + { + "epoch": 1.2170958755181083, + "grad_norm": 0.9266934394836426, + "learning_rate": 0.00011887271189512477, + "loss": 2.7414, + "step": 13434 + }, + { + "epoch": 1.217186473692555, + "grad_norm": 0.8559704422950745, + "learning_rate": 0.000118866670694134, + "loss": 2.4398, + "step": 13435 + }, + { + "epoch": 1.2172770718670018, + "grad_norm": 1.2078471183776855, + "learning_rate": 0.00011886062949314325, + "loss": 2.7299, + "step": 13436 + }, + { + "epoch": 1.2173676700414486, + "grad_norm": 0.9172480702400208, + "learning_rate": 0.00011885458829215248, + "loss": 2.612, + "step": 13437 + }, + { + "epoch": 1.2174582682158954, + "grad_norm": 0.8631792068481445, + "learning_rate": 0.00011884854709116173, + "loss": 2.8915, + "step": 13438 + }, + { + "epoch": 1.2175488663903422, + "grad_norm": 0.8801352977752686, + "learning_rate": 0.00011884250589017097, + "loss": 2.7264, + "step": 13439 + }, + { + "epoch": 1.217639464564789, + "grad_norm": 0.8953225016593933, + "learning_rate": 0.00011883646468918021, + "loss": 2.5236, + "step": 13440 + }, + { + "epoch": 1.2177300627392358, + "grad_norm": 0.8931196928024292, + "learning_rate": 0.00011883042348818947, + "loss": 1.9188, + "step": 13441 + }, + { + "epoch": 1.2178206609136826, + "grad_norm": 0.8183695673942566, + "learning_rate": 0.0001188243822871987, + "loss": 2.1215, + "step": 13442 + }, + { + "epoch": 1.2179112590881294, + "grad_norm": 0.8901708722114563, + "learning_rate": 0.00011881834108620796, + "loss": 2.4241, + "step": 13443 + }, + { + "epoch": 1.2180018572625761, + "grad_norm": 0.928469181060791, + "learning_rate": 0.00011881229988521718, + "loss": 2.8539, + "step": 13444 + }, + { + "epoch": 1.218092455437023, + "grad_norm": 0.9289333820343018, + "learning_rate": 0.00011880625868422644, + "loss": 2.9491, + "step": 13445 + }, + { + "epoch": 1.2181830536114697, + "grad_norm": 0.979865550994873, + "learning_rate": 0.00011880021748323566, + "loss": 2.7002, + "step": 13446 + }, + { + "epoch": 1.2182736517859165, + "grad_norm": 0.9505870938301086, + "learning_rate": 0.00011879417628224492, + "loss": 2.5109, + "step": 13447 + }, + { + "epoch": 1.2183642499603633, + "grad_norm": 0.8949662446975708, + "learning_rate": 0.00011878813508125415, + "loss": 2.6932, + "step": 13448 + }, + { + "epoch": 1.21845484813481, + "grad_norm": 0.7907597422599792, + "learning_rate": 0.0001187820938802634, + "loss": 1.9673, + "step": 13449 + }, + { + "epoch": 1.2185454463092569, + "grad_norm": 0.9669802784919739, + "learning_rate": 0.00011877605267927265, + "loss": 2.6724, + "step": 13450 + }, + { + "epoch": 1.2186360444837037, + "grad_norm": 0.8792761564254761, + "learning_rate": 0.00011877001147828188, + "loss": 2.6707, + "step": 13451 + }, + { + "epoch": 1.2187266426581504, + "grad_norm": 0.920914351940155, + "learning_rate": 0.00011876397027729113, + "loss": 2.6419, + "step": 13452 + }, + { + "epoch": 1.2188172408325972, + "grad_norm": 0.8802902698516846, + "learning_rate": 0.00011875792907630036, + "loss": 2.7812, + "step": 13453 + }, + { + "epoch": 1.218907839007044, + "grad_norm": 0.8940179944038391, + "learning_rate": 0.00011875188787530963, + "loss": 2.8196, + "step": 13454 + }, + { + "epoch": 1.2189984371814908, + "grad_norm": 0.9157532453536987, + "learning_rate": 0.00011874584667431885, + "loss": 2.8198, + "step": 13455 + }, + { + "epoch": 1.2190890353559376, + "grad_norm": 0.8757342100143433, + "learning_rate": 0.00011873980547332811, + "loss": 2.974, + "step": 13456 + }, + { + "epoch": 1.2191796335303844, + "grad_norm": 0.8562928438186646, + "learning_rate": 0.00011873376427233736, + "loss": 2.8086, + "step": 13457 + }, + { + "epoch": 1.2192702317048312, + "grad_norm": 0.937716007232666, + "learning_rate": 0.00011872772307134659, + "loss": 2.9099, + "step": 13458 + }, + { + "epoch": 1.219360829879278, + "grad_norm": 0.8886007070541382, + "learning_rate": 0.00011872168187035584, + "loss": 2.8386, + "step": 13459 + }, + { + "epoch": 1.2194514280537248, + "grad_norm": 0.8713161945343018, + "learning_rate": 0.00011871564066936507, + "loss": 2.7326, + "step": 13460 + }, + { + "epoch": 1.2195420262281715, + "grad_norm": 0.8549056053161621, + "learning_rate": 0.00011870959946837432, + "loss": 2.9208, + "step": 13461 + }, + { + "epoch": 1.2196326244026183, + "grad_norm": 0.9400151371955872, + "learning_rate": 0.00011870355826738355, + "loss": 2.7117, + "step": 13462 + }, + { + "epoch": 1.2197232225770651, + "grad_norm": 0.85516756772995, + "learning_rate": 0.0001186975170663928, + "loss": 2.0436, + "step": 13463 + }, + { + "epoch": 1.219813820751512, + "grad_norm": 0.8383044004440308, + "learning_rate": 0.00011869147586540206, + "loss": 2.7817, + "step": 13464 + }, + { + "epoch": 1.2199044189259587, + "grad_norm": 0.8810840845108032, + "learning_rate": 0.00011868543466441128, + "loss": 2.3821, + "step": 13465 + }, + { + "epoch": 1.2199950171004055, + "grad_norm": 0.8703024387359619, + "learning_rate": 0.00011867939346342054, + "loss": 2.8716, + "step": 13466 + }, + { + "epoch": 1.2200856152748523, + "grad_norm": 0.896138072013855, + "learning_rate": 0.00011867335226242978, + "loss": 2.9617, + "step": 13467 + }, + { + "epoch": 1.220176213449299, + "grad_norm": 0.9117974638938904, + "learning_rate": 0.00011866731106143902, + "loss": 2.9657, + "step": 13468 + }, + { + "epoch": 1.2202668116237458, + "grad_norm": 0.899448037147522, + "learning_rate": 0.00011866126986044826, + "loss": 2.7801, + "step": 13469 + }, + { + "epoch": 1.2203574097981926, + "grad_norm": 0.8582465648651123, + "learning_rate": 0.0001186552286594575, + "loss": 2.6172, + "step": 13470 + }, + { + "epoch": 1.2204480079726394, + "grad_norm": 0.8862612843513489, + "learning_rate": 0.00011864918745846675, + "loss": 2.488, + "step": 13471 + }, + { + "epoch": 1.2205386061470862, + "grad_norm": 0.8569459915161133, + "learning_rate": 0.00011864314625747599, + "loss": 2.6202, + "step": 13472 + }, + { + "epoch": 1.220629204321533, + "grad_norm": 0.9131746292114258, + "learning_rate": 0.00011863710505648525, + "loss": 2.8717, + "step": 13473 + }, + { + "epoch": 1.2207198024959798, + "grad_norm": 0.8822236657142639, + "learning_rate": 0.00011863106385549447, + "loss": 2.4679, + "step": 13474 + }, + { + "epoch": 1.2208104006704266, + "grad_norm": 0.8981654047966003, + "learning_rate": 0.00011862502265450373, + "loss": 2.6768, + "step": 13475 + }, + { + "epoch": 1.2209009988448734, + "grad_norm": 0.8506154417991638, + "learning_rate": 0.00011861898145351295, + "loss": 2.7834, + "step": 13476 + }, + { + "epoch": 1.2209915970193201, + "grad_norm": 0.9550942182540894, + "learning_rate": 0.00011861294025252221, + "loss": 2.3346, + "step": 13477 + }, + { + "epoch": 1.221082195193767, + "grad_norm": 0.904869794845581, + "learning_rate": 0.00011860689905153143, + "loss": 2.8898, + "step": 13478 + }, + { + "epoch": 1.2211727933682137, + "grad_norm": 0.9285186529159546, + "learning_rate": 0.0001186008578505407, + "loss": 2.8825, + "step": 13479 + }, + { + "epoch": 1.2212633915426605, + "grad_norm": 0.8350031971931458, + "learning_rate": 0.00011859481664954994, + "loss": 2.7007, + "step": 13480 + }, + { + "epoch": 1.2213539897171073, + "grad_norm": 0.8892250061035156, + "learning_rate": 0.00011858877544855918, + "loss": 2.5414, + "step": 13481 + }, + { + "epoch": 1.221444587891554, + "grad_norm": 0.8453563451766968, + "learning_rate": 0.00011858273424756842, + "loss": 2.1442, + "step": 13482 + }, + { + "epoch": 1.2215351860660009, + "grad_norm": 0.8891221880912781, + "learning_rate": 0.00011857669304657766, + "loss": 2.6059, + "step": 13483 + }, + { + "epoch": 1.2216257842404477, + "grad_norm": 0.9723305702209473, + "learning_rate": 0.0001185706518455869, + "loss": 2.9371, + "step": 13484 + }, + { + "epoch": 1.2217163824148942, + "grad_norm": 0.8615736365318298, + "learning_rate": 0.00011856461064459614, + "loss": 2.7967, + "step": 13485 + }, + { + "epoch": 1.2218069805893412, + "grad_norm": 0.8711217641830444, + "learning_rate": 0.0001185585694436054, + "loss": 2.9116, + "step": 13486 + }, + { + "epoch": 1.2218975787637878, + "grad_norm": 0.8952157497406006, + "learning_rate": 0.00011855252824261465, + "loss": 2.8794, + "step": 13487 + }, + { + "epoch": 1.2219881769382348, + "grad_norm": 0.9107070565223694, + "learning_rate": 0.00011854648704162388, + "loss": 2.3629, + "step": 13488 + }, + { + "epoch": 1.2220787751126814, + "grad_norm": 0.927793025970459, + "learning_rate": 0.00011854044584063313, + "loss": 2.4936, + "step": 13489 + }, + { + "epoch": 1.2221693732871284, + "grad_norm": 0.9784005284309387, + "learning_rate": 0.00011853440463964236, + "loss": 2.5574, + "step": 13490 + }, + { + "epoch": 1.222259971461575, + "grad_norm": 0.896791398525238, + "learning_rate": 0.00011852836343865161, + "loss": 2.5922, + "step": 13491 + }, + { + "epoch": 1.222350569636022, + "grad_norm": 0.9172661304473877, + "learning_rate": 0.00011852232223766085, + "loss": 2.5785, + "step": 13492 + }, + { + "epoch": 1.2224411678104685, + "grad_norm": 0.9445326328277588, + "learning_rate": 0.00011851628103667009, + "loss": 2.8123, + "step": 13493 + }, + { + "epoch": 1.2225317659849153, + "grad_norm": 0.9283485412597656, + "learning_rate": 0.00011851023983567935, + "loss": 2.8603, + "step": 13494 + }, + { + "epoch": 1.222622364159362, + "grad_norm": 0.8851189613342285, + "learning_rate": 0.00011850419863468857, + "loss": 2.6471, + "step": 13495 + }, + { + "epoch": 1.2227129623338089, + "grad_norm": 0.878585159778595, + "learning_rate": 0.00011849815743369784, + "loss": 2.8016, + "step": 13496 + }, + { + "epoch": 1.2228035605082557, + "grad_norm": 0.9524368643760681, + "learning_rate": 0.00011849211623270706, + "loss": 2.5668, + "step": 13497 + }, + { + "epoch": 1.2228941586827025, + "grad_norm": 0.9405251145362854, + "learning_rate": 0.00011848607503171632, + "loss": 2.7841, + "step": 13498 + }, + { + "epoch": 1.2229847568571492, + "grad_norm": 0.8667965531349182, + "learning_rate": 0.00011848003383072555, + "loss": 2.9132, + "step": 13499 + }, + { + "epoch": 1.223075355031596, + "grad_norm": 0.874996542930603, + "learning_rate": 0.0001184739926297348, + "loss": 2.8408, + "step": 13500 + }, + { + "epoch": 1.2231659532060428, + "grad_norm": 0.8565261363983154, + "learning_rate": 0.00011846795142874405, + "loss": 2.5869, + "step": 13501 + }, + { + "epoch": 1.2232565513804896, + "grad_norm": 0.9050522446632385, + "learning_rate": 0.00011846191022775328, + "loss": 2.5891, + "step": 13502 + }, + { + "epoch": 1.2233471495549364, + "grad_norm": 0.9417402148246765, + "learning_rate": 0.00011845586902676253, + "loss": 2.8451, + "step": 13503 + }, + { + "epoch": 1.2234377477293832, + "grad_norm": 0.9044710397720337, + "learning_rate": 0.00011844982782577176, + "loss": 2.6425, + "step": 13504 + }, + { + "epoch": 1.22352834590383, + "grad_norm": 0.9643776416778564, + "learning_rate": 0.00011844378662478102, + "loss": 2.599, + "step": 13505 + }, + { + "epoch": 1.2236189440782768, + "grad_norm": 0.9341410994529724, + "learning_rate": 0.00011843774542379024, + "loss": 3.2219, + "step": 13506 + }, + { + "epoch": 1.2237095422527235, + "grad_norm": 0.8492351770401001, + "learning_rate": 0.0001184317042227995, + "loss": 2.6957, + "step": 13507 + }, + { + "epoch": 1.2238001404271703, + "grad_norm": 0.9210638999938965, + "learning_rate": 0.00011842566302180873, + "loss": 2.9376, + "step": 13508 + }, + { + "epoch": 1.2238907386016171, + "grad_norm": 0.8740113973617554, + "learning_rate": 0.00011841962182081799, + "loss": 2.7316, + "step": 13509 + }, + { + "epoch": 1.223981336776064, + "grad_norm": 0.866167426109314, + "learning_rate": 0.00011841358061982723, + "loss": 2.5212, + "step": 13510 + }, + { + "epoch": 1.2240719349505107, + "grad_norm": 0.83277827501297, + "learning_rate": 0.00011840753941883647, + "loss": 2.135, + "step": 13511 + }, + { + "epoch": 1.2241625331249575, + "grad_norm": 0.9253483414649963, + "learning_rate": 0.00011840149821784572, + "loss": 2.7395, + "step": 13512 + }, + { + "epoch": 1.2242531312994043, + "grad_norm": 0.8413522243499756, + "learning_rate": 0.00011839545701685495, + "loss": 2.6297, + "step": 13513 + }, + { + "epoch": 1.224343729473851, + "grad_norm": 0.9272920489311218, + "learning_rate": 0.0001183894158158642, + "loss": 2.6784, + "step": 13514 + }, + { + "epoch": 1.2244343276482978, + "grad_norm": 0.8257823586463928, + "learning_rate": 0.00011838337461487343, + "loss": 2.5163, + "step": 13515 + }, + { + "epoch": 1.2245249258227446, + "grad_norm": 0.8913666009902954, + "learning_rate": 0.00011837733341388268, + "loss": 2.4602, + "step": 13516 + }, + { + "epoch": 1.2246155239971914, + "grad_norm": 0.875767707824707, + "learning_rate": 0.00011837129221289194, + "loss": 2.5664, + "step": 13517 + }, + { + "epoch": 1.2247061221716382, + "grad_norm": 0.9318568110466003, + "learning_rate": 0.00011836525101190117, + "loss": 2.6185, + "step": 13518 + }, + { + "epoch": 1.224796720346085, + "grad_norm": 0.934582531452179, + "learning_rate": 0.00011835920981091042, + "loss": 2.7843, + "step": 13519 + }, + { + "epoch": 1.2248873185205318, + "grad_norm": 0.9001961946487427, + "learning_rate": 0.00011835316860991966, + "loss": 2.8091, + "step": 13520 + }, + { + "epoch": 1.2249779166949786, + "grad_norm": 0.8819711804389954, + "learning_rate": 0.0001183471274089289, + "loss": 2.6363, + "step": 13521 + }, + { + "epoch": 1.2250685148694254, + "grad_norm": 0.8505544662475586, + "learning_rate": 0.00011834108620793814, + "loss": 2.8136, + "step": 13522 + }, + { + "epoch": 1.2251591130438721, + "grad_norm": 0.8669375777244568, + "learning_rate": 0.00011833504500694739, + "loss": 2.7131, + "step": 13523 + }, + { + "epoch": 1.225249711218319, + "grad_norm": 0.8868345022201538, + "learning_rate": 0.00011832900380595663, + "loss": 2.5019, + "step": 13524 + }, + { + "epoch": 1.2253403093927657, + "grad_norm": 0.8890743255615234, + "learning_rate": 0.00011832296260496587, + "loss": 2.7429, + "step": 13525 + }, + { + "epoch": 1.2254309075672125, + "grad_norm": 0.8807379603385925, + "learning_rate": 0.00011831692140397513, + "loss": 2.6386, + "step": 13526 + }, + { + "epoch": 1.2255215057416593, + "grad_norm": 0.8887135982513428, + "learning_rate": 0.00011831088020298435, + "loss": 2.6191, + "step": 13527 + }, + { + "epoch": 1.225612103916106, + "grad_norm": 0.9134396910667419, + "learning_rate": 0.00011830483900199361, + "loss": 2.7957, + "step": 13528 + }, + { + "epoch": 1.2257027020905529, + "grad_norm": 0.8213861584663391, + "learning_rate": 0.00011829879780100283, + "loss": 2.3715, + "step": 13529 + }, + { + "epoch": 1.2257933002649997, + "grad_norm": 0.9040655493736267, + "learning_rate": 0.00011829275660001209, + "loss": 2.5865, + "step": 13530 + }, + { + "epoch": 1.2258838984394465, + "grad_norm": 0.8665436506271362, + "learning_rate": 0.00011828671539902134, + "loss": 2.705, + "step": 13531 + }, + { + "epoch": 1.2259744966138932, + "grad_norm": 0.8576552867889404, + "learning_rate": 0.00011828067419803057, + "loss": 2.4525, + "step": 13532 + }, + { + "epoch": 1.22606509478834, + "grad_norm": 0.9052128791809082, + "learning_rate": 0.00011827463299703982, + "loss": 3.0054, + "step": 13533 + }, + { + "epoch": 1.2261556929627868, + "grad_norm": 0.9449455738067627, + "learning_rate": 0.00011826859179604905, + "loss": 2.8668, + "step": 13534 + }, + { + "epoch": 1.2262462911372336, + "grad_norm": 0.9252604246139526, + "learning_rate": 0.0001182625505950583, + "loss": 2.6785, + "step": 13535 + }, + { + "epoch": 1.2263368893116804, + "grad_norm": 0.9093906283378601, + "learning_rate": 0.00011825650939406754, + "loss": 2.5956, + "step": 13536 + }, + { + "epoch": 1.2264274874861272, + "grad_norm": 0.8993532061576843, + "learning_rate": 0.0001182504681930768, + "loss": 2.6996, + "step": 13537 + }, + { + "epoch": 1.226518085660574, + "grad_norm": 0.8778936266899109, + "learning_rate": 0.00011824442699208602, + "loss": 2.5325, + "step": 13538 + }, + { + "epoch": 1.2266086838350208, + "grad_norm": 0.8525330424308777, + "learning_rate": 0.00011823838579109528, + "loss": 2.5332, + "step": 13539 + }, + { + "epoch": 1.2266992820094675, + "grad_norm": 0.9359047412872314, + "learning_rate": 0.00011823234459010453, + "loss": 2.8091, + "step": 13540 + }, + { + "epoch": 1.2267898801839143, + "grad_norm": 0.9106972217559814, + "learning_rate": 0.00011822630338911376, + "loss": 2.8075, + "step": 13541 + }, + { + "epoch": 1.2268804783583611, + "grad_norm": 0.9049439430236816, + "learning_rate": 0.00011822026218812301, + "loss": 2.4484, + "step": 13542 + }, + { + "epoch": 1.226971076532808, + "grad_norm": 0.821110725402832, + "learning_rate": 0.00011821422098713224, + "loss": 2.1142, + "step": 13543 + }, + { + "epoch": 1.2270616747072547, + "grad_norm": 0.9226616024971008, + "learning_rate": 0.00011820817978614149, + "loss": 3.019, + "step": 13544 + }, + { + "epoch": 1.2271522728817015, + "grad_norm": 0.8877851963043213, + "learning_rate": 0.00011820213858515072, + "loss": 2.717, + "step": 13545 + }, + { + "epoch": 1.2272428710561483, + "grad_norm": 0.9927411079406738, + "learning_rate": 0.00011819609738415997, + "loss": 2.7601, + "step": 13546 + }, + { + "epoch": 1.227333469230595, + "grad_norm": 0.9219405651092529, + "learning_rate": 0.00011819005618316923, + "loss": 2.9833, + "step": 13547 + }, + { + "epoch": 1.2274240674050418, + "grad_norm": 0.9158971309661865, + "learning_rate": 0.00011818401498217845, + "loss": 2.6291, + "step": 13548 + }, + { + "epoch": 1.2275146655794886, + "grad_norm": 0.8214133381843567, + "learning_rate": 0.00011817797378118771, + "loss": 2.0475, + "step": 13549 + }, + { + "epoch": 1.2276052637539354, + "grad_norm": 0.659070611000061, + "learning_rate": 0.00011817193258019695, + "loss": 1.372, + "step": 13550 + }, + { + "epoch": 1.2276958619283822, + "grad_norm": 0.8819649815559387, + "learning_rate": 0.0001181658913792062, + "loss": 2.713, + "step": 13551 + }, + { + "epoch": 1.227786460102829, + "grad_norm": 0.9214084148406982, + "learning_rate": 0.00011815985017821543, + "loss": 2.8203, + "step": 13552 + }, + { + "epoch": 1.2278770582772758, + "grad_norm": 0.869289755821228, + "learning_rate": 0.00011815380897722468, + "loss": 2.661, + "step": 13553 + }, + { + "epoch": 1.2279676564517226, + "grad_norm": 0.9325076341629028, + "learning_rate": 0.00011814776777623393, + "loss": 2.6488, + "step": 13554 + }, + { + "epoch": 1.2280582546261694, + "grad_norm": 0.9027155041694641, + "learning_rate": 0.00011814172657524316, + "loss": 2.6835, + "step": 13555 + }, + { + "epoch": 1.2281488528006161, + "grad_norm": 0.9042500257492065, + "learning_rate": 0.00011813568537425241, + "loss": 2.6836, + "step": 13556 + }, + { + "epoch": 1.228239450975063, + "grad_norm": 0.8442659378051758, + "learning_rate": 0.00011812964417326164, + "loss": 2.0849, + "step": 13557 + }, + { + "epoch": 1.2283300491495097, + "grad_norm": 0.9580832719802856, + "learning_rate": 0.0001181236029722709, + "loss": 2.8014, + "step": 13558 + }, + { + "epoch": 1.2284206473239565, + "grad_norm": 0.9278207421302795, + "learning_rate": 0.00011811756177128012, + "loss": 2.6116, + "step": 13559 + }, + { + "epoch": 1.2285112454984033, + "grad_norm": 0.930367648601532, + "learning_rate": 0.00011811152057028938, + "loss": 2.7647, + "step": 13560 + }, + { + "epoch": 1.22860184367285, + "grad_norm": 0.8877432346343994, + "learning_rate": 0.00011810547936929863, + "loss": 2.5987, + "step": 13561 + }, + { + "epoch": 1.2286924418472969, + "grad_norm": 0.9273622632026672, + "learning_rate": 0.00011809943816830787, + "loss": 2.7932, + "step": 13562 + }, + { + "epoch": 1.2287830400217437, + "grad_norm": 0.8784583806991577, + "learning_rate": 0.00011809339696731711, + "loss": 2.0512, + "step": 13563 + }, + { + "epoch": 1.2288736381961904, + "grad_norm": 1.0130200386047363, + "learning_rate": 0.00011808735576632635, + "loss": 2.6752, + "step": 13564 + }, + { + "epoch": 1.2289642363706372, + "grad_norm": 0.9121118783950806, + "learning_rate": 0.0001180813145653356, + "loss": 2.6305, + "step": 13565 + }, + { + "epoch": 1.2290548345450838, + "grad_norm": 0.9229391813278198, + "learning_rate": 0.00011807527336434483, + "loss": 2.6816, + "step": 13566 + }, + { + "epoch": 1.2291454327195308, + "grad_norm": 0.8722119927406311, + "learning_rate": 0.00011806923216335408, + "loss": 2.8396, + "step": 13567 + }, + { + "epoch": 1.2292360308939774, + "grad_norm": 0.8493664860725403, + "learning_rate": 0.00011806319096236331, + "loss": 2.7759, + "step": 13568 + }, + { + "epoch": 1.2293266290684244, + "grad_norm": 0.8480579257011414, + "learning_rate": 0.00011805714976137256, + "loss": 2.8183, + "step": 13569 + }, + { + "epoch": 1.229417227242871, + "grad_norm": 0.8877766728401184, + "learning_rate": 0.00011805110856038182, + "loss": 2.826, + "step": 13570 + }, + { + "epoch": 1.229507825417318, + "grad_norm": 0.8365257978439331, + "learning_rate": 0.00011804506735939105, + "loss": 1.9596, + "step": 13571 + }, + { + "epoch": 1.2295984235917645, + "grad_norm": 0.8977154493331909, + "learning_rate": 0.0001180390261584003, + "loss": 2.7755, + "step": 13572 + }, + { + "epoch": 1.2296890217662115, + "grad_norm": 0.8662694096565247, + "learning_rate": 0.00011803298495740954, + "loss": 2.4712, + "step": 13573 + }, + { + "epoch": 1.229779619940658, + "grad_norm": 0.8890901207923889, + "learning_rate": 0.00011802694375641878, + "loss": 2.5474, + "step": 13574 + }, + { + "epoch": 1.2298702181151049, + "grad_norm": 0.9210153818130493, + "learning_rate": 0.00011802090255542802, + "loss": 2.8643, + "step": 13575 + }, + { + "epoch": 1.2299608162895517, + "grad_norm": 0.8879782557487488, + "learning_rate": 0.00011801486135443726, + "loss": 2.5724, + "step": 13576 + }, + { + "epoch": 1.2300514144639985, + "grad_norm": 0.9530836939811707, + "learning_rate": 0.00011800882015344653, + "loss": 2.8645, + "step": 13577 + }, + { + "epoch": 1.2301420126384452, + "grad_norm": 0.9072704911231995, + "learning_rate": 0.00011800277895245575, + "loss": 2.5499, + "step": 13578 + }, + { + "epoch": 1.230232610812892, + "grad_norm": 0.8033381104469299, + "learning_rate": 0.00011799673775146501, + "loss": 2.1371, + "step": 13579 + }, + { + "epoch": 1.2303232089873388, + "grad_norm": 0.9265920519828796, + "learning_rate": 0.00011799069655047423, + "loss": 2.7945, + "step": 13580 + }, + { + "epoch": 1.2304138071617856, + "grad_norm": 0.9061498641967773, + "learning_rate": 0.00011798465534948349, + "loss": 2.7961, + "step": 13581 + }, + { + "epoch": 1.2305044053362324, + "grad_norm": 0.9236104488372803, + "learning_rate": 0.00011797861414849272, + "loss": 2.4617, + "step": 13582 + }, + { + "epoch": 1.2305950035106792, + "grad_norm": 0.8934850692749023, + "learning_rate": 0.00011797257294750197, + "loss": 2.4826, + "step": 13583 + }, + { + "epoch": 1.230685601685126, + "grad_norm": 0.7650866508483887, + "learning_rate": 0.00011796653174651122, + "loss": 2.1229, + "step": 13584 + }, + { + "epoch": 1.2307761998595728, + "grad_norm": 0.8934112787246704, + "learning_rate": 0.00011796049054552045, + "loss": 2.8962, + "step": 13585 + }, + { + "epoch": 1.2308667980340195, + "grad_norm": 0.8813452124595642, + "learning_rate": 0.0001179544493445297, + "loss": 2.6802, + "step": 13586 + }, + { + "epoch": 1.2309573962084663, + "grad_norm": 0.8966099619865417, + "learning_rate": 0.00011794840814353893, + "loss": 2.6918, + "step": 13587 + }, + { + "epoch": 1.2310479943829131, + "grad_norm": 0.965589702129364, + "learning_rate": 0.00011794236694254818, + "loss": 2.9093, + "step": 13588 + }, + { + "epoch": 1.23113859255736, + "grad_norm": 0.8946657776832581, + "learning_rate": 0.00011793632574155742, + "loss": 2.5261, + "step": 13589 + }, + { + "epoch": 1.2312291907318067, + "grad_norm": 0.8826931715011597, + "learning_rate": 0.00011793028454056668, + "loss": 2.7594, + "step": 13590 + }, + { + "epoch": 1.2313197889062535, + "grad_norm": 0.8275594115257263, + "learning_rate": 0.00011792424333957592, + "loss": 1.9075, + "step": 13591 + }, + { + "epoch": 1.2314103870807003, + "grad_norm": 0.8551765084266663, + "learning_rate": 0.00011791820213858516, + "loss": 2.6632, + "step": 13592 + }, + { + "epoch": 1.231500985255147, + "grad_norm": 0.922818124294281, + "learning_rate": 0.0001179121609375944, + "loss": 2.8426, + "step": 13593 + }, + { + "epoch": 1.2315915834295939, + "grad_norm": 0.95787113904953, + "learning_rate": 0.00011790611973660364, + "loss": 3.0465, + "step": 13594 + }, + { + "epoch": 1.2316821816040406, + "grad_norm": 0.8945023417472839, + "learning_rate": 0.00011790007853561289, + "loss": 2.9557, + "step": 13595 + }, + { + "epoch": 1.2317727797784874, + "grad_norm": 0.8846946358680725, + "learning_rate": 0.00011789403733462212, + "loss": 2.837, + "step": 13596 + }, + { + "epoch": 1.2318633779529342, + "grad_norm": 0.8556849956512451, + "learning_rate": 0.00011788799613363137, + "loss": 2.671, + "step": 13597 + }, + { + "epoch": 1.231953976127381, + "grad_norm": 0.9467567205429077, + "learning_rate": 0.0001178819549326406, + "loss": 2.6093, + "step": 13598 + }, + { + "epoch": 1.2320445743018278, + "grad_norm": 0.8835544586181641, + "learning_rate": 0.00011787591373164985, + "loss": 2.6573, + "step": 13599 + }, + { + "epoch": 1.2321351724762746, + "grad_norm": 0.8952550292015076, + "learning_rate": 0.00011786987253065911, + "loss": 2.5519, + "step": 13600 + }, + { + "epoch": 1.2322257706507214, + "grad_norm": 0.8454539179801941, + "learning_rate": 0.00011786383132966833, + "loss": 2.8063, + "step": 13601 + }, + { + "epoch": 1.2323163688251682, + "grad_norm": 0.8946300745010376, + "learning_rate": 0.0001178577901286776, + "loss": 2.7347, + "step": 13602 + }, + { + "epoch": 1.232406966999615, + "grad_norm": 0.8496564030647278, + "learning_rate": 0.00011785174892768683, + "loss": 2.5025, + "step": 13603 + }, + { + "epoch": 1.2324975651740617, + "grad_norm": 0.9315363168716431, + "learning_rate": 0.00011784570772669608, + "loss": 2.8408, + "step": 13604 + }, + { + "epoch": 1.2325881633485085, + "grad_norm": 0.9715417623519897, + "learning_rate": 0.00011783966652570531, + "loss": 2.8705, + "step": 13605 + }, + { + "epoch": 1.2326787615229553, + "grad_norm": 0.8826597929000854, + "learning_rate": 0.00011783362532471456, + "loss": 2.6652, + "step": 13606 + }, + { + "epoch": 1.232769359697402, + "grad_norm": 0.9997926354408264, + "learning_rate": 0.0001178275841237238, + "loss": 2.9601, + "step": 13607 + }, + { + "epoch": 1.2328599578718489, + "grad_norm": 0.9289110898971558, + "learning_rate": 0.00011782154292273304, + "loss": 2.7822, + "step": 13608 + }, + { + "epoch": 1.2329505560462957, + "grad_norm": 0.753768801689148, + "learning_rate": 0.0001178155017217423, + "loss": 2.1257, + "step": 13609 + }, + { + "epoch": 1.2330411542207425, + "grad_norm": 0.9001145362854004, + "learning_rate": 0.00011780946052075152, + "loss": 2.7663, + "step": 13610 + }, + { + "epoch": 1.2331317523951892, + "grad_norm": 0.8789004683494568, + "learning_rate": 0.00011780341931976078, + "loss": 2.591, + "step": 13611 + }, + { + "epoch": 1.233222350569636, + "grad_norm": 0.8977246284484863, + "learning_rate": 0.00011779737811877, + "loss": 2.467, + "step": 13612 + }, + { + "epoch": 1.2333129487440828, + "grad_norm": 0.8583729267120361, + "learning_rate": 0.00011779133691777926, + "loss": 2.7931, + "step": 13613 + }, + { + "epoch": 1.2334035469185296, + "grad_norm": 0.9658342599868774, + "learning_rate": 0.00011778529571678851, + "loss": 2.6544, + "step": 13614 + }, + { + "epoch": 1.2334941450929764, + "grad_norm": 0.8853914737701416, + "learning_rate": 0.00011777925451579775, + "loss": 2.9713, + "step": 13615 + }, + { + "epoch": 1.2335847432674232, + "grad_norm": 0.9242780804634094, + "learning_rate": 0.00011777321331480699, + "loss": 2.763, + "step": 13616 + }, + { + "epoch": 1.23367534144187, + "grad_norm": 0.9477262496948242, + "learning_rate": 0.00011776717211381623, + "loss": 2.836, + "step": 13617 + }, + { + "epoch": 1.2337659396163168, + "grad_norm": 0.7330362796783447, + "learning_rate": 0.00011776113091282547, + "loss": 2.0286, + "step": 13618 + }, + { + "epoch": 1.2338565377907635, + "grad_norm": 0.9286091327667236, + "learning_rate": 0.00011775508971183471, + "loss": 2.5998, + "step": 13619 + }, + { + "epoch": 1.2339471359652103, + "grad_norm": 0.8462490439414978, + "learning_rate": 0.00011774904851084396, + "loss": 2.5696, + "step": 13620 + }, + { + "epoch": 1.2340377341396571, + "grad_norm": 0.8527913689613342, + "learning_rate": 0.00011774300730985322, + "loss": 2.7211, + "step": 13621 + }, + { + "epoch": 1.234128332314104, + "grad_norm": 0.869577169418335, + "learning_rate": 0.00011773696610886245, + "loss": 2.5154, + "step": 13622 + }, + { + "epoch": 1.2342189304885507, + "grad_norm": 0.8263987898826599, + "learning_rate": 0.0001177309249078717, + "loss": 2.435, + "step": 13623 + }, + { + "epoch": 1.2343095286629975, + "grad_norm": 0.8984712958335876, + "learning_rate": 0.00011772488370688093, + "loss": 2.6837, + "step": 13624 + }, + { + "epoch": 1.2344001268374443, + "grad_norm": 0.8894766569137573, + "learning_rate": 0.00011771884250589018, + "loss": 2.6181, + "step": 13625 + }, + { + "epoch": 1.234490725011891, + "grad_norm": 0.9178896546363831, + "learning_rate": 0.00011771280130489941, + "loss": 2.8035, + "step": 13626 + }, + { + "epoch": 1.2345813231863378, + "grad_norm": 0.8724508881568909, + "learning_rate": 0.00011770676010390866, + "loss": 2.5382, + "step": 13627 + }, + { + "epoch": 1.2346719213607846, + "grad_norm": 0.7696490287780762, + "learning_rate": 0.0001177007189029179, + "loss": 2.0871, + "step": 13628 + }, + { + "epoch": 1.2347625195352314, + "grad_norm": 0.9794202446937561, + "learning_rate": 0.00011769467770192714, + "loss": 2.731, + "step": 13629 + }, + { + "epoch": 1.2348531177096782, + "grad_norm": 0.864454984664917, + "learning_rate": 0.0001176886365009364, + "loss": 2.5302, + "step": 13630 + }, + { + "epoch": 1.234943715884125, + "grad_norm": 0.9281347990036011, + "learning_rate": 0.00011768259529994563, + "loss": 2.9421, + "step": 13631 + }, + { + "epoch": 1.2350343140585718, + "grad_norm": 0.898017942905426, + "learning_rate": 0.00011767655409895489, + "loss": 2.965, + "step": 13632 + }, + { + "epoch": 1.2351249122330186, + "grad_norm": 0.9063250422477722, + "learning_rate": 0.00011767051289796411, + "loss": 2.8948, + "step": 13633 + }, + { + "epoch": 1.2352155104074654, + "grad_norm": 0.9420844912528992, + "learning_rate": 0.00011766447169697337, + "loss": 2.6872, + "step": 13634 + }, + { + "epoch": 1.2353061085819121, + "grad_norm": 0.9663841128349304, + "learning_rate": 0.0001176584304959826, + "loss": 2.8303, + "step": 13635 + }, + { + "epoch": 1.235396706756359, + "grad_norm": 0.8059768080711365, + "learning_rate": 0.00011765238929499185, + "loss": 2.0717, + "step": 13636 + }, + { + "epoch": 1.2354873049308057, + "grad_norm": 0.7439290285110474, + "learning_rate": 0.0001176463480940011, + "loss": 1.7244, + "step": 13637 + }, + { + "epoch": 1.2355779031052525, + "grad_norm": 0.8812217116355896, + "learning_rate": 0.00011764030689301033, + "loss": 2.8539, + "step": 13638 + }, + { + "epoch": 1.2356685012796993, + "grad_norm": 0.9572624564170837, + "learning_rate": 0.00011763426569201958, + "loss": 2.8318, + "step": 13639 + }, + { + "epoch": 1.235759099454146, + "grad_norm": 0.9234142303466797, + "learning_rate": 0.00011762822449102881, + "loss": 2.8091, + "step": 13640 + }, + { + "epoch": 1.2358496976285929, + "grad_norm": 0.9152365326881409, + "learning_rate": 0.00011762218329003807, + "loss": 2.6157, + "step": 13641 + }, + { + "epoch": 1.2359402958030397, + "grad_norm": 0.898281991481781, + "learning_rate": 0.0001176161420890473, + "loss": 2.7987, + "step": 13642 + }, + { + "epoch": 1.2360308939774864, + "grad_norm": 0.9128565788269043, + "learning_rate": 0.00011761010088805656, + "loss": 2.7451, + "step": 13643 + }, + { + "epoch": 1.2361214921519332, + "grad_norm": 0.9124343395233154, + "learning_rate": 0.0001176040596870658, + "loss": 2.4935, + "step": 13644 + }, + { + "epoch": 1.23621209032638, + "grad_norm": 0.8748149275779724, + "learning_rate": 0.00011759801848607504, + "loss": 2.8311, + "step": 13645 + }, + { + "epoch": 1.2363026885008268, + "grad_norm": 0.815438985824585, + "learning_rate": 0.00011759197728508429, + "loss": 2.0365, + "step": 13646 + }, + { + "epoch": 1.2363932866752734, + "grad_norm": 0.9844363927841187, + "learning_rate": 0.00011758593608409352, + "loss": 2.5896, + "step": 13647 + }, + { + "epoch": 1.2364838848497204, + "grad_norm": 0.9373422861099243, + "learning_rate": 0.00011757989488310277, + "loss": 2.7741, + "step": 13648 + }, + { + "epoch": 1.236574483024167, + "grad_norm": 0.9088853001594543, + "learning_rate": 0.000117573853682112, + "loss": 2.9384, + "step": 13649 + }, + { + "epoch": 1.236665081198614, + "grad_norm": 0.8440423607826233, + "learning_rate": 0.00011756781248112125, + "loss": 2.8307, + "step": 13650 + }, + { + "epoch": 1.2367556793730605, + "grad_norm": 0.8099270462989807, + "learning_rate": 0.00011756177128013051, + "loss": 2.1067, + "step": 13651 + }, + { + "epoch": 1.2368462775475075, + "grad_norm": 0.9332131147384644, + "learning_rate": 0.00011755573007913973, + "loss": 2.818, + "step": 13652 + }, + { + "epoch": 1.236936875721954, + "grad_norm": 0.7573803067207336, + "learning_rate": 0.00011754968887814899, + "loss": 2.1429, + "step": 13653 + }, + { + "epoch": 1.237027473896401, + "grad_norm": 0.9674602150917053, + "learning_rate": 0.00011754364767715823, + "loss": 2.6082, + "step": 13654 + }, + { + "epoch": 1.2371180720708477, + "grad_norm": 0.9265308380126953, + "learning_rate": 0.00011753760647616747, + "loss": 2.7426, + "step": 13655 + }, + { + "epoch": 1.2372086702452945, + "grad_norm": 0.8803828954696655, + "learning_rate": 0.00011753156527517671, + "loss": 2.685, + "step": 13656 + }, + { + "epoch": 1.2372992684197412, + "grad_norm": 0.9552605748176575, + "learning_rate": 0.00011752552407418595, + "loss": 2.8412, + "step": 13657 + }, + { + "epoch": 1.237389866594188, + "grad_norm": 1.0605292320251465, + "learning_rate": 0.00011751948287319519, + "loss": 2.8508, + "step": 13658 + }, + { + "epoch": 1.2374804647686348, + "grad_norm": 0.9214731454849243, + "learning_rate": 0.00011751344167220444, + "loss": 2.8123, + "step": 13659 + }, + { + "epoch": 1.2375710629430816, + "grad_norm": 0.9204232096672058, + "learning_rate": 0.0001175074004712137, + "loss": 2.7143, + "step": 13660 + }, + { + "epoch": 1.2376616611175284, + "grad_norm": 0.9418814182281494, + "learning_rate": 0.00011750135927022292, + "loss": 2.5438, + "step": 13661 + }, + { + "epoch": 1.2377522592919752, + "grad_norm": 0.9578654766082764, + "learning_rate": 0.00011749531806923218, + "loss": 2.9283, + "step": 13662 + }, + { + "epoch": 1.237842857466422, + "grad_norm": 0.7703756093978882, + "learning_rate": 0.0001174892768682414, + "loss": 2.0431, + "step": 13663 + }, + { + "epoch": 1.2379334556408688, + "grad_norm": 0.9558284282684326, + "learning_rate": 0.00011748323566725066, + "loss": 2.6628, + "step": 13664 + }, + { + "epoch": 1.2380240538153156, + "grad_norm": 0.7791132926940918, + "learning_rate": 0.00011747719446625988, + "loss": 1.9386, + "step": 13665 + }, + { + "epoch": 1.2381146519897623, + "grad_norm": 0.8276504874229431, + "learning_rate": 0.00011747115326526914, + "loss": 2.9398, + "step": 13666 + }, + { + "epoch": 1.2382052501642091, + "grad_norm": 0.9255601763725281, + "learning_rate": 0.00011746511206427839, + "loss": 2.487, + "step": 13667 + }, + { + "epoch": 1.238295848338656, + "grad_norm": 0.8835934996604919, + "learning_rate": 0.00011745907086328762, + "loss": 2.7072, + "step": 13668 + }, + { + "epoch": 1.2383864465131027, + "grad_norm": 0.9579166173934937, + "learning_rate": 0.00011745302966229687, + "loss": 2.7871, + "step": 13669 + }, + { + "epoch": 1.2384770446875495, + "grad_norm": 0.9335820078849792, + "learning_rate": 0.0001174469884613061, + "loss": 2.5792, + "step": 13670 + }, + { + "epoch": 1.2385676428619963, + "grad_norm": 0.8769365549087524, + "learning_rate": 0.00011744094726031535, + "loss": 2.8275, + "step": 13671 + }, + { + "epoch": 1.238658241036443, + "grad_norm": 0.9437575936317444, + "learning_rate": 0.00011743490605932459, + "loss": 2.8696, + "step": 13672 + }, + { + "epoch": 1.2387488392108899, + "grad_norm": 1.0831117630004883, + "learning_rate": 0.00011742886485833385, + "loss": 2.0859, + "step": 13673 + }, + { + "epoch": 1.2388394373853366, + "grad_norm": 0.9271838665008545, + "learning_rate": 0.0001174228236573431, + "loss": 2.8639, + "step": 13674 + }, + { + "epoch": 1.2389300355597834, + "grad_norm": 0.9400480389595032, + "learning_rate": 0.00011741678245635233, + "loss": 2.8389, + "step": 13675 + }, + { + "epoch": 1.2390206337342302, + "grad_norm": 0.8675599694252014, + "learning_rate": 0.00011741074125536158, + "loss": 2.6119, + "step": 13676 + }, + { + "epoch": 1.239111231908677, + "grad_norm": 0.8745811581611633, + "learning_rate": 0.00011740470005437081, + "loss": 2.4572, + "step": 13677 + }, + { + "epoch": 1.2392018300831238, + "grad_norm": 0.8858085870742798, + "learning_rate": 0.00011739865885338006, + "loss": 2.6544, + "step": 13678 + }, + { + "epoch": 1.2392924282575706, + "grad_norm": 0.980746328830719, + "learning_rate": 0.0001173926176523893, + "loss": 2.8599, + "step": 13679 + }, + { + "epoch": 1.2393830264320174, + "grad_norm": 0.7945973873138428, + "learning_rate": 0.00011738657645139854, + "loss": 2.0427, + "step": 13680 + }, + { + "epoch": 1.2394736246064642, + "grad_norm": 0.8975300192832947, + "learning_rate": 0.0001173805352504078, + "loss": 2.8363, + "step": 13681 + }, + { + "epoch": 1.239564222780911, + "grad_norm": 0.7541742324829102, + "learning_rate": 0.00011737449404941702, + "loss": 1.9075, + "step": 13682 + }, + { + "epoch": 1.2396548209553577, + "grad_norm": 0.88158118724823, + "learning_rate": 0.00011736845284842628, + "loss": 2.6485, + "step": 13683 + }, + { + "epoch": 1.2397454191298045, + "grad_norm": 0.8959039449691772, + "learning_rate": 0.0001173624116474355, + "loss": 2.5565, + "step": 13684 + }, + { + "epoch": 1.2398360173042513, + "grad_norm": 0.8899864554405212, + "learning_rate": 0.00011735637044644477, + "loss": 2.8745, + "step": 13685 + }, + { + "epoch": 1.239926615478698, + "grad_norm": 0.957848310470581, + "learning_rate": 0.000117350329245454, + "loss": 2.4079, + "step": 13686 + }, + { + "epoch": 1.2400172136531449, + "grad_norm": 0.8928811550140381, + "learning_rate": 0.00011734428804446325, + "loss": 2.8668, + "step": 13687 + }, + { + "epoch": 1.2401078118275917, + "grad_norm": 0.9272905588150024, + "learning_rate": 0.00011733824684347248, + "loss": 2.8046, + "step": 13688 + }, + { + "epoch": 1.2401984100020385, + "grad_norm": 0.8338502049446106, + "learning_rate": 0.00011733220564248173, + "loss": 2.6369, + "step": 13689 + }, + { + "epoch": 1.2402890081764852, + "grad_norm": 0.866508960723877, + "learning_rate": 0.00011732616444149098, + "loss": 2.6761, + "step": 13690 + }, + { + "epoch": 1.240379606350932, + "grad_norm": 0.8525264859199524, + "learning_rate": 0.00011732012324050021, + "loss": 2.0165, + "step": 13691 + }, + { + "epoch": 1.2404702045253788, + "grad_norm": 0.885350227355957, + "learning_rate": 0.00011731408203950947, + "loss": 2.6787, + "step": 13692 + }, + { + "epoch": 1.2405608026998256, + "grad_norm": 0.8945850729942322, + "learning_rate": 0.00011730804083851869, + "loss": 2.4646, + "step": 13693 + }, + { + "epoch": 1.2406514008742724, + "grad_norm": 0.9449759721755981, + "learning_rate": 0.00011730199963752795, + "loss": 2.6569, + "step": 13694 + }, + { + "epoch": 1.2407419990487192, + "grad_norm": 0.8794463276863098, + "learning_rate": 0.00011729595843653717, + "loss": 2.665, + "step": 13695 + }, + { + "epoch": 1.240832597223166, + "grad_norm": 0.8650881052017212, + "learning_rate": 0.00011728991723554644, + "loss": 2.7219, + "step": 13696 + }, + { + "epoch": 1.2409231953976128, + "grad_norm": 0.881790041923523, + "learning_rate": 0.00011728387603455568, + "loss": 2.763, + "step": 13697 + }, + { + "epoch": 1.2410137935720595, + "grad_norm": 0.8394583463668823, + "learning_rate": 0.00011727783483356492, + "loss": 2.5071, + "step": 13698 + }, + { + "epoch": 1.2411043917465063, + "grad_norm": 0.8784864544868469, + "learning_rate": 0.00011727179363257416, + "loss": 2.6133, + "step": 13699 + }, + { + "epoch": 1.2411949899209531, + "grad_norm": 0.9096598625183105, + "learning_rate": 0.0001172657524315834, + "loss": 2.6499, + "step": 13700 + }, + { + "epoch": 1.2412855880954, + "grad_norm": 1.0744953155517578, + "learning_rate": 0.00011725971123059265, + "loss": 2.6443, + "step": 13701 + }, + { + "epoch": 1.2413761862698467, + "grad_norm": 0.9006572365760803, + "learning_rate": 0.00011725367002960188, + "loss": 2.6525, + "step": 13702 + }, + { + "epoch": 1.2414667844442935, + "grad_norm": 1.023438572883606, + "learning_rate": 0.00011724762882861113, + "loss": 2.9222, + "step": 13703 + }, + { + "epoch": 1.2415573826187403, + "grad_norm": 0.9134818911552429, + "learning_rate": 0.00011724158762762039, + "loss": 2.808, + "step": 13704 + }, + { + "epoch": 1.241647980793187, + "grad_norm": 0.9021621942520142, + "learning_rate": 0.00011723554642662962, + "loss": 2.7069, + "step": 13705 + }, + { + "epoch": 1.2417385789676338, + "grad_norm": 0.747860312461853, + "learning_rate": 0.00011722950522563887, + "loss": 1.9416, + "step": 13706 + }, + { + "epoch": 1.2418291771420806, + "grad_norm": 0.8453734517097473, + "learning_rate": 0.0001172234640246481, + "loss": 2.4419, + "step": 13707 + }, + { + "epoch": 1.2419197753165274, + "grad_norm": 0.9550328254699707, + "learning_rate": 0.00011721742282365735, + "loss": 2.5146, + "step": 13708 + }, + { + "epoch": 1.2420103734909742, + "grad_norm": 0.8881344795227051, + "learning_rate": 0.00011721138162266659, + "loss": 2.488, + "step": 13709 + }, + { + "epoch": 1.242100971665421, + "grad_norm": 0.9149959683418274, + "learning_rate": 0.00011720534042167583, + "loss": 3.1721, + "step": 13710 + }, + { + "epoch": 1.2421915698398678, + "grad_norm": 0.8398410677909851, + "learning_rate": 0.00011719929922068508, + "loss": 2.1694, + "step": 13711 + }, + { + "epoch": 1.2422821680143146, + "grad_norm": 0.9640594124794006, + "learning_rate": 0.00011719325801969432, + "loss": 2.9711, + "step": 13712 + }, + { + "epoch": 1.2423727661887614, + "grad_norm": 0.9240449070930481, + "learning_rate": 0.00011718721681870358, + "loss": 2.6535, + "step": 13713 + }, + { + "epoch": 1.2424633643632081, + "grad_norm": 0.8476278185844421, + "learning_rate": 0.0001171811756177128, + "loss": 1.9514, + "step": 13714 + }, + { + "epoch": 1.242553962537655, + "grad_norm": 0.8824599981307983, + "learning_rate": 0.00011717513441672206, + "loss": 2.7245, + "step": 13715 + }, + { + "epoch": 1.2426445607121017, + "grad_norm": 0.8515196442604065, + "learning_rate": 0.00011716909321573128, + "loss": 2.646, + "step": 13716 + }, + { + "epoch": 1.2427351588865485, + "grad_norm": 0.9135792255401611, + "learning_rate": 0.00011716305201474054, + "loss": 2.8126, + "step": 13717 + }, + { + "epoch": 1.2428257570609953, + "grad_norm": 0.9382007122039795, + "learning_rate": 0.00011715701081374979, + "loss": 2.9864, + "step": 13718 + }, + { + "epoch": 1.242916355235442, + "grad_norm": 0.8738181591033936, + "learning_rate": 0.00011715096961275902, + "loss": 2.7724, + "step": 13719 + }, + { + "epoch": 1.2430069534098889, + "grad_norm": 0.8800392746925354, + "learning_rate": 0.00011714492841176827, + "loss": 2.6004, + "step": 13720 + }, + { + "epoch": 1.2430975515843357, + "grad_norm": 0.8504447937011719, + "learning_rate": 0.0001171388872107775, + "loss": 1.885, + "step": 13721 + }, + { + "epoch": 1.2431881497587824, + "grad_norm": 0.8960524797439575, + "learning_rate": 0.00011713284600978675, + "loss": 2.6502, + "step": 13722 + }, + { + "epoch": 1.2432787479332292, + "grad_norm": 0.8382000923156738, + "learning_rate": 0.00011712680480879599, + "loss": 2.6981, + "step": 13723 + }, + { + "epoch": 1.243369346107676, + "grad_norm": 0.9649202823638916, + "learning_rate": 0.00011712076360780525, + "loss": 2.9119, + "step": 13724 + }, + { + "epoch": 1.2434599442821228, + "grad_norm": 0.9578461647033691, + "learning_rate": 0.00011711472240681447, + "loss": 2.8557, + "step": 13725 + }, + { + "epoch": 1.2435505424565696, + "grad_norm": 0.9107789993286133, + "learning_rate": 0.00011710868120582373, + "loss": 2.99, + "step": 13726 + }, + { + "epoch": 1.2436411406310164, + "grad_norm": 0.93154376745224, + "learning_rate": 0.00011710264000483298, + "loss": 3.1144, + "step": 13727 + }, + { + "epoch": 1.243731738805463, + "grad_norm": 0.9467561841011047, + "learning_rate": 0.00011709659880384221, + "loss": 2.5778, + "step": 13728 + }, + { + "epoch": 1.24382233697991, + "grad_norm": 0.928127110004425, + "learning_rate": 0.00011709055760285146, + "loss": 2.8034, + "step": 13729 + }, + { + "epoch": 1.2439129351543565, + "grad_norm": 0.915582001209259, + "learning_rate": 0.00011708451640186069, + "loss": 2.7334, + "step": 13730 + }, + { + "epoch": 1.2440035333288035, + "grad_norm": 0.7572445869445801, + "learning_rate": 0.00011707847520086994, + "loss": 2.1954, + "step": 13731 + }, + { + "epoch": 1.24409413150325, + "grad_norm": 0.8945192694664001, + "learning_rate": 0.00011707243399987917, + "loss": 2.6707, + "step": 13732 + }, + { + "epoch": 1.244184729677697, + "grad_norm": 0.814207911491394, + "learning_rate": 0.00011706639279888842, + "loss": 2.7647, + "step": 13733 + }, + { + "epoch": 1.2442753278521437, + "grad_norm": 0.871125340461731, + "learning_rate": 0.00011706035159789768, + "loss": 2.7204, + "step": 13734 + }, + { + "epoch": 1.2443659260265907, + "grad_norm": 0.7295161485671997, + "learning_rate": 0.0001170543103969069, + "loss": 2.0274, + "step": 13735 + }, + { + "epoch": 1.2444565242010373, + "grad_norm": 0.9472147822380066, + "learning_rate": 0.00011704826919591616, + "loss": 2.6842, + "step": 13736 + }, + { + "epoch": 1.244547122375484, + "grad_norm": 0.9364922642707825, + "learning_rate": 0.0001170422279949254, + "loss": 3.0609, + "step": 13737 + }, + { + "epoch": 1.2446377205499308, + "grad_norm": 1.150909423828125, + "learning_rate": 0.00011703618679393464, + "loss": 2.7569, + "step": 13738 + }, + { + "epoch": 1.2447283187243776, + "grad_norm": 0.882233202457428, + "learning_rate": 0.00011703014559294388, + "loss": 2.7438, + "step": 13739 + }, + { + "epoch": 1.2448189168988244, + "grad_norm": 0.937177300453186, + "learning_rate": 0.00011702410439195313, + "loss": 2.8497, + "step": 13740 + }, + { + "epoch": 1.2449095150732712, + "grad_norm": 0.8483806848526001, + "learning_rate": 0.00011701806319096237, + "loss": 2.6834, + "step": 13741 + }, + { + "epoch": 1.245000113247718, + "grad_norm": 0.9221165180206299, + "learning_rate": 0.00011701202198997161, + "loss": 2.6734, + "step": 13742 + }, + { + "epoch": 1.2450907114221648, + "grad_norm": 0.8617711067199707, + "learning_rate": 0.00011700598078898086, + "loss": 2.4734, + "step": 13743 + }, + { + "epoch": 1.2451813095966116, + "grad_norm": 0.8919048309326172, + "learning_rate": 0.00011699993958799009, + "loss": 2.6335, + "step": 13744 + }, + { + "epoch": 1.2452719077710583, + "grad_norm": 0.8684387803077698, + "learning_rate": 0.00011699389838699935, + "loss": 2.5956, + "step": 13745 + }, + { + "epoch": 1.2453625059455051, + "grad_norm": 0.859746515750885, + "learning_rate": 0.00011698785718600857, + "loss": 2.9039, + "step": 13746 + }, + { + "epoch": 1.245453104119952, + "grad_norm": 0.8629549741744995, + "learning_rate": 0.00011698181598501783, + "loss": 2.7254, + "step": 13747 + }, + { + "epoch": 1.2455437022943987, + "grad_norm": 0.8924615979194641, + "learning_rate": 0.00011697577478402708, + "loss": 2.6625, + "step": 13748 + }, + { + "epoch": 1.2456343004688455, + "grad_norm": 0.848362922668457, + "learning_rate": 0.00011696973358303631, + "loss": 2.686, + "step": 13749 + }, + { + "epoch": 1.2457248986432923, + "grad_norm": 0.9513509273529053, + "learning_rate": 0.00011696369238204556, + "loss": 2.7146, + "step": 13750 + }, + { + "epoch": 1.245815496817739, + "grad_norm": 0.8530821800231934, + "learning_rate": 0.0001169576511810548, + "loss": 2.2086, + "step": 13751 + }, + { + "epoch": 1.2459060949921859, + "grad_norm": 0.9873633980751038, + "learning_rate": 0.00011695160998006404, + "loss": 2.52, + "step": 13752 + }, + { + "epoch": 1.2459966931666326, + "grad_norm": 0.8587110638618469, + "learning_rate": 0.00011694556877907328, + "loss": 2.7423, + "step": 13753 + }, + { + "epoch": 1.2460872913410794, + "grad_norm": 1.06276273727417, + "learning_rate": 0.00011693952757808253, + "loss": 2.9833, + "step": 13754 + }, + { + "epoch": 1.2461778895155262, + "grad_norm": 0.7569634318351746, + "learning_rate": 0.00011693348637709176, + "loss": 1.8518, + "step": 13755 + }, + { + "epoch": 1.246268487689973, + "grad_norm": 1.0076874494552612, + "learning_rate": 0.00011692744517610101, + "loss": 2.8836, + "step": 13756 + }, + { + "epoch": 1.2463590858644198, + "grad_norm": 0.8652862310409546, + "learning_rate": 0.00011692140397511027, + "loss": 2.7006, + "step": 13757 + }, + { + "epoch": 1.2464496840388666, + "grad_norm": 0.9061893820762634, + "learning_rate": 0.0001169153627741195, + "loss": 2.7648, + "step": 13758 + }, + { + "epoch": 1.2465402822133134, + "grad_norm": 0.9007641673088074, + "learning_rate": 0.00011690932157312875, + "loss": 2.7266, + "step": 13759 + }, + { + "epoch": 1.2466308803877602, + "grad_norm": 0.8954483866691589, + "learning_rate": 0.00011690328037213798, + "loss": 2.669, + "step": 13760 + }, + { + "epoch": 1.246721478562207, + "grad_norm": 0.8918956518173218, + "learning_rate": 0.00011689723917114723, + "loss": 2.6651, + "step": 13761 + }, + { + "epoch": 1.2468120767366537, + "grad_norm": 0.9045292735099792, + "learning_rate": 0.00011689119797015647, + "loss": 2.7474, + "step": 13762 + }, + { + "epoch": 1.2469026749111005, + "grad_norm": 0.928021252155304, + "learning_rate": 0.00011688515676916571, + "loss": 2.8787, + "step": 13763 + }, + { + "epoch": 1.2469932730855473, + "grad_norm": 0.9413610696792603, + "learning_rate": 0.00011687911556817497, + "loss": 2.7112, + "step": 13764 + }, + { + "epoch": 1.247083871259994, + "grad_norm": 0.9446027874946594, + "learning_rate": 0.0001168730743671842, + "loss": 2.9854, + "step": 13765 + }, + { + "epoch": 1.2471744694344409, + "grad_norm": 0.7981755137443542, + "learning_rate": 0.00011686703316619346, + "loss": 2.2016, + "step": 13766 + }, + { + "epoch": 1.2472650676088877, + "grad_norm": 0.9036828279495239, + "learning_rate": 0.00011686099196520268, + "loss": 2.7969, + "step": 13767 + }, + { + "epoch": 1.2473556657833345, + "grad_norm": 0.9609941840171814, + "learning_rate": 0.00011685495076421194, + "loss": 2.7555, + "step": 13768 + }, + { + "epoch": 1.2474462639577812, + "grad_norm": 0.9232535362243652, + "learning_rate": 0.00011684890956322117, + "loss": 2.7196, + "step": 13769 + }, + { + "epoch": 1.247536862132228, + "grad_norm": 0.9335423111915588, + "learning_rate": 0.00011684286836223042, + "loss": 2.7412, + "step": 13770 + }, + { + "epoch": 1.2476274603066748, + "grad_norm": 0.8813297748565674, + "learning_rate": 0.00011683682716123967, + "loss": 2.8571, + "step": 13771 + }, + { + "epoch": 1.2477180584811216, + "grad_norm": 0.8840615749359131, + "learning_rate": 0.0001168307859602489, + "loss": 2.644, + "step": 13772 + }, + { + "epoch": 1.2478086566555684, + "grad_norm": 0.8144643902778625, + "learning_rate": 0.00011682474475925815, + "loss": 2.4819, + "step": 13773 + }, + { + "epoch": 1.2478992548300152, + "grad_norm": 0.8697975277900696, + "learning_rate": 0.00011681870355826738, + "loss": 2.4529, + "step": 13774 + }, + { + "epoch": 1.247989853004462, + "grad_norm": 0.917266309261322, + "learning_rate": 0.00011681266235727663, + "loss": 2.69, + "step": 13775 + }, + { + "epoch": 1.2480804511789088, + "grad_norm": 0.9016988277435303, + "learning_rate": 0.00011680662115628586, + "loss": 2.9625, + "step": 13776 + }, + { + "epoch": 1.2481710493533555, + "grad_norm": 0.7463109493255615, + "learning_rate": 0.00011680057995529513, + "loss": 2.0452, + "step": 13777 + }, + { + "epoch": 1.2482616475278023, + "grad_norm": 0.9989697337150574, + "learning_rate": 0.00011679453875430437, + "loss": 2.8308, + "step": 13778 + }, + { + "epoch": 1.2483522457022491, + "grad_norm": 0.8923881649971008, + "learning_rate": 0.00011678849755331361, + "loss": 2.7212, + "step": 13779 + }, + { + "epoch": 1.248442843876696, + "grad_norm": 0.9524372220039368, + "learning_rate": 0.00011678245635232285, + "loss": 2.6132, + "step": 13780 + }, + { + "epoch": 1.2485334420511427, + "grad_norm": 0.9162205457687378, + "learning_rate": 0.00011677641515133209, + "loss": 2.485, + "step": 13781 + }, + { + "epoch": 1.2486240402255895, + "grad_norm": 0.9014964699745178, + "learning_rate": 0.00011677037395034134, + "loss": 2.7426, + "step": 13782 + }, + { + "epoch": 1.2487146384000363, + "grad_norm": 0.8988181352615356, + "learning_rate": 0.00011676433274935057, + "loss": 2.6737, + "step": 13783 + }, + { + "epoch": 1.248805236574483, + "grad_norm": 0.9634390473365784, + "learning_rate": 0.00011675829154835982, + "loss": 2.5997, + "step": 13784 + }, + { + "epoch": 1.2488958347489298, + "grad_norm": 0.9233003854751587, + "learning_rate": 0.00011675225034736905, + "loss": 2.8479, + "step": 13785 + }, + { + "epoch": 1.2489864329233766, + "grad_norm": 0.8954448699951172, + "learning_rate": 0.0001167462091463783, + "loss": 2.594, + "step": 13786 + }, + { + "epoch": 1.2490770310978234, + "grad_norm": 0.8657445311546326, + "learning_rate": 0.00011674016794538756, + "loss": 2.6772, + "step": 13787 + }, + { + "epoch": 1.2491676292722702, + "grad_norm": 0.8376715779304504, + "learning_rate": 0.00011673412674439678, + "loss": 2.5468, + "step": 13788 + }, + { + "epoch": 1.249258227446717, + "grad_norm": 0.8069362044334412, + "learning_rate": 0.00011672808554340604, + "loss": 1.8718, + "step": 13789 + }, + { + "epoch": 1.2493488256211638, + "grad_norm": 0.8774705529212952, + "learning_rate": 0.00011672204434241528, + "loss": 2.7932, + "step": 13790 + }, + { + "epoch": 1.2494394237956106, + "grad_norm": 0.8749731183052063, + "learning_rate": 0.00011671600314142452, + "loss": 2.7376, + "step": 13791 + }, + { + "epoch": 1.2495300219700574, + "grad_norm": 0.9246988892555237, + "learning_rate": 0.00011670996194043376, + "loss": 2.8665, + "step": 13792 + }, + { + "epoch": 1.2496206201445041, + "grad_norm": 0.8700436353683472, + "learning_rate": 0.000116703920739443, + "loss": 2.6397, + "step": 13793 + }, + { + "epoch": 1.249711218318951, + "grad_norm": 0.9283978939056396, + "learning_rate": 0.00011669787953845225, + "loss": 2.7722, + "step": 13794 + }, + { + "epoch": 1.2498018164933977, + "grad_norm": 0.8897081017494202, + "learning_rate": 0.00011669183833746149, + "loss": 2.5059, + "step": 13795 + }, + { + "epoch": 1.2498924146678445, + "grad_norm": 0.791551411151886, + "learning_rate": 0.00011668579713647075, + "loss": 2.145, + "step": 13796 + }, + { + "epoch": 1.2499830128422913, + "grad_norm": 0.8935575485229492, + "learning_rate": 0.00011667975593547997, + "loss": 2.7641, + "step": 13797 + }, + { + "epoch": 1.250073611016738, + "grad_norm": 0.8678048849105835, + "learning_rate": 0.00011667371473448923, + "loss": 2.7479, + "step": 13798 + }, + { + "epoch": 1.2501642091911849, + "grad_norm": 0.9810685515403748, + "learning_rate": 0.00011666767353349845, + "loss": 2.5205, + "step": 13799 + }, + { + "epoch": 1.2502548073656317, + "grad_norm": 0.856056809425354, + "learning_rate": 0.00011666163233250771, + "loss": 2.906, + "step": 13800 + }, + { + "epoch": 1.2503454055400784, + "grad_norm": 0.8192660808563232, + "learning_rate": 0.00011665559113151696, + "loss": 1.814, + "step": 13801 + }, + { + "epoch": 1.2504360037145252, + "grad_norm": 0.880900502204895, + "learning_rate": 0.0001166495499305262, + "loss": 2.8045, + "step": 13802 + }, + { + "epoch": 1.2505266018889718, + "grad_norm": 0.9985348582267761, + "learning_rate": 0.00011664350872953544, + "loss": 3.062, + "step": 13803 + }, + { + "epoch": 1.2506172000634188, + "grad_norm": 0.856438159942627, + "learning_rate": 0.00011663746752854468, + "loss": 2.7099, + "step": 13804 + }, + { + "epoch": 1.2507077982378654, + "grad_norm": 0.8457898497581482, + "learning_rate": 0.00011663142632755392, + "loss": 2.7552, + "step": 13805 + }, + { + "epoch": 1.2507983964123124, + "grad_norm": 0.8794441223144531, + "learning_rate": 0.00011662538512656316, + "loss": 2.6561, + "step": 13806 + }, + { + "epoch": 1.250888994586759, + "grad_norm": 0.8663780093193054, + "learning_rate": 0.0001166193439255724, + "loss": 2.7356, + "step": 13807 + }, + { + "epoch": 1.250979592761206, + "grad_norm": 0.8583436608314514, + "learning_rate": 0.00011661330272458167, + "loss": 2.4641, + "step": 13808 + }, + { + "epoch": 1.2510701909356525, + "grad_norm": 0.9188510179519653, + "learning_rate": 0.0001166072615235909, + "loss": 2.7755, + "step": 13809 + }, + { + "epoch": 1.2511607891100995, + "grad_norm": 0.8863204717636108, + "learning_rate": 0.00011660122032260015, + "loss": 2.6808, + "step": 13810 + }, + { + "epoch": 1.251251387284546, + "grad_norm": 0.8538246154785156, + "learning_rate": 0.00011659517912160938, + "loss": 2.6402, + "step": 13811 + }, + { + "epoch": 1.2513419854589931, + "grad_norm": 0.916484534740448, + "learning_rate": 0.00011658913792061863, + "loss": 2.7287, + "step": 13812 + }, + { + "epoch": 1.2514325836334397, + "grad_norm": 0.9287158846855164, + "learning_rate": 0.00011658309671962786, + "loss": 2.821, + "step": 13813 + }, + { + "epoch": 1.2515231818078867, + "grad_norm": 0.900719940662384, + "learning_rate": 0.00011657705551863711, + "loss": 2.5129, + "step": 13814 + }, + { + "epoch": 1.2516137799823333, + "grad_norm": 0.9198981523513794, + "learning_rate": 0.00011657101431764634, + "loss": 2.6784, + "step": 13815 + }, + { + "epoch": 1.2517043781567803, + "grad_norm": 1.0384328365325928, + "learning_rate": 0.00011656497311665559, + "loss": 2.9666, + "step": 13816 + }, + { + "epoch": 1.2517949763312268, + "grad_norm": 0.990770161151886, + "learning_rate": 0.00011655893191566485, + "loss": 2.9958, + "step": 13817 + }, + { + "epoch": 1.2518855745056738, + "grad_norm": 0.9217004776000977, + "learning_rate": 0.00011655289071467407, + "loss": 2.5464, + "step": 13818 + }, + { + "epoch": 1.2519761726801204, + "grad_norm": 0.9777029752731323, + "learning_rate": 0.00011654684951368334, + "loss": 2.6336, + "step": 13819 + }, + { + "epoch": 1.2520667708545674, + "grad_norm": 0.9156920909881592, + "learning_rate": 0.00011654080831269256, + "loss": 2.8731, + "step": 13820 + }, + { + "epoch": 1.252157369029014, + "grad_norm": 0.9146996140480042, + "learning_rate": 0.00011653476711170182, + "loss": 2.8167, + "step": 13821 + }, + { + "epoch": 1.252247967203461, + "grad_norm": 0.9937402606010437, + "learning_rate": 0.00011652872591071105, + "loss": 2.7207, + "step": 13822 + }, + { + "epoch": 1.2523385653779076, + "grad_norm": 0.87419593334198, + "learning_rate": 0.0001165226847097203, + "loss": 2.4909, + "step": 13823 + }, + { + "epoch": 1.2524291635523543, + "grad_norm": 0.7413035035133362, + "learning_rate": 0.00011651664350872955, + "loss": 1.9183, + "step": 13824 + }, + { + "epoch": 1.2525197617268011, + "grad_norm": 0.9126123189926147, + "learning_rate": 0.00011651060230773878, + "loss": 2.5918, + "step": 13825 + }, + { + "epoch": 1.252610359901248, + "grad_norm": 0.8771729469299316, + "learning_rate": 0.00011650456110674803, + "loss": 2.9176, + "step": 13826 + }, + { + "epoch": 1.2527009580756947, + "grad_norm": 0.8717336654663086, + "learning_rate": 0.00011649851990575726, + "loss": 2.5924, + "step": 13827 + }, + { + "epoch": 1.2527915562501415, + "grad_norm": 0.8539950847625732, + "learning_rate": 0.00011649247870476652, + "loss": 2.7182, + "step": 13828 + }, + { + "epoch": 1.2528821544245883, + "grad_norm": 0.8502089381217957, + "learning_rate": 0.00011648643750377574, + "loss": 2.5946, + "step": 13829 + }, + { + "epoch": 1.252972752599035, + "grad_norm": 0.8908237814903259, + "learning_rate": 0.000116480396302785, + "loss": 2.7591, + "step": 13830 + }, + { + "epoch": 1.2530633507734819, + "grad_norm": 1.0419542789459229, + "learning_rate": 0.00011647435510179425, + "loss": 2.6939, + "step": 13831 + }, + { + "epoch": 1.2531539489479286, + "grad_norm": 0.8944864869117737, + "learning_rate": 0.00011646831390080349, + "loss": 2.7512, + "step": 13832 + }, + { + "epoch": 1.2532445471223754, + "grad_norm": 0.887995183467865, + "learning_rate": 0.00011646227269981273, + "loss": 2.7121, + "step": 13833 + }, + { + "epoch": 1.2533351452968222, + "grad_norm": 0.916307270526886, + "learning_rate": 0.00011645623149882197, + "loss": 2.5615, + "step": 13834 + }, + { + "epoch": 1.253425743471269, + "grad_norm": 0.8498497009277344, + "learning_rate": 0.00011645019029783122, + "loss": 2.4, + "step": 13835 + }, + { + "epoch": 1.2535163416457158, + "grad_norm": 0.8641764521598816, + "learning_rate": 0.00011644414909684045, + "loss": 2.9435, + "step": 13836 + }, + { + "epoch": 1.2536069398201626, + "grad_norm": 0.8439734578132629, + "learning_rate": 0.0001164381078958497, + "loss": 2.5134, + "step": 13837 + }, + { + "epoch": 1.2536975379946094, + "grad_norm": 0.9413897395133972, + "learning_rate": 0.00011643206669485896, + "loss": 2.6806, + "step": 13838 + }, + { + "epoch": 1.2537881361690562, + "grad_norm": 0.88344407081604, + "learning_rate": 0.00011642602549386818, + "loss": 2.7333, + "step": 13839 + }, + { + "epoch": 1.253878734343503, + "grad_norm": 0.9865663647651672, + "learning_rate": 0.00011641998429287744, + "loss": 2.8791, + "step": 13840 + }, + { + "epoch": 1.2539693325179497, + "grad_norm": 0.930975615978241, + "learning_rate": 0.00011641394309188667, + "loss": 2.4122, + "step": 13841 + }, + { + "epoch": 1.2540599306923965, + "grad_norm": 0.8640120029449463, + "learning_rate": 0.00011640790189089592, + "loss": 2.5783, + "step": 13842 + }, + { + "epoch": 1.2541505288668433, + "grad_norm": 0.903770923614502, + "learning_rate": 0.00011640186068990516, + "loss": 2.6666, + "step": 13843 + }, + { + "epoch": 1.25424112704129, + "grad_norm": 0.9257115721702576, + "learning_rate": 0.0001163958194889144, + "loss": 2.5925, + "step": 13844 + }, + { + "epoch": 1.2543317252157369, + "grad_norm": 0.8976661562919617, + "learning_rate": 0.00011638977828792364, + "loss": 2.8361, + "step": 13845 + }, + { + "epoch": 1.2544223233901837, + "grad_norm": 0.8531420826911926, + "learning_rate": 0.00011638373708693288, + "loss": 2.6913, + "step": 13846 + }, + { + "epoch": 1.2545129215646305, + "grad_norm": 0.9090835452079773, + "learning_rate": 0.00011637769588594215, + "loss": 2.8301, + "step": 13847 + }, + { + "epoch": 1.2546035197390772, + "grad_norm": 0.9282850027084351, + "learning_rate": 0.00011637165468495137, + "loss": 2.9139, + "step": 13848 + }, + { + "epoch": 1.254694117913524, + "grad_norm": 0.9210748672485352, + "learning_rate": 0.00011636561348396063, + "loss": 2.6868, + "step": 13849 + }, + { + "epoch": 1.2547847160879708, + "grad_norm": 0.8423678278923035, + "learning_rate": 0.00011635957228296985, + "loss": 2.251, + "step": 13850 + }, + { + "epoch": 1.2548753142624176, + "grad_norm": 0.9425080418586731, + "learning_rate": 0.00011635353108197911, + "loss": 2.4624, + "step": 13851 + }, + { + "epoch": 1.2549659124368644, + "grad_norm": 0.9456670880317688, + "learning_rate": 0.00011634748988098833, + "loss": 2.7126, + "step": 13852 + }, + { + "epoch": 1.2550565106113112, + "grad_norm": 0.8109333515167236, + "learning_rate": 0.00011634144867999759, + "loss": 2.0727, + "step": 13853 + }, + { + "epoch": 1.255147108785758, + "grad_norm": 0.9466536641120911, + "learning_rate": 0.00011633540747900684, + "loss": 2.704, + "step": 13854 + }, + { + "epoch": 1.2552377069602048, + "grad_norm": 0.9292759895324707, + "learning_rate": 0.00011632936627801607, + "loss": 2.656, + "step": 13855 + }, + { + "epoch": 1.2553283051346515, + "grad_norm": 0.889254629611969, + "learning_rate": 0.00011632332507702532, + "loss": 2.473, + "step": 13856 + }, + { + "epoch": 1.2554189033090983, + "grad_norm": 0.934680700302124, + "learning_rate": 0.00011631728387603455, + "loss": 2.7109, + "step": 13857 + }, + { + "epoch": 1.2555095014835451, + "grad_norm": 0.9137035012245178, + "learning_rate": 0.0001163112426750438, + "loss": 2.7499, + "step": 13858 + }, + { + "epoch": 1.255600099657992, + "grad_norm": 0.801255464553833, + "learning_rate": 0.00011630520147405304, + "loss": 2.2309, + "step": 13859 + }, + { + "epoch": 1.2556906978324387, + "grad_norm": 0.8864529728889465, + "learning_rate": 0.0001162991602730623, + "loss": 2.498, + "step": 13860 + }, + { + "epoch": 1.2557812960068855, + "grad_norm": 0.8000440001487732, + "learning_rate": 0.00011629311907207154, + "loss": 1.9453, + "step": 13861 + }, + { + "epoch": 1.2558718941813323, + "grad_norm": 0.9022583365440369, + "learning_rate": 0.00011628707787108078, + "loss": 2.6963, + "step": 13862 + }, + { + "epoch": 1.255962492355779, + "grad_norm": 0.8363059759140015, + "learning_rate": 0.00011628103667009003, + "loss": 2.048, + "step": 13863 + }, + { + "epoch": 1.2560530905302258, + "grad_norm": 0.9264611005783081, + "learning_rate": 0.00011627499546909926, + "loss": 2.5388, + "step": 13864 + }, + { + "epoch": 1.2561436887046726, + "grad_norm": 0.9257709383964539, + "learning_rate": 0.00011626895426810851, + "loss": 2.744, + "step": 13865 + }, + { + "epoch": 1.2562342868791194, + "grad_norm": 0.956027090549469, + "learning_rate": 0.00011626291306711774, + "loss": 3.023, + "step": 13866 + }, + { + "epoch": 1.2563248850535662, + "grad_norm": 0.811943531036377, + "learning_rate": 0.00011625687186612699, + "loss": 2.1195, + "step": 13867 + }, + { + "epoch": 1.256415483228013, + "grad_norm": 0.8883787393569946, + "learning_rate": 0.00011625083066513625, + "loss": 2.581, + "step": 13868 + }, + { + "epoch": 1.2565060814024598, + "grad_norm": 0.9220066070556641, + "learning_rate": 0.00011624478946414547, + "loss": 2.6263, + "step": 13869 + }, + { + "epoch": 1.2565966795769066, + "grad_norm": 0.9127610921859741, + "learning_rate": 0.00011623874826315473, + "loss": 2.6411, + "step": 13870 + }, + { + "epoch": 1.2566872777513534, + "grad_norm": 0.9056390523910522, + "learning_rate": 0.00011623270706216395, + "loss": 2.7879, + "step": 13871 + }, + { + "epoch": 1.2567778759258001, + "grad_norm": 1.1237552165985107, + "learning_rate": 0.00011622666586117321, + "loss": 2.6576, + "step": 13872 + }, + { + "epoch": 1.256868474100247, + "grad_norm": 0.9010747075080872, + "learning_rate": 0.00011622062466018245, + "loss": 2.648, + "step": 13873 + }, + { + "epoch": 1.2569590722746937, + "grad_norm": 0.956256628036499, + "learning_rate": 0.0001162145834591917, + "loss": 2.715, + "step": 13874 + }, + { + "epoch": 1.2570496704491405, + "grad_norm": 0.8433724045753479, + "learning_rate": 0.00011620854225820093, + "loss": 2.6472, + "step": 13875 + }, + { + "epoch": 1.2571402686235873, + "grad_norm": 0.943148136138916, + "learning_rate": 0.00011620250105721018, + "loss": 2.3181, + "step": 13876 + }, + { + "epoch": 1.257230866798034, + "grad_norm": 0.9294377565383911, + "learning_rate": 0.00011619645985621943, + "loss": 2.8516, + "step": 13877 + }, + { + "epoch": 1.2573214649724809, + "grad_norm": 0.9265919923782349, + "learning_rate": 0.00011619041865522866, + "loss": 2.6827, + "step": 13878 + }, + { + "epoch": 1.2574120631469277, + "grad_norm": 0.9792430996894836, + "learning_rate": 0.00011618437745423792, + "loss": 2.7079, + "step": 13879 + }, + { + "epoch": 1.2575026613213744, + "grad_norm": 0.8818276524543762, + "learning_rate": 0.00011617833625324714, + "loss": 2.7438, + "step": 13880 + }, + { + "epoch": 1.2575932594958212, + "grad_norm": 0.9462215304374695, + "learning_rate": 0.0001161722950522564, + "loss": 2.79, + "step": 13881 + }, + { + "epoch": 1.257683857670268, + "grad_norm": 0.7950134873390198, + "learning_rate": 0.00011616625385126562, + "loss": 1.8817, + "step": 13882 + }, + { + "epoch": 1.2577744558447148, + "grad_norm": 0.808558464050293, + "learning_rate": 0.00011616021265027488, + "loss": 2.0174, + "step": 13883 + }, + { + "epoch": 1.2578650540191614, + "grad_norm": 0.9419522285461426, + "learning_rate": 0.00011615417144928413, + "loss": 2.6235, + "step": 13884 + }, + { + "epoch": 1.2579556521936084, + "grad_norm": 0.8888707756996155, + "learning_rate": 0.00011614813024829337, + "loss": 2.6569, + "step": 13885 + }, + { + "epoch": 1.258046250368055, + "grad_norm": 0.9328742623329163, + "learning_rate": 0.00011614208904730261, + "loss": 2.7602, + "step": 13886 + }, + { + "epoch": 1.258136848542502, + "grad_norm": 0.9389846920967102, + "learning_rate": 0.00011613604784631185, + "loss": 2.3761, + "step": 13887 + }, + { + "epoch": 1.2582274467169485, + "grad_norm": 0.8517104983329773, + "learning_rate": 0.0001161300066453211, + "loss": 2.5775, + "step": 13888 + }, + { + "epoch": 1.2583180448913955, + "grad_norm": 0.9327241778373718, + "learning_rate": 0.00011612396544433033, + "loss": 2.7055, + "step": 13889 + }, + { + "epoch": 1.258408643065842, + "grad_norm": 0.9001374840736389, + "learning_rate": 0.00011611792424333958, + "loss": 2.6224, + "step": 13890 + }, + { + "epoch": 1.2584992412402891, + "grad_norm": 1.03312349319458, + "learning_rate": 0.00011611188304234884, + "loss": 2.6668, + "step": 13891 + }, + { + "epoch": 1.2585898394147357, + "grad_norm": 0.9674019813537598, + "learning_rate": 0.00011610584184135807, + "loss": 2.477, + "step": 13892 + }, + { + "epoch": 1.2586804375891827, + "grad_norm": 0.9348928332328796, + "learning_rate": 0.00011609980064036732, + "loss": 2.6882, + "step": 13893 + }, + { + "epoch": 1.2587710357636293, + "grad_norm": 1.0012407302856445, + "learning_rate": 0.00011609375943937655, + "loss": 2.4517, + "step": 13894 + }, + { + "epoch": 1.2588616339380763, + "grad_norm": 0.9818168878555298, + "learning_rate": 0.0001160877182383858, + "loss": 3.0619, + "step": 13895 + }, + { + "epoch": 1.2589522321125228, + "grad_norm": 0.9751062989234924, + "learning_rate": 0.00011608167703739503, + "loss": 2.4979, + "step": 13896 + }, + { + "epoch": 1.2590428302869698, + "grad_norm": 1.0335019826889038, + "learning_rate": 0.00011607563583640428, + "loss": 2.8652, + "step": 13897 + }, + { + "epoch": 1.2591334284614164, + "grad_norm": 0.9071747064590454, + "learning_rate": 0.00011606959463541353, + "loss": 2.7207, + "step": 13898 + }, + { + "epoch": 1.2592240266358634, + "grad_norm": 0.901438295841217, + "learning_rate": 0.00011606355343442276, + "loss": 2.9127, + "step": 13899 + }, + { + "epoch": 1.25931462481031, + "grad_norm": 0.8648412823677063, + "learning_rate": 0.00011605751223343203, + "loss": 2.4303, + "step": 13900 + }, + { + "epoch": 1.259405222984757, + "grad_norm": 1.0836985111236572, + "learning_rate": 0.00011605147103244125, + "loss": 2.5376, + "step": 13901 + }, + { + "epoch": 1.2594958211592036, + "grad_norm": 0.9522296190261841, + "learning_rate": 0.00011604542983145051, + "loss": 2.9003, + "step": 13902 + }, + { + "epoch": 1.2595864193336506, + "grad_norm": 0.8819613456726074, + "learning_rate": 0.00011603938863045973, + "loss": 2.7289, + "step": 13903 + }, + { + "epoch": 1.2596770175080971, + "grad_norm": 0.8520228862762451, + "learning_rate": 0.00011603334742946899, + "loss": 2.3361, + "step": 13904 + }, + { + "epoch": 1.259767615682544, + "grad_norm": 0.8915687799453735, + "learning_rate": 0.00011602730622847822, + "loss": 2.7201, + "step": 13905 + }, + { + "epoch": 1.2598582138569907, + "grad_norm": 0.8908092379570007, + "learning_rate": 0.00011602126502748747, + "loss": 2.7406, + "step": 13906 + }, + { + "epoch": 1.2599488120314375, + "grad_norm": 0.9820717573165894, + "learning_rate": 0.00011601522382649672, + "loss": 2.5402, + "step": 13907 + }, + { + "epoch": 1.2600394102058843, + "grad_norm": 0.929581344127655, + "learning_rate": 0.00011600918262550595, + "loss": 2.6962, + "step": 13908 + }, + { + "epoch": 1.260130008380331, + "grad_norm": 1.0151978731155396, + "learning_rate": 0.0001160031414245152, + "loss": 2.9919, + "step": 13909 + }, + { + "epoch": 1.2602206065547779, + "grad_norm": 0.7524080276489258, + "learning_rate": 0.00011599710022352443, + "loss": 1.9948, + "step": 13910 + }, + { + "epoch": 1.2603112047292246, + "grad_norm": 1.0705671310424805, + "learning_rate": 0.00011599105902253368, + "loss": 2.7381, + "step": 13911 + }, + { + "epoch": 1.2604018029036714, + "grad_norm": 0.9453121423721313, + "learning_rate": 0.00011598501782154292, + "loss": 2.6522, + "step": 13912 + }, + { + "epoch": 1.2604924010781182, + "grad_norm": 0.9322149753570557, + "learning_rate": 0.00011597897662055218, + "loss": 2.6096, + "step": 13913 + }, + { + "epoch": 1.260582999252565, + "grad_norm": 0.9171059727668762, + "learning_rate": 0.00011597293541956142, + "loss": 2.7394, + "step": 13914 + }, + { + "epoch": 1.2606735974270118, + "grad_norm": 0.9364104866981506, + "learning_rate": 0.00011596689421857066, + "loss": 2.9597, + "step": 13915 + }, + { + "epoch": 1.2607641956014586, + "grad_norm": 0.9122873544692993, + "learning_rate": 0.0001159608530175799, + "loss": 2.785, + "step": 13916 + }, + { + "epoch": 1.2608547937759054, + "grad_norm": 0.8770792484283447, + "learning_rate": 0.00011595481181658914, + "loss": 2.7161, + "step": 13917 + }, + { + "epoch": 1.2609453919503522, + "grad_norm": 0.7288605570793152, + "learning_rate": 0.00011594877061559839, + "loss": 2.0378, + "step": 13918 + }, + { + "epoch": 1.261035990124799, + "grad_norm": 0.8802765011787415, + "learning_rate": 0.00011594272941460762, + "loss": 2.6664, + "step": 13919 + }, + { + "epoch": 1.2611265882992457, + "grad_norm": 1.0392309427261353, + "learning_rate": 0.00011593668821361687, + "loss": 2.6355, + "step": 13920 + }, + { + "epoch": 1.2612171864736925, + "grad_norm": 0.9023177623748779, + "learning_rate": 0.00011593064701262613, + "loss": 2.6368, + "step": 13921 + }, + { + "epoch": 1.2613077846481393, + "grad_norm": 0.951667308807373, + "learning_rate": 0.00011592460581163535, + "loss": 2.7444, + "step": 13922 + }, + { + "epoch": 1.261398382822586, + "grad_norm": 0.967898428440094, + "learning_rate": 0.00011591856461064461, + "loss": 2.7526, + "step": 13923 + }, + { + "epoch": 1.2614889809970329, + "grad_norm": 0.9075466990470886, + "learning_rate": 0.00011591252340965385, + "loss": 2.8321, + "step": 13924 + }, + { + "epoch": 1.2615795791714797, + "grad_norm": 0.9287778735160828, + "learning_rate": 0.0001159064822086631, + "loss": 2.6946, + "step": 13925 + }, + { + "epoch": 1.2616701773459265, + "grad_norm": 0.762406051158905, + "learning_rate": 0.00011590044100767233, + "loss": 1.9795, + "step": 13926 + }, + { + "epoch": 1.2617607755203732, + "grad_norm": 0.9416145086288452, + "learning_rate": 0.00011589439980668158, + "loss": 2.6467, + "step": 13927 + }, + { + "epoch": 1.26185137369482, + "grad_norm": 0.9303480386734009, + "learning_rate": 0.00011588835860569082, + "loss": 2.6724, + "step": 13928 + }, + { + "epoch": 1.2619419718692668, + "grad_norm": 0.9165884256362915, + "learning_rate": 0.00011588231740470006, + "loss": 2.4341, + "step": 13929 + }, + { + "epoch": 1.2620325700437136, + "grad_norm": 0.8955235481262207, + "learning_rate": 0.0001158762762037093, + "loss": 2.5541, + "step": 13930 + }, + { + "epoch": 1.2621231682181604, + "grad_norm": 0.914130687713623, + "learning_rate": 0.00011587023500271854, + "loss": 2.7287, + "step": 13931 + }, + { + "epoch": 1.2622137663926072, + "grad_norm": 0.808812141418457, + "learning_rate": 0.0001158641938017278, + "loss": 2.2809, + "step": 13932 + }, + { + "epoch": 1.262304364567054, + "grad_norm": 0.8660774827003479, + "learning_rate": 0.00011585815260073702, + "loss": 2.8855, + "step": 13933 + }, + { + "epoch": 1.2623949627415008, + "grad_norm": 0.7531846165657043, + "learning_rate": 0.00011585211139974628, + "loss": 1.9642, + "step": 13934 + }, + { + "epoch": 1.2624855609159475, + "grad_norm": 0.882360577583313, + "learning_rate": 0.0001158460701987555, + "loss": 2.61, + "step": 13935 + }, + { + "epoch": 1.2625761590903943, + "grad_norm": 1.0474483966827393, + "learning_rate": 0.00011584002899776476, + "loss": 2.7475, + "step": 13936 + }, + { + "epoch": 1.2626667572648411, + "grad_norm": 0.8630497455596924, + "learning_rate": 0.00011583398779677401, + "loss": 2.5906, + "step": 13937 + }, + { + "epoch": 1.262757355439288, + "grad_norm": 0.8596481680870056, + "learning_rate": 0.00011582794659578324, + "loss": 2.2349, + "step": 13938 + }, + { + "epoch": 1.2628479536137347, + "grad_norm": 0.875924289226532, + "learning_rate": 0.00011582190539479249, + "loss": 2.9041, + "step": 13939 + }, + { + "epoch": 1.2629385517881815, + "grad_norm": 0.8984411358833313, + "learning_rate": 0.00011581586419380173, + "loss": 2.7077, + "step": 13940 + }, + { + "epoch": 1.2630291499626283, + "grad_norm": 0.9591841697692871, + "learning_rate": 0.00011580982299281097, + "loss": 2.8377, + "step": 13941 + }, + { + "epoch": 1.263119748137075, + "grad_norm": 0.9735511541366577, + "learning_rate": 0.00011580378179182021, + "loss": 2.5233, + "step": 13942 + }, + { + "epoch": 1.2632103463115218, + "grad_norm": 0.8626397848129272, + "learning_rate": 0.00011579774059082946, + "loss": 2.5011, + "step": 13943 + }, + { + "epoch": 1.2633009444859686, + "grad_norm": 1.0481780767440796, + "learning_rate": 0.00011579169938983872, + "loss": 2.8745, + "step": 13944 + }, + { + "epoch": 1.2633915426604154, + "grad_norm": 0.9025171995162964, + "learning_rate": 0.00011578565818884795, + "loss": 2.6406, + "step": 13945 + }, + { + "epoch": 1.2634821408348622, + "grad_norm": 0.905469536781311, + "learning_rate": 0.0001157796169878572, + "loss": 2.7803, + "step": 13946 + }, + { + "epoch": 1.263572739009309, + "grad_norm": 0.8571692109107971, + "learning_rate": 0.00011577357578686643, + "loss": 1.94, + "step": 13947 + }, + { + "epoch": 1.2636633371837558, + "grad_norm": 0.8897181749343872, + "learning_rate": 0.00011576753458587568, + "loss": 2.5455, + "step": 13948 + }, + { + "epoch": 1.2637539353582026, + "grad_norm": 0.8642517924308777, + "learning_rate": 0.00011576149338488491, + "loss": 2.6901, + "step": 13949 + }, + { + "epoch": 1.2638445335326494, + "grad_norm": 0.9453202486038208, + "learning_rate": 0.00011575545218389416, + "loss": 2.6738, + "step": 13950 + }, + { + "epoch": 1.2639351317070961, + "grad_norm": 0.881290853023529, + "learning_rate": 0.00011574941098290342, + "loss": 2.8954, + "step": 13951 + }, + { + "epoch": 1.264025729881543, + "grad_norm": 0.8399613499641418, + "learning_rate": 0.00011574336978191264, + "loss": 2.596, + "step": 13952 + }, + { + "epoch": 1.2641163280559897, + "grad_norm": 0.8390774726867676, + "learning_rate": 0.0001157373285809219, + "loss": 2.4782, + "step": 13953 + }, + { + "epoch": 1.2642069262304365, + "grad_norm": 0.8183889389038086, + "learning_rate": 0.00011573128737993112, + "loss": 2.745, + "step": 13954 + }, + { + "epoch": 1.2642975244048833, + "grad_norm": 0.8352581262588501, + "learning_rate": 0.00011572524617894039, + "loss": 2.562, + "step": 13955 + }, + { + "epoch": 1.26438812257933, + "grad_norm": 0.899666965007782, + "learning_rate": 0.00011571920497794962, + "loss": 2.6482, + "step": 13956 + }, + { + "epoch": 1.2644787207537769, + "grad_norm": 0.8492160439491272, + "learning_rate": 0.00011571316377695887, + "loss": 2.4415, + "step": 13957 + }, + { + "epoch": 1.2645693189282237, + "grad_norm": 0.8617951273918152, + "learning_rate": 0.00011570712257596812, + "loss": 2.4716, + "step": 13958 + }, + { + "epoch": 1.2646599171026705, + "grad_norm": 0.8835850954055786, + "learning_rate": 0.00011570108137497735, + "loss": 2.5219, + "step": 13959 + }, + { + "epoch": 1.2647505152771172, + "grad_norm": 0.874099612236023, + "learning_rate": 0.0001156950401739866, + "loss": 2.8088, + "step": 13960 + }, + { + "epoch": 1.264841113451564, + "grad_norm": 0.9005377888679504, + "learning_rate": 0.00011568899897299583, + "loss": 2.7, + "step": 13961 + }, + { + "epoch": 1.2649317116260108, + "grad_norm": 0.846848726272583, + "learning_rate": 0.00011568295777200508, + "loss": 2.5417, + "step": 13962 + }, + { + "epoch": 1.2650223098004576, + "grad_norm": 0.876593291759491, + "learning_rate": 0.00011567691657101431, + "loss": 2.812, + "step": 13963 + }, + { + "epoch": 1.2651129079749044, + "grad_norm": 0.8210822939872742, + "learning_rate": 0.00011567087537002357, + "loss": 2.6352, + "step": 13964 + }, + { + "epoch": 1.265203506149351, + "grad_norm": 0.9278470873832703, + "learning_rate": 0.0001156648341690328, + "loss": 2.555, + "step": 13965 + }, + { + "epoch": 1.265294104323798, + "grad_norm": 0.668870747089386, + "learning_rate": 0.00011565879296804206, + "loss": 1.441, + "step": 13966 + }, + { + "epoch": 1.2653847024982445, + "grad_norm": 0.8669072389602661, + "learning_rate": 0.0001156527517670513, + "loss": 3.0846, + "step": 13967 + }, + { + "epoch": 1.2654753006726915, + "grad_norm": 0.924569845199585, + "learning_rate": 0.00011564671056606054, + "loss": 2.6763, + "step": 13968 + }, + { + "epoch": 1.265565898847138, + "grad_norm": 0.8386816382408142, + "learning_rate": 0.00011564066936506978, + "loss": 2.6702, + "step": 13969 + }, + { + "epoch": 1.2656564970215851, + "grad_norm": 0.9433380961418152, + "learning_rate": 0.00011563462816407902, + "loss": 2.7522, + "step": 13970 + }, + { + "epoch": 1.2657470951960317, + "grad_norm": 0.8722846508026123, + "learning_rate": 0.00011562858696308827, + "loss": 2.0973, + "step": 13971 + }, + { + "epoch": 1.2658376933704787, + "grad_norm": 0.8555676341056824, + "learning_rate": 0.0001156225457620975, + "loss": 2.9494, + "step": 13972 + }, + { + "epoch": 1.2659282915449253, + "grad_norm": 0.9186474680900574, + "learning_rate": 0.00011561650456110675, + "loss": 2.7071, + "step": 13973 + }, + { + "epoch": 1.2660188897193723, + "grad_norm": 0.9277351498603821, + "learning_rate": 0.00011561046336011601, + "loss": 2.7917, + "step": 13974 + }, + { + "epoch": 1.2661094878938188, + "grad_norm": 0.9405906796455383, + "learning_rate": 0.00011560442215912523, + "loss": 2.8425, + "step": 13975 + }, + { + "epoch": 1.2662000860682658, + "grad_norm": 0.8209936022758484, + "learning_rate": 0.00011559838095813449, + "loss": 1.9105, + "step": 13976 + }, + { + "epoch": 1.2662906842427124, + "grad_norm": 0.9271464347839355, + "learning_rate": 0.00011559233975714372, + "loss": 2.738, + "step": 13977 + }, + { + "epoch": 1.2663812824171594, + "grad_norm": 0.903946578502655, + "learning_rate": 0.00011558629855615297, + "loss": 2.7248, + "step": 13978 + }, + { + "epoch": 1.266471880591606, + "grad_norm": 0.9914761781692505, + "learning_rate": 0.0001155802573551622, + "loss": 2.6404, + "step": 13979 + }, + { + "epoch": 1.266562478766053, + "grad_norm": 0.9149607419967651, + "learning_rate": 0.00011557421615417145, + "loss": 2.4973, + "step": 13980 + }, + { + "epoch": 1.2666530769404996, + "grad_norm": 0.9603148698806763, + "learning_rate": 0.0001155681749531807, + "loss": 2.6287, + "step": 13981 + }, + { + "epoch": 1.2667436751149466, + "grad_norm": 0.7955971956253052, + "learning_rate": 0.00011556213375218994, + "loss": 1.9236, + "step": 13982 + }, + { + "epoch": 1.2668342732893931, + "grad_norm": 0.8805874586105347, + "learning_rate": 0.0001155560925511992, + "loss": 2.7675, + "step": 13983 + }, + { + "epoch": 1.2669248714638401, + "grad_norm": 0.8721504211425781, + "learning_rate": 0.00011555005135020842, + "loss": 2.7677, + "step": 13984 + }, + { + "epoch": 1.2670154696382867, + "grad_norm": 0.9754440784454346, + "learning_rate": 0.00011554401014921768, + "loss": 2.9476, + "step": 13985 + }, + { + "epoch": 1.2671060678127335, + "grad_norm": 0.8835493922233582, + "learning_rate": 0.0001155379689482269, + "loss": 2.696, + "step": 13986 + }, + { + "epoch": 1.2671966659871803, + "grad_norm": 0.8421795964241028, + "learning_rate": 0.00011553192774723616, + "loss": 2.6983, + "step": 13987 + }, + { + "epoch": 1.267287264161627, + "grad_norm": 0.924566924571991, + "learning_rate": 0.00011552588654624541, + "loss": 2.6592, + "step": 13988 + }, + { + "epoch": 1.2673778623360739, + "grad_norm": 0.8973004221916199, + "learning_rate": 0.00011551984534525464, + "loss": 2.6337, + "step": 13989 + }, + { + "epoch": 1.2674684605105206, + "grad_norm": 0.8989184498786926, + "learning_rate": 0.00011551380414426389, + "loss": 2.887, + "step": 13990 + }, + { + "epoch": 1.2675590586849674, + "grad_norm": 0.9658910632133484, + "learning_rate": 0.00011550776294327312, + "loss": 2.6646, + "step": 13991 + }, + { + "epoch": 1.2676496568594142, + "grad_norm": 0.893592119216919, + "learning_rate": 0.00011550172174228237, + "loss": 2.7529, + "step": 13992 + }, + { + "epoch": 1.267740255033861, + "grad_norm": 0.9793941974639893, + "learning_rate": 0.0001154956805412916, + "loss": 2.6267, + "step": 13993 + }, + { + "epoch": 1.2678308532083078, + "grad_norm": 0.9286267757415771, + "learning_rate": 0.00011548963934030085, + "loss": 3.1204, + "step": 13994 + }, + { + "epoch": 1.2679214513827546, + "grad_norm": 0.8792372941970825, + "learning_rate": 0.00011548359813931009, + "loss": 2.5242, + "step": 13995 + }, + { + "epoch": 1.2680120495572014, + "grad_norm": 1.0015803575515747, + "learning_rate": 0.00011547755693831935, + "loss": 2.7351, + "step": 13996 + }, + { + "epoch": 1.2681026477316482, + "grad_norm": 0.8586538434028625, + "learning_rate": 0.0001154715157373286, + "loss": 2.5309, + "step": 13997 + }, + { + "epoch": 1.268193245906095, + "grad_norm": 0.8838847875595093, + "learning_rate": 0.00011546547453633783, + "loss": 2.7352, + "step": 13998 + }, + { + "epoch": 1.2682838440805417, + "grad_norm": 0.879650354385376, + "learning_rate": 0.00011545943333534708, + "loss": 2.5329, + "step": 13999 + }, + { + "epoch": 1.2683744422549885, + "grad_norm": 0.885183572769165, + "learning_rate": 0.00011545339213435631, + "loss": 2.7897, + "step": 14000 + }, + { + "epoch": 1.2684650404294353, + "grad_norm": 0.9482781291007996, + "learning_rate": 0.00011544735093336556, + "loss": 2.7256, + "step": 14001 + }, + { + "epoch": 1.268555638603882, + "grad_norm": 0.8879468441009521, + "learning_rate": 0.00011544130973237479, + "loss": 2.7237, + "step": 14002 + }, + { + "epoch": 1.2686462367783289, + "grad_norm": 0.9498342871665955, + "learning_rate": 0.00011543526853138404, + "loss": 2.6651, + "step": 14003 + }, + { + "epoch": 1.2687368349527757, + "grad_norm": 0.8250385522842407, + "learning_rate": 0.0001154292273303933, + "loss": 1.8695, + "step": 14004 + }, + { + "epoch": 1.2688274331272225, + "grad_norm": 0.8791189789772034, + "learning_rate": 0.00011542318612940252, + "loss": 2.7257, + "step": 14005 + }, + { + "epoch": 1.2689180313016692, + "grad_norm": 0.8244776129722595, + "learning_rate": 0.00011541714492841178, + "loss": 1.8499, + "step": 14006 + }, + { + "epoch": 1.269008629476116, + "grad_norm": 0.9276573657989502, + "learning_rate": 0.000115411103727421, + "loss": 2.6267, + "step": 14007 + }, + { + "epoch": 1.2690992276505628, + "grad_norm": 0.812758207321167, + "learning_rate": 0.00011540506252643027, + "loss": 2.5244, + "step": 14008 + }, + { + "epoch": 1.2691898258250096, + "grad_norm": 0.8906194567680359, + "learning_rate": 0.0001153990213254395, + "loss": 3.0192, + "step": 14009 + }, + { + "epoch": 1.2692804239994564, + "grad_norm": 0.8776498436927795, + "learning_rate": 0.00011539298012444875, + "loss": 2.79, + "step": 14010 + }, + { + "epoch": 1.2693710221739032, + "grad_norm": 0.9475143551826477, + "learning_rate": 0.000115386938923458, + "loss": 2.6739, + "step": 14011 + }, + { + "epoch": 1.26946162034835, + "grad_norm": 0.8706595301628113, + "learning_rate": 0.00011538089772246723, + "loss": 2.679, + "step": 14012 + }, + { + "epoch": 1.2695522185227968, + "grad_norm": 0.7807923555374146, + "learning_rate": 0.00011537485652147648, + "loss": 2.1755, + "step": 14013 + }, + { + "epoch": 1.2696428166972435, + "grad_norm": 0.8859023451805115, + "learning_rate": 0.00011536881532048571, + "loss": 2.7869, + "step": 14014 + }, + { + "epoch": 1.2697334148716903, + "grad_norm": 1.0095210075378418, + "learning_rate": 0.00011536277411949497, + "loss": 2.521, + "step": 14015 + }, + { + "epoch": 1.2698240130461371, + "grad_norm": 0.9154870510101318, + "learning_rate": 0.00011535673291850419, + "loss": 2.7383, + "step": 14016 + }, + { + "epoch": 1.269914611220584, + "grad_norm": 0.9037215113639832, + "learning_rate": 0.00011535069171751345, + "loss": 2.6948, + "step": 14017 + }, + { + "epoch": 1.2700052093950307, + "grad_norm": 0.8380807638168335, + "learning_rate": 0.0001153446505165227, + "loss": 2.3627, + "step": 14018 + }, + { + "epoch": 1.2700958075694775, + "grad_norm": 0.9400677680969238, + "learning_rate": 0.00011533860931553193, + "loss": 2.5477, + "step": 14019 + }, + { + "epoch": 1.2701864057439243, + "grad_norm": 0.8722341060638428, + "learning_rate": 0.00011533256811454118, + "loss": 2.5108, + "step": 14020 + }, + { + "epoch": 1.270277003918371, + "grad_norm": 0.8858424425125122, + "learning_rate": 0.00011532652691355042, + "loss": 2.4879, + "step": 14021 + }, + { + "epoch": 1.2703676020928178, + "grad_norm": 0.8801668882369995, + "learning_rate": 0.00011532048571255966, + "loss": 2.5392, + "step": 14022 + }, + { + "epoch": 1.2704582002672646, + "grad_norm": 0.9309289455413818, + "learning_rate": 0.0001153144445115689, + "loss": 2.8721, + "step": 14023 + }, + { + "epoch": 1.2705487984417114, + "grad_norm": 0.885780394077301, + "learning_rate": 0.00011530840331057815, + "loss": 2.8008, + "step": 14024 + }, + { + "epoch": 1.2706393966161582, + "grad_norm": 0.8847570419311523, + "learning_rate": 0.00011530236210958738, + "loss": 2.0065, + "step": 14025 + }, + { + "epoch": 1.270729994790605, + "grad_norm": 0.8645356297492981, + "learning_rate": 0.00011529632090859663, + "loss": 2.5965, + "step": 14026 + }, + { + "epoch": 1.2708205929650518, + "grad_norm": 0.8691539168357849, + "learning_rate": 0.00011529027970760589, + "loss": 2.7069, + "step": 14027 + }, + { + "epoch": 1.2709111911394986, + "grad_norm": 0.9075314402580261, + "learning_rate": 0.00011528423850661512, + "loss": 2.7597, + "step": 14028 + }, + { + "epoch": 1.2710017893139454, + "grad_norm": 0.9191989302635193, + "learning_rate": 0.00011527819730562437, + "loss": 2.8074, + "step": 14029 + }, + { + "epoch": 1.2710923874883922, + "grad_norm": 0.9942866563796997, + "learning_rate": 0.0001152721561046336, + "loss": 2.6017, + "step": 14030 + }, + { + "epoch": 1.271182985662839, + "grad_norm": 0.8725181818008423, + "learning_rate": 0.00011526611490364285, + "loss": 2.5891, + "step": 14031 + }, + { + "epoch": 1.2712735838372857, + "grad_norm": 0.916723370552063, + "learning_rate": 0.00011526007370265209, + "loss": 2.4803, + "step": 14032 + }, + { + "epoch": 1.2713641820117325, + "grad_norm": 0.7598247528076172, + "learning_rate": 0.00011525403250166133, + "loss": 2.1486, + "step": 14033 + }, + { + "epoch": 1.2714547801861793, + "grad_norm": 0.8879695534706116, + "learning_rate": 0.0001152479913006706, + "loss": 2.6636, + "step": 14034 + }, + { + "epoch": 1.271545378360626, + "grad_norm": 0.8505950570106506, + "learning_rate": 0.00011524195009967982, + "loss": 2.8109, + "step": 14035 + }, + { + "epoch": 1.2716359765350729, + "grad_norm": 0.8888486623764038, + "learning_rate": 0.00011523590889868908, + "loss": 2.542, + "step": 14036 + }, + { + "epoch": 1.2717265747095197, + "grad_norm": 0.8962931036949158, + "learning_rate": 0.0001152298676976983, + "loss": 2.8018, + "step": 14037 + }, + { + "epoch": 1.2718171728839665, + "grad_norm": 0.8787301778793335, + "learning_rate": 0.00011522382649670756, + "loss": 2.6873, + "step": 14038 + }, + { + "epoch": 1.2719077710584132, + "grad_norm": 0.8770729899406433, + "learning_rate": 0.00011521778529571678, + "loss": 2.2297, + "step": 14039 + }, + { + "epoch": 1.27199836923286, + "grad_norm": 0.8835036754608154, + "learning_rate": 0.00011521174409472604, + "loss": 2.5798, + "step": 14040 + }, + { + "epoch": 1.2720889674073068, + "grad_norm": 0.8637056946754456, + "learning_rate": 0.00011520570289373529, + "loss": 2.6812, + "step": 14041 + }, + { + "epoch": 1.2721795655817536, + "grad_norm": 0.9378625750541687, + "learning_rate": 0.00011519966169274452, + "loss": 2.8466, + "step": 14042 + }, + { + "epoch": 1.2722701637562004, + "grad_norm": 0.913292646408081, + "learning_rate": 0.00011519362049175377, + "loss": 2.4555, + "step": 14043 + }, + { + "epoch": 1.2723607619306472, + "grad_norm": 0.8554812073707581, + "learning_rate": 0.000115187579290763, + "loss": 2.6754, + "step": 14044 + }, + { + "epoch": 1.272451360105094, + "grad_norm": 0.9193994402885437, + "learning_rate": 0.00011518153808977225, + "loss": 2.6931, + "step": 14045 + }, + { + "epoch": 1.2725419582795405, + "grad_norm": 0.8705118298530579, + "learning_rate": 0.00011517549688878148, + "loss": 2.7389, + "step": 14046 + }, + { + "epoch": 1.2726325564539875, + "grad_norm": 0.926697850227356, + "learning_rate": 0.00011516945568779075, + "loss": 2.9372, + "step": 14047 + }, + { + "epoch": 1.272723154628434, + "grad_norm": 0.8833777904510498, + "learning_rate": 0.0001151634144868, + "loss": 2.9149, + "step": 14048 + }, + { + "epoch": 1.2728137528028811, + "grad_norm": 0.9127588868141174, + "learning_rate": 0.00011515737328580923, + "loss": 2.7039, + "step": 14049 + }, + { + "epoch": 1.2729043509773277, + "grad_norm": 0.7620458602905273, + "learning_rate": 0.00011515133208481847, + "loss": 1.8792, + "step": 14050 + }, + { + "epoch": 1.2729949491517747, + "grad_norm": 0.9457992315292358, + "learning_rate": 0.00011514529088382771, + "loss": 2.6365, + "step": 14051 + }, + { + "epoch": 1.2730855473262213, + "grad_norm": 0.9517895579338074, + "learning_rate": 0.00011513924968283696, + "loss": 2.6084, + "step": 14052 + }, + { + "epoch": 1.2731761455006683, + "grad_norm": 0.9183372259140015, + "learning_rate": 0.00011513320848184619, + "loss": 2.5507, + "step": 14053 + }, + { + "epoch": 1.2732667436751148, + "grad_norm": 0.8866086602210999, + "learning_rate": 0.00011512716728085544, + "loss": 2.642, + "step": 14054 + }, + { + "epoch": 1.2733573418495618, + "grad_norm": 0.9093806743621826, + "learning_rate": 0.00011512112607986467, + "loss": 2.6229, + "step": 14055 + }, + { + "epoch": 1.2734479400240084, + "grad_norm": 0.8300230503082275, + "learning_rate": 0.00011511508487887392, + "loss": 2.6636, + "step": 14056 + }, + { + "epoch": 1.2735385381984554, + "grad_norm": 0.8493333458900452, + "learning_rate": 0.00011510904367788318, + "loss": 2.477, + "step": 14057 + }, + { + "epoch": 1.273629136372902, + "grad_norm": 0.9199255704879761, + "learning_rate": 0.0001151030024768924, + "loss": 2.6807, + "step": 14058 + }, + { + "epoch": 1.273719734547349, + "grad_norm": 0.8634240031242371, + "learning_rate": 0.00011509696127590166, + "loss": 2.8008, + "step": 14059 + }, + { + "epoch": 1.2738103327217956, + "grad_norm": 1.0087491273880005, + "learning_rate": 0.0001150909200749109, + "loss": 2.9032, + "step": 14060 + }, + { + "epoch": 1.2739009308962426, + "grad_norm": 0.9463327527046204, + "learning_rate": 0.00011508487887392014, + "loss": 2.5467, + "step": 14061 + }, + { + "epoch": 1.2739915290706891, + "grad_norm": 0.8254761099815369, + "learning_rate": 0.00011507883767292938, + "loss": 2.7258, + "step": 14062 + }, + { + "epoch": 1.2740821272451361, + "grad_norm": 0.8663907051086426, + "learning_rate": 0.00011507279647193863, + "loss": 2.6454, + "step": 14063 + }, + { + "epoch": 1.2741727254195827, + "grad_norm": 0.9753192663192749, + "learning_rate": 0.00011506675527094787, + "loss": 2.566, + "step": 14064 + }, + { + "epoch": 1.2742633235940297, + "grad_norm": 0.7986170053482056, + "learning_rate": 0.00011506071406995711, + "loss": 1.9991, + "step": 14065 + }, + { + "epoch": 1.2743539217684763, + "grad_norm": 0.7944823503494263, + "learning_rate": 0.00011505467286896637, + "loss": 1.9733, + "step": 14066 + }, + { + "epoch": 1.274444519942923, + "grad_norm": 0.7885774970054626, + "learning_rate": 0.00011504863166797559, + "loss": 2.1179, + "step": 14067 + }, + { + "epoch": 1.2745351181173699, + "grad_norm": 1.0074156522750854, + "learning_rate": 0.00011504259046698485, + "loss": 2.7349, + "step": 14068 + }, + { + "epoch": 1.2746257162918166, + "grad_norm": 0.9595509171485901, + "learning_rate": 0.00011503654926599407, + "loss": 3.1213, + "step": 14069 + }, + { + "epoch": 1.2747163144662634, + "grad_norm": 0.8800408244132996, + "learning_rate": 0.00011503050806500333, + "loss": 2.1691, + "step": 14070 + }, + { + "epoch": 1.2748069126407102, + "grad_norm": 0.9139654636383057, + "learning_rate": 0.00011502446686401258, + "loss": 2.3574, + "step": 14071 + }, + { + "epoch": 1.274897510815157, + "grad_norm": 0.9119341373443604, + "learning_rate": 0.00011501842566302181, + "loss": 2.752, + "step": 14072 + }, + { + "epoch": 1.2749881089896038, + "grad_norm": 1.012784481048584, + "learning_rate": 0.00011501238446203106, + "loss": 2.7369, + "step": 14073 + }, + { + "epoch": 1.2750787071640506, + "grad_norm": 1.1849640607833862, + "learning_rate": 0.0001150063432610403, + "loss": 2.6384, + "step": 14074 + }, + { + "epoch": 1.2751693053384974, + "grad_norm": 0.9643789529800415, + "learning_rate": 0.00011500030206004954, + "loss": 2.8069, + "step": 14075 + }, + { + "epoch": 1.2752599035129442, + "grad_norm": 0.7486335039138794, + "learning_rate": 0.00011499426085905878, + "loss": 1.9454, + "step": 14076 + }, + { + "epoch": 1.275350501687391, + "grad_norm": 0.8585030436515808, + "learning_rate": 0.00011498821965806802, + "loss": 2.394, + "step": 14077 + }, + { + "epoch": 1.2754410998618377, + "grad_norm": 0.842673122882843, + "learning_rate": 0.00011498217845707729, + "loss": 1.9082, + "step": 14078 + }, + { + "epoch": 1.2755316980362845, + "grad_norm": 0.8310146927833557, + "learning_rate": 0.00011497613725608652, + "loss": 2.7494, + "step": 14079 + }, + { + "epoch": 1.2756222962107313, + "grad_norm": 0.8788886666297913, + "learning_rate": 0.00011497009605509577, + "loss": 2.6455, + "step": 14080 + }, + { + "epoch": 1.275712894385178, + "grad_norm": 0.840686559677124, + "learning_rate": 0.000114964054854105, + "loss": 1.8825, + "step": 14081 + }, + { + "epoch": 1.2758034925596249, + "grad_norm": 0.8936050534248352, + "learning_rate": 0.00011495801365311425, + "loss": 2.712, + "step": 14082 + }, + { + "epoch": 1.2758940907340717, + "grad_norm": 0.9463229775428772, + "learning_rate": 0.00011495197245212348, + "loss": 2.9403, + "step": 14083 + }, + { + "epoch": 1.2759846889085185, + "grad_norm": 0.8814337253570557, + "learning_rate": 0.00011494593125113273, + "loss": 2.7913, + "step": 14084 + }, + { + "epoch": 1.2760752870829652, + "grad_norm": 0.7690864205360413, + "learning_rate": 0.00011493989005014196, + "loss": 1.9451, + "step": 14085 + }, + { + "epoch": 1.276165885257412, + "grad_norm": 0.9037293791770935, + "learning_rate": 0.00011493384884915121, + "loss": 2.913, + "step": 14086 + }, + { + "epoch": 1.2762564834318588, + "grad_norm": 0.9092139601707458, + "learning_rate": 0.00011492780764816047, + "loss": 2.6572, + "step": 14087 + }, + { + "epoch": 1.2763470816063056, + "grad_norm": 0.8857001066207886, + "learning_rate": 0.0001149217664471697, + "loss": 2.5722, + "step": 14088 + }, + { + "epoch": 1.2764376797807524, + "grad_norm": 0.9203624129295349, + "learning_rate": 0.00011491572524617896, + "loss": 2.4915, + "step": 14089 + }, + { + "epoch": 1.2765282779551992, + "grad_norm": 0.8556545376777649, + "learning_rate": 0.00011490968404518818, + "loss": 2.5972, + "step": 14090 + }, + { + "epoch": 1.276618876129646, + "grad_norm": 0.847294807434082, + "learning_rate": 0.00011490364284419744, + "loss": 2.5126, + "step": 14091 + }, + { + "epoch": 1.2767094743040928, + "grad_norm": 0.902136504650116, + "learning_rate": 0.00011489760164320667, + "loss": 2.4387, + "step": 14092 + }, + { + "epoch": 1.2768000724785395, + "grad_norm": 0.8818315863609314, + "learning_rate": 0.00011489156044221592, + "loss": 2.6567, + "step": 14093 + }, + { + "epoch": 1.2768906706529863, + "grad_norm": 0.9650582075119019, + "learning_rate": 0.00011488551924122517, + "loss": 2.9498, + "step": 14094 + }, + { + "epoch": 1.2769812688274331, + "grad_norm": 0.8763108849525452, + "learning_rate": 0.0001148794780402344, + "loss": 2.8611, + "step": 14095 + }, + { + "epoch": 1.27707186700188, + "grad_norm": 0.7820413708686829, + "learning_rate": 0.00011487343683924365, + "loss": 1.7703, + "step": 14096 + }, + { + "epoch": 1.2771624651763267, + "grad_norm": 0.8671559691429138, + "learning_rate": 0.00011486739563825288, + "loss": 2.6634, + "step": 14097 + }, + { + "epoch": 1.2772530633507735, + "grad_norm": 0.9270190596580505, + "learning_rate": 0.00011486135443726213, + "loss": 2.6388, + "step": 14098 + }, + { + "epoch": 1.2773436615252203, + "grad_norm": 0.8064213395118713, + "learning_rate": 0.00011485531323627136, + "loss": 1.8487, + "step": 14099 + }, + { + "epoch": 1.277434259699667, + "grad_norm": 0.9002907872200012, + "learning_rate": 0.00011484927203528062, + "loss": 2.3745, + "step": 14100 + }, + { + "epoch": 1.2775248578741139, + "grad_norm": 0.8690911531448364, + "learning_rate": 0.00011484323083428987, + "loss": 2.751, + "step": 14101 + }, + { + "epoch": 1.2776154560485606, + "grad_norm": 0.9621200561523438, + "learning_rate": 0.0001148371896332991, + "loss": 2.6208, + "step": 14102 + }, + { + "epoch": 1.2777060542230074, + "grad_norm": 0.8915932178497314, + "learning_rate": 0.00011483114843230835, + "loss": 2.9359, + "step": 14103 + }, + { + "epoch": 1.2777966523974542, + "grad_norm": 0.9627420902252197, + "learning_rate": 0.00011482510723131759, + "loss": 2.662, + "step": 14104 + }, + { + "epoch": 1.277887250571901, + "grad_norm": 0.9148938059806824, + "learning_rate": 0.00011481906603032684, + "loss": 2.8153, + "step": 14105 + }, + { + "epoch": 1.2779778487463478, + "grad_norm": 0.8840930461883545, + "learning_rate": 0.00011481302482933607, + "loss": 2.9061, + "step": 14106 + }, + { + "epoch": 1.2780684469207946, + "grad_norm": 0.9565235376358032, + "learning_rate": 0.00011480698362834532, + "loss": 2.4369, + "step": 14107 + }, + { + "epoch": 1.2781590450952414, + "grad_norm": 0.8722988963127136, + "learning_rate": 0.00011480094242735458, + "loss": 2.606, + "step": 14108 + }, + { + "epoch": 1.2782496432696882, + "grad_norm": 0.7941310405731201, + "learning_rate": 0.0001147949012263638, + "loss": 2.139, + "step": 14109 + }, + { + "epoch": 1.278340241444135, + "grad_norm": 0.8942198157310486, + "learning_rate": 0.00011478886002537306, + "loss": 2.7705, + "step": 14110 + }, + { + "epoch": 1.2784308396185817, + "grad_norm": 0.7521211504936218, + "learning_rate": 0.0001147828188243823, + "loss": 1.9833, + "step": 14111 + }, + { + "epoch": 1.2785214377930285, + "grad_norm": 0.8988014459609985, + "learning_rate": 0.00011477677762339154, + "loss": 2.7063, + "step": 14112 + }, + { + "epoch": 1.2786120359674753, + "grad_norm": 0.9389200806617737, + "learning_rate": 0.00011477073642240078, + "loss": 2.6564, + "step": 14113 + }, + { + "epoch": 1.278702634141922, + "grad_norm": 0.9361345171928406, + "learning_rate": 0.00011476469522141002, + "loss": 2.611, + "step": 14114 + }, + { + "epoch": 1.2787932323163689, + "grad_norm": 0.8898561596870422, + "learning_rate": 0.00011475865402041926, + "loss": 2.5398, + "step": 14115 + }, + { + "epoch": 1.2788838304908157, + "grad_norm": 0.7375013828277588, + "learning_rate": 0.0001147526128194285, + "loss": 1.8827, + "step": 14116 + }, + { + "epoch": 1.2789744286652625, + "grad_norm": 0.9431506991386414, + "learning_rate": 0.00011474657161843775, + "loss": 2.7043, + "step": 14117 + }, + { + "epoch": 1.2790650268397092, + "grad_norm": 0.8781595826148987, + "learning_rate": 0.00011474053041744699, + "loss": 2.4244, + "step": 14118 + }, + { + "epoch": 1.279155625014156, + "grad_norm": 0.9453206062316895, + "learning_rate": 0.00011473448921645625, + "loss": 3.0634, + "step": 14119 + }, + { + "epoch": 1.2792462231886028, + "grad_norm": 0.879636287689209, + "learning_rate": 0.00011472844801546547, + "loss": 2.7085, + "step": 14120 + }, + { + "epoch": 1.2793368213630496, + "grad_norm": 0.8800480961799622, + "learning_rate": 0.00011472240681447473, + "loss": 2.8512, + "step": 14121 + }, + { + "epoch": 1.2794274195374964, + "grad_norm": 0.9094306826591492, + "learning_rate": 0.00011471636561348395, + "loss": 2.5744, + "step": 14122 + }, + { + "epoch": 1.2795180177119432, + "grad_norm": 0.8632672429084778, + "learning_rate": 0.00011471032441249321, + "loss": 2.7402, + "step": 14123 + }, + { + "epoch": 1.27960861588639, + "grad_norm": 0.9052650928497314, + "learning_rate": 0.00011470428321150246, + "loss": 2.7604, + "step": 14124 + }, + { + "epoch": 1.2796992140608368, + "grad_norm": 0.7541826963424683, + "learning_rate": 0.00011469824201051169, + "loss": 1.8829, + "step": 14125 + }, + { + "epoch": 1.2797898122352835, + "grad_norm": 1.0916225910186768, + "learning_rate": 0.00011469220080952094, + "loss": 2.491, + "step": 14126 + }, + { + "epoch": 1.27988041040973, + "grad_norm": 0.8161256909370422, + "learning_rate": 0.00011468615960853017, + "loss": 2.1671, + "step": 14127 + }, + { + "epoch": 1.2799710085841771, + "grad_norm": 0.8655116558074951, + "learning_rate": 0.00011468011840753942, + "loss": 2.7524, + "step": 14128 + }, + { + "epoch": 1.2800616067586237, + "grad_norm": 0.8797135949134827, + "learning_rate": 0.00011467407720654866, + "loss": 2.7436, + "step": 14129 + }, + { + "epoch": 1.2801522049330707, + "grad_norm": 0.9165708422660828, + "learning_rate": 0.0001146680360055579, + "loss": 2.1224, + "step": 14130 + }, + { + "epoch": 1.2802428031075173, + "grad_norm": 0.9228053689002991, + "learning_rate": 0.00011466199480456717, + "loss": 2.5668, + "step": 14131 + }, + { + "epoch": 1.2803334012819643, + "grad_norm": 0.9372859597206116, + "learning_rate": 0.0001146559536035764, + "loss": 2.7308, + "step": 14132 + }, + { + "epoch": 1.2804239994564108, + "grad_norm": 0.9106300473213196, + "learning_rate": 0.00011464991240258565, + "loss": 2.8424, + "step": 14133 + }, + { + "epoch": 1.2805145976308578, + "grad_norm": 0.9766777157783508, + "learning_rate": 0.00011464387120159488, + "loss": 2.6378, + "step": 14134 + }, + { + "epoch": 1.2806051958053044, + "grad_norm": 0.9266110062599182, + "learning_rate": 0.00011463783000060413, + "loss": 2.7109, + "step": 14135 + }, + { + "epoch": 1.2806957939797514, + "grad_norm": 0.875864565372467, + "learning_rate": 0.00011463178879961336, + "loss": 2.5875, + "step": 14136 + }, + { + "epoch": 1.280786392154198, + "grad_norm": 0.9237201809883118, + "learning_rate": 0.00011462574759862261, + "loss": 2.8467, + "step": 14137 + }, + { + "epoch": 1.280876990328645, + "grad_norm": 0.8553819060325623, + "learning_rate": 0.00011461970639763187, + "loss": 2.7548, + "step": 14138 + }, + { + "epoch": 1.2809675885030916, + "grad_norm": 0.8872926235198975, + "learning_rate": 0.00011461366519664109, + "loss": 2.6782, + "step": 14139 + }, + { + "epoch": 1.2810581866775386, + "grad_norm": 0.8857077360153198, + "learning_rate": 0.00011460762399565035, + "loss": 2.7478, + "step": 14140 + }, + { + "epoch": 1.2811487848519851, + "grad_norm": 0.9080763459205627, + "learning_rate": 0.00011460158279465957, + "loss": 2.886, + "step": 14141 + }, + { + "epoch": 1.2812393830264321, + "grad_norm": 0.8603519797325134, + "learning_rate": 0.00011459554159366883, + "loss": 2.641, + "step": 14142 + }, + { + "epoch": 1.2813299812008787, + "grad_norm": 0.8852405548095703, + "learning_rate": 0.00011458950039267807, + "loss": 2.6176, + "step": 14143 + }, + { + "epoch": 1.2814205793753257, + "grad_norm": 0.8684738874435425, + "learning_rate": 0.00011458345919168732, + "loss": 2.4883, + "step": 14144 + }, + { + "epoch": 1.2815111775497723, + "grad_norm": 0.9469940066337585, + "learning_rate": 0.00011457741799069655, + "loss": 2.8765, + "step": 14145 + }, + { + "epoch": 1.2816017757242193, + "grad_norm": 0.8694462180137634, + "learning_rate": 0.0001145713767897058, + "loss": 2.7392, + "step": 14146 + }, + { + "epoch": 1.2816923738986659, + "grad_norm": 0.9405359625816345, + "learning_rate": 0.00011456533558871505, + "loss": 2.6839, + "step": 14147 + }, + { + "epoch": 1.2817829720731126, + "grad_norm": 0.8620887398719788, + "learning_rate": 0.00011455929438772428, + "loss": 2.5814, + "step": 14148 + }, + { + "epoch": 1.2818735702475594, + "grad_norm": 0.8949951529502869, + "learning_rate": 0.00011455325318673353, + "loss": 2.6253, + "step": 14149 + }, + { + "epoch": 1.2819641684220062, + "grad_norm": 0.9434047937393188, + "learning_rate": 0.00011454721198574276, + "loss": 2.8605, + "step": 14150 + }, + { + "epoch": 1.282054766596453, + "grad_norm": 0.8953028917312622, + "learning_rate": 0.00011454117078475202, + "loss": 2.7558, + "step": 14151 + }, + { + "epoch": 1.2821453647708998, + "grad_norm": 0.8783904314041138, + "learning_rate": 0.00011453512958376124, + "loss": 1.9964, + "step": 14152 + }, + { + "epoch": 1.2822359629453466, + "grad_norm": 0.8849269151687622, + "learning_rate": 0.0001145290883827705, + "loss": 2.9285, + "step": 14153 + }, + { + "epoch": 1.2823265611197934, + "grad_norm": 0.8931630849838257, + "learning_rate": 0.00011452304718177975, + "loss": 2.4708, + "step": 14154 + }, + { + "epoch": 1.2824171592942402, + "grad_norm": 0.781114399433136, + "learning_rate": 0.00011451700598078899, + "loss": 1.9073, + "step": 14155 + }, + { + "epoch": 1.282507757468687, + "grad_norm": 0.9157309532165527, + "learning_rate": 0.00011451096477979823, + "loss": 2.6183, + "step": 14156 + }, + { + "epoch": 1.2825983556431337, + "grad_norm": 0.8844940066337585, + "learning_rate": 0.00011450492357880747, + "loss": 2.6566, + "step": 14157 + }, + { + "epoch": 1.2826889538175805, + "grad_norm": 0.9777214527130127, + "learning_rate": 0.00011449888237781671, + "loss": 3.3101, + "step": 14158 + }, + { + "epoch": 1.2827795519920273, + "grad_norm": 0.7486431002616882, + "learning_rate": 0.00011449284117682595, + "loss": 1.8831, + "step": 14159 + }, + { + "epoch": 1.282870150166474, + "grad_norm": 0.9045810699462891, + "learning_rate": 0.0001144867999758352, + "loss": 2.6224, + "step": 14160 + }, + { + "epoch": 1.2829607483409209, + "grad_norm": 0.86388099193573, + "learning_rate": 0.00011448075877484446, + "loss": 2.5329, + "step": 14161 + }, + { + "epoch": 1.2830513465153677, + "grad_norm": 0.9768726229667664, + "learning_rate": 0.00011447471757385368, + "loss": 3.0413, + "step": 14162 + }, + { + "epoch": 1.2831419446898145, + "grad_norm": 0.8979764580726624, + "learning_rate": 0.00011446867637286294, + "loss": 2.7718, + "step": 14163 + }, + { + "epoch": 1.2832325428642612, + "grad_norm": 0.9113445281982422, + "learning_rate": 0.00011446263517187217, + "loss": 2.8073, + "step": 14164 + }, + { + "epoch": 1.283323141038708, + "grad_norm": 0.8571258783340454, + "learning_rate": 0.00011445659397088142, + "loss": 2.5998, + "step": 14165 + }, + { + "epoch": 1.2834137392131548, + "grad_norm": 0.8760800957679749, + "learning_rate": 0.00011445055276989066, + "loss": 2.5484, + "step": 14166 + }, + { + "epoch": 1.2835043373876016, + "grad_norm": 0.916314423084259, + "learning_rate": 0.0001144445115688999, + "loss": 2.7473, + "step": 14167 + }, + { + "epoch": 1.2835949355620484, + "grad_norm": 0.8209319710731506, + "learning_rate": 0.00011443847036790915, + "loss": 2.5052, + "step": 14168 + }, + { + "epoch": 1.2836855337364952, + "grad_norm": 0.8311260342597961, + "learning_rate": 0.00011443242916691838, + "loss": 2.6514, + "step": 14169 + }, + { + "epoch": 1.283776131910942, + "grad_norm": 0.8852003812789917, + "learning_rate": 0.00011442638796592765, + "loss": 2.6589, + "step": 14170 + }, + { + "epoch": 1.2838667300853888, + "grad_norm": 0.9299015998840332, + "learning_rate": 0.00011442034676493687, + "loss": 2.8669, + "step": 14171 + }, + { + "epoch": 1.2839573282598356, + "grad_norm": 0.8721514344215393, + "learning_rate": 0.00011441430556394613, + "loss": 2.3291, + "step": 14172 + }, + { + "epoch": 1.2840479264342823, + "grad_norm": 0.9540939331054688, + "learning_rate": 0.00011440826436295535, + "loss": 2.8355, + "step": 14173 + }, + { + "epoch": 1.2841385246087291, + "grad_norm": 0.9364213943481445, + "learning_rate": 0.00011440222316196461, + "loss": 2.7645, + "step": 14174 + }, + { + "epoch": 1.284229122783176, + "grad_norm": 0.9611315131187439, + "learning_rate": 0.00011439618196097383, + "loss": 3.0774, + "step": 14175 + }, + { + "epoch": 1.2843197209576227, + "grad_norm": 0.8849777579307556, + "learning_rate": 0.00011439014075998309, + "loss": 2.8408, + "step": 14176 + }, + { + "epoch": 1.2844103191320695, + "grad_norm": 0.8464404940605164, + "learning_rate": 0.00011438409955899234, + "loss": 2.5059, + "step": 14177 + }, + { + "epoch": 1.2845009173065163, + "grad_norm": 0.9548108577728271, + "learning_rate": 0.00011437805835800157, + "loss": 2.6509, + "step": 14178 + }, + { + "epoch": 1.284591515480963, + "grad_norm": 0.8906554579734802, + "learning_rate": 0.00011437201715701082, + "loss": 2.7007, + "step": 14179 + }, + { + "epoch": 1.2846821136554099, + "grad_norm": 0.9063049554824829, + "learning_rate": 0.00011436597595602005, + "loss": 2.8946, + "step": 14180 + }, + { + "epoch": 1.2847727118298566, + "grad_norm": 0.9740939140319824, + "learning_rate": 0.0001143599347550293, + "loss": 2.9414, + "step": 14181 + }, + { + "epoch": 1.2848633100043034, + "grad_norm": 0.9029611945152283, + "learning_rate": 0.00011435389355403854, + "loss": 2.6132, + "step": 14182 + }, + { + "epoch": 1.2849539081787502, + "grad_norm": 0.7758611440658569, + "learning_rate": 0.0001143478523530478, + "loss": 2.1775, + "step": 14183 + }, + { + "epoch": 1.285044506353197, + "grad_norm": 0.8508617281913757, + "learning_rate": 0.00011434181115205704, + "loss": 2.6206, + "step": 14184 + }, + { + "epoch": 1.2851351045276438, + "grad_norm": 0.9560179710388184, + "learning_rate": 0.00011433576995106628, + "loss": 3.0069, + "step": 14185 + }, + { + "epoch": 1.2852257027020906, + "grad_norm": 1.0572818517684937, + "learning_rate": 0.00011432972875007553, + "loss": 2.9558, + "step": 14186 + }, + { + "epoch": 1.2853163008765374, + "grad_norm": 0.8636766672134399, + "learning_rate": 0.00011432368754908476, + "loss": 2.8348, + "step": 14187 + }, + { + "epoch": 1.2854068990509842, + "grad_norm": 0.9150622487068176, + "learning_rate": 0.00011431764634809401, + "loss": 2.7915, + "step": 14188 + }, + { + "epoch": 1.285497497225431, + "grad_norm": 0.9056596755981445, + "learning_rate": 0.00011431160514710324, + "loss": 2.4248, + "step": 14189 + }, + { + "epoch": 1.2855880953998777, + "grad_norm": 0.7571642994880676, + "learning_rate": 0.00011430556394611249, + "loss": 1.9483, + "step": 14190 + }, + { + "epoch": 1.2856786935743245, + "grad_norm": 0.8690035939216614, + "learning_rate": 0.00011429952274512175, + "loss": 2.4517, + "step": 14191 + }, + { + "epoch": 1.2857692917487713, + "grad_norm": 0.8870172500610352, + "learning_rate": 0.00011429348154413097, + "loss": 2.7378, + "step": 14192 + }, + { + "epoch": 1.285859889923218, + "grad_norm": 0.822556734085083, + "learning_rate": 0.00011428744034314023, + "loss": 2.6675, + "step": 14193 + }, + { + "epoch": 1.2859504880976649, + "grad_norm": 0.8969274759292603, + "learning_rate": 0.00011428139914214945, + "loss": 2.8874, + "step": 14194 + }, + { + "epoch": 1.2860410862721117, + "grad_norm": 0.8987367749214172, + "learning_rate": 0.00011427535794115871, + "loss": 2.6385, + "step": 14195 + }, + { + "epoch": 1.2861316844465585, + "grad_norm": 0.74982750415802, + "learning_rate": 0.00011426931674016795, + "loss": 2.0511, + "step": 14196 + }, + { + "epoch": 1.2862222826210052, + "grad_norm": 0.946736752986908, + "learning_rate": 0.0001142632755391772, + "loss": 2.8642, + "step": 14197 + }, + { + "epoch": 1.286312880795452, + "grad_norm": 0.8469034433364868, + "learning_rate": 0.00011425723433818644, + "loss": 2.6179, + "step": 14198 + }, + { + "epoch": 1.2864034789698988, + "grad_norm": 0.9017081260681152, + "learning_rate": 0.00011425119313719568, + "loss": 2.8442, + "step": 14199 + }, + { + "epoch": 1.2864940771443456, + "grad_norm": 0.9473568201065063, + "learning_rate": 0.00011424515193620492, + "loss": 2.5666, + "step": 14200 + }, + { + "epoch": 1.2865846753187924, + "grad_norm": 0.8258535861968994, + "learning_rate": 0.00011423911073521416, + "loss": 2.0519, + "step": 14201 + }, + { + "epoch": 1.2866752734932392, + "grad_norm": 0.8943939805030823, + "learning_rate": 0.00011423306953422342, + "loss": 2.6168, + "step": 14202 + }, + { + "epoch": 1.286765871667686, + "grad_norm": 0.8479998111724854, + "learning_rate": 0.00011422702833323264, + "loss": 2.8942, + "step": 14203 + }, + { + "epoch": 1.2868564698421328, + "grad_norm": 0.8616847991943359, + "learning_rate": 0.0001142209871322419, + "loss": 2.7552, + "step": 14204 + }, + { + "epoch": 1.2869470680165795, + "grad_norm": 1.0065027475357056, + "learning_rate": 0.00011421494593125112, + "loss": 2.8808, + "step": 14205 + }, + { + "epoch": 1.2870376661910263, + "grad_norm": 0.8534553050994873, + "learning_rate": 0.00011420890473026038, + "loss": 2.7616, + "step": 14206 + }, + { + "epoch": 1.2871282643654731, + "grad_norm": 0.8589215874671936, + "learning_rate": 0.00011420286352926963, + "loss": 2.7533, + "step": 14207 + }, + { + "epoch": 1.2872188625399197, + "grad_norm": 0.9124715328216553, + "learning_rate": 0.00011419682232827886, + "loss": 2.7194, + "step": 14208 + }, + { + "epoch": 1.2873094607143667, + "grad_norm": 0.9236128926277161, + "learning_rate": 0.00011419078112728811, + "loss": 2.7307, + "step": 14209 + }, + { + "epoch": 1.2874000588888133, + "grad_norm": 0.8881348967552185, + "learning_rate": 0.00011418473992629735, + "loss": 2.8589, + "step": 14210 + }, + { + "epoch": 1.2874906570632603, + "grad_norm": 0.7193818688392639, + "learning_rate": 0.0001141786987253066, + "loss": 1.886, + "step": 14211 + }, + { + "epoch": 1.2875812552377068, + "grad_norm": 0.8495618104934692, + "learning_rate": 0.00011417265752431583, + "loss": 1.8823, + "step": 14212 + }, + { + "epoch": 1.2876718534121538, + "grad_norm": 0.9473788738250732, + "learning_rate": 0.00011416661632332508, + "loss": 2.5891, + "step": 14213 + }, + { + "epoch": 1.2877624515866004, + "grad_norm": 0.8277288675308228, + "learning_rate": 0.00011416057512233434, + "loss": 2.231, + "step": 14214 + }, + { + "epoch": 1.2878530497610474, + "grad_norm": 0.8552457094192505, + "learning_rate": 0.00011415453392134357, + "loss": 2.6503, + "step": 14215 + }, + { + "epoch": 1.287943647935494, + "grad_norm": 0.8821120858192444, + "learning_rate": 0.00011414849272035282, + "loss": 2.6854, + "step": 14216 + }, + { + "epoch": 1.288034246109941, + "grad_norm": 0.8876999020576477, + "learning_rate": 0.00011414245151936205, + "loss": 2.7404, + "step": 14217 + }, + { + "epoch": 1.2881248442843876, + "grad_norm": 0.7934743165969849, + "learning_rate": 0.0001141364103183713, + "loss": 2.0966, + "step": 14218 + }, + { + "epoch": 1.2882154424588346, + "grad_norm": 0.9545251727104187, + "learning_rate": 0.00011413036911738053, + "loss": 2.6956, + "step": 14219 + }, + { + "epoch": 1.2883060406332811, + "grad_norm": 0.8821830749511719, + "learning_rate": 0.00011412432791638978, + "loss": 2.7911, + "step": 14220 + }, + { + "epoch": 1.2883966388077281, + "grad_norm": 0.9513311386108398, + "learning_rate": 0.00011411828671539904, + "loss": 2.8463, + "step": 14221 + }, + { + "epoch": 1.2884872369821747, + "grad_norm": 0.8842543363571167, + "learning_rate": 0.00011411224551440826, + "loss": 2.6843, + "step": 14222 + }, + { + "epoch": 1.2885778351566217, + "grad_norm": 0.8540067076683044, + "learning_rate": 0.00011410620431341752, + "loss": 2.5138, + "step": 14223 + }, + { + "epoch": 1.2886684333310683, + "grad_norm": 0.9455700516700745, + "learning_rate": 0.00011410016311242675, + "loss": 2.8156, + "step": 14224 + }, + { + "epoch": 1.2887590315055153, + "grad_norm": 0.8828111290931702, + "learning_rate": 0.000114094121911436, + "loss": 2.3208, + "step": 14225 + }, + { + "epoch": 1.2888496296799619, + "grad_norm": 0.8186265826225281, + "learning_rate": 0.00011408808071044523, + "loss": 2.0082, + "step": 14226 + }, + { + "epoch": 1.2889402278544089, + "grad_norm": 0.8349612951278687, + "learning_rate": 0.00011408203950945449, + "loss": 2.5889, + "step": 14227 + }, + { + "epoch": 1.2890308260288554, + "grad_norm": 1.0181607007980347, + "learning_rate": 0.00011407599830846374, + "loss": 2.7736, + "step": 14228 + }, + { + "epoch": 1.2891214242033022, + "grad_norm": 0.9652703404426575, + "learning_rate": 0.00011406995710747297, + "loss": 2.695, + "step": 14229 + }, + { + "epoch": 1.289212022377749, + "grad_norm": 0.9140652418136597, + "learning_rate": 0.00011406391590648222, + "loss": 2.7487, + "step": 14230 + }, + { + "epoch": 1.2893026205521958, + "grad_norm": 0.7703216671943665, + "learning_rate": 0.00011405787470549145, + "loss": 1.9732, + "step": 14231 + }, + { + "epoch": 1.2893932187266426, + "grad_norm": 1.0025206804275513, + "learning_rate": 0.0001140518335045007, + "loss": 2.8053, + "step": 14232 + }, + { + "epoch": 1.2894838169010894, + "grad_norm": 0.8662354350090027, + "learning_rate": 0.00011404579230350993, + "loss": 2.7346, + "step": 14233 + }, + { + "epoch": 1.2895744150755362, + "grad_norm": 0.8334736227989197, + "learning_rate": 0.0001140397511025192, + "loss": 2.679, + "step": 14234 + }, + { + "epoch": 1.289665013249983, + "grad_norm": 0.8760561943054199, + "learning_rate": 0.00011403370990152841, + "loss": 2.5866, + "step": 14235 + }, + { + "epoch": 1.2897556114244297, + "grad_norm": 0.9201163649559021, + "learning_rate": 0.00011402766870053768, + "loss": 2.7784, + "step": 14236 + }, + { + "epoch": 1.2898462095988765, + "grad_norm": 0.8862758874893188, + "learning_rate": 0.00011402162749954692, + "loss": 2.7535, + "step": 14237 + }, + { + "epoch": 1.2899368077733233, + "grad_norm": 0.9339302182197571, + "learning_rate": 0.00011401558629855616, + "loss": 2.7219, + "step": 14238 + }, + { + "epoch": 1.29002740594777, + "grad_norm": 0.8915108442306519, + "learning_rate": 0.0001140095450975654, + "loss": 2.7895, + "step": 14239 + }, + { + "epoch": 1.2901180041222169, + "grad_norm": 0.9230952858924866, + "learning_rate": 0.00011400350389657464, + "loss": 2.5672, + "step": 14240 + }, + { + "epoch": 1.2902086022966637, + "grad_norm": 0.8645039200782776, + "learning_rate": 0.00011399746269558389, + "loss": 2.9838, + "step": 14241 + }, + { + "epoch": 1.2902992004711105, + "grad_norm": 0.7567187547683716, + "learning_rate": 0.00011399142149459312, + "loss": 2.4385, + "step": 14242 + }, + { + "epoch": 1.2903897986455573, + "grad_norm": 0.9928133487701416, + "learning_rate": 0.00011398538029360237, + "loss": 2.5685, + "step": 14243 + }, + { + "epoch": 1.290480396820004, + "grad_norm": 0.8473796844482422, + "learning_rate": 0.00011397933909261163, + "loss": 2.5235, + "step": 14244 + }, + { + "epoch": 1.2905709949944508, + "grad_norm": 0.8749201893806458, + "learning_rate": 0.00011397329789162085, + "loss": 2.6562, + "step": 14245 + }, + { + "epoch": 1.2906615931688976, + "grad_norm": 0.9203471541404724, + "learning_rate": 0.00011396725669063011, + "loss": 2.9132, + "step": 14246 + }, + { + "epoch": 1.2907521913433444, + "grad_norm": 0.7725811004638672, + "learning_rate": 0.00011396121548963935, + "loss": 1.9959, + "step": 14247 + }, + { + "epoch": 1.2908427895177912, + "grad_norm": 0.8651301264762878, + "learning_rate": 0.00011395517428864859, + "loss": 2.6846, + "step": 14248 + }, + { + "epoch": 1.290933387692238, + "grad_norm": 0.7562207579612732, + "learning_rate": 0.00011394913308765783, + "loss": 1.9018, + "step": 14249 + }, + { + "epoch": 1.2910239858666848, + "grad_norm": 0.9331405758857727, + "learning_rate": 0.00011394309188666707, + "loss": 2.5965, + "step": 14250 + }, + { + "epoch": 1.2911145840411316, + "grad_norm": 0.8358614444732666, + "learning_rate": 0.00011393705068567632, + "loss": 2.46, + "step": 14251 + }, + { + "epoch": 1.2912051822155783, + "grad_norm": 0.9170946478843689, + "learning_rate": 0.00011393100948468556, + "loss": 2.465, + "step": 14252 + }, + { + "epoch": 1.2912957803900251, + "grad_norm": 0.8704497218132019, + "learning_rate": 0.00011392496828369482, + "loss": 2.7343, + "step": 14253 + }, + { + "epoch": 1.291386378564472, + "grad_norm": 0.9503849148750305, + "learning_rate": 0.00011391892708270404, + "loss": 2.8026, + "step": 14254 + }, + { + "epoch": 1.2914769767389187, + "grad_norm": 0.8502583503723145, + "learning_rate": 0.0001139128858817133, + "loss": 2.7627, + "step": 14255 + }, + { + "epoch": 1.2915675749133655, + "grad_norm": 0.9027444124221802, + "learning_rate": 0.00011390684468072252, + "loss": 2.6707, + "step": 14256 + }, + { + "epoch": 1.2916581730878123, + "grad_norm": 0.9189189672470093, + "learning_rate": 0.00011390080347973178, + "loss": 2.6843, + "step": 14257 + }, + { + "epoch": 1.291748771262259, + "grad_norm": 0.7852887511253357, + "learning_rate": 0.00011389476227874103, + "loss": 1.9989, + "step": 14258 + }, + { + "epoch": 1.2918393694367059, + "grad_norm": 0.8720445036888123, + "learning_rate": 0.00011388872107775026, + "loss": 2.8348, + "step": 14259 + }, + { + "epoch": 1.2919299676111526, + "grad_norm": 0.936221182346344, + "learning_rate": 0.00011388267987675951, + "loss": 2.7679, + "step": 14260 + }, + { + "epoch": 1.2920205657855994, + "grad_norm": 0.8610431551933289, + "learning_rate": 0.00011387663867576874, + "loss": 2.7866, + "step": 14261 + }, + { + "epoch": 1.2921111639600462, + "grad_norm": 0.8858740329742432, + "learning_rate": 0.00011387059747477799, + "loss": 2.8425, + "step": 14262 + }, + { + "epoch": 1.292201762134493, + "grad_norm": 0.8682137131690979, + "learning_rate": 0.00011386455627378723, + "loss": 2.5424, + "step": 14263 + }, + { + "epoch": 1.2922923603089398, + "grad_norm": 0.874586284160614, + "learning_rate": 0.00011385851507279647, + "loss": 2.8451, + "step": 14264 + }, + { + "epoch": 1.2923829584833866, + "grad_norm": 0.8956544399261475, + "learning_rate": 0.00011385247387180571, + "loss": 2.6663, + "step": 14265 + }, + { + "epoch": 1.2924735566578334, + "grad_norm": 0.8358946442604065, + "learning_rate": 0.00011384643267081497, + "loss": 1.9537, + "step": 14266 + }, + { + "epoch": 1.2925641548322802, + "grad_norm": 0.8570829033851624, + "learning_rate": 0.00011384039146982422, + "loss": 2.412, + "step": 14267 + }, + { + "epoch": 1.292654753006727, + "grad_norm": 0.9219799041748047, + "learning_rate": 0.00011383435026883345, + "loss": 2.7911, + "step": 14268 + }, + { + "epoch": 1.2927453511811737, + "grad_norm": 0.9088215231895447, + "learning_rate": 0.0001138283090678427, + "loss": 2.4916, + "step": 14269 + }, + { + "epoch": 1.2928359493556205, + "grad_norm": 0.961423933506012, + "learning_rate": 0.00011382226786685193, + "loss": 2.5847, + "step": 14270 + }, + { + "epoch": 1.2929265475300673, + "grad_norm": 0.8771495819091797, + "learning_rate": 0.00011381622666586118, + "loss": 2.6602, + "step": 14271 + }, + { + "epoch": 1.293017145704514, + "grad_norm": 1.0020523071289062, + "learning_rate": 0.00011381018546487041, + "loss": 2.8705, + "step": 14272 + }, + { + "epoch": 1.2931077438789609, + "grad_norm": 0.9321510791778564, + "learning_rate": 0.00011380414426387966, + "loss": 2.8765, + "step": 14273 + }, + { + "epoch": 1.2931983420534077, + "grad_norm": 0.8592119812965393, + "learning_rate": 0.00011379810306288892, + "loss": 2.4132, + "step": 14274 + }, + { + "epoch": 1.2932889402278545, + "grad_norm": 0.9322753548622131, + "learning_rate": 0.00011379206186189814, + "loss": 3.0446, + "step": 14275 + }, + { + "epoch": 1.2933795384023012, + "grad_norm": 0.9117279052734375, + "learning_rate": 0.0001137860206609074, + "loss": 2.8799, + "step": 14276 + }, + { + "epoch": 1.293470136576748, + "grad_norm": 0.9282628893852234, + "learning_rate": 0.00011377997945991662, + "loss": 2.7327, + "step": 14277 + }, + { + "epoch": 1.2935607347511948, + "grad_norm": 0.9044782519340515, + "learning_rate": 0.00011377393825892589, + "loss": 2.7157, + "step": 14278 + }, + { + "epoch": 1.2936513329256416, + "grad_norm": 0.8783572316169739, + "learning_rate": 0.00011376789705793512, + "loss": 2.5682, + "step": 14279 + }, + { + "epoch": 1.2937419311000884, + "grad_norm": 0.8912427425384521, + "learning_rate": 0.00011376185585694437, + "loss": 2.883, + "step": 14280 + }, + { + "epoch": 1.2938325292745352, + "grad_norm": 0.8308584690093994, + "learning_rate": 0.00011375581465595361, + "loss": 2.0219, + "step": 14281 + }, + { + "epoch": 1.293923127448982, + "grad_norm": 0.886709988117218, + "learning_rate": 0.00011374977345496285, + "loss": 2.3842, + "step": 14282 + }, + { + "epoch": 1.2940137256234288, + "grad_norm": 0.8969421982765198, + "learning_rate": 0.0001137437322539721, + "loss": 2.7353, + "step": 14283 + }, + { + "epoch": 1.2941043237978755, + "grad_norm": 0.9013500809669495, + "learning_rate": 0.00011373769105298133, + "loss": 2.8548, + "step": 14284 + }, + { + "epoch": 1.2941949219723223, + "grad_norm": 0.8355393409729004, + "learning_rate": 0.00011373164985199058, + "loss": 2.0674, + "step": 14285 + }, + { + "epoch": 1.2942855201467691, + "grad_norm": 0.8864967226982117, + "learning_rate": 0.00011372560865099981, + "loss": 2.4277, + "step": 14286 + }, + { + "epoch": 1.294376118321216, + "grad_norm": 0.9432279467582703, + "learning_rate": 0.00011371956745000907, + "loss": 2.8485, + "step": 14287 + }, + { + "epoch": 1.2944667164956627, + "grad_norm": 0.9327329397201538, + "learning_rate": 0.00011371352624901832, + "loss": 2.3885, + "step": 14288 + }, + { + "epoch": 1.2945573146701093, + "grad_norm": 0.8470084071159363, + "learning_rate": 0.00011370748504802755, + "loss": 2.5648, + "step": 14289 + }, + { + "epoch": 1.2946479128445563, + "grad_norm": 0.9337359070777893, + "learning_rate": 0.0001137014438470368, + "loss": 2.8343, + "step": 14290 + }, + { + "epoch": 1.2947385110190028, + "grad_norm": 0.9464473724365234, + "learning_rate": 0.00011369540264604604, + "loss": 2.7313, + "step": 14291 + }, + { + "epoch": 1.2948291091934498, + "grad_norm": 0.8396269083023071, + "learning_rate": 0.00011368936144505528, + "loss": 2.7361, + "step": 14292 + }, + { + "epoch": 1.2949197073678964, + "grad_norm": 0.9309232831001282, + "learning_rate": 0.00011368332024406452, + "loss": 2.5706, + "step": 14293 + }, + { + "epoch": 1.2950103055423434, + "grad_norm": 0.9589826464653015, + "learning_rate": 0.00011367727904307377, + "loss": 2.3987, + "step": 14294 + }, + { + "epoch": 1.29510090371679, + "grad_norm": 0.9462398290634155, + "learning_rate": 0.00011367123784208303, + "loss": 2.7905, + "step": 14295 + }, + { + "epoch": 1.295191501891237, + "grad_norm": 0.8753448724746704, + "learning_rate": 0.00011366519664109225, + "loss": 2.7173, + "step": 14296 + }, + { + "epoch": 1.2952821000656836, + "grad_norm": 0.8089110851287842, + "learning_rate": 0.00011365915544010151, + "loss": 2.5521, + "step": 14297 + }, + { + "epoch": 1.2953726982401306, + "grad_norm": 0.8973471522331238, + "learning_rate": 0.00011365311423911074, + "loss": 2.7441, + "step": 14298 + }, + { + "epoch": 1.2954632964145771, + "grad_norm": 0.8996912240982056, + "learning_rate": 0.00011364707303811999, + "loss": 2.877, + "step": 14299 + }, + { + "epoch": 1.2955538945890241, + "grad_norm": 0.9701794385910034, + "learning_rate": 0.00011364103183712922, + "loss": 2.6272, + "step": 14300 + }, + { + "epoch": 1.2956444927634707, + "grad_norm": 0.9875121116638184, + "learning_rate": 0.00011363499063613847, + "loss": 2.7583, + "step": 14301 + }, + { + "epoch": 1.2957350909379177, + "grad_norm": 0.8711939454078674, + "learning_rate": 0.0001136289494351477, + "loss": 2.1632, + "step": 14302 + }, + { + "epoch": 1.2958256891123643, + "grad_norm": 0.7595044374465942, + "learning_rate": 0.00011362290823415695, + "loss": 2.0423, + "step": 14303 + }, + { + "epoch": 1.2959162872868113, + "grad_norm": 0.8814851641654968, + "learning_rate": 0.0001136168670331662, + "loss": 2.8542, + "step": 14304 + }, + { + "epoch": 1.2960068854612579, + "grad_norm": 0.9438799023628235, + "learning_rate": 0.00011361082583217544, + "loss": 2.7466, + "step": 14305 + }, + { + "epoch": 1.2960974836357049, + "grad_norm": 0.98140549659729, + "learning_rate": 0.0001136047846311847, + "loss": 2.7652, + "step": 14306 + }, + { + "epoch": 1.2961880818101514, + "grad_norm": 0.9542624354362488, + "learning_rate": 0.00011359874343019392, + "loss": 2.6099, + "step": 14307 + }, + { + "epoch": 1.2962786799845984, + "grad_norm": 0.9278813600540161, + "learning_rate": 0.00011359270222920318, + "loss": 2.7092, + "step": 14308 + }, + { + "epoch": 1.296369278159045, + "grad_norm": 0.8830441236495972, + "learning_rate": 0.0001135866610282124, + "loss": 2.7038, + "step": 14309 + }, + { + "epoch": 1.2964598763334918, + "grad_norm": 0.9329531788825989, + "learning_rate": 0.00011358061982722166, + "loss": 2.7521, + "step": 14310 + }, + { + "epoch": 1.2965504745079386, + "grad_norm": 0.8728445768356323, + "learning_rate": 0.00011357457862623091, + "loss": 2.6449, + "step": 14311 + }, + { + "epoch": 1.2966410726823854, + "grad_norm": 0.9195268154144287, + "learning_rate": 0.00011356853742524014, + "loss": 2.7932, + "step": 14312 + }, + { + "epoch": 1.2967316708568322, + "grad_norm": 0.74526447057724, + "learning_rate": 0.00011356249622424939, + "loss": 2.0622, + "step": 14313 + }, + { + "epoch": 1.296822269031279, + "grad_norm": 0.9132276177406311, + "learning_rate": 0.00011355645502325862, + "loss": 2.6569, + "step": 14314 + }, + { + "epoch": 1.2969128672057257, + "grad_norm": 0.8358798623085022, + "learning_rate": 0.00011355041382226787, + "loss": 2.579, + "step": 14315 + }, + { + "epoch": 1.2970034653801725, + "grad_norm": 0.9148034453392029, + "learning_rate": 0.0001135443726212771, + "loss": 2.7806, + "step": 14316 + }, + { + "epoch": 1.2970940635546193, + "grad_norm": 0.8019387722015381, + "learning_rate": 0.00011353833142028635, + "loss": 2.0328, + "step": 14317 + }, + { + "epoch": 1.297184661729066, + "grad_norm": 0.8764751553535461, + "learning_rate": 0.00011353229021929561, + "loss": 2.5043, + "step": 14318 + }, + { + "epoch": 1.297275259903513, + "grad_norm": 0.8713202476501465, + "learning_rate": 0.00011352624901830485, + "loss": 2.682, + "step": 14319 + }, + { + "epoch": 1.2973658580779597, + "grad_norm": 0.9544333815574646, + "learning_rate": 0.0001135202078173141, + "loss": 3.0153, + "step": 14320 + }, + { + "epoch": 1.2974564562524065, + "grad_norm": 0.9024598598480225, + "learning_rate": 0.00011351416661632333, + "loss": 2.5634, + "step": 14321 + }, + { + "epoch": 1.2975470544268533, + "grad_norm": 0.981759250164032, + "learning_rate": 0.00011350812541533258, + "loss": 2.6669, + "step": 14322 + }, + { + "epoch": 1.2976376526013, + "grad_norm": 0.8853939175605774, + "learning_rate": 0.00011350208421434181, + "loss": 2.4833, + "step": 14323 + }, + { + "epoch": 1.2977282507757468, + "grad_norm": 0.9284805655479431, + "learning_rate": 0.00011349604301335106, + "loss": 2.6158, + "step": 14324 + }, + { + "epoch": 1.2978188489501936, + "grad_norm": 0.9325213432312012, + "learning_rate": 0.00011349000181236032, + "loss": 2.5798, + "step": 14325 + }, + { + "epoch": 1.2979094471246404, + "grad_norm": 0.8102200627326965, + "learning_rate": 0.00011348396061136954, + "loss": 2.0277, + "step": 14326 + }, + { + "epoch": 1.2980000452990872, + "grad_norm": 0.8609589338302612, + "learning_rate": 0.0001134779194103788, + "loss": 2.6886, + "step": 14327 + }, + { + "epoch": 1.298090643473534, + "grad_norm": 0.8921110033988953, + "learning_rate": 0.00011347187820938802, + "loss": 2.7354, + "step": 14328 + }, + { + "epoch": 1.2981812416479808, + "grad_norm": 0.9331024885177612, + "learning_rate": 0.00011346583700839728, + "loss": 2.9531, + "step": 14329 + }, + { + "epoch": 1.2982718398224276, + "grad_norm": 1.0305876731872559, + "learning_rate": 0.00011345979580740652, + "loss": 2.8705, + "step": 14330 + }, + { + "epoch": 1.2983624379968743, + "grad_norm": 0.9420129060745239, + "learning_rate": 0.00011345375460641576, + "loss": 2.4721, + "step": 14331 + }, + { + "epoch": 1.2984530361713211, + "grad_norm": 0.9270628094673157, + "learning_rate": 0.000113447713405425, + "loss": 2.6588, + "step": 14332 + }, + { + "epoch": 1.298543634345768, + "grad_norm": 0.8681979179382324, + "learning_rate": 0.00011344167220443425, + "loss": 2.8584, + "step": 14333 + }, + { + "epoch": 1.2986342325202147, + "grad_norm": 0.9917831420898438, + "learning_rate": 0.0001134356310034435, + "loss": 2.9565, + "step": 14334 + }, + { + "epoch": 1.2987248306946615, + "grad_norm": 0.9300901889801025, + "learning_rate": 0.00011342958980245273, + "loss": 2.8597, + "step": 14335 + }, + { + "epoch": 1.2988154288691083, + "grad_norm": 0.8164933919906616, + "learning_rate": 0.00011342354860146198, + "loss": 2.3679, + "step": 14336 + }, + { + "epoch": 1.298906027043555, + "grad_norm": 0.8595083355903625, + "learning_rate": 0.00011341750740047121, + "loss": 2.3722, + "step": 14337 + }, + { + "epoch": 1.2989966252180019, + "grad_norm": 0.9700772762298584, + "learning_rate": 0.00011341146619948047, + "loss": 2.7218, + "step": 14338 + }, + { + "epoch": 1.2990872233924486, + "grad_norm": 0.9096315503120422, + "learning_rate": 0.00011340542499848969, + "loss": 2.7314, + "step": 14339 + }, + { + "epoch": 1.2991778215668954, + "grad_norm": 0.9720186591148376, + "learning_rate": 0.00011339938379749895, + "loss": 2.9222, + "step": 14340 + }, + { + "epoch": 1.2992684197413422, + "grad_norm": 0.9137464165687561, + "learning_rate": 0.0001133933425965082, + "loss": 2.6823, + "step": 14341 + }, + { + "epoch": 1.299359017915789, + "grad_norm": 0.9499240517616272, + "learning_rate": 0.00011338730139551743, + "loss": 2.9791, + "step": 14342 + }, + { + "epoch": 1.2994496160902358, + "grad_norm": 0.9541360139846802, + "learning_rate": 0.00011338126019452668, + "loss": 2.5308, + "step": 14343 + }, + { + "epoch": 1.2995402142646826, + "grad_norm": 0.9258900284767151, + "learning_rate": 0.00011337521899353592, + "loss": 2.7978, + "step": 14344 + }, + { + "epoch": 1.2996308124391294, + "grad_norm": 0.9012418985366821, + "learning_rate": 0.00011336917779254516, + "loss": 2.9389, + "step": 14345 + }, + { + "epoch": 1.2997214106135762, + "grad_norm": 0.88361656665802, + "learning_rate": 0.0001133631365915544, + "loss": 2.776, + "step": 14346 + }, + { + "epoch": 1.299812008788023, + "grad_norm": 0.9280603528022766, + "learning_rate": 0.00011335709539056365, + "loss": 2.7008, + "step": 14347 + }, + { + "epoch": 1.2999026069624697, + "grad_norm": 0.789997398853302, + "learning_rate": 0.0001133510541895729, + "loss": 2.162, + "step": 14348 + }, + { + "epoch": 1.2999932051369165, + "grad_norm": 0.9060059189796448, + "learning_rate": 0.00011334501298858213, + "loss": 2.8761, + "step": 14349 + }, + { + "epoch": 1.3000838033113633, + "grad_norm": 0.9422317743301392, + "learning_rate": 0.00011333897178759139, + "loss": 2.4537, + "step": 14350 + }, + { + "epoch": 1.30017440148581, + "grad_norm": 0.9461540579795837, + "learning_rate": 0.00011333293058660062, + "loss": 2.5555, + "step": 14351 + }, + { + "epoch": 1.3002649996602569, + "grad_norm": 0.8745197653770447, + "learning_rate": 0.00011332688938560987, + "loss": 2.7867, + "step": 14352 + }, + { + "epoch": 1.3003555978347037, + "grad_norm": 0.8394433259963989, + "learning_rate": 0.0001133208481846191, + "loss": 2.2006, + "step": 14353 + }, + { + "epoch": 1.3004461960091505, + "grad_norm": 0.9920573234558105, + "learning_rate": 0.00011331480698362835, + "loss": 2.6659, + "step": 14354 + }, + { + "epoch": 1.3005367941835972, + "grad_norm": 0.9621245265007019, + "learning_rate": 0.0001133087657826376, + "loss": 2.8328, + "step": 14355 + }, + { + "epoch": 1.300627392358044, + "grad_norm": 0.8631106615066528, + "learning_rate": 0.00011330272458164683, + "loss": 2.5745, + "step": 14356 + }, + { + "epoch": 1.3007179905324908, + "grad_norm": 0.8308063745498657, + "learning_rate": 0.0001132966833806561, + "loss": 2.3496, + "step": 14357 + }, + { + "epoch": 1.3008085887069376, + "grad_norm": 0.9258179664611816, + "learning_rate": 0.00011329064217966531, + "loss": 2.8448, + "step": 14358 + }, + { + "epoch": 1.3008991868813844, + "grad_norm": 1.049424648284912, + "learning_rate": 0.00011328460097867458, + "loss": 2.7743, + "step": 14359 + }, + { + "epoch": 1.3009897850558312, + "grad_norm": 0.9836458563804626, + "learning_rate": 0.0001132785597776838, + "loss": 2.6298, + "step": 14360 + }, + { + "epoch": 1.301080383230278, + "grad_norm": 0.9271444082260132, + "learning_rate": 0.00011327251857669306, + "loss": 2.8501, + "step": 14361 + }, + { + "epoch": 1.3011709814047248, + "grad_norm": 0.8913471102714539, + "learning_rate": 0.00011326647737570228, + "loss": 2.5866, + "step": 14362 + }, + { + "epoch": 1.3012615795791715, + "grad_norm": 0.8725783824920654, + "learning_rate": 0.00011326043617471154, + "loss": 2.5775, + "step": 14363 + }, + { + "epoch": 1.3013521777536183, + "grad_norm": 0.9215041995048523, + "learning_rate": 0.00011325439497372079, + "loss": 2.847, + "step": 14364 + }, + { + "epoch": 1.3014427759280651, + "grad_norm": 0.9089694619178772, + "learning_rate": 0.00011324835377273002, + "loss": 2.5736, + "step": 14365 + }, + { + "epoch": 1.301533374102512, + "grad_norm": 0.8400375843048096, + "learning_rate": 0.00011324231257173927, + "loss": 2.1965, + "step": 14366 + }, + { + "epoch": 1.3016239722769587, + "grad_norm": 0.9159331917762756, + "learning_rate": 0.0001132362713707485, + "loss": 3.0874, + "step": 14367 + }, + { + "epoch": 1.3017145704514055, + "grad_norm": 0.8923825621604919, + "learning_rate": 0.00011323023016975775, + "loss": 2.7576, + "step": 14368 + }, + { + "epoch": 1.3018051686258523, + "grad_norm": 0.9353786706924438, + "learning_rate": 0.00011322418896876698, + "loss": 2.8128, + "step": 14369 + }, + { + "epoch": 1.3018957668002988, + "grad_norm": 0.9495461583137512, + "learning_rate": 0.00011321814776777625, + "loss": 2.8265, + "step": 14370 + }, + { + "epoch": 1.3019863649747458, + "grad_norm": 0.8260383605957031, + "learning_rate": 0.00011321210656678549, + "loss": 2.5336, + "step": 14371 + }, + { + "epoch": 1.3020769631491924, + "grad_norm": 0.9492814540863037, + "learning_rate": 0.00011320606536579473, + "loss": 2.5399, + "step": 14372 + }, + { + "epoch": 1.3021675613236394, + "grad_norm": 0.9223256707191467, + "learning_rate": 0.00011320002416480397, + "loss": 2.7744, + "step": 14373 + }, + { + "epoch": 1.302258159498086, + "grad_norm": 0.9685488343238831, + "learning_rate": 0.00011319398296381321, + "loss": 2.8412, + "step": 14374 + }, + { + "epoch": 1.302348757672533, + "grad_norm": 0.866080641746521, + "learning_rate": 0.00011318794176282246, + "loss": 2.5834, + "step": 14375 + }, + { + "epoch": 1.3024393558469796, + "grad_norm": 0.9235203862190247, + "learning_rate": 0.00011318190056183169, + "loss": 2.8797, + "step": 14376 + }, + { + "epoch": 1.3025299540214266, + "grad_norm": 0.8746961951255798, + "learning_rate": 0.00011317585936084094, + "loss": 2.6335, + "step": 14377 + }, + { + "epoch": 1.3026205521958731, + "grad_norm": 0.9014873504638672, + "learning_rate": 0.0001131698181598502, + "loss": 2.5262, + "step": 14378 + }, + { + "epoch": 1.3027111503703201, + "grad_norm": 0.851410448551178, + "learning_rate": 0.00011316377695885942, + "loss": 2.7759, + "step": 14379 + }, + { + "epoch": 1.3028017485447667, + "grad_norm": 0.7930482029914856, + "learning_rate": 0.00011315773575786868, + "loss": 2.1934, + "step": 14380 + }, + { + "epoch": 1.3028923467192137, + "grad_norm": 0.9013059139251709, + "learning_rate": 0.0001131516945568779, + "loss": 2.8412, + "step": 14381 + }, + { + "epoch": 1.3029829448936603, + "grad_norm": 0.888835072517395, + "learning_rate": 0.00011314565335588716, + "loss": 2.6885, + "step": 14382 + }, + { + "epoch": 1.3030735430681073, + "grad_norm": 0.932662844657898, + "learning_rate": 0.0001131396121548964, + "loss": 2.4691, + "step": 14383 + }, + { + "epoch": 1.3031641412425539, + "grad_norm": 0.8853554725646973, + "learning_rate": 0.00011313357095390564, + "loss": 2.6242, + "step": 14384 + }, + { + "epoch": 1.3032547394170009, + "grad_norm": 0.8596768379211426, + "learning_rate": 0.00011312752975291489, + "loss": 2.7788, + "step": 14385 + }, + { + "epoch": 1.3033453375914474, + "grad_norm": 0.9165352582931519, + "learning_rate": 0.00011312148855192413, + "loss": 2.6593, + "step": 14386 + }, + { + "epoch": 1.3034359357658944, + "grad_norm": 0.8061500787734985, + "learning_rate": 0.00011311544735093337, + "loss": 2.4239, + "step": 14387 + }, + { + "epoch": 1.303526533940341, + "grad_norm": 0.9942141175270081, + "learning_rate": 0.00011310940614994261, + "loss": 2.8044, + "step": 14388 + }, + { + "epoch": 1.303617132114788, + "grad_norm": 0.8350105285644531, + "learning_rate": 0.00011310336494895187, + "loss": 2.653, + "step": 14389 + }, + { + "epoch": 1.3037077302892346, + "grad_norm": 0.9284819960594177, + "learning_rate": 0.00011309732374796109, + "loss": 2.5444, + "step": 14390 + }, + { + "epoch": 1.3037983284636814, + "grad_norm": 0.8798015713691711, + "learning_rate": 0.00011309128254697035, + "loss": 2.591, + "step": 14391 + }, + { + "epoch": 1.3038889266381282, + "grad_norm": 1.0235823392868042, + "learning_rate": 0.00011308524134597957, + "loss": 2.7732, + "step": 14392 + }, + { + "epoch": 1.303979524812575, + "grad_norm": 0.9145643711090088, + "learning_rate": 0.00011307920014498883, + "loss": 2.5137, + "step": 14393 + }, + { + "epoch": 1.3040701229870217, + "grad_norm": 0.8951877951622009, + "learning_rate": 0.00011307315894399808, + "loss": 2.6834, + "step": 14394 + }, + { + "epoch": 1.3041607211614685, + "grad_norm": 0.8363136649131775, + "learning_rate": 0.00011306711774300731, + "loss": 1.8946, + "step": 14395 + }, + { + "epoch": 1.3042513193359153, + "grad_norm": 0.9054199457168579, + "learning_rate": 0.00011306107654201656, + "loss": 2.9329, + "step": 14396 + }, + { + "epoch": 1.304341917510362, + "grad_norm": 0.844280481338501, + "learning_rate": 0.0001130550353410258, + "loss": 2.3003, + "step": 14397 + }, + { + "epoch": 1.304432515684809, + "grad_norm": 0.9091249704360962, + "learning_rate": 0.00011304899414003504, + "loss": 2.8198, + "step": 14398 + }, + { + "epoch": 1.3045231138592557, + "grad_norm": 0.8615298271179199, + "learning_rate": 0.00011304295293904428, + "loss": 3.1587, + "step": 14399 + }, + { + "epoch": 1.3046137120337025, + "grad_norm": 0.753895103931427, + "learning_rate": 0.00011303691173805352, + "loss": 1.9643, + "step": 14400 + }, + { + "epoch": 1.3047043102081493, + "grad_norm": 0.8684043884277344, + "learning_rate": 0.00011303087053706279, + "loss": 2.5165, + "step": 14401 + }, + { + "epoch": 1.304794908382596, + "grad_norm": 0.9079863429069519, + "learning_rate": 0.00011302482933607202, + "loss": 2.5533, + "step": 14402 + }, + { + "epoch": 1.3048855065570428, + "grad_norm": 0.9122676253318787, + "learning_rate": 0.00011301878813508127, + "loss": 2.7759, + "step": 14403 + }, + { + "epoch": 1.3049761047314896, + "grad_norm": 0.9053114652633667, + "learning_rate": 0.0001130127469340905, + "loss": 2.5802, + "step": 14404 + }, + { + "epoch": 1.3050667029059364, + "grad_norm": 0.8937276601791382, + "learning_rate": 0.00011300670573309975, + "loss": 2.5904, + "step": 14405 + }, + { + "epoch": 1.3051573010803832, + "grad_norm": 0.9037759304046631, + "learning_rate": 0.00011300066453210898, + "loss": 2.7994, + "step": 14406 + }, + { + "epoch": 1.30524789925483, + "grad_norm": 0.92457115650177, + "learning_rate": 0.00011299462333111823, + "loss": 2.7621, + "step": 14407 + }, + { + "epoch": 1.3053384974292768, + "grad_norm": 1.0364573001861572, + "learning_rate": 0.00011298858213012749, + "loss": 3.0926, + "step": 14408 + }, + { + "epoch": 1.3054290956037236, + "grad_norm": 0.8940638303756714, + "learning_rate": 0.00011298254092913671, + "loss": 2.5268, + "step": 14409 + }, + { + "epoch": 1.3055196937781703, + "grad_norm": 0.8456671833992004, + "learning_rate": 0.00011297649972814597, + "loss": 2.5878, + "step": 14410 + }, + { + "epoch": 1.3056102919526171, + "grad_norm": 0.8897901177406311, + "learning_rate": 0.0001129704585271552, + "loss": 2.7358, + "step": 14411 + }, + { + "epoch": 1.305700890127064, + "grad_norm": 0.9141911268234253, + "learning_rate": 0.00011296441732616445, + "loss": 2.6986, + "step": 14412 + }, + { + "epoch": 1.3057914883015107, + "grad_norm": 0.9427241683006287, + "learning_rate": 0.00011295837612517368, + "loss": 2.7651, + "step": 14413 + }, + { + "epoch": 1.3058820864759575, + "grad_norm": 0.9012803435325623, + "learning_rate": 0.00011295233492418294, + "loss": 2.529, + "step": 14414 + }, + { + "epoch": 1.3059726846504043, + "grad_norm": 0.8843117356300354, + "learning_rate": 0.00011294629372319218, + "loss": 2.6633, + "step": 14415 + }, + { + "epoch": 1.306063282824851, + "grad_norm": 0.9498480558395386, + "learning_rate": 0.00011294025252220142, + "loss": 2.7689, + "step": 14416 + }, + { + "epoch": 1.3061538809992979, + "grad_norm": 0.8689610362052917, + "learning_rate": 0.00011293421132121067, + "loss": 2.6776, + "step": 14417 + }, + { + "epoch": 1.3062444791737446, + "grad_norm": 0.8901576995849609, + "learning_rate": 0.0001129281701202199, + "loss": 2.8933, + "step": 14418 + }, + { + "epoch": 1.3063350773481914, + "grad_norm": 0.9174089431762695, + "learning_rate": 0.00011292212891922915, + "loss": 2.7772, + "step": 14419 + }, + { + "epoch": 1.3064256755226382, + "grad_norm": 0.893599271774292, + "learning_rate": 0.00011291608771823838, + "loss": 2.6686, + "step": 14420 + }, + { + "epoch": 1.306516273697085, + "grad_norm": 0.943385660648346, + "learning_rate": 0.00011291004651724764, + "loss": 2.6878, + "step": 14421 + }, + { + "epoch": 1.3066068718715318, + "grad_norm": 0.8886809349060059, + "learning_rate": 0.00011290400531625686, + "loss": 2.5728, + "step": 14422 + }, + { + "epoch": 1.3066974700459786, + "grad_norm": 0.8782337307929993, + "learning_rate": 0.00011289796411526612, + "loss": 2.6465, + "step": 14423 + }, + { + "epoch": 1.3067880682204254, + "grad_norm": 0.9374287724494934, + "learning_rate": 0.00011289192291427537, + "loss": 3.1341, + "step": 14424 + }, + { + "epoch": 1.3068786663948722, + "grad_norm": 0.8954364061355591, + "learning_rate": 0.0001128858817132846, + "loss": 3.1388, + "step": 14425 + }, + { + "epoch": 1.306969264569319, + "grad_norm": 0.9045082926750183, + "learning_rate": 0.00011287984051229385, + "loss": 2.7339, + "step": 14426 + }, + { + "epoch": 1.3070598627437657, + "grad_norm": 0.8389177322387695, + "learning_rate": 0.00011287379931130309, + "loss": 2.6343, + "step": 14427 + }, + { + "epoch": 1.3071504609182125, + "grad_norm": 0.8817770481109619, + "learning_rate": 0.00011286775811031234, + "loss": 2.649, + "step": 14428 + }, + { + "epoch": 1.3072410590926593, + "grad_norm": 0.9028447270393372, + "learning_rate": 0.00011286171690932157, + "loss": 2.8179, + "step": 14429 + }, + { + "epoch": 1.307331657267106, + "grad_norm": 0.908577024936676, + "learning_rate": 0.00011285567570833082, + "loss": 2.455, + "step": 14430 + }, + { + "epoch": 1.3074222554415529, + "grad_norm": 0.908356249332428, + "learning_rate": 0.00011284963450734008, + "loss": 2.8421, + "step": 14431 + }, + { + "epoch": 1.3075128536159997, + "grad_norm": 0.9562309980392456, + "learning_rate": 0.0001128435933063493, + "loss": 2.7495, + "step": 14432 + }, + { + "epoch": 1.3076034517904465, + "grad_norm": 0.8724298477172852, + "learning_rate": 0.00011283755210535856, + "loss": 2.756, + "step": 14433 + }, + { + "epoch": 1.3076940499648932, + "grad_norm": 0.8567585945129395, + "learning_rate": 0.0001128315109043678, + "loss": 2.7374, + "step": 14434 + }, + { + "epoch": 1.30778464813934, + "grad_norm": 0.7953203320503235, + "learning_rate": 0.00011282546970337704, + "loss": 1.9524, + "step": 14435 + }, + { + "epoch": 1.3078752463137868, + "grad_norm": 0.898435652256012, + "learning_rate": 0.00011281942850238628, + "loss": 2.8856, + "step": 14436 + }, + { + "epoch": 1.3079658444882336, + "grad_norm": 0.8614034652709961, + "learning_rate": 0.00011281338730139552, + "loss": 2.5399, + "step": 14437 + }, + { + "epoch": 1.3080564426626804, + "grad_norm": 0.766747236251831, + "learning_rate": 0.00011280734610040477, + "loss": 2.084, + "step": 14438 + }, + { + "epoch": 1.3081470408371272, + "grad_norm": 0.8593701720237732, + "learning_rate": 0.000112801304899414, + "loss": 2.7715, + "step": 14439 + }, + { + "epoch": 1.308237639011574, + "grad_norm": 0.9347501397132874, + "learning_rate": 0.00011279526369842327, + "loss": 2.8263, + "step": 14440 + }, + { + "epoch": 1.3083282371860208, + "grad_norm": 0.8418440222740173, + "learning_rate": 0.00011278922249743249, + "loss": 2.6929, + "step": 14441 + }, + { + "epoch": 1.3084188353604675, + "grad_norm": 0.8382723331451416, + "learning_rate": 0.00011278318129644175, + "loss": 2.4594, + "step": 14442 + }, + { + "epoch": 1.3085094335349143, + "grad_norm": 0.9145165681838989, + "learning_rate": 0.00011277714009545097, + "loss": 2.6768, + "step": 14443 + }, + { + "epoch": 1.3086000317093611, + "grad_norm": 0.878057062625885, + "learning_rate": 0.00011277109889446023, + "loss": 2.728, + "step": 14444 + }, + { + "epoch": 1.308690629883808, + "grad_norm": 0.9014794230461121, + "learning_rate": 0.00011276505769346948, + "loss": 2.6787, + "step": 14445 + }, + { + "epoch": 1.3087812280582547, + "grad_norm": 1.0088632106781006, + "learning_rate": 0.00011275901649247871, + "loss": 2.7589, + "step": 14446 + }, + { + "epoch": 1.3088718262327015, + "grad_norm": 0.7551456093788147, + "learning_rate": 0.00011275297529148796, + "loss": 2.0967, + "step": 14447 + }, + { + "epoch": 1.3089624244071483, + "grad_norm": 0.909915566444397, + "learning_rate": 0.00011274693409049719, + "loss": 2.9476, + "step": 14448 + }, + { + "epoch": 1.309053022581595, + "grad_norm": 0.9884951710700989, + "learning_rate": 0.00011274089288950644, + "loss": 2.8429, + "step": 14449 + }, + { + "epoch": 1.3091436207560418, + "grad_norm": 0.909955620765686, + "learning_rate": 0.00011273485168851567, + "loss": 2.5099, + "step": 14450 + }, + { + "epoch": 1.3092342189304884, + "grad_norm": 0.9700391888618469, + "learning_rate": 0.00011272881048752492, + "loss": 2.7, + "step": 14451 + }, + { + "epoch": 1.3093248171049354, + "grad_norm": 0.7161241769790649, + "learning_rate": 0.00011272276928653416, + "loss": 2.0033, + "step": 14452 + }, + { + "epoch": 1.309415415279382, + "grad_norm": 0.881435215473175, + "learning_rate": 0.00011271672808554342, + "loss": 2.728, + "step": 14453 + }, + { + "epoch": 1.309506013453829, + "grad_norm": 0.8578457832336426, + "learning_rate": 0.00011271068688455266, + "loss": 2.4733, + "step": 14454 + }, + { + "epoch": 1.3095966116282756, + "grad_norm": 0.9477645754814148, + "learning_rate": 0.0001127046456835619, + "loss": 2.6573, + "step": 14455 + }, + { + "epoch": 1.3096872098027226, + "grad_norm": 0.9158339500427246, + "learning_rate": 0.00011269860448257115, + "loss": 2.7468, + "step": 14456 + }, + { + "epoch": 1.3097778079771691, + "grad_norm": 0.9134920835494995, + "learning_rate": 0.00011269256328158038, + "loss": 2.5631, + "step": 14457 + }, + { + "epoch": 1.3098684061516161, + "grad_norm": 0.8821619153022766, + "learning_rate": 0.00011268652208058963, + "loss": 2.7936, + "step": 14458 + }, + { + "epoch": 1.3099590043260627, + "grad_norm": 0.97202467918396, + "learning_rate": 0.00011268048087959886, + "loss": 2.6234, + "step": 14459 + }, + { + "epoch": 1.3100496025005097, + "grad_norm": 0.9461880922317505, + "learning_rate": 0.00011267443967860811, + "loss": 2.8129, + "step": 14460 + }, + { + "epoch": 1.3101402006749563, + "grad_norm": 0.8213436603546143, + "learning_rate": 0.00011266839847761737, + "loss": 2.1135, + "step": 14461 + }, + { + "epoch": 1.3102307988494033, + "grad_norm": 0.9540859460830688, + "learning_rate": 0.00011266235727662659, + "loss": 2.8996, + "step": 14462 + }, + { + "epoch": 1.3103213970238499, + "grad_norm": 1.0332015752792358, + "learning_rate": 0.00011265631607563585, + "loss": 2.7091, + "step": 14463 + }, + { + "epoch": 1.3104119951982969, + "grad_norm": 0.9299668669700623, + "learning_rate": 0.00011265027487464507, + "loss": 2.7324, + "step": 14464 + }, + { + "epoch": 1.3105025933727434, + "grad_norm": 0.8779474496841431, + "learning_rate": 0.00011264423367365433, + "loss": 2.767, + "step": 14465 + }, + { + "epoch": 1.3105931915471905, + "grad_norm": 0.8603523373603821, + "learning_rate": 0.00011263819247266357, + "loss": 2.5175, + "step": 14466 + }, + { + "epoch": 1.310683789721637, + "grad_norm": 0.890596866607666, + "learning_rate": 0.00011263215127167282, + "loss": 2.6801, + "step": 14467 + }, + { + "epoch": 1.310774387896084, + "grad_norm": 0.8870825171470642, + "learning_rate": 0.00011262611007068206, + "loss": 2.661, + "step": 14468 + }, + { + "epoch": 1.3108649860705306, + "grad_norm": 0.8921064138412476, + "learning_rate": 0.0001126200688696913, + "loss": 2.5531, + "step": 14469 + }, + { + "epoch": 1.3109555842449776, + "grad_norm": 0.9642638564109802, + "learning_rate": 0.00011261402766870054, + "loss": 2.7954, + "step": 14470 + }, + { + "epoch": 1.3110461824194242, + "grad_norm": 1.0299586057662964, + "learning_rate": 0.00011260798646770978, + "loss": 2.547, + "step": 14471 + }, + { + "epoch": 1.311136780593871, + "grad_norm": 1.017004370689392, + "learning_rate": 0.00011260194526671903, + "loss": 2.7573, + "step": 14472 + }, + { + "epoch": 1.3112273787683177, + "grad_norm": 0.9602943062782288, + "learning_rate": 0.00011259590406572826, + "loss": 3.0222, + "step": 14473 + }, + { + "epoch": 1.3113179769427645, + "grad_norm": 0.8953611254692078, + "learning_rate": 0.00011258986286473752, + "loss": 2.8429, + "step": 14474 + }, + { + "epoch": 1.3114085751172113, + "grad_norm": 0.8985145092010498, + "learning_rate": 0.00011258382166374677, + "loss": 2.7029, + "step": 14475 + }, + { + "epoch": 1.311499173291658, + "grad_norm": 0.8277565240859985, + "learning_rate": 0.000112577780462756, + "loss": 2.4817, + "step": 14476 + }, + { + "epoch": 1.311589771466105, + "grad_norm": 0.8884091973304749, + "learning_rate": 0.00011257173926176525, + "loss": 2.6427, + "step": 14477 + }, + { + "epoch": 1.3116803696405517, + "grad_norm": 0.8587661385536194, + "learning_rate": 0.00011256569806077449, + "loss": 2.7565, + "step": 14478 + }, + { + "epoch": 1.3117709678149985, + "grad_norm": 0.9691606760025024, + "learning_rate": 0.00011255965685978373, + "loss": 2.4755, + "step": 14479 + }, + { + "epoch": 1.3118615659894453, + "grad_norm": 0.9410094022750854, + "learning_rate": 0.00011255361565879297, + "loss": 2.5361, + "step": 14480 + }, + { + "epoch": 1.311952164163892, + "grad_norm": 0.9709928631782532, + "learning_rate": 0.00011254757445780221, + "loss": 2.8104, + "step": 14481 + }, + { + "epoch": 1.3120427623383388, + "grad_norm": 0.9065724015235901, + "learning_rate": 0.00011254153325681145, + "loss": 2.7506, + "step": 14482 + }, + { + "epoch": 1.3121333605127856, + "grad_norm": 0.8646711707115173, + "learning_rate": 0.0001125354920558207, + "loss": 2.823, + "step": 14483 + }, + { + "epoch": 1.3122239586872324, + "grad_norm": 0.9550423622131348, + "learning_rate": 0.00011252945085482996, + "loss": 2.7006, + "step": 14484 + }, + { + "epoch": 1.3123145568616792, + "grad_norm": 0.910683810710907, + "learning_rate": 0.00011252340965383919, + "loss": 2.5899, + "step": 14485 + }, + { + "epoch": 1.312405155036126, + "grad_norm": 0.8479759097099304, + "learning_rate": 0.00011251736845284844, + "loss": 2.7358, + "step": 14486 + }, + { + "epoch": 1.3124957532105728, + "grad_norm": 1.0010724067687988, + "learning_rate": 0.00011251132725185767, + "loss": 2.7803, + "step": 14487 + }, + { + "epoch": 1.3125863513850196, + "grad_norm": 0.8705186247825623, + "learning_rate": 0.00011250528605086692, + "loss": 2.6, + "step": 14488 + }, + { + "epoch": 1.3126769495594663, + "grad_norm": 0.9574957489967346, + "learning_rate": 0.00011249924484987615, + "loss": 2.7252, + "step": 14489 + }, + { + "epoch": 1.3127675477339131, + "grad_norm": 1.0345216989517212, + "learning_rate": 0.0001124932036488854, + "loss": 2.6925, + "step": 14490 + }, + { + "epoch": 1.31285814590836, + "grad_norm": 0.922497034072876, + "learning_rate": 0.00011248716244789465, + "loss": 2.5896, + "step": 14491 + }, + { + "epoch": 1.3129487440828067, + "grad_norm": 0.8547871708869934, + "learning_rate": 0.00011248112124690388, + "loss": 2.6942, + "step": 14492 + }, + { + "epoch": 1.3130393422572535, + "grad_norm": 0.9228980541229248, + "learning_rate": 0.00011247508004591314, + "loss": 2.7169, + "step": 14493 + }, + { + "epoch": 1.3131299404317003, + "grad_norm": 0.8396173119544983, + "learning_rate": 0.00011246903884492237, + "loss": 2.0803, + "step": 14494 + }, + { + "epoch": 1.313220538606147, + "grad_norm": 0.8657645583152771, + "learning_rate": 0.00011246299764393163, + "loss": 2.5483, + "step": 14495 + }, + { + "epoch": 1.3133111367805939, + "grad_norm": 0.904474675655365, + "learning_rate": 0.00011245695644294085, + "loss": 2.7135, + "step": 14496 + }, + { + "epoch": 1.3134017349550406, + "grad_norm": 0.9173904061317444, + "learning_rate": 0.00011245091524195011, + "loss": 2.8194, + "step": 14497 + }, + { + "epoch": 1.3134923331294874, + "grad_norm": 0.9032579064369202, + "learning_rate": 0.00011244487404095936, + "loss": 2.7158, + "step": 14498 + }, + { + "epoch": 1.3135829313039342, + "grad_norm": 0.9039903879165649, + "learning_rate": 0.00011243883283996859, + "loss": 2.7737, + "step": 14499 + }, + { + "epoch": 1.313673529478381, + "grad_norm": 0.9009187817573547, + "learning_rate": 0.00011243279163897784, + "loss": 2.7302, + "step": 14500 + }, + { + "epoch": 1.3137641276528278, + "grad_norm": 0.913422703742981, + "learning_rate": 0.00011242675043798707, + "loss": 2.7448, + "step": 14501 + }, + { + "epoch": 1.3138547258272746, + "grad_norm": 0.7284567356109619, + "learning_rate": 0.00011242070923699632, + "loss": 1.9829, + "step": 14502 + }, + { + "epoch": 1.3139453240017214, + "grad_norm": 0.9478139877319336, + "learning_rate": 0.00011241466803600555, + "loss": 2.5854, + "step": 14503 + }, + { + "epoch": 1.3140359221761682, + "grad_norm": 0.9382946491241455, + "learning_rate": 0.0001124086268350148, + "loss": 2.7509, + "step": 14504 + }, + { + "epoch": 1.314126520350615, + "grad_norm": 0.8437978029251099, + "learning_rate": 0.00011240258563402406, + "loss": 1.791, + "step": 14505 + }, + { + "epoch": 1.3142171185250617, + "grad_norm": 0.9206727743148804, + "learning_rate": 0.0001123965444330333, + "loss": 2.6405, + "step": 14506 + }, + { + "epoch": 1.3143077166995085, + "grad_norm": 0.8940387964248657, + "learning_rate": 0.00011239050323204254, + "loss": 2.9744, + "step": 14507 + }, + { + "epoch": 1.3143983148739553, + "grad_norm": 0.9395255446434021, + "learning_rate": 0.00011238446203105178, + "loss": 2.8537, + "step": 14508 + }, + { + "epoch": 1.314488913048402, + "grad_norm": 0.8827662467956543, + "learning_rate": 0.00011237842083006103, + "loss": 2.8911, + "step": 14509 + }, + { + "epoch": 1.3145795112228489, + "grad_norm": 0.8433154821395874, + "learning_rate": 0.00011237237962907026, + "loss": 2.5203, + "step": 14510 + }, + { + "epoch": 1.3146701093972957, + "grad_norm": 0.8615090250968933, + "learning_rate": 0.00011236633842807951, + "loss": 2.4339, + "step": 14511 + }, + { + "epoch": 1.3147607075717425, + "grad_norm": 0.8821680545806885, + "learning_rate": 0.00011236029722708874, + "loss": 2.475, + "step": 14512 + }, + { + "epoch": 1.3148513057461892, + "grad_norm": 0.9192147254943848, + "learning_rate": 0.00011235425602609799, + "loss": 2.8255, + "step": 14513 + }, + { + "epoch": 1.314941903920636, + "grad_norm": 0.9474005699157715, + "learning_rate": 0.00011234821482510725, + "loss": 2.4068, + "step": 14514 + }, + { + "epoch": 1.3150325020950828, + "grad_norm": 0.9192864298820496, + "learning_rate": 0.00011234217362411647, + "loss": 2.8657, + "step": 14515 + }, + { + "epoch": 1.3151231002695296, + "grad_norm": 0.9173500537872314, + "learning_rate": 0.00011233613242312573, + "loss": 2.8978, + "step": 14516 + }, + { + "epoch": 1.3152136984439764, + "grad_norm": 0.9341002106666565, + "learning_rate": 0.00011233009122213497, + "loss": 2.5478, + "step": 14517 + }, + { + "epoch": 1.3153042966184232, + "grad_norm": 0.8940322399139404, + "learning_rate": 0.00011232405002114421, + "loss": 2.6053, + "step": 14518 + }, + { + "epoch": 1.31539489479287, + "grad_norm": 0.9182450771331787, + "learning_rate": 0.00011231800882015345, + "loss": 2.7588, + "step": 14519 + }, + { + "epoch": 1.3154854929673168, + "grad_norm": 0.8607437610626221, + "learning_rate": 0.0001123119676191627, + "loss": 2.5202, + "step": 14520 + }, + { + "epoch": 1.3155760911417635, + "grad_norm": 0.8604682683944702, + "learning_rate": 0.00011230592641817194, + "loss": 1.9967, + "step": 14521 + }, + { + "epoch": 1.3156666893162103, + "grad_norm": 0.9576664566993713, + "learning_rate": 0.00011229988521718118, + "loss": 2.6516, + "step": 14522 + }, + { + "epoch": 1.3157572874906571, + "grad_norm": 0.8403791189193726, + "learning_rate": 0.00011229384401619042, + "loss": 2.5614, + "step": 14523 + }, + { + "epoch": 1.315847885665104, + "grad_norm": 0.9597635269165039, + "learning_rate": 0.00011228780281519966, + "loss": 2.8195, + "step": 14524 + }, + { + "epoch": 1.3159384838395507, + "grad_norm": 0.9044824838638306, + "learning_rate": 0.00011228176161420892, + "loss": 2.561, + "step": 14525 + }, + { + "epoch": 1.3160290820139975, + "grad_norm": 0.8827633261680603, + "learning_rate": 0.00011227572041321814, + "loss": 2.594, + "step": 14526 + }, + { + "epoch": 1.3161196801884443, + "grad_norm": 0.9899894595146179, + "learning_rate": 0.0001122696792122274, + "loss": 2.8732, + "step": 14527 + }, + { + "epoch": 1.316210278362891, + "grad_norm": 0.8693621754646301, + "learning_rate": 0.00011226363801123665, + "loss": 2.5451, + "step": 14528 + }, + { + "epoch": 1.3163008765373378, + "grad_norm": 0.8993217945098877, + "learning_rate": 0.00011225759681024588, + "loss": 2.5182, + "step": 14529 + }, + { + "epoch": 1.3163914747117846, + "grad_norm": 0.9507960677146912, + "learning_rate": 0.00011225155560925513, + "loss": 2.8952, + "step": 14530 + }, + { + "epoch": 1.3164820728862314, + "grad_norm": 0.9343875050544739, + "learning_rate": 0.00011224551440826436, + "loss": 2.7681, + "step": 14531 + }, + { + "epoch": 1.316572671060678, + "grad_norm": 0.9623982906341553, + "learning_rate": 0.00011223947320727361, + "loss": 2.8508, + "step": 14532 + }, + { + "epoch": 1.316663269235125, + "grad_norm": 0.9360799789428711, + "learning_rate": 0.00011223343200628285, + "loss": 2.632, + "step": 14533 + }, + { + "epoch": 1.3167538674095716, + "grad_norm": 0.9090139865875244, + "learning_rate": 0.0001122273908052921, + "loss": 2.8631, + "step": 14534 + }, + { + "epoch": 1.3168444655840186, + "grad_norm": 0.9412652254104614, + "learning_rate": 0.00011222134960430135, + "loss": 2.4206, + "step": 14535 + }, + { + "epoch": 1.3169350637584651, + "grad_norm": 0.9718840718269348, + "learning_rate": 0.00011221530840331058, + "loss": 2.6635, + "step": 14536 + }, + { + "epoch": 1.3170256619329122, + "grad_norm": 0.845227837562561, + "learning_rate": 0.00011220926720231984, + "loss": 2.5959, + "step": 14537 + }, + { + "epoch": 1.3171162601073587, + "grad_norm": 0.8874105215072632, + "learning_rate": 0.00011220322600132907, + "loss": 2.832, + "step": 14538 + }, + { + "epoch": 1.3172068582818057, + "grad_norm": 0.883162796497345, + "learning_rate": 0.00011219718480033832, + "loss": 2.7087, + "step": 14539 + }, + { + "epoch": 1.3172974564562523, + "grad_norm": 0.9374744296073914, + "learning_rate": 0.00011219114359934755, + "loss": 2.8603, + "step": 14540 + }, + { + "epoch": 1.3173880546306993, + "grad_norm": 0.8821123838424683, + "learning_rate": 0.0001121851023983568, + "loss": 2.8076, + "step": 14541 + }, + { + "epoch": 1.3174786528051459, + "grad_norm": 1.0905436277389526, + "learning_rate": 0.00011217906119736603, + "loss": 2.6066, + "step": 14542 + }, + { + "epoch": 1.3175692509795929, + "grad_norm": 0.9033957123756409, + "learning_rate": 0.00011217301999637528, + "loss": 2.8027, + "step": 14543 + }, + { + "epoch": 1.3176598491540394, + "grad_norm": 0.9352469444274902, + "learning_rate": 0.00011216697879538454, + "loss": 2.4568, + "step": 14544 + }, + { + "epoch": 1.3177504473284865, + "grad_norm": 0.9173694849014282, + "learning_rate": 0.00011216093759439376, + "loss": 2.6547, + "step": 14545 + }, + { + "epoch": 1.317841045502933, + "grad_norm": 1.0218745470046997, + "learning_rate": 0.00011215489639340302, + "loss": 2.4625, + "step": 14546 + }, + { + "epoch": 1.31793164367738, + "grad_norm": 0.9150741100311279, + "learning_rate": 0.00011214885519241224, + "loss": 2.7013, + "step": 14547 + }, + { + "epoch": 1.3180222418518266, + "grad_norm": 0.8707213401794434, + "learning_rate": 0.0001121428139914215, + "loss": 2.584, + "step": 14548 + }, + { + "epoch": 1.3181128400262736, + "grad_norm": 0.9167519211769104, + "learning_rate": 0.00011213677279043073, + "loss": 2.7709, + "step": 14549 + }, + { + "epoch": 1.3182034382007202, + "grad_norm": 0.9661779999732971, + "learning_rate": 0.00011213073158943999, + "loss": 2.6125, + "step": 14550 + }, + { + "epoch": 1.3182940363751672, + "grad_norm": 0.932440996170044, + "learning_rate": 0.00011212469038844924, + "loss": 2.3429, + "step": 14551 + }, + { + "epoch": 1.3183846345496137, + "grad_norm": 0.9065505862236023, + "learning_rate": 0.00011211864918745847, + "loss": 2.7385, + "step": 14552 + }, + { + "epoch": 1.3184752327240605, + "grad_norm": 0.9219546318054199, + "learning_rate": 0.00011211260798646772, + "loss": 2.842, + "step": 14553 + }, + { + "epoch": 1.3185658308985073, + "grad_norm": 0.8762010335922241, + "learning_rate": 0.00011210656678547695, + "loss": 3.0495, + "step": 14554 + }, + { + "epoch": 1.318656429072954, + "grad_norm": 0.8831692934036255, + "learning_rate": 0.0001121005255844862, + "loss": 2.501, + "step": 14555 + }, + { + "epoch": 1.318747027247401, + "grad_norm": 0.9378563761711121, + "learning_rate": 0.00011209448438349543, + "loss": 2.9523, + "step": 14556 + }, + { + "epoch": 1.3188376254218477, + "grad_norm": 0.916092038154602, + "learning_rate": 0.0001120884431825047, + "loss": 2.7654, + "step": 14557 + }, + { + "epoch": 1.3189282235962945, + "grad_norm": 0.9218473434448242, + "learning_rate": 0.00011208240198151394, + "loss": 2.5542, + "step": 14558 + }, + { + "epoch": 1.3190188217707413, + "grad_norm": 0.887069821357727, + "learning_rate": 0.00011207636078052318, + "loss": 2.6178, + "step": 14559 + }, + { + "epoch": 1.319109419945188, + "grad_norm": 0.9451664090156555, + "learning_rate": 0.00011207031957953242, + "loss": 2.7654, + "step": 14560 + }, + { + "epoch": 1.3192000181196348, + "grad_norm": 0.8761762976646423, + "learning_rate": 0.00011206427837854166, + "loss": 2.6252, + "step": 14561 + }, + { + "epoch": 1.3192906162940816, + "grad_norm": 0.8884726762771606, + "learning_rate": 0.0001120582371775509, + "loss": 2.6812, + "step": 14562 + }, + { + "epoch": 1.3193812144685284, + "grad_norm": 0.8641188740730286, + "learning_rate": 0.00011205219597656014, + "loss": 2.685, + "step": 14563 + }, + { + "epoch": 1.3194718126429752, + "grad_norm": 0.8842985033988953, + "learning_rate": 0.00011204615477556939, + "loss": 2.5624, + "step": 14564 + }, + { + "epoch": 1.319562410817422, + "grad_norm": 0.9247583746910095, + "learning_rate": 0.00011204011357457865, + "loss": 2.858, + "step": 14565 + }, + { + "epoch": 1.3196530089918688, + "grad_norm": 0.7717034220695496, + "learning_rate": 0.00011203407237358787, + "loss": 1.8945, + "step": 14566 + }, + { + "epoch": 1.3197436071663156, + "grad_norm": 0.9250915050506592, + "learning_rate": 0.00011202803117259713, + "loss": 2.7543, + "step": 14567 + }, + { + "epoch": 1.3198342053407623, + "grad_norm": 0.7885026931762695, + "learning_rate": 0.00011202198997160635, + "loss": 2.0638, + "step": 14568 + }, + { + "epoch": 1.3199248035152091, + "grad_norm": 0.9255392551422119, + "learning_rate": 0.00011201594877061561, + "loss": 2.6597, + "step": 14569 + }, + { + "epoch": 1.320015401689656, + "grad_norm": 0.8306030035018921, + "learning_rate": 0.00011200990756962484, + "loss": 2.5694, + "step": 14570 + }, + { + "epoch": 1.3201059998641027, + "grad_norm": 0.9542054533958435, + "learning_rate": 0.00011200386636863409, + "loss": 2.7988, + "step": 14571 + }, + { + "epoch": 1.3201965980385495, + "grad_norm": 0.9192716479301453, + "learning_rate": 0.00011199782516764333, + "loss": 2.6267, + "step": 14572 + }, + { + "epoch": 1.3202871962129963, + "grad_norm": 0.9314266443252563, + "learning_rate": 0.00011199178396665257, + "loss": 2.8388, + "step": 14573 + }, + { + "epoch": 1.320377794387443, + "grad_norm": 0.858007550239563, + "learning_rate": 0.00011198574276566182, + "loss": 2.666, + "step": 14574 + }, + { + "epoch": 1.3204683925618899, + "grad_norm": 0.7738770842552185, + "learning_rate": 0.00011197970156467106, + "loss": 2.2217, + "step": 14575 + }, + { + "epoch": 1.3205589907363366, + "grad_norm": 0.8611568212509155, + "learning_rate": 0.00011197366036368032, + "loss": 2.7503, + "step": 14576 + }, + { + "epoch": 1.3206495889107834, + "grad_norm": 0.9276955127716064, + "learning_rate": 0.00011196761916268954, + "loss": 2.6865, + "step": 14577 + }, + { + "epoch": 1.3207401870852302, + "grad_norm": 0.8889250755310059, + "learning_rate": 0.0001119615779616988, + "loss": 2.5906, + "step": 14578 + }, + { + "epoch": 1.320830785259677, + "grad_norm": 0.8714244365692139, + "learning_rate": 0.00011195553676070802, + "loss": 2.7165, + "step": 14579 + }, + { + "epoch": 1.3209213834341238, + "grad_norm": 0.9086139798164368, + "learning_rate": 0.00011194949555971728, + "loss": 2.8636, + "step": 14580 + }, + { + "epoch": 1.3210119816085706, + "grad_norm": 0.7380340099334717, + "learning_rate": 0.00011194345435872653, + "loss": 2.0004, + "step": 14581 + }, + { + "epoch": 1.3211025797830174, + "grad_norm": 0.8938368558883667, + "learning_rate": 0.00011193741315773576, + "loss": 2.5591, + "step": 14582 + }, + { + "epoch": 1.3211931779574642, + "grad_norm": 0.9137680530548096, + "learning_rate": 0.00011193137195674501, + "loss": 2.5505, + "step": 14583 + }, + { + "epoch": 1.321283776131911, + "grad_norm": 0.8360911011695862, + "learning_rate": 0.00011192533075575424, + "loss": 1.9615, + "step": 14584 + }, + { + "epoch": 1.3213743743063577, + "grad_norm": 0.9076095819473267, + "learning_rate": 0.00011191928955476349, + "loss": 2.0803, + "step": 14585 + }, + { + "epoch": 1.3214649724808045, + "grad_norm": 0.9101969599723816, + "learning_rate": 0.00011191324835377273, + "loss": 2.8341, + "step": 14586 + }, + { + "epoch": 1.3215555706552513, + "grad_norm": 0.9184437990188599, + "learning_rate": 0.00011190720715278197, + "loss": 2.62, + "step": 14587 + }, + { + "epoch": 1.321646168829698, + "grad_norm": 0.9238921403884888, + "learning_rate": 0.00011190116595179123, + "loss": 2.8397, + "step": 14588 + }, + { + "epoch": 1.3217367670041449, + "grad_norm": 0.9192019104957581, + "learning_rate": 0.00011189512475080047, + "loss": 2.6666, + "step": 14589 + }, + { + "epoch": 1.3218273651785917, + "grad_norm": 1.0200953483581543, + "learning_rate": 0.00011188908354980972, + "loss": 2.8921, + "step": 14590 + }, + { + "epoch": 1.3219179633530385, + "grad_norm": 0.9141530394554138, + "learning_rate": 0.00011188304234881895, + "loss": 2.3492, + "step": 14591 + }, + { + "epoch": 1.3220085615274852, + "grad_norm": 0.9186556935310364, + "learning_rate": 0.0001118770011478282, + "loss": 2.6311, + "step": 14592 + }, + { + "epoch": 1.322099159701932, + "grad_norm": 0.8714763522148132, + "learning_rate": 0.00011187095994683743, + "loss": 2.7136, + "step": 14593 + }, + { + "epoch": 1.3221897578763788, + "grad_norm": 0.9072974324226379, + "learning_rate": 0.00011186491874584668, + "loss": 2.6998, + "step": 14594 + }, + { + "epoch": 1.3222803560508256, + "grad_norm": 0.8400343656539917, + "learning_rate": 0.00011185887754485594, + "loss": 2.6313, + "step": 14595 + }, + { + "epoch": 1.3223709542252724, + "grad_norm": 0.8764774799346924, + "learning_rate": 0.00011185283634386516, + "loss": 2.8902, + "step": 14596 + }, + { + "epoch": 1.3224615523997192, + "grad_norm": 0.9001419544219971, + "learning_rate": 0.00011184679514287442, + "loss": 2.7902, + "step": 14597 + }, + { + "epoch": 1.322552150574166, + "grad_norm": 0.8940165042877197, + "learning_rate": 0.00011184075394188364, + "loss": 2.778, + "step": 14598 + }, + { + "epoch": 1.3226427487486128, + "grad_norm": 0.8260006308555603, + "learning_rate": 0.0001118347127408929, + "loss": 2.6566, + "step": 14599 + }, + { + "epoch": 1.3227333469230595, + "grad_norm": 0.8568590879440308, + "learning_rate": 0.00011182867153990212, + "loss": 2.4831, + "step": 14600 + }, + { + "epoch": 1.3228239450975063, + "grad_norm": 0.9051082134246826, + "learning_rate": 0.00011182263033891138, + "loss": 2.7237, + "step": 14601 + }, + { + "epoch": 1.3229145432719531, + "grad_norm": 0.8103289604187012, + "learning_rate": 0.00011181658913792062, + "loss": 2.1233, + "step": 14602 + }, + { + "epoch": 1.3230051414464, + "grad_norm": 0.9187852740287781, + "learning_rate": 0.00011181054793692987, + "loss": 2.6149, + "step": 14603 + }, + { + "epoch": 1.3230957396208467, + "grad_norm": 0.9402803778648376, + "learning_rate": 0.00011180450673593911, + "loss": 2.7779, + "step": 14604 + }, + { + "epoch": 1.3231863377952935, + "grad_norm": 0.8844886422157288, + "learning_rate": 0.00011179846553494835, + "loss": 3.0237, + "step": 14605 + }, + { + "epoch": 1.3232769359697403, + "grad_norm": 0.7561600804328918, + "learning_rate": 0.0001117924243339576, + "loss": 2.0987, + "step": 14606 + }, + { + "epoch": 1.323367534144187, + "grad_norm": 0.8817071318626404, + "learning_rate": 0.00011178638313296683, + "loss": 2.6576, + "step": 14607 + }, + { + "epoch": 1.3234581323186339, + "grad_norm": 0.8897535800933838, + "learning_rate": 0.00011178034193197609, + "loss": 2.7296, + "step": 14608 + }, + { + "epoch": 1.3235487304930806, + "grad_norm": 0.876470685005188, + "learning_rate": 0.00011177430073098531, + "loss": 2.6124, + "step": 14609 + }, + { + "epoch": 1.3236393286675274, + "grad_norm": 0.8115516304969788, + "learning_rate": 0.00011176825952999457, + "loss": 2.2128, + "step": 14610 + }, + { + "epoch": 1.3237299268419742, + "grad_norm": 0.9391822218894958, + "learning_rate": 0.00011176221832900382, + "loss": 2.8225, + "step": 14611 + }, + { + "epoch": 1.323820525016421, + "grad_norm": 0.9143346548080444, + "learning_rate": 0.00011175617712801305, + "loss": 2.6706, + "step": 14612 + }, + { + "epoch": 1.3239111231908676, + "grad_norm": 0.9342555999755859, + "learning_rate": 0.0001117501359270223, + "loss": 2.8515, + "step": 14613 + }, + { + "epoch": 1.3240017213653146, + "grad_norm": 0.8462396264076233, + "learning_rate": 0.00011174409472603154, + "loss": 2.526, + "step": 14614 + }, + { + "epoch": 1.3240923195397611, + "grad_norm": 0.9373325705528259, + "learning_rate": 0.00011173805352504078, + "loss": 2.6972, + "step": 14615 + }, + { + "epoch": 1.3241829177142082, + "grad_norm": 0.7511455416679382, + "learning_rate": 0.00011173201232405002, + "loss": 1.8481, + "step": 14616 + }, + { + "epoch": 1.3242735158886547, + "grad_norm": 0.8537316918373108, + "learning_rate": 0.00011172597112305927, + "loss": 2.5247, + "step": 14617 + }, + { + "epoch": 1.3243641140631017, + "grad_norm": 0.8451107740402222, + "learning_rate": 0.00011171992992206853, + "loss": 2.5398, + "step": 14618 + }, + { + "epoch": 1.3244547122375483, + "grad_norm": 0.8558874130249023, + "learning_rate": 0.00011171388872107775, + "loss": 2.732, + "step": 14619 + }, + { + "epoch": 1.3245453104119953, + "grad_norm": 0.8849567770957947, + "learning_rate": 0.00011170784752008701, + "loss": 2.754, + "step": 14620 + }, + { + "epoch": 1.3246359085864419, + "grad_norm": 0.9501333236694336, + "learning_rate": 0.00011170180631909624, + "loss": 2.8753, + "step": 14621 + }, + { + "epoch": 1.3247265067608889, + "grad_norm": 0.9101044535636902, + "learning_rate": 0.00011169576511810549, + "loss": 2.8919, + "step": 14622 + }, + { + "epoch": 1.3248171049353354, + "grad_norm": 0.8614049553871155, + "learning_rate": 0.00011168972391711472, + "loss": 2.1683, + "step": 14623 + }, + { + "epoch": 1.3249077031097825, + "grad_norm": 0.7854657173156738, + "learning_rate": 0.00011168368271612397, + "loss": 2.2033, + "step": 14624 + }, + { + "epoch": 1.324998301284229, + "grad_norm": 0.7722598910331726, + "learning_rate": 0.00011167764151513322, + "loss": 1.9807, + "step": 14625 + }, + { + "epoch": 1.325088899458676, + "grad_norm": 0.8307091593742371, + "learning_rate": 0.00011167160031414245, + "loss": 2.6424, + "step": 14626 + }, + { + "epoch": 1.3251794976331226, + "grad_norm": 0.841039776802063, + "learning_rate": 0.00011166555911315171, + "loss": 2.2174, + "step": 14627 + }, + { + "epoch": 1.3252700958075696, + "grad_norm": 0.9442920088768005, + "learning_rate": 0.00011165951791216093, + "loss": 3.0064, + "step": 14628 + }, + { + "epoch": 1.3253606939820162, + "grad_norm": 0.9113558530807495, + "learning_rate": 0.0001116534767111702, + "loss": 2.8365, + "step": 14629 + }, + { + "epoch": 1.3254512921564632, + "grad_norm": 0.8335207104682922, + "learning_rate": 0.00011164743551017942, + "loss": 2.4678, + "step": 14630 + }, + { + "epoch": 1.3255418903309097, + "grad_norm": 0.8613524436950684, + "learning_rate": 0.00011164139430918868, + "loss": 2.6418, + "step": 14631 + }, + { + "epoch": 1.3256324885053568, + "grad_norm": 0.971595823764801, + "learning_rate": 0.0001116353531081979, + "loss": 2.7077, + "step": 14632 + }, + { + "epoch": 1.3257230866798033, + "grad_norm": 0.8751730918884277, + "learning_rate": 0.00011162931190720716, + "loss": 2.6004, + "step": 14633 + }, + { + "epoch": 1.32581368485425, + "grad_norm": 0.9041839838027954, + "learning_rate": 0.00011162327070621641, + "loss": 2.8263, + "step": 14634 + }, + { + "epoch": 1.325904283028697, + "grad_norm": 0.8256960511207581, + "learning_rate": 0.00011161722950522564, + "loss": 1.9463, + "step": 14635 + }, + { + "epoch": 1.3259948812031437, + "grad_norm": 0.9209835529327393, + "learning_rate": 0.00011161118830423489, + "loss": 3.0551, + "step": 14636 + }, + { + "epoch": 1.3260854793775905, + "grad_norm": 0.8891535997390747, + "learning_rate": 0.00011160514710324412, + "loss": 2.7864, + "step": 14637 + }, + { + "epoch": 1.3261760775520373, + "grad_norm": 0.9620514512062073, + "learning_rate": 0.00011159910590225337, + "loss": 2.7679, + "step": 14638 + }, + { + "epoch": 1.326266675726484, + "grad_norm": 0.8227847814559937, + "learning_rate": 0.0001115930647012626, + "loss": 2.3237, + "step": 14639 + }, + { + "epoch": 1.3263572739009308, + "grad_norm": 0.9478861093521118, + "learning_rate": 0.00011158702350027187, + "loss": 2.6048, + "step": 14640 + }, + { + "epoch": 1.3264478720753776, + "grad_norm": 0.8699279427528381, + "learning_rate": 0.00011158098229928111, + "loss": 2.1924, + "step": 14641 + }, + { + "epoch": 1.3265384702498244, + "grad_norm": 0.9397291541099548, + "learning_rate": 0.00011157494109829035, + "loss": 2.7024, + "step": 14642 + }, + { + "epoch": 1.3266290684242712, + "grad_norm": 0.9310387969017029, + "learning_rate": 0.0001115688998972996, + "loss": 2.7866, + "step": 14643 + }, + { + "epoch": 1.326719666598718, + "grad_norm": 0.992043673992157, + "learning_rate": 0.00011156285869630883, + "loss": 2.7003, + "step": 14644 + }, + { + "epoch": 1.3268102647731648, + "grad_norm": 0.8836283087730408, + "learning_rate": 0.00011155681749531808, + "loss": 2.7504, + "step": 14645 + }, + { + "epoch": 1.3269008629476116, + "grad_norm": 0.9048947095870972, + "learning_rate": 0.00011155077629432731, + "loss": 2.7821, + "step": 14646 + }, + { + "epoch": 1.3269914611220583, + "grad_norm": 0.8981366753578186, + "learning_rate": 0.00011154473509333656, + "loss": 2.8604, + "step": 14647 + }, + { + "epoch": 1.3270820592965051, + "grad_norm": 0.9549999237060547, + "learning_rate": 0.00011153869389234582, + "loss": 2.6649, + "step": 14648 + }, + { + "epoch": 1.327172657470952, + "grad_norm": 0.9929032921791077, + "learning_rate": 0.00011153265269135504, + "loss": 2.7988, + "step": 14649 + }, + { + "epoch": 1.3272632556453987, + "grad_norm": 0.8948190808296204, + "learning_rate": 0.0001115266114903643, + "loss": 2.8581, + "step": 14650 + }, + { + "epoch": 1.3273538538198455, + "grad_norm": 0.8795786499977112, + "learning_rate": 0.00011152057028937352, + "loss": 2.7797, + "step": 14651 + }, + { + "epoch": 1.3274444519942923, + "grad_norm": 0.8849783539772034, + "learning_rate": 0.00011151452908838278, + "loss": 2.7389, + "step": 14652 + }, + { + "epoch": 1.327535050168739, + "grad_norm": 0.890766441822052, + "learning_rate": 0.00011150848788739202, + "loss": 2.7715, + "step": 14653 + }, + { + "epoch": 1.3276256483431859, + "grad_norm": 0.9177873730659485, + "learning_rate": 0.00011150244668640126, + "loss": 2.7972, + "step": 14654 + }, + { + "epoch": 1.3277162465176326, + "grad_norm": 0.8746382594108582, + "learning_rate": 0.00011149640548541051, + "loss": 2.6501, + "step": 14655 + }, + { + "epoch": 1.3278068446920794, + "grad_norm": 0.9315887093544006, + "learning_rate": 0.00011149036428441975, + "loss": 2.59, + "step": 14656 + }, + { + "epoch": 1.3278974428665262, + "grad_norm": 0.7839439511299133, + "learning_rate": 0.000111484323083429, + "loss": 2.1673, + "step": 14657 + }, + { + "epoch": 1.327988041040973, + "grad_norm": 0.9450238347053528, + "learning_rate": 0.00011147828188243823, + "loss": 2.7861, + "step": 14658 + }, + { + "epoch": 1.3280786392154198, + "grad_norm": 0.8891294002532959, + "learning_rate": 0.00011147224068144748, + "loss": 2.7819, + "step": 14659 + }, + { + "epoch": 1.3281692373898666, + "grad_norm": 0.935945451259613, + "learning_rate": 0.00011146619948045671, + "loss": 2.6641, + "step": 14660 + }, + { + "epoch": 1.3282598355643134, + "grad_norm": 0.8506149053573608, + "learning_rate": 0.00011146015827946597, + "loss": 2.5287, + "step": 14661 + }, + { + "epoch": 1.3283504337387602, + "grad_norm": 1.0173264741897583, + "learning_rate": 0.00011145411707847519, + "loss": 2.6808, + "step": 14662 + }, + { + "epoch": 1.328441031913207, + "grad_norm": 0.8307505249977112, + "learning_rate": 0.00011144807587748445, + "loss": 1.9026, + "step": 14663 + }, + { + "epoch": 1.3285316300876537, + "grad_norm": 0.8784999847412109, + "learning_rate": 0.0001114420346764937, + "loss": 2.6059, + "step": 14664 + }, + { + "epoch": 1.3286222282621005, + "grad_norm": 0.9411612153053284, + "learning_rate": 0.00011143599347550293, + "loss": 2.6322, + "step": 14665 + }, + { + "epoch": 1.3287128264365473, + "grad_norm": 0.8584591150283813, + "learning_rate": 0.00011142995227451218, + "loss": 2.8023, + "step": 14666 + }, + { + "epoch": 1.328803424610994, + "grad_norm": 0.8731266856193542, + "learning_rate": 0.00011142391107352142, + "loss": 2.7339, + "step": 14667 + }, + { + "epoch": 1.3288940227854409, + "grad_norm": 0.9491941332817078, + "learning_rate": 0.00011141786987253066, + "loss": 2.731, + "step": 14668 + }, + { + "epoch": 1.3289846209598877, + "grad_norm": 1.037009835243225, + "learning_rate": 0.0001114118286715399, + "loss": 2.7921, + "step": 14669 + }, + { + "epoch": 1.3290752191343345, + "grad_norm": 0.8945019245147705, + "learning_rate": 0.00011140578747054914, + "loss": 2.8, + "step": 14670 + }, + { + "epoch": 1.3291658173087813, + "grad_norm": 0.8617443442344666, + "learning_rate": 0.0001113997462695584, + "loss": 2.8848, + "step": 14671 + }, + { + "epoch": 1.329256415483228, + "grad_norm": 1.0325307846069336, + "learning_rate": 0.00011139370506856764, + "loss": 2.6882, + "step": 14672 + }, + { + "epoch": 1.3293470136576748, + "grad_norm": 0.8962110280990601, + "learning_rate": 0.00011138766386757689, + "loss": 2.7602, + "step": 14673 + }, + { + "epoch": 1.3294376118321216, + "grad_norm": 0.8494322299957275, + "learning_rate": 0.00011138162266658612, + "loss": 2.6245, + "step": 14674 + }, + { + "epoch": 1.3295282100065684, + "grad_norm": 0.9649009108543396, + "learning_rate": 0.00011137558146559537, + "loss": 2.7613, + "step": 14675 + }, + { + "epoch": 1.3296188081810152, + "grad_norm": 0.9414894580841064, + "learning_rate": 0.0001113695402646046, + "loss": 2.6406, + "step": 14676 + }, + { + "epoch": 1.329709406355462, + "grad_norm": 0.8845420479774475, + "learning_rate": 0.00011136349906361385, + "loss": 2.721, + "step": 14677 + }, + { + "epoch": 1.3298000045299088, + "grad_norm": 0.947062075138092, + "learning_rate": 0.0001113574578626231, + "loss": 2.7324, + "step": 14678 + }, + { + "epoch": 1.3298906027043556, + "grad_norm": 0.8942372798919678, + "learning_rate": 0.00011135141666163233, + "loss": 2.7836, + "step": 14679 + }, + { + "epoch": 1.3299812008788023, + "grad_norm": 0.8720940351486206, + "learning_rate": 0.0001113453754606416, + "loss": 2.6794, + "step": 14680 + }, + { + "epoch": 1.3300717990532491, + "grad_norm": 0.9436805248260498, + "learning_rate": 0.00011133933425965081, + "loss": 2.8267, + "step": 14681 + }, + { + "epoch": 1.330162397227696, + "grad_norm": 0.7404989004135132, + "learning_rate": 0.00011133329305866008, + "loss": 1.789, + "step": 14682 + }, + { + "epoch": 1.3302529954021427, + "grad_norm": 0.849473237991333, + "learning_rate": 0.0001113272518576693, + "loss": 2.6972, + "step": 14683 + }, + { + "epoch": 1.3303435935765895, + "grad_norm": 0.8588374257087708, + "learning_rate": 0.00011132121065667856, + "loss": 2.748, + "step": 14684 + }, + { + "epoch": 1.3304341917510363, + "grad_norm": 0.882332444190979, + "learning_rate": 0.0001113151694556878, + "loss": 2.8115, + "step": 14685 + }, + { + "epoch": 1.330524789925483, + "grad_norm": 0.9087085723876953, + "learning_rate": 0.00011130912825469704, + "loss": 2.8464, + "step": 14686 + }, + { + "epoch": 1.3306153880999299, + "grad_norm": 1.0153979063034058, + "learning_rate": 0.00011130308705370629, + "loss": 2.4642, + "step": 14687 + }, + { + "epoch": 1.3307059862743766, + "grad_norm": 0.7360895872116089, + "learning_rate": 0.00011129704585271552, + "loss": 2.1668, + "step": 14688 + }, + { + "epoch": 1.3307965844488234, + "grad_norm": 0.9196758270263672, + "learning_rate": 0.00011129100465172477, + "loss": 2.7183, + "step": 14689 + }, + { + "epoch": 1.3308871826232702, + "grad_norm": 0.8692601323127747, + "learning_rate": 0.000111284963450734, + "loss": 2.5722, + "step": 14690 + }, + { + "epoch": 1.330977780797717, + "grad_norm": 0.9392343163490295, + "learning_rate": 0.00011127892224974325, + "loss": 2.6422, + "step": 14691 + }, + { + "epoch": 1.3310683789721638, + "grad_norm": 1.0061073303222656, + "learning_rate": 0.00011127288104875248, + "loss": 2.7697, + "step": 14692 + }, + { + "epoch": 1.3311589771466106, + "grad_norm": 0.8369057178497314, + "learning_rate": 0.00011126683984776174, + "loss": 2.5584, + "step": 14693 + }, + { + "epoch": 1.3312495753210571, + "grad_norm": 0.8928407430648804, + "learning_rate": 0.00011126079864677099, + "loss": 2.7453, + "step": 14694 + }, + { + "epoch": 1.3313401734955042, + "grad_norm": 0.8526977300643921, + "learning_rate": 0.00011125475744578023, + "loss": 2.6169, + "step": 14695 + }, + { + "epoch": 1.3314307716699507, + "grad_norm": 0.8586994409561157, + "learning_rate": 0.00011124871624478947, + "loss": 2.3859, + "step": 14696 + }, + { + "epoch": 1.3315213698443977, + "grad_norm": 0.9385809898376465, + "learning_rate": 0.00011124267504379871, + "loss": 3.0265, + "step": 14697 + }, + { + "epoch": 1.3316119680188443, + "grad_norm": 0.8180225491523743, + "learning_rate": 0.00011123663384280796, + "loss": 2.6966, + "step": 14698 + }, + { + "epoch": 1.3317025661932913, + "grad_norm": 0.8346174359321594, + "learning_rate": 0.00011123059264181719, + "loss": 2.4942, + "step": 14699 + }, + { + "epoch": 1.3317931643677379, + "grad_norm": 0.9425394535064697, + "learning_rate": 0.00011122455144082644, + "loss": 2.9666, + "step": 14700 + }, + { + "epoch": 1.3318837625421849, + "grad_norm": 0.8831472992897034, + "learning_rate": 0.0001112185102398357, + "loss": 2.7335, + "step": 14701 + }, + { + "epoch": 1.3319743607166314, + "grad_norm": 0.9352693557739258, + "learning_rate": 0.00011121246903884492, + "loss": 2.6905, + "step": 14702 + }, + { + "epoch": 1.3320649588910785, + "grad_norm": 0.7146704792976379, + "learning_rate": 0.00011120642783785418, + "loss": 1.8547, + "step": 14703 + }, + { + "epoch": 1.332155557065525, + "grad_norm": 0.8576149940490723, + "learning_rate": 0.00011120038663686341, + "loss": 2.3753, + "step": 14704 + }, + { + "epoch": 1.332246155239972, + "grad_norm": 0.9059699177742004, + "learning_rate": 0.00011119434543587266, + "loss": 2.6456, + "step": 14705 + }, + { + "epoch": 1.3323367534144186, + "grad_norm": 0.8829584717750549, + "learning_rate": 0.0001111883042348819, + "loss": 2.5002, + "step": 14706 + }, + { + "epoch": 1.3324273515888656, + "grad_norm": 0.8824552893638611, + "learning_rate": 0.00011118226303389114, + "loss": 2.8316, + "step": 14707 + }, + { + "epoch": 1.3325179497633122, + "grad_norm": 0.9517566561698914, + "learning_rate": 0.00011117622183290039, + "loss": 2.6461, + "step": 14708 + }, + { + "epoch": 1.3326085479377592, + "grad_norm": 0.933858335018158, + "learning_rate": 0.00011117018063190962, + "loss": 2.731, + "step": 14709 + }, + { + "epoch": 1.3326991461122057, + "grad_norm": 0.8362012505531311, + "learning_rate": 0.00011116413943091887, + "loss": 2.5978, + "step": 14710 + }, + { + "epoch": 1.3327897442866528, + "grad_norm": 0.9164485335350037, + "learning_rate": 0.0001111580982299281, + "loss": 2.9429, + "step": 14711 + }, + { + "epoch": 1.3328803424610993, + "grad_norm": 0.9387895464897156, + "learning_rate": 0.00011115205702893737, + "loss": 2.6541, + "step": 14712 + }, + { + "epoch": 1.3329709406355463, + "grad_norm": 0.9081547856330872, + "learning_rate": 0.00011114601582794659, + "loss": 2.5173, + "step": 14713 + }, + { + "epoch": 1.333061538809993, + "grad_norm": 0.9019197821617126, + "learning_rate": 0.00011113997462695585, + "loss": 2.7231, + "step": 14714 + }, + { + "epoch": 1.3331521369844397, + "grad_norm": 0.948771595954895, + "learning_rate": 0.0001111339334259651, + "loss": 2.6277, + "step": 14715 + }, + { + "epoch": 1.3332427351588865, + "grad_norm": 0.9328406453132629, + "learning_rate": 0.00011112789222497433, + "loss": 2.8799, + "step": 14716 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 0.8493534326553345, + "learning_rate": 0.00011112185102398358, + "loss": 2.4735, + "step": 14717 + }, + { + "epoch": 1.33342393150778, + "grad_norm": 0.8652331829071045, + "learning_rate": 0.00011111580982299281, + "loss": 2.6135, + "step": 14718 + }, + { + "epoch": 1.3335145296822268, + "grad_norm": 0.9385383725166321, + "learning_rate": 0.00011110976862200206, + "loss": 2.7113, + "step": 14719 + }, + { + "epoch": 1.3336051278566736, + "grad_norm": 0.8729870319366455, + "learning_rate": 0.0001111037274210113, + "loss": 2.7823, + "step": 14720 + }, + { + "epoch": 1.3336957260311204, + "grad_norm": 0.7606920599937439, + "learning_rate": 0.00011109768622002054, + "loss": 1.9648, + "step": 14721 + }, + { + "epoch": 1.3337863242055672, + "grad_norm": 0.8862031102180481, + "learning_rate": 0.00011109164501902978, + "loss": 2.8263, + "step": 14722 + }, + { + "epoch": 1.333876922380014, + "grad_norm": 0.7765184640884399, + "learning_rate": 0.00011108560381803902, + "loss": 2.0419, + "step": 14723 + }, + { + "epoch": 1.3339675205544608, + "grad_norm": 0.9109630584716797, + "learning_rate": 0.00011107956261704828, + "loss": 2.6115, + "step": 14724 + }, + { + "epoch": 1.3340581187289076, + "grad_norm": 0.8472123742103577, + "learning_rate": 0.00011107352141605752, + "loss": 2.6223, + "step": 14725 + }, + { + "epoch": 1.3341487169033543, + "grad_norm": 0.8516530990600586, + "learning_rate": 0.00011106748021506677, + "loss": 2.5297, + "step": 14726 + }, + { + "epoch": 1.3342393150778011, + "grad_norm": 0.923435628414154, + "learning_rate": 0.000111061439014076, + "loss": 2.633, + "step": 14727 + }, + { + "epoch": 1.334329913252248, + "grad_norm": 0.8588363528251648, + "learning_rate": 0.00011105539781308525, + "loss": 2.7159, + "step": 14728 + }, + { + "epoch": 1.3344205114266947, + "grad_norm": 0.9210296869277954, + "learning_rate": 0.00011104935661209448, + "loss": 2.8887, + "step": 14729 + }, + { + "epoch": 1.3345111096011415, + "grad_norm": 1.0418058633804321, + "learning_rate": 0.00011104331541110373, + "loss": 2.6777, + "step": 14730 + }, + { + "epoch": 1.3346017077755883, + "grad_norm": 0.9228699803352356, + "learning_rate": 0.00011103727421011299, + "loss": 2.7087, + "step": 14731 + }, + { + "epoch": 1.334692305950035, + "grad_norm": 0.9226989150047302, + "learning_rate": 0.00011103123300912221, + "loss": 2.8759, + "step": 14732 + }, + { + "epoch": 1.3347829041244819, + "grad_norm": 0.6329612135887146, + "learning_rate": 0.00011102519180813147, + "loss": 1.3041, + "step": 14733 + }, + { + "epoch": 1.3348735022989286, + "grad_norm": 0.9464155435562134, + "learning_rate": 0.00011101915060714069, + "loss": 2.794, + "step": 14734 + }, + { + "epoch": 1.3349641004733754, + "grad_norm": 0.8913255333900452, + "learning_rate": 0.00011101310940614995, + "loss": 2.7135, + "step": 14735 + }, + { + "epoch": 1.3350546986478222, + "grad_norm": 0.9965246915817261, + "learning_rate": 0.00011100706820515917, + "loss": 2.7762, + "step": 14736 + }, + { + "epoch": 1.335145296822269, + "grad_norm": 0.8834999203681946, + "learning_rate": 0.00011100102700416844, + "loss": 2.7515, + "step": 14737 + }, + { + "epoch": 1.3352358949967158, + "grad_norm": 0.9063661098480225, + "learning_rate": 0.00011099498580317768, + "loss": 2.6257, + "step": 14738 + }, + { + "epoch": 1.3353264931711626, + "grad_norm": 0.9358882904052734, + "learning_rate": 0.00011098894460218692, + "loss": 2.6586, + "step": 14739 + }, + { + "epoch": 1.3354170913456094, + "grad_norm": 0.8575732111930847, + "learning_rate": 0.00011098290340119617, + "loss": 2.6412, + "step": 14740 + }, + { + "epoch": 1.3355076895200562, + "grad_norm": 0.9158674478530884, + "learning_rate": 0.0001109768622002054, + "loss": 2.6313, + "step": 14741 + }, + { + "epoch": 1.335598287694503, + "grad_norm": 0.9187011122703552, + "learning_rate": 0.00011097082099921465, + "loss": 2.6078, + "step": 14742 + }, + { + "epoch": 1.3356888858689497, + "grad_norm": 0.7843091487884521, + "learning_rate": 0.00011096477979822388, + "loss": 2.0405, + "step": 14743 + }, + { + "epoch": 1.3357794840433965, + "grad_norm": 0.864300549030304, + "learning_rate": 0.00011095873859723314, + "loss": 2.4902, + "step": 14744 + }, + { + "epoch": 1.3358700822178433, + "grad_norm": 0.8535115122795105, + "learning_rate": 0.00011095269739624239, + "loss": 2.6919, + "step": 14745 + }, + { + "epoch": 1.33596068039229, + "grad_norm": 0.7969582676887512, + "learning_rate": 0.00011094665619525162, + "loss": 2.4934, + "step": 14746 + }, + { + "epoch": 1.3360512785667369, + "grad_norm": 0.8824692964553833, + "learning_rate": 0.00011094061499426087, + "loss": 2.8237, + "step": 14747 + }, + { + "epoch": 1.3361418767411837, + "grad_norm": 0.9236248135566711, + "learning_rate": 0.0001109345737932701, + "loss": 2.7204, + "step": 14748 + }, + { + "epoch": 1.3362324749156305, + "grad_norm": 0.9295632839202881, + "learning_rate": 0.00011092853259227935, + "loss": 2.8006, + "step": 14749 + }, + { + "epoch": 1.3363230730900773, + "grad_norm": 0.9304175972938538, + "learning_rate": 0.00011092249139128859, + "loss": 2.5158, + "step": 14750 + }, + { + "epoch": 1.336413671264524, + "grad_norm": 0.8635604381561279, + "learning_rate": 0.00011091645019029783, + "loss": 2.1045, + "step": 14751 + }, + { + "epoch": 1.3365042694389708, + "grad_norm": 1.033747911453247, + "learning_rate": 0.00011091040898930707, + "loss": 2.3835, + "step": 14752 + }, + { + "epoch": 1.3365948676134176, + "grad_norm": 0.8962675333023071, + "learning_rate": 0.00011090436778831632, + "loss": 2.6693, + "step": 14753 + }, + { + "epoch": 1.3366854657878644, + "grad_norm": 0.9267081022262573, + "learning_rate": 0.00011089832658732558, + "loss": 2.6496, + "step": 14754 + }, + { + "epoch": 1.3367760639623112, + "grad_norm": 0.909625232219696, + "learning_rate": 0.0001108922853863348, + "loss": 2.7364, + "step": 14755 + }, + { + "epoch": 1.336866662136758, + "grad_norm": 1.1238932609558105, + "learning_rate": 0.00011088624418534406, + "loss": 2.5852, + "step": 14756 + }, + { + "epoch": 1.3369572603112048, + "grad_norm": 0.8407071828842163, + "learning_rate": 0.00011088020298435329, + "loss": 2.6865, + "step": 14757 + }, + { + "epoch": 1.3370478584856516, + "grad_norm": 0.9228849411010742, + "learning_rate": 0.00011087416178336254, + "loss": 2.7967, + "step": 14758 + }, + { + "epoch": 1.3371384566600983, + "grad_norm": 0.9504418969154358, + "learning_rate": 0.00011086812058237177, + "loss": 2.7682, + "step": 14759 + }, + { + "epoch": 1.3372290548345451, + "grad_norm": 0.9132466316223145, + "learning_rate": 0.00011086207938138102, + "loss": 2.9982, + "step": 14760 + }, + { + "epoch": 1.337319653008992, + "grad_norm": 0.8620052933692932, + "learning_rate": 0.00011085603818039027, + "loss": 2.7923, + "step": 14761 + }, + { + "epoch": 1.3374102511834387, + "grad_norm": 0.864395022392273, + "learning_rate": 0.0001108499969793995, + "loss": 2.8096, + "step": 14762 + }, + { + "epoch": 1.3375008493578855, + "grad_norm": 0.9185915589332581, + "learning_rate": 0.00011084395577840877, + "loss": 2.7563, + "step": 14763 + }, + { + "epoch": 1.3375914475323323, + "grad_norm": 0.8857190012931824, + "learning_rate": 0.00011083791457741799, + "loss": 2.644, + "step": 14764 + }, + { + "epoch": 1.337682045706779, + "grad_norm": 0.8919821977615356, + "learning_rate": 0.00011083187337642725, + "loss": 2.7998, + "step": 14765 + }, + { + "epoch": 1.3377726438812259, + "grad_norm": 0.9825933575630188, + "learning_rate": 0.00011082583217543647, + "loss": 2.586, + "step": 14766 + }, + { + "epoch": 1.3378632420556726, + "grad_norm": 0.7245538830757141, + "learning_rate": 0.00011081979097444573, + "loss": 2.1107, + "step": 14767 + }, + { + "epoch": 1.3379538402301194, + "grad_norm": 0.8514480590820312, + "learning_rate": 0.00011081374977345498, + "loss": 2.4772, + "step": 14768 + }, + { + "epoch": 1.3380444384045662, + "grad_norm": 0.8682839274406433, + "learning_rate": 0.00011080770857246421, + "loss": 2.7967, + "step": 14769 + }, + { + "epoch": 1.338135036579013, + "grad_norm": 0.8552483320236206, + "learning_rate": 0.00011080166737147346, + "loss": 2.7006, + "step": 14770 + }, + { + "epoch": 1.3382256347534598, + "grad_norm": 0.9357792735099792, + "learning_rate": 0.00011079562617048269, + "loss": 2.7801, + "step": 14771 + }, + { + "epoch": 1.3383162329279066, + "grad_norm": 0.8490120768547058, + "learning_rate": 0.00011078958496949194, + "loss": 2.7517, + "step": 14772 + }, + { + "epoch": 1.3384068311023534, + "grad_norm": 0.901203989982605, + "learning_rate": 0.00011078354376850117, + "loss": 2.8479, + "step": 14773 + }, + { + "epoch": 1.3384974292768002, + "grad_norm": 0.9150070548057556, + "learning_rate": 0.00011077750256751042, + "loss": 2.8146, + "step": 14774 + }, + { + "epoch": 1.3385880274512467, + "grad_norm": 0.8883455395698547, + "learning_rate": 0.00011077146136651968, + "loss": 2.7258, + "step": 14775 + }, + { + "epoch": 1.3386786256256937, + "grad_norm": 0.9357127547264099, + "learning_rate": 0.00011076542016552892, + "loss": 2.5629, + "step": 14776 + }, + { + "epoch": 1.3387692238001403, + "grad_norm": 0.8463937640190125, + "learning_rate": 0.00011075937896453816, + "loss": 2.6375, + "step": 14777 + }, + { + "epoch": 1.3388598219745873, + "grad_norm": 0.8631737232208252, + "learning_rate": 0.0001107533377635474, + "loss": 2.5428, + "step": 14778 + }, + { + "epoch": 1.3389504201490339, + "grad_norm": 0.8693298101425171, + "learning_rate": 0.00011074729656255665, + "loss": 2.97, + "step": 14779 + }, + { + "epoch": 1.3390410183234809, + "grad_norm": 0.9178970456123352, + "learning_rate": 0.00011074125536156588, + "loss": 2.5895, + "step": 14780 + }, + { + "epoch": 1.3391316164979274, + "grad_norm": 0.9006129503250122, + "learning_rate": 0.00011073521416057513, + "loss": 2.8868, + "step": 14781 + }, + { + "epoch": 1.3392222146723745, + "grad_norm": 0.8753697872161865, + "learning_rate": 0.00011072917295958436, + "loss": 2.5483, + "step": 14782 + }, + { + "epoch": 1.339312812846821, + "grad_norm": 0.9149692058563232, + "learning_rate": 0.00011072313175859361, + "loss": 2.6773, + "step": 14783 + }, + { + "epoch": 1.339403411021268, + "grad_norm": 0.9284220933914185, + "learning_rate": 0.00011071709055760287, + "loss": 2.879, + "step": 14784 + }, + { + "epoch": 1.3394940091957146, + "grad_norm": 0.9455713629722595, + "learning_rate": 0.00011071104935661209, + "loss": 3.3628, + "step": 14785 + }, + { + "epoch": 1.3395846073701616, + "grad_norm": 0.8821995258331299, + "learning_rate": 0.00011070500815562135, + "loss": 2.4824, + "step": 14786 + }, + { + "epoch": 1.3396752055446082, + "grad_norm": 0.9729750752449036, + "learning_rate": 0.00011069896695463057, + "loss": 2.9913, + "step": 14787 + }, + { + "epoch": 1.3397658037190552, + "grad_norm": 0.9185976982116699, + "learning_rate": 0.00011069292575363983, + "loss": 2.7967, + "step": 14788 + }, + { + "epoch": 1.3398564018935017, + "grad_norm": 0.9012705087661743, + "learning_rate": 0.00011068688455264907, + "loss": 2.7252, + "step": 14789 + }, + { + "epoch": 1.3399470000679488, + "grad_norm": 0.8465925455093384, + "learning_rate": 0.00011068084335165832, + "loss": 2.0242, + "step": 14790 + }, + { + "epoch": 1.3400375982423953, + "grad_norm": 0.8592820763587952, + "learning_rate": 0.00011067480215066756, + "loss": 2.7236, + "step": 14791 + }, + { + "epoch": 1.3401281964168423, + "grad_norm": 0.8700540661811829, + "learning_rate": 0.0001106687609496768, + "loss": 2.5103, + "step": 14792 + }, + { + "epoch": 1.340218794591289, + "grad_norm": 1.0845316648483276, + "learning_rate": 0.00011066271974868604, + "loss": 2.7018, + "step": 14793 + }, + { + "epoch": 1.340309392765736, + "grad_norm": 0.9222479462623596, + "learning_rate": 0.00011065667854769528, + "loss": 2.6694, + "step": 14794 + }, + { + "epoch": 1.3403999909401825, + "grad_norm": 0.9294751286506653, + "learning_rate": 0.00011065063734670454, + "loss": 2.3926, + "step": 14795 + }, + { + "epoch": 1.3404905891146293, + "grad_norm": 0.9667559266090393, + "learning_rate": 0.00011064459614571376, + "loss": 2.8536, + "step": 14796 + }, + { + "epoch": 1.340581187289076, + "grad_norm": 0.8506504893302917, + "learning_rate": 0.00011063855494472302, + "loss": 2.6574, + "step": 14797 + }, + { + "epoch": 1.3406717854635228, + "grad_norm": 0.9533654451370239, + "learning_rate": 0.00011063251374373227, + "loss": 2.7272, + "step": 14798 + }, + { + "epoch": 1.3407623836379696, + "grad_norm": 0.8858861327171326, + "learning_rate": 0.0001106264725427415, + "loss": 2.9702, + "step": 14799 + }, + { + "epoch": 1.3408529818124164, + "grad_norm": 0.8781200647354126, + "learning_rate": 0.00011062043134175075, + "loss": 2.6323, + "step": 14800 + }, + { + "epoch": 1.3409435799868632, + "grad_norm": 0.8239489793777466, + "learning_rate": 0.00011061439014075998, + "loss": 2.0345, + "step": 14801 + }, + { + "epoch": 1.34103417816131, + "grad_norm": 0.9246968030929565, + "learning_rate": 0.00011060834893976923, + "loss": 2.7861, + "step": 14802 + }, + { + "epoch": 1.3411247763357568, + "grad_norm": 0.8560213446617126, + "learning_rate": 0.00011060230773877847, + "loss": 2.5332, + "step": 14803 + }, + { + "epoch": 1.3412153745102036, + "grad_norm": 0.8695186376571655, + "learning_rate": 0.00011059626653778771, + "loss": 2.7161, + "step": 14804 + }, + { + "epoch": 1.3413059726846503, + "grad_norm": 0.8381935954093933, + "learning_rate": 0.00011059022533679697, + "loss": 2.7425, + "step": 14805 + }, + { + "epoch": 1.3413965708590971, + "grad_norm": 0.9610893130302429, + "learning_rate": 0.0001105841841358062, + "loss": 2.9054, + "step": 14806 + }, + { + "epoch": 1.341487169033544, + "grad_norm": 0.8834319710731506, + "learning_rate": 0.00011057814293481546, + "loss": 2.688, + "step": 14807 + }, + { + "epoch": 1.3415777672079907, + "grad_norm": 0.8709170818328857, + "learning_rate": 0.00011057210173382469, + "loss": 1.9956, + "step": 14808 + }, + { + "epoch": 1.3416683653824375, + "grad_norm": 0.8212926983833313, + "learning_rate": 0.00011056606053283394, + "loss": 2.6097, + "step": 14809 + }, + { + "epoch": 1.3417589635568843, + "grad_norm": 0.9289710521697998, + "learning_rate": 0.00011056001933184317, + "loss": 2.4755, + "step": 14810 + }, + { + "epoch": 1.341849561731331, + "grad_norm": 0.8480867743492126, + "learning_rate": 0.00011055397813085242, + "loss": 2.7515, + "step": 14811 + }, + { + "epoch": 1.3419401599057779, + "grad_norm": 0.9713585376739502, + "learning_rate": 0.00011054793692986165, + "loss": 2.5871, + "step": 14812 + }, + { + "epoch": 1.3420307580802247, + "grad_norm": 0.8212472796440125, + "learning_rate": 0.0001105418957288709, + "loss": 2.4292, + "step": 14813 + }, + { + "epoch": 1.3421213562546714, + "grad_norm": 0.9309350252151489, + "learning_rate": 0.00011053585452788016, + "loss": 2.9762, + "step": 14814 + }, + { + "epoch": 1.3422119544291182, + "grad_norm": 0.8594279885292053, + "learning_rate": 0.00011052981332688938, + "loss": 2.5715, + "step": 14815 + }, + { + "epoch": 1.342302552603565, + "grad_norm": 0.7951178550720215, + "learning_rate": 0.00011052377212589864, + "loss": 1.8744, + "step": 14816 + }, + { + "epoch": 1.3423931507780118, + "grad_norm": 0.8807086944580078, + "learning_rate": 0.00011051773092490786, + "loss": 2.7455, + "step": 14817 + }, + { + "epoch": 1.3424837489524586, + "grad_norm": 0.8752193450927734, + "learning_rate": 0.00011051168972391713, + "loss": 2.4158, + "step": 14818 + }, + { + "epoch": 1.3425743471269054, + "grad_norm": 0.9696388840675354, + "learning_rate": 0.00011050564852292635, + "loss": 2.7815, + "step": 14819 + }, + { + "epoch": 1.3426649453013522, + "grad_norm": 0.8976157307624817, + "learning_rate": 0.00011049960732193561, + "loss": 2.8498, + "step": 14820 + }, + { + "epoch": 1.342755543475799, + "grad_norm": 0.916419267654419, + "learning_rate": 0.00011049356612094486, + "loss": 2.9421, + "step": 14821 + }, + { + "epoch": 1.3428461416502457, + "grad_norm": 0.8948194980621338, + "learning_rate": 0.00011048752491995409, + "loss": 2.6649, + "step": 14822 + }, + { + "epoch": 1.3429367398246925, + "grad_norm": 0.6426242589950562, + "learning_rate": 0.00011048148371896334, + "loss": 1.2886, + "step": 14823 + }, + { + "epoch": 1.3430273379991393, + "grad_norm": 0.8518858551979065, + "learning_rate": 0.00011047544251797257, + "loss": 2.543, + "step": 14824 + }, + { + "epoch": 1.343117936173586, + "grad_norm": 0.9394737482070923, + "learning_rate": 0.00011046940131698182, + "loss": 2.6899, + "step": 14825 + }, + { + "epoch": 1.343208534348033, + "grad_norm": 0.9159179329872131, + "learning_rate": 0.00011046336011599105, + "loss": 2.746, + "step": 14826 + }, + { + "epoch": 1.3432991325224797, + "grad_norm": 0.8797913789749146, + "learning_rate": 0.00011045731891500031, + "loss": 2.6366, + "step": 14827 + }, + { + "epoch": 1.3433897306969265, + "grad_norm": 0.8707519769668579, + "learning_rate": 0.00011045127771400956, + "loss": 2.5227, + "step": 14828 + }, + { + "epoch": 1.3434803288713733, + "grad_norm": 0.8401369452476501, + "learning_rate": 0.0001104452365130188, + "loss": 2.6309, + "step": 14829 + }, + { + "epoch": 1.34357092704582, + "grad_norm": 1.033834457397461, + "learning_rate": 0.00011043919531202804, + "loss": 3.0496, + "step": 14830 + }, + { + "epoch": 1.3436615252202668, + "grad_norm": 0.9442974328994751, + "learning_rate": 0.00011043315411103728, + "loss": 2.4228, + "step": 14831 + }, + { + "epoch": 1.3437521233947136, + "grad_norm": 1.0223500728607178, + "learning_rate": 0.00011042711291004652, + "loss": 2.8121, + "step": 14832 + }, + { + "epoch": 1.3438427215691604, + "grad_norm": 0.8682804703712463, + "learning_rate": 0.00011042107170905576, + "loss": 2.7497, + "step": 14833 + }, + { + "epoch": 1.3439333197436072, + "grad_norm": 0.9477080702781677, + "learning_rate": 0.000110415030508065, + "loss": 2.7156, + "step": 14834 + }, + { + "epoch": 1.344023917918054, + "grad_norm": 0.9078654050827026, + "learning_rate": 0.00011040898930707427, + "loss": 2.5494, + "step": 14835 + }, + { + "epoch": 1.3441145160925008, + "grad_norm": 0.8731060028076172, + "learning_rate": 0.00011040294810608349, + "loss": 2.8041, + "step": 14836 + }, + { + "epoch": 1.3442051142669476, + "grad_norm": 0.8965889811515808, + "learning_rate": 0.00011039690690509275, + "loss": 2.6133, + "step": 14837 + }, + { + "epoch": 1.3442957124413943, + "grad_norm": 0.9106099009513855, + "learning_rate": 0.00011039086570410197, + "loss": 2.7118, + "step": 14838 + }, + { + "epoch": 1.3443863106158411, + "grad_norm": 0.7299804091453552, + "learning_rate": 0.00011038482450311123, + "loss": 2.1289, + "step": 14839 + }, + { + "epoch": 1.344476908790288, + "grad_norm": 0.862583339214325, + "learning_rate": 0.00011037878330212046, + "loss": 2.7535, + "step": 14840 + }, + { + "epoch": 1.3445675069647347, + "grad_norm": 0.8748536109924316, + "learning_rate": 0.00011037274210112971, + "loss": 2.6111, + "step": 14841 + }, + { + "epoch": 1.3446581051391815, + "grad_norm": 1.1560038328170776, + "learning_rate": 0.00011036670090013895, + "loss": 2.7072, + "step": 14842 + }, + { + "epoch": 1.3447487033136283, + "grad_norm": 0.8705942034721375, + "learning_rate": 0.0001103606596991482, + "loss": 2.6787, + "step": 14843 + }, + { + "epoch": 1.344839301488075, + "grad_norm": 0.8039339780807495, + "learning_rate": 0.00011035461849815744, + "loss": 2.7213, + "step": 14844 + }, + { + "epoch": 1.3449298996625219, + "grad_norm": 0.9835205674171448, + "learning_rate": 0.00011034857729716668, + "loss": 2.5831, + "step": 14845 + }, + { + "epoch": 1.3450204978369686, + "grad_norm": 0.9131618738174438, + "learning_rate": 0.00011034253609617592, + "loss": 2.7494, + "step": 14846 + }, + { + "epoch": 1.3451110960114154, + "grad_norm": 0.9970893859863281, + "learning_rate": 0.00011033649489518516, + "loss": 2.4706, + "step": 14847 + }, + { + "epoch": 1.3452016941858622, + "grad_norm": 0.9335177540779114, + "learning_rate": 0.00011033045369419442, + "loss": 2.9578, + "step": 14848 + }, + { + "epoch": 1.345292292360309, + "grad_norm": 0.8638032078742981, + "learning_rate": 0.00011032441249320364, + "loss": 2.5789, + "step": 14849 + }, + { + "epoch": 1.3453828905347558, + "grad_norm": 0.94998699426651, + "learning_rate": 0.0001103183712922129, + "loss": 2.927, + "step": 14850 + }, + { + "epoch": 1.3454734887092026, + "grad_norm": 0.9014778733253479, + "learning_rate": 0.00011031233009122215, + "loss": 2.7247, + "step": 14851 + }, + { + "epoch": 1.3455640868836494, + "grad_norm": 0.9890420436859131, + "learning_rate": 0.00011030628889023138, + "loss": 2.6279, + "step": 14852 + }, + { + "epoch": 1.3456546850580962, + "grad_norm": 1.0155173540115356, + "learning_rate": 0.00011030024768924063, + "loss": 2.6487, + "step": 14853 + }, + { + "epoch": 1.345745283232543, + "grad_norm": 0.8665536046028137, + "learning_rate": 0.00011029420648824986, + "loss": 2.7949, + "step": 14854 + }, + { + "epoch": 1.3458358814069897, + "grad_norm": 0.9849683046340942, + "learning_rate": 0.00011028816528725911, + "loss": 2.7006, + "step": 14855 + }, + { + "epoch": 1.3459264795814363, + "grad_norm": 1.0065622329711914, + "learning_rate": 0.00011028212408626835, + "loss": 2.7634, + "step": 14856 + }, + { + "epoch": 1.3460170777558833, + "grad_norm": 0.8314952254295349, + "learning_rate": 0.00011027608288527759, + "loss": 2.6849, + "step": 14857 + }, + { + "epoch": 1.3461076759303299, + "grad_norm": 0.7560268640518188, + "learning_rate": 0.00011027004168428685, + "loss": 2.0759, + "step": 14858 + }, + { + "epoch": 1.3461982741047769, + "grad_norm": 0.8623712062835693, + "learning_rate": 0.00011026400048329609, + "loss": 2.7487, + "step": 14859 + }, + { + "epoch": 1.3462888722792234, + "grad_norm": 0.968999445438385, + "learning_rate": 0.00011025795928230534, + "loss": 2.4032, + "step": 14860 + }, + { + "epoch": 1.3463794704536705, + "grad_norm": 0.8708961606025696, + "learning_rate": 0.00011025191808131457, + "loss": 2.0101, + "step": 14861 + }, + { + "epoch": 1.346470068628117, + "grad_norm": 1.0145795345306396, + "learning_rate": 0.00011024587688032382, + "loss": 2.4733, + "step": 14862 + }, + { + "epoch": 1.346560666802564, + "grad_norm": 0.8684519529342651, + "learning_rate": 0.00011023983567933305, + "loss": 2.5841, + "step": 14863 + }, + { + "epoch": 1.3466512649770106, + "grad_norm": 0.9190366268157959, + "learning_rate": 0.0001102337944783423, + "loss": 2.734, + "step": 14864 + }, + { + "epoch": 1.3467418631514576, + "grad_norm": 0.9476665258407593, + "learning_rate": 0.00011022775327735155, + "loss": 3.0464, + "step": 14865 + }, + { + "epoch": 1.3468324613259042, + "grad_norm": 0.9485558867454529, + "learning_rate": 0.00011022171207636078, + "loss": 2.6708, + "step": 14866 + }, + { + "epoch": 1.3469230595003512, + "grad_norm": 1.0078599452972412, + "learning_rate": 0.00011021567087537004, + "loss": 2.7519, + "step": 14867 + }, + { + "epoch": 1.3470136576747977, + "grad_norm": 0.9519931077957153, + "learning_rate": 0.00011020962967437926, + "loss": 3.1901, + "step": 14868 + }, + { + "epoch": 1.3471042558492448, + "grad_norm": 0.9715824127197266, + "learning_rate": 0.00011020358847338852, + "loss": 2.7569, + "step": 14869 + }, + { + "epoch": 1.3471948540236913, + "grad_norm": 0.8715079426765442, + "learning_rate": 0.00011019754727239774, + "loss": 2.6047, + "step": 14870 + }, + { + "epoch": 1.3472854521981383, + "grad_norm": 0.9670085310935974, + "learning_rate": 0.000110191506071407, + "loss": 2.698, + "step": 14871 + }, + { + "epoch": 1.347376050372585, + "grad_norm": 0.9457783699035645, + "learning_rate": 0.00011018546487041625, + "loss": 2.7751, + "step": 14872 + }, + { + "epoch": 1.347466648547032, + "grad_norm": 1.0114490985870361, + "learning_rate": 0.00011017942366942549, + "loss": 2.5642, + "step": 14873 + }, + { + "epoch": 1.3475572467214785, + "grad_norm": 1.077850580215454, + "learning_rate": 0.00011017338246843473, + "loss": 2.8555, + "step": 14874 + }, + { + "epoch": 1.3476478448959255, + "grad_norm": 0.8745063543319702, + "learning_rate": 0.00011016734126744397, + "loss": 2.6506, + "step": 14875 + }, + { + "epoch": 1.347738443070372, + "grad_norm": 0.9308425188064575, + "learning_rate": 0.00011016130006645322, + "loss": 3.0974, + "step": 14876 + }, + { + "epoch": 1.3478290412448188, + "grad_norm": 0.8860751390457153, + "learning_rate": 0.00011015525886546245, + "loss": 2.7281, + "step": 14877 + }, + { + "epoch": 1.3479196394192656, + "grad_norm": 0.9051980376243591, + "learning_rate": 0.0001101492176644717, + "loss": 2.8502, + "step": 14878 + }, + { + "epoch": 1.3480102375937124, + "grad_norm": 0.911707878112793, + "learning_rate": 0.00011014317646348093, + "loss": 2.6456, + "step": 14879 + }, + { + "epoch": 1.3481008357681592, + "grad_norm": 0.8715819120407104, + "learning_rate": 0.00011013713526249019, + "loss": 2.5582, + "step": 14880 + }, + { + "epoch": 1.348191433942606, + "grad_norm": 0.8962529897689819, + "learning_rate": 0.00011013109406149944, + "loss": 2.7698, + "step": 14881 + }, + { + "epoch": 1.3482820321170528, + "grad_norm": 0.9007956981658936, + "learning_rate": 0.00011012505286050867, + "loss": 2.787, + "step": 14882 + }, + { + "epoch": 1.3483726302914996, + "grad_norm": 0.9050111770629883, + "learning_rate": 0.00011011901165951792, + "loss": 2.6923, + "step": 14883 + }, + { + "epoch": 1.3484632284659464, + "grad_norm": 0.8866615295410156, + "learning_rate": 0.00011011297045852716, + "loss": 2.6459, + "step": 14884 + }, + { + "epoch": 1.3485538266403931, + "grad_norm": 0.7946704030036926, + "learning_rate": 0.0001101069292575364, + "loss": 2.0379, + "step": 14885 + }, + { + "epoch": 1.34864442481484, + "grad_norm": 0.924056887626648, + "learning_rate": 0.00011010088805654564, + "loss": 2.9542, + "step": 14886 + }, + { + "epoch": 1.3487350229892867, + "grad_norm": 0.9230705499649048, + "learning_rate": 0.00011009484685555489, + "loss": 2.7664, + "step": 14887 + }, + { + "epoch": 1.3488256211637335, + "grad_norm": 0.890311062335968, + "learning_rate": 0.00011008880565456415, + "loss": 2.6095, + "step": 14888 + }, + { + "epoch": 1.3489162193381803, + "grad_norm": 0.91158527135849, + "learning_rate": 0.00011008276445357337, + "loss": 2.4797, + "step": 14889 + }, + { + "epoch": 1.349006817512627, + "grad_norm": 0.9158563613891602, + "learning_rate": 0.00011007672325258263, + "loss": 2.9333, + "step": 14890 + }, + { + "epoch": 1.3490974156870739, + "grad_norm": 0.8600070476531982, + "learning_rate": 0.00011007068205159186, + "loss": 2.8366, + "step": 14891 + }, + { + "epoch": 1.3491880138615207, + "grad_norm": 0.8913292288780212, + "learning_rate": 0.00011006464085060111, + "loss": 2.9663, + "step": 14892 + }, + { + "epoch": 1.3492786120359674, + "grad_norm": 0.9396470785140991, + "learning_rate": 0.00011005859964961034, + "loss": 2.9074, + "step": 14893 + }, + { + "epoch": 1.3493692102104142, + "grad_norm": 0.8760753273963928, + "learning_rate": 0.00011005255844861959, + "loss": 2.8494, + "step": 14894 + }, + { + "epoch": 1.349459808384861, + "grad_norm": 0.9538730978965759, + "learning_rate": 0.00011004651724762884, + "loss": 2.9575, + "step": 14895 + }, + { + "epoch": 1.3495504065593078, + "grad_norm": 0.768318772315979, + "learning_rate": 0.00011004047604663807, + "loss": 1.7905, + "step": 14896 + }, + { + "epoch": 1.3496410047337546, + "grad_norm": 0.8841218948364258, + "learning_rate": 0.00011003443484564732, + "loss": 2.5505, + "step": 14897 + }, + { + "epoch": 1.3497316029082014, + "grad_norm": 0.9122042655944824, + "learning_rate": 0.00011002839364465656, + "loss": 2.5797, + "step": 14898 + }, + { + "epoch": 1.3498222010826482, + "grad_norm": 0.9715846180915833, + "learning_rate": 0.00011002235244366582, + "loss": 2.8562, + "step": 14899 + }, + { + "epoch": 1.349912799257095, + "grad_norm": 0.8311958909034729, + "learning_rate": 0.00011001631124267504, + "loss": 2.151, + "step": 14900 + }, + { + "epoch": 1.3500033974315417, + "grad_norm": 0.9317797422409058, + "learning_rate": 0.0001100102700416843, + "loss": 2.8889, + "step": 14901 + }, + { + "epoch": 1.3500939956059885, + "grad_norm": 0.8447396755218506, + "learning_rate": 0.00011000422884069355, + "loss": 2.0198, + "step": 14902 + }, + { + "epoch": 1.3501845937804353, + "grad_norm": 0.7980365753173828, + "learning_rate": 0.00010999818763970278, + "loss": 1.8419, + "step": 14903 + }, + { + "epoch": 1.350275191954882, + "grad_norm": 0.8568747043609619, + "learning_rate": 0.00010999214643871203, + "loss": 2.5591, + "step": 14904 + }, + { + "epoch": 1.350365790129329, + "grad_norm": 0.9334638118743896, + "learning_rate": 0.00010998610523772126, + "loss": 3.0511, + "step": 14905 + }, + { + "epoch": 1.3504563883037757, + "grad_norm": 0.9876801371574402, + "learning_rate": 0.00010998006403673051, + "loss": 2.5938, + "step": 14906 + }, + { + "epoch": 1.3505469864782225, + "grad_norm": 0.9006835222244263, + "learning_rate": 0.00010997402283573974, + "loss": 2.9239, + "step": 14907 + }, + { + "epoch": 1.3506375846526693, + "grad_norm": 0.8892295956611633, + "learning_rate": 0.00010996798163474899, + "loss": 2.731, + "step": 14908 + }, + { + "epoch": 1.350728182827116, + "grad_norm": 0.943756103515625, + "learning_rate": 0.00010996194043375822, + "loss": 2.6877, + "step": 14909 + }, + { + "epoch": 1.3508187810015628, + "grad_norm": 0.9134541153907776, + "learning_rate": 0.00010995589923276747, + "loss": 2.7118, + "step": 14910 + }, + { + "epoch": 1.3509093791760096, + "grad_norm": 0.9180193543434143, + "learning_rate": 0.00010994985803177673, + "loss": 2.6205, + "step": 14911 + }, + { + "epoch": 1.3509999773504564, + "grad_norm": 0.94893479347229, + "learning_rate": 0.00010994381683078597, + "loss": 2.8092, + "step": 14912 + }, + { + "epoch": 1.3510905755249032, + "grad_norm": 0.9097800850868225, + "learning_rate": 0.00010993777562979521, + "loss": 2.6598, + "step": 14913 + }, + { + "epoch": 1.35118117369935, + "grad_norm": 0.9051206707954407, + "learning_rate": 0.00010993173442880445, + "loss": 2.8816, + "step": 14914 + }, + { + "epoch": 1.3512717718737968, + "grad_norm": 0.9262546300888062, + "learning_rate": 0.0001099256932278137, + "loss": 2.6575, + "step": 14915 + }, + { + "epoch": 1.3513623700482436, + "grad_norm": 0.8918857574462891, + "learning_rate": 0.00010991965202682293, + "loss": 2.7788, + "step": 14916 + }, + { + "epoch": 1.3514529682226903, + "grad_norm": 0.8985633850097656, + "learning_rate": 0.00010991361082583218, + "loss": 3.0336, + "step": 14917 + }, + { + "epoch": 1.3515435663971371, + "grad_norm": 0.934562623500824, + "learning_rate": 0.00010990756962484144, + "loss": 2.7295, + "step": 14918 + }, + { + "epoch": 1.351634164571584, + "grad_norm": 0.9072635173797607, + "learning_rate": 0.00010990152842385066, + "loss": 2.6031, + "step": 14919 + }, + { + "epoch": 1.3517247627460307, + "grad_norm": 0.958289623260498, + "learning_rate": 0.00010989548722285992, + "loss": 2.6429, + "step": 14920 + }, + { + "epoch": 1.3518153609204775, + "grad_norm": 0.9537333846092224, + "learning_rate": 0.00010988944602186914, + "loss": 2.8443, + "step": 14921 + }, + { + "epoch": 1.3519059590949243, + "grad_norm": 0.9013302326202393, + "learning_rate": 0.0001098834048208784, + "loss": 2.6838, + "step": 14922 + }, + { + "epoch": 1.351996557269371, + "grad_norm": 0.9356122612953186, + "learning_rate": 0.00010987736361988762, + "loss": 2.8213, + "step": 14923 + }, + { + "epoch": 1.3520871554438179, + "grad_norm": 0.8704968690872192, + "learning_rate": 0.00010987132241889688, + "loss": 2.0385, + "step": 14924 + }, + { + "epoch": 1.3521777536182646, + "grad_norm": 0.8618835210800171, + "learning_rate": 0.00010986528121790613, + "loss": 2.6401, + "step": 14925 + }, + { + "epoch": 1.3522683517927114, + "grad_norm": 0.8579511642456055, + "learning_rate": 0.00010985924001691537, + "loss": 2.187, + "step": 14926 + }, + { + "epoch": 1.3523589499671582, + "grad_norm": 0.9447112083435059, + "learning_rate": 0.00010985319881592461, + "loss": 2.6505, + "step": 14927 + }, + { + "epoch": 1.352449548141605, + "grad_norm": 0.8761979937553406, + "learning_rate": 0.00010984715761493385, + "loss": 2.7702, + "step": 14928 + }, + { + "epoch": 1.3525401463160518, + "grad_norm": 0.8942236304283142, + "learning_rate": 0.0001098411164139431, + "loss": 2.5205, + "step": 14929 + }, + { + "epoch": 1.3526307444904986, + "grad_norm": 1.0206246376037598, + "learning_rate": 0.00010983507521295233, + "loss": 2.541, + "step": 14930 + }, + { + "epoch": 1.3527213426649454, + "grad_norm": 0.8874747157096863, + "learning_rate": 0.00010982903401196159, + "loss": 2.7493, + "step": 14931 + }, + { + "epoch": 1.3528119408393922, + "grad_norm": 0.8558651208877563, + "learning_rate": 0.00010982299281097084, + "loss": 2.4665, + "step": 14932 + }, + { + "epoch": 1.352902539013839, + "grad_norm": 0.9029729962348938, + "learning_rate": 0.00010981695160998007, + "loss": 2.5758, + "step": 14933 + }, + { + "epoch": 1.3529931371882857, + "grad_norm": 0.8819206953048706, + "learning_rate": 0.00010981091040898932, + "loss": 2.8428, + "step": 14934 + }, + { + "epoch": 1.3530837353627325, + "grad_norm": 0.8850798010826111, + "learning_rate": 0.00010980486920799855, + "loss": 2.627, + "step": 14935 + }, + { + "epoch": 1.3531743335371793, + "grad_norm": 0.9001460671424866, + "learning_rate": 0.0001097988280070078, + "loss": 2.9907, + "step": 14936 + }, + { + "epoch": 1.3532649317116259, + "grad_norm": 0.9879273772239685, + "learning_rate": 0.00010979278680601704, + "loss": 2.7917, + "step": 14937 + }, + { + "epoch": 1.3533555298860729, + "grad_norm": 0.8337401151657104, + "learning_rate": 0.00010978674560502628, + "loss": 2.6454, + "step": 14938 + }, + { + "epoch": 1.3534461280605194, + "grad_norm": 0.8997114300727844, + "learning_rate": 0.00010978070440403552, + "loss": 2.7575, + "step": 14939 + }, + { + "epoch": 1.3535367262349665, + "grad_norm": 0.8998238444328308, + "learning_rate": 0.00010977466320304476, + "loss": 2.8281, + "step": 14940 + }, + { + "epoch": 1.353627324409413, + "grad_norm": 0.8411061763763428, + "learning_rate": 0.00010976862200205403, + "loss": 2.8371, + "step": 14941 + }, + { + "epoch": 1.35371792258386, + "grad_norm": 0.7671487331390381, + "learning_rate": 0.00010976258080106325, + "loss": 1.8831, + "step": 14942 + }, + { + "epoch": 1.3538085207583066, + "grad_norm": 0.8536024689674377, + "learning_rate": 0.00010975653960007251, + "loss": 2.8104, + "step": 14943 + }, + { + "epoch": 1.3538991189327536, + "grad_norm": 0.9017281532287598, + "learning_rate": 0.00010975049839908174, + "loss": 2.7181, + "step": 14944 + }, + { + "epoch": 1.3539897171072002, + "grad_norm": 0.8758330345153809, + "learning_rate": 0.00010974445719809099, + "loss": 2.5155, + "step": 14945 + }, + { + "epoch": 1.3540803152816472, + "grad_norm": 0.8256024718284607, + "learning_rate": 0.00010973841599710022, + "loss": 1.7867, + "step": 14946 + }, + { + "epoch": 1.3541709134560937, + "grad_norm": 0.8705412745475769, + "learning_rate": 0.00010973237479610947, + "loss": 2.6831, + "step": 14947 + }, + { + "epoch": 1.3542615116305408, + "grad_norm": 0.8826944828033447, + "learning_rate": 0.00010972633359511872, + "loss": 2.525, + "step": 14948 + }, + { + "epoch": 1.3543521098049873, + "grad_norm": 0.9110311269760132, + "learning_rate": 0.00010972029239412795, + "loss": 2.7133, + "step": 14949 + }, + { + "epoch": 1.3544427079794343, + "grad_norm": 0.9721638560295105, + "learning_rate": 0.00010971425119313721, + "loss": 2.7406, + "step": 14950 + }, + { + "epoch": 1.354533306153881, + "grad_norm": 0.9447487592697144, + "learning_rate": 0.00010970820999214643, + "loss": 2.7758, + "step": 14951 + }, + { + "epoch": 1.354623904328328, + "grad_norm": 0.8498446345329285, + "learning_rate": 0.0001097021687911557, + "loss": 2.8168, + "step": 14952 + }, + { + "epoch": 1.3547145025027745, + "grad_norm": 0.7598614692687988, + "learning_rate": 0.00010969612759016492, + "loss": 2.0649, + "step": 14953 + }, + { + "epoch": 1.3548051006772215, + "grad_norm": 0.9114550948143005, + "learning_rate": 0.00010969008638917418, + "loss": 2.3125, + "step": 14954 + }, + { + "epoch": 1.354895698851668, + "grad_norm": 0.9369724988937378, + "learning_rate": 0.00010968404518818342, + "loss": 2.8706, + "step": 14955 + }, + { + "epoch": 1.354986297026115, + "grad_norm": 0.9147566556930542, + "learning_rate": 0.00010967800398719266, + "loss": 2.7983, + "step": 14956 + }, + { + "epoch": 1.3550768952005616, + "grad_norm": 0.995192289352417, + "learning_rate": 0.0001096719627862019, + "loss": 3.0737, + "step": 14957 + }, + { + "epoch": 1.3551674933750084, + "grad_norm": 0.9013031721115112, + "learning_rate": 0.00010966592158521114, + "loss": 2.8006, + "step": 14958 + }, + { + "epoch": 1.3552580915494552, + "grad_norm": 0.8416640162467957, + "learning_rate": 0.00010965988038422039, + "loss": 2.5196, + "step": 14959 + }, + { + "epoch": 1.355348689723902, + "grad_norm": 0.8990280628204346, + "learning_rate": 0.00010965383918322962, + "loss": 2.8408, + "step": 14960 + }, + { + "epoch": 1.3554392878983488, + "grad_norm": 0.9433879852294922, + "learning_rate": 0.00010964779798223887, + "loss": 2.8945, + "step": 14961 + }, + { + "epoch": 1.3555298860727956, + "grad_norm": 0.9064143896102905, + "learning_rate": 0.00010964175678124813, + "loss": 2.9114, + "step": 14962 + }, + { + "epoch": 1.3556204842472424, + "grad_norm": 1.0541589260101318, + "learning_rate": 0.00010963571558025736, + "loss": 2.4761, + "step": 14963 + }, + { + "epoch": 1.3557110824216891, + "grad_norm": 0.8889071941375732, + "learning_rate": 0.00010962967437926661, + "loss": 3.0688, + "step": 14964 + }, + { + "epoch": 1.355801680596136, + "grad_norm": 0.9654818177223206, + "learning_rate": 0.00010962363317827585, + "loss": 2.6973, + "step": 14965 + }, + { + "epoch": 1.3558922787705827, + "grad_norm": 0.8745521903038025, + "learning_rate": 0.0001096175919772851, + "loss": 2.8752, + "step": 14966 + }, + { + "epoch": 1.3559828769450295, + "grad_norm": 0.9348964691162109, + "learning_rate": 0.00010961155077629433, + "loss": 2.7579, + "step": 14967 + }, + { + "epoch": 1.3560734751194763, + "grad_norm": 0.7320253252983093, + "learning_rate": 0.00010960550957530358, + "loss": 2.0117, + "step": 14968 + }, + { + "epoch": 1.356164073293923, + "grad_norm": 0.9185540676116943, + "learning_rate": 0.00010959946837431281, + "loss": 2.7555, + "step": 14969 + }, + { + "epoch": 1.3562546714683699, + "grad_norm": 0.924567461013794, + "learning_rate": 0.00010959342717332206, + "loss": 3.1605, + "step": 14970 + }, + { + "epoch": 1.3563452696428167, + "grad_norm": 0.8891798257827759, + "learning_rate": 0.00010958738597233132, + "loss": 2.2335, + "step": 14971 + }, + { + "epoch": 1.3564358678172634, + "grad_norm": 0.8557683825492859, + "learning_rate": 0.00010958134477134054, + "loss": 2.6677, + "step": 14972 + }, + { + "epoch": 1.3565264659917102, + "grad_norm": 0.929124653339386, + "learning_rate": 0.0001095753035703498, + "loss": 2.9273, + "step": 14973 + }, + { + "epoch": 1.356617064166157, + "grad_norm": 0.9078023433685303, + "learning_rate": 0.00010956926236935902, + "loss": 2.6135, + "step": 14974 + }, + { + "epoch": 1.3567076623406038, + "grad_norm": 0.9099353551864624, + "learning_rate": 0.00010956322116836828, + "loss": 2.6794, + "step": 14975 + }, + { + "epoch": 1.3567982605150506, + "grad_norm": 0.9074954390525818, + "learning_rate": 0.00010955717996737752, + "loss": 2.873, + "step": 14976 + }, + { + "epoch": 1.3568888586894974, + "grad_norm": 0.8922317028045654, + "learning_rate": 0.00010955113876638676, + "loss": 2.5869, + "step": 14977 + }, + { + "epoch": 1.3569794568639442, + "grad_norm": 0.8869142532348633, + "learning_rate": 0.00010954509756539601, + "loss": 2.8289, + "step": 14978 + }, + { + "epoch": 1.357070055038391, + "grad_norm": 0.8801629543304443, + "learning_rate": 0.00010953905636440525, + "loss": 2.9927, + "step": 14979 + }, + { + "epoch": 1.3571606532128377, + "grad_norm": 0.8625133633613586, + "learning_rate": 0.00010953301516341449, + "loss": 2.7505, + "step": 14980 + }, + { + "epoch": 1.3572512513872845, + "grad_norm": 0.9125632643699646, + "learning_rate": 0.00010952697396242373, + "loss": 2.7858, + "step": 14981 + }, + { + "epoch": 1.3573418495617313, + "grad_norm": 0.877788245677948, + "learning_rate": 0.00010952093276143299, + "loss": 2.7001, + "step": 14982 + }, + { + "epoch": 1.357432447736178, + "grad_norm": 0.7859412431716919, + "learning_rate": 0.00010951489156044221, + "loss": 2.0755, + "step": 14983 + }, + { + "epoch": 1.357523045910625, + "grad_norm": 0.8986729979515076, + "learning_rate": 0.00010950885035945147, + "loss": 2.519, + "step": 14984 + }, + { + "epoch": 1.3576136440850717, + "grad_norm": 0.9027456045150757, + "learning_rate": 0.00010950280915846072, + "loss": 2.7303, + "step": 14985 + }, + { + "epoch": 1.3577042422595185, + "grad_norm": 0.8672695159912109, + "learning_rate": 0.00010949676795746995, + "loss": 2.7304, + "step": 14986 + }, + { + "epoch": 1.3577948404339653, + "grad_norm": 0.8611310720443726, + "learning_rate": 0.0001094907267564792, + "loss": 2.7268, + "step": 14987 + }, + { + "epoch": 1.357885438608412, + "grad_norm": 0.943340003490448, + "learning_rate": 0.00010948468555548843, + "loss": 2.6128, + "step": 14988 + }, + { + "epoch": 1.3579760367828588, + "grad_norm": 0.9270703196525574, + "learning_rate": 0.00010947864435449768, + "loss": 2.7316, + "step": 14989 + }, + { + "epoch": 1.3580666349573056, + "grad_norm": 0.9286934733390808, + "learning_rate": 0.00010947260315350691, + "loss": 2.5934, + "step": 14990 + }, + { + "epoch": 1.3581572331317524, + "grad_norm": 0.918068528175354, + "learning_rate": 0.00010946656195251616, + "loss": 2.5644, + "step": 14991 + }, + { + "epoch": 1.3582478313061992, + "grad_norm": 0.8501341342926025, + "learning_rate": 0.00010946052075152542, + "loss": 2.5145, + "step": 14992 + }, + { + "epoch": 1.358338429480646, + "grad_norm": 0.9193550944328308, + "learning_rate": 0.00010945447955053464, + "loss": 2.751, + "step": 14993 + }, + { + "epoch": 1.3584290276550928, + "grad_norm": 0.93849116563797, + "learning_rate": 0.0001094484383495439, + "loss": 2.7232, + "step": 14994 + }, + { + "epoch": 1.3585196258295396, + "grad_norm": 0.8627676963806152, + "learning_rate": 0.00010944239714855314, + "loss": 2.6783, + "step": 14995 + }, + { + "epoch": 1.3586102240039863, + "grad_norm": 0.863307535648346, + "learning_rate": 0.00010943635594756239, + "loss": 2.3981, + "step": 14996 + }, + { + "epoch": 1.3587008221784331, + "grad_norm": 0.8786619305610657, + "learning_rate": 0.00010943031474657162, + "loss": 2.8542, + "step": 14997 + }, + { + "epoch": 1.35879142035288, + "grad_norm": 0.9689474105834961, + "learning_rate": 0.00010942427354558087, + "loss": 3.0082, + "step": 14998 + }, + { + "epoch": 1.3588820185273267, + "grad_norm": 0.8583463430404663, + "learning_rate": 0.0001094182323445901, + "loss": 2.7575, + "step": 14999 + }, + { + "epoch": 1.3589726167017735, + "grad_norm": 0.9492189288139343, + "learning_rate": 0.00010941219114359935, + "loss": 2.836, + "step": 15000 + }, + { + "epoch": 1.3590632148762203, + "grad_norm": 0.894128143787384, + "learning_rate": 0.00010940614994260861, + "loss": 2.757, + "step": 15001 + }, + { + "epoch": 1.359153813050667, + "grad_norm": 0.8551991581916809, + "learning_rate": 0.00010940010874161783, + "loss": 2.6071, + "step": 15002 + }, + { + "epoch": 1.3592444112251139, + "grad_norm": 0.8378260135650635, + "learning_rate": 0.00010939406754062709, + "loss": 2.5052, + "step": 15003 + }, + { + "epoch": 1.3593350093995606, + "grad_norm": 0.908362865447998, + "learning_rate": 0.00010938802633963631, + "loss": 2.6063, + "step": 15004 + }, + { + "epoch": 1.3594256075740074, + "grad_norm": 0.7704275250434875, + "learning_rate": 0.00010938198513864557, + "loss": 1.7979, + "step": 15005 + }, + { + "epoch": 1.3595162057484542, + "grad_norm": 0.9377520680427551, + "learning_rate": 0.0001093759439376548, + "loss": 2.6672, + "step": 15006 + }, + { + "epoch": 1.359606803922901, + "grad_norm": 0.9414226412773132, + "learning_rate": 0.00010936990273666406, + "loss": 2.7955, + "step": 15007 + }, + { + "epoch": 1.3596974020973478, + "grad_norm": 0.9201019406318665, + "learning_rate": 0.0001093638615356733, + "loss": 2.3927, + "step": 15008 + }, + { + "epoch": 1.3597880002717946, + "grad_norm": 0.8654845356941223, + "learning_rate": 0.00010935782033468254, + "loss": 2.9499, + "step": 15009 + }, + { + "epoch": 1.3598785984462414, + "grad_norm": 0.933740496635437, + "learning_rate": 0.00010935177913369179, + "loss": 2.4771, + "step": 15010 + }, + { + "epoch": 1.3599691966206882, + "grad_norm": 0.8974432945251465, + "learning_rate": 0.00010934573793270102, + "loss": 2.9154, + "step": 15011 + }, + { + "epoch": 1.360059794795135, + "grad_norm": 0.9529693126678467, + "learning_rate": 0.00010933969673171027, + "loss": 2.6756, + "step": 15012 + }, + { + "epoch": 1.3601503929695817, + "grad_norm": 0.9433736801147461, + "learning_rate": 0.0001093336555307195, + "loss": 2.7812, + "step": 15013 + }, + { + "epoch": 1.3602409911440285, + "grad_norm": 0.9568621516227722, + "learning_rate": 0.00010932761432972876, + "loss": 2.1638, + "step": 15014 + }, + { + "epoch": 1.3603315893184753, + "grad_norm": 0.8774458169937134, + "learning_rate": 0.00010932157312873801, + "loss": 2.7902, + "step": 15015 + }, + { + "epoch": 1.360422187492922, + "grad_norm": 0.9276570081710815, + "learning_rate": 0.00010931553192774724, + "loss": 2.9232, + "step": 15016 + }, + { + "epoch": 1.3605127856673689, + "grad_norm": 0.9613479375839233, + "learning_rate": 0.00010930949072675649, + "loss": 3.0803, + "step": 15017 + }, + { + "epoch": 1.3606033838418155, + "grad_norm": 0.8807725310325623, + "learning_rate": 0.00010930344952576573, + "loss": 2.7659, + "step": 15018 + }, + { + "epoch": 1.3606939820162625, + "grad_norm": 0.9285517334938049, + "learning_rate": 0.00010929740832477497, + "loss": 2.4952, + "step": 15019 + }, + { + "epoch": 1.360784580190709, + "grad_norm": 0.929538369178772, + "learning_rate": 0.00010929136712378421, + "loss": 2.6564, + "step": 15020 + }, + { + "epoch": 1.360875178365156, + "grad_norm": 0.9355729222297668, + "learning_rate": 0.00010928532592279345, + "loss": 2.9636, + "step": 15021 + }, + { + "epoch": 1.3609657765396026, + "grad_norm": 0.8994722366333008, + "learning_rate": 0.00010927928472180272, + "loss": 2.679, + "step": 15022 + }, + { + "epoch": 1.3610563747140496, + "grad_norm": 0.8653253316879272, + "learning_rate": 0.00010927324352081194, + "loss": 2.5673, + "step": 15023 + }, + { + "epoch": 1.3611469728884962, + "grad_norm": 0.8544042706489563, + "learning_rate": 0.0001092672023198212, + "loss": 2.7689, + "step": 15024 + }, + { + "epoch": 1.3612375710629432, + "grad_norm": 0.8373067378997803, + "learning_rate": 0.00010926116111883042, + "loss": 2.7039, + "step": 15025 + }, + { + "epoch": 1.3613281692373898, + "grad_norm": 0.9139289855957031, + "learning_rate": 0.00010925511991783968, + "loss": 2.7582, + "step": 15026 + }, + { + "epoch": 1.3614187674118368, + "grad_norm": 0.8827250003814697, + "learning_rate": 0.00010924907871684891, + "loss": 2.7527, + "step": 15027 + }, + { + "epoch": 1.3615093655862833, + "grad_norm": 0.9690288305282593, + "learning_rate": 0.00010924303751585816, + "loss": 2.9576, + "step": 15028 + }, + { + "epoch": 1.3615999637607303, + "grad_norm": 0.9419967532157898, + "learning_rate": 0.0001092369963148674, + "loss": 2.6345, + "step": 15029 + }, + { + "epoch": 1.361690561935177, + "grad_norm": 0.83641117811203, + "learning_rate": 0.00010923095511387664, + "loss": 2.5683, + "step": 15030 + }, + { + "epoch": 1.361781160109624, + "grad_norm": 0.9315350651741028, + "learning_rate": 0.00010922491391288589, + "loss": 2.8191, + "step": 15031 + }, + { + "epoch": 1.3618717582840705, + "grad_norm": 0.8487961888313293, + "learning_rate": 0.00010921887271189512, + "loss": 2.5863, + "step": 15032 + }, + { + "epoch": 1.3619623564585175, + "grad_norm": 0.9287465214729309, + "learning_rate": 0.00010921283151090437, + "loss": 2.5194, + "step": 15033 + }, + { + "epoch": 1.362052954632964, + "grad_norm": 0.8329832553863525, + "learning_rate": 0.0001092067903099136, + "loss": 2.7503, + "step": 15034 + }, + { + "epoch": 1.362143552807411, + "grad_norm": 0.9127006530761719, + "learning_rate": 0.00010920074910892287, + "loss": 2.816, + "step": 15035 + }, + { + "epoch": 1.3622341509818576, + "grad_norm": 0.9443028569221497, + "learning_rate": 0.00010919470790793209, + "loss": 2.7078, + "step": 15036 + }, + { + "epoch": 1.3623247491563046, + "grad_norm": 0.8633257150650024, + "learning_rate": 0.00010918866670694135, + "loss": 2.7485, + "step": 15037 + }, + { + "epoch": 1.3624153473307512, + "grad_norm": 0.9695553183555603, + "learning_rate": 0.0001091826255059506, + "loss": 2.6944, + "step": 15038 + }, + { + "epoch": 1.362505945505198, + "grad_norm": 0.8779095411300659, + "learning_rate": 0.00010917658430495983, + "loss": 2.975, + "step": 15039 + }, + { + "epoch": 1.3625965436796448, + "grad_norm": 0.9162921905517578, + "learning_rate": 0.00010917054310396908, + "loss": 2.5832, + "step": 15040 + }, + { + "epoch": 1.3626871418540916, + "grad_norm": 0.9257888197898865, + "learning_rate": 0.00010916450190297831, + "loss": 2.748, + "step": 15041 + }, + { + "epoch": 1.3627777400285384, + "grad_norm": 0.9489784836769104, + "learning_rate": 0.00010915846070198756, + "loss": 3.0906, + "step": 15042 + }, + { + "epoch": 1.3628683382029851, + "grad_norm": 0.9044173955917358, + "learning_rate": 0.0001091524195009968, + "loss": 2.6552, + "step": 15043 + }, + { + "epoch": 1.362958936377432, + "grad_norm": 0.9864049553871155, + "learning_rate": 0.00010914637830000604, + "loss": 2.4124, + "step": 15044 + }, + { + "epoch": 1.3630495345518787, + "grad_norm": 0.8666035532951355, + "learning_rate": 0.0001091403370990153, + "loss": 2.6877, + "step": 15045 + }, + { + "epoch": 1.3631401327263255, + "grad_norm": 0.7838872075080872, + "learning_rate": 0.00010913429589802454, + "loss": 2.1692, + "step": 15046 + }, + { + "epoch": 1.3632307309007723, + "grad_norm": 0.9577428102493286, + "learning_rate": 0.00010912825469703378, + "loss": 2.6679, + "step": 15047 + }, + { + "epoch": 1.363321329075219, + "grad_norm": 0.8775172829627991, + "learning_rate": 0.00010912221349604302, + "loss": 2.8126, + "step": 15048 + }, + { + "epoch": 1.3634119272496659, + "grad_norm": 1.011555552482605, + "learning_rate": 0.00010911617229505227, + "loss": 2.9318, + "step": 15049 + }, + { + "epoch": 1.3635025254241127, + "grad_norm": 0.8745965957641602, + "learning_rate": 0.0001091101310940615, + "loss": 2.5867, + "step": 15050 + }, + { + "epoch": 1.3635931235985594, + "grad_norm": 0.8469706177711487, + "learning_rate": 0.00010910408989307075, + "loss": 2.5848, + "step": 15051 + }, + { + "epoch": 1.3636837217730062, + "grad_norm": 0.8768621683120728, + "learning_rate": 0.00010909804869208, + "loss": 2.7775, + "step": 15052 + }, + { + "epoch": 1.363774319947453, + "grad_norm": 0.8556610345840454, + "learning_rate": 0.00010909200749108923, + "loss": 2.8576, + "step": 15053 + }, + { + "epoch": 1.3638649181218998, + "grad_norm": 0.9330260753631592, + "learning_rate": 0.00010908596629009849, + "loss": 2.6704, + "step": 15054 + }, + { + "epoch": 1.3639555162963466, + "grad_norm": 0.9594457149505615, + "learning_rate": 0.00010907992508910771, + "loss": 2.6399, + "step": 15055 + }, + { + "epoch": 1.3640461144707934, + "grad_norm": 0.9475923180580139, + "learning_rate": 0.00010907388388811697, + "loss": 2.6246, + "step": 15056 + }, + { + "epoch": 1.3641367126452402, + "grad_norm": 0.9193199276924133, + "learning_rate": 0.00010906784268712619, + "loss": 2.784, + "step": 15057 + }, + { + "epoch": 1.364227310819687, + "grad_norm": 0.8607155680656433, + "learning_rate": 0.00010906180148613545, + "loss": 2.7064, + "step": 15058 + }, + { + "epoch": 1.3643179089941337, + "grad_norm": 0.8201672434806824, + "learning_rate": 0.00010905576028514469, + "loss": 2.0374, + "step": 15059 + }, + { + "epoch": 1.3644085071685805, + "grad_norm": 0.8563007116317749, + "learning_rate": 0.00010904971908415394, + "loss": 2.826, + "step": 15060 + }, + { + "epoch": 1.3644991053430273, + "grad_norm": 0.9427745342254639, + "learning_rate": 0.00010904367788316318, + "loss": 2.9173, + "step": 15061 + }, + { + "epoch": 1.364589703517474, + "grad_norm": 0.9392231106758118, + "learning_rate": 0.00010903763668217242, + "loss": 3.0898, + "step": 15062 + }, + { + "epoch": 1.364680301691921, + "grad_norm": 0.9549875259399414, + "learning_rate": 0.00010903159548118166, + "loss": 2.7228, + "step": 15063 + }, + { + "epoch": 1.3647708998663677, + "grad_norm": 0.878953218460083, + "learning_rate": 0.0001090255542801909, + "loss": 2.6627, + "step": 15064 + }, + { + "epoch": 1.3648614980408145, + "grad_norm": 0.9176989793777466, + "learning_rate": 0.00010901951307920015, + "loss": 2.7614, + "step": 15065 + }, + { + "epoch": 1.3649520962152613, + "grad_norm": 0.9079804420471191, + "learning_rate": 0.00010901347187820938, + "loss": 2.7435, + "step": 15066 + }, + { + "epoch": 1.365042694389708, + "grad_norm": 0.9593878984451294, + "learning_rate": 0.00010900743067721864, + "loss": 2.6565, + "step": 15067 + }, + { + "epoch": 1.3651332925641548, + "grad_norm": 0.8527534604072571, + "learning_rate": 0.00010900138947622789, + "loss": 2.493, + "step": 15068 + }, + { + "epoch": 1.3652238907386016, + "grad_norm": 0.8499851226806641, + "learning_rate": 0.00010899534827523712, + "loss": 2.5373, + "step": 15069 + }, + { + "epoch": 1.3653144889130484, + "grad_norm": 0.9214285016059875, + "learning_rate": 0.00010898930707424637, + "loss": 2.5926, + "step": 15070 + }, + { + "epoch": 1.3654050870874952, + "grad_norm": 0.8799808025360107, + "learning_rate": 0.0001089832658732556, + "loss": 2.6907, + "step": 15071 + }, + { + "epoch": 1.365495685261942, + "grad_norm": 0.8070913553237915, + "learning_rate": 0.00010897722467226485, + "loss": 2.1639, + "step": 15072 + }, + { + "epoch": 1.3655862834363888, + "grad_norm": 0.9315145611763, + "learning_rate": 0.00010897118347127409, + "loss": 2.5848, + "step": 15073 + }, + { + "epoch": 1.3656768816108356, + "grad_norm": 0.8776925802230835, + "learning_rate": 0.00010896514227028333, + "loss": 2.7003, + "step": 15074 + }, + { + "epoch": 1.3657674797852823, + "grad_norm": 0.9398594498634338, + "learning_rate": 0.0001089591010692926, + "loss": 2.6853, + "step": 15075 + }, + { + "epoch": 1.3658580779597291, + "grad_norm": 0.9057838916778564, + "learning_rate": 0.00010895305986830182, + "loss": 2.7728, + "step": 15076 + }, + { + "epoch": 1.365948676134176, + "grad_norm": 0.7829293608665466, + "learning_rate": 0.00010894701866731108, + "loss": 2.0248, + "step": 15077 + }, + { + "epoch": 1.3660392743086227, + "grad_norm": 0.9042673707008362, + "learning_rate": 0.00010894097746632031, + "loss": 2.5051, + "step": 15078 + }, + { + "epoch": 1.3661298724830695, + "grad_norm": 0.8199784755706787, + "learning_rate": 0.00010893493626532956, + "loss": 2.0157, + "step": 15079 + }, + { + "epoch": 1.3662204706575163, + "grad_norm": 0.9959353804588318, + "learning_rate": 0.00010892889506433879, + "loss": 2.7899, + "step": 15080 + }, + { + "epoch": 1.366311068831963, + "grad_norm": 1.0412555932998657, + "learning_rate": 0.00010892285386334804, + "loss": 2.6458, + "step": 15081 + }, + { + "epoch": 1.3664016670064099, + "grad_norm": 0.8706786036491394, + "learning_rate": 0.00010891681266235729, + "loss": 2.5582, + "step": 15082 + }, + { + "epoch": 1.3664922651808566, + "grad_norm": 0.9607241153717041, + "learning_rate": 0.00010891077146136652, + "loss": 2.5737, + "step": 15083 + }, + { + "epoch": 1.3665828633553034, + "grad_norm": 0.9420785903930664, + "learning_rate": 0.00010890473026037577, + "loss": 2.7318, + "step": 15084 + }, + { + "epoch": 1.3666734615297502, + "grad_norm": 0.9170916080474854, + "learning_rate": 0.000108898689059385, + "loss": 2.639, + "step": 15085 + }, + { + "epoch": 1.366764059704197, + "grad_norm": 0.8847251534461975, + "learning_rate": 0.00010889264785839426, + "loss": 2.5729, + "step": 15086 + }, + { + "epoch": 1.3668546578786438, + "grad_norm": 0.8664564490318298, + "learning_rate": 0.00010888660665740349, + "loss": 2.5028, + "step": 15087 + }, + { + "epoch": 1.3669452560530906, + "grad_norm": 0.9828088283538818, + "learning_rate": 0.00010888056545641275, + "loss": 2.7946, + "step": 15088 + }, + { + "epoch": 1.3670358542275374, + "grad_norm": 0.9534833431243896, + "learning_rate": 0.00010887452425542197, + "loss": 2.6738, + "step": 15089 + }, + { + "epoch": 1.3671264524019842, + "grad_norm": 0.9565704464912415, + "learning_rate": 0.00010886848305443123, + "loss": 2.5606, + "step": 15090 + }, + { + "epoch": 1.367217050576431, + "grad_norm": 0.888576090335846, + "learning_rate": 0.00010886244185344048, + "loss": 2.7404, + "step": 15091 + }, + { + "epoch": 1.3673076487508777, + "grad_norm": 0.9280145764350891, + "learning_rate": 0.00010885640065244971, + "loss": 2.5837, + "step": 15092 + }, + { + "epoch": 1.3673982469253245, + "grad_norm": 0.9706459641456604, + "learning_rate": 0.00010885035945145896, + "loss": 2.5014, + "step": 15093 + }, + { + "epoch": 1.3674888450997713, + "grad_norm": 0.8110269904136658, + "learning_rate": 0.00010884431825046819, + "loss": 2.376, + "step": 15094 + }, + { + "epoch": 1.367579443274218, + "grad_norm": 0.9492589831352234, + "learning_rate": 0.00010883827704947744, + "loss": 2.9675, + "step": 15095 + }, + { + "epoch": 1.3676700414486649, + "grad_norm": 0.9815694689750671, + "learning_rate": 0.00010883223584848667, + "loss": 2.9585, + "step": 15096 + }, + { + "epoch": 1.3677606396231117, + "grad_norm": 1.0094577074050903, + "learning_rate": 0.00010882619464749592, + "loss": 2.762, + "step": 15097 + }, + { + "epoch": 1.3678512377975585, + "grad_norm": 0.8052096962928772, + "learning_rate": 0.00010882015344650518, + "loss": 2.7459, + "step": 15098 + }, + { + "epoch": 1.367941835972005, + "grad_norm": 0.9046322107315063, + "learning_rate": 0.00010881411224551442, + "loss": 2.863, + "step": 15099 + }, + { + "epoch": 1.368032434146452, + "grad_norm": 0.7726908922195435, + "learning_rate": 0.00010880807104452366, + "loss": 2.0851, + "step": 15100 + }, + { + "epoch": 1.3681230323208986, + "grad_norm": 0.894065260887146, + "learning_rate": 0.0001088020298435329, + "loss": 2.7104, + "step": 15101 + }, + { + "epoch": 1.3682136304953456, + "grad_norm": 0.9571437239646912, + "learning_rate": 0.00010879598864254215, + "loss": 3.1335, + "step": 15102 + }, + { + "epoch": 1.3683042286697922, + "grad_norm": 0.9041791558265686, + "learning_rate": 0.00010878994744155138, + "loss": 2.9168, + "step": 15103 + }, + { + "epoch": 1.3683948268442392, + "grad_norm": 0.8844923973083496, + "learning_rate": 0.00010878390624056063, + "loss": 2.6753, + "step": 15104 + }, + { + "epoch": 1.3684854250186858, + "grad_norm": 0.8833733797073364, + "learning_rate": 0.00010877786503956989, + "loss": 2.5725, + "step": 15105 + }, + { + "epoch": 1.3685760231931328, + "grad_norm": 0.9448814392089844, + "learning_rate": 0.00010877182383857911, + "loss": 2.689, + "step": 15106 + }, + { + "epoch": 1.3686666213675793, + "grad_norm": 0.9023659229278564, + "learning_rate": 0.00010876578263758837, + "loss": 2.6416, + "step": 15107 + }, + { + "epoch": 1.3687572195420263, + "grad_norm": 0.8973368406295776, + "learning_rate": 0.00010875974143659759, + "loss": 2.627, + "step": 15108 + }, + { + "epoch": 1.368847817716473, + "grad_norm": 0.879818856716156, + "learning_rate": 0.00010875370023560685, + "loss": 2.7016, + "step": 15109 + }, + { + "epoch": 1.36893841589092, + "grad_norm": 0.8776816129684448, + "learning_rate": 0.00010874765903461607, + "loss": 2.5981, + "step": 15110 + }, + { + "epoch": 1.3690290140653665, + "grad_norm": 0.8472450375556946, + "learning_rate": 0.00010874161783362533, + "loss": 2.7517, + "step": 15111 + }, + { + "epoch": 1.3691196122398135, + "grad_norm": 0.8894428610801697, + "learning_rate": 0.00010873557663263458, + "loss": 2.6936, + "step": 15112 + }, + { + "epoch": 1.36921021041426, + "grad_norm": 0.7953531742095947, + "learning_rate": 0.00010872953543164381, + "loss": 2.041, + "step": 15113 + }, + { + "epoch": 1.369300808588707, + "grad_norm": 0.9117192029953003, + "learning_rate": 0.00010872349423065306, + "loss": 2.3262, + "step": 15114 + }, + { + "epoch": 1.3693914067631536, + "grad_norm": 0.9259435534477234, + "learning_rate": 0.0001087174530296623, + "loss": 2.4374, + "step": 15115 + }, + { + "epoch": 1.3694820049376006, + "grad_norm": 0.9300802946090698, + "learning_rate": 0.00010871141182867154, + "loss": 2.8555, + "step": 15116 + }, + { + "epoch": 1.3695726031120472, + "grad_norm": 0.912590742111206, + "learning_rate": 0.00010870537062768078, + "loss": 2.6174, + "step": 15117 + }, + { + "epoch": 1.3696632012864942, + "grad_norm": 0.9926945567131042, + "learning_rate": 0.00010869932942669004, + "loss": 2.6351, + "step": 15118 + }, + { + "epoch": 1.3697537994609408, + "grad_norm": 0.8503032326698303, + "learning_rate": 0.00010869328822569926, + "loss": 2.6782, + "step": 15119 + }, + { + "epoch": 1.3698443976353876, + "grad_norm": 0.889021635055542, + "learning_rate": 0.00010868724702470852, + "loss": 2.6075, + "step": 15120 + }, + { + "epoch": 1.3699349958098344, + "grad_norm": 0.7389222383499146, + "learning_rate": 0.00010868120582371777, + "loss": 2.0809, + "step": 15121 + }, + { + "epoch": 1.3700255939842811, + "grad_norm": 0.7947004437446594, + "learning_rate": 0.000108675164622727, + "loss": 2.0472, + "step": 15122 + }, + { + "epoch": 1.370116192158728, + "grad_norm": 0.896501898765564, + "learning_rate": 0.00010866912342173625, + "loss": 2.8454, + "step": 15123 + }, + { + "epoch": 1.3702067903331747, + "grad_norm": 0.8881502151489258, + "learning_rate": 0.00010866308222074548, + "loss": 2.6087, + "step": 15124 + }, + { + "epoch": 1.3702973885076215, + "grad_norm": 0.9302576780319214, + "learning_rate": 0.00010865704101975473, + "loss": 2.8386, + "step": 15125 + }, + { + "epoch": 1.3703879866820683, + "grad_norm": 0.8644417524337769, + "learning_rate": 0.00010865099981876397, + "loss": 2.7266, + "step": 15126 + }, + { + "epoch": 1.370478584856515, + "grad_norm": 0.7822524905204773, + "learning_rate": 0.00010864495861777321, + "loss": 1.8883, + "step": 15127 + }, + { + "epoch": 1.3705691830309619, + "grad_norm": 0.9976415038108826, + "learning_rate": 0.00010863891741678247, + "loss": 2.7892, + "step": 15128 + }, + { + "epoch": 1.3706597812054087, + "grad_norm": 0.9314711689949036, + "learning_rate": 0.0001086328762157917, + "loss": 2.2789, + "step": 15129 + }, + { + "epoch": 1.3707503793798554, + "grad_norm": 0.858344554901123, + "learning_rate": 0.00010862683501480096, + "loss": 2.6912, + "step": 15130 + }, + { + "epoch": 1.3708409775543022, + "grad_norm": 0.8291793465614319, + "learning_rate": 0.00010862079381381019, + "loss": 2.2102, + "step": 15131 + }, + { + "epoch": 1.370931575728749, + "grad_norm": 0.8590041399002075, + "learning_rate": 0.00010861475261281944, + "loss": 2.8317, + "step": 15132 + }, + { + "epoch": 1.3710221739031958, + "grad_norm": 0.9483547210693359, + "learning_rate": 0.00010860871141182867, + "loss": 2.7868, + "step": 15133 + }, + { + "epoch": 1.3711127720776426, + "grad_norm": 0.8919011950492859, + "learning_rate": 0.00010860267021083792, + "loss": 2.9023, + "step": 15134 + }, + { + "epoch": 1.3712033702520894, + "grad_norm": 0.9123280644416809, + "learning_rate": 0.00010859662900984717, + "loss": 2.9245, + "step": 15135 + }, + { + "epoch": 1.3712939684265362, + "grad_norm": 0.9276072382926941, + "learning_rate": 0.0001085905878088564, + "loss": 2.7255, + "step": 15136 + }, + { + "epoch": 1.371384566600983, + "grad_norm": 0.90598064661026, + "learning_rate": 0.00010858454660786566, + "loss": 2.7313, + "step": 15137 + }, + { + "epoch": 1.3714751647754297, + "grad_norm": 0.9054915308952332, + "learning_rate": 0.00010857850540687488, + "loss": 2.5636, + "step": 15138 + }, + { + "epoch": 1.3715657629498765, + "grad_norm": 0.9943755269050598, + "learning_rate": 0.00010857246420588414, + "loss": 2.5144, + "step": 15139 + }, + { + "epoch": 1.3716563611243233, + "grad_norm": 0.9938766956329346, + "learning_rate": 0.00010856642300489336, + "loss": 2.8493, + "step": 15140 + }, + { + "epoch": 1.37174695929877, + "grad_norm": 0.8734217882156372, + "learning_rate": 0.00010856038180390263, + "loss": 2.7503, + "step": 15141 + }, + { + "epoch": 1.371837557473217, + "grad_norm": 0.9682058095932007, + "learning_rate": 0.00010855434060291187, + "loss": 2.8971, + "step": 15142 + }, + { + "epoch": 1.3719281556476637, + "grad_norm": 0.8944506645202637, + "learning_rate": 0.00010854829940192111, + "loss": 3.1454, + "step": 15143 + }, + { + "epoch": 1.3720187538221105, + "grad_norm": 0.8976675868034363, + "learning_rate": 0.00010854225820093035, + "loss": 2.4672, + "step": 15144 + }, + { + "epoch": 1.3721093519965573, + "grad_norm": 0.9027776122093201, + "learning_rate": 0.00010853621699993959, + "loss": 2.7789, + "step": 15145 + }, + { + "epoch": 1.372199950171004, + "grad_norm": 0.9295104146003723, + "learning_rate": 0.00010853017579894884, + "loss": 2.8297, + "step": 15146 + }, + { + "epoch": 1.3722905483454508, + "grad_norm": 0.9083614945411682, + "learning_rate": 0.00010852413459795807, + "loss": 2.8748, + "step": 15147 + }, + { + "epoch": 1.3723811465198976, + "grad_norm": 0.8514741659164429, + "learning_rate": 0.00010851809339696732, + "loss": 2.7467, + "step": 15148 + }, + { + "epoch": 1.3724717446943444, + "grad_norm": 0.9301573634147644, + "learning_rate": 0.00010851205219597655, + "loss": 2.7409, + "step": 15149 + }, + { + "epoch": 1.3725623428687912, + "grad_norm": 0.9805289506912231, + "learning_rate": 0.00010850601099498581, + "loss": 2.6497, + "step": 15150 + }, + { + "epoch": 1.372652941043238, + "grad_norm": 0.8886104822158813, + "learning_rate": 0.00010849996979399506, + "loss": 2.9687, + "step": 15151 + }, + { + "epoch": 1.3727435392176848, + "grad_norm": 0.8795484304428101, + "learning_rate": 0.0001084939285930043, + "loss": 2.698, + "step": 15152 + }, + { + "epoch": 1.3728341373921316, + "grad_norm": 0.9157743453979492, + "learning_rate": 0.00010848788739201354, + "loss": 2.8087, + "step": 15153 + }, + { + "epoch": 1.3729247355665783, + "grad_norm": 0.9445757269859314, + "learning_rate": 0.00010848184619102278, + "loss": 2.8075, + "step": 15154 + }, + { + "epoch": 1.3730153337410251, + "grad_norm": 0.8621684908866882, + "learning_rate": 0.00010847580499003202, + "loss": 2.6696, + "step": 15155 + }, + { + "epoch": 1.373105931915472, + "grad_norm": 0.9084709286689758, + "learning_rate": 0.00010846976378904126, + "loss": 2.7259, + "step": 15156 + }, + { + "epoch": 1.3731965300899187, + "grad_norm": 0.9183632135391235, + "learning_rate": 0.0001084637225880505, + "loss": 2.7777, + "step": 15157 + }, + { + "epoch": 1.3732871282643655, + "grad_norm": 0.9081554412841797, + "learning_rate": 0.00010845768138705977, + "loss": 2.7982, + "step": 15158 + }, + { + "epoch": 1.3733777264388123, + "grad_norm": 1.019303560256958, + "learning_rate": 0.00010845164018606899, + "loss": 2.8612, + "step": 15159 + }, + { + "epoch": 1.373468324613259, + "grad_norm": 0.7595248818397522, + "learning_rate": 0.00010844559898507825, + "loss": 1.8514, + "step": 15160 + }, + { + "epoch": 1.3735589227877059, + "grad_norm": 0.9590757489204407, + "learning_rate": 0.00010843955778408747, + "loss": 2.5972, + "step": 15161 + }, + { + "epoch": 1.3736495209621526, + "grad_norm": 0.8904309868812561, + "learning_rate": 0.00010843351658309673, + "loss": 2.7587, + "step": 15162 + }, + { + "epoch": 1.3737401191365994, + "grad_norm": 0.8541825413703918, + "learning_rate": 0.00010842747538210596, + "loss": 2.7828, + "step": 15163 + }, + { + "epoch": 1.3738307173110462, + "grad_norm": 0.9239258766174316, + "learning_rate": 0.00010842143418111521, + "loss": 2.771, + "step": 15164 + }, + { + "epoch": 1.373921315485493, + "grad_norm": 0.9997032284736633, + "learning_rate": 0.00010841539298012446, + "loss": 2.6037, + "step": 15165 + }, + { + "epoch": 1.3740119136599398, + "grad_norm": 0.9321373701095581, + "learning_rate": 0.0001084093517791337, + "loss": 2.5405, + "step": 15166 + }, + { + "epoch": 1.3741025118343866, + "grad_norm": 0.8558636903762817, + "learning_rate": 0.00010840331057814294, + "loss": 2.6961, + "step": 15167 + }, + { + "epoch": 1.3741931100088334, + "grad_norm": 0.9737372398376465, + "learning_rate": 0.00010839726937715218, + "loss": 2.7606, + "step": 15168 + }, + { + "epoch": 1.3742837081832802, + "grad_norm": 0.9879695773124695, + "learning_rate": 0.00010839122817616144, + "loss": 2.7307, + "step": 15169 + }, + { + "epoch": 1.374374306357727, + "grad_norm": 0.7811247110366821, + "learning_rate": 0.00010838518697517066, + "loss": 2.4118, + "step": 15170 + }, + { + "epoch": 1.3744649045321737, + "grad_norm": 0.8860350251197815, + "learning_rate": 0.00010837914577417992, + "loss": 2.7834, + "step": 15171 + }, + { + "epoch": 1.3745555027066205, + "grad_norm": 0.9161937832832336, + "learning_rate": 0.00010837310457318917, + "loss": 2.9206, + "step": 15172 + }, + { + "epoch": 1.3746461008810673, + "grad_norm": 0.9021293520927429, + "learning_rate": 0.0001083670633721984, + "loss": 2.8149, + "step": 15173 + }, + { + "epoch": 1.374736699055514, + "grad_norm": 0.951237678527832, + "learning_rate": 0.00010836102217120765, + "loss": 2.84, + "step": 15174 + }, + { + "epoch": 1.3748272972299609, + "grad_norm": 1.6872175931930542, + "learning_rate": 0.00010835498097021688, + "loss": 1.9505, + "step": 15175 + }, + { + "epoch": 1.3749178954044077, + "grad_norm": 0.8272773027420044, + "learning_rate": 0.00010834893976922613, + "loss": 2.7258, + "step": 15176 + }, + { + "epoch": 1.3750084935788545, + "grad_norm": 0.8386743068695068, + "learning_rate": 0.00010834289856823536, + "loss": 2.5864, + "step": 15177 + }, + { + "epoch": 1.3750990917533013, + "grad_norm": 0.8420237302780151, + "learning_rate": 0.00010833685736724461, + "loss": 2.6718, + "step": 15178 + }, + { + "epoch": 1.375189689927748, + "grad_norm": 0.9079393744468689, + "learning_rate": 0.00010833081616625384, + "loss": 2.521, + "step": 15179 + }, + { + "epoch": 1.3752802881021946, + "grad_norm": 0.8762547373771667, + "learning_rate": 0.00010832477496526309, + "loss": 2.538, + "step": 15180 + }, + { + "epoch": 1.3753708862766416, + "grad_norm": 0.885865330696106, + "learning_rate": 0.00010831873376427235, + "loss": 2.5452, + "step": 15181 + }, + { + "epoch": 1.3754614844510882, + "grad_norm": 0.914352536201477, + "learning_rate": 0.00010831269256328159, + "loss": 2.9917, + "step": 15182 + }, + { + "epoch": 1.3755520826255352, + "grad_norm": 0.8020502924919128, + "learning_rate": 0.00010830665136229084, + "loss": 1.9497, + "step": 15183 + }, + { + "epoch": 1.3756426807999818, + "grad_norm": 0.8802899718284607, + "learning_rate": 0.00010830061016130007, + "loss": 2.7307, + "step": 15184 + }, + { + "epoch": 1.3757332789744288, + "grad_norm": 0.8480355143547058, + "learning_rate": 0.00010829456896030932, + "loss": 2.1553, + "step": 15185 + }, + { + "epoch": 1.3758238771488753, + "grad_norm": 0.9560646414756775, + "learning_rate": 0.00010828852775931855, + "loss": 2.8303, + "step": 15186 + }, + { + "epoch": 1.3759144753233223, + "grad_norm": 0.9865615367889404, + "learning_rate": 0.0001082824865583278, + "loss": 2.7994, + "step": 15187 + }, + { + "epoch": 1.376005073497769, + "grad_norm": 0.9840458631515503, + "learning_rate": 0.00010827644535733706, + "loss": 2.829, + "step": 15188 + }, + { + "epoch": 1.376095671672216, + "grad_norm": 0.9216883778572083, + "learning_rate": 0.00010827040415634628, + "loss": 2.8622, + "step": 15189 + }, + { + "epoch": 1.3761862698466625, + "grad_norm": 0.6397256255149841, + "learning_rate": 0.00010826436295535554, + "loss": 1.4786, + "step": 15190 + }, + { + "epoch": 1.3762768680211095, + "grad_norm": 0.9209800958633423, + "learning_rate": 0.00010825832175436476, + "loss": 2.777, + "step": 15191 + }, + { + "epoch": 1.376367466195556, + "grad_norm": 0.845638632774353, + "learning_rate": 0.00010825228055337402, + "loss": 2.0063, + "step": 15192 + }, + { + "epoch": 1.376458064370003, + "grad_norm": 0.7393652200698853, + "learning_rate": 0.00010824623935238324, + "loss": 1.8393, + "step": 15193 + }, + { + "epoch": 1.3765486625444496, + "grad_norm": 0.9377306699752808, + "learning_rate": 0.0001082401981513925, + "loss": 2.9053, + "step": 15194 + }, + { + "epoch": 1.3766392607188966, + "grad_norm": 0.8974104523658752, + "learning_rate": 0.00010823415695040175, + "loss": 2.6841, + "step": 15195 + }, + { + "epoch": 1.3767298588933432, + "grad_norm": 0.978073000907898, + "learning_rate": 0.00010822811574941099, + "loss": 2.6657, + "step": 15196 + }, + { + "epoch": 1.3768204570677902, + "grad_norm": 0.8273757100105286, + "learning_rate": 0.00010822207454842023, + "loss": 2.3546, + "step": 15197 + }, + { + "epoch": 1.3769110552422368, + "grad_norm": 0.86529940366745, + "learning_rate": 0.00010821603334742947, + "loss": 2.869, + "step": 15198 + }, + { + "epoch": 1.3770016534166838, + "grad_norm": 0.8469657301902771, + "learning_rate": 0.00010820999214643872, + "loss": 2.3938, + "step": 15199 + }, + { + "epoch": 1.3770922515911304, + "grad_norm": 0.9306221008300781, + "learning_rate": 0.00010820395094544795, + "loss": 2.6809, + "step": 15200 + }, + { + "epoch": 1.3771828497655771, + "grad_norm": 0.8448715806007385, + "learning_rate": 0.00010819790974445721, + "loss": 1.9934, + "step": 15201 + }, + { + "epoch": 1.377273447940024, + "grad_norm": 0.7378783822059631, + "learning_rate": 0.00010819186854346646, + "loss": 2.0673, + "step": 15202 + }, + { + "epoch": 1.3773640461144707, + "grad_norm": 1.002251148223877, + "learning_rate": 0.00010818582734247569, + "loss": 2.6248, + "step": 15203 + }, + { + "epoch": 1.3774546442889175, + "grad_norm": 0.8802443146705627, + "learning_rate": 0.00010817978614148494, + "loss": 2.6568, + "step": 15204 + }, + { + "epoch": 1.3775452424633643, + "grad_norm": 0.8904653787612915, + "learning_rate": 0.00010817374494049417, + "loss": 2.4369, + "step": 15205 + }, + { + "epoch": 1.377635840637811, + "grad_norm": 0.8899369239807129, + "learning_rate": 0.00010816770373950342, + "loss": 2.858, + "step": 15206 + }, + { + "epoch": 1.3777264388122579, + "grad_norm": 0.9297996759414673, + "learning_rate": 0.00010816166253851266, + "loss": 2.6662, + "step": 15207 + }, + { + "epoch": 1.3778170369867047, + "grad_norm": 0.9008249044418335, + "learning_rate": 0.0001081556213375219, + "loss": 2.6468, + "step": 15208 + }, + { + "epoch": 1.3779076351611514, + "grad_norm": 0.9228049516677856, + "learning_rate": 0.00010814958013653114, + "loss": 2.7616, + "step": 15209 + }, + { + "epoch": 1.3779982333355982, + "grad_norm": 0.8585262894630432, + "learning_rate": 0.00010814353893554039, + "loss": 2.5035, + "step": 15210 + }, + { + "epoch": 1.378088831510045, + "grad_norm": 0.8997795581817627, + "learning_rate": 0.00010813749773454965, + "loss": 2.8832, + "step": 15211 + }, + { + "epoch": 1.3781794296844918, + "grad_norm": 0.9473674297332764, + "learning_rate": 0.00010813145653355887, + "loss": 2.6358, + "step": 15212 + }, + { + "epoch": 1.3782700278589386, + "grad_norm": 0.8972582817077637, + "learning_rate": 0.00010812541533256813, + "loss": 2.5417, + "step": 15213 + }, + { + "epoch": 1.3783606260333854, + "grad_norm": 0.890230655670166, + "learning_rate": 0.00010811937413157736, + "loss": 2.8005, + "step": 15214 + }, + { + "epoch": 1.3784512242078322, + "grad_norm": 0.9775114059448242, + "learning_rate": 0.00010811333293058661, + "loss": 3.0232, + "step": 15215 + }, + { + "epoch": 1.378541822382279, + "grad_norm": 0.9017194509506226, + "learning_rate": 0.00010810729172959584, + "loss": 2.8039, + "step": 15216 + }, + { + "epoch": 1.3786324205567257, + "grad_norm": 0.99144047498703, + "learning_rate": 0.00010810125052860509, + "loss": 2.7204, + "step": 15217 + }, + { + "epoch": 1.3787230187311725, + "grad_norm": 0.9183458089828491, + "learning_rate": 0.00010809520932761434, + "loss": 2.5026, + "step": 15218 + }, + { + "epoch": 1.3788136169056193, + "grad_norm": 0.771719753742218, + "learning_rate": 0.00010808916812662357, + "loss": 2.0666, + "step": 15219 + }, + { + "epoch": 1.378904215080066, + "grad_norm": 0.8438399434089661, + "learning_rate": 0.00010808312692563282, + "loss": 2.6545, + "step": 15220 + }, + { + "epoch": 1.378994813254513, + "grad_norm": 0.9352352023124695, + "learning_rate": 0.00010807708572464205, + "loss": 2.8583, + "step": 15221 + }, + { + "epoch": 1.3790854114289597, + "grad_norm": 0.936704695224762, + "learning_rate": 0.00010807104452365132, + "loss": 2.7543, + "step": 15222 + }, + { + "epoch": 1.3791760096034065, + "grad_norm": 0.9036718606948853, + "learning_rate": 0.00010806500332266054, + "loss": 2.7864, + "step": 15223 + }, + { + "epoch": 1.3792666077778533, + "grad_norm": 0.9497159123420715, + "learning_rate": 0.0001080589621216698, + "loss": 2.8026, + "step": 15224 + }, + { + "epoch": 1.3793572059523, + "grad_norm": 0.9040213227272034, + "learning_rate": 0.00010805292092067904, + "loss": 2.5921, + "step": 15225 + }, + { + "epoch": 1.3794478041267468, + "grad_norm": 0.8345484733581543, + "learning_rate": 0.00010804687971968828, + "loss": 2.4044, + "step": 15226 + }, + { + "epoch": 1.3795384023011936, + "grad_norm": 0.9805287718772888, + "learning_rate": 0.00010804083851869753, + "loss": 2.7644, + "step": 15227 + }, + { + "epoch": 1.3796290004756404, + "grad_norm": 0.9082584977149963, + "learning_rate": 0.00010803479731770676, + "loss": 2.787, + "step": 15228 + }, + { + "epoch": 1.3797195986500872, + "grad_norm": 0.8761799335479736, + "learning_rate": 0.00010802875611671601, + "loss": 2.5055, + "step": 15229 + }, + { + "epoch": 1.379810196824534, + "grad_norm": 0.9391998648643494, + "learning_rate": 0.00010802271491572524, + "loss": 2.669, + "step": 15230 + }, + { + "epoch": 1.3799007949989808, + "grad_norm": 0.9495140314102173, + "learning_rate": 0.00010801667371473449, + "loss": 2.9657, + "step": 15231 + }, + { + "epoch": 1.3799913931734276, + "grad_norm": 0.9240545630455017, + "learning_rate": 0.00010801063251374375, + "loss": 2.6276, + "step": 15232 + }, + { + "epoch": 1.3800819913478743, + "grad_norm": 0.9687759876251221, + "learning_rate": 0.00010800459131275299, + "loss": 2.6372, + "step": 15233 + }, + { + "epoch": 1.3801725895223211, + "grad_norm": 0.9067445397377014, + "learning_rate": 0.00010799855011176223, + "loss": 2.7228, + "step": 15234 + }, + { + "epoch": 1.380263187696768, + "grad_norm": 0.8907060027122498, + "learning_rate": 0.00010799250891077147, + "loss": 2.7396, + "step": 15235 + }, + { + "epoch": 1.3803537858712147, + "grad_norm": 0.9532051682472229, + "learning_rate": 0.00010798646770978071, + "loss": 2.5618, + "step": 15236 + }, + { + "epoch": 1.3804443840456615, + "grad_norm": 0.801182210445404, + "learning_rate": 0.00010798042650878995, + "loss": 2.1204, + "step": 15237 + }, + { + "epoch": 1.3805349822201083, + "grad_norm": 0.9735051989555359, + "learning_rate": 0.0001079743853077992, + "loss": 2.6484, + "step": 15238 + }, + { + "epoch": 1.380625580394555, + "grad_norm": 0.9783148169517517, + "learning_rate": 0.00010796834410680843, + "loss": 2.6249, + "step": 15239 + }, + { + "epoch": 1.3807161785690019, + "grad_norm": 0.9168943762779236, + "learning_rate": 0.00010796230290581768, + "loss": 2.798, + "step": 15240 + }, + { + "epoch": 1.3808067767434486, + "grad_norm": 0.8940675854682922, + "learning_rate": 0.00010795626170482694, + "loss": 2.7132, + "step": 15241 + }, + { + "epoch": 1.3808973749178954, + "grad_norm": 0.9158835411071777, + "learning_rate": 0.00010795022050383616, + "loss": 2.7405, + "step": 15242 + }, + { + "epoch": 1.3809879730923422, + "grad_norm": 0.8999227285385132, + "learning_rate": 0.00010794417930284542, + "loss": 2.5697, + "step": 15243 + }, + { + "epoch": 1.381078571266789, + "grad_norm": 0.8435917496681213, + "learning_rate": 0.00010793813810185464, + "loss": 2.6186, + "step": 15244 + }, + { + "epoch": 1.3811691694412358, + "grad_norm": 0.7803501486778259, + "learning_rate": 0.0001079320969008639, + "loss": 2.0585, + "step": 15245 + }, + { + "epoch": 1.3812597676156826, + "grad_norm": 0.8978751301765442, + "learning_rate": 0.00010792605569987314, + "loss": 2.7388, + "step": 15246 + }, + { + "epoch": 1.3813503657901294, + "grad_norm": 0.9088658094406128, + "learning_rate": 0.00010792001449888238, + "loss": 2.7453, + "step": 15247 + }, + { + "epoch": 1.3814409639645762, + "grad_norm": 0.7751898765563965, + "learning_rate": 0.00010791397329789163, + "loss": 1.9585, + "step": 15248 + }, + { + "epoch": 1.381531562139023, + "grad_norm": 0.7722139954566956, + "learning_rate": 0.00010790793209690087, + "loss": 1.9468, + "step": 15249 + }, + { + "epoch": 1.3816221603134697, + "grad_norm": 0.8653639554977417, + "learning_rate": 0.00010790189089591011, + "loss": 2.5226, + "step": 15250 + }, + { + "epoch": 1.3817127584879165, + "grad_norm": 0.8333589434623718, + "learning_rate": 0.00010789584969491935, + "loss": 2.5942, + "step": 15251 + }, + { + "epoch": 1.3818033566623633, + "grad_norm": 0.7261785864830017, + "learning_rate": 0.0001078898084939286, + "loss": 2.0426, + "step": 15252 + }, + { + "epoch": 1.38189395483681, + "grad_norm": 0.9980748891830444, + "learning_rate": 0.00010788376729293783, + "loss": 2.8945, + "step": 15253 + }, + { + "epoch": 1.3819845530112569, + "grad_norm": 0.8600407838821411, + "learning_rate": 0.00010787772609194709, + "loss": 2.5809, + "step": 15254 + }, + { + "epoch": 1.3820751511857037, + "grad_norm": 0.929593563079834, + "learning_rate": 0.00010787168489095634, + "loss": 2.7266, + "step": 15255 + }, + { + "epoch": 1.3821657493601505, + "grad_norm": 0.6202995181083679, + "learning_rate": 0.00010786564368996557, + "loss": 1.4683, + "step": 15256 + }, + { + "epoch": 1.3822563475345973, + "grad_norm": 0.9249987006187439, + "learning_rate": 0.00010785960248897482, + "loss": 2.723, + "step": 15257 + }, + { + "epoch": 1.382346945709044, + "grad_norm": 0.8360349535942078, + "learning_rate": 0.00010785356128798405, + "loss": 2.4889, + "step": 15258 + }, + { + "epoch": 1.3824375438834908, + "grad_norm": 0.8406080007553101, + "learning_rate": 0.0001078475200869933, + "loss": 2.0169, + "step": 15259 + }, + { + "epoch": 1.3825281420579376, + "grad_norm": 0.8838976621627808, + "learning_rate": 0.00010784147888600253, + "loss": 2.6971, + "step": 15260 + }, + { + "epoch": 1.3826187402323842, + "grad_norm": 0.8730194568634033, + "learning_rate": 0.00010783543768501178, + "loss": 2.3419, + "step": 15261 + }, + { + "epoch": 1.3827093384068312, + "grad_norm": 0.9175186157226562, + "learning_rate": 0.00010782939648402104, + "loss": 2.5849, + "step": 15262 + }, + { + "epoch": 1.3827999365812778, + "grad_norm": 0.8670556545257568, + "learning_rate": 0.00010782335528303026, + "loss": 2.8906, + "step": 15263 + }, + { + "epoch": 1.3828905347557248, + "grad_norm": 0.8544732332229614, + "learning_rate": 0.00010781731408203953, + "loss": 2.426, + "step": 15264 + }, + { + "epoch": 1.3829811329301713, + "grad_norm": 0.8882031440734863, + "learning_rate": 0.00010781127288104876, + "loss": 2.6125, + "step": 15265 + }, + { + "epoch": 1.3830717311046183, + "grad_norm": 0.9579074382781982, + "learning_rate": 0.00010780523168005801, + "loss": 2.7053, + "step": 15266 + }, + { + "epoch": 1.383162329279065, + "grad_norm": 0.8758944272994995, + "learning_rate": 0.00010779919047906724, + "loss": 2.5905, + "step": 15267 + }, + { + "epoch": 1.383252927453512, + "grad_norm": 0.8865820169448853, + "learning_rate": 0.00010779314927807649, + "loss": 2.7674, + "step": 15268 + }, + { + "epoch": 1.3833435256279585, + "grad_norm": 0.8990516662597656, + "learning_rate": 0.00010778710807708572, + "loss": 2.6874, + "step": 15269 + }, + { + "epoch": 1.3834341238024055, + "grad_norm": 0.8588494658470154, + "learning_rate": 0.00010778106687609497, + "loss": 2.5571, + "step": 15270 + }, + { + "epoch": 1.383524721976852, + "grad_norm": 0.7448217868804932, + "learning_rate": 0.00010777502567510422, + "loss": 1.9293, + "step": 15271 + }, + { + "epoch": 1.383615320151299, + "grad_norm": 0.9518078565597534, + "learning_rate": 0.00010776898447411345, + "loss": 2.5705, + "step": 15272 + }, + { + "epoch": 1.3837059183257456, + "grad_norm": 0.9037646651268005, + "learning_rate": 0.00010776294327312271, + "loss": 2.7851, + "step": 15273 + }, + { + "epoch": 1.3837965165001926, + "grad_norm": 0.8178298473358154, + "learning_rate": 0.00010775690207213193, + "loss": 2.5114, + "step": 15274 + }, + { + "epoch": 1.3838871146746392, + "grad_norm": 0.8631923794746399, + "learning_rate": 0.0001077508608711412, + "loss": 2.4629, + "step": 15275 + }, + { + "epoch": 1.3839777128490862, + "grad_norm": 0.8871225714683533, + "learning_rate": 0.00010774481967015042, + "loss": 2.5243, + "step": 15276 + }, + { + "epoch": 1.3840683110235328, + "grad_norm": 0.9350371956825256, + "learning_rate": 0.00010773877846915968, + "loss": 2.8957, + "step": 15277 + }, + { + "epoch": 1.3841589091979798, + "grad_norm": 0.9537808299064636, + "learning_rate": 0.00010773273726816892, + "loss": 2.5701, + "step": 15278 + }, + { + "epoch": 1.3842495073724264, + "grad_norm": 0.8574159741401672, + "learning_rate": 0.00010772669606717816, + "loss": 2.6953, + "step": 15279 + }, + { + "epoch": 1.3843401055468734, + "grad_norm": 0.7580463290214539, + "learning_rate": 0.0001077206548661874, + "loss": 1.954, + "step": 15280 + }, + { + "epoch": 1.38443070372132, + "grad_norm": 0.9315984845161438, + "learning_rate": 0.00010771461366519664, + "loss": 2.6739, + "step": 15281 + }, + { + "epoch": 1.3845213018957667, + "grad_norm": 0.7868449687957764, + "learning_rate": 0.00010770857246420589, + "loss": 2.1695, + "step": 15282 + }, + { + "epoch": 1.3846119000702135, + "grad_norm": 0.872338056564331, + "learning_rate": 0.00010770253126321512, + "loss": 2.7238, + "step": 15283 + }, + { + "epoch": 1.3847024982446603, + "grad_norm": 0.8090105056762695, + "learning_rate": 0.00010769649006222437, + "loss": 2.4291, + "step": 15284 + }, + { + "epoch": 1.384793096419107, + "grad_norm": 0.9060272574424744, + "learning_rate": 0.00010769044886123363, + "loss": 2.7684, + "step": 15285 + }, + { + "epoch": 1.3848836945935539, + "grad_norm": 0.9027689099311829, + "learning_rate": 0.00010768440766024286, + "loss": 2.8355, + "step": 15286 + }, + { + "epoch": 1.3849742927680007, + "grad_norm": 0.9629434943199158, + "learning_rate": 0.00010767836645925211, + "loss": 2.9399, + "step": 15287 + }, + { + "epoch": 1.3850648909424474, + "grad_norm": 0.967058002948761, + "learning_rate": 0.00010767232525826135, + "loss": 2.9918, + "step": 15288 + }, + { + "epoch": 1.3851554891168942, + "grad_norm": 0.8834347128868103, + "learning_rate": 0.0001076662840572706, + "loss": 2.7435, + "step": 15289 + }, + { + "epoch": 1.385246087291341, + "grad_norm": 0.90744948387146, + "learning_rate": 0.00010766024285627983, + "loss": 2.7577, + "step": 15290 + }, + { + "epoch": 1.3853366854657878, + "grad_norm": 0.7508055567741394, + "learning_rate": 0.00010765420165528908, + "loss": 1.9927, + "step": 15291 + }, + { + "epoch": 1.3854272836402346, + "grad_norm": 1.004172921180725, + "learning_rate": 0.00010764816045429834, + "loss": 2.5203, + "step": 15292 + }, + { + "epoch": 1.3855178818146814, + "grad_norm": 0.9253383278846741, + "learning_rate": 0.00010764211925330756, + "loss": 2.7829, + "step": 15293 + }, + { + "epoch": 1.3856084799891282, + "grad_norm": 0.8948156833648682, + "learning_rate": 0.00010763607805231682, + "loss": 2.824, + "step": 15294 + }, + { + "epoch": 1.385699078163575, + "grad_norm": 0.7536840438842773, + "learning_rate": 0.00010763003685132604, + "loss": 1.9417, + "step": 15295 + }, + { + "epoch": 1.3857896763380217, + "grad_norm": 0.9291996955871582, + "learning_rate": 0.0001076239956503353, + "loss": 2.7984, + "step": 15296 + }, + { + "epoch": 1.3858802745124685, + "grad_norm": 0.9620806574821472, + "learning_rate": 0.00010761795444934452, + "loss": 2.6626, + "step": 15297 + }, + { + "epoch": 1.3859708726869153, + "grad_norm": 0.885456383228302, + "learning_rate": 0.00010761191324835378, + "loss": 2.7291, + "step": 15298 + }, + { + "epoch": 1.386061470861362, + "grad_norm": 0.8817821741104126, + "learning_rate": 0.00010760587204736302, + "loss": 2.8188, + "step": 15299 + }, + { + "epoch": 1.386152069035809, + "grad_norm": 0.9112196564674377, + "learning_rate": 0.00010759983084637226, + "loss": 2.6674, + "step": 15300 + }, + { + "epoch": 1.3862426672102557, + "grad_norm": 0.8837308287620544, + "learning_rate": 0.00010759378964538151, + "loss": 2.8187, + "step": 15301 + }, + { + "epoch": 1.3863332653847025, + "grad_norm": 0.9565207958221436, + "learning_rate": 0.00010758774844439074, + "loss": 2.5878, + "step": 15302 + }, + { + "epoch": 1.3864238635591493, + "grad_norm": 0.9084428548812866, + "learning_rate": 0.00010758170724339999, + "loss": 2.6326, + "step": 15303 + }, + { + "epoch": 1.386514461733596, + "grad_norm": 0.9669269919395447, + "learning_rate": 0.00010757566604240923, + "loss": 2.7078, + "step": 15304 + }, + { + "epoch": 1.3866050599080428, + "grad_norm": 0.8606119155883789, + "learning_rate": 0.00010756962484141849, + "loss": 2.5853, + "step": 15305 + }, + { + "epoch": 1.3866956580824896, + "grad_norm": 0.863376796245575, + "learning_rate": 0.00010756358364042771, + "loss": 2.7247, + "step": 15306 + }, + { + "epoch": 1.3867862562569364, + "grad_norm": 0.9415199756622314, + "learning_rate": 0.00010755754243943697, + "loss": 2.7248, + "step": 15307 + }, + { + "epoch": 1.3868768544313832, + "grad_norm": 0.8580802083015442, + "learning_rate": 0.00010755150123844622, + "loss": 2.8387, + "step": 15308 + }, + { + "epoch": 1.38696745260583, + "grad_norm": 0.9698554873466492, + "learning_rate": 0.00010754546003745545, + "loss": 2.8748, + "step": 15309 + }, + { + "epoch": 1.3870580507802768, + "grad_norm": 0.9092365503311157, + "learning_rate": 0.0001075394188364647, + "loss": 2.8024, + "step": 15310 + }, + { + "epoch": 1.3871486489547236, + "grad_norm": 0.8745278120040894, + "learning_rate": 0.00010753337763547393, + "loss": 2.5448, + "step": 15311 + }, + { + "epoch": 1.3872392471291703, + "grad_norm": 0.919029712677002, + "learning_rate": 0.00010752733643448318, + "loss": 2.8012, + "step": 15312 + }, + { + "epoch": 1.3873298453036171, + "grad_norm": 0.9272984266281128, + "learning_rate": 0.00010752129523349241, + "loss": 2.7203, + "step": 15313 + }, + { + "epoch": 1.387420443478064, + "grad_norm": 0.891955554485321, + "learning_rate": 0.00010751525403250166, + "loss": 2.6692, + "step": 15314 + }, + { + "epoch": 1.3875110416525107, + "grad_norm": 0.874492347240448, + "learning_rate": 0.00010750921283151092, + "loss": 2.8612, + "step": 15315 + }, + { + "epoch": 1.3876016398269575, + "grad_norm": 0.9337881207466125, + "learning_rate": 0.00010750317163052014, + "loss": 2.7516, + "step": 15316 + }, + { + "epoch": 1.3876922380014043, + "grad_norm": 0.9768016934394836, + "learning_rate": 0.0001074971304295294, + "loss": 3.0192, + "step": 15317 + }, + { + "epoch": 1.387782836175851, + "grad_norm": 0.8971214890480042, + "learning_rate": 0.00010749108922853864, + "loss": 2.6307, + "step": 15318 + }, + { + "epoch": 1.3878734343502979, + "grad_norm": 0.9381541013717651, + "learning_rate": 0.00010748504802754789, + "loss": 2.6288, + "step": 15319 + }, + { + "epoch": 1.3879640325247447, + "grad_norm": 0.9579330086708069, + "learning_rate": 0.00010747900682655712, + "loss": 2.4826, + "step": 15320 + }, + { + "epoch": 1.3880546306991914, + "grad_norm": 0.9424172043800354, + "learning_rate": 0.00010747296562556637, + "loss": 2.829, + "step": 15321 + }, + { + "epoch": 1.3881452288736382, + "grad_norm": 0.7794191837310791, + "learning_rate": 0.00010746692442457562, + "loss": 2.0221, + "step": 15322 + }, + { + "epoch": 1.388235827048085, + "grad_norm": 0.7773945331573486, + "learning_rate": 0.00010746088322358485, + "loss": 2.3993, + "step": 15323 + }, + { + "epoch": 1.3883264252225318, + "grad_norm": 0.8959839344024658, + "learning_rate": 0.00010745484202259411, + "loss": 2.7004, + "step": 15324 + }, + { + "epoch": 1.3884170233969786, + "grad_norm": 0.8668240904808044, + "learning_rate": 0.00010744880082160333, + "loss": 2.7167, + "step": 15325 + }, + { + "epoch": 1.3885076215714254, + "grad_norm": 0.8443088531494141, + "learning_rate": 0.00010744275962061259, + "loss": 2.9782, + "step": 15326 + }, + { + "epoch": 1.3885982197458722, + "grad_norm": 0.8849349617958069, + "learning_rate": 0.00010743671841962181, + "loss": 2.5146, + "step": 15327 + }, + { + "epoch": 1.388688817920319, + "grad_norm": 0.8220199942588806, + "learning_rate": 0.00010743067721863107, + "loss": 2.5694, + "step": 15328 + }, + { + "epoch": 1.3887794160947657, + "grad_norm": 0.9699872732162476, + "learning_rate": 0.0001074246360176403, + "loss": 2.5441, + "step": 15329 + }, + { + "epoch": 1.3888700142692125, + "grad_norm": 0.8171850442886353, + "learning_rate": 0.00010741859481664956, + "loss": 2.1197, + "step": 15330 + }, + { + "epoch": 1.3889606124436593, + "grad_norm": 0.8313572406768799, + "learning_rate": 0.0001074125536156588, + "loss": 2.6676, + "step": 15331 + }, + { + "epoch": 1.389051210618106, + "grad_norm": 0.9903745651245117, + "learning_rate": 0.00010740651241466804, + "loss": 2.5354, + "step": 15332 + }, + { + "epoch": 1.389141808792553, + "grad_norm": 0.9256560802459717, + "learning_rate": 0.00010740047121367728, + "loss": 2.9195, + "step": 15333 + }, + { + "epoch": 1.3892324069669997, + "grad_norm": 0.898756742477417, + "learning_rate": 0.00010739443001268652, + "loss": 2.6827, + "step": 15334 + }, + { + "epoch": 1.3893230051414465, + "grad_norm": 0.8368139863014221, + "learning_rate": 0.00010738838881169577, + "loss": 2.5947, + "step": 15335 + }, + { + "epoch": 1.3894136033158933, + "grad_norm": 0.9086229801177979, + "learning_rate": 0.000107382347610705, + "loss": 2.6044, + "step": 15336 + }, + { + "epoch": 1.38950420149034, + "grad_norm": 0.732909083366394, + "learning_rate": 0.00010737630640971426, + "loss": 1.9112, + "step": 15337 + }, + { + "epoch": 1.3895947996647868, + "grad_norm": 1.0356653928756714, + "learning_rate": 0.00010737026520872351, + "loss": 2.6978, + "step": 15338 + }, + { + "epoch": 1.3896853978392336, + "grad_norm": 0.9243597984313965, + "learning_rate": 0.00010736422400773274, + "loss": 2.6977, + "step": 15339 + }, + { + "epoch": 1.3897759960136804, + "grad_norm": 0.8896283507347107, + "learning_rate": 0.00010735818280674199, + "loss": 2.6553, + "step": 15340 + }, + { + "epoch": 1.3898665941881272, + "grad_norm": 0.9104408621788025, + "learning_rate": 0.00010735214160575123, + "loss": 2.8611, + "step": 15341 + }, + { + "epoch": 1.3899571923625738, + "grad_norm": 0.9072704911231995, + "learning_rate": 0.00010734610040476047, + "loss": 2.6465, + "step": 15342 + }, + { + "epoch": 1.3900477905370208, + "grad_norm": 0.9566786289215088, + "learning_rate": 0.0001073400592037697, + "loss": 2.8073, + "step": 15343 + }, + { + "epoch": 1.3901383887114673, + "grad_norm": 0.8150953054428101, + "learning_rate": 0.00010733401800277895, + "loss": 2.2339, + "step": 15344 + }, + { + "epoch": 1.3902289868859143, + "grad_norm": 0.8593475818634033, + "learning_rate": 0.00010732797680178822, + "loss": 2.6845, + "step": 15345 + }, + { + "epoch": 1.390319585060361, + "grad_norm": 0.8881624341011047, + "learning_rate": 0.00010732193560079744, + "loss": 2.5211, + "step": 15346 + }, + { + "epoch": 1.390410183234808, + "grad_norm": 0.909048318862915, + "learning_rate": 0.0001073158943998067, + "loss": 2.8344, + "step": 15347 + }, + { + "epoch": 1.3905007814092545, + "grad_norm": 0.9876799583435059, + "learning_rate": 0.00010730985319881592, + "loss": 2.7896, + "step": 15348 + }, + { + "epoch": 1.3905913795837015, + "grad_norm": 0.9145303964614868, + "learning_rate": 0.00010730381199782518, + "loss": 2.7192, + "step": 15349 + }, + { + "epoch": 1.390681977758148, + "grad_norm": 0.8849799633026123, + "learning_rate": 0.00010729777079683441, + "loss": 2.8544, + "step": 15350 + }, + { + "epoch": 1.390772575932595, + "grad_norm": 0.7786979079246521, + "learning_rate": 0.00010729172959584366, + "loss": 2.2331, + "step": 15351 + }, + { + "epoch": 1.3908631741070416, + "grad_norm": 0.8802885413169861, + "learning_rate": 0.00010728568839485291, + "loss": 2.763, + "step": 15352 + }, + { + "epoch": 1.3909537722814886, + "grad_norm": 0.8627022504806519, + "learning_rate": 0.00010727964719386214, + "loss": 2.5171, + "step": 15353 + }, + { + "epoch": 1.3910443704559352, + "grad_norm": 0.9253868460655212, + "learning_rate": 0.00010727360599287139, + "loss": 2.791, + "step": 15354 + }, + { + "epoch": 1.3911349686303822, + "grad_norm": 0.8888195157051086, + "learning_rate": 0.00010726756479188062, + "loss": 2.7551, + "step": 15355 + }, + { + "epoch": 1.3912255668048288, + "grad_norm": 0.8206668496131897, + "learning_rate": 0.00010726152359088988, + "loss": 2.0691, + "step": 15356 + }, + { + "epoch": 1.3913161649792758, + "grad_norm": 0.9614198207855225, + "learning_rate": 0.0001072554823898991, + "loss": 2.4722, + "step": 15357 + }, + { + "epoch": 1.3914067631537224, + "grad_norm": 0.9561471939086914, + "learning_rate": 0.00010724944118890837, + "loss": 2.7092, + "step": 15358 + }, + { + "epoch": 1.3914973613281694, + "grad_norm": 0.9572356939315796, + "learning_rate": 0.00010724339998791759, + "loss": 2.7601, + "step": 15359 + }, + { + "epoch": 1.391587959502616, + "grad_norm": 0.9284488558769226, + "learning_rate": 0.00010723735878692685, + "loss": 2.7645, + "step": 15360 + }, + { + "epoch": 1.391678557677063, + "grad_norm": 0.9272369146347046, + "learning_rate": 0.0001072313175859361, + "loss": 2.7023, + "step": 15361 + }, + { + "epoch": 1.3917691558515095, + "grad_norm": 0.9151242971420288, + "learning_rate": 0.00010722527638494533, + "loss": 2.8593, + "step": 15362 + }, + { + "epoch": 1.3918597540259563, + "grad_norm": 0.9276977777481079, + "learning_rate": 0.00010721923518395458, + "loss": 2.7646, + "step": 15363 + }, + { + "epoch": 1.391950352200403, + "grad_norm": 0.8842701315879822, + "learning_rate": 0.00010721319398296381, + "loss": 2.6288, + "step": 15364 + }, + { + "epoch": 1.3920409503748499, + "grad_norm": 0.9392748475074768, + "learning_rate": 0.00010720715278197306, + "loss": 2.8812, + "step": 15365 + }, + { + "epoch": 1.3921315485492967, + "grad_norm": 0.8664273023605347, + "learning_rate": 0.0001072011115809823, + "loss": 2.7501, + "step": 15366 + }, + { + "epoch": 1.3922221467237434, + "grad_norm": 0.9294573068618774, + "learning_rate": 0.00010719507037999154, + "loss": 2.916, + "step": 15367 + }, + { + "epoch": 1.3923127448981902, + "grad_norm": 0.9853792190551758, + "learning_rate": 0.0001071890291790008, + "loss": 2.5814, + "step": 15368 + }, + { + "epoch": 1.392403343072637, + "grad_norm": 0.9895381927490234, + "learning_rate": 0.00010718298797801004, + "loss": 2.7043, + "step": 15369 + }, + { + "epoch": 1.3924939412470838, + "grad_norm": 0.8927691578865051, + "learning_rate": 0.00010717694677701928, + "loss": 2.7675, + "step": 15370 + }, + { + "epoch": 1.3925845394215306, + "grad_norm": 0.8110284209251404, + "learning_rate": 0.00010717090557602852, + "loss": 2.0138, + "step": 15371 + }, + { + "epoch": 1.3926751375959774, + "grad_norm": 0.8910278677940369, + "learning_rate": 0.00010716486437503777, + "loss": 2.7029, + "step": 15372 + }, + { + "epoch": 1.3927657357704242, + "grad_norm": 0.8816971778869629, + "learning_rate": 0.000107158823174047, + "loss": 2.5971, + "step": 15373 + }, + { + "epoch": 1.392856333944871, + "grad_norm": 0.8984795212745667, + "learning_rate": 0.00010715278197305625, + "loss": 2.189, + "step": 15374 + }, + { + "epoch": 1.3929469321193177, + "grad_norm": 0.8835976123809814, + "learning_rate": 0.00010714674077206551, + "loss": 2.6519, + "step": 15375 + }, + { + "epoch": 1.3930375302937645, + "grad_norm": 1.0192865133285522, + "learning_rate": 0.00010714069957107473, + "loss": 2.6863, + "step": 15376 + }, + { + "epoch": 1.3931281284682113, + "grad_norm": 1.0315839052200317, + "learning_rate": 0.00010713465837008399, + "loss": 2.6486, + "step": 15377 + }, + { + "epoch": 1.3932187266426581, + "grad_norm": 0.9248895645141602, + "learning_rate": 0.00010712861716909321, + "loss": 2.5977, + "step": 15378 + }, + { + "epoch": 1.393309324817105, + "grad_norm": 0.8922947645187378, + "learning_rate": 0.00010712257596810247, + "loss": 2.6961, + "step": 15379 + }, + { + "epoch": 1.3933999229915517, + "grad_norm": 0.8866088390350342, + "learning_rate": 0.00010711653476711169, + "loss": 2.4101, + "step": 15380 + }, + { + "epoch": 1.3934905211659985, + "grad_norm": 0.9789847731590271, + "learning_rate": 0.00010711049356612095, + "loss": 2.879, + "step": 15381 + }, + { + "epoch": 1.3935811193404453, + "grad_norm": 0.8454307913780212, + "learning_rate": 0.0001071044523651302, + "loss": 2.653, + "step": 15382 + }, + { + "epoch": 1.393671717514892, + "grad_norm": 0.939683735370636, + "learning_rate": 0.00010709841116413943, + "loss": 2.9061, + "step": 15383 + }, + { + "epoch": 1.3937623156893388, + "grad_norm": 0.876384437084198, + "learning_rate": 0.00010709236996314868, + "loss": 2.829, + "step": 15384 + }, + { + "epoch": 1.3938529138637856, + "grad_norm": 0.9184178709983826, + "learning_rate": 0.00010708632876215792, + "loss": 2.9325, + "step": 15385 + }, + { + "epoch": 1.3939435120382324, + "grad_norm": 0.9645160436630249, + "learning_rate": 0.00010708028756116716, + "loss": 2.9226, + "step": 15386 + }, + { + "epoch": 1.3940341102126792, + "grad_norm": 0.9361401200294495, + "learning_rate": 0.0001070742463601764, + "loss": 2.7272, + "step": 15387 + }, + { + "epoch": 1.394124708387126, + "grad_norm": 0.9083284139633179, + "learning_rate": 0.00010706820515918566, + "loss": 2.5384, + "step": 15388 + }, + { + "epoch": 1.3942153065615728, + "grad_norm": 0.9006229639053345, + "learning_rate": 0.00010706216395819488, + "loss": 2.6713, + "step": 15389 + }, + { + "epoch": 1.3943059047360196, + "grad_norm": 1.0179495811462402, + "learning_rate": 0.00010705612275720414, + "loss": 2.5704, + "step": 15390 + }, + { + "epoch": 1.3943965029104664, + "grad_norm": 0.8982391953468323, + "learning_rate": 0.00010705008155621339, + "loss": 2.9579, + "step": 15391 + }, + { + "epoch": 1.3944871010849131, + "grad_norm": 0.9625990986824036, + "learning_rate": 0.00010704404035522262, + "loss": 2.779, + "step": 15392 + }, + { + "epoch": 1.39457769925936, + "grad_norm": 0.8514867424964905, + "learning_rate": 0.00010703799915423187, + "loss": 2.6859, + "step": 15393 + }, + { + "epoch": 1.3946682974338067, + "grad_norm": 1.1185661554336548, + "learning_rate": 0.0001070319579532411, + "loss": 2.5215, + "step": 15394 + }, + { + "epoch": 1.3947588956082535, + "grad_norm": 0.9437501430511475, + "learning_rate": 0.00010702591675225035, + "loss": 2.8669, + "step": 15395 + }, + { + "epoch": 1.3948494937827003, + "grad_norm": 0.8936399817466736, + "learning_rate": 0.00010701987555125959, + "loss": 2.7373, + "step": 15396 + }, + { + "epoch": 1.394940091957147, + "grad_norm": 0.912371814250946, + "learning_rate": 0.00010701383435026883, + "loss": 2.3261, + "step": 15397 + }, + { + "epoch": 1.3950306901315939, + "grad_norm": 0.8958061337471008, + "learning_rate": 0.0001070077931492781, + "loss": 2.7411, + "step": 15398 + }, + { + "epoch": 1.3951212883060407, + "grad_norm": 0.9463261365890503, + "learning_rate": 0.00010700175194828732, + "loss": 2.7867, + "step": 15399 + }, + { + "epoch": 1.3952118864804874, + "grad_norm": 1.0860391855239868, + "learning_rate": 0.00010699571074729658, + "loss": 2.6292, + "step": 15400 + }, + { + "epoch": 1.3953024846549342, + "grad_norm": 0.8669007420539856, + "learning_rate": 0.00010698966954630581, + "loss": 2.6027, + "step": 15401 + }, + { + "epoch": 1.395393082829381, + "grad_norm": 0.9644559025764465, + "learning_rate": 0.00010698362834531506, + "loss": 2.5614, + "step": 15402 + }, + { + "epoch": 1.3954836810038278, + "grad_norm": 1.0595704317092896, + "learning_rate": 0.00010697758714432429, + "loss": 2.6716, + "step": 15403 + }, + { + "epoch": 1.3955742791782746, + "grad_norm": 1.0270155668258667, + "learning_rate": 0.00010697154594333354, + "loss": 2.4778, + "step": 15404 + }, + { + "epoch": 1.3956648773527214, + "grad_norm": 0.9295092225074768, + "learning_rate": 0.00010696550474234279, + "loss": 2.6601, + "step": 15405 + }, + { + "epoch": 1.3957554755271682, + "grad_norm": 0.8409826755523682, + "learning_rate": 0.00010695946354135202, + "loss": 2.7053, + "step": 15406 + }, + { + "epoch": 1.395846073701615, + "grad_norm": 0.7848532795906067, + "learning_rate": 0.00010695342234036127, + "loss": 2.0941, + "step": 15407 + }, + { + "epoch": 1.3959366718760617, + "grad_norm": 0.8407394289970398, + "learning_rate": 0.0001069473811393705, + "loss": 2.6132, + "step": 15408 + }, + { + "epoch": 1.3960272700505085, + "grad_norm": 0.900488555431366, + "learning_rate": 0.00010694133993837976, + "loss": 2.6146, + "step": 15409 + }, + { + "epoch": 1.3961178682249553, + "grad_norm": 0.8777965903282166, + "learning_rate": 0.00010693529873738898, + "loss": 2.7274, + "step": 15410 + }, + { + "epoch": 1.396208466399402, + "grad_norm": 0.9197179079055786, + "learning_rate": 0.00010692925753639825, + "loss": 2.7551, + "step": 15411 + }, + { + "epoch": 1.396299064573849, + "grad_norm": 0.9676556587219238, + "learning_rate": 0.0001069232163354075, + "loss": 2.984, + "step": 15412 + }, + { + "epoch": 1.3963896627482957, + "grad_norm": 0.8740732669830322, + "learning_rate": 0.00010691717513441673, + "loss": 2.7547, + "step": 15413 + }, + { + "epoch": 1.3964802609227425, + "grad_norm": 0.9239213466644287, + "learning_rate": 0.00010691113393342598, + "loss": 2.7719, + "step": 15414 + }, + { + "epoch": 1.3965708590971893, + "grad_norm": 0.8646217584609985, + "learning_rate": 0.00010690509273243521, + "loss": 2.6823, + "step": 15415 + }, + { + "epoch": 1.396661457271636, + "grad_norm": 0.886494517326355, + "learning_rate": 0.00010689905153144446, + "loss": 2.6481, + "step": 15416 + }, + { + "epoch": 1.3967520554460828, + "grad_norm": 0.8719490766525269, + "learning_rate": 0.00010689301033045369, + "loss": 2.0206, + "step": 15417 + }, + { + "epoch": 1.3968426536205296, + "grad_norm": 0.9334052205085754, + "learning_rate": 0.00010688696912946294, + "loss": 2.6319, + "step": 15418 + }, + { + "epoch": 1.3969332517949764, + "grad_norm": 0.9081969261169434, + "learning_rate": 0.0001068809279284722, + "loss": 2.966, + "step": 15419 + }, + { + "epoch": 1.3970238499694232, + "grad_norm": 0.959445059299469, + "learning_rate": 0.00010687488672748143, + "loss": 2.7282, + "step": 15420 + }, + { + "epoch": 1.39711444814387, + "grad_norm": 0.9204225540161133, + "learning_rate": 0.00010686884552649068, + "loss": 2.7553, + "step": 15421 + }, + { + "epoch": 1.3972050463183168, + "grad_norm": 0.8518250584602356, + "learning_rate": 0.00010686280432549992, + "loss": 2.8289, + "step": 15422 + }, + { + "epoch": 1.3972956444927633, + "grad_norm": 0.881614625453949, + "learning_rate": 0.00010685676312450916, + "loss": 2.6122, + "step": 15423 + }, + { + "epoch": 1.3973862426672103, + "grad_norm": 0.8545541763305664, + "learning_rate": 0.0001068507219235184, + "loss": 2.5136, + "step": 15424 + }, + { + "epoch": 1.397476840841657, + "grad_norm": 0.8837812542915344, + "learning_rate": 0.00010684468072252764, + "loss": 2.7311, + "step": 15425 + }, + { + "epoch": 1.397567439016104, + "grad_norm": 0.8551583886146545, + "learning_rate": 0.00010683863952153688, + "loss": 2.5776, + "step": 15426 + }, + { + "epoch": 1.3976580371905505, + "grad_norm": 0.9509851932525635, + "learning_rate": 0.00010683259832054613, + "loss": 2.7373, + "step": 15427 + }, + { + "epoch": 1.3977486353649975, + "grad_norm": 0.9360498785972595, + "learning_rate": 0.00010682655711955539, + "loss": 2.6359, + "step": 15428 + }, + { + "epoch": 1.397839233539444, + "grad_norm": 0.8927934169769287, + "learning_rate": 0.00010682051591856461, + "loss": 2.7116, + "step": 15429 + }, + { + "epoch": 1.397929831713891, + "grad_norm": 0.9856849908828735, + "learning_rate": 0.00010681447471757387, + "loss": 2.7231, + "step": 15430 + }, + { + "epoch": 1.3980204298883376, + "grad_norm": 0.8116346001625061, + "learning_rate": 0.00010680843351658309, + "loss": 1.9777, + "step": 15431 + }, + { + "epoch": 1.3981110280627846, + "grad_norm": 0.9173145294189453, + "learning_rate": 0.00010680239231559235, + "loss": 2.6092, + "step": 15432 + }, + { + "epoch": 1.3982016262372312, + "grad_norm": 0.9751958250999451, + "learning_rate": 0.00010679635111460158, + "loss": 2.7586, + "step": 15433 + }, + { + "epoch": 1.3982922244116782, + "grad_norm": 0.9035011529922485, + "learning_rate": 0.00010679030991361083, + "loss": 2.7399, + "step": 15434 + }, + { + "epoch": 1.3983828225861248, + "grad_norm": 0.893263578414917, + "learning_rate": 0.00010678426871262008, + "loss": 2.7493, + "step": 15435 + }, + { + "epoch": 1.3984734207605718, + "grad_norm": 0.8969844579696655, + "learning_rate": 0.00010677822751162931, + "loss": 2.5823, + "step": 15436 + }, + { + "epoch": 1.3985640189350184, + "grad_norm": 0.8513990640640259, + "learning_rate": 0.00010677218631063856, + "loss": 2.7438, + "step": 15437 + }, + { + "epoch": 1.3986546171094654, + "grad_norm": 0.9084559082984924, + "learning_rate": 0.0001067661451096478, + "loss": 2.5647, + "step": 15438 + }, + { + "epoch": 1.398745215283912, + "grad_norm": 0.7360543012619019, + "learning_rate": 0.00010676010390865704, + "loss": 1.9021, + "step": 15439 + }, + { + "epoch": 1.398835813458359, + "grad_norm": 0.899829626083374, + "learning_rate": 0.00010675406270766628, + "loss": 2.4604, + "step": 15440 + }, + { + "epoch": 1.3989264116328055, + "grad_norm": 0.9753703474998474, + "learning_rate": 0.00010674802150667554, + "loss": 2.9152, + "step": 15441 + }, + { + "epoch": 1.3990170098072525, + "grad_norm": 0.7890316843986511, + "learning_rate": 0.00010674198030568479, + "loss": 2.242, + "step": 15442 + }, + { + "epoch": 1.399107607981699, + "grad_norm": 0.958104133605957, + "learning_rate": 0.00010673593910469402, + "loss": 2.6016, + "step": 15443 + }, + { + "epoch": 1.3991982061561459, + "grad_norm": 0.7828273773193359, + "learning_rate": 0.00010672989790370327, + "loss": 2.189, + "step": 15444 + }, + { + "epoch": 1.3992888043305927, + "grad_norm": 0.9115172624588013, + "learning_rate": 0.0001067238567027125, + "loss": 2.7328, + "step": 15445 + }, + { + "epoch": 1.3993794025050394, + "grad_norm": 0.8729400634765625, + "learning_rate": 0.00010671781550172175, + "loss": 2.711, + "step": 15446 + }, + { + "epoch": 1.3994700006794862, + "grad_norm": 0.887810230255127, + "learning_rate": 0.00010671177430073098, + "loss": 3.0031, + "step": 15447 + }, + { + "epoch": 1.399560598853933, + "grad_norm": 0.8871765732765198, + "learning_rate": 0.00010670573309974023, + "loss": 2.6545, + "step": 15448 + }, + { + "epoch": 1.3996511970283798, + "grad_norm": 0.9049781560897827, + "learning_rate": 0.00010669969189874949, + "loss": 2.4913, + "step": 15449 + }, + { + "epoch": 1.3997417952028266, + "grad_norm": 0.8829920887947083, + "learning_rate": 0.00010669365069775871, + "loss": 2.6099, + "step": 15450 + }, + { + "epoch": 1.3998323933772734, + "grad_norm": 0.9649124145507812, + "learning_rate": 0.00010668760949676797, + "loss": 2.87, + "step": 15451 + }, + { + "epoch": 1.3999229915517202, + "grad_norm": 0.9239031076431274, + "learning_rate": 0.00010668156829577721, + "loss": 2.6769, + "step": 15452 + }, + { + "epoch": 1.400013589726167, + "grad_norm": 0.8848552703857422, + "learning_rate": 0.00010667552709478646, + "loss": 2.7116, + "step": 15453 + }, + { + "epoch": 1.4001041879006138, + "grad_norm": 0.9244495034217834, + "learning_rate": 0.00010666948589379569, + "loss": 2.7435, + "step": 15454 + }, + { + "epoch": 1.4001947860750605, + "grad_norm": 0.8938380479812622, + "learning_rate": 0.00010666344469280494, + "loss": 2.5287, + "step": 15455 + }, + { + "epoch": 1.4002853842495073, + "grad_norm": 0.8678187131881714, + "learning_rate": 0.00010665740349181417, + "loss": 2.6316, + "step": 15456 + }, + { + "epoch": 1.4003759824239541, + "grad_norm": 0.837033748626709, + "learning_rate": 0.00010665136229082342, + "loss": 2.5178, + "step": 15457 + }, + { + "epoch": 1.400466580598401, + "grad_norm": 0.8148756623268127, + "learning_rate": 0.00010664532108983267, + "loss": 1.9939, + "step": 15458 + }, + { + "epoch": 1.4005571787728477, + "grad_norm": 0.785550594329834, + "learning_rate": 0.0001066392798888419, + "loss": 2.0415, + "step": 15459 + }, + { + "epoch": 1.4006477769472945, + "grad_norm": 0.9429792165756226, + "learning_rate": 0.00010663323868785116, + "loss": 2.5691, + "step": 15460 + }, + { + "epoch": 1.4007383751217413, + "grad_norm": 0.9303426742553711, + "learning_rate": 0.00010662719748686038, + "loss": 2.7264, + "step": 15461 + }, + { + "epoch": 1.400828973296188, + "grad_norm": 0.963034987449646, + "learning_rate": 0.00010662115628586964, + "loss": 2.4645, + "step": 15462 + }, + { + "epoch": 1.4009195714706348, + "grad_norm": 0.8607929944992065, + "learning_rate": 0.00010661511508487886, + "loss": 2.6778, + "step": 15463 + }, + { + "epoch": 1.4010101696450816, + "grad_norm": 0.895878791809082, + "learning_rate": 0.00010660907388388812, + "loss": 2.6785, + "step": 15464 + }, + { + "epoch": 1.4011007678195284, + "grad_norm": 0.851431131362915, + "learning_rate": 0.00010660303268289737, + "loss": 2.743, + "step": 15465 + }, + { + "epoch": 1.4011913659939752, + "grad_norm": 0.8714445233345032, + "learning_rate": 0.0001065969914819066, + "loss": 2.6475, + "step": 15466 + }, + { + "epoch": 1.401281964168422, + "grad_norm": 0.8241127133369446, + "learning_rate": 0.00010659095028091585, + "loss": 2.094, + "step": 15467 + }, + { + "epoch": 1.4013725623428688, + "grad_norm": 0.9010136127471924, + "learning_rate": 0.00010658490907992509, + "loss": 2.7111, + "step": 15468 + }, + { + "epoch": 1.4014631605173156, + "grad_norm": 0.8912574648857117, + "learning_rate": 0.00010657886787893434, + "loss": 2.7526, + "step": 15469 + }, + { + "epoch": 1.4015537586917624, + "grad_norm": 0.9072902798652649, + "learning_rate": 0.00010657282667794357, + "loss": 2.5625, + "step": 15470 + }, + { + "epoch": 1.4016443568662091, + "grad_norm": 0.8838597536087036, + "learning_rate": 0.00010656678547695282, + "loss": 2.5658, + "step": 15471 + }, + { + "epoch": 1.401734955040656, + "grad_norm": 0.8485627174377441, + "learning_rate": 0.00010656074427596208, + "loss": 2.3761, + "step": 15472 + }, + { + "epoch": 1.4018255532151027, + "grad_norm": 0.9536202549934387, + "learning_rate": 0.00010655470307497131, + "loss": 2.6774, + "step": 15473 + }, + { + "epoch": 1.4019161513895495, + "grad_norm": 0.9974142909049988, + "learning_rate": 0.00010654866187398056, + "loss": 2.5966, + "step": 15474 + }, + { + "epoch": 1.4020067495639963, + "grad_norm": 0.8705393075942993, + "learning_rate": 0.0001065426206729898, + "loss": 2.8465, + "step": 15475 + }, + { + "epoch": 1.402097347738443, + "grad_norm": 0.9660161137580872, + "learning_rate": 0.00010653657947199904, + "loss": 2.7805, + "step": 15476 + }, + { + "epoch": 1.4021879459128899, + "grad_norm": 0.8608981370925903, + "learning_rate": 0.00010653053827100828, + "loss": 2.6943, + "step": 15477 + }, + { + "epoch": 1.4022785440873367, + "grad_norm": 0.9363119602203369, + "learning_rate": 0.00010652449707001752, + "loss": 2.7958, + "step": 15478 + }, + { + "epoch": 1.4023691422617834, + "grad_norm": 0.8964061737060547, + "learning_rate": 0.00010651845586902678, + "loss": 2.6635, + "step": 15479 + }, + { + "epoch": 1.4024597404362302, + "grad_norm": 0.7981519103050232, + "learning_rate": 0.000106512414668036, + "loss": 2.0021, + "step": 15480 + }, + { + "epoch": 1.402550338610677, + "grad_norm": 0.8670828342437744, + "learning_rate": 0.00010650637346704527, + "loss": 2.6751, + "step": 15481 + }, + { + "epoch": 1.4026409367851238, + "grad_norm": 0.9338674545288086, + "learning_rate": 0.00010650033226605449, + "loss": 2.7578, + "step": 15482 + }, + { + "epoch": 1.4027315349595706, + "grad_norm": 0.9254218339920044, + "learning_rate": 0.00010649429106506375, + "loss": 2.4522, + "step": 15483 + }, + { + "epoch": 1.4028221331340174, + "grad_norm": 0.9851395487785339, + "learning_rate": 0.00010648824986407297, + "loss": 2.5731, + "step": 15484 + }, + { + "epoch": 1.4029127313084642, + "grad_norm": 0.9271631836891174, + "learning_rate": 0.00010648220866308223, + "loss": 2.8274, + "step": 15485 + }, + { + "epoch": 1.403003329482911, + "grad_norm": 0.8954036831855774, + "learning_rate": 0.00010647616746209146, + "loss": 2.1748, + "step": 15486 + }, + { + "epoch": 1.4030939276573577, + "grad_norm": 0.7929334044456482, + "learning_rate": 0.00010647012626110071, + "loss": 1.9253, + "step": 15487 + }, + { + "epoch": 1.4031845258318045, + "grad_norm": 0.8767159581184387, + "learning_rate": 0.00010646408506010996, + "loss": 2.6762, + "step": 15488 + }, + { + "epoch": 1.4032751240062513, + "grad_norm": 0.9187232851982117, + "learning_rate": 0.00010645804385911919, + "loss": 2.6607, + "step": 15489 + }, + { + "epoch": 1.403365722180698, + "grad_norm": 0.8870342373847961, + "learning_rate": 0.00010645200265812844, + "loss": 2.7662, + "step": 15490 + }, + { + "epoch": 1.403456320355145, + "grad_norm": 0.9089182019233704, + "learning_rate": 0.00010644596145713767, + "loss": 2.5285, + "step": 15491 + }, + { + "epoch": 1.4035469185295917, + "grad_norm": 0.9064914584159851, + "learning_rate": 0.00010643992025614694, + "loss": 2.9627, + "step": 15492 + }, + { + "epoch": 1.4036375167040385, + "grad_norm": 0.9068315029144287, + "learning_rate": 0.00010643387905515616, + "loss": 2.6135, + "step": 15493 + }, + { + "epoch": 1.4037281148784853, + "grad_norm": 0.9259910583496094, + "learning_rate": 0.00010642783785416542, + "loss": 2.5495, + "step": 15494 + }, + { + "epoch": 1.403818713052932, + "grad_norm": 0.891585111618042, + "learning_rate": 0.00010642179665317467, + "loss": 2.5517, + "step": 15495 + }, + { + "epoch": 1.4039093112273788, + "grad_norm": 0.9336378574371338, + "learning_rate": 0.0001064157554521839, + "loss": 2.5972, + "step": 15496 + }, + { + "epoch": 1.4039999094018256, + "grad_norm": 1.0472909212112427, + "learning_rate": 0.00010640971425119315, + "loss": 2.7523, + "step": 15497 + }, + { + "epoch": 1.4040905075762724, + "grad_norm": 0.8848074078559875, + "learning_rate": 0.00010640367305020238, + "loss": 2.6499, + "step": 15498 + }, + { + "epoch": 1.4041811057507192, + "grad_norm": 0.8844707608222961, + "learning_rate": 0.00010639763184921163, + "loss": 2.5842, + "step": 15499 + }, + { + "epoch": 1.404271703925166, + "grad_norm": 0.9333747029304504, + "learning_rate": 0.00010639159064822086, + "loss": 2.7861, + "step": 15500 + }, + { + "epoch": 1.4043623020996128, + "grad_norm": 0.9057377576828003, + "learning_rate": 0.00010638554944723011, + "loss": 2.5466, + "step": 15501 + }, + { + "epoch": 1.4044529002740596, + "grad_norm": 0.9730805158615112, + "learning_rate": 0.00010637950824623937, + "loss": 2.9666, + "step": 15502 + }, + { + "epoch": 1.4045434984485063, + "grad_norm": 0.9327659010887146, + "learning_rate": 0.00010637346704524859, + "loss": 2.769, + "step": 15503 + }, + { + "epoch": 1.404634096622953, + "grad_norm": 0.7123082876205444, + "learning_rate": 0.00010636742584425785, + "loss": 1.8356, + "step": 15504 + }, + { + "epoch": 1.4047246947974, + "grad_norm": 0.8817719221115112, + "learning_rate": 0.00010636138464326709, + "loss": 2.9181, + "step": 15505 + }, + { + "epoch": 1.4048152929718465, + "grad_norm": 0.9106013774871826, + "learning_rate": 0.00010635534344227633, + "loss": 2.8672, + "step": 15506 + }, + { + "epoch": 1.4049058911462935, + "grad_norm": 0.9554626941680908, + "learning_rate": 0.00010634930224128557, + "loss": 2.8587, + "step": 15507 + }, + { + "epoch": 1.40499648932074, + "grad_norm": 0.9791618585586548, + "learning_rate": 0.00010634326104029482, + "loss": 2.606, + "step": 15508 + }, + { + "epoch": 1.405087087495187, + "grad_norm": 0.947522759437561, + "learning_rate": 0.00010633721983930406, + "loss": 2.7181, + "step": 15509 + }, + { + "epoch": 1.4051776856696336, + "grad_norm": 0.9083862900733948, + "learning_rate": 0.0001063311786383133, + "loss": 2.9653, + "step": 15510 + }, + { + "epoch": 1.4052682838440806, + "grad_norm": 0.9467526078224182, + "learning_rate": 0.00010632513743732256, + "loss": 2.9164, + "step": 15511 + }, + { + "epoch": 1.4053588820185272, + "grad_norm": 0.9809921979904175, + "learning_rate": 0.00010631909623633178, + "loss": 2.7327, + "step": 15512 + }, + { + "epoch": 1.4054494801929742, + "grad_norm": 0.8658980131149292, + "learning_rate": 0.00010631305503534104, + "loss": 2.5801, + "step": 15513 + }, + { + "epoch": 1.4055400783674208, + "grad_norm": 0.9119535684585571, + "learning_rate": 0.00010630701383435026, + "loss": 2.7746, + "step": 15514 + }, + { + "epoch": 1.4056306765418678, + "grad_norm": 0.9265426993370056, + "learning_rate": 0.00010630097263335952, + "loss": 2.5866, + "step": 15515 + }, + { + "epoch": 1.4057212747163144, + "grad_norm": 0.8868656158447266, + "learning_rate": 0.00010629493143236874, + "loss": 2.7445, + "step": 15516 + }, + { + "epoch": 1.4058118728907614, + "grad_norm": 0.8422524333000183, + "learning_rate": 0.000106288890231378, + "loss": 2.5885, + "step": 15517 + }, + { + "epoch": 1.405902471065208, + "grad_norm": 0.8491546511650085, + "learning_rate": 0.00010628284903038725, + "loss": 2.6371, + "step": 15518 + }, + { + "epoch": 1.405993069239655, + "grad_norm": 0.9544780850410461, + "learning_rate": 0.00010627680782939649, + "loss": 2.51, + "step": 15519 + }, + { + "epoch": 1.4060836674141015, + "grad_norm": 0.9106453657150269, + "learning_rate": 0.00010627076662840573, + "loss": 2.5766, + "step": 15520 + }, + { + "epoch": 1.4061742655885485, + "grad_norm": 0.9487755298614502, + "learning_rate": 0.00010626472542741497, + "loss": 2.7561, + "step": 15521 + }, + { + "epoch": 1.406264863762995, + "grad_norm": 0.8351373672485352, + "learning_rate": 0.00010625868422642422, + "loss": 2.5924, + "step": 15522 + }, + { + "epoch": 1.406355461937442, + "grad_norm": 0.8683030009269714, + "learning_rate": 0.00010625264302543345, + "loss": 2.574, + "step": 15523 + }, + { + "epoch": 1.4064460601118887, + "grad_norm": 0.9213854670524597, + "learning_rate": 0.00010624660182444271, + "loss": 2.6222, + "step": 15524 + }, + { + "epoch": 1.4065366582863355, + "grad_norm": 1.0074388980865479, + "learning_rate": 0.00010624056062345196, + "loss": 2.6574, + "step": 15525 + }, + { + "epoch": 1.4066272564607822, + "grad_norm": 0.8356515765190125, + "learning_rate": 0.00010623451942246119, + "loss": 2.6449, + "step": 15526 + }, + { + "epoch": 1.406717854635229, + "grad_norm": 0.892952024936676, + "learning_rate": 0.00010622847822147044, + "loss": 2.9239, + "step": 15527 + }, + { + "epoch": 1.4068084528096758, + "grad_norm": 0.8210960030555725, + "learning_rate": 0.00010622243702047967, + "loss": 2.2189, + "step": 15528 + }, + { + "epoch": 1.4068990509841226, + "grad_norm": 0.9047417044639587, + "learning_rate": 0.00010621639581948892, + "loss": 2.7284, + "step": 15529 + }, + { + "epoch": 1.4069896491585694, + "grad_norm": 0.9092671871185303, + "learning_rate": 0.00010621035461849816, + "loss": 2.9684, + "step": 15530 + }, + { + "epoch": 1.4070802473330162, + "grad_norm": 0.9125660061836243, + "learning_rate": 0.0001062043134175074, + "loss": 2.9742, + "step": 15531 + }, + { + "epoch": 1.407170845507463, + "grad_norm": 0.8576372861862183, + "learning_rate": 0.00010619827221651666, + "loss": 2.639, + "step": 15532 + }, + { + "epoch": 1.4072614436819098, + "grad_norm": 1.002008080482483, + "learning_rate": 0.00010619223101552588, + "loss": 2.444, + "step": 15533 + }, + { + "epoch": 1.4073520418563565, + "grad_norm": 0.9417327642440796, + "learning_rate": 0.00010618618981453515, + "loss": 2.6588, + "step": 15534 + }, + { + "epoch": 1.4074426400308033, + "grad_norm": 0.8932138681411743, + "learning_rate": 0.00010618014861354437, + "loss": 3.0138, + "step": 15535 + }, + { + "epoch": 1.4075332382052501, + "grad_norm": 0.8490815162658691, + "learning_rate": 0.00010617410741255363, + "loss": 2.5892, + "step": 15536 + }, + { + "epoch": 1.407623836379697, + "grad_norm": 0.8844385147094727, + "learning_rate": 0.00010616806621156286, + "loss": 2.6094, + "step": 15537 + }, + { + "epoch": 1.4077144345541437, + "grad_norm": 1.0478872060775757, + "learning_rate": 0.00010616202501057211, + "loss": 2.8075, + "step": 15538 + }, + { + "epoch": 1.4078050327285905, + "grad_norm": 0.9584821462631226, + "learning_rate": 0.00010615598380958136, + "loss": 2.7078, + "step": 15539 + }, + { + "epoch": 1.4078956309030373, + "grad_norm": 0.967642605304718, + "learning_rate": 0.00010614994260859059, + "loss": 2.6662, + "step": 15540 + }, + { + "epoch": 1.407986229077484, + "grad_norm": 0.969519317150116, + "learning_rate": 0.00010614390140759984, + "loss": 2.0141, + "step": 15541 + }, + { + "epoch": 1.4080768272519308, + "grad_norm": 0.865511417388916, + "learning_rate": 0.00010613786020660907, + "loss": 2.0913, + "step": 15542 + }, + { + "epoch": 1.4081674254263776, + "grad_norm": 0.9303832650184631, + "learning_rate": 0.00010613181900561833, + "loss": 2.7929, + "step": 15543 + }, + { + "epoch": 1.4082580236008244, + "grad_norm": 0.8313817381858826, + "learning_rate": 0.00010612577780462755, + "loss": 2.0311, + "step": 15544 + }, + { + "epoch": 1.4083486217752712, + "grad_norm": 0.9237419366836548, + "learning_rate": 0.00010611973660363682, + "loss": 2.8527, + "step": 15545 + }, + { + "epoch": 1.408439219949718, + "grad_norm": 0.9017833471298218, + "learning_rate": 0.00010611369540264604, + "loss": 2.9814, + "step": 15546 + }, + { + "epoch": 1.4085298181241648, + "grad_norm": 0.8282190561294556, + "learning_rate": 0.0001061076542016553, + "loss": 1.8192, + "step": 15547 + }, + { + "epoch": 1.4086204162986116, + "grad_norm": 0.728314995765686, + "learning_rate": 0.00010610161300066454, + "loss": 2.221, + "step": 15548 + }, + { + "epoch": 1.4087110144730584, + "grad_norm": 0.9139243364334106, + "learning_rate": 0.00010609557179967378, + "loss": 2.8211, + "step": 15549 + }, + { + "epoch": 1.4088016126475051, + "grad_norm": 0.8431646823883057, + "learning_rate": 0.00010608953059868303, + "loss": 2.6473, + "step": 15550 + }, + { + "epoch": 1.408892210821952, + "grad_norm": 0.8715980648994446, + "learning_rate": 0.00010608348939769226, + "loss": 2.7445, + "step": 15551 + }, + { + "epoch": 1.4089828089963987, + "grad_norm": 0.9030753970146179, + "learning_rate": 0.00010607744819670151, + "loss": 2.8666, + "step": 15552 + }, + { + "epoch": 1.4090734071708455, + "grad_norm": 0.9031649231910706, + "learning_rate": 0.00010607140699571074, + "loss": 2.6103, + "step": 15553 + }, + { + "epoch": 1.4091640053452923, + "grad_norm": 0.9663422107696533, + "learning_rate": 0.00010606536579471999, + "loss": 2.7866, + "step": 15554 + }, + { + "epoch": 1.409254603519739, + "grad_norm": 0.969334065914154, + "learning_rate": 0.00010605932459372925, + "loss": 2.9156, + "step": 15555 + }, + { + "epoch": 1.4093452016941859, + "grad_norm": 0.869122326374054, + "learning_rate": 0.00010605328339273848, + "loss": 2.8374, + "step": 15556 + }, + { + "epoch": 1.4094357998686327, + "grad_norm": 0.9188150763511658, + "learning_rate": 0.00010604724219174773, + "loss": 2.5252, + "step": 15557 + }, + { + "epoch": 1.4095263980430794, + "grad_norm": 0.8901343941688538, + "learning_rate": 0.00010604120099075697, + "loss": 2.583, + "step": 15558 + }, + { + "epoch": 1.4096169962175262, + "grad_norm": 0.8844144940376282, + "learning_rate": 0.00010603515978976621, + "loss": 2.6543, + "step": 15559 + }, + { + "epoch": 1.409707594391973, + "grad_norm": 0.9023352861404419, + "learning_rate": 0.00010602911858877545, + "loss": 2.5609, + "step": 15560 + }, + { + "epoch": 1.4097981925664198, + "grad_norm": 0.8791859149932861, + "learning_rate": 0.0001060230773877847, + "loss": 2.63, + "step": 15561 + }, + { + "epoch": 1.4098887907408666, + "grad_norm": 0.9657248258590698, + "learning_rate": 0.00010601703618679396, + "loss": 2.752, + "step": 15562 + }, + { + "epoch": 1.4099793889153134, + "grad_norm": 0.9085990190505981, + "learning_rate": 0.00010601099498580318, + "loss": 2.8344, + "step": 15563 + }, + { + "epoch": 1.4100699870897602, + "grad_norm": 0.8786447644233704, + "learning_rate": 0.00010600495378481244, + "loss": 2.9213, + "step": 15564 + }, + { + "epoch": 1.410160585264207, + "grad_norm": 0.9498050212860107, + "learning_rate": 0.00010599891258382166, + "loss": 2.8012, + "step": 15565 + }, + { + "epoch": 1.4102511834386537, + "grad_norm": 0.9294019341468811, + "learning_rate": 0.00010599287138283092, + "loss": 2.9406, + "step": 15566 + }, + { + "epoch": 1.4103417816131005, + "grad_norm": 0.8240673542022705, + "learning_rate": 0.00010598683018184014, + "loss": 2.2194, + "step": 15567 + }, + { + "epoch": 1.4104323797875473, + "grad_norm": 0.8915050029754639, + "learning_rate": 0.0001059807889808494, + "loss": 2.6023, + "step": 15568 + }, + { + "epoch": 1.410522977961994, + "grad_norm": 0.930475652217865, + "learning_rate": 0.00010597474777985865, + "loss": 2.8247, + "step": 15569 + }, + { + "epoch": 1.410613576136441, + "grad_norm": 0.9110034704208374, + "learning_rate": 0.00010596870657886788, + "loss": 3.0116, + "step": 15570 + }, + { + "epoch": 1.4107041743108877, + "grad_norm": 0.9400051236152649, + "learning_rate": 0.00010596266537787713, + "loss": 2.7757, + "step": 15571 + }, + { + "epoch": 1.4107947724853345, + "grad_norm": 0.8862847685813904, + "learning_rate": 0.00010595662417688636, + "loss": 2.7614, + "step": 15572 + }, + { + "epoch": 1.4108853706597813, + "grad_norm": 0.8846600651741028, + "learning_rate": 0.00010595058297589561, + "loss": 2.8594, + "step": 15573 + }, + { + "epoch": 1.410975968834228, + "grad_norm": 0.8938871026039124, + "learning_rate": 0.00010594454177490485, + "loss": 2.0952, + "step": 15574 + }, + { + "epoch": 1.4110665670086748, + "grad_norm": 0.9089074730873108, + "learning_rate": 0.00010593850057391411, + "loss": 2.7689, + "step": 15575 + }, + { + "epoch": 1.4111571651831216, + "grad_norm": 0.9715632796287537, + "learning_rate": 0.00010593245937292333, + "loss": 2.8504, + "step": 15576 + }, + { + "epoch": 1.4112477633575684, + "grad_norm": 0.8380113244056702, + "learning_rate": 0.00010592641817193259, + "loss": 2.7636, + "step": 15577 + }, + { + "epoch": 1.4113383615320152, + "grad_norm": 0.7915178537368774, + "learning_rate": 0.00010592037697094184, + "loss": 2.1351, + "step": 15578 + }, + { + "epoch": 1.411428959706462, + "grad_norm": 0.9036857485771179, + "learning_rate": 0.00010591433576995107, + "loss": 2.7522, + "step": 15579 + }, + { + "epoch": 1.4115195578809088, + "grad_norm": 1.033710241317749, + "learning_rate": 0.00010590829456896032, + "loss": 2.5885, + "step": 15580 + }, + { + "epoch": 1.4116101560553556, + "grad_norm": 0.9170441627502441, + "learning_rate": 0.00010590225336796955, + "loss": 2.4946, + "step": 15581 + }, + { + "epoch": 1.4117007542298023, + "grad_norm": 0.8747112154960632, + "learning_rate": 0.0001058962121669788, + "loss": 2.876, + "step": 15582 + }, + { + "epoch": 1.4117913524042491, + "grad_norm": 0.8962070941925049, + "learning_rate": 0.00010589017096598803, + "loss": 2.7162, + "step": 15583 + }, + { + "epoch": 1.411881950578696, + "grad_norm": 0.8906370997428894, + "learning_rate": 0.00010588412976499728, + "loss": 2.9696, + "step": 15584 + }, + { + "epoch": 1.4119725487531425, + "grad_norm": 0.9185800552368164, + "learning_rate": 0.00010587808856400654, + "loss": 2.8213, + "step": 15585 + }, + { + "epoch": 1.4120631469275895, + "grad_norm": 0.8887556791305542, + "learning_rate": 0.00010587204736301576, + "loss": 2.2468, + "step": 15586 + }, + { + "epoch": 1.412153745102036, + "grad_norm": 0.9206401109695435, + "learning_rate": 0.00010586600616202502, + "loss": 2.786, + "step": 15587 + }, + { + "epoch": 1.412244343276483, + "grad_norm": 0.9139530658721924, + "learning_rate": 0.00010585996496103426, + "loss": 2.4123, + "step": 15588 + }, + { + "epoch": 1.4123349414509296, + "grad_norm": 0.8610823154449463, + "learning_rate": 0.0001058539237600435, + "loss": 2.2462, + "step": 15589 + }, + { + "epoch": 1.4124255396253766, + "grad_norm": 0.7941434979438782, + "learning_rate": 0.00010584788255905274, + "loss": 2.1267, + "step": 15590 + }, + { + "epoch": 1.4125161377998232, + "grad_norm": 0.8390020132064819, + "learning_rate": 0.00010584184135806199, + "loss": 2.1696, + "step": 15591 + }, + { + "epoch": 1.4126067359742702, + "grad_norm": 0.923326849937439, + "learning_rate": 0.00010583580015707124, + "loss": 2.6147, + "step": 15592 + }, + { + "epoch": 1.4126973341487168, + "grad_norm": 0.8946356177330017, + "learning_rate": 0.00010582975895608047, + "loss": 2.5668, + "step": 15593 + }, + { + "epoch": 1.4127879323231638, + "grad_norm": 0.8579235076904297, + "learning_rate": 0.00010582371775508972, + "loss": 2.6294, + "step": 15594 + }, + { + "epoch": 1.4128785304976104, + "grad_norm": 0.8851041793823242, + "learning_rate": 0.00010581767655409895, + "loss": 2.1584, + "step": 15595 + }, + { + "epoch": 1.4129691286720574, + "grad_norm": 0.9295178651809692, + "learning_rate": 0.00010581163535310821, + "loss": 2.7671, + "step": 15596 + }, + { + "epoch": 1.413059726846504, + "grad_norm": 0.9539811611175537, + "learning_rate": 0.00010580559415211743, + "loss": 2.7433, + "step": 15597 + }, + { + "epoch": 1.413150325020951, + "grad_norm": 0.9197204113006592, + "learning_rate": 0.0001057995529511267, + "loss": 2.6385, + "step": 15598 + }, + { + "epoch": 1.4132409231953975, + "grad_norm": 0.8788797855377197, + "learning_rate": 0.00010579351175013594, + "loss": 2.6196, + "step": 15599 + }, + { + "epoch": 1.4133315213698445, + "grad_norm": 0.8969588279724121, + "learning_rate": 0.00010578747054914518, + "loss": 2.7259, + "step": 15600 + }, + { + "epoch": 1.413422119544291, + "grad_norm": 0.8647223114967346, + "learning_rate": 0.00010578142934815442, + "loss": 2.5844, + "step": 15601 + }, + { + "epoch": 1.413512717718738, + "grad_norm": 0.9013519883155823, + "learning_rate": 0.00010577538814716366, + "loss": 2.6841, + "step": 15602 + }, + { + "epoch": 1.4136033158931847, + "grad_norm": 0.8844185471534729, + "learning_rate": 0.0001057693469461729, + "loss": 2.5864, + "step": 15603 + }, + { + "epoch": 1.4136939140676317, + "grad_norm": 0.8783792853355408, + "learning_rate": 0.00010576330574518214, + "loss": 2.6139, + "step": 15604 + }, + { + "epoch": 1.4137845122420782, + "grad_norm": 0.9318008422851562, + "learning_rate": 0.00010575726454419139, + "loss": 2.5709, + "step": 15605 + }, + { + "epoch": 1.413875110416525, + "grad_norm": 0.8531637787818909, + "learning_rate": 0.00010575122334320062, + "loss": 2.5487, + "step": 15606 + }, + { + "epoch": 1.4139657085909718, + "grad_norm": 0.8485503792762756, + "learning_rate": 0.00010574518214220988, + "loss": 2.6541, + "step": 15607 + }, + { + "epoch": 1.4140563067654186, + "grad_norm": 0.8043048977851868, + "learning_rate": 0.00010573914094121913, + "loss": 1.9815, + "step": 15608 + }, + { + "epoch": 1.4141469049398654, + "grad_norm": 0.9761777520179749, + "learning_rate": 0.00010573309974022836, + "loss": 2.7677, + "step": 15609 + }, + { + "epoch": 1.4142375031143122, + "grad_norm": 0.8840779066085815, + "learning_rate": 0.00010572705853923761, + "loss": 2.7296, + "step": 15610 + }, + { + "epoch": 1.414328101288759, + "grad_norm": 0.9197785258293152, + "learning_rate": 0.00010572101733824685, + "loss": 2.606, + "step": 15611 + }, + { + "epoch": 1.4144186994632058, + "grad_norm": 0.8874826431274414, + "learning_rate": 0.00010571497613725609, + "loss": 2.636, + "step": 15612 + }, + { + "epoch": 1.4145092976376525, + "grad_norm": 0.8628225326538086, + "learning_rate": 0.00010570893493626533, + "loss": 2.7223, + "step": 15613 + }, + { + "epoch": 1.4145998958120993, + "grad_norm": 0.9436374306678772, + "learning_rate": 0.00010570289373527457, + "loss": 2.6513, + "step": 15614 + }, + { + "epoch": 1.4146904939865461, + "grad_norm": 0.9565494060516357, + "learning_rate": 0.00010569685253428384, + "loss": 2.6936, + "step": 15615 + }, + { + "epoch": 1.414781092160993, + "grad_norm": 1.0232431888580322, + "learning_rate": 0.00010569081133329306, + "loss": 2.7736, + "step": 15616 + }, + { + "epoch": 1.4148716903354397, + "grad_norm": 0.8676552176475525, + "learning_rate": 0.00010568477013230232, + "loss": 2.7386, + "step": 15617 + }, + { + "epoch": 1.4149622885098865, + "grad_norm": 0.9563570022583008, + "learning_rate": 0.00010567872893131154, + "loss": 2.7812, + "step": 15618 + }, + { + "epoch": 1.4150528866843333, + "grad_norm": 0.9574043154716492, + "learning_rate": 0.0001056726877303208, + "loss": 2.7814, + "step": 15619 + }, + { + "epoch": 1.41514348485878, + "grad_norm": 0.8673239350318909, + "learning_rate": 0.00010566664652933003, + "loss": 2.6653, + "step": 15620 + }, + { + "epoch": 1.4152340830332268, + "grad_norm": 0.7505585551261902, + "learning_rate": 0.00010566060532833928, + "loss": 2.0081, + "step": 15621 + }, + { + "epoch": 1.4153246812076736, + "grad_norm": 0.9326882362365723, + "learning_rate": 0.00010565456412734853, + "loss": 2.9922, + "step": 15622 + }, + { + "epoch": 1.4154152793821204, + "grad_norm": 0.9693891406059265, + "learning_rate": 0.00010564852292635776, + "loss": 2.8386, + "step": 15623 + }, + { + "epoch": 1.4155058775565672, + "grad_norm": 0.9332000613212585, + "learning_rate": 0.00010564248172536701, + "loss": 2.6904, + "step": 15624 + }, + { + "epoch": 1.415596475731014, + "grad_norm": 0.8949036598205566, + "learning_rate": 0.00010563644052437624, + "loss": 2.8012, + "step": 15625 + }, + { + "epoch": 1.4156870739054608, + "grad_norm": 0.9810470938682556, + "learning_rate": 0.00010563039932338549, + "loss": 2.518, + "step": 15626 + }, + { + "epoch": 1.4157776720799076, + "grad_norm": 0.9255223870277405, + "learning_rate": 0.00010562435812239473, + "loss": 2.7986, + "step": 15627 + }, + { + "epoch": 1.4158682702543544, + "grad_norm": 0.9131430387496948, + "learning_rate": 0.00010561831692140399, + "loss": 3.0079, + "step": 15628 + }, + { + "epoch": 1.4159588684288011, + "grad_norm": 0.9145027995109558, + "learning_rate": 0.00010561227572041323, + "loss": 2.6944, + "step": 15629 + }, + { + "epoch": 1.416049466603248, + "grad_norm": 0.8508443236351013, + "learning_rate": 0.00010560623451942247, + "loss": 2.7091, + "step": 15630 + }, + { + "epoch": 1.4161400647776947, + "grad_norm": 0.9020459651947021, + "learning_rate": 0.00010560019331843172, + "loss": 2.6405, + "step": 15631 + }, + { + "epoch": 1.4162306629521415, + "grad_norm": 0.9268031716346741, + "learning_rate": 0.00010559415211744095, + "loss": 2.7656, + "step": 15632 + }, + { + "epoch": 1.4163212611265883, + "grad_norm": 0.9472808837890625, + "learning_rate": 0.0001055881109164502, + "loss": 2.5208, + "step": 15633 + }, + { + "epoch": 1.416411859301035, + "grad_norm": 1.0338910818099976, + "learning_rate": 0.00010558206971545943, + "loss": 2.9233, + "step": 15634 + }, + { + "epoch": 1.4165024574754819, + "grad_norm": 0.9137048125267029, + "learning_rate": 0.00010557602851446868, + "loss": 2.8516, + "step": 15635 + }, + { + "epoch": 1.4165930556499287, + "grad_norm": 0.9619840979576111, + "learning_rate": 0.00010556998731347791, + "loss": 2.7268, + "step": 15636 + }, + { + "epoch": 1.4166836538243754, + "grad_norm": 0.9035990834236145, + "learning_rate": 0.00010556394611248716, + "loss": 2.9221, + "step": 15637 + }, + { + "epoch": 1.4167742519988222, + "grad_norm": 0.9251071214675903, + "learning_rate": 0.00010555790491149642, + "loss": 2.6092, + "step": 15638 + }, + { + "epoch": 1.416864850173269, + "grad_norm": 0.9093044996261597, + "learning_rate": 0.00010555186371050564, + "loss": 2.7059, + "step": 15639 + }, + { + "epoch": 1.4169554483477158, + "grad_norm": 0.9493022561073303, + "learning_rate": 0.0001055458225095149, + "loss": 2.6832, + "step": 15640 + }, + { + "epoch": 1.4170460465221626, + "grad_norm": 0.9184513688087463, + "learning_rate": 0.00010553978130852414, + "loss": 2.8134, + "step": 15641 + }, + { + "epoch": 1.4171366446966094, + "grad_norm": 0.8965427279472351, + "learning_rate": 0.00010553374010753339, + "loss": 2.7981, + "step": 15642 + }, + { + "epoch": 1.4172272428710562, + "grad_norm": 0.8844570517539978, + "learning_rate": 0.00010552769890654262, + "loss": 2.6468, + "step": 15643 + }, + { + "epoch": 1.417317841045503, + "grad_norm": 1.0700607299804688, + "learning_rate": 0.00010552165770555187, + "loss": 3.2323, + "step": 15644 + }, + { + "epoch": 1.4174084392199497, + "grad_norm": 0.7778760194778442, + "learning_rate": 0.00010551561650456111, + "loss": 2.1555, + "step": 15645 + }, + { + "epoch": 1.4174990373943965, + "grad_norm": 0.8886135220527649, + "learning_rate": 0.00010550957530357035, + "loss": 2.5679, + "step": 15646 + }, + { + "epoch": 1.4175896355688433, + "grad_norm": 0.7957186698913574, + "learning_rate": 0.00010550353410257961, + "loss": 1.9803, + "step": 15647 + }, + { + "epoch": 1.41768023374329, + "grad_norm": 0.9103161692619324, + "learning_rate": 0.00010549749290158883, + "loss": 2.4972, + "step": 15648 + }, + { + "epoch": 1.417770831917737, + "grad_norm": 0.8983567953109741, + "learning_rate": 0.00010549145170059809, + "loss": 2.6922, + "step": 15649 + }, + { + "epoch": 1.4178614300921837, + "grad_norm": 0.9462973475456238, + "learning_rate": 0.00010548541049960731, + "loss": 2.6581, + "step": 15650 + }, + { + "epoch": 1.4179520282666305, + "grad_norm": 0.907849907875061, + "learning_rate": 0.00010547936929861657, + "loss": 2.7955, + "step": 15651 + }, + { + "epoch": 1.4180426264410773, + "grad_norm": 0.9077268242835999, + "learning_rate": 0.00010547332809762582, + "loss": 2.7399, + "step": 15652 + }, + { + "epoch": 1.418133224615524, + "grad_norm": 0.8473414778709412, + "learning_rate": 0.00010546728689663506, + "loss": 2.4605, + "step": 15653 + }, + { + "epoch": 1.4182238227899708, + "grad_norm": 0.9644797444343567, + "learning_rate": 0.0001054612456956443, + "loss": 2.7693, + "step": 15654 + }, + { + "epoch": 1.4183144209644176, + "grad_norm": 0.8053792715072632, + "learning_rate": 0.00010545520449465354, + "loss": 1.9811, + "step": 15655 + }, + { + "epoch": 1.4184050191388644, + "grad_norm": 0.7282828688621521, + "learning_rate": 0.00010544916329366278, + "loss": 1.5002, + "step": 15656 + }, + { + "epoch": 1.4184956173133112, + "grad_norm": 0.870424211025238, + "learning_rate": 0.00010544312209267202, + "loss": 2.1671, + "step": 15657 + }, + { + "epoch": 1.418586215487758, + "grad_norm": 0.9924094676971436, + "learning_rate": 0.00010543708089168127, + "loss": 2.705, + "step": 15658 + }, + { + "epoch": 1.4186768136622048, + "grad_norm": 0.9073165059089661, + "learning_rate": 0.00010543103969069053, + "loss": 2.7566, + "step": 15659 + }, + { + "epoch": 1.4187674118366516, + "grad_norm": 0.8507643938064575, + "learning_rate": 0.00010542499848969976, + "loss": 2.6813, + "step": 15660 + }, + { + "epoch": 1.4188580100110983, + "grad_norm": 0.8490605354309082, + "learning_rate": 0.00010541895728870901, + "loss": 2.6701, + "step": 15661 + }, + { + "epoch": 1.4189486081855451, + "grad_norm": 0.913503885269165, + "learning_rate": 0.00010541291608771824, + "loss": 2.7934, + "step": 15662 + }, + { + "epoch": 1.419039206359992, + "grad_norm": 0.9201673269271851, + "learning_rate": 0.00010540687488672749, + "loss": 2.5688, + "step": 15663 + }, + { + "epoch": 1.4191298045344387, + "grad_norm": 0.9409085512161255, + "learning_rate": 0.00010540083368573672, + "loss": 2.7958, + "step": 15664 + }, + { + "epoch": 1.4192204027088855, + "grad_norm": 0.8911150693893433, + "learning_rate": 0.00010539479248474597, + "loss": 2.5733, + "step": 15665 + }, + { + "epoch": 1.419311000883332, + "grad_norm": 0.9466382265090942, + "learning_rate": 0.0001053887512837552, + "loss": 2.483, + "step": 15666 + }, + { + "epoch": 1.419401599057779, + "grad_norm": 0.8164169192314148, + "learning_rate": 0.00010538271008276445, + "loss": 2.1948, + "step": 15667 + }, + { + "epoch": 1.4194921972322256, + "grad_norm": 0.8584315776824951, + "learning_rate": 0.00010537666888177371, + "loss": 2.538, + "step": 15668 + }, + { + "epoch": 1.4195827954066726, + "grad_norm": 0.8871946930885315, + "learning_rate": 0.00010537062768078294, + "loss": 2.1105, + "step": 15669 + }, + { + "epoch": 1.4196733935811192, + "grad_norm": 0.8762960433959961, + "learning_rate": 0.0001053645864797922, + "loss": 2.7501, + "step": 15670 + }, + { + "epoch": 1.4197639917555662, + "grad_norm": 0.9298334121704102, + "learning_rate": 0.00010535854527880142, + "loss": 2.5007, + "step": 15671 + }, + { + "epoch": 1.4198545899300128, + "grad_norm": 0.8590381145477295, + "learning_rate": 0.00010535250407781068, + "loss": 2.6496, + "step": 15672 + }, + { + "epoch": 1.4199451881044598, + "grad_norm": 0.9576934576034546, + "learning_rate": 0.00010534646287681991, + "loss": 2.8118, + "step": 15673 + }, + { + "epoch": 1.4200357862789064, + "grad_norm": 0.7896827459335327, + "learning_rate": 0.00010534042167582916, + "loss": 2.4145, + "step": 15674 + }, + { + "epoch": 1.4201263844533534, + "grad_norm": 0.835326611995697, + "learning_rate": 0.00010533438047483841, + "loss": 2.6427, + "step": 15675 + }, + { + "epoch": 1.4202169826278, + "grad_norm": 0.9102540612220764, + "learning_rate": 0.00010532833927384764, + "loss": 2.6139, + "step": 15676 + }, + { + "epoch": 1.420307580802247, + "grad_norm": 0.9125969409942627, + "learning_rate": 0.00010532229807285689, + "loss": 2.588, + "step": 15677 + }, + { + "epoch": 1.4203981789766935, + "grad_norm": 0.7663774490356445, + "learning_rate": 0.00010531625687186612, + "loss": 1.8482, + "step": 15678 + }, + { + "epoch": 1.4204887771511405, + "grad_norm": 0.8986841440200806, + "learning_rate": 0.00010531021567087538, + "loss": 2.6207, + "step": 15679 + }, + { + "epoch": 1.420579375325587, + "grad_norm": 0.8655927181243896, + "learning_rate": 0.0001053041744698846, + "loss": 2.5215, + "step": 15680 + }, + { + "epoch": 1.420669973500034, + "grad_norm": 0.8392590880393982, + "learning_rate": 0.00010529813326889387, + "loss": 2.4797, + "step": 15681 + }, + { + "epoch": 1.4207605716744807, + "grad_norm": 0.8785133957862854, + "learning_rate": 0.00010529209206790311, + "loss": 2.8782, + "step": 15682 + }, + { + "epoch": 1.4208511698489277, + "grad_norm": 0.9153806567192078, + "learning_rate": 0.00010528605086691235, + "loss": 2.7627, + "step": 15683 + }, + { + "epoch": 1.4209417680233742, + "grad_norm": 0.8136012554168701, + "learning_rate": 0.0001052800096659216, + "loss": 2.1058, + "step": 15684 + }, + { + "epoch": 1.4210323661978213, + "grad_norm": 0.9216044545173645, + "learning_rate": 0.00010527396846493083, + "loss": 2.7262, + "step": 15685 + }, + { + "epoch": 1.4211229643722678, + "grad_norm": 0.8748034238815308, + "learning_rate": 0.00010526792726394008, + "loss": 2.5603, + "step": 15686 + }, + { + "epoch": 1.4212135625467146, + "grad_norm": 0.8709174394607544, + "learning_rate": 0.00010526188606294931, + "loss": 2.8382, + "step": 15687 + }, + { + "epoch": 1.4213041607211614, + "grad_norm": 0.9786860942840576, + "learning_rate": 0.00010525584486195856, + "loss": 2.7453, + "step": 15688 + }, + { + "epoch": 1.4213947588956082, + "grad_norm": 0.9286543726921082, + "learning_rate": 0.00010524980366096782, + "loss": 2.6534, + "step": 15689 + }, + { + "epoch": 1.421485357070055, + "grad_norm": 0.8529017567634583, + "learning_rate": 0.00010524376245997704, + "loss": 2.8192, + "step": 15690 + }, + { + "epoch": 1.4215759552445018, + "grad_norm": 0.9232750535011292, + "learning_rate": 0.0001052377212589863, + "loss": 2.9055, + "step": 15691 + }, + { + "epoch": 1.4216665534189485, + "grad_norm": 0.8700448870658875, + "learning_rate": 0.00010523168005799554, + "loss": 2.0271, + "step": 15692 + }, + { + "epoch": 1.4217571515933953, + "grad_norm": 0.9101629853248596, + "learning_rate": 0.00010522563885700478, + "loss": 2.9419, + "step": 15693 + }, + { + "epoch": 1.4218477497678421, + "grad_norm": 0.898749053478241, + "learning_rate": 0.00010521959765601402, + "loss": 2.7909, + "step": 15694 + }, + { + "epoch": 1.421938347942289, + "grad_norm": 1.0940462350845337, + "learning_rate": 0.00010521355645502326, + "loss": 2.4191, + "step": 15695 + }, + { + "epoch": 1.4220289461167357, + "grad_norm": 0.8907029628753662, + "learning_rate": 0.0001052075152540325, + "loss": 2.6864, + "step": 15696 + }, + { + "epoch": 1.4221195442911825, + "grad_norm": 1.0217232704162598, + "learning_rate": 0.00010520147405304175, + "loss": 2.8112, + "step": 15697 + }, + { + "epoch": 1.4222101424656293, + "grad_norm": 0.9608759880065918, + "learning_rate": 0.00010519543285205101, + "loss": 2.9154, + "step": 15698 + }, + { + "epoch": 1.422300740640076, + "grad_norm": 0.8985737562179565, + "learning_rate": 0.00010518939165106023, + "loss": 2.7085, + "step": 15699 + }, + { + "epoch": 1.4223913388145228, + "grad_norm": 0.9224170446395874, + "learning_rate": 0.00010518335045006949, + "loss": 2.7063, + "step": 15700 + }, + { + "epoch": 1.4224819369889696, + "grad_norm": 0.881087064743042, + "learning_rate": 0.00010517730924907871, + "loss": 2.5829, + "step": 15701 + }, + { + "epoch": 1.4225725351634164, + "grad_norm": 0.9487547874450684, + "learning_rate": 0.00010517126804808797, + "loss": 2.564, + "step": 15702 + }, + { + "epoch": 1.4226631333378632, + "grad_norm": 0.8751921653747559, + "learning_rate": 0.00010516522684709719, + "loss": 2.6947, + "step": 15703 + }, + { + "epoch": 1.42275373151231, + "grad_norm": 0.8518961071968079, + "learning_rate": 0.00010515918564610645, + "loss": 2.7863, + "step": 15704 + }, + { + "epoch": 1.4228443296867568, + "grad_norm": 0.8689746260643005, + "learning_rate": 0.0001051531444451157, + "loss": 2.532, + "step": 15705 + }, + { + "epoch": 1.4229349278612036, + "grad_norm": 0.8633487224578857, + "learning_rate": 0.00010514710324412493, + "loss": 2.8517, + "step": 15706 + }, + { + "epoch": 1.4230255260356504, + "grad_norm": 0.9291914701461792, + "learning_rate": 0.00010514106204313418, + "loss": 2.4688, + "step": 15707 + }, + { + "epoch": 1.4231161242100971, + "grad_norm": 1.0382589101791382, + "learning_rate": 0.00010513502084214342, + "loss": 2.7578, + "step": 15708 + }, + { + "epoch": 1.423206722384544, + "grad_norm": 0.9735532999038696, + "learning_rate": 0.00010512897964115266, + "loss": 2.8864, + "step": 15709 + }, + { + "epoch": 1.4232973205589907, + "grad_norm": 0.8106351494789124, + "learning_rate": 0.0001051229384401619, + "loss": 2.105, + "step": 15710 + }, + { + "epoch": 1.4233879187334375, + "grad_norm": 1.0194026231765747, + "learning_rate": 0.00010511689723917116, + "loss": 2.6912, + "step": 15711 + }, + { + "epoch": 1.4234785169078843, + "grad_norm": 0.8670504093170166, + "learning_rate": 0.0001051108560381804, + "loss": 2.8199, + "step": 15712 + }, + { + "epoch": 1.423569115082331, + "grad_norm": 0.8965749740600586, + "learning_rate": 0.00010510481483718964, + "loss": 2.7599, + "step": 15713 + }, + { + "epoch": 1.4236597132567779, + "grad_norm": 0.9207585453987122, + "learning_rate": 0.00010509877363619889, + "loss": 2.7774, + "step": 15714 + }, + { + "epoch": 1.4237503114312247, + "grad_norm": 0.9132574200630188, + "learning_rate": 0.00010509273243520812, + "loss": 2.7456, + "step": 15715 + }, + { + "epoch": 1.4238409096056714, + "grad_norm": 0.9924381375312805, + "learning_rate": 0.00010508669123421737, + "loss": 2.9426, + "step": 15716 + }, + { + "epoch": 1.4239315077801182, + "grad_norm": 0.8927339911460876, + "learning_rate": 0.0001050806500332266, + "loss": 2.7482, + "step": 15717 + }, + { + "epoch": 1.424022105954565, + "grad_norm": 0.8797442317008972, + "learning_rate": 0.00010507460883223585, + "loss": 2.585, + "step": 15718 + }, + { + "epoch": 1.4241127041290118, + "grad_norm": 0.8960539102554321, + "learning_rate": 0.00010506856763124511, + "loss": 2.4933, + "step": 15719 + }, + { + "epoch": 1.4242033023034586, + "grad_norm": 0.95100337266922, + "learning_rate": 0.00010506252643025433, + "loss": 2.7028, + "step": 15720 + }, + { + "epoch": 1.4242939004779054, + "grad_norm": 0.7929642200469971, + "learning_rate": 0.0001050564852292636, + "loss": 2.1145, + "step": 15721 + }, + { + "epoch": 1.4243844986523522, + "grad_norm": 0.8995538949966431, + "learning_rate": 0.00010505044402827281, + "loss": 2.7742, + "step": 15722 + }, + { + "epoch": 1.424475096826799, + "grad_norm": 0.9456876516342163, + "learning_rate": 0.00010504440282728208, + "loss": 2.7485, + "step": 15723 + }, + { + "epoch": 1.4245656950012457, + "grad_norm": 0.9010539650917053, + "learning_rate": 0.00010503836162629131, + "loss": 2.4266, + "step": 15724 + }, + { + "epoch": 1.4246562931756925, + "grad_norm": 0.9667198657989502, + "learning_rate": 0.00010503232042530056, + "loss": 2.676, + "step": 15725 + }, + { + "epoch": 1.4247468913501393, + "grad_norm": 0.8934321999549866, + "learning_rate": 0.00010502627922430979, + "loss": 2.631, + "step": 15726 + }, + { + "epoch": 1.424837489524586, + "grad_norm": 0.8414550423622131, + "learning_rate": 0.00010502023802331904, + "loss": 2.1031, + "step": 15727 + }, + { + "epoch": 1.424928087699033, + "grad_norm": 0.9177693128585815, + "learning_rate": 0.00010501419682232829, + "loss": 2.5947, + "step": 15728 + }, + { + "epoch": 1.4250186858734797, + "grad_norm": 0.8801243901252747, + "learning_rate": 0.00010500815562133752, + "loss": 2.5647, + "step": 15729 + }, + { + "epoch": 1.4251092840479265, + "grad_norm": 0.9086279273033142, + "learning_rate": 0.00010500211442034678, + "loss": 2.4652, + "step": 15730 + }, + { + "epoch": 1.4251998822223733, + "grad_norm": 0.9439216256141663, + "learning_rate": 0.000104996073219356, + "loss": 2.7867, + "step": 15731 + }, + { + "epoch": 1.42529048039682, + "grad_norm": 1.0078365802764893, + "learning_rate": 0.00010499003201836526, + "loss": 2.6902, + "step": 15732 + }, + { + "epoch": 1.4253810785712668, + "grad_norm": 0.9239680171012878, + "learning_rate": 0.00010498399081737448, + "loss": 2.6429, + "step": 15733 + }, + { + "epoch": 1.4254716767457136, + "grad_norm": 0.9102606177330017, + "learning_rate": 0.00010497794961638375, + "loss": 2.4489, + "step": 15734 + }, + { + "epoch": 1.4255622749201604, + "grad_norm": 0.8474092483520508, + "learning_rate": 0.00010497190841539299, + "loss": 2.6397, + "step": 15735 + }, + { + "epoch": 1.4256528730946072, + "grad_norm": 0.9156891703605652, + "learning_rate": 0.00010496586721440223, + "loss": 2.9878, + "step": 15736 + }, + { + "epoch": 1.425743471269054, + "grad_norm": 0.9178515672683716, + "learning_rate": 0.00010495982601341147, + "loss": 2.6405, + "step": 15737 + }, + { + "epoch": 1.4258340694435008, + "grad_norm": 0.8741840720176697, + "learning_rate": 0.00010495378481242071, + "loss": 2.7227, + "step": 15738 + }, + { + "epoch": 1.4259246676179476, + "grad_norm": 1.086130142211914, + "learning_rate": 0.00010494774361142996, + "loss": 2.6155, + "step": 15739 + }, + { + "epoch": 1.4260152657923943, + "grad_norm": 0.9601522088050842, + "learning_rate": 0.00010494170241043919, + "loss": 2.6648, + "step": 15740 + }, + { + "epoch": 1.4261058639668411, + "grad_norm": 0.8516501784324646, + "learning_rate": 0.00010493566120944844, + "loss": 2.8151, + "step": 15741 + }, + { + "epoch": 1.426196462141288, + "grad_norm": 1.0740405321121216, + "learning_rate": 0.0001049296200084577, + "loss": 2.7758, + "step": 15742 + }, + { + "epoch": 1.4262870603157347, + "grad_norm": 0.9362530708312988, + "learning_rate": 0.00010492357880746693, + "loss": 2.808, + "step": 15743 + }, + { + "epoch": 1.4263776584901815, + "grad_norm": 0.9166111946105957, + "learning_rate": 0.00010491753760647618, + "loss": 2.6178, + "step": 15744 + }, + { + "epoch": 1.4264682566646283, + "grad_norm": 0.8932455778121948, + "learning_rate": 0.00010491149640548541, + "loss": 2.7227, + "step": 15745 + }, + { + "epoch": 1.426558854839075, + "grad_norm": 0.85554438829422, + "learning_rate": 0.00010490545520449466, + "loss": 2.4728, + "step": 15746 + }, + { + "epoch": 1.4266494530135216, + "grad_norm": 0.9344154000282288, + "learning_rate": 0.0001048994140035039, + "loss": 3.1175, + "step": 15747 + }, + { + "epoch": 1.4267400511879686, + "grad_norm": 0.9741591811180115, + "learning_rate": 0.00010489337280251314, + "loss": 2.4076, + "step": 15748 + }, + { + "epoch": 1.4268306493624152, + "grad_norm": 0.9334582090377808, + "learning_rate": 0.0001048873316015224, + "loss": 2.7832, + "step": 15749 + }, + { + "epoch": 1.4269212475368622, + "grad_norm": 0.9674261808395386, + "learning_rate": 0.00010488129040053163, + "loss": 2.9809, + "step": 15750 + }, + { + "epoch": 1.4270118457113088, + "grad_norm": 0.9211977124214172, + "learning_rate": 0.00010487524919954089, + "loss": 2.9479, + "step": 15751 + }, + { + "epoch": 1.4271024438857558, + "grad_norm": 0.8517081141471863, + "learning_rate": 0.00010486920799855011, + "loss": 2.6255, + "step": 15752 + }, + { + "epoch": 1.4271930420602024, + "grad_norm": 0.8997851014137268, + "learning_rate": 0.00010486316679755937, + "loss": 2.7601, + "step": 15753 + }, + { + "epoch": 1.4272836402346494, + "grad_norm": 0.8575140833854675, + "learning_rate": 0.00010485712559656859, + "loss": 2.6763, + "step": 15754 + }, + { + "epoch": 1.427374238409096, + "grad_norm": 0.8344035744667053, + "learning_rate": 0.00010485108439557785, + "loss": 2.6566, + "step": 15755 + }, + { + "epoch": 1.427464836583543, + "grad_norm": 0.9869785904884338, + "learning_rate": 0.00010484504319458708, + "loss": 2.5895, + "step": 15756 + }, + { + "epoch": 1.4275554347579895, + "grad_norm": 0.844226062297821, + "learning_rate": 0.00010483900199359633, + "loss": 2.7443, + "step": 15757 + }, + { + "epoch": 1.4276460329324365, + "grad_norm": 0.8922265768051147, + "learning_rate": 0.00010483296079260558, + "loss": 2.8631, + "step": 15758 + }, + { + "epoch": 1.427736631106883, + "grad_norm": 0.8637226223945618, + "learning_rate": 0.00010482691959161481, + "loss": 2.5888, + "step": 15759 + }, + { + "epoch": 1.42782722928133, + "grad_norm": 0.933573305606842, + "learning_rate": 0.00010482087839062406, + "loss": 2.8427, + "step": 15760 + }, + { + "epoch": 1.4279178274557767, + "grad_norm": 0.9257960319519043, + "learning_rate": 0.0001048148371896333, + "loss": 2.7235, + "step": 15761 + }, + { + "epoch": 1.4280084256302237, + "grad_norm": 0.8712658286094666, + "learning_rate": 0.00010480879598864256, + "loss": 2.9013, + "step": 15762 + }, + { + "epoch": 1.4280990238046702, + "grad_norm": 0.8775187134742737, + "learning_rate": 0.00010480275478765178, + "loss": 2.9874, + "step": 15763 + }, + { + "epoch": 1.4281896219791173, + "grad_norm": 0.8794278502464294, + "learning_rate": 0.00010479671358666104, + "loss": 2.776, + "step": 15764 + }, + { + "epoch": 1.4282802201535638, + "grad_norm": 0.8458172678947449, + "learning_rate": 0.00010479067238567029, + "loss": 2.6376, + "step": 15765 + }, + { + "epoch": 1.4283708183280108, + "grad_norm": 0.8910616636276245, + "learning_rate": 0.00010478463118467952, + "loss": 2.551, + "step": 15766 + }, + { + "epoch": 1.4284614165024574, + "grad_norm": 0.7971237897872925, + "learning_rate": 0.00010477858998368877, + "loss": 2.011, + "step": 15767 + }, + { + "epoch": 1.4285520146769042, + "grad_norm": 0.8529636263847351, + "learning_rate": 0.000104772548782698, + "loss": 2.4535, + "step": 15768 + }, + { + "epoch": 1.428642612851351, + "grad_norm": 0.9299589395523071, + "learning_rate": 0.00010476650758170725, + "loss": 2.854, + "step": 15769 + }, + { + "epoch": 1.4287332110257978, + "grad_norm": 0.8755809664726257, + "learning_rate": 0.00010476046638071648, + "loss": 2.6217, + "step": 15770 + }, + { + "epoch": 1.4288238092002445, + "grad_norm": 0.9814316034317017, + "learning_rate": 0.00010475442517972573, + "loss": 2.7665, + "step": 15771 + }, + { + "epoch": 1.4289144073746913, + "grad_norm": 0.9068488478660583, + "learning_rate": 0.00010474838397873499, + "loss": 3.0443, + "step": 15772 + }, + { + "epoch": 1.4290050055491381, + "grad_norm": 0.9460812211036682, + "learning_rate": 0.00010474234277774421, + "loss": 2.6473, + "step": 15773 + }, + { + "epoch": 1.429095603723585, + "grad_norm": 0.9479021430015564, + "learning_rate": 0.00010473630157675347, + "loss": 2.6071, + "step": 15774 + }, + { + "epoch": 1.4291862018980317, + "grad_norm": 0.9434310793876648, + "learning_rate": 0.00010473026037576271, + "loss": 2.7441, + "step": 15775 + }, + { + "epoch": 1.4292768000724785, + "grad_norm": 0.9296923279762268, + "learning_rate": 0.00010472421917477195, + "loss": 2.6407, + "step": 15776 + }, + { + "epoch": 1.4293673982469253, + "grad_norm": 0.888063371181488, + "learning_rate": 0.00010471817797378119, + "loss": 2.5753, + "step": 15777 + }, + { + "epoch": 1.429457996421372, + "grad_norm": 0.9684174060821533, + "learning_rate": 0.00010471213677279044, + "loss": 2.932, + "step": 15778 + }, + { + "epoch": 1.4295485945958188, + "grad_norm": 0.9245384335517883, + "learning_rate": 0.00010470609557179968, + "loss": 2.7535, + "step": 15779 + }, + { + "epoch": 1.4296391927702656, + "grad_norm": 0.9019595980644226, + "learning_rate": 0.00010470005437080892, + "loss": 2.5627, + "step": 15780 + }, + { + "epoch": 1.4297297909447124, + "grad_norm": 0.8582555651664734, + "learning_rate": 0.00010469401316981817, + "loss": 2.7527, + "step": 15781 + }, + { + "epoch": 1.4298203891191592, + "grad_norm": 0.8619992136955261, + "learning_rate": 0.0001046879719688274, + "loss": 2.8933, + "step": 15782 + }, + { + "epoch": 1.429910987293606, + "grad_norm": 0.9504382610321045, + "learning_rate": 0.00010468193076783666, + "loss": 2.5391, + "step": 15783 + }, + { + "epoch": 1.4300015854680528, + "grad_norm": 0.9201132655143738, + "learning_rate": 0.00010467588956684588, + "loss": 2.8126, + "step": 15784 + }, + { + "epoch": 1.4300921836424996, + "grad_norm": 0.9824628829956055, + "learning_rate": 0.00010466984836585514, + "loss": 2.9969, + "step": 15785 + }, + { + "epoch": 1.4301827818169464, + "grad_norm": 0.9029931426048279, + "learning_rate": 0.00010466380716486436, + "loss": 2.4157, + "step": 15786 + }, + { + "epoch": 1.4302733799913931, + "grad_norm": 0.8273648023605347, + "learning_rate": 0.00010465776596387362, + "loss": 2.0477, + "step": 15787 + }, + { + "epoch": 1.43036397816584, + "grad_norm": 0.9520987272262573, + "learning_rate": 0.00010465172476288287, + "loss": 2.6016, + "step": 15788 + }, + { + "epoch": 1.4304545763402867, + "grad_norm": 0.9118772149085999, + "learning_rate": 0.0001046456835618921, + "loss": 2.8567, + "step": 15789 + }, + { + "epoch": 1.4305451745147335, + "grad_norm": 0.9022212624549866, + "learning_rate": 0.00010463964236090135, + "loss": 2.6148, + "step": 15790 + }, + { + "epoch": 1.4306357726891803, + "grad_norm": 0.9126179218292236, + "learning_rate": 0.00010463360115991059, + "loss": 2.7574, + "step": 15791 + }, + { + "epoch": 1.430726370863627, + "grad_norm": 0.8577129244804382, + "learning_rate": 0.00010462755995891984, + "loss": 2.8007, + "step": 15792 + }, + { + "epoch": 1.4308169690380739, + "grad_norm": 0.7804527878761292, + "learning_rate": 0.00010462151875792907, + "loss": 1.9582, + "step": 15793 + }, + { + "epoch": 1.4309075672125207, + "grad_norm": 0.8644417524337769, + "learning_rate": 0.00010461547755693833, + "loss": 2.8663, + "step": 15794 + }, + { + "epoch": 1.4309981653869674, + "grad_norm": 1.0126395225524902, + "learning_rate": 0.00010460943635594758, + "loss": 2.8378, + "step": 15795 + }, + { + "epoch": 1.4310887635614142, + "grad_norm": 0.8498242497444153, + "learning_rate": 0.00010460339515495681, + "loss": 2.7666, + "step": 15796 + }, + { + "epoch": 1.431179361735861, + "grad_norm": 0.8623649477958679, + "learning_rate": 0.00010459735395396606, + "loss": 2.6321, + "step": 15797 + }, + { + "epoch": 1.4312699599103078, + "grad_norm": 0.8328481316566467, + "learning_rate": 0.0001045913127529753, + "loss": 2.8345, + "step": 15798 + }, + { + "epoch": 1.4313605580847546, + "grad_norm": 0.99293053150177, + "learning_rate": 0.00010458527155198454, + "loss": 2.5082, + "step": 15799 + }, + { + "epoch": 1.4314511562592014, + "grad_norm": 0.9259225130081177, + "learning_rate": 0.00010457923035099378, + "loss": 2.8245, + "step": 15800 + }, + { + "epoch": 1.4315417544336482, + "grad_norm": 0.9474362730979919, + "learning_rate": 0.00010457318915000302, + "loss": 2.8107, + "step": 15801 + }, + { + "epoch": 1.431632352608095, + "grad_norm": 0.8992394208908081, + "learning_rate": 0.00010456714794901228, + "loss": 2.5886, + "step": 15802 + }, + { + "epoch": 1.4317229507825417, + "grad_norm": 0.9438349604606628, + "learning_rate": 0.0001045611067480215, + "loss": 3.0068, + "step": 15803 + }, + { + "epoch": 1.4318135489569885, + "grad_norm": 0.9275969862937927, + "learning_rate": 0.00010455506554703077, + "loss": 2.8485, + "step": 15804 + }, + { + "epoch": 1.4319041471314353, + "grad_norm": 0.9617617130279541, + "learning_rate": 0.00010454902434603999, + "loss": 2.8184, + "step": 15805 + }, + { + "epoch": 1.431994745305882, + "grad_norm": 0.9338918924331665, + "learning_rate": 0.00010454298314504925, + "loss": 2.8136, + "step": 15806 + }, + { + "epoch": 1.432085343480329, + "grad_norm": 0.9020053148269653, + "learning_rate": 0.00010453694194405848, + "loss": 2.8848, + "step": 15807 + }, + { + "epoch": 1.4321759416547757, + "grad_norm": 0.8585301637649536, + "learning_rate": 0.00010453090074306773, + "loss": 2.7219, + "step": 15808 + }, + { + "epoch": 1.4322665398292225, + "grad_norm": 0.8658254146575928, + "learning_rate": 0.00010452485954207698, + "loss": 2.9688, + "step": 15809 + }, + { + "epoch": 1.4323571380036693, + "grad_norm": 0.9140102863311768, + "learning_rate": 0.00010451881834108621, + "loss": 2.7349, + "step": 15810 + }, + { + "epoch": 1.432447736178116, + "grad_norm": 0.9299371838569641, + "learning_rate": 0.00010451277714009546, + "loss": 2.7313, + "step": 15811 + }, + { + "epoch": 1.4325383343525628, + "grad_norm": 0.8588709831237793, + "learning_rate": 0.00010450673593910469, + "loss": 2.6099, + "step": 15812 + }, + { + "epoch": 1.4326289325270096, + "grad_norm": 0.8512428998947144, + "learning_rate": 0.00010450069473811394, + "loss": 2.6186, + "step": 15813 + }, + { + "epoch": 1.4327195307014564, + "grad_norm": 0.8362672328948975, + "learning_rate": 0.00010449465353712317, + "loss": 2.5857, + "step": 15814 + }, + { + "epoch": 1.4328101288759032, + "grad_norm": 0.8805187344551086, + "learning_rate": 0.00010448861233613244, + "loss": 2.5946, + "step": 15815 + }, + { + "epoch": 1.43290072705035, + "grad_norm": 0.9102354049682617, + "learning_rate": 0.00010448257113514166, + "loss": 2.4868, + "step": 15816 + }, + { + "epoch": 1.4329913252247968, + "grad_norm": 0.9186050295829773, + "learning_rate": 0.00010447652993415092, + "loss": 2.7857, + "step": 15817 + }, + { + "epoch": 1.4330819233992436, + "grad_norm": 0.8752469420433044, + "learning_rate": 0.00010447048873316016, + "loss": 2.4533, + "step": 15818 + }, + { + "epoch": 1.4331725215736904, + "grad_norm": 0.9525467157363892, + "learning_rate": 0.0001044644475321694, + "loss": 2.7424, + "step": 15819 + }, + { + "epoch": 1.4332631197481371, + "grad_norm": 0.9672656655311584, + "learning_rate": 0.00010445840633117865, + "loss": 2.5278, + "step": 15820 + }, + { + "epoch": 1.433353717922584, + "grad_norm": 0.8757354617118835, + "learning_rate": 0.00010445236513018788, + "loss": 2.7291, + "step": 15821 + }, + { + "epoch": 1.4334443160970307, + "grad_norm": 0.9535986185073853, + "learning_rate": 0.00010444632392919713, + "loss": 2.5831, + "step": 15822 + }, + { + "epoch": 1.4335349142714775, + "grad_norm": 0.8975239396095276, + "learning_rate": 0.00010444028272820636, + "loss": 2.6748, + "step": 15823 + }, + { + "epoch": 1.4336255124459243, + "grad_norm": 0.8787752985954285, + "learning_rate": 0.00010443424152721561, + "loss": 2.4221, + "step": 15824 + }, + { + "epoch": 1.433716110620371, + "grad_norm": 0.6135916709899902, + "learning_rate": 0.00010442820032622487, + "loss": 1.4876, + "step": 15825 + }, + { + "epoch": 1.4338067087948179, + "grad_norm": 0.9300597310066223, + "learning_rate": 0.00010442215912523409, + "loss": 2.7284, + "step": 15826 + }, + { + "epoch": 1.4338973069692647, + "grad_norm": 0.7023464441299438, + "learning_rate": 0.00010441611792424335, + "loss": 1.4849, + "step": 15827 + }, + { + "epoch": 1.4339879051437112, + "grad_norm": 0.8531951308250427, + "learning_rate": 0.00010441007672325259, + "loss": 2.7255, + "step": 15828 + }, + { + "epoch": 1.4340785033181582, + "grad_norm": 0.9630823731422424, + "learning_rate": 0.00010440403552226183, + "loss": 2.6502, + "step": 15829 + }, + { + "epoch": 1.4341691014926048, + "grad_norm": 0.8978110551834106, + "learning_rate": 0.00010439799432127107, + "loss": 2.6538, + "step": 15830 + }, + { + "epoch": 1.4342596996670518, + "grad_norm": 0.934942364692688, + "learning_rate": 0.00010439195312028032, + "loss": 3.0367, + "step": 15831 + }, + { + "epoch": 1.4343502978414984, + "grad_norm": 0.9035627245903015, + "learning_rate": 0.00010438591191928956, + "loss": 2.3587, + "step": 15832 + }, + { + "epoch": 1.4344408960159454, + "grad_norm": 0.9420019388198853, + "learning_rate": 0.0001043798707182988, + "loss": 3.0338, + "step": 15833 + }, + { + "epoch": 1.434531494190392, + "grad_norm": 0.8866280913352966, + "learning_rate": 0.00010437382951730806, + "loss": 2.4927, + "step": 15834 + }, + { + "epoch": 1.434622092364839, + "grad_norm": 0.7652139663696289, + "learning_rate": 0.00010436778831631728, + "loss": 2.0772, + "step": 15835 + }, + { + "epoch": 1.4347126905392855, + "grad_norm": 0.9065811038017273, + "learning_rate": 0.00010436174711532654, + "loss": 2.8775, + "step": 15836 + }, + { + "epoch": 1.4348032887137325, + "grad_norm": 0.940147340297699, + "learning_rate": 0.00010435570591433576, + "loss": 2.5115, + "step": 15837 + }, + { + "epoch": 1.434893886888179, + "grad_norm": 0.9147688150405884, + "learning_rate": 0.00010434966471334502, + "loss": 2.7722, + "step": 15838 + }, + { + "epoch": 1.434984485062626, + "grad_norm": 0.8565689325332642, + "learning_rate": 0.00010434362351235427, + "loss": 2.6567, + "step": 15839 + }, + { + "epoch": 1.4350750832370727, + "grad_norm": 0.886617124080658, + "learning_rate": 0.0001043375823113635, + "loss": 2.5631, + "step": 15840 + }, + { + "epoch": 1.4351656814115197, + "grad_norm": 0.9392498135566711, + "learning_rate": 0.00010433154111037275, + "loss": 2.6032, + "step": 15841 + }, + { + "epoch": 1.4352562795859662, + "grad_norm": 0.8575042486190796, + "learning_rate": 0.00010432549990938199, + "loss": 2.7944, + "step": 15842 + }, + { + "epoch": 1.4353468777604133, + "grad_norm": 0.9141993522644043, + "learning_rate": 0.00010431945870839123, + "loss": 2.6992, + "step": 15843 + }, + { + "epoch": 1.4354374759348598, + "grad_norm": 0.9363918304443359, + "learning_rate": 0.00010431341750740047, + "loss": 2.8455, + "step": 15844 + }, + { + "epoch": 1.4355280741093068, + "grad_norm": 0.9514438509941101, + "learning_rate": 0.00010430737630640971, + "loss": 2.7072, + "step": 15845 + }, + { + "epoch": 1.4356186722837534, + "grad_norm": 0.9259507656097412, + "learning_rate": 0.00010430133510541895, + "loss": 2.9163, + "step": 15846 + }, + { + "epoch": 1.4357092704582004, + "grad_norm": 0.8967195749282837, + "learning_rate": 0.00010429529390442821, + "loss": 2.73, + "step": 15847 + }, + { + "epoch": 1.435799868632647, + "grad_norm": 0.9836736917495728, + "learning_rate": 0.00010428925270343746, + "loss": 2.7981, + "step": 15848 + }, + { + "epoch": 1.4358904668070938, + "grad_norm": 0.8769347071647644, + "learning_rate": 0.00010428321150244669, + "loss": 2.6563, + "step": 15849 + }, + { + "epoch": 1.4359810649815405, + "grad_norm": 0.8858426213264465, + "learning_rate": 0.00010427717030145594, + "loss": 2.5926, + "step": 15850 + }, + { + "epoch": 1.4360716631559873, + "grad_norm": 0.9798609614372253, + "learning_rate": 0.00010427112910046517, + "loss": 2.6286, + "step": 15851 + }, + { + "epoch": 1.4361622613304341, + "grad_norm": 0.9046863913536072, + "learning_rate": 0.00010426508789947442, + "loss": 2.6718, + "step": 15852 + }, + { + "epoch": 1.436252859504881, + "grad_norm": 0.8748306632041931, + "learning_rate": 0.00010425904669848365, + "loss": 2.7579, + "step": 15853 + }, + { + "epoch": 1.4363434576793277, + "grad_norm": 0.588259220123291, + "learning_rate": 0.0001042530054974929, + "loss": 1.2872, + "step": 15854 + }, + { + "epoch": 1.4364340558537745, + "grad_norm": 0.8823748826980591, + "learning_rate": 0.00010424696429650216, + "loss": 2.6414, + "step": 15855 + }, + { + "epoch": 1.4365246540282213, + "grad_norm": 0.8934335708618164, + "learning_rate": 0.00010424092309551138, + "loss": 2.6728, + "step": 15856 + }, + { + "epoch": 1.436615252202668, + "grad_norm": 0.8535973429679871, + "learning_rate": 0.00010423488189452065, + "loss": 2.483, + "step": 15857 + }, + { + "epoch": 1.4367058503771148, + "grad_norm": 0.8996016383171082, + "learning_rate": 0.00010422884069352987, + "loss": 2.5243, + "step": 15858 + }, + { + "epoch": 1.4367964485515616, + "grad_norm": 0.9314201474189758, + "learning_rate": 0.00010422279949253913, + "loss": 2.659, + "step": 15859 + }, + { + "epoch": 1.4368870467260084, + "grad_norm": 0.8864303827285767, + "learning_rate": 0.00010421675829154836, + "loss": 2.6698, + "step": 15860 + }, + { + "epoch": 1.4369776449004552, + "grad_norm": 0.7332324385643005, + "learning_rate": 0.00010421071709055761, + "loss": 2.134, + "step": 15861 + }, + { + "epoch": 1.437068243074902, + "grad_norm": 0.8523163199424744, + "learning_rate": 0.00010420467588956686, + "loss": 2.4713, + "step": 15862 + }, + { + "epoch": 1.4371588412493488, + "grad_norm": 0.8555571436882019, + "learning_rate": 0.00010419863468857609, + "loss": 2.622, + "step": 15863 + }, + { + "epoch": 1.4372494394237956, + "grad_norm": 0.8858376145362854, + "learning_rate": 0.00010419259348758534, + "loss": 2.8068, + "step": 15864 + }, + { + "epoch": 1.4373400375982424, + "grad_norm": 0.8959872722625732, + "learning_rate": 0.00010418655228659457, + "loss": 2.6367, + "step": 15865 + }, + { + "epoch": 1.4374306357726891, + "grad_norm": 0.8896713256835938, + "learning_rate": 0.00010418051108560383, + "loss": 2.5539, + "step": 15866 + }, + { + "epoch": 1.437521233947136, + "grad_norm": 0.8868478536605835, + "learning_rate": 0.00010417446988461305, + "loss": 2.7982, + "step": 15867 + }, + { + "epoch": 1.4376118321215827, + "grad_norm": 1.0022470951080322, + "learning_rate": 0.00010416842868362231, + "loss": 2.5365, + "step": 15868 + }, + { + "epoch": 1.4377024302960295, + "grad_norm": 0.9749160408973694, + "learning_rate": 0.00010416238748263156, + "loss": 2.6218, + "step": 15869 + }, + { + "epoch": 1.4377930284704763, + "grad_norm": 0.8645715117454529, + "learning_rate": 0.0001041563462816408, + "loss": 2.6327, + "step": 15870 + }, + { + "epoch": 1.437883626644923, + "grad_norm": 0.9191557168960571, + "learning_rate": 0.00010415030508065004, + "loss": 2.5312, + "step": 15871 + }, + { + "epoch": 1.4379742248193699, + "grad_norm": 0.893951416015625, + "learning_rate": 0.00010414426387965928, + "loss": 2.7874, + "step": 15872 + }, + { + "epoch": 1.4380648229938167, + "grad_norm": 0.9094374179840088, + "learning_rate": 0.00010413822267866853, + "loss": 2.6963, + "step": 15873 + }, + { + "epoch": 1.4381554211682634, + "grad_norm": 0.7939246892929077, + "learning_rate": 0.00010413218147767776, + "loss": 1.9903, + "step": 15874 + }, + { + "epoch": 1.4382460193427102, + "grad_norm": 0.7912826538085938, + "learning_rate": 0.00010412614027668701, + "loss": 2.0563, + "step": 15875 + }, + { + "epoch": 1.438336617517157, + "grad_norm": 0.8620092272758484, + "learning_rate": 0.00010412009907569624, + "loss": 2.5997, + "step": 15876 + }, + { + "epoch": 1.4384272156916038, + "grad_norm": 0.9852482676506042, + "learning_rate": 0.00010411405787470549, + "loss": 2.7489, + "step": 15877 + }, + { + "epoch": 1.4385178138660506, + "grad_norm": 0.9384878277778625, + "learning_rate": 0.00010410801667371475, + "loss": 2.6854, + "step": 15878 + }, + { + "epoch": 1.4386084120404974, + "grad_norm": 0.9381676912307739, + "learning_rate": 0.00010410197547272398, + "loss": 2.83, + "step": 15879 + }, + { + "epoch": 1.4386990102149442, + "grad_norm": 0.8637872934341431, + "learning_rate": 0.00010409593427173323, + "loss": 2.8514, + "step": 15880 + }, + { + "epoch": 1.438789608389391, + "grad_norm": 0.9107564687728882, + "learning_rate": 0.00010408989307074247, + "loss": 2.7243, + "step": 15881 + }, + { + "epoch": 1.4388802065638377, + "grad_norm": 0.8943136930465698, + "learning_rate": 0.00010408385186975171, + "loss": 2.6688, + "step": 15882 + }, + { + "epoch": 1.4389708047382845, + "grad_norm": 0.8929430842399597, + "learning_rate": 0.00010407781066876095, + "loss": 2.6229, + "step": 15883 + }, + { + "epoch": 1.4390614029127313, + "grad_norm": 0.9657732248306274, + "learning_rate": 0.0001040717694677702, + "loss": 2.3257, + "step": 15884 + }, + { + "epoch": 1.4391520010871781, + "grad_norm": 0.8849520087242126, + "learning_rate": 0.00010406572826677946, + "loss": 2.6342, + "step": 15885 + }, + { + "epoch": 1.439242599261625, + "grad_norm": 0.9251413941383362, + "learning_rate": 0.00010405968706578868, + "loss": 2.5638, + "step": 15886 + }, + { + "epoch": 1.4393331974360717, + "grad_norm": 0.7937556505203247, + "learning_rate": 0.00010405364586479794, + "loss": 2.2332, + "step": 15887 + }, + { + "epoch": 1.4394237956105185, + "grad_norm": 0.7814804315567017, + "learning_rate": 0.00010404760466380716, + "loss": 2.2883, + "step": 15888 + }, + { + "epoch": 1.4395143937849653, + "grad_norm": 0.9404162168502808, + "learning_rate": 0.00010404156346281642, + "loss": 3.1, + "step": 15889 + }, + { + "epoch": 1.439604991959412, + "grad_norm": 0.5956668257713318, + "learning_rate": 0.00010403552226182564, + "loss": 1.334, + "step": 15890 + }, + { + "epoch": 1.4396955901338588, + "grad_norm": 0.9715930819511414, + "learning_rate": 0.0001040294810608349, + "loss": 2.5562, + "step": 15891 + }, + { + "epoch": 1.4397861883083056, + "grad_norm": 0.8905881643295288, + "learning_rate": 0.00010402343985984415, + "loss": 2.7779, + "step": 15892 + }, + { + "epoch": 1.4398767864827524, + "grad_norm": 0.8999216556549072, + "learning_rate": 0.00010401739865885338, + "loss": 2.8453, + "step": 15893 + }, + { + "epoch": 1.4399673846571992, + "grad_norm": 1.0385087728500366, + "learning_rate": 0.00010401135745786263, + "loss": 2.8345, + "step": 15894 + }, + { + "epoch": 1.440057982831646, + "grad_norm": 0.8837065696716309, + "learning_rate": 0.00010400531625687186, + "loss": 2.7556, + "step": 15895 + }, + { + "epoch": 1.4401485810060928, + "grad_norm": 0.8554170727729797, + "learning_rate": 0.00010399927505588111, + "loss": 2.5688, + "step": 15896 + }, + { + "epoch": 1.4402391791805396, + "grad_norm": 0.8931684494018555, + "learning_rate": 0.00010399323385489035, + "loss": 2.5626, + "step": 15897 + }, + { + "epoch": 1.4403297773549864, + "grad_norm": 0.968029797077179, + "learning_rate": 0.00010398719265389961, + "loss": 2.9476, + "step": 15898 + }, + { + "epoch": 1.4404203755294331, + "grad_norm": 0.919249415397644, + "learning_rate": 0.00010398115145290885, + "loss": 2.6278, + "step": 15899 + }, + { + "epoch": 1.44051097370388, + "grad_norm": 0.9069905281066895, + "learning_rate": 0.00010397511025191809, + "loss": 2.8515, + "step": 15900 + }, + { + "epoch": 1.4406015718783267, + "grad_norm": 0.6631118655204773, + "learning_rate": 0.00010396906905092734, + "loss": 1.4806, + "step": 15901 + }, + { + "epoch": 1.4406921700527735, + "grad_norm": 0.966960072517395, + "learning_rate": 0.00010396302784993657, + "loss": 2.962, + "step": 15902 + }, + { + "epoch": 1.4407827682272203, + "grad_norm": 0.9522446990013123, + "learning_rate": 0.00010395698664894582, + "loss": 2.4643, + "step": 15903 + }, + { + "epoch": 1.440873366401667, + "grad_norm": 1.0885016918182373, + "learning_rate": 0.00010395094544795505, + "loss": 2.8329, + "step": 15904 + }, + { + "epoch": 1.4409639645761139, + "grad_norm": 0.8696107864379883, + "learning_rate": 0.0001039449042469643, + "loss": 2.657, + "step": 15905 + }, + { + "epoch": 1.4410545627505607, + "grad_norm": 0.8895121812820435, + "learning_rate": 0.00010393886304597353, + "loss": 2.6618, + "step": 15906 + }, + { + "epoch": 1.4411451609250074, + "grad_norm": 0.9714156985282898, + "learning_rate": 0.00010393282184498278, + "loss": 2.7584, + "step": 15907 + }, + { + "epoch": 1.4412357590994542, + "grad_norm": 0.882104754447937, + "learning_rate": 0.00010392678064399204, + "loss": 2.8218, + "step": 15908 + }, + { + "epoch": 1.4413263572739008, + "grad_norm": 0.8780916929244995, + "learning_rate": 0.00010392073944300126, + "loss": 2.4623, + "step": 15909 + }, + { + "epoch": 1.4414169554483478, + "grad_norm": 0.9145567417144775, + "learning_rate": 0.00010391469824201052, + "loss": 2.7059, + "step": 15910 + }, + { + "epoch": 1.4415075536227944, + "grad_norm": 0.8283483386039734, + "learning_rate": 0.00010390865704101976, + "loss": 2.5732, + "step": 15911 + }, + { + "epoch": 1.4415981517972414, + "grad_norm": 0.9917430877685547, + "learning_rate": 0.000103902615840029, + "loss": 2.7636, + "step": 15912 + }, + { + "epoch": 1.441688749971688, + "grad_norm": 0.9726248383522034, + "learning_rate": 0.00010389657463903824, + "loss": 2.7251, + "step": 15913 + }, + { + "epoch": 1.441779348146135, + "grad_norm": 0.8720115423202515, + "learning_rate": 0.00010389053343804749, + "loss": 2.8066, + "step": 15914 + }, + { + "epoch": 1.4418699463205815, + "grad_norm": 0.8937447667121887, + "learning_rate": 0.00010388449223705674, + "loss": 2.8855, + "step": 15915 + }, + { + "epoch": 1.4419605444950285, + "grad_norm": 0.8852982521057129, + "learning_rate": 0.00010387845103606597, + "loss": 2.6541, + "step": 15916 + }, + { + "epoch": 1.442051142669475, + "grad_norm": 0.8940990567207336, + "learning_rate": 0.00010387240983507523, + "loss": 2.6867, + "step": 15917 + }, + { + "epoch": 1.442141740843922, + "grad_norm": 0.9215620756149292, + "learning_rate": 0.00010386636863408445, + "loss": 2.7614, + "step": 15918 + }, + { + "epoch": 1.4422323390183687, + "grad_norm": 0.8867507576942444, + "learning_rate": 0.00010386032743309371, + "loss": 2.8476, + "step": 15919 + }, + { + "epoch": 1.4423229371928157, + "grad_norm": 0.8634175062179565, + "learning_rate": 0.00010385428623210293, + "loss": 2.4939, + "step": 15920 + }, + { + "epoch": 1.4424135353672622, + "grad_norm": 0.9054033756256104, + "learning_rate": 0.0001038482450311122, + "loss": 2.6912, + "step": 15921 + }, + { + "epoch": 1.4425041335417093, + "grad_norm": 0.8759632706642151, + "learning_rate": 0.00010384220383012144, + "loss": 2.8529, + "step": 15922 + }, + { + "epoch": 1.4425947317161558, + "grad_norm": 0.853156328201294, + "learning_rate": 0.00010383616262913068, + "loss": 2.4081, + "step": 15923 + }, + { + "epoch": 1.4426853298906028, + "grad_norm": 0.906978964805603, + "learning_rate": 0.00010383012142813992, + "loss": 2.8675, + "step": 15924 + }, + { + "epoch": 1.4427759280650494, + "grad_norm": 0.8637932538986206, + "learning_rate": 0.00010382408022714916, + "loss": 1.973, + "step": 15925 + }, + { + "epoch": 1.4428665262394964, + "grad_norm": 0.8919674158096313, + "learning_rate": 0.0001038180390261584, + "loss": 2.9513, + "step": 15926 + }, + { + "epoch": 1.442957124413943, + "grad_norm": 0.8832579851150513, + "learning_rate": 0.00010381199782516764, + "loss": 2.8088, + "step": 15927 + }, + { + "epoch": 1.44304772258839, + "grad_norm": 0.9001261591911316, + "learning_rate": 0.00010380595662417689, + "loss": 2.7324, + "step": 15928 + }, + { + "epoch": 1.4431383207628365, + "grad_norm": 0.9152958989143372, + "learning_rate": 0.00010379991542318615, + "loss": 2.6116, + "step": 15929 + }, + { + "epoch": 1.4432289189372833, + "grad_norm": 0.955616295337677, + "learning_rate": 0.00010379387422219538, + "loss": 2.7006, + "step": 15930 + }, + { + "epoch": 1.4433195171117301, + "grad_norm": 0.9700506329536438, + "learning_rate": 0.00010378783302120463, + "loss": 2.7343, + "step": 15931 + }, + { + "epoch": 1.443410115286177, + "grad_norm": 0.7881382703781128, + "learning_rate": 0.00010378179182021386, + "loss": 2.2368, + "step": 15932 + }, + { + "epoch": 1.4435007134606237, + "grad_norm": 0.9743044972419739, + "learning_rate": 0.00010377575061922311, + "loss": 2.665, + "step": 15933 + }, + { + "epoch": 1.4435913116350705, + "grad_norm": 0.8577666878700256, + "learning_rate": 0.00010376970941823234, + "loss": 2.5487, + "step": 15934 + }, + { + "epoch": 1.4436819098095173, + "grad_norm": 0.8959378600120544, + "learning_rate": 0.00010376366821724159, + "loss": 2.7478, + "step": 15935 + }, + { + "epoch": 1.443772507983964, + "grad_norm": 0.880902111530304, + "learning_rate": 0.00010375762701625083, + "loss": 1.9981, + "step": 15936 + }, + { + "epoch": 1.4438631061584108, + "grad_norm": 0.721723198890686, + "learning_rate": 0.00010375158581526007, + "loss": 1.9224, + "step": 15937 + }, + { + "epoch": 1.4439537043328576, + "grad_norm": 0.9363014698028564, + "learning_rate": 0.00010374554461426934, + "loss": 2.7103, + "step": 15938 + }, + { + "epoch": 1.4440443025073044, + "grad_norm": 0.9234932661056519, + "learning_rate": 0.00010373950341327856, + "loss": 2.8825, + "step": 15939 + }, + { + "epoch": 1.4441349006817512, + "grad_norm": 0.8669955134391785, + "learning_rate": 0.00010373346221228782, + "loss": 2.6736, + "step": 15940 + }, + { + "epoch": 1.444225498856198, + "grad_norm": 0.8311142921447754, + "learning_rate": 0.00010372742101129704, + "loss": 2.0048, + "step": 15941 + }, + { + "epoch": 1.4443160970306448, + "grad_norm": 0.7755846977233887, + "learning_rate": 0.0001037213798103063, + "loss": 2.0635, + "step": 15942 + }, + { + "epoch": 1.4444066952050916, + "grad_norm": 0.9024965167045593, + "learning_rate": 0.00010371533860931553, + "loss": 2.7056, + "step": 15943 + }, + { + "epoch": 1.4444972933795384, + "grad_norm": 0.8644944429397583, + "learning_rate": 0.00010370929740832478, + "loss": 2.6983, + "step": 15944 + }, + { + "epoch": 1.4445878915539851, + "grad_norm": 0.8862882256507874, + "learning_rate": 0.00010370325620733403, + "loss": 2.6864, + "step": 15945 + }, + { + "epoch": 1.444678489728432, + "grad_norm": 0.9091517329216003, + "learning_rate": 0.00010369721500634326, + "loss": 2.5906, + "step": 15946 + }, + { + "epoch": 1.4447690879028787, + "grad_norm": 0.9112953543663025, + "learning_rate": 0.00010369117380535251, + "loss": 2.4708, + "step": 15947 + }, + { + "epoch": 1.4448596860773255, + "grad_norm": 0.970741331577301, + "learning_rate": 0.00010368513260436174, + "loss": 2.5911, + "step": 15948 + }, + { + "epoch": 1.4449502842517723, + "grad_norm": 1.1705747842788696, + "learning_rate": 0.000103679091403371, + "loss": 2.7868, + "step": 15949 + }, + { + "epoch": 1.445040882426219, + "grad_norm": 0.8984044194221497, + "learning_rate": 0.00010367305020238023, + "loss": 2.8485, + "step": 15950 + }, + { + "epoch": 1.4451314806006659, + "grad_norm": 0.9440551400184631, + "learning_rate": 0.00010366700900138949, + "loss": 3.0717, + "step": 15951 + }, + { + "epoch": 1.4452220787751127, + "grad_norm": 0.8573768138885498, + "learning_rate": 0.00010366096780039873, + "loss": 2.0924, + "step": 15952 + }, + { + "epoch": 1.4453126769495594, + "grad_norm": 0.9279734492301941, + "learning_rate": 0.00010365492659940797, + "loss": 2.4573, + "step": 15953 + }, + { + "epoch": 1.4454032751240062, + "grad_norm": 0.8307550549507141, + "learning_rate": 0.00010364888539841722, + "loss": 1.9268, + "step": 15954 + }, + { + "epoch": 1.445493873298453, + "grad_norm": 0.961914598941803, + "learning_rate": 0.00010364284419742645, + "loss": 2.7093, + "step": 15955 + }, + { + "epoch": 1.4455844714728998, + "grad_norm": 0.8801884055137634, + "learning_rate": 0.0001036368029964357, + "loss": 2.6122, + "step": 15956 + }, + { + "epoch": 1.4456750696473466, + "grad_norm": 0.9388702511787415, + "learning_rate": 0.00010363076179544493, + "loss": 2.8459, + "step": 15957 + }, + { + "epoch": 1.4457656678217934, + "grad_norm": 0.9441458582878113, + "learning_rate": 0.00010362472059445418, + "loss": 2.7494, + "step": 15958 + }, + { + "epoch": 1.4458562659962402, + "grad_norm": 0.9846778512001038, + "learning_rate": 0.00010361867939346344, + "loss": 2.6418, + "step": 15959 + }, + { + "epoch": 1.445946864170687, + "grad_norm": 0.9052222967147827, + "learning_rate": 0.00010361263819247266, + "loss": 2.8024, + "step": 15960 + }, + { + "epoch": 1.4460374623451338, + "grad_norm": 0.772263765335083, + "learning_rate": 0.00010360659699148192, + "loss": 2.0265, + "step": 15961 + }, + { + "epoch": 1.4461280605195805, + "grad_norm": 0.8759628534317017, + "learning_rate": 0.00010360055579049116, + "loss": 2.6384, + "step": 15962 + }, + { + "epoch": 1.4462186586940273, + "grad_norm": 0.941504716873169, + "learning_rate": 0.0001035945145895004, + "loss": 2.7454, + "step": 15963 + }, + { + "epoch": 1.4463092568684741, + "grad_norm": 0.9384605288505554, + "learning_rate": 0.00010358847338850964, + "loss": 2.9749, + "step": 15964 + }, + { + "epoch": 1.446399855042921, + "grad_norm": 0.8113866448402405, + "learning_rate": 0.00010358243218751889, + "loss": 2.0162, + "step": 15965 + }, + { + "epoch": 1.4464904532173677, + "grad_norm": 0.9050496816635132, + "learning_rate": 0.00010357639098652812, + "loss": 2.6954, + "step": 15966 + }, + { + "epoch": 1.4465810513918145, + "grad_norm": 0.8871278166770935, + "learning_rate": 0.00010357034978553737, + "loss": 2.6881, + "step": 15967 + }, + { + "epoch": 1.4466716495662613, + "grad_norm": 0.848561704158783, + "learning_rate": 0.00010356430858454661, + "loss": 2.6071, + "step": 15968 + }, + { + "epoch": 1.446762247740708, + "grad_norm": 0.879438042640686, + "learning_rate": 0.00010355826738355585, + "loss": 2.7447, + "step": 15969 + }, + { + "epoch": 1.4468528459151548, + "grad_norm": 0.8833010792732239, + "learning_rate": 0.00010355222618256511, + "loss": 2.6576, + "step": 15970 + }, + { + "epoch": 1.4469434440896016, + "grad_norm": 0.8822922110557556, + "learning_rate": 0.00010354618498157433, + "loss": 2.5317, + "step": 15971 + }, + { + "epoch": 1.4470340422640484, + "grad_norm": 0.9859601855278015, + "learning_rate": 0.00010354014378058359, + "loss": 2.6779, + "step": 15972 + }, + { + "epoch": 1.4471246404384952, + "grad_norm": 0.9659163355827332, + "learning_rate": 0.00010353410257959281, + "loss": 2.625, + "step": 15973 + }, + { + "epoch": 1.447215238612942, + "grad_norm": 0.9569882154464722, + "learning_rate": 0.00010352806137860207, + "loss": 2.7628, + "step": 15974 + }, + { + "epoch": 1.4473058367873888, + "grad_norm": 0.9087982773780823, + "learning_rate": 0.00010352202017761132, + "loss": 2.9065, + "step": 15975 + }, + { + "epoch": 1.4473964349618356, + "grad_norm": 0.8931900858879089, + "learning_rate": 0.00010351597897662055, + "loss": 2.6188, + "step": 15976 + }, + { + "epoch": 1.4474870331362824, + "grad_norm": 0.9040892124176025, + "learning_rate": 0.0001035099377756298, + "loss": 2.8794, + "step": 15977 + }, + { + "epoch": 1.4475776313107291, + "grad_norm": 0.960062563419342, + "learning_rate": 0.00010350389657463904, + "loss": 2.8341, + "step": 15978 + }, + { + "epoch": 1.447668229485176, + "grad_norm": 1.0044913291931152, + "learning_rate": 0.00010349785537364828, + "loss": 2.9009, + "step": 15979 + }, + { + "epoch": 1.4477588276596227, + "grad_norm": 0.913404643535614, + "learning_rate": 0.00010349181417265752, + "loss": 2.6677, + "step": 15980 + }, + { + "epoch": 1.4478494258340695, + "grad_norm": 1.0523500442504883, + "learning_rate": 0.00010348577297166678, + "loss": 2.8951, + "step": 15981 + }, + { + "epoch": 1.4479400240085163, + "grad_norm": 0.9014593958854675, + "learning_rate": 0.00010347973177067603, + "loss": 2.6403, + "step": 15982 + }, + { + "epoch": 1.448030622182963, + "grad_norm": 0.9364784955978394, + "learning_rate": 0.00010347369056968526, + "loss": 2.7606, + "step": 15983 + }, + { + "epoch": 1.4481212203574099, + "grad_norm": 0.8833152651786804, + "learning_rate": 0.00010346764936869451, + "loss": 2.6992, + "step": 15984 + }, + { + "epoch": 1.4482118185318567, + "grad_norm": 0.8958204984664917, + "learning_rate": 0.00010346160816770374, + "loss": 2.7845, + "step": 15985 + }, + { + "epoch": 1.4483024167063034, + "grad_norm": 0.9119958877563477, + "learning_rate": 0.00010345556696671299, + "loss": 2.7792, + "step": 15986 + }, + { + "epoch": 1.4483930148807502, + "grad_norm": 0.9085328578948975, + "learning_rate": 0.00010344952576572222, + "loss": 2.776, + "step": 15987 + }, + { + "epoch": 1.448483613055197, + "grad_norm": 0.9419613480567932, + "learning_rate": 0.00010344348456473147, + "loss": 2.6362, + "step": 15988 + }, + { + "epoch": 1.4485742112296438, + "grad_norm": 0.8921108841896057, + "learning_rate": 0.00010343744336374073, + "loss": 2.7495, + "step": 15989 + }, + { + "epoch": 1.4486648094040904, + "grad_norm": 0.8464099764823914, + "learning_rate": 0.00010343140216274995, + "loss": 2.7757, + "step": 15990 + }, + { + "epoch": 1.4487554075785374, + "grad_norm": 0.8748329281806946, + "learning_rate": 0.00010342536096175921, + "loss": 2.5571, + "step": 15991 + }, + { + "epoch": 1.448846005752984, + "grad_norm": 0.9115822911262512, + "learning_rate": 0.00010341931976076843, + "loss": 2.5402, + "step": 15992 + }, + { + "epoch": 1.448936603927431, + "grad_norm": 0.8813031315803528, + "learning_rate": 0.0001034132785597777, + "loss": 2.5782, + "step": 15993 + }, + { + "epoch": 1.4490272021018775, + "grad_norm": 0.9248985052108765, + "learning_rate": 0.00010340723735878693, + "loss": 2.7746, + "step": 15994 + }, + { + "epoch": 1.4491178002763245, + "grad_norm": 0.9796416163444519, + "learning_rate": 0.00010340119615779618, + "loss": 2.7254, + "step": 15995 + }, + { + "epoch": 1.449208398450771, + "grad_norm": 0.9616577625274658, + "learning_rate": 0.00010339515495680543, + "loss": 2.5465, + "step": 15996 + }, + { + "epoch": 1.449298996625218, + "grad_norm": 0.9016675353050232, + "learning_rate": 0.00010338911375581466, + "loss": 2.7958, + "step": 15997 + }, + { + "epoch": 1.4493895947996647, + "grad_norm": 0.9343098998069763, + "learning_rate": 0.00010338307255482391, + "loss": 2.8905, + "step": 15998 + }, + { + "epoch": 1.4494801929741117, + "grad_norm": 0.8904984593391418, + "learning_rate": 0.00010337703135383314, + "loss": 2.8609, + "step": 15999 + }, + { + "epoch": 1.4495707911485582, + "grad_norm": 0.9381710290908813, + "learning_rate": 0.00010337099015284239, + "loss": 2.906, + "step": 16000 + }, + { + "epoch": 1.4496613893230053, + "grad_norm": 0.8725552558898926, + "learning_rate": 0.00010336494895185162, + "loss": 2.6772, + "step": 16001 + }, + { + "epoch": 1.4497519874974518, + "grad_norm": 0.8325265645980835, + "learning_rate": 0.00010335890775086088, + "loss": 2.6136, + "step": 16002 + }, + { + "epoch": 1.4498425856718988, + "grad_norm": 0.894798219203949, + "learning_rate": 0.0001033528665498701, + "loss": 2.818, + "step": 16003 + }, + { + "epoch": 1.4499331838463454, + "grad_norm": 0.87880539894104, + "learning_rate": 0.00010334682534887937, + "loss": 2.8234, + "step": 16004 + }, + { + "epoch": 1.4500237820207924, + "grad_norm": 0.8914008736610413, + "learning_rate": 0.00010334078414788861, + "loss": 2.9341, + "step": 16005 + }, + { + "epoch": 1.450114380195239, + "grad_norm": 0.8675506114959717, + "learning_rate": 0.00010333474294689785, + "loss": 2.4542, + "step": 16006 + }, + { + "epoch": 1.450204978369686, + "grad_norm": 0.7860566973686218, + "learning_rate": 0.0001033287017459071, + "loss": 2.0445, + "step": 16007 + }, + { + "epoch": 1.4502955765441325, + "grad_norm": 0.9856662154197693, + "learning_rate": 0.00010332266054491633, + "loss": 2.9773, + "step": 16008 + }, + { + "epoch": 1.4503861747185796, + "grad_norm": 0.9480153322219849, + "learning_rate": 0.00010331661934392558, + "loss": 2.8338, + "step": 16009 + }, + { + "epoch": 1.4504767728930261, + "grad_norm": 0.7824752926826477, + "learning_rate": 0.00010331057814293481, + "loss": 2.1746, + "step": 16010 + }, + { + "epoch": 1.450567371067473, + "grad_norm": 0.9064708948135376, + "learning_rate": 0.00010330453694194406, + "loss": 2.6674, + "step": 16011 + }, + { + "epoch": 1.4506579692419197, + "grad_norm": 0.8573368787765503, + "learning_rate": 0.00010329849574095332, + "loss": 2.5283, + "step": 16012 + }, + { + "epoch": 1.4507485674163665, + "grad_norm": 0.8782755136489868, + "learning_rate": 0.00010329245453996254, + "loss": 2.5923, + "step": 16013 + }, + { + "epoch": 1.4508391655908133, + "grad_norm": 1.0004963874816895, + "learning_rate": 0.0001032864133389718, + "loss": 2.5689, + "step": 16014 + }, + { + "epoch": 1.45092976376526, + "grad_norm": 1.0411244630813599, + "learning_rate": 0.00010328037213798103, + "loss": 2.6235, + "step": 16015 + }, + { + "epoch": 1.4510203619397068, + "grad_norm": 0.8905091881752014, + "learning_rate": 0.00010327433093699028, + "loss": 2.7679, + "step": 16016 + }, + { + "epoch": 1.4511109601141536, + "grad_norm": 0.9397721886634827, + "learning_rate": 0.00010326828973599952, + "loss": 2.9142, + "step": 16017 + }, + { + "epoch": 1.4512015582886004, + "grad_norm": 0.9040799736976624, + "learning_rate": 0.00010326224853500876, + "loss": 2.552, + "step": 16018 + }, + { + "epoch": 1.4512921564630472, + "grad_norm": 0.9056157469749451, + "learning_rate": 0.00010325620733401801, + "loss": 2.7252, + "step": 16019 + }, + { + "epoch": 1.451382754637494, + "grad_norm": 0.8870717883110046, + "learning_rate": 0.00010325016613302725, + "loss": 2.7264, + "step": 16020 + }, + { + "epoch": 1.4514733528119408, + "grad_norm": 0.918647050857544, + "learning_rate": 0.00010324412493203651, + "loss": 2.541, + "step": 16021 + }, + { + "epoch": 1.4515639509863876, + "grad_norm": 0.9798156023025513, + "learning_rate": 0.00010323808373104573, + "loss": 2.6737, + "step": 16022 + }, + { + "epoch": 1.4516545491608344, + "grad_norm": 0.9042078852653503, + "learning_rate": 0.00010323204253005499, + "loss": 2.8654, + "step": 16023 + }, + { + "epoch": 1.4517451473352811, + "grad_norm": 0.9918453693389893, + "learning_rate": 0.00010322600132906421, + "loss": 2.6649, + "step": 16024 + }, + { + "epoch": 1.451835745509728, + "grad_norm": 1.033980369567871, + "learning_rate": 0.00010321996012807347, + "loss": 2.5346, + "step": 16025 + }, + { + "epoch": 1.4519263436841747, + "grad_norm": 0.8921496868133545, + "learning_rate": 0.00010321391892708272, + "loss": 2.6236, + "step": 16026 + }, + { + "epoch": 1.4520169418586215, + "grad_norm": 0.8739498853683472, + "learning_rate": 0.00010320787772609195, + "loss": 2.7868, + "step": 16027 + }, + { + "epoch": 1.4521075400330683, + "grad_norm": 0.8563128709793091, + "learning_rate": 0.0001032018365251012, + "loss": 2.9125, + "step": 16028 + }, + { + "epoch": 1.452198138207515, + "grad_norm": 0.8797217011451721, + "learning_rate": 0.00010319579532411043, + "loss": 2.7222, + "step": 16029 + }, + { + "epoch": 1.4522887363819619, + "grad_norm": 0.9453586339950562, + "learning_rate": 0.00010318975412311968, + "loss": 2.769, + "step": 16030 + }, + { + "epoch": 1.4523793345564087, + "grad_norm": 0.9158952832221985, + "learning_rate": 0.00010318371292212892, + "loss": 2.6878, + "step": 16031 + }, + { + "epoch": 1.4524699327308555, + "grad_norm": 0.9204325079917908, + "learning_rate": 0.00010317767172113816, + "loss": 2.7975, + "step": 16032 + }, + { + "epoch": 1.4525605309053022, + "grad_norm": 0.8751494884490967, + "learning_rate": 0.0001031716305201474, + "loss": 2.7254, + "step": 16033 + }, + { + "epoch": 1.452651129079749, + "grad_norm": 0.917327344417572, + "learning_rate": 0.00010316558931915666, + "loss": 2.6666, + "step": 16034 + }, + { + "epoch": 1.4527417272541958, + "grad_norm": 0.8676403760910034, + "learning_rate": 0.0001031595481181659, + "loss": 2.6561, + "step": 16035 + }, + { + "epoch": 1.4528323254286426, + "grad_norm": 0.8887217044830322, + "learning_rate": 0.00010315350691717514, + "loss": 2.5914, + "step": 16036 + }, + { + "epoch": 1.4529229236030894, + "grad_norm": 0.9242755174636841, + "learning_rate": 0.00010314746571618439, + "loss": 2.8407, + "step": 16037 + }, + { + "epoch": 1.4530135217775362, + "grad_norm": 0.8392877578735352, + "learning_rate": 0.00010314142451519362, + "loss": 2.5432, + "step": 16038 + }, + { + "epoch": 1.453104119951983, + "grad_norm": 0.9383438229560852, + "learning_rate": 0.00010313538331420287, + "loss": 2.9039, + "step": 16039 + }, + { + "epoch": 1.4531947181264298, + "grad_norm": 0.8915058970451355, + "learning_rate": 0.0001031293421132121, + "loss": 2.5195, + "step": 16040 + }, + { + "epoch": 1.4532853163008765, + "grad_norm": 0.9134679436683655, + "learning_rate": 0.00010312330091222135, + "loss": 2.8219, + "step": 16041 + }, + { + "epoch": 1.4533759144753233, + "grad_norm": 0.9801425337791443, + "learning_rate": 0.00010311725971123061, + "loss": 2.6013, + "step": 16042 + }, + { + "epoch": 1.4534665126497701, + "grad_norm": 0.7401105165481567, + "learning_rate": 0.00010311121851023983, + "loss": 2.1532, + "step": 16043 + }, + { + "epoch": 1.453557110824217, + "grad_norm": 0.9473139643669128, + "learning_rate": 0.0001031051773092491, + "loss": 2.9959, + "step": 16044 + }, + { + "epoch": 1.4536477089986637, + "grad_norm": 0.8875133395195007, + "learning_rate": 0.00010309913610825831, + "loss": 2.6911, + "step": 16045 + }, + { + "epoch": 1.4537383071731105, + "grad_norm": 0.932736337184906, + "learning_rate": 0.00010309309490726758, + "loss": 2.8104, + "step": 16046 + }, + { + "epoch": 1.4538289053475573, + "grad_norm": 0.8948931694030762, + "learning_rate": 0.00010308705370627681, + "loss": 2.6586, + "step": 16047 + }, + { + "epoch": 1.453919503522004, + "grad_norm": 0.9383507966995239, + "learning_rate": 0.00010308101250528606, + "loss": 2.993, + "step": 16048 + }, + { + "epoch": 1.4540101016964508, + "grad_norm": 0.8780879378318787, + "learning_rate": 0.0001030749713042953, + "loss": 2.5547, + "step": 16049 + }, + { + "epoch": 1.4541006998708976, + "grad_norm": 0.898321807384491, + "learning_rate": 0.00010306893010330454, + "loss": 2.6381, + "step": 16050 + }, + { + "epoch": 1.4541912980453444, + "grad_norm": 0.8786556720733643, + "learning_rate": 0.00010306288890231379, + "loss": 2.7428, + "step": 16051 + }, + { + "epoch": 1.4542818962197912, + "grad_norm": 0.9406435489654541, + "learning_rate": 0.00010305684770132302, + "loss": 2.6117, + "step": 16052 + }, + { + "epoch": 1.454372494394238, + "grad_norm": 0.9160633087158203, + "learning_rate": 0.00010305080650033228, + "loss": 2.6339, + "step": 16053 + }, + { + "epoch": 1.4544630925686848, + "grad_norm": 0.7768097519874573, + "learning_rate": 0.0001030447652993415, + "loss": 1.936, + "step": 16054 + }, + { + "epoch": 1.4545536907431316, + "grad_norm": 0.9474305510520935, + "learning_rate": 0.00010303872409835076, + "loss": 2.886, + "step": 16055 + }, + { + "epoch": 1.4546442889175784, + "grad_norm": 0.9260689616203308, + "learning_rate": 0.00010303268289736001, + "loss": 2.6134, + "step": 16056 + }, + { + "epoch": 1.4547348870920251, + "grad_norm": 0.8464567065238953, + "learning_rate": 0.00010302664169636924, + "loss": 2.8697, + "step": 16057 + }, + { + "epoch": 1.454825485266472, + "grad_norm": 0.8868590593338013, + "learning_rate": 0.00010302060049537849, + "loss": 2.6263, + "step": 16058 + }, + { + "epoch": 1.4549160834409187, + "grad_norm": 0.906585693359375, + "learning_rate": 0.00010301455929438773, + "loss": 2.939, + "step": 16059 + }, + { + "epoch": 1.4550066816153655, + "grad_norm": 0.9009336829185486, + "learning_rate": 0.00010300851809339697, + "loss": 2.4115, + "step": 16060 + }, + { + "epoch": 1.4550972797898123, + "grad_norm": 0.9271279573440552, + "learning_rate": 0.00010300247689240621, + "loss": 2.648, + "step": 16061 + }, + { + "epoch": 1.455187877964259, + "grad_norm": 0.9109893441200256, + "learning_rate": 0.00010299643569141546, + "loss": 2.6006, + "step": 16062 + }, + { + "epoch": 1.4552784761387059, + "grad_norm": 0.8872578144073486, + "learning_rate": 0.00010299039449042469, + "loss": 2.6889, + "step": 16063 + }, + { + "epoch": 1.4553690743131527, + "grad_norm": 0.9035077691078186, + "learning_rate": 0.00010298435328943394, + "loss": 2.8189, + "step": 16064 + }, + { + "epoch": 1.4554596724875994, + "grad_norm": 1.0216156244277954, + "learning_rate": 0.0001029783120884432, + "loss": 2.5871, + "step": 16065 + }, + { + "epoch": 1.4555502706620462, + "grad_norm": 0.919145405292511, + "learning_rate": 0.00010297227088745243, + "loss": 2.5203, + "step": 16066 + }, + { + "epoch": 1.455640868836493, + "grad_norm": 0.9109575152397156, + "learning_rate": 0.00010296622968646168, + "loss": 2.6724, + "step": 16067 + }, + { + "epoch": 1.4557314670109398, + "grad_norm": 0.9047344326972961, + "learning_rate": 0.00010296018848547091, + "loss": 2.5401, + "step": 16068 + }, + { + "epoch": 1.4558220651853866, + "grad_norm": 0.8767677545547485, + "learning_rate": 0.00010295414728448016, + "loss": 2.792, + "step": 16069 + }, + { + "epoch": 1.4559126633598334, + "grad_norm": 0.9247723817825317, + "learning_rate": 0.0001029481060834894, + "loss": 2.6404, + "step": 16070 + }, + { + "epoch": 1.45600326153428, + "grad_norm": 0.8640480041503906, + "learning_rate": 0.00010294206488249864, + "loss": 2.8426, + "step": 16071 + }, + { + "epoch": 1.456093859708727, + "grad_norm": 0.9440805912017822, + "learning_rate": 0.0001029360236815079, + "loss": 2.9768, + "step": 16072 + }, + { + "epoch": 1.4561844578831735, + "grad_norm": 0.8906153440475464, + "learning_rate": 0.00010292998248051713, + "loss": 2.8301, + "step": 16073 + }, + { + "epoch": 1.4562750560576205, + "grad_norm": 1.021181583404541, + "learning_rate": 0.00010292394127952639, + "loss": 2.7178, + "step": 16074 + }, + { + "epoch": 1.456365654232067, + "grad_norm": 0.9059416055679321, + "learning_rate": 0.0001029179000785356, + "loss": 2.7339, + "step": 16075 + }, + { + "epoch": 1.456456252406514, + "grad_norm": 0.9328790307044983, + "learning_rate": 0.00010291185887754487, + "loss": 2.7028, + "step": 16076 + }, + { + "epoch": 1.4565468505809607, + "grad_norm": 0.8581868410110474, + "learning_rate": 0.00010290581767655409, + "loss": 2.9516, + "step": 16077 + }, + { + "epoch": 1.4566374487554077, + "grad_norm": 0.81711345911026, + "learning_rate": 0.00010289977647556335, + "loss": 1.8977, + "step": 16078 + }, + { + "epoch": 1.4567280469298542, + "grad_norm": 0.8569017648696899, + "learning_rate": 0.0001028937352745726, + "loss": 2.6061, + "step": 16079 + }, + { + "epoch": 1.4568186451043013, + "grad_norm": 0.8758562803268433, + "learning_rate": 0.00010288769407358183, + "loss": 2.5107, + "step": 16080 + }, + { + "epoch": 1.4569092432787478, + "grad_norm": 0.8432973623275757, + "learning_rate": 0.00010288165287259108, + "loss": 2.2714, + "step": 16081 + }, + { + "epoch": 1.4569998414531948, + "grad_norm": 0.8966032266616821, + "learning_rate": 0.00010287561167160031, + "loss": 2.3993, + "step": 16082 + }, + { + "epoch": 1.4570904396276414, + "grad_norm": 0.8646834492683411, + "learning_rate": 0.00010286957047060956, + "loss": 2.8251, + "step": 16083 + }, + { + "epoch": 1.4571810378020884, + "grad_norm": 0.9461577534675598, + "learning_rate": 0.0001028635292696188, + "loss": 2.6904, + "step": 16084 + }, + { + "epoch": 1.457271635976535, + "grad_norm": 0.9673038721084595, + "learning_rate": 0.00010285748806862806, + "loss": 2.8184, + "step": 16085 + }, + { + "epoch": 1.457362234150982, + "grad_norm": 0.8934886455535889, + "learning_rate": 0.0001028514468676373, + "loss": 2.6926, + "step": 16086 + }, + { + "epoch": 1.4574528323254285, + "grad_norm": 0.986755907535553, + "learning_rate": 0.00010284540566664654, + "loss": 2.5852, + "step": 16087 + }, + { + "epoch": 1.4575434304998756, + "grad_norm": 0.8887634873390198, + "learning_rate": 0.00010283936446565578, + "loss": 2.7407, + "step": 16088 + }, + { + "epoch": 1.4576340286743221, + "grad_norm": 0.8851547837257385, + "learning_rate": 0.00010283332326466502, + "loss": 2.767, + "step": 16089 + }, + { + "epoch": 1.4577246268487691, + "grad_norm": 0.9154039025306702, + "learning_rate": 0.00010282728206367427, + "loss": 2.7551, + "step": 16090 + }, + { + "epoch": 1.4578152250232157, + "grad_norm": 0.9085490107536316, + "learning_rate": 0.0001028212408626835, + "loss": 2.7544, + "step": 16091 + }, + { + "epoch": 1.4579058231976625, + "grad_norm": 0.8832782506942749, + "learning_rate": 0.00010281519966169275, + "loss": 2.8696, + "step": 16092 + }, + { + "epoch": 1.4579964213721093, + "grad_norm": 0.9176831245422363, + "learning_rate": 0.00010280915846070198, + "loss": 2.7383, + "step": 16093 + }, + { + "epoch": 1.458087019546556, + "grad_norm": 0.9044227004051208, + "learning_rate": 0.00010280311725971123, + "loss": 2.816, + "step": 16094 + }, + { + "epoch": 1.4581776177210029, + "grad_norm": 0.8332406282424927, + "learning_rate": 0.00010279707605872049, + "loss": 2.714, + "step": 16095 + }, + { + "epoch": 1.4582682158954496, + "grad_norm": 0.9012326002120972, + "learning_rate": 0.00010279103485772971, + "loss": 2.6362, + "step": 16096 + }, + { + "epoch": 1.4583588140698964, + "grad_norm": 0.8533813953399658, + "learning_rate": 0.00010278499365673897, + "loss": 1.958, + "step": 16097 + }, + { + "epoch": 1.4584494122443432, + "grad_norm": 0.9623411297798157, + "learning_rate": 0.00010277895245574821, + "loss": 2.5894, + "step": 16098 + }, + { + "epoch": 1.45854001041879, + "grad_norm": 1.0067411661148071, + "learning_rate": 0.00010277291125475745, + "loss": 2.5624, + "step": 16099 + }, + { + "epoch": 1.4586306085932368, + "grad_norm": 0.9114263653755188, + "learning_rate": 0.00010276687005376669, + "loss": 2.8095, + "step": 16100 + }, + { + "epoch": 1.4587212067676836, + "grad_norm": 0.8210362195968628, + "learning_rate": 0.00010276082885277594, + "loss": 2.5384, + "step": 16101 + }, + { + "epoch": 1.4588118049421304, + "grad_norm": 0.8897750973701477, + "learning_rate": 0.00010275478765178518, + "loss": 2.8067, + "step": 16102 + }, + { + "epoch": 1.4589024031165772, + "grad_norm": 0.8871409296989441, + "learning_rate": 0.00010274874645079442, + "loss": 2.7085, + "step": 16103 + }, + { + "epoch": 1.458993001291024, + "grad_norm": 0.8783572316169739, + "learning_rate": 0.00010274270524980368, + "loss": 2.6672, + "step": 16104 + }, + { + "epoch": 1.4590835994654707, + "grad_norm": 0.9175392985343933, + "learning_rate": 0.0001027366640488129, + "loss": 2.9258, + "step": 16105 + }, + { + "epoch": 1.4591741976399175, + "grad_norm": 0.9113894701004028, + "learning_rate": 0.00010273062284782216, + "loss": 2.5878, + "step": 16106 + }, + { + "epoch": 1.4592647958143643, + "grad_norm": 0.8391130566596985, + "learning_rate": 0.00010272458164683138, + "loss": 2.5337, + "step": 16107 + }, + { + "epoch": 1.459355393988811, + "grad_norm": 0.9163378477096558, + "learning_rate": 0.00010271854044584064, + "loss": 2.6765, + "step": 16108 + }, + { + "epoch": 1.4594459921632579, + "grad_norm": 0.8045284152030945, + "learning_rate": 0.00010271249924484989, + "loss": 1.8796, + "step": 16109 + }, + { + "epoch": 1.4595365903377047, + "grad_norm": 0.8834563493728638, + "learning_rate": 0.00010270645804385912, + "loss": 3.097, + "step": 16110 + }, + { + "epoch": 1.4596271885121515, + "grad_norm": 0.8560469150543213, + "learning_rate": 0.00010270041684286837, + "loss": 3.0084, + "step": 16111 + }, + { + "epoch": 1.4597177866865982, + "grad_norm": 0.8722403049468994, + "learning_rate": 0.0001026943756418776, + "loss": 1.979, + "step": 16112 + }, + { + "epoch": 1.459808384861045, + "grad_norm": 0.8787767887115479, + "learning_rate": 0.00010268833444088685, + "loss": 2.7672, + "step": 16113 + }, + { + "epoch": 1.4598989830354918, + "grad_norm": 0.9399629831314087, + "learning_rate": 0.00010268229323989609, + "loss": 2.4774, + "step": 16114 + }, + { + "epoch": 1.4599895812099386, + "grad_norm": 0.8992600440979004, + "learning_rate": 0.00010267625203890533, + "loss": 2.6303, + "step": 16115 + }, + { + "epoch": 1.4600801793843854, + "grad_norm": 0.9750590920448303, + "learning_rate": 0.0001026702108379146, + "loss": 2.653, + "step": 16116 + }, + { + "epoch": 1.4601707775588322, + "grad_norm": 0.8881676197052002, + "learning_rate": 0.00010266416963692383, + "loss": 2.7846, + "step": 16117 + }, + { + "epoch": 1.460261375733279, + "grad_norm": 0.8970430493354797, + "learning_rate": 0.00010265812843593308, + "loss": 2.5923, + "step": 16118 + }, + { + "epoch": 1.4603519739077258, + "grad_norm": 0.9611961245536804, + "learning_rate": 0.00010265208723494231, + "loss": 2.6901, + "step": 16119 + }, + { + "epoch": 1.4604425720821725, + "grad_norm": 1.0432662963867188, + "learning_rate": 0.00010264604603395156, + "loss": 2.5769, + "step": 16120 + }, + { + "epoch": 1.4605331702566193, + "grad_norm": 0.8710027933120728, + "learning_rate": 0.0001026400048329608, + "loss": 2.6421, + "step": 16121 + }, + { + "epoch": 1.4606237684310661, + "grad_norm": 0.9656355381011963, + "learning_rate": 0.00010263396363197004, + "loss": 2.4349, + "step": 16122 + }, + { + "epoch": 1.460714366605513, + "grad_norm": 0.930292010307312, + "learning_rate": 0.00010262792243097927, + "loss": 2.6143, + "step": 16123 + }, + { + "epoch": 1.4608049647799597, + "grad_norm": 0.9648611545562744, + "learning_rate": 0.00010262188122998852, + "loss": 2.664, + "step": 16124 + }, + { + "epoch": 1.4608955629544065, + "grad_norm": 0.8678645491600037, + "learning_rate": 0.00010261584002899778, + "loss": 2.8952, + "step": 16125 + }, + { + "epoch": 1.4609861611288533, + "grad_norm": 0.8465788960456848, + "learning_rate": 0.000102609798828007, + "loss": 2.82, + "step": 16126 + }, + { + "epoch": 1.4610767593033, + "grad_norm": 0.9745088815689087, + "learning_rate": 0.00010260375762701627, + "loss": 3.1275, + "step": 16127 + }, + { + "epoch": 1.4611673574777468, + "grad_norm": 0.880868673324585, + "learning_rate": 0.00010259771642602549, + "loss": 2.6849, + "step": 16128 + }, + { + "epoch": 1.4612579556521936, + "grad_norm": 0.8523460626602173, + "learning_rate": 0.00010259167522503475, + "loss": 2.5487, + "step": 16129 + }, + { + "epoch": 1.4613485538266404, + "grad_norm": 0.9698441028594971, + "learning_rate": 0.00010258563402404398, + "loss": 2.8779, + "step": 16130 + }, + { + "epoch": 1.4614391520010872, + "grad_norm": 1.0876535177230835, + "learning_rate": 0.00010257959282305323, + "loss": 2.8007, + "step": 16131 + }, + { + "epoch": 1.461529750175534, + "grad_norm": 0.8628656268119812, + "learning_rate": 0.00010257355162206248, + "loss": 2.6826, + "step": 16132 + }, + { + "epoch": 1.4616203483499808, + "grad_norm": 0.9331173896789551, + "learning_rate": 0.00010256751042107171, + "loss": 2.9182, + "step": 16133 + }, + { + "epoch": 1.4617109465244276, + "grad_norm": 0.8953608870506287, + "learning_rate": 0.00010256146922008096, + "loss": 2.6507, + "step": 16134 + }, + { + "epoch": 1.4618015446988744, + "grad_norm": 0.9030852913856506, + "learning_rate": 0.00010255542801909019, + "loss": 2.7722, + "step": 16135 + }, + { + "epoch": 1.4618921428733211, + "grad_norm": 0.9163779020309448, + "learning_rate": 0.00010254938681809945, + "loss": 2.8323, + "step": 16136 + }, + { + "epoch": 1.461982741047768, + "grad_norm": 0.8707368969917297, + "learning_rate": 0.00010254334561710867, + "loss": 2.7189, + "step": 16137 + }, + { + "epoch": 1.4620733392222147, + "grad_norm": 0.9519248604774475, + "learning_rate": 0.00010253730441611793, + "loss": 2.8376, + "step": 16138 + }, + { + "epoch": 1.4621639373966615, + "grad_norm": 0.9766550660133362, + "learning_rate": 0.00010253126321512718, + "loss": 2.7914, + "step": 16139 + }, + { + "epoch": 1.4622545355711083, + "grad_norm": 0.9065207839012146, + "learning_rate": 0.00010252522201413642, + "loss": 2.7645, + "step": 16140 + }, + { + "epoch": 1.462345133745555, + "grad_norm": 0.8805143237113953, + "learning_rate": 0.00010251918081314566, + "loss": 2.802, + "step": 16141 + }, + { + "epoch": 1.4624357319200019, + "grad_norm": 0.9156805872917175, + "learning_rate": 0.0001025131396121549, + "loss": 2.7839, + "step": 16142 + }, + { + "epoch": 1.4625263300944487, + "grad_norm": 0.896294891834259, + "learning_rate": 0.00010250709841116415, + "loss": 2.6739, + "step": 16143 + }, + { + "epoch": 1.4626169282688954, + "grad_norm": 0.8940411806106567, + "learning_rate": 0.00010250105721017338, + "loss": 2.6993, + "step": 16144 + }, + { + "epoch": 1.4627075264433422, + "grad_norm": 0.9612843990325928, + "learning_rate": 0.00010249501600918263, + "loss": 2.631, + "step": 16145 + }, + { + "epoch": 1.462798124617789, + "grad_norm": 0.8789357542991638, + "learning_rate": 0.00010248897480819189, + "loss": 2.5781, + "step": 16146 + }, + { + "epoch": 1.4628887227922358, + "grad_norm": 0.999243438243866, + "learning_rate": 0.00010248293360720111, + "loss": 2.5558, + "step": 16147 + }, + { + "epoch": 1.4629793209666826, + "grad_norm": 0.8628416657447815, + "learning_rate": 0.00010247689240621037, + "loss": 2.7805, + "step": 16148 + }, + { + "epoch": 1.4630699191411294, + "grad_norm": 0.8674250841140747, + "learning_rate": 0.0001024708512052196, + "loss": 2.7028, + "step": 16149 + }, + { + "epoch": 1.4631605173155762, + "grad_norm": 0.8135842084884644, + "learning_rate": 0.00010246481000422885, + "loss": 2.1921, + "step": 16150 + }, + { + "epoch": 1.463251115490023, + "grad_norm": 0.8928179144859314, + "learning_rate": 0.00010245876880323809, + "loss": 2.4731, + "step": 16151 + }, + { + "epoch": 1.4633417136644695, + "grad_norm": 0.9340653419494629, + "learning_rate": 0.00010245272760224733, + "loss": 2.5876, + "step": 16152 + }, + { + "epoch": 1.4634323118389165, + "grad_norm": 0.9420751333236694, + "learning_rate": 0.00010244668640125657, + "loss": 2.819, + "step": 16153 + }, + { + "epoch": 1.463522910013363, + "grad_norm": 0.8848419785499573, + "learning_rate": 0.00010244064520026582, + "loss": 2.6735, + "step": 16154 + }, + { + "epoch": 1.46361350818781, + "grad_norm": 0.9156612753868103, + "learning_rate": 0.00010243460399927506, + "loss": 2.8681, + "step": 16155 + }, + { + "epoch": 1.4637041063622567, + "grad_norm": 0.9297181963920593, + "learning_rate": 0.0001024285627982843, + "loss": 2.7806, + "step": 16156 + }, + { + "epoch": 1.4637947045367037, + "grad_norm": 0.9726491570472717, + "learning_rate": 0.00010242252159729356, + "loss": 2.8012, + "step": 16157 + }, + { + "epoch": 1.4638853027111502, + "grad_norm": 0.8416334390640259, + "learning_rate": 0.00010241648039630278, + "loss": 1.9636, + "step": 16158 + }, + { + "epoch": 1.4639759008855973, + "grad_norm": 0.8784343004226685, + "learning_rate": 0.00010241043919531204, + "loss": 2.9543, + "step": 16159 + }, + { + "epoch": 1.4640664990600438, + "grad_norm": 0.9166204333305359, + "learning_rate": 0.00010240439799432126, + "loss": 2.7831, + "step": 16160 + }, + { + "epoch": 1.4641570972344908, + "grad_norm": 0.9680293202400208, + "learning_rate": 0.00010239835679333052, + "loss": 2.6015, + "step": 16161 + }, + { + "epoch": 1.4642476954089374, + "grad_norm": 0.9626327157020569, + "learning_rate": 0.00010239231559233977, + "loss": 2.6551, + "step": 16162 + }, + { + "epoch": 1.4643382935833844, + "grad_norm": 0.8556072115898132, + "learning_rate": 0.000102386274391349, + "loss": 2.659, + "step": 16163 + }, + { + "epoch": 1.464428891757831, + "grad_norm": 0.8241608738899231, + "learning_rate": 0.00010238023319035825, + "loss": 2.3474, + "step": 16164 + }, + { + "epoch": 1.464519489932278, + "grad_norm": 0.8858409523963928, + "learning_rate": 0.00010237419198936748, + "loss": 2.7952, + "step": 16165 + }, + { + "epoch": 1.4646100881067246, + "grad_norm": 0.8905127644538879, + "learning_rate": 0.00010236815078837673, + "loss": 2.5165, + "step": 16166 + }, + { + "epoch": 1.4647006862811716, + "grad_norm": 0.8796435594558716, + "learning_rate": 0.00010236210958738597, + "loss": 2.6348, + "step": 16167 + }, + { + "epoch": 1.4647912844556181, + "grad_norm": 0.8917887806892395, + "learning_rate": 0.00010235606838639523, + "loss": 2.6321, + "step": 16168 + }, + { + "epoch": 1.4648818826300651, + "grad_norm": 0.9651597142219543, + "learning_rate": 0.00010235002718540448, + "loss": 2.6726, + "step": 16169 + }, + { + "epoch": 1.4649724808045117, + "grad_norm": 0.8734707832336426, + "learning_rate": 0.00010234398598441371, + "loss": 2.8811, + "step": 16170 + }, + { + "epoch": 1.4650630789789587, + "grad_norm": 0.9674471020698547, + "learning_rate": 0.00010233794478342296, + "loss": 2.7803, + "step": 16171 + }, + { + "epoch": 1.4651536771534053, + "grad_norm": 0.8463068008422852, + "learning_rate": 0.00010233190358243219, + "loss": 2.8081, + "step": 16172 + }, + { + "epoch": 1.465244275327852, + "grad_norm": 0.9415707588195801, + "learning_rate": 0.00010232586238144144, + "loss": 2.6798, + "step": 16173 + }, + { + "epoch": 1.4653348735022989, + "grad_norm": 0.9117160439491272, + "learning_rate": 0.00010231982118045067, + "loss": 2.7476, + "step": 16174 + }, + { + "epoch": 1.4654254716767456, + "grad_norm": 0.8907058835029602, + "learning_rate": 0.00010231377997945992, + "loss": 2.7974, + "step": 16175 + }, + { + "epoch": 1.4655160698511924, + "grad_norm": 0.8618707060813904, + "learning_rate": 0.00010230773877846918, + "loss": 2.4892, + "step": 16176 + }, + { + "epoch": 1.4656066680256392, + "grad_norm": 0.7511118054389954, + "learning_rate": 0.0001023016975774784, + "loss": 1.9474, + "step": 16177 + }, + { + "epoch": 1.465697266200086, + "grad_norm": 0.8980703353881836, + "learning_rate": 0.00010229565637648766, + "loss": 2.9454, + "step": 16178 + }, + { + "epoch": 1.4657878643745328, + "grad_norm": 0.8901035785675049, + "learning_rate": 0.00010228961517549688, + "loss": 2.6185, + "step": 16179 + }, + { + "epoch": 1.4658784625489796, + "grad_norm": 0.8837894201278687, + "learning_rate": 0.00010228357397450614, + "loss": 2.509, + "step": 16180 + }, + { + "epoch": 1.4659690607234264, + "grad_norm": 0.9317362308502197, + "learning_rate": 0.00010227753277351538, + "loss": 2.737, + "step": 16181 + }, + { + "epoch": 1.4660596588978732, + "grad_norm": 0.9105429649353027, + "learning_rate": 0.00010227149157252463, + "loss": 2.5919, + "step": 16182 + }, + { + "epoch": 1.46615025707232, + "grad_norm": 0.9086498618125916, + "learning_rate": 0.00010226545037153386, + "loss": 2.5555, + "step": 16183 + }, + { + "epoch": 1.4662408552467667, + "grad_norm": 0.9270234704017639, + "learning_rate": 0.00010225940917054311, + "loss": 2.6645, + "step": 16184 + }, + { + "epoch": 1.4663314534212135, + "grad_norm": 0.8622215986251831, + "learning_rate": 0.00010225336796955236, + "loss": 1.927, + "step": 16185 + }, + { + "epoch": 1.4664220515956603, + "grad_norm": 0.8185757994651794, + "learning_rate": 0.00010224732676856159, + "loss": 2.133, + "step": 16186 + }, + { + "epoch": 1.466512649770107, + "grad_norm": 0.9286385178565979, + "learning_rate": 0.00010224128556757084, + "loss": 2.6958, + "step": 16187 + }, + { + "epoch": 1.4666032479445539, + "grad_norm": 1.0190925598144531, + "learning_rate": 0.00010223524436658007, + "loss": 2.8461, + "step": 16188 + }, + { + "epoch": 1.4666938461190007, + "grad_norm": 0.9026345610618591, + "learning_rate": 0.00010222920316558933, + "loss": 2.7764, + "step": 16189 + }, + { + "epoch": 1.4667844442934475, + "grad_norm": 0.8956326246261597, + "learning_rate": 0.00010222316196459855, + "loss": 2.497, + "step": 16190 + }, + { + "epoch": 1.4668750424678942, + "grad_norm": 0.9298624992370605, + "learning_rate": 0.00010221712076360781, + "loss": 2.6409, + "step": 16191 + }, + { + "epoch": 1.466965640642341, + "grad_norm": 0.8779633045196533, + "learning_rate": 0.00010221107956261706, + "loss": 2.5105, + "step": 16192 + }, + { + "epoch": 1.4670562388167878, + "grad_norm": 0.8760823011398315, + "learning_rate": 0.0001022050383616263, + "loss": 2.536, + "step": 16193 + }, + { + "epoch": 1.4671468369912346, + "grad_norm": 0.8464410305023193, + "learning_rate": 0.00010219899716063554, + "loss": 2.5951, + "step": 16194 + }, + { + "epoch": 1.4672374351656814, + "grad_norm": 0.9794614911079407, + "learning_rate": 0.00010219295595964478, + "loss": 2.9094, + "step": 16195 + }, + { + "epoch": 1.4673280333401282, + "grad_norm": 0.9387351274490356, + "learning_rate": 0.00010218691475865402, + "loss": 2.7034, + "step": 16196 + }, + { + "epoch": 1.467418631514575, + "grad_norm": 0.9214311242103577, + "learning_rate": 0.00010218087355766326, + "loss": 2.7183, + "step": 16197 + }, + { + "epoch": 1.4675092296890218, + "grad_norm": 0.9663037657737732, + "learning_rate": 0.0001021748323566725, + "loss": 2.7362, + "step": 16198 + }, + { + "epoch": 1.4675998278634685, + "grad_norm": 0.8673437833786011, + "learning_rate": 0.00010216879115568177, + "loss": 2.8275, + "step": 16199 + }, + { + "epoch": 1.4676904260379153, + "grad_norm": 0.8840517997741699, + "learning_rate": 0.00010216274995469099, + "loss": 2.7096, + "step": 16200 + }, + { + "epoch": 1.4677810242123621, + "grad_norm": 0.8689034581184387, + "learning_rate": 0.00010215670875370025, + "loss": 2.7515, + "step": 16201 + }, + { + "epoch": 1.467871622386809, + "grad_norm": 1.0208184719085693, + "learning_rate": 0.00010215066755270948, + "loss": 2.7532, + "step": 16202 + }, + { + "epoch": 1.4679622205612557, + "grad_norm": 0.9384142160415649, + "learning_rate": 0.00010214462635171873, + "loss": 2.863, + "step": 16203 + }, + { + "epoch": 1.4680528187357025, + "grad_norm": 0.9339054226875305, + "learning_rate": 0.00010213858515072797, + "loss": 2.5543, + "step": 16204 + }, + { + "epoch": 1.4681434169101493, + "grad_norm": 0.9976431131362915, + "learning_rate": 0.00010213254394973721, + "loss": 2.64, + "step": 16205 + }, + { + "epoch": 1.468234015084596, + "grad_norm": 1.0012625455856323, + "learning_rate": 0.00010212650274874646, + "loss": 2.8881, + "step": 16206 + }, + { + "epoch": 1.4683246132590428, + "grad_norm": 0.901428759098053, + "learning_rate": 0.0001021204615477557, + "loss": 2.7315, + "step": 16207 + }, + { + "epoch": 1.4684152114334896, + "grad_norm": 0.8323436975479126, + "learning_rate": 0.00010211442034676496, + "loss": 2.5452, + "step": 16208 + }, + { + "epoch": 1.4685058096079364, + "grad_norm": 0.9907451272010803, + "learning_rate": 0.00010210837914577418, + "loss": 2.4119, + "step": 16209 + }, + { + "epoch": 1.4685964077823832, + "grad_norm": 0.912111759185791, + "learning_rate": 0.00010210233794478344, + "loss": 2.8896, + "step": 16210 + }, + { + "epoch": 1.46868700595683, + "grad_norm": 0.8729226589202881, + "learning_rate": 0.00010209629674379266, + "loss": 2.7097, + "step": 16211 + }, + { + "epoch": 1.4687776041312768, + "grad_norm": 0.9929680824279785, + "learning_rate": 0.00010209025554280192, + "loss": 2.8818, + "step": 16212 + }, + { + "epoch": 1.4688682023057236, + "grad_norm": 0.9763609766960144, + "learning_rate": 0.00010208421434181115, + "loss": 2.7906, + "step": 16213 + }, + { + "epoch": 1.4689588004801704, + "grad_norm": 0.8333255648612976, + "learning_rate": 0.0001020781731408204, + "loss": 2.6127, + "step": 16214 + }, + { + "epoch": 1.4690493986546171, + "grad_norm": 0.9023739099502563, + "learning_rate": 0.00010207213193982965, + "loss": 2.6525, + "step": 16215 + }, + { + "epoch": 1.469139996829064, + "grad_norm": 0.8928757309913635, + "learning_rate": 0.00010206609073883888, + "loss": 2.5243, + "step": 16216 + }, + { + "epoch": 1.4692305950035107, + "grad_norm": 0.8556694388389587, + "learning_rate": 0.00010206004953784813, + "loss": 2.4925, + "step": 16217 + }, + { + "epoch": 1.4693211931779575, + "grad_norm": 1.0154345035552979, + "learning_rate": 0.00010205400833685736, + "loss": 2.7933, + "step": 16218 + }, + { + "epoch": 1.4694117913524043, + "grad_norm": 0.8206966519355774, + "learning_rate": 0.00010204796713586661, + "loss": 2.5889, + "step": 16219 + }, + { + "epoch": 1.469502389526851, + "grad_norm": 0.8910681009292603, + "learning_rate": 0.00010204192593487585, + "loss": 2.6192, + "step": 16220 + }, + { + "epoch": 1.4695929877012979, + "grad_norm": 0.8444541096687317, + "learning_rate": 0.0001020358847338851, + "loss": 2.7641, + "step": 16221 + }, + { + "epoch": 1.4696835858757447, + "grad_norm": 0.8446338176727295, + "learning_rate": 0.00010202984353289435, + "loss": 2.2953, + "step": 16222 + }, + { + "epoch": 1.4697741840501914, + "grad_norm": 0.9137651920318604, + "learning_rate": 0.00010202380233190359, + "loss": 2.6317, + "step": 16223 + }, + { + "epoch": 1.4698647822246382, + "grad_norm": 0.845544695854187, + "learning_rate": 0.00010201776113091284, + "loss": 2.5245, + "step": 16224 + }, + { + "epoch": 1.469955380399085, + "grad_norm": 0.8886789083480835, + "learning_rate": 0.00010201171992992207, + "loss": 2.5262, + "step": 16225 + }, + { + "epoch": 1.4700459785735318, + "grad_norm": 0.9760403633117676, + "learning_rate": 0.00010200567872893132, + "loss": 2.8657, + "step": 16226 + }, + { + "epoch": 1.4701365767479786, + "grad_norm": 0.9289631843566895, + "learning_rate": 0.00010199963752794055, + "loss": 2.8158, + "step": 16227 + }, + { + "epoch": 1.4702271749224254, + "grad_norm": 0.9241437911987305, + "learning_rate": 0.0001019935963269498, + "loss": 3.007, + "step": 16228 + }, + { + "epoch": 1.4703177730968722, + "grad_norm": 0.8948547840118408, + "learning_rate": 0.00010198755512595906, + "loss": 2.8171, + "step": 16229 + }, + { + "epoch": 1.470408371271319, + "grad_norm": 0.9364776015281677, + "learning_rate": 0.00010198151392496828, + "loss": 2.7014, + "step": 16230 + }, + { + "epoch": 1.4704989694457657, + "grad_norm": 0.8868430256843567, + "learning_rate": 0.00010197547272397754, + "loss": 2.8118, + "step": 16231 + }, + { + "epoch": 1.4705895676202125, + "grad_norm": 0.9173663258552551, + "learning_rate": 0.00010196943152298676, + "loss": 2.8386, + "step": 16232 + }, + { + "epoch": 1.470680165794659, + "grad_norm": 0.9102275967597961, + "learning_rate": 0.00010196339032199602, + "loss": 2.6524, + "step": 16233 + }, + { + "epoch": 1.470770763969106, + "grad_norm": 0.8685004115104675, + "learning_rate": 0.00010195734912100526, + "loss": 2.5103, + "step": 16234 + }, + { + "epoch": 1.4708613621435527, + "grad_norm": 0.9407669901847839, + "learning_rate": 0.0001019513079200145, + "loss": 2.6915, + "step": 16235 + }, + { + "epoch": 1.4709519603179997, + "grad_norm": 0.9152872562408447, + "learning_rate": 0.00010194526671902375, + "loss": 2.7836, + "step": 16236 + }, + { + "epoch": 1.4710425584924463, + "grad_norm": 1.025937795639038, + "learning_rate": 0.00010193922551803299, + "loss": 3.1097, + "step": 16237 + }, + { + "epoch": 1.4711331566668933, + "grad_norm": 0.8363569378852844, + "learning_rate": 0.00010193318431704223, + "loss": 2.6344, + "step": 16238 + }, + { + "epoch": 1.4712237548413398, + "grad_norm": 0.9125135540962219, + "learning_rate": 0.00010192714311605147, + "loss": 2.7804, + "step": 16239 + }, + { + "epoch": 1.4713143530157868, + "grad_norm": 0.8971707820892334, + "learning_rate": 0.00010192110191506073, + "loss": 2.3448, + "step": 16240 + }, + { + "epoch": 1.4714049511902334, + "grad_norm": 0.7798356413841248, + "learning_rate": 0.00010191506071406995, + "loss": 2.1055, + "step": 16241 + }, + { + "epoch": 1.4714955493646804, + "grad_norm": 0.8427672386169434, + "learning_rate": 0.00010190901951307921, + "loss": 2.2423, + "step": 16242 + }, + { + "epoch": 1.471586147539127, + "grad_norm": 0.8197469115257263, + "learning_rate": 0.00010190297831208843, + "loss": 1.8666, + "step": 16243 + }, + { + "epoch": 1.471676745713574, + "grad_norm": 0.8479146361351013, + "learning_rate": 0.00010189693711109769, + "loss": 2.7834, + "step": 16244 + }, + { + "epoch": 1.4717673438880206, + "grad_norm": 0.9793601632118225, + "learning_rate": 0.00010189089591010694, + "loss": 2.8605, + "step": 16245 + }, + { + "epoch": 1.4718579420624676, + "grad_norm": 0.7738685607910156, + "learning_rate": 0.00010188485470911617, + "loss": 1.9758, + "step": 16246 + }, + { + "epoch": 1.4719485402369141, + "grad_norm": 0.8817498683929443, + "learning_rate": 0.00010187881350812542, + "loss": 2.7155, + "step": 16247 + }, + { + "epoch": 1.4720391384113611, + "grad_norm": 0.875717043876648, + "learning_rate": 0.00010187277230713466, + "loss": 2.6384, + "step": 16248 + }, + { + "epoch": 1.4721297365858077, + "grad_norm": 0.8959464430809021, + "learning_rate": 0.0001018667311061439, + "loss": 2.6802, + "step": 16249 + }, + { + "epoch": 1.4722203347602547, + "grad_norm": 0.9165173768997192, + "learning_rate": 0.00010186068990515314, + "loss": 2.7431, + "step": 16250 + }, + { + "epoch": 1.4723109329347013, + "grad_norm": 0.8448246121406555, + "learning_rate": 0.00010185464870416239, + "loss": 2.5983, + "step": 16251 + }, + { + "epoch": 1.4724015311091483, + "grad_norm": 0.9355139136314392, + "learning_rate": 0.00010184860750317165, + "loss": 2.9394, + "step": 16252 + }, + { + "epoch": 1.4724921292835949, + "grad_norm": 0.9511482119560242, + "learning_rate": 0.00010184256630218088, + "loss": 2.5232, + "step": 16253 + }, + { + "epoch": 1.4725827274580416, + "grad_norm": 1.036126732826233, + "learning_rate": 0.00010183652510119013, + "loss": 2.6849, + "step": 16254 + }, + { + "epoch": 1.4726733256324884, + "grad_norm": 0.8204269409179688, + "learning_rate": 0.00010183048390019936, + "loss": 2.1159, + "step": 16255 + }, + { + "epoch": 1.4727639238069352, + "grad_norm": 0.8639670610427856, + "learning_rate": 0.00010182444269920861, + "loss": 2.8374, + "step": 16256 + }, + { + "epoch": 1.472854521981382, + "grad_norm": 0.8732779622077942, + "learning_rate": 0.00010181840149821784, + "loss": 2.8977, + "step": 16257 + }, + { + "epoch": 1.4729451201558288, + "grad_norm": 0.9089916348457336, + "learning_rate": 0.00010181236029722709, + "loss": 2.7006, + "step": 16258 + }, + { + "epoch": 1.4730357183302756, + "grad_norm": 0.9319767951965332, + "learning_rate": 0.00010180631909623635, + "loss": 2.9076, + "step": 16259 + }, + { + "epoch": 1.4731263165047224, + "grad_norm": 0.9631838798522949, + "learning_rate": 0.00010180027789524557, + "loss": 2.6284, + "step": 16260 + }, + { + "epoch": 1.4732169146791692, + "grad_norm": 0.8540383577346802, + "learning_rate": 0.00010179423669425483, + "loss": 2.3978, + "step": 16261 + }, + { + "epoch": 1.473307512853616, + "grad_norm": 0.8998319506645203, + "learning_rate": 0.00010178819549326406, + "loss": 2.4488, + "step": 16262 + }, + { + "epoch": 1.4733981110280627, + "grad_norm": 0.8420599102973938, + "learning_rate": 0.00010178215429227332, + "loss": 2.4338, + "step": 16263 + }, + { + "epoch": 1.4734887092025095, + "grad_norm": 0.8232948184013367, + "learning_rate": 0.00010177611309128254, + "loss": 2.0715, + "step": 16264 + }, + { + "epoch": 1.4735793073769563, + "grad_norm": 0.716425895690918, + "learning_rate": 0.0001017700718902918, + "loss": 1.8733, + "step": 16265 + }, + { + "epoch": 1.473669905551403, + "grad_norm": 0.8994917273521423, + "learning_rate": 0.00010176403068930105, + "loss": 2.7247, + "step": 16266 + }, + { + "epoch": 1.4737605037258499, + "grad_norm": 0.9884907007217407, + "learning_rate": 0.00010175798948831028, + "loss": 2.4941, + "step": 16267 + }, + { + "epoch": 1.4738511019002967, + "grad_norm": 0.9541784524917603, + "learning_rate": 0.00010175194828731953, + "loss": 2.6448, + "step": 16268 + }, + { + "epoch": 1.4739417000747435, + "grad_norm": 0.8359546065330505, + "learning_rate": 0.00010174590708632876, + "loss": 2.4718, + "step": 16269 + }, + { + "epoch": 1.4740322982491902, + "grad_norm": 0.8711704015731812, + "learning_rate": 0.00010173986588533801, + "loss": 2.7652, + "step": 16270 + }, + { + "epoch": 1.474122896423637, + "grad_norm": 0.9177829027175903, + "learning_rate": 0.00010173382468434724, + "loss": 2.9632, + "step": 16271 + }, + { + "epoch": 1.4742134945980838, + "grad_norm": 0.9027284979820251, + "learning_rate": 0.0001017277834833565, + "loss": 2.9473, + "step": 16272 + }, + { + "epoch": 1.4743040927725306, + "grad_norm": 0.8269640803337097, + "learning_rate": 0.00010172174228236572, + "loss": 2.6301, + "step": 16273 + }, + { + "epoch": 1.4743946909469774, + "grad_norm": 1.0113766193389893, + "learning_rate": 0.00010171570108137499, + "loss": 2.6923, + "step": 16274 + }, + { + "epoch": 1.4744852891214242, + "grad_norm": 0.8979906439781189, + "learning_rate": 0.00010170965988038423, + "loss": 2.7249, + "step": 16275 + }, + { + "epoch": 1.474575887295871, + "grad_norm": 0.9256106019020081, + "learning_rate": 0.00010170361867939347, + "loss": 2.8602, + "step": 16276 + }, + { + "epoch": 1.4746664854703178, + "grad_norm": 0.866527795791626, + "learning_rate": 0.00010169757747840272, + "loss": 2.5502, + "step": 16277 + }, + { + "epoch": 1.4747570836447645, + "grad_norm": 0.9486052989959717, + "learning_rate": 0.00010169153627741195, + "loss": 2.6429, + "step": 16278 + }, + { + "epoch": 1.4748476818192113, + "grad_norm": 0.8919153213500977, + "learning_rate": 0.0001016854950764212, + "loss": 2.6148, + "step": 16279 + }, + { + "epoch": 1.4749382799936581, + "grad_norm": 0.9030455350875854, + "learning_rate": 0.00010167945387543043, + "loss": 2.7426, + "step": 16280 + }, + { + "epoch": 1.475028878168105, + "grad_norm": 1.005014181137085, + "learning_rate": 0.00010167341267443968, + "loss": 2.8013, + "step": 16281 + }, + { + "epoch": 1.4751194763425517, + "grad_norm": 0.9080285429954529, + "learning_rate": 0.00010166737147344894, + "loss": 2.8294, + "step": 16282 + }, + { + "epoch": 1.4752100745169985, + "grad_norm": 0.7879761457443237, + "learning_rate": 0.00010166133027245816, + "loss": 1.9898, + "step": 16283 + }, + { + "epoch": 1.4753006726914453, + "grad_norm": 0.9745945930480957, + "learning_rate": 0.00010165528907146742, + "loss": 2.7201, + "step": 16284 + }, + { + "epoch": 1.475391270865892, + "grad_norm": 0.908972978591919, + "learning_rate": 0.00010164924787047666, + "loss": 2.8286, + "step": 16285 + }, + { + "epoch": 1.4754818690403388, + "grad_norm": 1.0102506875991821, + "learning_rate": 0.0001016432066694859, + "loss": 2.5667, + "step": 16286 + }, + { + "epoch": 1.4755724672147856, + "grad_norm": 0.9715589284896851, + "learning_rate": 0.00010163716546849514, + "loss": 2.7399, + "step": 16287 + }, + { + "epoch": 1.4756630653892324, + "grad_norm": 0.935084879398346, + "learning_rate": 0.00010163112426750438, + "loss": 2.8596, + "step": 16288 + }, + { + "epoch": 1.4757536635636792, + "grad_norm": 0.895576536655426, + "learning_rate": 0.00010162508306651363, + "loss": 2.444, + "step": 16289 + }, + { + "epoch": 1.475844261738126, + "grad_norm": 0.8934658169746399, + "learning_rate": 0.00010161904186552287, + "loss": 2.6496, + "step": 16290 + }, + { + "epoch": 1.4759348599125728, + "grad_norm": 0.8406203389167786, + "learning_rate": 0.00010161300066453213, + "loss": 2.7346, + "step": 16291 + }, + { + "epoch": 1.4760254580870196, + "grad_norm": 0.8998481631278992, + "learning_rate": 0.00010160695946354135, + "loss": 2.5623, + "step": 16292 + }, + { + "epoch": 1.4761160562614664, + "grad_norm": 0.8825884461402893, + "learning_rate": 0.00010160091826255061, + "loss": 2.7649, + "step": 16293 + }, + { + "epoch": 1.4762066544359131, + "grad_norm": 0.9035083651542664, + "learning_rate": 0.00010159487706155983, + "loss": 2.6844, + "step": 16294 + }, + { + "epoch": 1.47629725261036, + "grad_norm": 0.9548903107643127, + "learning_rate": 0.00010158883586056909, + "loss": 3.0106, + "step": 16295 + }, + { + "epoch": 1.4763878507848067, + "grad_norm": 0.9771421551704407, + "learning_rate": 0.00010158279465957834, + "loss": 2.4417, + "step": 16296 + }, + { + "epoch": 1.4764784489592535, + "grad_norm": 0.8862084746360779, + "learning_rate": 0.00010157675345858757, + "loss": 2.6154, + "step": 16297 + }, + { + "epoch": 1.4765690471337003, + "grad_norm": 0.8976213932037354, + "learning_rate": 0.00010157071225759682, + "loss": 2.7182, + "step": 16298 + }, + { + "epoch": 1.476659645308147, + "grad_norm": 0.8731586337089539, + "learning_rate": 0.00010156467105660605, + "loss": 2.687, + "step": 16299 + }, + { + "epoch": 1.4767502434825939, + "grad_norm": 0.8454179167747498, + "learning_rate": 0.0001015586298556153, + "loss": 2.6149, + "step": 16300 + }, + { + "epoch": 1.4768408416570407, + "grad_norm": 0.9300788640975952, + "learning_rate": 0.00010155258865462454, + "loss": 2.8257, + "step": 16301 + }, + { + "epoch": 1.4769314398314874, + "grad_norm": 0.984755277633667, + "learning_rate": 0.00010154654745363378, + "loss": 2.7947, + "step": 16302 + }, + { + "epoch": 1.4770220380059342, + "grad_norm": 0.9062609076499939, + "learning_rate": 0.00010154050625264302, + "loss": 2.7795, + "step": 16303 + }, + { + "epoch": 1.477112636180381, + "grad_norm": 0.9166389107704163, + "learning_rate": 0.00010153446505165228, + "loss": 2.8018, + "step": 16304 + }, + { + "epoch": 1.4772032343548278, + "grad_norm": 0.8889276385307312, + "learning_rate": 0.00010152842385066153, + "loss": 2.484, + "step": 16305 + }, + { + "epoch": 1.4772938325292746, + "grad_norm": 0.8671256303787231, + "learning_rate": 0.00010152238264967076, + "loss": 2.8627, + "step": 16306 + }, + { + "epoch": 1.4773844307037214, + "grad_norm": 0.9578911662101746, + "learning_rate": 0.00010151634144868001, + "loss": 2.7585, + "step": 16307 + }, + { + "epoch": 1.4774750288781682, + "grad_norm": 0.9270259141921997, + "learning_rate": 0.00010151030024768924, + "loss": 2.4783, + "step": 16308 + }, + { + "epoch": 1.477565627052615, + "grad_norm": 0.8952410817146301, + "learning_rate": 0.00010150425904669849, + "loss": 2.518, + "step": 16309 + }, + { + "epoch": 1.4776562252270617, + "grad_norm": 0.8562290072441101, + "learning_rate": 0.00010149821784570772, + "loss": 2.6286, + "step": 16310 + }, + { + "epoch": 1.4777468234015085, + "grad_norm": 0.8140015006065369, + "learning_rate": 0.00010149217664471697, + "loss": 2.4408, + "step": 16311 + }, + { + "epoch": 1.4778374215759553, + "grad_norm": 0.9327413439750671, + "learning_rate": 0.00010148613544372623, + "loss": 2.949, + "step": 16312 + }, + { + "epoch": 1.4779280197504021, + "grad_norm": 0.9296375513076782, + "learning_rate": 0.00010148009424273545, + "loss": 2.8359, + "step": 16313 + }, + { + "epoch": 1.4780186179248487, + "grad_norm": 0.9312794208526611, + "learning_rate": 0.00010147405304174471, + "loss": 2.7586, + "step": 16314 + }, + { + "epoch": 1.4781092160992957, + "grad_norm": 0.9186202883720398, + "learning_rate": 0.00010146801184075393, + "loss": 2.6297, + "step": 16315 + }, + { + "epoch": 1.4781998142737423, + "grad_norm": 0.9274389147758484, + "learning_rate": 0.0001014619706397632, + "loss": 2.6551, + "step": 16316 + }, + { + "epoch": 1.4782904124481893, + "grad_norm": 0.8520640730857849, + "learning_rate": 0.00010145592943877243, + "loss": 2.5653, + "step": 16317 + }, + { + "epoch": 1.4783810106226358, + "grad_norm": 1.081243872642517, + "learning_rate": 0.00010144988823778168, + "loss": 2.3446, + "step": 16318 + }, + { + "epoch": 1.4784716087970828, + "grad_norm": 0.821619987487793, + "learning_rate": 0.00010144384703679092, + "loss": 2.3516, + "step": 16319 + }, + { + "epoch": 1.4785622069715294, + "grad_norm": 0.8680546879768372, + "learning_rate": 0.00010143780583580016, + "loss": 2.6332, + "step": 16320 + }, + { + "epoch": 1.4786528051459764, + "grad_norm": 0.9461286664009094, + "learning_rate": 0.0001014317646348094, + "loss": 2.9073, + "step": 16321 + }, + { + "epoch": 1.478743403320423, + "grad_norm": 0.9333396553993225, + "learning_rate": 0.00010142572343381864, + "loss": 2.811, + "step": 16322 + }, + { + "epoch": 1.47883400149487, + "grad_norm": 0.8846263885498047, + "learning_rate": 0.0001014196822328279, + "loss": 2.9687, + "step": 16323 + }, + { + "epoch": 1.4789245996693166, + "grad_norm": 0.7926561832427979, + "learning_rate": 0.00010141364103183712, + "loss": 2.1326, + "step": 16324 + }, + { + "epoch": 1.4790151978437636, + "grad_norm": 0.927221417427063, + "learning_rate": 0.00010140759983084638, + "loss": 2.7215, + "step": 16325 + }, + { + "epoch": 1.4791057960182101, + "grad_norm": 0.9106737375259399, + "learning_rate": 0.00010140155862985563, + "loss": 2.8765, + "step": 16326 + }, + { + "epoch": 1.4791963941926571, + "grad_norm": 0.8286792635917664, + "learning_rate": 0.00010139551742886486, + "loss": 1.9152, + "step": 16327 + }, + { + "epoch": 1.4792869923671037, + "grad_norm": 0.9176548719406128, + "learning_rate": 0.00010138947622787411, + "loss": 2.7157, + "step": 16328 + }, + { + "epoch": 1.4793775905415507, + "grad_norm": 0.9133270978927612, + "learning_rate": 0.00010138343502688335, + "loss": 2.7339, + "step": 16329 + }, + { + "epoch": 1.4794681887159973, + "grad_norm": 0.9186022877693176, + "learning_rate": 0.0001013773938258926, + "loss": 2.7533, + "step": 16330 + }, + { + "epoch": 1.4795587868904443, + "grad_norm": 0.9576243162155151, + "learning_rate": 0.00010137135262490183, + "loss": 2.0051, + "step": 16331 + }, + { + "epoch": 1.4796493850648909, + "grad_norm": 0.9221391081809998, + "learning_rate": 0.00010136531142391108, + "loss": 2.5265, + "step": 16332 + }, + { + "epoch": 1.4797399832393379, + "grad_norm": 1.0090417861938477, + "learning_rate": 0.00010135927022292031, + "loss": 2.7641, + "step": 16333 + }, + { + "epoch": 1.4798305814137844, + "grad_norm": 0.945004940032959, + "learning_rate": 0.00010135322902192956, + "loss": 2.8475, + "step": 16334 + }, + { + "epoch": 1.4799211795882312, + "grad_norm": 0.933580219745636, + "learning_rate": 0.00010134718782093882, + "loss": 2.7931, + "step": 16335 + }, + { + "epoch": 1.480011777762678, + "grad_norm": 0.9002946615219116, + "learning_rate": 0.00010134114661994805, + "loss": 2.9078, + "step": 16336 + }, + { + "epoch": 1.4801023759371248, + "grad_norm": 0.8993304967880249, + "learning_rate": 0.0001013351054189573, + "loss": 2.7119, + "step": 16337 + }, + { + "epoch": 1.4801929741115716, + "grad_norm": 0.8931337594985962, + "learning_rate": 0.00010132906421796653, + "loss": 2.7848, + "step": 16338 + }, + { + "epoch": 1.4802835722860184, + "grad_norm": 0.9377882480621338, + "learning_rate": 0.00010132302301697578, + "loss": 3.0318, + "step": 16339 + }, + { + "epoch": 1.4803741704604652, + "grad_norm": 0.8863666653633118, + "learning_rate": 0.00010131698181598502, + "loss": 2.5484, + "step": 16340 + }, + { + "epoch": 1.480464768634912, + "grad_norm": 0.9138427972793579, + "learning_rate": 0.00010131094061499426, + "loss": 2.2829, + "step": 16341 + }, + { + "epoch": 1.4805553668093587, + "grad_norm": 0.9150992035865784, + "learning_rate": 0.00010130489941400351, + "loss": 2.6499, + "step": 16342 + }, + { + "epoch": 1.4806459649838055, + "grad_norm": 0.8850988149642944, + "learning_rate": 0.00010129885821301275, + "loss": 2.5236, + "step": 16343 + }, + { + "epoch": 1.4807365631582523, + "grad_norm": 0.9101285338401794, + "learning_rate": 0.000101292817012022, + "loss": 2.8277, + "step": 16344 + }, + { + "epoch": 1.480827161332699, + "grad_norm": 0.8161450028419495, + "learning_rate": 0.00010128677581103123, + "loss": 2.564, + "step": 16345 + }, + { + "epoch": 1.4809177595071459, + "grad_norm": 0.8994805216789246, + "learning_rate": 0.00010128073461004049, + "loss": 2.8311, + "step": 16346 + }, + { + "epoch": 1.4810083576815927, + "grad_norm": 0.8627252578735352, + "learning_rate": 0.00010127469340904971, + "loss": 2.5452, + "step": 16347 + }, + { + "epoch": 1.4810989558560395, + "grad_norm": 0.7774037718772888, + "learning_rate": 0.00010126865220805897, + "loss": 1.9559, + "step": 16348 + }, + { + "epoch": 1.4811895540304862, + "grad_norm": 0.9159610271453857, + "learning_rate": 0.00010126261100706822, + "loss": 2.6474, + "step": 16349 + }, + { + "epoch": 1.481280152204933, + "grad_norm": 0.9542679190635681, + "learning_rate": 0.00010125656980607745, + "loss": 2.7049, + "step": 16350 + }, + { + "epoch": 1.4813707503793798, + "grad_norm": 0.9112997651100159, + "learning_rate": 0.0001012505286050867, + "loss": 3.0825, + "step": 16351 + }, + { + "epoch": 1.4814613485538266, + "grad_norm": 0.8708102703094482, + "learning_rate": 0.00010124448740409593, + "loss": 2.8408, + "step": 16352 + }, + { + "epoch": 1.4815519467282734, + "grad_norm": 0.9894232749938965, + "learning_rate": 0.00010123844620310518, + "loss": 3.2265, + "step": 16353 + }, + { + "epoch": 1.4816425449027202, + "grad_norm": 0.9989529252052307, + "learning_rate": 0.00010123240500211441, + "loss": 2.7664, + "step": 16354 + }, + { + "epoch": 1.481733143077167, + "grad_norm": 0.8609356880187988, + "learning_rate": 0.00010122636380112368, + "loss": 2.7874, + "step": 16355 + }, + { + "epoch": 1.4818237412516138, + "grad_norm": 0.9716179966926575, + "learning_rate": 0.00010122032260013292, + "loss": 2.837, + "step": 16356 + }, + { + "epoch": 1.4819143394260605, + "grad_norm": 0.910101056098938, + "learning_rate": 0.00010121428139914216, + "loss": 2.738, + "step": 16357 + }, + { + "epoch": 1.4820049376005073, + "grad_norm": 0.9243163466453552, + "learning_rate": 0.0001012082401981514, + "loss": 2.7847, + "step": 16358 + }, + { + "epoch": 1.4820955357749541, + "grad_norm": 0.8858658075332642, + "learning_rate": 0.00010120219899716064, + "loss": 2.7988, + "step": 16359 + }, + { + "epoch": 1.482186133949401, + "grad_norm": 0.9527480602264404, + "learning_rate": 0.00010119615779616989, + "loss": 2.732, + "step": 16360 + }, + { + "epoch": 1.4822767321238477, + "grad_norm": 0.901532769203186, + "learning_rate": 0.00010119011659517912, + "loss": 2.6694, + "step": 16361 + }, + { + "epoch": 1.4823673302982945, + "grad_norm": 0.8930734992027283, + "learning_rate": 0.00010118407539418837, + "loss": 2.7248, + "step": 16362 + }, + { + "epoch": 1.4824579284727413, + "grad_norm": 0.9895654916763306, + "learning_rate": 0.0001011780341931976, + "loss": 2.6962, + "step": 16363 + }, + { + "epoch": 1.482548526647188, + "grad_norm": 0.9838057160377502, + "learning_rate": 0.00010117199299220685, + "loss": 2.873, + "step": 16364 + }, + { + "epoch": 1.4826391248216348, + "grad_norm": 0.9276647567749023, + "learning_rate": 0.00010116595179121611, + "loss": 2.5568, + "step": 16365 + }, + { + "epoch": 1.4827297229960816, + "grad_norm": 0.9067767858505249, + "learning_rate": 0.00010115991059022533, + "loss": 2.7801, + "step": 16366 + }, + { + "epoch": 1.4828203211705284, + "grad_norm": 0.8999720215797424, + "learning_rate": 0.00010115386938923459, + "loss": 2.6533, + "step": 16367 + }, + { + "epoch": 1.4829109193449752, + "grad_norm": 0.8538822531700134, + "learning_rate": 0.00010114782818824383, + "loss": 2.8592, + "step": 16368 + }, + { + "epoch": 1.483001517519422, + "grad_norm": 0.9228870272636414, + "learning_rate": 0.00010114178698725307, + "loss": 2.57, + "step": 16369 + }, + { + "epoch": 1.4830921156938688, + "grad_norm": 0.8186926245689392, + "learning_rate": 0.00010113574578626231, + "loss": 2.6469, + "step": 16370 + }, + { + "epoch": 1.4831827138683156, + "grad_norm": 0.8744476437568665, + "learning_rate": 0.00010112970458527156, + "loss": 2.5505, + "step": 16371 + }, + { + "epoch": 1.4832733120427624, + "grad_norm": 0.8644199967384338, + "learning_rate": 0.0001011236633842808, + "loss": 2.6415, + "step": 16372 + }, + { + "epoch": 1.4833639102172091, + "grad_norm": 0.7169846892356873, + "learning_rate": 0.00010111762218329004, + "loss": 1.9766, + "step": 16373 + }, + { + "epoch": 1.483454508391656, + "grad_norm": 0.9057109951972961, + "learning_rate": 0.00010111158098229929, + "loss": 2.5822, + "step": 16374 + }, + { + "epoch": 1.4835451065661027, + "grad_norm": 0.8548807501792908, + "learning_rate": 0.00010110553978130852, + "loss": 2.7377, + "step": 16375 + }, + { + "epoch": 1.4836357047405495, + "grad_norm": 0.9632381200790405, + "learning_rate": 0.00010109949858031778, + "loss": 2.5943, + "step": 16376 + }, + { + "epoch": 1.4837263029149963, + "grad_norm": 0.8986362814903259, + "learning_rate": 0.000101093457379327, + "loss": 2.6989, + "step": 16377 + }, + { + "epoch": 1.483816901089443, + "grad_norm": 0.8789224624633789, + "learning_rate": 0.00010108741617833626, + "loss": 2.7481, + "step": 16378 + }, + { + "epoch": 1.4839074992638899, + "grad_norm": 0.851360023021698, + "learning_rate": 0.00010108137497734551, + "loss": 2.6553, + "step": 16379 + }, + { + "epoch": 1.4839980974383367, + "grad_norm": 0.9530118107795715, + "learning_rate": 0.00010107533377635474, + "loss": 2.9573, + "step": 16380 + }, + { + "epoch": 1.4840886956127834, + "grad_norm": 0.9862829446792603, + "learning_rate": 0.00010106929257536399, + "loss": 2.5769, + "step": 16381 + }, + { + "epoch": 1.4841792937872302, + "grad_norm": 0.8900852799415588, + "learning_rate": 0.00010106325137437323, + "loss": 2.7692, + "step": 16382 + }, + { + "epoch": 1.484269891961677, + "grad_norm": 0.8750439882278442, + "learning_rate": 0.00010105721017338247, + "loss": 2.8286, + "step": 16383 + }, + { + "epoch": 1.4843604901361238, + "grad_norm": 0.9477570056915283, + "learning_rate": 0.00010105116897239171, + "loss": 2.5078, + "step": 16384 + }, + { + "epoch": 1.4844510883105706, + "grad_norm": 0.9361546635627747, + "learning_rate": 0.00010104512777140096, + "loss": 2.9626, + "step": 16385 + }, + { + "epoch": 1.4845416864850174, + "grad_norm": 0.9405622482299805, + "learning_rate": 0.00010103908657041022, + "loss": 3.0906, + "step": 16386 + }, + { + "epoch": 1.4846322846594642, + "grad_norm": 0.8513687252998352, + "learning_rate": 0.00010103304536941944, + "loss": 2.6346, + "step": 16387 + }, + { + "epoch": 1.484722882833911, + "grad_norm": 0.9233656525611877, + "learning_rate": 0.0001010270041684287, + "loss": 2.696, + "step": 16388 + }, + { + "epoch": 1.4848134810083577, + "grad_norm": 0.9406001567840576, + "learning_rate": 0.00010102096296743793, + "loss": 2.7082, + "step": 16389 + }, + { + "epoch": 1.4849040791828045, + "grad_norm": 0.8898848295211792, + "learning_rate": 0.00010101492176644718, + "loss": 2.6212, + "step": 16390 + }, + { + "epoch": 1.4849946773572513, + "grad_norm": 0.9215397238731384, + "learning_rate": 0.00010100888056545641, + "loss": 2.7, + "step": 16391 + }, + { + "epoch": 1.4850852755316981, + "grad_norm": 0.896547257900238, + "learning_rate": 0.00010100283936446566, + "loss": 2.6814, + "step": 16392 + }, + { + "epoch": 1.485175873706145, + "grad_norm": 0.775710940361023, + "learning_rate": 0.0001009967981634749, + "loss": 2.2806, + "step": 16393 + }, + { + "epoch": 1.4852664718805917, + "grad_norm": 0.952246904373169, + "learning_rate": 0.00010099075696248414, + "loss": 2.8408, + "step": 16394 + }, + { + "epoch": 1.4853570700550383, + "grad_norm": 0.7680861353874207, + "learning_rate": 0.0001009847157614934, + "loss": 2.201, + "step": 16395 + }, + { + "epoch": 1.4854476682294853, + "grad_norm": 0.9060443043708801, + "learning_rate": 0.00010097867456050262, + "loss": 2.8012, + "step": 16396 + }, + { + "epoch": 1.4855382664039318, + "grad_norm": 0.941781222820282, + "learning_rate": 0.00010097263335951189, + "loss": 2.6808, + "step": 16397 + }, + { + "epoch": 1.4856288645783788, + "grad_norm": 0.7640252709388733, + "learning_rate": 0.0001009665921585211, + "loss": 2.0859, + "step": 16398 + }, + { + "epoch": 1.4857194627528254, + "grad_norm": 0.884688675403595, + "learning_rate": 0.00010096055095753037, + "loss": 2.7379, + "step": 16399 + }, + { + "epoch": 1.4858100609272724, + "grad_norm": 0.9362975358963013, + "learning_rate": 0.0001009545097565396, + "loss": 2.5696, + "step": 16400 + }, + { + "epoch": 1.485900659101719, + "grad_norm": 0.8877776265144348, + "learning_rate": 0.00010094846855554885, + "loss": 2.5901, + "step": 16401 + }, + { + "epoch": 1.485991257276166, + "grad_norm": 0.914934515953064, + "learning_rate": 0.0001009424273545581, + "loss": 2.67, + "step": 16402 + }, + { + "epoch": 1.4860818554506126, + "grad_norm": 0.8945679068565369, + "learning_rate": 0.00010093638615356733, + "loss": 3.034, + "step": 16403 + }, + { + "epoch": 1.4861724536250596, + "grad_norm": 0.7811424732208252, + "learning_rate": 0.00010093034495257658, + "loss": 2.1516, + "step": 16404 + }, + { + "epoch": 1.4862630517995061, + "grad_norm": 0.8809052109718323, + "learning_rate": 0.00010092430375158581, + "loss": 2.6811, + "step": 16405 + }, + { + "epoch": 1.4863536499739531, + "grad_norm": 0.9410752654075623, + "learning_rate": 0.00010091826255059506, + "loss": 2.8303, + "step": 16406 + }, + { + "epoch": 1.4864442481483997, + "grad_norm": 0.9084923267364502, + "learning_rate": 0.0001009122213496043, + "loss": 2.5897, + "step": 16407 + }, + { + "epoch": 1.4865348463228467, + "grad_norm": 0.9101558327674866, + "learning_rate": 0.00010090618014861356, + "loss": 2.6761, + "step": 16408 + }, + { + "epoch": 1.4866254444972933, + "grad_norm": 0.9390742778778076, + "learning_rate": 0.0001009001389476228, + "loss": 2.7998, + "step": 16409 + }, + { + "epoch": 1.4867160426717403, + "grad_norm": 0.9254473447799683, + "learning_rate": 0.00010089409774663204, + "loss": 2.4709, + "step": 16410 + }, + { + "epoch": 1.4868066408461869, + "grad_norm": 0.9375855326652527, + "learning_rate": 0.00010088805654564128, + "loss": 2.5728, + "step": 16411 + }, + { + "epoch": 1.4868972390206339, + "grad_norm": 0.9166346788406372, + "learning_rate": 0.00010088201534465052, + "loss": 2.8977, + "step": 16412 + }, + { + "epoch": 1.4869878371950804, + "grad_norm": 0.892586350440979, + "learning_rate": 0.00010087597414365977, + "loss": 2.5674, + "step": 16413 + }, + { + "epoch": 1.4870784353695274, + "grad_norm": 0.9813213348388672, + "learning_rate": 0.000100869932942669, + "loss": 2.8028, + "step": 16414 + }, + { + "epoch": 1.487169033543974, + "grad_norm": 0.9104825854301453, + "learning_rate": 0.00010086389174167825, + "loss": 2.6342, + "step": 16415 + }, + { + "epoch": 1.4872596317184208, + "grad_norm": 0.9057201147079468, + "learning_rate": 0.00010085785054068751, + "loss": 2.5827, + "step": 16416 + }, + { + "epoch": 1.4873502298928676, + "grad_norm": 0.8948456645011902, + "learning_rate": 0.00010085180933969673, + "loss": 2.5118, + "step": 16417 + }, + { + "epoch": 1.4874408280673144, + "grad_norm": 0.8796730041503906, + "learning_rate": 0.00010084576813870599, + "loss": 2.2275, + "step": 16418 + }, + { + "epoch": 1.4875314262417612, + "grad_norm": 0.73371422290802, + "learning_rate": 0.00010083972693771521, + "loss": 1.8347, + "step": 16419 + }, + { + "epoch": 1.487622024416208, + "grad_norm": 0.8871137499809265, + "learning_rate": 0.00010083368573672447, + "loss": 2.5462, + "step": 16420 + }, + { + "epoch": 1.4877126225906547, + "grad_norm": 0.8552727103233337, + "learning_rate": 0.0001008276445357337, + "loss": 2.7496, + "step": 16421 + }, + { + "epoch": 1.4878032207651015, + "grad_norm": 0.9118919968605042, + "learning_rate": 0.00010082160333474295, + "loss": 2.6801, + "step": 16422 + }, + { + "epoch": 1.4878938189395483, + "grad_norm": 0.8465383052825928, + "learning_rate": 0.00010081556213375219, + "loss": 2.6156, + "step": 16423 + }, + { + "epoch": 1.487984417113995, + "grad_norm": 0.9900501370429993, + "learning_rate": 0.00010080952093276144, + "loss": 2.7302, + "step": 16424 + }, + { + "epoch": 1.4880750152884419, + "grad_norm": 0.8868098855018616, + "learning_rate": 0.00010080347973177068, + "loss": 2.4765, + "step": 16425 + }, + { + "epoch": 1.4881656134628887, + "grad_norm": 1.012864112854004, + "learning_rate": 0.00010079743853077992, + "loss": 3.0194, + "step": 16426 + }, + { + "epoch": 1.4882562116373355, + "grad_norm": 0.8882436752319336, + "learning_rate": 0.00010079139732978918, + "loss": 2.6563, + "step": 16427 + }, + { + "epoch": 1.4883468098117822, + "grad_norm": 0.8982276916503906, + "learning_rate": 0.0001007853561287984, + "loss": 2.6181, + "step": 16428 + }, + { + "epoch": 1.488437407986229, + "grad_norm": 0.8857689499855042, + "learning_rate": 0.00010077931492780766, + "loss": 2.9672, + "step": 16429 + }, + { + "epoch": 1.4885280061606758, + "grad_norm": 0.8781326413154602, + "learning_rate": 0.00010077327372681688, + "loss": 2.6076, + "step": 16430 + }, + { + "epoch": 1.4886186043351226, + "grad_norm": 0.8701682686805725, + "learning_rate": 0.00010076723252582614, + "loss": 2.7451, + "step": 16431 + }, + { + "epoch": 1.4887092025095694, + "grad_norm": 0.8471136093139648, + "learning_rate": 0.00010076119132483539, + "loss": 2.2677, + "step": 16432 + }, + { + "epoch": 1.4887998006840162, + "grad_norm": 0.8927145600318909, + "learning_rate": 0.00010075515012384462, + "loss": 2.7421, + "step": 16433 + }, + { + "epoch": 1.488890398858463, + "grad_norm": 0.9482345581054688, + "learning_rate": 0.00010074910892285387, + "loss": 2.0906, + "step": 16434 + }, + { + "epoch": 1.4889809970329098, + "grad_norm": 0.9015002846717834, + "learning_rate": 0.0001007430677218631, + "loss": 2.541, + "step": 16435 + }, + { + "epoch": 1.4890715952073565, + "grad_norm": 0.9042606949806213, + "learning_rate": 0.00010073702652087235, + "loss": 2.6655, + "step": 16436 + }, + { + "epoch": 1.4891621933818033, + "grad_norm": 0.8937274813652039, + "learning_rate": 0.00010073098531988159, + "loss": 2.5437, + "step": 16437 + }, + { + "epoch": 1.4892527915562501, + "grad_norm": 0.8850977420806885, + "learning_rate": 0.00010072494411889083, + "loss": 2.7373, + "step": 16438 + }, + { + "epoch": 1.489343389730697, + "grad_norm": 0.8991812467575073, + "learning_rate": 0.0001007189029179001, + "loss": 2.5413, + "step": 16439 + }, + { + "epoch": 1.4894339879051437, + "grad_norm": 0.8971765041351318, + "learning_rate": 0.00010071286171690933, + "loss": 2.7179, + "step": 16440 + }, + { + "epoch": 1.4895245860795905, + "grad_norm": 0.9521796107292175, + "learning_rate": 0.00010070682051591858, + "loss": 2.774, + "step": 16441 + }, + { + "epoch": 1.4896151842540373, + "grad_norm": 0.9271578788757324, + "learning_rate": 0.00010070077931492781, + "loss": 2.6428, + "step": 16442 + }, + { + "epoch": 1.489705782428484, + "grad_norm": 0.8754159212112427, + "learning_rate": 0.00010069473811393706, + "loss": 2.5856, + "step": 16443 + }, + { + "epoch": 1.4897963806029308, + "grad_norm": 0.923878014087677, + "learning_rate": 0.00010068869691294629, + "loss": 2.8059, + "step": 16444 + }, + { + "epoch": 1.4898869787773776, + "grad_norm": 0.8929242491722107, + "learning_rate": 0.00010068265571195554, + "loss": 3.1548, + "step": 16445 + }, + { + "epoch": 1.4899775769518244, + "grad_norm": 0.9270483255386353, + "learning_rate": 0.0001006766145109648, + "loss": 2.6298, + "step": 16446 + }, + { + "epoch": 1.4900681751262712, + "grad_norm": 0.8763620853424072, + "learning_rate": 0.00010067057330997402, + "loss": 2.7418, + "step": 16447 + }, + { + "epoch": 1.490158773300718, + "grad_norm": 0.8820145726203918, + "learning_rate": 0.00010066453210898328, + "loss": 2.8904, + "step": 16448 + }, + { + "epoch": 1.4902493714751648, + "grad_norm": 0.9522371292114258, + "learning_rate": 0.0001006584909079925, + "loss": 2.8267, + "step": 16449 + }, + { + "epoch": 1.4903399696496116, + "grad_norm": 0.9400720000267029, + "learning_rate": 0.00010065244970700176, + "loss": 2.7367, + "step": 16450 + }, + { + "epoch": 1.4904305678240584, + "grad_norm": 0.9036501049995422, + "learning_rate": 0.00010064640850601099, + "loss": 2.9616, + "step": 16451 + }, + { + "epoch": 1.4905211659985051, + "grad_norm": 0.9278193116188049, + "learning_rate": 0.00010064036730502025, + "loss": 2.8778, + "step": 16452 + }, + { + "epoch": 1.490611764172952, + "grad_norm": 0.8641403317451477, + "learning_rate": 0.00010063432610402948, + "loss": 2.5023, + "step": 16453 + }, + { + "epoch": 1.4907023623473987, + "grad_norm": 0.7405550479888916, + "learning_rate": 0.00010062828490303873, + "loss": 1.9536, + "step": 16454 + }, + { + "epoch": 1.4907929605218455, + "grad_norm": 0.9569677114486694, + "learning_rate": 0.00010062224370204798, + "loss": 2.8462, + "step": 16455 + }, + { + "epoch": 1.4908835586962923, + "grad_norm": 0.8647177815437317, + "learning_rate": 0.00010061620250105721, + "loss": 2.5289, + "step": 16456 + }, + { + "epoch": 1.490974156870739, + "grad_norm": 0.9862083792686462, + "learning_rate": 0.00010061016130006646, + "loss": 2.8821, + "step": 16457 + }, + { + "epoch": 1.4910647550451859, + "grad_norm": 0.9822497367858887, + "learning_rate": 0.00010060412009907569, + "loss": 2.7563, + "step": 16458 + }, + { + "epoch": 1.4911553532196327, + "grad_norm": 0.9375252723693848, + "learning_rate": 0.00010059807889808495, + "loss": 2.7199, + "step": 16459 + }, + { + "epoch": 1.4912459513940794, + "grad_norm": 0.8675227165222168, + "learning_rate": 0.00010059203769709417, + "loss": 2.5831, + "step": 16460 + }, + { + "epoch": 1.4913365495685262, + "grad_norm": 0.8642427921295166, + "learning_rate": 0.00010058599649610343, + "loss": 2.6578, + "step": 16461 + }, + { + "epoch": 1.491427147742973, + "grad_norm": 0.7568634152412415, + "learning_rate": 0.00010057995529511268, + "loss": 2.1711, + "step": 16462 + }, + { + "epoch": 1.4915177459174198, + "grad_norm": 0.9337765574455261, + "learning_rate": 0.00010057391409412192, + "loss": 2.8224, + "step": 16463 + }, + { + "epoch": 1.4916083440918666, + "grad_norm": 0.9455143213272095, + "learning_rate": 0.00010056787289313116, + "loss": 2.5901, + "step": 16464 + }, + { + "epoch": 1.4916989422663134, + "grad_norm": 0.8434531688690186, + "learning_rate": 0.0001005618316921404, + "loss": 2.6585, + "step": 16465 + }, + { + "epoch": 1.4917895404407602, + "grad_norm": 0.6490452885627747, + "learning_rate": 0.00010055579049114965, + "loss": 1.4106, + "step": 16466 + }, + { + "epoch": 1.491880138615207, + "grad_norm": 0.8739264011383057, + "learning_rate": 0.00010054974929015888, + "loss": 2.6291, + "step": 16467 + }, + { + "epoch": 1.4919707367896538, + "grad_norm": 0.837227463722229, + "learning_rate": 0.00010054370808916813, + "loss": 2.8413, + "step": 16468 + }, + { + "epoch": 1.4920613349641005, + "grad_norm": 0.7859196662902832, + "learning_rate": 0.00010053766688817739, + "loss": 2.2275, + "step": 16469 + }, + { + "epoch": 1.4921519331385473, + "grad_norm": 0.8884235620498657, + "learning_rate": 0.00010053162568718661, + "loss": 2.812, + "step": 16470 + }, + { + "epoch": 1.4922425313129941, + "grad_norm": 0.9585059881210327, + "learning_rate": 0.00010052558448619587, + "loss": 2.7524, + "step": 16471 + }, + { + "epoch": 1.492333129487441, + "grad_norm": 0.9275176525115967, + "learning_rate": 0.0001005195432852051, + "loss": 2.4854, + "step": 16472 + }, + { + "epoch": 1.4924237276618877, + "grad_norm": 0.9107611179351807, + "learning_rate": 0.00010051350208421435, + "loss": 2.711, + "step": 16473 + }, + { + "epoch": 1.4925143258363345, + "grad_norm": 0.9399802088737488, + "learning_rate": 0.00010050746088322359, + "loss": 2.8451, + "step": 16474 + }, + { + "epoch": 1.4926049240107813, + "grad_norm": 1.0042895078659058, + "learning_rate": 0.00010050141968223283, + "loss": 2.7764, + "step": 16475 + }, + { + "epoch": 1.4926955221852278, + "grad_norm": 0.851216733455658, + "learning_rate": 0.00010049537848124208, + "loss": 2.7907, + "step": 16476 + }, + { + "epoch": 1.4927861203596748, + "grad_norm": 0.8813603520393372, + "learning_rate": 0.00010048933728025131, + "loss": 2.9704, + "step": 16477 + }, + { + "epoch": 1.4928767185341214, + "grad_norm": 0.9232842326164246, + "learning_rate": 0.00010048329607926058, + "loss": 2.7967, + "step": 16478 + }, + { + "epoch": 1.4929673167085684, + "grad_norm": 0.9161113500595093, + "learning_rate": 0.0001004772548782698, + "loss": 2.9983, + "step": 16479 + }, + { + "epoch": 1.493057914883015, + "grad_norm": 0.9168485999107361, + "learning_rate": 0.00010047121367727906, + "loss": 2.6417, + "step": 16480 + }, + { + "epoch": 1.493148513057462, + "grad_norm": 0.9280624389648438, + "learning_rate": 0.00010046517247628828, + "loss": 2.6969, + "step": 16481 + }, + { + "epoch": 1.4932391112319086, + "grad_norm": 0.8851504921913147, + "learning_rate": 0.00010045913127529754, + "loss": 2.7281, + "step": 16482 + }, + { + "epoch": 1.4933297094063556, + "grad_norm": 0.8233470320701599, + "learning_rate": 0.00010045309007430676, + "loss": 1.9533, + "step": 16483 + }, + { + "epoch": 1.4934203075808021, + "grad_norm": 0.9051110744476318, + "learning_rate": 0.00010044704887331602, + "loss": 2.6659, + "step": 16484 + }, + { + "epoch": 1.4935109057552491, + "grad_norm": 0.9930852055549622, + "learning_rate": 0.00010044100767232527, + "loss": 2.7572, + "step": 16485 + }, + { + "epoch": 1.4936015039296957, + "grad_norm": 0.8711932897567749, + "learning_rate": 0.0001004349664713345, + "loss": 2.63, + "step": 16486 + }, + { + "epoch": 1.4936921021041427, + "grad_norm": 0.8577189445495605, + "learning_rate": 0.00010042892527034375, + "loss": 2.7514, + "step": 16487 + }, + { + "epoch": 1.4937827002785893, + "grad_norm": 0.843389093875885, + "learning_rate": 0.00010042288406935298, + "loss": 2.629, + "step": 16488 + }, + { + "epoch": 1.4938732984530363, + "grad_norm": 1.0503966808319092, + "learning_rate": 0.00010041684286836223, + "loss": 2.6988, + "step": 16489 + }, + { + "epoch": 1.4939638966274829, + "grad_norm": 0.8897389769554138, + "learning_rate": 0.00010041080166737147, + "loss": 2.4925, + "step": 16490 + }, + { + "epoch": 1.4940544948019299, + "grad_norm": 0.8369511961936951, + "learning_rate": 0.00010040476046638073, + "loss": 1.8931, + "step": 16491 + }, + { + "epoch": 1.4941450929763764, + "grad_norm": 0.9495627880096436, + "learning_rate": 0.00010039871926538997, + "loss": 2.8087, + "step": 16492 + }, + { + "epoch": 1.4942356911508234, + "grad_norm": 1.0307716131210327, + "learning_rate": 0.00010039267806439921, + "loss": 2.7797, + "step": 16493 + }, + { + "epoch": 1.49432628932527, + "grad_norm": 0.878555953502655, + "learning_rate": 0.00010038663686340846, + "loss": 2.8891, + "step": 16494 + }, + { + "epoch": 1.494416887499717, + "grad_norm": 0.959223747253418, + "learning_rate": 0.00010038059566241769, + "loss": 2.5944, + "step": 16495 + }, + { + "epoch": 1.4945074856741636, + "grad_norm": 0.9393720626831055, + "learning_rate": 0.00010037455446142694, + "loss": 2.8609, + "step": 16496 + }, + { + "epoch": 1.4945980838486104, + "grad_norm": 0.9072279930114746, + "learning_rate": 0.00010036851326043617, + "loss": 2.8901, + "step": 16497 + }, + { + "epoch": 1.4946886820230572, + "grad_norm": 0.9108867049217224, + "learning_rate": 0.00010036247205944542, + "loss": 2.7239, + "step": 16498 + }, + { + "epoch": 1.494779280197504, + "grad_norm": 0.8795201778411865, + "learning_rate": 0.00010035643085845468, + "loss": 2.6833, + "step": 16499 + }, + { + "epoch": 1.4948698783719507, + "grad_norm": 0.8436217308044434, + "learning_rate": 0.0001003503896574639, + "loss": 2.4689, + "step": 16500 + }, + { + "epoch": 1.4949604765463975, + "grad_norm": 0.9698108434677124, + "learning_rate": 0.00010034434845647316, + "loss": 2.5791, + "step": 16501 + }, + { + "epoch": 1.4950510747208443, + "grad_norm": 0.876471757888794, + "learning_rate": 0.00010033830725548238, + "loss": 2.474, + "step": 16502 + }, + { + "epoch": 1.495141672895291, + "grad_norm": 0.9262843728065491, + "learning_rate": 0.00010033226605449164, + "loss": 2.7388, + "step": 16503 + }, + { + "epoch": 1.4952322710697379, + "grad_norm": 0.9091270565986633, + "learning_rate": 0.00010032622485350088, + "loss": 2.6426, + "step": 16504 + }, + { + "epoch": 1.4953228692441847, + "grad_norm": 0.8246127963066101, + "learning_rate": 0.00010032018365251013, + "loss": 2.5021, + "step": 16505 + }, + { + "epoch": 1.4954134674186315, + "grad_norm": 1.0283665657043457, + "learning_rate": 0.00010031414245151937, + "loss": 3.1103, + "step": 16506 + }, + { + "epoch": 1.4955040655930782, + "grad_norm": 0.9189587831497192, + "learning_rate": 0.00010030810125052861, + "loss": 2.8214, + "step": 16507 + }, + { + "epoch": 1.495594663767525, + "grad_norm": 0.885899543762207, + "learning_rate": 0.00010030206004953785, + "loss": 2.6956, + "step": 16508 + }, + { + "epoch": 1.4956852619419718, + "grad_norm": 0.9699710011482239, + "learning_rate": 0.00010029601884854709, + "loss": 2.6885, + "step": 16509 + }, + { + "epoch": 1.4957758601164186, + "grad_norm": 0.936873733997345, + "learning_rate": 0.00010028997764755635, + "loss": 2.9157, + "step": 16510 + }, + { + "epoch": 1.4958664582908654, + "grad_norm": 0.8582513332366943, + "learning_rate": 0.00010028393644656557, + "loss": 2.7127, + "step": 16511 + }, + { + "epoch": 1.4959570564653122, + "grad_norm": 0.9281741380691528, + "learning_rate": 0.00010027789524557483, + "loss": 2.7371, + "step": 16512 + }, + { + "epoch": 1.496047654639759, + "grad_norm": 0.8954042792320251, + "learning_rate": 0.00010027185404458405, + "loss": 2.7223, + "step": 16513 + }, + { + "epoch": 1.4961382528142058, + "grad_norm": 0.9388142824172974, + "learning_rate": 0.00010026581284359331, + "loss": 2.8366, + "step": 16514 + }, + { + "epoch": 1.4962288509886525, + "grad_norm": 0.9268831610679626, + "learning_rate": 0.00010025977164260256, + "loss": 3.0783, + "step": 16515 + }, + { + "epoch": 1.4963194491630993, + "grad_norm": 0.9023306369781494, + "learning_rate": 0.0001002537304416118, + "loss": 2.7844, + "step": 16516 + }, + { + "epoch": 1.4964100473375461, + "grad_norm": 0.8785666823387146, + "learning_rate": 0.00010024768924062104, + "loss": 2.5923, + "step": 16517 + }, + { + "epoch": 1.496500645511993, + "grad_norm": 0.8991689682006836, + "learning_rate": 0.00010024164803963028, + "loss": 2.7129, + "step": 16518 + }, + { + "epoch": 1.4965912436864397, + "grad_norm": 0.8958995342254639, + "learning_rate": 0.00010023560683863952, + "loss": 2.7967, + "step": 16519 + }, + { + "epoch": 1.4966818418608865, + "grad_norm": 0.9442534446716309, + "learning_rate": 0.00010022956563764876, + "loss": 2.593, + "step": 16520 + }, + { + "epoch": 1.4967724400353333, + "grad_norm": 0.9043110609054565, + "learning_rate": 0.000100223524436658, + "loss": 2.6283, + "step": 16521 + }, + { + "epoch": 1.49686303820978, + "grad_norm": 0.9111265540122986, + "learning_rate": 0.00010021748323566727, + "loss": 2.6942, + "step": 16522 + }, + { + "epoch": 1.4969536363842268, + "grad_norm": 0.9857207536697388, + "learning_rate": 0.0001002114420346765, + "loss": 2.7278, + "step": 16523 + }, + { + "epoch": 1.4970442345586736, + "grad_norm": 0.8576552867889404, + "learning_rate": 0.00010020540083368575, + "loss": 2.5262, + "step": 16524 + }, + { + "epoch": 1.4971348327331204, + "grad_norm": 0.8376864194869995, + "learning_rate": 0.00010019935963269498, + "loss": 2.2028, + "step": 16525 + }, + { + "epoch": 1.4972254309075672, + "grad_norm": 0.9119274020195007, + "learning_rate": 0.00010019331843170423, + "loss": 2.9289, + "step": 16526 + }, + { + "epoch": 1.497316029082014, + "grad_norm": 0.893267035484314, + "learning_rate": 0.00010018727723071346, + "loss": 2.6589, + "step": 16527 + }, + { + "epoch": 1.4974066272564608, + "grad_norm": 0.8789058923721313, + "learning_rate": 0.00010018123602972271, + "loss": 2.7944, + "step": 16528 + }, + { + "epoch": 1.4974972254309076, + "grad_norm": 0.8140485882759094, + "learning_rate": 0.00010017519482873196, + "loss": 2.6166, + "step": 16529 + }, + { + "epoch": 1.4975878236053544, + "grad_norm": 0.8865492343902588, + "learning_rate": 0.0001001691536277412, + "loss": 2.8839, + "step": 16530 + }, + { + "epoch": 1.4976784217798012, + "grad_norm": 0.9388381242752075, + "learning_rate": 0.00010016311242675045, + "loss": 2.742, + "step": 16531 + }, + { + "epoch": 1.497769019954248, + "grad_norm": 0.8527897596359253, + "learning_rate": 0.00010015707122575968, + "loss": 2.5932, + "step": 16532 + }, + { + "epoch": 1.4978596181286947, + "grad_norm": 0.873733401298523, + "learning_rate": 0.00010015103002476894, + "loss": 2.8804, + "step": 16533 + }, + { + "epoch": 1.4979502163031415, + "grad_norm": 0.9397907257080078, + "learning_rate": 0.00010014498882377816, + "loss": 2.7984, + "step": 16534 + }, + { + "epoch": 1.4980408144775883, + "grad_norm": 0.9372403025627136, + "learning_rate": 0.00010013894762278742, + "loss": 2.8667, + "step": 16535 + }, + { + "epoch": 1.498131412652035, + "grad_norm": 0.9036403298377991, + "learning_rate": 0.00010013290642179667, + "loss": 2.6754, + "step": 16536 + }, + { + "epoch": 1.4982220108264819, + "grad_norm": 0.9445891380310059, + "learning_rate": 0.0001001268652208059, + "loss": 2.9664, + "step": 16537 + }, + { + "epoch": 1.4983126090009287, + "grad_norm": 0.8897271752357483, + "learning_rate": 0.00010012082401981515, + "loss": 3.0356, + "step": 16538 + }, + { + "epoch": 1.4984032071753755, + "grad_norm": 0.9131219983100891, + "learning_rate": 0.00010011478281882438, + "loss": 2.388, + "step": 16539 + }, + { + "epoch": 1.4984938053498222, + "grad_norm": 0.8817803263664246, + "learning_rate": 0.00010010874161783363, + "loss": 2.6287, + "step": 16540 + }, + { + "epoch": 1.498584403524269, + "grad_norm": 0.9099993109703064, + "learning_rate": 0.00010010270041684286, + "loss": 2.7273, + "step": 16541 + }, + { + "epoch": 1.4986750016987158, + "grad_norm": 0.8977237939834595, + "learning_rate": 0.00010009665921585212, + "loss": 2.9437, + "step": 16542 + }, + { + "epoch": 1.4987655998731626, + "grad_norm": 0.8728723526000977, + "learning_rate": 0.00010009061801486134, + "loss": 2.7165, + "step": 16543 + }, + { + "epoch": 1.4988561980476094, + "grad_norm": 0.8453114628791809, + "learning_rate": 0.0001000845768138706, + "loss": 2.8228, + "step": 16544 + }, + { + "epoch": 1.4989467962220562, + "grad_norm": 0.9808012843132019, + "learning_rate": 0.00010007853561287985, + "loss": 2.8108, + "step": 16545 + }, + { + "epoch": 1.499037394396503, + "grad_norm": 0.9062343239784241, + "learning_rate": 0.00010007249441188909, + "loss": 2.5099, + "step": 16546 + }, + { + "epoch": 1.4991279925709498, + "grad_norm": 0.8778569102287292, + "learning_rate": 0.00010006645321089834, + "loss": 2.7185, + "step": 16547 + }, + { + "epoch": 1.4992185907453965, + "grad_norm": 0.956894040107727, + "learning_rate": 0.00010006041200990757, + "loss": 3.0414, + "step": 16548 + }, + { + "epoch": 1.4993091889198433, + "grad_norm": 0.8858861327171326, + "learning_rate": 0.00010005437080891682, + "loss": 2.8346, + "step": 16549 + }, + { + "epoch": 1.4993997870942901, + "grad_norm": 0.8720013499259949, + "learning_rate": 0.00010004832960792605, + "loss": 2.6803, + "step": 16550 + }, + { + "epoch": 1.499490385268737, + "grad_norm": 0.8983142375946045, + "learning_rate": 0.0001000422884069353, + "loss": 2.6927, + "step": 16551 + }, + { + "epoch": 1.4995809834431837, + "grad_norm": 0.8839842677116394, + "learning_rate": 0.00010003624720594456, + "loss": 2.7119, + "step": 16552 + }, + { + "epoch": 1.4996715816176305, + "grad_norm": 0.9522319436073303, + "learning_rate": 0.00010003020600495378, + "loss": 2.6016, + "step": 16553 + }, + { + "epoch": 1.4997621797920773, + "grad_norm": 0.9236234426498413, + "learning_rate": 0.00010002416480396304, + "loss": 2.6136, + "step": 16554 + }, + { + "epoch": 1.499852777966524, + "grad_norm": 0.9259904623031616, + "learning_rate": 0.00010001812360297228, + "loss": 2.5281, + "step": 16555 + }, + { + "epoch": 1.4999433761409708, + "grad_norm": 0.942194402217865, + "learning_rate": 0.00010001208240198152, + "loss": 2.6912, + "step": 16556 + }, + { + "epoch": 1.5000339743154174, + "grad_norm": 0.8249863982200623, + "learning_rate": 0.00010000604120099076, + "loss": 2.0453, + "step": 16557 + }, + { + "epoch": 1.5001245724898644, + "grad_norm": 0.9099984169006348, + "learning_rate": 0.0001, + "loss": 2.7646, + "step": 16558 + }, + { + "epoch": 1.500215170664311, + "grad_norm": 0.8727027773857117, + "learning_rate": 9.999395879900925e-05, + "loss": 2.7318, + "step": 16559 + }, + { + "epoch": 1.500305768838758, + "grad_norm": 0.8140009641647339, + "learning_rate": 9.998791759801849e-05, + "loss": 2.7151, + "step": 16560 + }, + { + "epoch": 1.5003963670132046, + "grad_norm": 0.824970543384552, + "learning_rate": 9.998187639702773e-05, + "loss": 2.01, + "step": 16561 + }, + { + "epoch": 1.5004869651876516, + "grad_norm": 1.0056698322296143, + "learning_rate": 9.997583519603698e-05, + "loss": 2.7546, + "step": 16562 + }, + { + "epoch": 1.5005775633620981, + "grad_norm": 0.9557559490203857, + "learning_rate": 9.996979399504623e-05, + "loss": 3.0024, + "step": 16563 + }, + { + "epoch": 1.5006681615365451, + "grad_norm": 0.8841919302940369, + "learning_rate": 9.996375279405546e-05, + "loss": 2.7669, + "step": 16564 + }, + { + "epoch": 1.5007587597109917, + "grad_norm": 0.9673140048980713, + "learning_rate": 9.995771159306471e-05, + "loss": 2.7207, + "step": 16565 + }, + { + "epoch": 1.5008493578854387, + "grad_norm": 0.8926862478256226, + "learning_rate": 9.995167039207394e-05, + "loss": 2.6622, + "step": 16566 + }, + { + "epoch": 1.5009399560598853, + "grad_norm": 0.992433488368988, + "learning_rate": 9.994562919108319e-05, + "loss": 2.6742, + "step": 16567 + }, + { + "epoch": 1.5010305542343323, + "grad_norm": 0.8813818097114563, + "learning_rate": 9.993958799009243e-05, + "loss": 2.6829, + "step": 16568 + }, + { + "epoch": 1.5011211524087789, + "grad_norm": 0.83925461769104, + "learning_rate": 9.993354678910167e-05, + "loss": 2.7309, + "step": 16569 + }, + { + "epoch": 1.5012117505832259, + "grad_norm": 0.8883786201477051, + "learning_rate": 9.992750558811092e-05, + "loss": 2.7156, + "step": 16570 + }, + { + "epoch": 1.5013023487576724, + "grad_norm": 0.9220590591430664, + "learning_rate": 9.992146438712017e-05, + "loss": 2.5671, + "step": 16571 + }, + { + "epoch": 1.5013929469321194, + "grad_norm": 0.8706879615783691, + "learning_rate": 9.99154231861294e-05, + "loss": 2.5671, + "step": 16572 + }, + { + "epoch": 1.501483545106566, + "grad_norm": 0.9201347231864929, + "learning_rate": 9.990938198513865e-05, + "loss": 2.7268, + "step": 16573 + }, + { + "epoch": 1.501574143281013, + "grad_norm": 0.9314637184143066, + "learning_rate": 9.990334078414789e-05, + "loss": 2.8407, + "step": 16574 + }, + { + "epoch": 1.5016647414554596, + "grad_norm": 0.9317262768745422, + "learning_rate": 9.989729958315713e-05, + "loss": 2.8233, + "step": 16575 + }, + { + "epoch": 1.5017553396299066, + "grad_norm": 0.8977290391921997, + "learning_rate": 9.989125838216638e-05, + "loss": 2.8286, + "step": 16576 + }, + { + "epoch": 1.5018459378043532, + "grad_norm": 0.9817782044410706, + "learning_rate": 9.988521718117563e-05, + "loss": 2.7915, + "step": 16577 + }, + { + "epoch": 1.5019365359788002, + "grad_norm": 0.9340333938598633, + "learning_rate": 9.987917598018488e-05, + "loss": 2.8912, + "step": 16578 + }, + { + "epoch": 1.5020271341532467, + "grad_norm": 0.9602909088134766, + "learning_rate": 9.987313477919411e-05, + "loss": 2.786, + "step": 16579 + }, + { + "epoch": 1.5021177323276937, + "grad_norm": 0.9223007559776306, + "learning_rate": 9.986709357820336e-05, + "loss": 2.5992, + "step": 16580 + }, + { + "epoch": 1.5022083305021403, + "grad_norm": 0.8818015456199646, + "learning_rate": 9.986105237721259e-05, + "loss": 2.6942, + "step": 16581 + }, + { + "epoch": 1.5022989286765873, + "grad_norm": 0.9580425024032593, + "learning_rate": 9.985501117622184e-05, + "loss": 2.6781, + "step": 16582 + }, + { + "epoch": 1.5023895268510339, + "grad_norm": 0.8337585926055908, + "learning_rate": 9.984896997523107e-05, + "loss": 2.2357, + "step": 16583 + }, + { + "epoch": 1.502480125025481, + "grad_norm": 0.8457958698272705, + "learning_rate": 9.984292877424032e-05, + "loss": 2.3167, + "step": 16584 + }, + { + "epoch": 1.5025707231999275, + "grad_norm": 0.9297235012054443, + "learning_rate": 9.983688757324957e-05, + "loss": 2.5806, + "step": 16585 + }, + { + "epoch": 1.5026613213743745, + "grad_norm": 0.9021223783493042, + "learning_rate": 9.983084637225882e-05, + "loss": 2.4966, + "step": 16586 + }, + { + "epoch": 1.502751919548821, + "grad_norm": 0.9313759803771973, + "learning_rate": 9.982480517126805e-05, + "loss": 2.7415, + "step": 16587 + }, + { + "epoch": 1.5028425177232678, + "grad_norm": 0.8456605076789856, + "learning_rate": 9.98187639702773e-05, + "loss": 2.6437, + "step": 16588 + }, + { + "epoch": 1.5029331158977146, + "grad_norm": 0.9131222367286682, + "learning_rate": 9.981272276928653e-05, + "loss": 2.6754, + "step": 16589 + }, + { + "epoch": 1.5030237140721614, + "grad_norm": 0.6765736937522888, + "learning_rate": 9.980668156829578e-05, + "loss": 1.3184, + "step": 16590 + }, + { + "epoch": 1.5031143122466082, + "grad_norm": 0.9342328310012817, + "learning_rate": 9.980064036730503e-05, + "loss": 2.5675, + "step": 16591 + }, + { + "epoch": 1.503204910421055, + "grad_norm": 0.9210125803947449, + "learning_rate": 9.979459916631427e-05, + "loss": 2.9675, + "step": 16592 + }, + { + "epoch": 1.5032955085955018, + "grad_norm": 0.9418028593063354, + "learning_rate": 9.978855796532351e-05, + "loss": 2.6551, + "step": 16593 + }, + { + "epoch": 1.5033861067699485, + "grad_norm": 0.7802528738975525, + "learning_rate": 9.978251676433276e-05, + "loss": 2.0756, + "step": 16594 + }, + { + "epoch": 1.5034767049443953, + "grad_norm": 0.896238386631012, + "learning_rate": 9.9776475563342e-05, + "loss": 2.5378, + "step": 16595 + }, + { + "epoch": 1.5035673031188421, + "grad_norm": 0.9106538891792297, + "learning_rate": 9.977043436235124e-05, + "loss": 2.4746, + "step": 16596 + }, + { + "epoch": 1.503657901293289, + "grad_norm": 1.0449588298797607, + "learning_rate": 9.976439316136049e-05, + "loss": 2.7323, + "step": 16597 + }, + { + "epoch": 1.5037484994677357, + "grad_norm": 0.9416106939315796, + "learning_rate": 9.975835196036972e-05, + "loss": 2.5956, + "step": 16598 + }, + { + "epoch": 1.5038390976421825, + "grad_norm": 0.8793179392814636, + "learning_rate": 9.975231075937897e-05, + "loss": 2.4595, + "step": 16599 + }, + { + "epoch": 1.5039296958166293, + "grad_norm": 0.9680559039115906, + "learning_rate": 9.974626955838821e-05, + "loss": 2.5567, + "step": 16600 + }, + { + "epoch": 1.504020293991076, + "grad_norm": 0.7472904324531555, + "learning_rate": 9.974022835739746e-05, + "loss": 1.8785, + "step": 16601 + }, + { + "epoch": 1.5041108921655229, + "grad_norm": 0.9321959018707275, + "learning_rate": 9.97341871564067e-05, + "loss": 2.8512, + "step": 16602 + }, + { + "epoch": 1.5042014903399696, + "grad_norm": 0.9108050465583801, + "learning_rate": 9.972814595541594e-05, + "loss": 2.7423, + "step": 16603 + }, + { + "epoch": 1.5042920885144164, + "grad_norm": 0.7796315550804138, + "learning_rate": 9.972210475442518e-05, + "loss": 2.2303, + "step": 16604 + }, + { + "epoch": 1.5043826866888632, + "grad_norm": 0.9314116835594177, + "learning_rate": 9.971606355343443e-05, + "loss": 2.6349, + "step": 16605 + }, + { + "epoch": 1.50447328486331, + "grad_norm": 0.7175164818763733, + "learning_rate": 9.971002235244366e-05, + "loss": 1.9547, + "step": 16606 + }, + { + "epoch": 1.5045638830377568, + "grad_norm": 0.8866356015205383, + "learning_rate": 9.970398115145292e-05, + "loss": 2.6839, + "step": 16607 + }, + { + "epoch": 1.5046544812122036, + "grad_norm": 0.8759667873382568, + "learning_rate": 9.969793995046215e-05, + "loss": 2.903, + "step": 16608 + }, + { + "epoch": 1.5047450793866504, + "grad_norm": 0.903766930103302, + "learning_rate": 9.96918987494714e-05, + "loss": 2.5394, + "step": 16609 + }, + { + "epoch": 1.5048356775610972, + "grad_norm": 0.9844602942466736, + "learning_rate": 9.968585754848065e-05, + "loss": 2.9804, + "step": 16610 + }, + { + "epoch": 1.504926275735544, + "grad_norm": 0.8513286709785461, + "learning_rate": 9.967981634748988e-05, + "loss": 2.7651, + "step": 16611 + }, + { + "epoch": 1.5050168739099907, + "grad_norm": 1.0160385370254517, + "learning_rate": 9.967377514649913e-05, + "loss": 2.7421, + "step": 16612 + }, + { + "epoch": 1.5051074720844375, + "grad_norm": 0.9189468026161194, + "learning_rate": 9.966773394550837e-05, + "loss": 2.8039, + "step": 16613 + }, + { + "epoch": 1.5051980702588843, + "grad_norm": 0.8671509623527527, + "learning_rate": 9.966169274451761e-05, + "loss": 2.6239, + "step": 16614 + }, + { + "epoch": 1.505288668433331, + "grad_norm": 0.9525647163391113, + "learning_rate": 9.965565154352686e-05, + "loss": 2.8256, + "step": 16615 + }, + { + "epoch": 1.5053792666077779, + "grad_norm": 0.9110146760940552, + "learning_rate": 9.964961034253611e-05, + "loss": 2.8484, + "step": 16616 + }, + { + "epoch": 1.5054698647822247, + "grad_norm": 0.8418036103248596, + "learning_rate": 9.964356914154534e-05, + "loss": 2.526, + "step": 16617 + }, + { + "epoch": 1.5055604629566715, + "grad_norm": 0.9899435043334961, + "learning_rate": 9.963752794055459e-05, + "loss": 2.7315, + "step": 16618 + }, + { + "epoch": 1.5056510611311182, + "grad_norm": 0.86851567029953, + "learning_rate": 9.963148673956382e-05, + "loss": 2.9582, + "step": 16619 + }, + { + "epoch": 1.505741659305565, + "grad_norm": 0.878342866897583, + "learning_rate": 9.962544553857307e-05, + "loss": 2.6813, + "step": 16620 + }, + { + "epoch": 1.5058322574800118, + "grad_norm": 0.8787097930908203, + "learning_rate": 9.96194043375823e-05, + "loss": 2.8404, + "step": 16621 + }, + { + "epoch": 1.5059228556544586, + "grad_norm": 0.9615069031715393, + "learning_rate": 9.961336313659157e-05, + "loss": 2.6356, + "step": 16622 + }, + { + "epoch": 1.5060134538289054, + "grad_norm": 0.8927707672119141, + "learning_rate": 9.96073219356008e-05, + "loss": 2.9218, + "step": 16623 + }, + { + "epoch": 1.5061040520033522, + "grad_norm": 0.897990882396698, + "learning_rate": 9.960128073461005e-05, + "loss": 2.7335, + "step": 16624 + }, + { + "epoch": 1.506194650177799, + "grad_norm": 0.7940438389778137, + "learning_rate": 9.959523953361928e-05, + "loss": 2.1576, + "step": 16625 + }, + { + "epoch": 1.5062852483522458, + "grad_norm": 0.8875399827957153, + "learning_rate": 9.958919833262853e-05, + "loss": 2.6298, + "step": 16626 + }, + { + "epoch": 1.5063758465266925, + "grad_norm": 0.828218400478363, + "learning_rate": 9.958315713163778e-05, + "loss": 2.8455, + "step": 16627 + }, + { + "epoch": 1.506466444701139, + "grad_norm": 0.742219865322113, + "learning_rate": 9.957711593064701e-05, + "loss": 1.8854, + "step": 16628 + }, + { + "epoch": 1.5065570428755861, + "grad_norm": 0.88135826587677, + "learning_rate": 9.957107472965626e-05, + "loss": 2.6837, + "step": 16629 + }, + { + "epoch": 1.5066476410500327, + "grad_norm": 0.9554004669189453, + "learning_rate": 9.956503352866551e-05, + "loss": 2.829, + "step": 16630 + }, + { + "epoch": 1.5067382392244797, + "grad_norm": 0.8575025200843811, + "learning_rate": 9.955899232767475e-05, + "loss": 1.9884, + "step": 16631 + }, + { + "epoch": 1.5068288373989263, + "grad_norm": 0.9169819951057434, + "learning_rate": 9.955295112668399e-05, + "loss": 2.6996, + "step": 16632 + }, + { + "epoch": 1.5069194355733733, + "grad_norm": 0.8703895211219788, + "learning_rate": 9.954690992569324e-05, + "loss": 2.702, + "step": 16633 + }, + { + "epoch": 1.5070100337478198, + "grad_norm": 0.9089241027832031, + "learning_rate": 9.954086872470247e-05, + "loss": 2.6941, + "step": 16634 + }, + { + "epoch": 1.5071006319222668, + "grad_norm": 0.9341093301773071, + "learning_rate": 9.953482752371172e-05, + "loss": 2.7849, + "step": 16635 + }, + { + "epoch": 1.5071912300967134, + "grad_norm": 0.9315711855888367, + "learning_rate": 9.952878632272095e-05, + "loss": 2.5969, + "step": 16636 + }, + { + "epoch": 1.5072818282711604, + "grad_norm": 0.9401592016220093, + "learning_rate": 9.952274512173021e-05, + "loss": 2.7494, + "step": 16637 + }, + { + "epoch": 1.507372426445607, + "grad_norm": 0.8826307058334351, + "learning_rate": 9.951670392073945e-05, + "loss": 2.7049, + "step": 16638 + }, + { + "epoch": 1.507463024620054, + "grad_norm": 0.8430770039558411, + "learning_rate": 9.95106627197487e-05, + "loss": 2.4139, + "step": 16639 + }, + { + "epoch": 1.5075536227945006, + "grad_norm": 0.8253557682037354, + "learning_rate": 9.950462151875793e-05, + "loss": 2.3564, + "step": 16640 + }, + { + "epoch": 1.5076442209689476, + "grad_norm": 0.968018114566803, + "learning_rate": 9.949858031776718e-05, + "loss": 2.6814, + "step": 16641 + }, + { + "epoch": 1.5077348191433941, + "grad_norm": 0.7706854343414307, + "learning_rate": 9.949253911677642e-05, + "loss": 1.9121, + "step": 16642 + }, + { + "epoch": 1.5078254173178411, + "grad_norm": 0.9405561685562134, + "learning_rate": 9.948649791578566e-05, + "loss": 2.7528, + "step": 16643 + }, + { + "epoch": 1.5079160154922877, + "grad_norm": 0.9341402053833008, + "learning_rate": 9.94804567147949e-05, + "loss": 2.5096, + "step": 16644 + }, + { + "epoch": 1.5080066136667347, + "grad_norm": 0.9287415146827698, + "learning_rate": 9.947441551380415e-05, + "loss": 2.7862, + "step": 16645 + }, + { + "epoch": 1.5080972118411813, + "grad_norm": 1.0095103979110718, + "learning_rate": 9.94683743128134e-05, + "loss": 2.5884, + "step": 16646 + }, + { + "epoch": 1.5081878100156283, + "grad_norm": 0.9717746376991272, + "learning_rate": 9.946233311182264e-05, + "loss": 2.5672, + "step": 16647 + }, + { + "epoch": 1.5082784081900749, + "grad_norm": 1.0364140272140503, + "learning_rate": 9.945629191083188e-05, + "loss": 2.6811, + "step": 16648 + }, + { + "epoch": 1.5083690063645219, + "grad_norm": 0.9110268354415894, + "learning_rate": 9.945025070984112e-05, + "loss": 2.8051, + "step": 16649 + }, + { + "epoch": 1.5084596045389684, + "grad_norm": 0.8774221539497375, + "learning_rate": 9.944420950885036e-05, + "loss": 2.7444, + "step": 16650 + }, + { + "epoch": 1.5085502027134154, + "grad_norm": 0.900936484336853, + "learning_rate": 9.94381683078596e-05, + "loss": 2.558, + "step": 16651 + }, + { + "epoch": 1.508640800887862, + "grad_norm": 0.937366783618927, + "learning_rate": 9.943212710686886e-05, + "loss": 2.9901, + "step": 16652 + }, + { + "epoch": 1.508731399062309, + "grad_norm": 0.8727425336837769, + "learning_rate": 9.94260859058781e-05, + "loss": 2.8706, + "step": 16653 + }, + { + "epoch": 1.5088219972367556, + "grad_norm": 0.8821354508399963, + "learning_rate": 9.942004470488734e-05, + "loss": 2.6804, + "step": 16654 + }, + { + "epoch": 1.5089125954112026, + "grad_norm": 0.8994351029396057, + "learning_rate": 9.941400350389658e-05, + "loss": 2.6784, + "step": 16655 + }, + { + "epoch": 1.5090031935856492, + "grad_norm": 0.8772478699684143, + "learning_rate": 9.940796230290582e-05, + "loss": 2.5612, + "step": 16656 + }, + { + "epoch": 1.5090937917600962, + "grad_norm": 0.9085608720779419, + "learning_rate": 9.940192110191506e-05, + "loss": 2.7975, + "step": 16657 + }, + { + "epoch": 1.5091843899345427, + "grad_norm": 0.9636149406433105, + "learning_rate": 9.93958799009243e-05, + "loss": 2.9653, + "step": 16658 + }, + { + "epoch": 1.5092749881089897, + "grad_norm": 0.6834480166435242, + "learning_rate": 9.938983869993355e-05, + "loss": 1.6357, + "step": 16659 + }, + { + "epoch": 1.5093655862834363, + "grad_norm": 1.0683423280715942, + "learning_rate": 9.93837974989428e-05, + "loss": 2.7706, + "step": 16660 + }, + { + "epoch": 1.5094561844578833, + "grad_norm": 1.2432302236557007, + "learning_rate": 9.937775629795203e-05, + "loss": 2.7403, + "step": 16661 + }, + { + "epoch": 1.5095467826323299, + "grad_norm": 0.9908851385116577, + "learning_rate": 9.937171509696128e-05, + "loss": 2.8642, + "step": 16662 + }, + { + "epoch": 1.509637380806777, + "grad_norm": 0.9150229692459106, + "learning_rate": 9.936567389597053e-05, + "loss": 2.5456, + "step": 16663 + }, + { + "epoch": 1.5097279789812235, + "grad_norm": 0.9386987686157227, + "learning_rate": 9.935963269497976e-05, + "loss": 2.7409, + "step": 16664 + }, + { + "epoch": 1.5098185771556705, + "grad_norm": 0.9003937840461731, + "learning_rate": 9.935359149398901e-05, + "loss": 2.7756, + "step": 16665 + }, + { + "epoch": 1.509909175330117, + "grad_norm": 0.7401272654533386, + "learning_rate": 9.934755029299824e-05, + "loss": 1.9677, + "step": 16666 + }, + { + "epoch": 1.509999773504564, + "grad_norm": 0.871955156326294, + "learning_rate": 9.93415090920075e-05, + "loss": 2.6483, + "step": 16667 + }, + { + "epoch": 1.5100903716790106, + "grad_norm": 0.9855775237083435, + "learning_rate": 9.933546789101674e-05, + "loss": 2.905, + "step": 16668 + }, + { + "epoch": 1.5101809698534574, + "grad_norm": 0.9023216962814331, + "learning_rate": 9.932942669002599e-05, + "loss": 2.7962, + "step": 16669 + }, + { + "epoch": 1.5102715680279042, + "grad_norm": 0.851429283618927, + "learning_rate": 9.932338548903522e-05, + "loss": 2.6019, + "step": 16670 + }, + { + "epoch": 1.510362166202351, + "grad_norm": 0.9591960906982422, + "learning_rate": 9.931734428804447e-05, + "loss": 2.7481, + "step": 16671 + }, + { + "epoch": 1.5104527643767978, + "grad_norm": 0.8456997275352478, + "learning_rate": 9.93113030870537e-05, + "loss": 2.6138, + "step": 16672 + }, + { + "epoch": 1.5105433625512446, + "grad_norm": 1.0875048637390137, + "learning_rate": 9.930526188606295e-05, + "loss": 2.5337, + "step": 16673 + }, + { + "epoch": 1.5106339607256913, + "grad_norm": 0.9519892334938049, + "learning_rate": 9.92992206850722e-05, + "loss": 2.4811, + "step": 16674 + }, + { + "epoch": 1.5107245589001381, + "grad_norm": 0.93968665599823, + "learning_rate": 9.929317948408145e-05, + "loss": 2.7124, + "step": 16675 + }, + { + "epoch": 1.510815157074585, + "grad_norm": 0.855208694934845, + "learning_rate": 9.928713828309068e-05, + "loss": 2.945, + "step": 16676 + }, + { + "epoch": 1.5109057552490317, + "grad_norm": 0.9211595058441162, + "learning_rate": 9.928109708209993e-05, + "loss": 2.8431, + "step": 16677 + }, + { + "epoch": 1.5109963534234785, + "grad_norm": 0.9554876089096069, + "learning_rate": 9.927505588110918e-05, + "loss": 2.6177, + "step": 16678 + }, + { + "epoch": 1.5110869515979253, + "grad_norm": 0.9185726642608643, + "learning_rate": 9.926901468011841e-05, + "loss": 2.8598, + "step": 16679 + }, + { + "epoch": 1.511177549772372, + "grad_norm": 0.9164839386940002, + "learning_rate": 9.926297347912766e-05, + "loss": 2.7027, + "step": 16680 + }, + { + "epoch": 1.5112681479468189, + "grad_norm": 0.8685365915298462, + "learning_rate": 9.925693227813689e-05, + "loss": 2.7998, + "step": 16681 + }, + { + "epoch": 1.5113587461212656, + "grad_norm": 0.8994429111480713, + "learning_rate": 9.925089107714615e-05, + "loss": 2.6418, + "step": 16682 + }, + { + "epoch": 1.5114493442957124, + "grad_norm": 0.9016191363334656, + "learning_rate": 9.924484987615539e-05, + "loss": 2.8512, + "step": 16683 + }, + { + "epoch": 1.5115399424701592, + "grad_norm": 0.8547800779342651, + "learning_rate": 9.923880867516463e-05, + "loss": 2.6273, + "step": 16684 + }, + { + "epoch": 1.511630540644606, + "grad_norm": 0.9568622708320618, + "learning_rate": 9.923276747417387e-05, + "loss": 2.5831, + "step": 16685 + }, + { + "epoch": 1.5117211388190528, + "grad_norm": 0.8935781717300415, + "learning_rate": 9.922672627318312e-05, + "loss": 2.7515, + "step": 16686 + }, + { + "epoch": 1.5118117369934996, + "grad_norm": 0.9281009435653687, + "learning_rate": 9.922068507219235e-05, + "loss": 2.7382, + "step": 16687 + }, + { + "epoch": 1.5119023351679464, + "grad_norm": 0.9863302707672119, + "learning_rate": 9.92146438712016e-05, + "loss": 2.4749, + "step": 16688 + }, + { + "epoch": 1.5119929333423932, + "grad_norm": 0.9032320380210876, + "learning_rate": 9.920860267021083e-05, + "loss": 2.5647, + "step": 16689 + }, + { + "epoch": 1.51208353151684, + "grad_norm": 0.9064184427261353, + "learning_rate": 9.920256146922009e-05, + "loss": 2.7052, + "step": 16690 + }, + { + "epoch": 1.5121741296912867, + "grad_norm": 0.9041010141372681, + "learning_rate": 9.919652026822933e-05, + "loss": 2.5864, + "step": 16691 + }, + { + "epoch": 1.5122647278657335, + "grad_norm": 0.8622470498085022, + "learning_rate": 9.919047906723857e-05, + "loss": 2.4648, + "step": 16692 + }, + { + "epoch": 1.5123553260401803, + "grad_norm": 0.7706490755081177, + "learning_rate": 9.918443786624781e-05, + "loss": 1.8143, + "step": 16693 + }, + { + "epoch": 1.512445924214627, + "grad_norm": 0.88752681016922, + "learning_rate": 9.917839666525706e-05, + "loss": 2.7847, + "step": 16694 + }, + { + "epoch": 1.5125365223890739, + "grad_norm": 0.8544816374778748, + "learning_rate": 9.91723554642663e-05, + "loss": 2.5659, + "step": 16695 + }, + { + "epoch": 1.5126271205635207, + "grad_norm": 0.972822904586792, + "learning_rate": 9.916631426327554e-05, + "loss": 2.7774, + "step": 16696 + }, + { + "epoch": 1.5127177187379675, + "grad_norm": 0.9620334506034851, + "learning_rate": 9.91602730622848e-05, + "loss": 2.7806, + "step": 16697 + }, + { + "epoch": 1.5128083169124142, + "grad_norm": 0.8920525312423706, + "learning_rate": 9.915423186129403e-05, + "loss": 2.2498, + "step": 16698 + }, + { + "epoch": 1.512898915086861, + "grad_norm": 1.322804570198059, + "learning_rate": 9.914819066030328e-05, + "loss": 2.1345, + "step": 16699 + }, + { + "epoch": 1.5129895132613078, + "grad_norm": 0.8689243793487549, + "learning_rate": 9.914214945931251e-05, + "loss": 2.7749, + "step": 16700 + }, + { + "epoch": 1.5130801114357546, + "grad_norm": 0.8783984780311584, + "learning_rate": 9.913610825832176e-05, + "loss": 2.4777, + "step": 16701 + }, + { + "epoch": 1.5131707096102014, + "grad_norm": 0.9713832139968872, + "learning_rate": 9.9130067057331e-05, + "loss": 2.5481, + "step": 16702 + }, + { + "epoch": 1.5132613077846482, + "grad_norm": 0.897550642490387, + "learning_rate": 9.912402585634024e-05, + "loss": 2.7258, + "step": 16703 + }, + { + "epoch": 1.513351905959095, + "grad_norm": 0.9023040533065796, + "learning_rate": 9.911798465534948e-05, + "loss": 2.9491, + "step": 16704 + }, + { + "epoch": 1.5134425041335418, + "grad_norm": 0.9110019207000732, + "learning_rate": 9.911194345435874e-05, + "loss": 2.8903, + "step": 16705 + }, + { + "epoch": 1.5135331023079885, + "grad_norm": 0.8271604180335999, + "learning_rate": 9.910590225336797e-05, + "loss": 2.6914, + "step": 16706 + }, + { + "epoch": 1.5136237004824353, + "grad_norm": 0.9064258933067322, + "learning_rate": 9.909986105237722e-05, + "loss": 2.8215, + "step": 16707 + }, + { + "epoch": 1.5137142986568821, + "grad_norm": 0.9429650902748108, + "learning_rate": 9.909381985138645e-05, + "loss": 2.8169, + "step": 16708 + }, + { + "epoch": 1.5138048968313287, + "grad_norm": 0.8729255199432373, + "learning_rate": 9.90877786503957e-05, + "loss": 2.5857, + "step": 16709 + }, + { + "epoch": 1.5138954950057757, + "grad_norm": 0.8895695209503174, + "learning_rate": 9.908173744940495e-05, + "loss": 2.5208, + "step": 16710 + }, + { + "epoch": 1.5139860931802223, + "grad_norm": 0.7877746820449829, + "learning_rate": 9.907569624841418e-05, + "loss": 2.1334, + "step": 16711 + }, + { + "epoch": 1.5140766913546693, + "grad_norm": 0.9279079437255859, + "learning_rate": 9.906965504742343e-05, + "loss": 2.7056, + "step": 16712 + }, + { + "epoch": 1.5141672895291158, + "grad_norm": 0.9231999516487122, + "learning_rate": 9.906361384643268e-05, + "loss": 2.547, + "step": 16713 + }, + { + "epoch": 1.5142578877035628, + "grad_norm": 0.9231662154197693, + "learning_rate": 9.905757264544193e-05, + "loss": 2.6494, + "step": 16714 + }, + { + "epoch": 1.5143484858780094, + "grad_norm": 0.8052405714988708, + "learning_rate": 9.905153144445116e-05, + "loss": 2.0271, + "step": 16715 + }, + { + "epoch": 1.5144390840524564, + "grad_norm": 0.9212625622749329, + "learning_rate": 9.904549024346041e-05, + "loss": 2.7269, + "step": 16716 + }, + { + "epoch": 1.514529682226903, + "grad_norm": 0.8532984256744385, + "learning_rate": 9.903944904246964e-05, + "loss": 2.5025, + "step": 16717 + }, + { + "epoch": 1.51462028040135, + "grad_norm": 0.9026652574539185, + "learning_rate": 9.903340784147889e-05, + "loss": 2.7642, + "step": 16718 + }, + { + "epoch": 1.5147108785757966, + "grad_norm": 0.9538257718086243, + "learning_rate": 9.902736664048812e-05, + "loss": 2.7455, + "step": 16719 + }, + { + "epoch": 1.5148014767502436, + "grad_norm": 0.8357330560684204, + "learning_rate": 9.902132543949739e-05, + "loss": 2.6452, + "step": 16720 + }, + { + "epoch": 1.5148920749246901, + "grad_norm": 0.9157715439796448, + "learning_rate": 9.901528423850662e-05, + "loss": 2.6292, + "step": 16721 + }, + { + "epoch": 1.5149826730991371, + "grad_norm": 0.9237580895423889, + "learning_rate": 9.900924303751587e-05, + "loss": 2.8979, + "step": 16722 + }, + { + "epoch": 1.5150732712735837, + "grad_norm": 0.9480633735656738, + "learning_rate": 9.90032018365251e-05, + "loss": 2.5853, + "step": 16723 + }, + { + "epoch": 1.5151638694480307, + "grad_norm": 0.853870153427124, + "learning_rate": 9.899716063553435e-05, + "loss": 2.7339, + "step": 16724 + }, + { + "epoch": 1.5152544676224773, + "grad_norm": 0.9517751336097717, + "learning_rate": 9.899111943454358e-05, + "loss": 2.5748, + "step": 16725 + }, + { + "epoch": 1.5153450657969243, + "grad_norm": 0.8931750059127808, + "learning_rate": 9.898507823355283e-05, + "loss": 2.5276, + "step": 16726 + }, + { + "epoch": 1.5154356639713709, + "grad_norm": 0.90276038646698, + "learning_rate": 9.897903703256208e-05, + "loss": 2.0412, + "step": 16727 + }, + { + "epoch": 1.5155262621458179, + "grad_norm": 0.9265339374542236, + "learning_rate": 9.897299583157133e-05, + "loss": 2.676, + "step": 16728 + }, + { + "epoch": 1.5156168603202644, + "grad_norm": 0.8767831325531006, + "learning_rate": 9.896695463058057e-05, + "loss": 2.6474, + "step": 16729 + }, + { + "epoch": 1.5157074584947114, + "grad_norm": 0.8654578924179077, + "learning_rate": 9.896091342958981e-05, + "loss": 2.7176, + "step": 16730 + }, + { + "epoch": 1.515798056669158, + "grad_norm": 0.9918581247329712, + "learning_rate": 9.895487222859905e-05, + "loss": 2.813, + "step": 16731 + }, + { + "epoch": 1.515888654843605, + "grad_norm": 0.8680897951126099, + "learning_rate": 9.894883102760829e-05, + "loss": 2.7991, + "step": 16732 + }, + { + "epoch": 1.5159792530180516, + "grad_norm": 0.896532416343689, + "learning_rate": 9.894278982661754e-05, + "loss": 2.6815, + "step": 16733 + }, + { + "epoch": 1.5160698511924986, + "grad_norm": 0.9440168738365173, + "learning_rate": 9.893674862562677e-05, + "loss": 2.6837, + "step": 16734 + }, + { + "epoch": 1.5161604493669452, + "grad_norm": 0.843313992023468, + "learning_rate": 9.893070742463603e-05, + "loss": 2.6287, + "step": 16735 + }, + { + "epoch": 1.5162510475413922, + "grad_norm": 0.9640353322029114, + "learning_rate": 9.892466622364527e-05, + "loss": 2.5403, + "step": 16736 + }, + { + "epoch": 1.5163416457158387, + "grad_norm": 0.9367311596870422, + "learning_rate": 9.891862502265451e-05, + "loss": 3.0617, + "step": 16737 + }, + { + "epoch": 1.5164322438902857, + "grad_norm": 0.8764252066612244, + "learning_rate": 9.891258382166375e-05, + "loss": 2.7737, + "step": 16738 + }, + { + "epoch": 1.5165228420647323, + "grad_norm": 0.8954668641090393, + "learning_rate": 9.8906542620673e-05, + "loss": 2.6999, + "step": 16739 + }, + { + "epoch": 1.5166134402391793, + "grad_norm": 0.9017359614372253, + "learning_rate": 9.890050141968223e-05, + "loss": 2.8009, + "step": 16740 + }, + { + "epoch": 1.5167040384136259, + "grad_norm": 1.0119106769561768, + "learning_rate": 9.889446021869148e-05, + "loss": 2.4756, + "step": 16741 + }, + { + "epoch": 1.516794636588073, + "grad_norm": 0.9150450825691223, + "learning_rate": 9.888841901770072e-05, + "loss": 2.5847, + "step": 16742 + }, + { + "epoch": 1.5168852347625195, + "grad_norm": 0.9352822303771973, + "learning_rate": 9.888237781670997e-05, + "loss": 2.6993, + "step": 16743 + }, + { + "epoch": 1.5169758329369665, + "grad_norm": 0.8847276568412781, + "learning_rate": 9.88763366157192e-05, + "loss": 2.5812, + "step": 16744 + }, + { + "epoch": 1.517066431111413, + "grad_norm": 0.9698963761329651, + "learning_rate": 9.887029541472845e-05, + "loss": 2.5521, + "step": 16745 + }, + { + "epoch": 1.51715702928586, + "grad_norm": 0.993162989616394, + "learning_rate": 9.88642542137377e-05, + "loss": 2.733, + "step": 16746 + }, + { + "epoch": 1.5172476274603066, + "grad_norm": 1.0425885915756226, + "learning_rate": 9.885821301274693e-05, + "loss": 2.7535, + "step": 16747 + }, + { + "epoch": 1.5173382256347536, + "grad_norm": 0.972946286201477, + "learning_rate": 9.885217181175618e-05, + "loss": 2.7803, + "step": 16748 + }, + { + "epoch": 1.5174288238092002, + "grad_norm": 1.0917781591415405, + "learning_rate": 9.884613061076542e-05, + "loss": 2.6126, + "step": 16749 + }, + { + "epoch": 1.517519421983647, + "grad_norm": 0.9135161638259888, + "learning_rate": 9.884008940977468e-05, + "loss": 2.2222, + "step": 16750 + }, + { + "epoch": 1.5176100201580938, + "grad_norm": 0.9305933713912964, + "learning_rate": 9.883404820878391e-05, + "loss": 2.596, + "step": 16751 + }, + { + "epoch": 1.5177006183325406, + "grad_norm": 0.9711153507232666, + "learning_rate": 9.882800700779316e-05, + "loss": 2.8117, + "step": 16752 + }, + { + "epoch": 1.5177912165069873, + "grad_norm": 0.9147023558616638, + "learning_rate": 9.88219658068024e-05, + "loss": 2.7345, + "step": 16753 + }, + { + "epoch": 1.5178818146814341, + "grad_norm": 0.9201809167861938, + "learning_rate": 9.881592460581164e-05, + "loss": 2.7072, + "step": 16754 + }, + { + "epoch": 1.517972412855881, + "grad_norm": 0.9715085625648499, + "learning_rate": 9.880988340482088e-05, + "loss": 2.7675, + "step": 16755 + }, + { + "epoch": 1.5180630110303277, + "grad_norm": 0.8801594376564026, + "learning_rate": 9.880384220383012e-05, + "loss": 2.4809, + "step": 16756 + }, + { + "epoch": 1.5181536092047745, + "grad_norm": 0.8895268440246582, + "learning_rate": 9.879780100283937e-05, + "loss": 2.7337, + "step": 16757 + }, + { + "epoch": 1.5182442073792213, + "grad_norm": 0.953029453754425, + "learning_rate": 9.879175980184862e-05, + "loss": 2.5221, + "step": 16758 + }, + { + "epoch": 1.518334805553668, + "grad_norm": 0.8846101760864258, + "learning_rate": 9.878571860085785e-05, + "loss": 2.598, + "step": 16759 + }, + { + "epoch": 1.5184254037281149, + "grad_norm": 0.9190221428871155, + "learning_rate": 9.87796773998671e-05, + "loss": 2.7842, + "step": 16760 + }, + { + "epoch": 1.5185160019025616, + "grad_norm": 0.8712000250816345, + "learning_rate": 9.877363619887633e-05, + "loss": 2.5308, + "step": 16761 + }, + { + "epoch": 1.5186066000770084, + "grad_norm": 0.9141484498977661, + "learning_rate": 9.876759499788558e-05, + "loss": 2.5927, + "step": 16762 + }, + { + "epoch": 1.5186971982514552, + "grad_norm": 0.9749485850334167, + "learning_rate": 9.876155379689483e-05, + "loss": 2.6725, + "step": 16763 + }, + { + "epoch": 1.518787796425902, + "grad_norm": 0.8924545645713806, + "learning_rate": 9.875551259590406e-05, + "loss": 2.5645, + "step": 16764 + }, + { + "epoch": 1.5188783946003488, + "grad_norm": 0.8535587787628174, + "learning_rate": 9.874947139491332e-05, + "loss": 2.0676, + "step": 16765 + }, + { + "epoch": 1.5189689927747956, + "grad_norm": 0.9355964660644531, + "learning_rate": 9.874343019392256e-05, + "loss": 2.7382, + "step": 16766 + }, + { + "epoch": 1.5190595909492424, + "grad_norm": 0.9263586401939392, + "learning_rate": 9.87373889929318e-05, + "loss": 2.49, + "step": 16767 + }, + { + "epoch": 1.5191501891236892, + "grad_norm": 0.9938408732414246, + "learning_rate": 9.873134779194104e-05, + "loss": 2.6127, + "step": 16768 + }, + { + "epoch": 1.519240787298136, + "grad_norm": 0.7705458998680115, + "learning_rate": 9.872530659095029e-05, + "loss": 2.1077, + "step": 16769 + }, + { + "epoch": 1.5193313854725827, + "grad_norm": 0.958346962928772, + "learning_rate": 9.871926538995952e-05, + "loss": 2.6065, + "step": 16770 + }, + { + "epoch": 1.5194219836470295, + "grad_norm": 1.0536257028579712, + "learning_rate": 9.871322418896877e-05, + "loss": 3.0428, + "step": 16771 + }, + { + "epoch": 1.5195125818214763, + "grad_norm": 0.9522027373313904, + "learning_rate": 9.870718298797802e-05, + "loss": 2.8178, + "step": 16772 + }, + { + "epoch": 1.519603179995923, + "grad_norm": 0.9970042109489441, + "learning_rate": 9.870114178698726e-05, + "loss": 2.7604, + "step": 16773 + }, + { + "epoch": 1.5196937781703699, + "grad_norm": 0.8069034814834595, + "learning_rate": 9.86951005859965e-05, + "loss": 2.0783, + "step": 16774 + }, + { + "epoch": 1.5197843763448167, + "grad_norm": 0.8806484341621399, + "learning_rate": 9.868905938500575e-05, + "loss": 2.9365, + "step": 16775 + }, + { + "epoch": 1.5198749745192635, + "grad_norm": 0.9771223068237305, + "learning_rate": 9.868301818401498e-05, + "loss": 2.9698, + "step": 16776 + }, + { + "epoch": 1.5199655726937102, + "grad_norm": 0.8964653611183167, + "learning_rate": 9.867697698302423e-05, + "loss": 2.8023, + "step": 16777 + }, + { + "epoch": 1.520056170868157, + "grad_norm": 0.8820413947105408, + "learning_rate": 9.867093578203348e-05, + "loss": 2.8176, + "step": 16778 + }, + { + "epoch": 1.5201467690426038, + "grad_norm": 0.9069036245346069, + "learning_rate": 9.866489458104271e-05, + "loss": 2.6372, + "step": 16779 + }, + { + "epoch": 1.5202373672170506, + "grad_norm": 0.9704498648643494, + "learning_rate": 9.865885338005196e-05, + "loss": 2.5605, + "step": 16780 + }, + { + "epoch": 1.5203279653914974, + "grad_norm": 0.8568739295005798, + "learning_rate": 9.86528121790612e-05, + "loss": 2.9662, + "step": 16781 + }, + { + "epoch": 1.5204185635659442, + "grad_norm": 0.8823909163475037, + "learning_rate": 9.864677097807045e-05, + "loss": 2.5718, + "step": 16782 + }, + { + "epoch": 1.520509161740391, + "grad_norm": 0.8761699199676514, + "learning_rate": 9.864072977707969e-05, + "loss": 2.6897, + "step": 16783 + }, + { + "epoch": 1.5205997599148378, + "grad_norm": 0.8784447312355042, + "learning_rate": 9.863468857608893e-05, + "loss": 2.7313, + "step": 16784 + }, + { + "epoch": 1.5206903580892845, + "grad_norm": 0.9695220589637756, + "learning_rate": 9.862864737509817e-05, + "loss": 2.7361, + "step": 16785 + }, + { + "epoch": 1.5207809562637313, + "grad_norm": 0.9612027406692505, + "learning_rate": 9.862260617410742e-05, + "loss": 2.8154, + "step": 16786 + }, + { + "epoch": 1.5208715544381781, + "grad_norm": 0.9157556295394897, + "learning_rate": 9.861656497311666e-05, + "loss": 2.9248, + "step": 16787 + }, + { + "epoch": 1.520962152612625, + "grad_norm": 0.9145557284355164, + "learning_rate": 9.861052377212591e-05, + "loss": 2.7376, + "step": 16788 + }, + { + "epoch": 1.5210527507870717, + "grad_norm": 0.8669790625572205, + "learning_rate": 9.860448257113514e-05, + "loss": 2.7548, + "step": 16789 + }, + { + "epoch": 1.5211433489615183, + "grad_norm": 0.9573755860328674, + "learning_rate": 9.859844137014439e-05, + "loss": 2.9019, + "step": 16790 + }, + { + "epoch": 1.5212339471359653, + "grad_norm": 1.0002273321151733, + "learning_rate": 9.859240016915363e-05, + "loss": 2.5833, + "step": 16791 + }, + { + "epoch": 1.5213245453104118, + "grad_norm": 0.9630582332611084, + "learning_rate": 9.858635896816287e-05, + "loss": 2.4535, + "step": 16792 + }, + { + "epoch": 1.5214151434848588, + "grad_norm": 0.8990309238433838, + "learning_rate": 9.858031776717211e-05, + "loss": 2.6269, + "step": 16793 + }, + { + "epoch": 1.5215057416593054, + "grad_norm": 0.9441512227058411, + "learning_rate": 9.857427656618136e-05, + "loss": 2.7208, + "step": 16794 + }, + { + "epoch": 1.5215963398337524, + "grad_norm": 0.9517843127250671, + "learning_rate": 9.85682353651906e-05, + "loss": 3.1757, + "step": 16795 + }, + { + "epoch": 1.521686938008199, + "grad_norm": 0.9428243041038513, + "learning_rate": 9.856219416419985e-05, + "loss": 2.4387, + "step": 16796 + }, + { + "epoch": 1.521777536182646, + "grad_norm": 0.9048566222190857, + "learning_rate": 9.85561529632091e-05, + "loss": 2.8204, + "step": 16797 + }, + { + "epoch": 1.5218681343570926, + "grad_norm": 0.9139631986618042, + "learning_rate": 9.855011176221833e-05, + "loss": 2.5827, + "step": 16798 + }, + { + "epoch": 1.5219587325315396, + "grad_norm": 0.9730714559555054, + "learning_rate": 9.854407056122758e-05, + "loss": 2.6941, + "step": 16799 + }, + { + "epoch": 1.5220493307059861, + "grad_norm": 0.9058308601379395, + "learning_rate": 9.853802936023681e-05, + "loss": 2.6686, + "step": 16800 + }, + { + "epoch": 1.5221399288804331, + "grad_norm": 0.7958921790122986, + "learning_rate": 9.853198815924606e-05, + "loss": 2.0994, + "step": 16801 + }, + { + "epoch": 1.5222305270548797, + "grad_norm": 0.9460868835449219, + "learning_rate": 9.852594695825531e-05, + "loss": 2.6198, + "step": 16802 + }, + { + "epoch": 1.5223211252293267, + "grad_norm": 0.7736759781837463, + "learning_rate": 9.851990575726456e-05, + "loss": 2.0752, + "step": 16803 + }, + { + "epoch": 1.5224117234037733, + "grad_norm": 0.859073281288147, + "learning_rate": 9.851386455627379e-05, + "loss": 2.8224, + "step": 16804 + }, + { + "epoch": 1.5225023215782203, + "grad_norm": 1.1760649681091309, + "learning_rate": 9.850782335528304e-05, + "loss": 2.5895, + "step": 16805 + }, + { + "epoch": 1.5225929197526669, + "grad_norm": 0.9757549166679382, + "learning_rate": 9.850178215429227e-05, + "loss": 2.9239, + "step": 16806 + }, + { + "epoch": 1.5226835179271139, + "grad_norm": 0.8243734240531921, + "learning_rate": 9.849574095330152e-05, + "loss": 2.2664, + "step": 16807 + }, + { + "epoch": 1.5227741161015604, + "grad_norm": 0.9239568710327148, + "learning_rate": 9.848969975231075e-05, + "loss": 2.6928, + "step": 16808 + }, + { + "epoch": 1.5228647142760074, + "grad_norm": 0.8697055578231812, + "learning_rate": 9.848365855132e-05, + "loss": 2.5938, + "step": 16809 + }, + { + "epoch": 1.522955312450454, + "grad_norm": 0.9566666483879089, + "learning_rate": 9.847761735032925e-05, + "loss": 2.5499, + "step": 16810 + }, + { + "epoch": 1.523045910624901, + "grad_norm": 0.911773145198822, + "learning_rate": 9.84715761493385e-05, + "loss": 2.6193, + "step": 16811 + }, + { + "epoch": 1.5231365087993476, + "grad_norm": 0.8796859979629517, + "learning_rate": 9.846553494834773e-05, + "loss": 2.701, + "step": 16812 + }, + { + "epoch": 1.5232271069737946, + "grad_norm": 0.9121832847595215, + "learning_rate": 9.845949374735698e-05, + "loss": 2.5637, + "step": 16813 + }, + { + "epoch": 1.5233177051482412, + "grad_norm": 0.8471085429191589, + "learning_rate": 9.845345254636623e-05, + "loss": 2.4594, + "step": 16814 + }, + { + "epoch": 1.5234083033226882, + "grad_norm": 0.9652700424194336, + "learning_rate": 9.844741134537546e-05, + "loss": 2.7733, + "step": 16815 + }, + { + "epoch": 1.5234989014971347, + "grad_norm": 0.9096866846084595, + "learning_rate": 9.844137014438471e-05, + "loss": 2.6931, + "step": 16816 + }, + { + "epoch": 1.5235894996715817, + "grad_norm": 0.9031850695610046, + "learning_rate": 9.843532894339396e-05, + "loss": 2.6419, + "step": 16817 + }, + { + "epoch": 1.5236800978460283, + "grad_norm": 0.8729600310325623, + "learning_rate": 9.84292877424032e-05, + "loss": 2.669, + "step": 16818 + }, + { + "epoch": 1.5237706960204753, + "grad_norm": 0.9436779618263245, + "learning_rate": 9.842324654141244e-05, + "loss": 2.6992, + "step": 16819 + }, + { + "epoch": 1.523861294194922, + "grad_norm": 0.9583137035369873, + "learning_rate": 9.841720534042168e-05, + "loss": 2.6573, + "step": 16820 + }, + { + "epoch": 1.523951892369369, + "grad_norm": 0.922698974609375, + "learning_rate": 9.841116413943092e-05, + "loss": 2.7049, + "step": 16821 + }, + { + "epoch": 1.5240424905438155, + "grad_norm": 0.8681346774101257, + "learning_rate": 9.840512293844017e-05, + "loss": 2.477, + "step": 16822 + }, + { + "epoch": 1.5241330887182625, + "grad_norm": 0.7599506974220276, + "learning_rate": 9.83990817374494e-05, + "loss": 1.9906, + "step": 16823 + }, + { + "epoch": 1.524223686892709, + "grad_norm": 0.8906880021095276, + "learning_rate": 9.839304053645865e-05, + "loss": 2.5142, + "step": 16824 + }, + { + "epoch": 1.524314285067156, + "grad_norm": 0.9223815202713013, + "learning_rate": 9.83869993354679e-05, + "loss": 2.6314, + "step": 16825 + }, + { + "epoch": 1.5244048832416026, + "grad_norm": 0.9130087494850159, + "learning_rate": 9.838095813447714e-05, + "loss": 2.7334, + "step": 16826 + }, + { + "epoch": 1.5244954814160496, + "grad_norm": 0.9518387317657471, + "learning_rate": 9.837491693348638e-05, + "loss": 2.3561, + "step": 16827 + }, + { + "epoch": 1.5245860795904962, + "grad_norm": 0.8803222179412842, + "learning_rate": 9.836887573249563e-05, + "loss": 3.038, + "step": 16828 + }, + { + "epoch": 1.5246766777649432, + "grad_norm": 0.9251043200492859, + "learning_rate": 9.836283453150487e-05, + "loss": 2.5477, + "step": 16829 + }, + { + "epoch": 1.5247672759393898, + "grad_norm": 0.9837248921394348, + "learning_rate": 9.83567933305141e-05, + "loss": 2.7997, + "step": 16830 + }, + { + "epoch": 1.5248578741138366, + "grad_norm": 0.911780834197998, + "learning_rate": 9.835075212952335e-05, + "loss": 2.6946, + "step": 16831 + }, + { + "epoch": 1.5249484722882833, + "grad_norm": 0.9700822234153748, + "learning_rate": 9.83447109285326e-05, + "loss": 2.6808, + "step": 16832 + }, + { + "epoch": 1.5250390704627301, + "grad_norm": 0.9764134287834167, + "learning_rate": 9.833866972754185e-05, + "loss": 2.9084, + "step": 16833 + }, + { + "epoch": 1.525129668637177, + "grad_norm": 1.0174660682678223, + "learning_rate": 9.833262852655108e-05, + "loss": 2.6588, + "step": 16834 + }, + { + "epoch": 1.5252202668116237, + "grad_norm": 1.0204633474349976, + "learning_rate": 9.832658732556033e-05, + "loss": 2.6856, + "step": 16835 + }, + { + "epoch": 1.5253108649860705, + "grad_norm": 0.8584938645362854, + "learning_rate": 9.832054612456957e-05, + "loss": 2.4914, + "step": 16836 + }, + { + "epoch": 1.5254014631605173, + "grad_norm": 0.9301232695579529, + "learning_rate": 9.831450492357881e-05, + "loss": 2.7399, + "step": 16837 + }, + { + "epoch": 1.525492061334964, + "grad_norm": 0.9146918058395386, + "learning_rate": 9.830846372258805e-05, + "loss": 2.6761, + "step": 16838 + }, + { + "epoch": 1.5255826595094109, + "grad_norm": 0.8654019236564636, + "learning_rate": 9.83024225215973e-05, + "loss": 2.7788, + "step": 16839 + }, + { + "epoch": 1.5256732576838576, + "grad_norm": 0.8976036310195923, + "learning_rate": 9.829638132060654e-05, + "loss": 2.9364, + "step": 16840 + }, + { + "epoch": 1.5257638558583044, + "grad_norm": 0.9410428404808044, + "learning_rate": 9.829034011961579e-05, + "loss": 2.6963, + "step": 16841 + }, + { + "epoch": 1.5258544540327512, + "grad_norm": 0.8991221785545349, + "learning_rate": 9.828429891862502e-05, + "loss": 2.4795, + "step": 16842 + }, + { + "epoch": 1.525945052207198, + "grad_norm": 0.9156516790390015, + "learning_rate": 9.827825771763427e-05, + "loss": 2.7679, + "step": 16843 + }, + { + "epoch": 1.5260356503816448, + "grad_norm": 0.8680191040039062, + "learning_rate": 9.82722165166435e-05, + "loss": 2.5344, + "step": 16844 + }, + { + "epoch": 1.5261262485560916, + "grad_norm": 1.0526612997055054, + "learning_rate": 9.826617531565275e-05, + "loss": 2.5569, + "step": 16845 + }, + { + "epoch": 1.5262168467305384, + "grad_norm": 0.9510833621025085, + "learning_rate": 9.8260134114662e-05, + "loss": 2.692, + "step": 16846 + }, + { + "epoch": 1.5263074449049852, + "grad_norm": 0.895352303981781, + "learning_rate": 9.825409291367125e-05, + "loss": 2.7128, + "step": 16847 + }, + { + "epoch": 1.526398043079432, + "grad_norm": 0.9344759583473206, + "learning_rate": 9.824805171268048e-05, + "loss": 2.7828, + "step": 16848 + }, + { + "epoch": 1.5264886412538787, + "grad_norm": 0.9817216396331787, + "learning_rate": 9.824201051168973e-05, + "loss": 2.7763, + "step": 16849 + }, + { + "epoch": 1.5265792394283255, + "grad_norm": 0.877785325050354, + "learning_rate": 9.823596931069898e-05, + "loss": 2.8187, + "step": 16850 + }, + { + "epoch": 1.5266698376027723, + "grad_norm": 0.9680641293525696, + "learning_rate": 9.822992810970821e-05, + "loss": 2.6861, + "step": 16851 + }, + { + "epoch": 1.526760435777219, + "grad_norm": 0.960779070854187, + "learning_rate": 9.822388690871746e-05, + "loss": 2.8194, + "step": 16852 + }, + { + "epoch": 1.5268510339516659, + "grad_norm": 1.0465651750564575, + "learning_rate": 9.82178457077267e-05, + "loss": 2.6963, + "step": 16853 + }, + { + "epoch": 1.5269416321261127, + "grad_norm": 0.9237720966339111, + "learning_rate": 9.821180450673595e-05, + "loss": 2.5877, + "step": 16854 + }, + { + "epoch": 1.5270322303005595, + "grad_norm": 0.9271373748779297, + "learning_rate": 9.820576330574519e-05, + "loss": 2.61, + "step": 16855 + }, + { + "epoch": 1.5271228284750062, + "grad_norm": 0.9084030985832214, + "learning_rate": 9.819972210475444e-05, + "loss": 2.7724, + "step": 16856 + }, + { + "epoch": 1.527213426649453, + "grad_norm": 0.8558924794197083, + "learning_rate": 9.819368090376367e-05, + "loss": 2.0252, + "step": 16857 + }, + { + "epoch": 1.5273040248238998, + "grad_norm": 0.909878671169281, + "learning_rate": 9.818763970277292e-05, + "loss": 2.7978, + "step": 16858 + }, + { + "epoch": 1.5273946229983466, + "grad_norm": 0.8600506782531738, + "learning_rate": 9.818159850178215e-05, + "loss": 2.8316, + "step": 16859 + }, + { + "epoch": 1.5274852211727934, + "grad_norm": 0.7449944019317627, + "learning_rate": 9.81755573007914e-05, + "loss": 2.107, + "step": 16860 + }, + { + "epoch": 1.5275758193472402, + "grad_norm": 0.9849152565002441, + "learning_rate": 9.816951609980065e-05, + "loss": 2.8381, + "step": 16861 + }, + { + "epoch": 1.527666417521687, + "grad_norm": 0.8717986345291138, + "learning_rate": 9.81634748988099e-05, + "loss": 2.6378, + "step": 16862 + }, + { + "epoch": 1.5277570156961338, + "grad_norm": 0.9166927337646484, + "learning_rate": 9.815743369781913e-05, + "loss": 2.8059, + "step": 16863 + }, + { + "epoch": 1.5278476138705805, + "grad_norm": 0.9043338894844055, + "learning_rate": 9.815139249682838e-05, + "loss": 2.5208, + "step": 16864 + }, + { + "epoch": 1.5279382120450273, + "grad_norm": 0.8935568332672119, + "learning_rate": 9.814535129583762e-05, + "loss": 2.6484, + "step": 16865 + }, + { + "epoch": 1.5280288102194741, + "grad_norm": 0.9135342836380005, + "learning_rate": 9.813931009484686e-05, + "loss": 2.5609, + "step": 16866 + }, + { + "epoch": 1.528119408393921, + "grad_norm": 0.8804158568382263, + "learning_rate": 9.81332688938561e-05, + "loss": 2.6889, + "step": 16867 + }, + { + "epoch": 1.5282100065683677, + "grad_norm": 0.9058445692062378, + "learning_rate": 9.812722769286534e-05, + "loss": 2.7705, + "step": 16868 + }, + { + "epoch": 1.5283006047428145, + "grad_norm": 0.9411551356315613, + "learning_rate": 9.81211864918746e-05, + "loss": 2.5644, + "step": 16869 + }, + { + "epoch": 1.5283912029172613, + "grad_norm": 0.9615513682365417, + "learning_rate": 9.811514529088383e-05, + "loss": 2.6861, + "step": 16870 + }, + { + "epoch": 1.5284818010917078, + "grad_norm": 1.0097687244415283, + "learning_rate": 9.810910408989308e-05, + "loss": 2.9356, + "step": 16871 + }, + { + "epoch": 1.5285723992661548, + "grad_norm": 0.9224622845649719, + "learning_rate": 9.810306288890232e-05, + "loss": 2.8283, + "step": 16872 + }, + { + "epoch": 1.5286629974406014, + "grad_norm": 0.9308323860168457, + "learning_rate": 9.809702168791156e-05, + "loss": 2.7568, + "step": 16873 + }, + { + "epoch": 1.5287535956150484, + "grad_norm": 0.932563841342926, + "learning_rate": 9.80909804869208e-05, + "loss": 3.1089, + "step": 16874 + }, + { + "epoch": 1.528844193789495, + "grad_norm": 0.8812679052352905, + "learning_rate": 9.808493928593005e-05, + "loss": 2.687, + "step": 16875 + }, + { + "epoch": 1.528934791963942, + "grad_norm": 0.9320706129074097, + "learning_rate": 9.807889808493928e-05, + "loss": 2.9029, + "step": 16876 + }, + { + "epoch": 1.5290253901383886, + "grad_norm": 0.8024165034294128, + "learning_rate": 9.807285688394854e-05, + "loss": 2.1054, + "step": 16877 + }, + { + "epoch": 1.5291159883128356, + "grad_norm": 0.922347903251648, + "learning_rate": 9.806681568295777e-05, + "loss": 2.6986, + "step": 16878 + }, + { + "epoch": 1.5292065864872821, + "grad_norm": 0.9526455402374268, + "learning_rate": 9.806077448196702e-05, + "loss": 2.6677, + "step": 16879 + }, + { + "epoch": 1.5292971846617291, + "grad_norm": 0.7871290445327759, + "learning_rate": 9.805473328097626e-05, + "loss": 2.0092, + "step": 16880 + }, + { + "epoch": 1.5293877828361757, + "grad_norm": 0.8863994479179382, + "learning_rate": 9.80486920799855e-05, + "loss": 2.5477, + "step": 16881 + }, + { + "epoch": 1.5294783810106227, + "grad_norm": 0.8946748375892639, + "learning_rate": 9.804265087899475e-05, + "loss": 2.9347, + "step": 16882 + }, + { + "epoch": 1.5295689791850693, + "grad_norm": 0.922580897808075, + "learning_rate": 9.803660967800399e-05, + "loss": 2.7255, + "step": 16883 + }, + { + "epoch": 1.5296595773595163, + "grad_norm": 0.9451807141304016, + "learning_rate": 9.803056847701325e-05, + "loss": 2.8607, + "step": 16884 + }, + { + "epoch": 1.5297501755339629, + "grad_norm": 0.9333718419075012, + "learning_rate": 9.802452727602248e-05, + "loss": 2.7006, + "step": 16885 + }, + { + "epoch": 1.5298407737084099, + "grad_norm": 0.9252846837043762, + "learning_rate": 9.801848607503173e-05, + "loss": 2.5873, + "step": 16886 + }, + { + "epoch": 1.5299313718828564, + "grad_norm": 0.89591383934021, + "learning_rate": 9.801244487404096e-05, + "loss": 2.5877, + "step": 16887 + }, + { + "epoch": 1.5300219700573034, + "grad_norm": 0.8925186991691589, + "learning_rate": 9.800640367305021e-05, + "loss": 2.8111, + "step": 16888 + }, + { + "epoch": 1.53011256823175, + "grad_norm": 0.8027336597442627, + "learning_rate": 9.800036247205944e-05, + "loss": 2.3069, + "step": 16889 + }, + { + "epoch": 1.530203166406197, + "grad_norm": 0.9608847498893738, + "learning_rate": 9.799432127106869e-05, + "loss": 2.8725, + "step": 16890 + }, + { + "epoch": 1.5302937645806436, + "grad_norm": 1.0260668992996216, + "learning_rate": 9.798828007007793e-05, + "loss": 2.8071, + "step": 16891 + }, + { + "epoch": 1.5303843627550906, + "grad_norm": 0.950680136680603, + "learning_rate": 9.798223886908719e-05, + "loss": 2.7118, + "step": 16892 + }, + { + "epoch": 1.5304749609295372, + "grad_norm": 0.8494809865951538, + "learning_rate": 9.797619766809642e-05, + "loss": 2.6163, + "step": 16893 + }, + { + "epoch": 1.5305655591039842, + "grad_norm": 0.8277015089988708, + "learning_rate": 9.797015646710567e-05, + "loss": 2.4671, + "step": 16894 + }, + { + "epoch": 1.5306561572784307, + "grad_norm": 0.8997530341148376, + "learning_rate": 9.79641152661149e-05, + "loss": 2.806, + "step": 16895 + }, + { + "epoch": 1.5307467554528777, + "grad_norm": 0.9472346305847168, + "learning_rate": 9.795807406512415e-05, + "loss": 2.8063, + "step": 16896 + }, + { + "epoch": 1.5308373536273243, + "grad_norm": 0.9244300127029419, + "learning_rate": 9.79520328641334e-05, + "loss": 2.8812, + "step": 16897 + }, + { + "epoch": 1.5309279518017713, + "grad_norm": 1.011687159538269, + "learning_rate": 9.794599166314263e-05, + "loss": 2.3802, + "step": 16898 + }, + { + "epoch": 1.531018549976218, + "grad_norm": 0.9697418212890625, + "learning_rate": 9.793995046215188e-05, + "loss": 2.9781, + "step": 16899 + }, + { + "epoch": 1.531109148150665, + "grad_norm": 0.9659990668296814, + "learning_rate": 9.793390926116113e-05, + "loss": 2.7352, + "step": 16900 + }, + { + "epoch": 1.5311997463251115, + "grad_norm": 0.8006253838539124, + "learning_rate": 9.792786806017038e-05, + "loss": 2.0931, + "step": 16901 + }, + { + "epoch": 1.5312903444995585, + "grad_norm": 0.8643366098403931, + "learning_rate": 9.792182685917961e-05, + "loss": 2.4915, + "step": 16902 + }, + { + "epoch": 1.531380942674005, + "grad_norm": 0.9133711457252502, + "learning_rate": 9.791578565818886e-05, + "loss": 2.9516, + "step": 16903 + }, + { + "epoch": 1.531471540848452, + "grad_norm": 0.8240745067596436, + "learning_rate": 9.790974445719809e-05, + "loss": 2.8086, + "step": 16904 + }, + { + "epoch": 1.5315621390228986, + "grad_norm": 0.8182093501091003, + "learning_rate": 9.790370325620734e-05, + "loss": 2.2544, + "step": 16905 + }, + { + "epoch": 1.5316527371973456, + "grad_norm": 0.859754741191864, + "learning_rate": 9.789766205521657e-05, + "loss": 2.514, + "step": 16906 + }, + { + "epoch": 1.5317433353717922, + "grad_norm": 0.9030807018280029, + "learning_rate": 9.789162085422583e-05, + "loss": 2.7329, + "step": 16907 + }, + { + "epoch": 1.5318339335462392, + "grad_norm": 0.9357253909111023, + "learning_rate": 9.788557965323507e-05, + "loss": 2.6732, + "step": 16908 + }, + { + "epoch": 1.5319245317206858, + "grad_norm": 0.9044972062110901, + "learning_rate": 9.787953845224432e-05, + "loss": 2.8137, + "step": 16909 + }, + { + "epoch": 1.5320151298951328, + "grad_norm": 1.0066031217575073, + "learning_rate": 9.787349725125355e-05, + "loss": 2.8951, + "step": 16910 + }, + { + "epoch": 1.5321057280695793, + "grad_norm": 0.7949905395507812, + "learning_rate": 9.78674560502628e-05, + "loss": 1.9192, + "step": 16911 + }, + { + "epoch": 1.5321963262440261, + "grad_norm": 0.9327507615089417, + "learning_rate": 9.786141484927203e-05, + "loss": 2.9262, + "step": 16912 + }, + { + "epoch": 1.532286924418473, + "grad_norm": 0.9060817956924438, + "learning_rate": 9.785537364828128e-05, + "loss": 2.5122, + "step": 16913 + }, + { + "epoch": 1.5323775225929197, + "grad_norm": 0.9024823307991028, + "learning_rate": 9.784933244729053e-05, + "loss": 2.7253, + "step": 16914 + }, + { + "epoch": 1.5324681207673665, + "grad_norm": 0.9016098976135254, + "learning_rate": 9.784329124629977e-05, + "loss": 2.6295, + "step": 16915 + }, + { + "epoch": 1.5325587189418133, + "grad_norm": 0.9785366654396057, + "learning_rate": 9.783725004530902e-05, + "loss": 2.7849, + "step": 16916 + }, + { + "epoch": 1.53264931711626, + "grad_norm": 0.8754266500473022, + "learning_rate": 9.783120884431826e-05, + "loss": 2.7587, + "step": 16917 + }, + { + "epoch": 1.5327399152907069, + "grad_norm": 0.9336187243461609, + "learning_rate": 9.78251676433275e-05, + "loss": 3.161, + "step": 16918 + }, + { + "epoch": 1.5328305134651536, + "grad_norm": 0.8733602166175842, + "learning_rate": 9.781912644233674e-05, + "loss": 2.1508, + "step": 16919 + }, + { + "epoch": 1.5329211116396004, + "grad_norm": 0.8879563808441162, + "learning_rate": 9.781308524134598e-05, + "loss": 2.6928, + "step": 16920 + }, + { + "epoch": 1.5330117098140472, + "grad_norm": 1.1186325550079346, + "learning_rate": 9.780704404035522e-05, + "loss": 2.6078, + "step": 16921 + }, + { + "epoch": 1.533102307988494, + "grad_norm": 0.8795099258422852, + "learning_rate": 9.780100283936448e-05, + "loss": 2.8089, + "step": 16922 + }, + { + "epoch": 1.5331929061629408, + "grad_norm": 0.9322019219398499, + "learning_rate": 9.779496163837371e-05, + "loss": 2.6775, + "step": 16923 + }, + { + "epoch": 1.5332835043373876, + "grad_norm": 0.7884368896484375, + "learning_rate": 9.778892043738296e-05, + "loss": 2.0524, + "step": 16924 + }, + { + "epoch": 1.5333741025118344, + "grad_norm": 0.8920405507087708, + "learning_rate": 9.77828792363922e-05, + "loss": 2.6657, + "step": 16925 + }, + { + "epoch": 1.5334647006862812, + "grad_norm": 0.9827669262886047, + "learning_rate": 9.777683803540144e-05, + "loss": 2.5454, + "step": 16926 + }, + { + "epoch": 1.533555298860728, + "grad_norm": 0.9655629992485046, + "learning_rate": 9.777079683441068e-05, + "loss": 3.1873, + "step": 16927 + }, + { + "epoch": 1.5336458970351747, + "grad_norm": 0.9303285479545593, + "learning_rate": 9.776475563341992e-05, + "loss": 2.5842, + "step": 16928 + }, + { + "epoch": 1.5337364952096215, + "grad_norm": 0.9175190329551697, + "learning_rate": 9.775871443242917e-05, + "loss": 2.6889, + "step": 16929 + }, + { + "epoch": 1.5338270933840683, + "grad_norm": 1.0364006757736206, + "learning_rate": 9.775267323143842e-05, + "loss": 2.6079, + "step": 16930 + }, + { + "epoch": 1.533917691558515, + "grad_norm": 0.8789488673210144, + "learning_rate": 9.774663203044765e-05, + "loss": 2.4726, + "step": 16931 + }, + { + "epoch": 1.5340082897329619, + "grad_norm": 0.8540599942207336, + "learning_rate": 9.77405908294569e-05, + "loss": 2.6543, + "step": 16932 + }, + { + "epoch": 1.5340988879074087, + "grad_norm": 0.8590208888053894, + "learning_rate": 9.773454962846615e-05, + "loss": 2.5952, + "step": 16933 + }, + { + "epoch": 1.5341894860818555, + "grad_norm": 0.8987787961959839, + "learning_rate": 9.772850842747538e-05, + "loss": 2.6947, + "step": 16934 + }, + { + "epoch": 1.5342800842563022, + "grad_norm": 0.9178187251091003, + "learning_rate": 9.772246722648463e-05, + "loss": 2.8703, + "step": 16935 + }, + { + "epoch": 1.534370682430749, + "grad_norm": 0.970901608467102, + "learning_rate": 9.771642602549387e-05, + "loss": 2.7807, + "step": 16936 + }, + { + "epoch": 1.5344612806051958, + "grad_norm": 0.8857434391975403, + "learning_rate": 9.771038482450313e-05, + "loss": 2.566, + "step": 16937 + }, + { + "epoch": 1.5345518787796426, + "grad_norm": 0.8191441297531128, + "learning_rate": 9.770434362351236e-05, + "loss": 1.92, + "step": 16938 + }, + { + "epoch": 1.5346424769540894, + "grad_norm": 0.8711693286895752, + "learning_rate": 9.769830242252161e-05, + "loss": 2.527, + "step": 16939 + }, + { + "epoch": 1.5347330751285362, + "grad_norm": 0.9097295999526978, + "learning_rate": 9.769226122153084e-05, + "loss": 2.9699, + "step": 16940 + }, + { + "epoch": 1.534823673302983, + "grad_norm": 0.8771814107894897, + "learning_rate": 9.768622002054009e-05, + "loss": 2.7922, + "step": 16941 + }, + { + "epoch": 1.5349142714774298, + "grad_norm": 0.9026029706001282, + "learning_rate": 9.768017881954932e-05, + "loss": 2.7223, + "step": 16942 + }, + { + "epoch": 1.5350048696518765, + "grad_norm": 0.8797240257263184, + "learning_rate": 9.767413761855857e-05, + "loss": 2.599, + "step": 16943 + }, + { + "epoch": 1.5350954678263233, + "grad_norm": 0.8571259379386902, + "learning_rate": 9.766809641756782e-05, + "loss": 2.6151, + "step": 16944 + }, + { + "epoch": 1.5351860660007701, + "grad_norm": 0.8719232082366943, + "learning_rate": 9.766205521657707e-05, + "loss": 2.6766, + "step": 16945 + }, + { + "epoch": 1.535276664175217, + "grad_norm": 0.8028589487075806, + "learning_rate": 9.76560140155863e-05, + "loss": 2.1161, + "step": 16946 + }, + { + "epoch": 1.5353672623496637, + "grad_norm": 0.830539882183075, + "learning_rate": 9.764997281459555e-05, + "loss": 1.9788, + "step": 16947 + }, + { + "epoch": 1.5354578605241105, + "grad_norm": 0.909980058670044, + "learning_rate": 9.764393161360478e-05, + "loss": 2.8783, + "step": 16948 + }, + { + "epoch": 1.5355484586985573, + "grad_norm": 0.7294101119041443, + "learning_rate": 9.763789041261403e-05, + "loss": 1.9152, + "step": 16949 + }, + { + "epoch": 1.535639056873004, + "grad_norm": 0.9542833566665649, + "learning_rate": 9.763184921162328e-05, + "loss": 2.5917, + "step": 16950 + }, + { + "epoch": 1.5357296550474508, + "grad_norm": 0.9051535725593567, + "learning_rate": 9.762580801063251e-05, + "loss": 2.9482, + "step": 16951 + }, + { + "epoch": 1.5358202532218974, + "grad_norm": 0.9156363010406494, + "learning_rate": 9.761976680964177e-05, + "loss": 2.6231, + "step": 16952 + }, + { + "epoch": 1.5359108513963444, + "grad_norm": 0.9955580234527588, + "learning_rate": 9.7613725608651e-05, + "loss": 2.6803, + "step": 16953 + }, + { + "epoch": 1.536001449570791, + "grad_norm": 0.9041898250579834, + "learning_rate": 9.760768440766025e-05, + "loss": 2.8109, + "step": 16954 + }, + { + "epoch": 1.536092047745238, + "grad_norm": 0.9240245819091797, + "learning_rate": 9.760164320666949e-05, + "loss": 2.9076, + "step": 16955 + }, + { + "epoch": 1.5361826459196846, + "grad_norm": 0.8452627658843994, + "learning_rate": 9.759560200567874e-05, + "loss": 2.7283, + "step": 16956 + }, + { + "epoch": 1.5362732440941316, + "grad_norm": 0.8721271753311157, + "learning_rate": 9.758956080468797e-05, + "loss": 2.5461, + "step": 16957 + }, + { + "epoch": 1.5363638422685781, + "grad_norm": 0.9187932014465332, + "learning_rate": 9.758351960369722e-05, + "loss": 2.7361, + "step": 16958 + }, + { + "epoch": 1.5364544404430251, + "grad_norm": 0.8962506651878357, + "learning_rate": 9.757747840270647e-05, + "loss": 2.7174, + "step": 16959 + }, + { + "epoch": 1.5365450386174717, + "grad_norm": 0.8970420360565186, + "learning_rate": 9.757143720171571e-05, + "loss": 2.8146, + "step": 16960 + }, + { + "epoch": 1.5366356367919187, + "grad_norm": 0.9279984831809998, + "learning_rate": 9.756539600072495e-05, + "loss": 2.6053, + "step": 16961 + }, + { + "epoch": 1.5367262349663653, + "grad_norm": 0.9174923300743103, + "learning_rate": 9.75593547997342e-05, + "loss": 2.7287, + "step": 16962 + }, + { + "epoch": 1.5368168331408123, + "grad_norm": 0.8112099170684814, + "learning_rate": 9.755331359874343e-05, + "loss": 2.1136, + "step": 16963 + }, + { + "epoch": 1.5369074313152589, + "grad_norm": 0.8764845728874207, + "learning_rate": 9.754727239775268e-05, + "loss": 2.5043, + "step": 16964 + }, + { + "epoch": 1.5369980294897059, + "grad_norm": 0.8849969506263733, + "learning_rate": 9.754123119676192e-05, + "loss": 2.824, + "step": 16965 + }, + { + "epoch": 1.5370886276641524, + "grad_norm": 0.8881600499153137, + "learning_rate": 9.753518999577116e-05, + "loss": 2.8694, + "step": 16966 + }, + { + "epoch": 1.5371792258385995, + "grad_norm": 0.8916904926300049, + "learning_rate": 9.75291487947804e-05, + "loss": 2.9373, + "step": 16967 + }, + { + "epoch": 1.537269824013046, + "grad_norm": 0.9860527515411377, + "learning_rate": 9.752310759378965e-05, + "loss": 2.8047, + "step": 16968 + }, + { + "epoch": 1.537360422187493, + "grad_norm": 0.9044759273529053, + "learning_rate": 9.75170663927989e-05, + "loss": 2.6952, + "step": 16969 + }, + { + "epoch": 1.5374510203619396, + "grad_norm": 0.9519761204719543, + "learning_rate": 9.751102519180813e-05, + "loss": 3.1056, + "step": 16970 + }, + { + "epoch": 1.5375416185363866, + "grad_norm": 0.8741687536239624, + "learning_rate": 9.750498399081738e-05, + "loss": 2.8027, + "step": 16971 + }, + { + "epoch": 1.5376322167108332, + "grad_norm": 0.8748754262924194, + "learning_rate": 9.749894278982662e-05, + "loss": 2.6891, + "step": 16972 + }, + { + "epoch": 1.5377228148852802, + "grad_norm": 0.8991838693618774, + "learning_rate": 9.749290158883586e-05, + "loss": 2.7481, + "step": 16973 + }, + { + "epoch": 1.5378134130597267, + "grad_norm": 0.8039204478263855, + "learning_rate": 9.748686038784511e-05, + "loss": 2.4745, + "step": 16974 + }, + { + "epoch": 1.5379040112341738, + "grad_norm": 0.9329891800880432, + "learning_rate": 9.748081918685436e-05, + "loss": 2.6145, + "step": 16975 + }, + { + "epoch": 1.5379946094086203, + "grad_norm": 1.0540255308151245, + "learning_rate": 9.747477798586359e-05, + "loss": 2.7454, + "step": 16976 + }, + { + "epoch": 1.5380852075830673, + "grad_norm": 0.9030938744544983, + "learning_rate": 9.746873678487284e-05, + "loss": 2.6819, + "step": 16977 + }, + { + "epoch": 1.538175805757514, + "grad_norm": 0.8886061310768127, + "learning_rate": 9.746269558388207e-05, + "loss": 2.6076, + "step": 16978 + }, + { + "epoch": 1.538266403931961, + "grad_norm": 0.9588932991027832, + "learning_rate": 9.745665438289132e-05, + "loss": 2.1749, + "step": 16979 + }, + { + "epoch": 1.5383570021064075, + "grad_norm": 0.8645809292793274, + "learning_rate": 9.745061318190056e-05, + "loss": 2.5831, + "step": 16980 + }, + { + "epoch": 1.5384476002808545, + "grad_norm": 0.8763397932052612, + "learning_rate": 9.74445719809098e-05, + "loss": 2.6694, + "step": 16981 + }, + { + "epoch": 1.538538198455301, + "grad_norm": 0.8628765344619751, + "learning_rate": 9.743853077991905e-05, + "loss": 2.481, + "step": 16982 + }, + { + "epoch": 1.538628796629748, + "grad_norm": 0.8416115641593933, + "learning_rate": 9.74324895789283e-05, + "loss": 2.4578, + "step": 16983 + }, + { + "epoch": 1.5387193948041946, + "grad_norm": 0.9965024590492249, + "learning_rate": 9.742644837793755e-05, + "loss": 2.583, + "step": 16984 + }, + { + "epoch": 1.5388099929786416, + "grad_norm": 0.9219428896903992, + "learning_rate": 9.742040717694678e-05, + "loss": 2.8361, + "step": 16985 + }, + { + "epoch": 1.5389005911530882, + "grad_norm": 0.8605660796165466, + "learning_rate": 9.741436597595603e-05, + "loss": 2.5345, + "step": 16986 + }, + { + "epoch": 1.5389911893275352, + "grad_norm": 0.9235706925392151, + "learning_rate": 9.740832477496526e-05, + "loss": 2.8809, + "step": 16987 + }, + { + "epoch": 1.5390817875019818, + "grad_norm": 0.8344838619232178, + "learning_rate": 9.740228357397451e-05, + "loss": 2.4683, + "step": 16988 + }, + { + "epoch": 1.5391723856764288, + "grad_norm": 0.770371675491333, + "learning_rate": 9.739624237298376e-05, + "loss": 1.8908, + "step": 16989 + }, + { + "epoch": 1.5392629838508753, + "grad_norm": 0.8798065781593323, + "learning_rate": 9.7390201171993e-05, + "loss": 3.0085, + "step": 16990 + }, + { + "epoch": 1.5393535820253224, + "grad_norm": 0.9458695650100708, + "learning_rate": 9.738415997100224e-05, + "loss": 2.5714, + "step": 16991 + }, + { + "epoch": 1.539444180199769, + "grad_norm": 0.910643994808197, + "learning_rate": 9.737811877001149e-05, + "loss": 2.6544, + "step": 16992 + }, + { + "epoch": 1.5395347783742157, + "grad_norm": 0.8743060231208801, + "learning_rate": 9.737207756902072e-05, + "loss": 2.5935, + "step": 16993 + }, + { + "epoch": 1.5396253765486625, + "grad_norm": 0.9105738997459412, + "learning_rate": 9.736603636802997e-05, + "loss": 2.8492, + "step": 16994 + }, + { + "epoch": 1.5397159747231093, + "grad_norm": 0.8808718323707581, + "learning_rate": 9.73599951670392e-05, + "loss": 2.6759, + "step": 16995 + }, + { + "epoch": 1.539806572897556, + "grad_norm": 0.8877318501472473, + "learning_rate": 9.735395396604845e-05, + "loss": 2.9187, + "step": 16996 + }, + { + "epoch": 1.5398971710720029, + "grad_norm": 0.9379448294639587, + "learning_rate": 9.73479127650577e-05, + "loss": 2.6675, + "step": 16997 + }, + { + "epoch": 1.5399877692464496, + "grad_norm": 0.9755513668060303, + "learning_rate": 9.734187156406695e-05, + "loss": 2.6751, + "step": 16998 + }, + { + "epoch": 1.5400783674208964, + "grad_norm": 0.8682353496551514, + "learning_rate": 9.733583036307618e-05, + "loss": 1.9092, + "step": 16999 + }, + { + "epoch": 1.5401689655953432, + "grad_norm": 0.8510947227478027, + "learning_rate": 9.732978916208543e-05, + "loss": 2.6801, + "step": 17000 + }, + { + "epoch": 1.54025956376979, + "grad_norm": 0.8679479956626892, + "learning_rate": 9.732374796109467e-05, + "loss": 2.5982, + "step": 17001 + }, + { + "epoch": 1.5403501619442368, + "grad_norm": 0.9110893607139587, + "learning_rate": 9.731770676010391e-05, + "loss": 2.8082, + "step": 17002 + }, + { + "epoch": 1.5404407601186836, + "grad_norm": 0.9423732757568359, + "learning_rate": 9.731166555911316e-05, + "loss": 2.901, + "step": 17003 + }, + { + "epoch": 1.5405313582931304, + "grad_norm": 0.8929821252822876, + "learning_rate": 9.73056243581224e-05, + "loss": 2.9114, + "step": 17004 + }, + { + "epoch": 1.5406219564675772, + "grad_norm": 0.8688027858734131, + "learning_rate": 9.729958315713165e-05, + "loss": 2.6476, + "step": 17005 + }, + { + "epoch": 1.540712554642024, + "grad_norm": 0.9591584801673889, + "learning_rate": 9.729354195614089e-05, + "loss": 2.6465, + "step": 17006 + }, + { + "epoch": 1.5408031528164707, + "grad_norm": 0.9474549293518066, + "learning_rate": 9.728750075515013e-05, + "loss": 3.0751, + "step": 17007 + }, + { + "epoch": 1.5408937509909175, + "grad_norm": 0.7508571147918701, + "learning_rate": 9.728145955415937e-05, + "loss": 1.8497, + "step": 17008 + }, + { + "epoch": 1.5409843491653643, + "grad_norm": 0.9251189231872559, + "learning_rate": 9.727541835316862e-05, + "loss": 2.893, + "step": 17009 + }, + { + "epoch": 1.541074947339811, + "grad_norm": 0.8662340641021729, + "learning_rate": 9.726937715217785e-05, + "loss": 2.5712, + "step": 17010 + }, + { + "epoch": 1.5411655455142579, + "grad_norm": 0.914425790309906, + "learning_rate": 9.72633359511871e-05, + "loss": 2.8141, + "step": 17011 + }, + { + "epoch": 1.5412561436887047, + "grad_norm": 0.8156189322471619, + "learning_rate": 9.725729475019634e-05, + "loss": 2.4561, + "step": 17012 + }, + { + "epoch": 1.5413467418631515, + "grad_norm": 0.7707452774047852, + "learning_rate": 9.725125354920559e-05, + "loss": 1.9184, + "step": 17013 + }, + { + "epoch": 1.5414373400375982, + "grad_norm": 0.8930141925811768, + "learning_rate": 9.724521234821483e-05, + "loss": 2.5789, + "step": 17014 + }, + { + "epoch": 1.541527938212045, + "grad_norm": 0.8564104437828064, + "learning_rate": 9.723917114722407e-05, + "loss": 2.6072, + "step": 17015 + }, + { + "epoch": 1.5416185363864918, + "grad_norm": 0.8943631052970886, + "learning_rate": 9.723312994623332e-05, + "loss": 2.7308, + "step": 17016 + }, + { + "epoch": 1.5417091345609386, + "grad_norm": 0.9780496954917908, + "learning_rate": 9.722708874524256e-05, + "loss": 2.65, + "step": 17017 + }, + { + "epoch": 1.5417997327353854, + "grad_norm": 0.783615231513977, + "learning_rate": 9.72210475442518e-05, + "loss": 2.1312, + "step": 17018 + }, + { + "epoch": 1.5418903309098322, + "grad_norm": 0.8383142352104187, + "learning_rate": 9.721500634326105e-05, + "loss": 2.6837, + "step": 17019 + }, + { + "epoch": 1.541980929084279, + "grad_norm": 0.866119921207428, + "learning_rate": 9.72089651422703e-05, + "loss": 2.5077, + "step": 17020 + }, + { + "epoch": 1.5420715272587258, + "grad_norm": 0.9507381916046143, + "learning_rate": 9.720292394127953e-05, + "loss": 2.5426, + "step": 17021 + }, + { + "epoch": 1.5421621254331725, + "grad_norm": 0.7973870635032654, + "learning_rate": 9.719688274028878e-05, + "loss": 2.2052, + "step": 17022 + }, + { + "epoch": 1.5422527236076193, + "grad_norm": 0.9275195598602295, + "learning_rate": 9.719084153929801e-05, + "loss": 2.6112, + "step": 17023 + }, + { + "epoch": 1.5423433217820661, + "grad_norm": 0.9025492668151855, + "learning_rate": 9.718480033830726e-05, + "loss": 2.6749, + "step": 17024 + }, + { + "epoch": 1.542433919956513, + "grad_norm": 0.8550133109092712, + "learning_rate": 9.71787591373165e-05, + "loss": 2.699, + "step": 17025 + }, + { + "epoch": 1.5425245181309597, + "grad_norm": 0.8867694139480591, + "learning_rate": 9.717271793632574e-05, + "loss": 2.8784, + "step": 17026 + }, + { + "epoch": 1.5426151163054065, + "grad_norm": 0.8563838005065918, + "learning_rate": 9.716667673533499e-05, + "loss": 2.154, + "step": 17027 + }, + { + "epoch": 1.5427057144798533, + "grad_norm": 0.8905094265937805, + "learning_rate": 9.716063553434424e-05, + "loss": 2.9634, + "step": 17028 + }, + { + "epoch": 1.5427963126543, + "grad_norm": 1.050936222076416, + "learning_rate": 9.715459433335347e-05, + "loss": 2.9027, + "step": 17029 + }, + { + "epoch": 1.5428869108287468, + "grad_norm": 0.9053871631622314, + "learning_rate": 9.714855313236272e-05, + "loss": 2.6799, + "step": 17030 + }, + { + "epoch": 1.5429775090031936, + "grad_norm": 1.0403635501861572, + "learning_rate": 9.714251193137195e-05, + "loss": 2.625, + "step": 17031 + }, + { + "epoch": 1.5430681071776404, + "grad_norm": 0.8975755572319031, + "learning_rate": 9.71364707303812e-05, + "loss": 2.9674, + "step": 17032 + }, + { + "epoch": 1.543158705352087, + "grad_norm": 0.870379626750946, + "learning_rate": 9.713042952939045e-05, + "loss": 2.6307, + "step": 17033 + }, + { + "epoch": 1.543249303526534, + "grad_norm": 0.9086102843284607, + "learning_rate": 9.71243883283997e-05, + "loss": 2.8722, + "step": 17034 + }, + { + "epoch": 1.5433399017009806, + "grad_norm": 0.8871670961380005, + "learning_rate": 9.711834712740893e-05, + "loss": 2.7587, + "step": 17035 + }, + { + "epoch": 1.5434304998754276, + "grad_norm": 1.0136505365371704, + "learning_rate": 9.711230592641818e-05, + "loss": 2.7091, + "step": 17036 + }, + { + "epoch": 1.5435210980498741, + "grad_norm": 0.9266486763954163, + "learning_rate": 9.710626472542743e-05, + "loss": 2.6357, + "step": 17037 + }, + { + "epoch": 1.5436116962243212, + "grad_norm": 0.9176317453384399, + "learning_rate": 9.710022352443666e-05, + "loss": 2.4107, + "step": 17038 + }, + { + "epoch": 1.5437022943987677, + "grad_norm": 0.8719377517700195, + "learning_rate": 9.709418232344591e-05, + "loss": 2.6783, + "step": 17039 + }, + { + "epoch": 1.5437928925732147, + "grad_norm": 0.9884628653526306, + "learning_rate": 9.708814112245514e-05, + "loss": 2.5934, + "step": 17040 + }, + { + "epoch": 1.5438834907476613, + "grad_norm": 0.9276598691940308, + "learning_rate": 9.708209992146439e-05, + "loss": 2.8712, + "step": 17041 + }, + { + "epoch": 1.5439740889221083, + "grad_norm": 0.879586398601532, + "learning_rate": 9.707605872047364e-05, + "loss": 2.6825, + "step": 17042 + }, + { + "epoch": 1.5440646870965549, + "grad_norm": 0.877999484539032, + "learning_rate": 9.707001751948288e-05, + "loss": 2.6192, + "step": 17043 + }, + { + "epoch": 1.5441552852710019, + "grad_norm": 0.8804571628570557, + "learning_rate": 9.706397631849212e-05, + "loss": 2.5084, + "step": 17044 + }, + { + "epoch": 1.5442458834454484, + "grad_norm": 0.8374150395393372, + "learning_rate": 9.705793511750137e-05, + "loss": 2.666, + "step": 17045 + }, + { + "epoch": 1.5443364816198955, + "grad_norm": 0.9494504928588867, + "learning_rate": 9.70518939165106e-05, + "loss": 2.8707, + "step": 17046 + }, + { + "epoch": 1.544427079794342, + "grad_norm": 0.8880566954612732, + "learning_rate": 9.704585271551985e-05, + "loss": 2.6792, + "step": 17047 + }, + { + "epoch": 1.544517677968789, + "grad_norm": 0.8888248205184937, + "learning_rate": 9.70398115145291e-05, + "loss": 2.7741, + "step": 17048 + }, + { + "epoch": 1.5446082761432356, + "grad_norm": 0.9032984375953674, + "learning_rate": 9.703377031353834e-05, + "loss": 2.8502, + "step": 17049 + }, + { + "epoch": 1.5446988743176826, + "grad_norm": 0.9366747140884399, + "learning_rate": 9.702772911254758e-05, + "loss": 2.6367, + "step": 17050 + }, + { + "epoch": 1.5447894724921292, + "grad_norm": 0.924944281578064, + "learning_rate": 9.702168791155682e-05, + "loss": 2.9269, + "step": 17051 + }, + { + "epoch": 1.5448800706665762, + "grad_norm": 0.8307647109031677, + "learning_rate": 9.701564671056607e-05, + "loss": 1.9026, + "step": 17052 + }, + { + "epoch": 1.5449706688410227, + "grad_norm": 0.9033969640731812, + "learning_rate": 9.70096055095753e-05, + "loss": 2.6211, + "step": 17053 + }, + { + "epoch": 1.5450612670154698, + "grad_norm": 0.8656501770019531, + "learning_rate": 9.700356430858455e-05, + "loss": 2.4593, + "step": 17054 + }, + { + "epoch": 1.5451518651899163, + "grad_norm": 0.8641902804374695, + "learning_rate": 9.699752310759379e-05, + "loss": 2.7032, + "step": 17055 + }, + { + "epoch": 1.5452424633643633, + "grad_norm": 0.9108701348304749, + "learning_rate": 9.699148190660304e-05, + "loss": 2.5263, + "step": 17056 + }, + { + "epoch": 1.54533306153881, + "grad_norm": 0.9117081761360168, + "learning_rate": 9.698544070561228e-05, + "loss": 2.6029, + "step": 17057 + }, + { + "epoch": 1.545423659713257, + "grad_norm": 0.9243536591529846, + "learning_rate": 9.697939950462153e-05, + "loss": 2.6908, + "step": 17058 + }, + { + "epoch": 1.5455142578877035, + "grad_norm": 0.7591539025306702, + "learning_rate": 9.697335830363076e-05, + "loss": 2.0213, + "step": 17059 + }, + { + "epoch": 1.5456048560621505, + "grad_norm": 0.819677472114563, + "learning_rate": 9.696731710264001e-05, + "loss": 2.0204, + "step": 17060 + }, + { + "epoch": 1.545695454236597, + "grad_norm": 0.9352689385414124, + "learning_rate": 9.696127590164925e-05, + "loss": 2.775, + "step": 17061 + }, + { + "epoch": 1.545786052411044, + "grad_norm": 0.8923984169960022, + "learning_rate": 9.69552347006585e-05, + "loss": 2.7208, + "step": 17062 + }, + { + "epoch": 1.5458766505854906, + "grad_norm": 0.9008134603500366, + "learning_rate": 9.694919349966773e-05, + "loss": 2.852, + "step": 17063 + }, + { + "epoch": 1.5459672487599376, + "grad_norm": 0.9046340584754944, + "learning_rate": 9.694315229867699e-05, + "loss": 2.7285, + "step": 17064 + }, + { + "epoch": 1.5460578469343842, + "grad_norm": 0.9082133769989014, + "learning_rate": 9.693711109768622e-05, + "loss": 2.8881, + "step": 17065 + }, + { + "epoch": 1.5461484451088312, + "grad_norm": 0.9265731573104858, + "learning_rate": 9.693106989669547e-05, + "loss": 2.6229, + "step": 17066 + }, + { + "epoch": 1.5462390432832778, + "grad_norm": 0.9124261736869812, + "learning_rate": 9.69250286957047e-05, + "loss": 2.7706, + "step": 17067 + }, + { + "epoch": 1.5463296414577248, + "grad_norm": 0.8882290720939636, + "learning_rate": 9.691898749471395e-05, + "loss": 2.7855, + "step": 17068 + }, + { + "epoch": 1.5464202396321713, + "grad_norm": 0.9008594155311584, + "learning_rate": 9.69129462937232e-05, + "loss": 2.8731, + "step": 17069 + }, + { + "epoch": 1.5465108378066184, + "grad_norm": 0.877801775932312, + "learning_rate": 9.690690509273243e-05, + "loss": 2.7259, + "step": 17070 + }, + { + "epoch": 1.546601435981065, + "grad_norm": 0.8903822302818298, + "learning_rate": 9.690086389174168e-05, + "loss": 2.4752, + "step": 17071 + }, + { + "epoch": 1.546692034155512, + "grad_norm": 0.9169249534606934, + "learning_rate": 9.689482269075093e-05, + "loss": 2.7384, + "step": 17072 + }, + { + "epoch": 1.5467826323299585, + "grad_norm": 0.9059023857116699, + "learning_rate": 9.688878148976018e-05, + "loss": 2.7919, + "step": 17073 + }, + { + "epoch": 1.5468732305044053, + "grad_norm": 0.9077298641204834, + "learning_rate": 9.688274028876941e-05, + "loss": 2.6113, + "step": 17074 + }, + { + "epoch": 1.546963828678852, + "grad_norm": 0.8518819212913513, + "learning_rate": 9.687669908777866e-05, + "loss": 2.5474, + "step": 17075 + }, + { + "epoch": 1.5470544268532989, + "grad_norm": 0.8709754943847656, + "learning_rate": 9.687065788678789e-05, + "loss": 2.7992, + "step": 17076 + }, + { + "epoch": 1.5471450250277456, + "grad_norm": 0.9523245096206665, + "learning_rate": 9.686461668579714e-05, + "loss": 2.6849, + "step": 17077 + }, + { + "epoch": 1.5472356232021924, + "grad_norm": 0.9230613708496094, + "learning_rate": 9.685857548480637e-05, + "loss": 2.5938, + "step": 17078 + }, + { + "epoch": 1.5473262213766392, + "grad_norm": 1.1256977319717407, + "learning_rate": 9.685253428381564e-05, + "loss": 3.2657, + "step": 17079 + }, + { + "epoch": 1.547416819551086, + "grad_norm": 0.9065850973129272, + "learning_rate": 9.684649308282487e-05, + "loss": 2.8619, + "step": 17080 + }, + { + "epoch": 1.5475074177255328, + "grad_norm": 0.9590450525283813, + "learning_rate": 9.684045188183412e-05, + "loss": 2.7251, + "step": 17081 + }, + { + "epoch": 1.5475980158999796, + "grad_norm": 0.8585705757141113, + "learning_rate": 9.683441068084335e-05, + "loss": 2.5651, + "step": 17082 + }, + { + "epoch": 1.5476886140744264, + "grad_norm": 0.8897895812988281, + "learning_rate": 9.68283694798526e-05, + "loss": 2.6916, + "step": 17083 + }, + { + "epoch": 1.5477792122488732, + "grad_norm": 0.8859939575195312, + "learning_rate": 9.682232827886185e-05, + "loss": 2.9461, + "step": 17084 + }, + { + "epoch": 1.54786981042332, + "grad_norm": 0.9120270609855652, + "learning_rate": 9.681628707787108e-05, + "loss": 2.7865, + "step": 17085 + }, + { + "epoch": 1.5479604085977667, + "grad_norm": 0.9506993889808655, + "learning_rate": 9.681024587688033e-05, + "loss": 2.5513, + "step": 17086 + }, + { + "epoch": 1.5480510067722135, + "grad_norm": 0.9255087375640869, + "learning_rate": 9.680420467588958e-05, + "loss": 2.6504, + "step": 17087 + }, + { + "epoch": 1.5481416049466603, + "grad_norm": 0.755122721195221, + "learning_rate": 9.679816347489882e-05, + "loss": 2.1142, + "step": 17088 + }, + { + "epoch": 1.548232203121107, + "grad_norm": 0.8546381592750549, + "learning_rate": 9.679212227390806e-05, + "loss": 2.8085, + "step": 17089 + }, + { + "epoch": 1.5483228012955539, + "grad_norm": 0.9152607917785645, + "learning_rate": 9.67860810729173e-05, + "loss": 2.6547, + "step": 17090 + }, + { + "epoch": 1.5484133994700007, + "grad_norm": 0.9308465123176575, + "learning_rate": 9.678003987192654e-05, + "loss": 2.5477, + "step": 17091 + }, + { + "epoch": 1.5485039976444475, + "grad_norm": 0.8630667924880981, + "learning_rate": 9.677399867093579e-05, + "loss": 2.6213, + "step": 17092 + }, + { + "epoch": 1.5485945958188942, + "grad_norm": 0.9165999293327332, + "learning_rate": 9.676795746994502e-05, + "loss": 2.7219, + "step": 17093 + }, + { + "epoch": 1.548685193993341, + "grad_norm": 0.9330263137817383, + "learning_rate": 9.676191626895428e-05, + "loss": 2.8479, + "step": 17094 + }, + { + "epoch": 1.5487757921677878, + "grad_norm": 0.9145743250846863, + "learning_rate": 9.675587506796352e-05, + "loss": 2.8891, + "step": 17095 + }, + { + "epoch": 1.5488663903422346, + "grad_norm": 0.931904673576355, + "learning_rate": 9.674983386697276e-05, + "loss": 2.7991, + "step": 17096 + }, + { + "epoch": 1.5489569885166814, + "grad_norm": 0.8641915321350098, + "learning_rate": 9.6743792665982e-05, + "loss": 2.6144, + "step": 17097 + }, + { + "epoch": 1.5490475866911282, + "grad_norm": 0.8987800478935242, + "learning_rate": 9.673775146499125e-05, + "loss": 2.6959, + "step": 17098 + }, + { + "epoch": 1.549138184865575, + "grad_norm": 0.8487061858177185, + "learning_rate": 9.673171026400048e-05, + "loss": 2.2872, + "step": 17099 + }, + { + "epoch": 1.5492287830400218, + "grad_norm": 0.6546309590339661, + "learning_rate": 9.672566906300973e-05, + "loss": 1.3343, + "step": 17100 + }, + { + "epoch": 1.5493193812144685, + "grad_norm": 0.6930833458900452, + "learning_rate": 9.671962786201897e-05, + "loss": 1.4882, + "step": 17101 + }, + { + "epoch": 1.5494099793889153, + "grad_norm": 0.9227604866027832, + "learning_rate": 9.671358666102822e-05, + "loss": 2.8628, + "step": 17102 + }, + { + "epoch": 1.5495005775633621, + "grad_norm": 0.9039713740348816, + "learning_rate": 9.670754546003747e-05, + "loss": 2.952, + "step": 17103 + }, + { + "epoch": 1.549591175737809, + "grad_norm": 0.9384124875068665, + "learning_rate": 9.67015042590467e-05, + "loss": 2.7958, + "step": 17104 + }, + { + "epoch": 1.5496817739122557, + "grad_norm": 1.076833963394165, + "learning_rate": 9.669546305805595e-05, + "loss": 2.7154, + "step": 17105 + }, + { + "epoch": 1.5497723720867025, + "grad_norm": 0.9386153221130371, + "learning_rate": 9.668942185706519e-05, + "loss": 2.5103, + "step": 17106 + }, + { + "epoch": 1.5498629702611493, + "grad_norm": 0.9044088125228882, + "learning_rate": 9.668338065607443e-05, + "loss": 2.7876, + "step": 17107 + }, + { + "epoch": 1.549953568435596, + "grad_norm": 0.8970648050308228, + "learning_rate": 9.667733945508367e-05, + "loss": 2.5317, + "step": 17108 + }, + { + "epoch": 1.5500441666100429, + "grad_norm": 0.8607326745986938, + "learning_rate": 9.667129825409293e-05, + "loss": 2.7496, + "step": 17109 + }, + { + "epoch": 1.5501347647844896, + "grad_norm": 0.9767703413963318, + "learning_rate": 9.666525705310216e-05, + "loss": 2.5994, + "step": 17110 + }, + { + "epoch": 1.5502253629589364, + "grad_norm": 0.8855897188186646, + "learning_rate": 9.665921585211141e-05, + "loss": 2.8571, + "step": 17111 + }, + { + "epoch": 1.5503159611333832, + "grad_norm": 0.9108989834785461, + "learning_rate": 9.665317465112064e-05, + "loss": 2.8297, + "step": 17112 + }, + { + "epoch": 1.55040655930783, + "grad_norm": 0.8577064871788025, + "learning_rate": 9.664713345012989e-05, + "loss": 2.596, + "step": 17113 + }, + { + "epoch": 1.5504971574822766, + "grad_norm": 0.9248507618904114, + "learning_rate": 9.664109224913913e-05, + "loss": 2.8262, + "step": 17114 + }, + { + "epoch": 1.5505877556567236, + "grad_norm": 0.9046310782432556, + "learning_rate": 9.663505104814837e-05, + "loss": 2.7683, + "step": 17115 + }, + { + "epoch": 1.5506783538311701, + "grad_norm": 0.843241810798645, + "learning_rate": 9.662900984715762e-05, + "loss": 2.3487, + "step": 17116 + }, + { + "epoch": 1.5507689520056172, + "grad_norm": 0.9844672083854675, + "learning_rate": 9.662296864616687e-05, + "loss": 2.918, + "step": 17117 + }, + { + "epoch": 1.5508595501800637, + "grad_norm": 0.9047618508338928, + "learning_rate": 9.66169274451761e-05, + "loss": 2.6559, + "step": 17118 + }, + { + "epoch": 1.5509501483545107, + "grad_norm": 0.8883386254310608, + "learning_rate": 9.661088624418535e-05, + "loss": 2.8928, + "step": 17119 + }, + { + "epoch": 1.5510407465289573, + "grad_norm": 0.9184898138046265, + "learning_rate": 9.66048450431946e-05, + "loss": 2.8909, + "step": 17120 + }, + { + "epoch": 1.5511313447034043, + "grad_norm": 0.9011064767837524, + "learning_rate": 9.659880384220383e-05, + "loss": 2.5942, + "step": 17121 + }, + { + "epoch": 1.5512219428778509, + "grad_norm": 0.9952341914176941, + "learning_rate": 9.659276264121308e-05, + "loss": 2.7705, + "step": 17122 + }, + { + "epoch": 1.5513125410522979, + "grad_norm": 0.8761496543884277, + "learning_rate": 9.658672144022231e-05, + "loss": 2.7007, + "step": 17123 + }, + { + "epoch": 1.5514031392267444, + "grad_norm": 0.8326719403266907, + "learning_rate": 9.658068023923157e-05, + "loss": 2.7537, + "step": 17124 + }, + { + "epoch": 1.5514937374011915, + "grad_norm": 0.942346453666687, + "learning_rate": 9.657463903824081e-05, + "loss": 2.5952, + "step": 17125 + }, + { + "epoch": 1.551584335575638, + "grad_norm": 0.8195433020591736, + "learning_rate": 9.656859783725006e-05, + "loss": 2.5767, + "step": 17126 + }, + { + "epoch": 1.551674933750085, + "grad_norm": 0.9885293841362, + "learning_rate": 9.656255663625929e-05, + "loss": 2.7241, + "step": 17127 + }, + { + "epoch": 1.5517655319245316, + "grad_norm": 0.8280848860740662, + "learning_rate": 9.655651543526854e-05, + "loss": 2.6888, + "step": 17128 + }, + { + "epoch": 1.5518561300989786, + "grad_norm": 1.0159382820129395, + "learning_rate": 9.655047423427777e-05, + "loss": 2.7237, + "step": 17129 + }, + { + "epoch": 1.5519467282734252, + "grad_norm": 0.8223744034767151, + "learning_rate": 9.654443303328702e-05, + "loss": 2.6833, + "step": 17130 + }, + { + "epoch": 1.5520373264478722, + "grad_norm": 0.8572942614555359, + "learning_rate": 9.653839183229625e-05, + "loss": 2.5628, + "step": 17131 + }, + { + "epoch": 1.5521279246223187, + "grad_norm": 0.8678943514823914, + "learning_rate": 9.653235063130551e-05, + "loss": 2.8191, + "step": 17132 + }, + { + "epoch": 1.5522185227967658, + "grad_norm": 0.9076806902885437, + "learning_rate": 9.652630943031475e-05, + "loss": 2.7848, + "step": 17133 + }, + { + "epoch": 1.5523091209712123, + "grad_norm": 0.9167758226394653, + "learning_rate": 9.6520268229324e-05, + "loss": 2.5624, + "step": 17134 + }, + { + "epoch": 1.5523997191456593, + "grad_norm": 0.8912537097930908, + "learning_rate": 9.651422702833323e-05, + "loss": 2.6418, + "step": 17135 + }, + { + "epoch": 1.552490317320106, + "grad_norm": 0.900397539138794, + "learning_rate": 9.650818582734248e-05, + "loss": 2.8454, + "step": 17136 + }, + { + "epoch": 1.552580915494553, + "grad_norm": 0.9351205229759216, + "learning_rate": 9.650214462635173e-05, + "loss": 2.5746, + "step": 17137 + }, + { + "epoch": 1.5526715136689995, + "grad_norm": 0.8947529196739197, + "learning_rate": 9.649610342536096e-05, + "loss": 2.905, + "step": 17138 + }, + { + "epoch": 1.5527621118434465, + "grad_norm": 0.8905893564224243, + "learning_rate": 9.649006222437022e-05, + "loss": 2.695, + "step": 17139 + }, + { + "epoch": 1.552852710017893, + "grad_norm": 0.8987908363342285, + "learning_rate": 9.648402102337946e-05, + "loss": 2.7419, + "step": 17140 + }, + { + "epoch": 1.55294330819234, + "grad_norm": 0.9690076112747192, + "learning_rate": 9.64779798223887e-05, + "loss": 2.5534, + "step": 17141 + }, + { + "epoch": 1.5530339063667866, + "grad_norm": 0.9646934866905212, + "learning_rate": 9.647193862139794e-05, + "loss": 2.8628, + "step": 17142 + }, + { + "epoch": 1.5531245045412336, + "grad_norm": 0.8964982032775879, + "learning_rate": 9.646589742040718e-05, + "loss": 2.6732, + "step": 17143 + }, + { + "epoch": 1.5532151027156802, + "grad_norm": 0.8704732656478882, + "learning_rate": 9.645985621941642e-05, + "loss": 2.5352, + "step": 17144 + }, + { + "epoch": 1.5533057008901272, + "grad_norm": 0.8585503101348877, + "learning_rate": 9.645381501842567e-05, + "loss": 2.6184, + "step": 17145 + }, + { + "epoch": 1.5533962990645738, + "grad_norm": 0.9686841368675232, + "learning_rate": 9.64477738174349e-05, + "loss": 3.0136, + "step": 17146 + }, + { + "epoch": 1.5534868972390208, + "grad_norm": 0.9016891717910767, + "learning_rate": 9.644173261644416e-05, + "loss": 2.6961, + "step": 17147 + }, + { + "epoch": 1.5535774954134673, + "grad_norm": 0.8828538656234741, + "learning_rate": 9.64356914154534e-05, + "loss": 2.4657, + "step": 17148 + }, + { + "epoch": 1.5536680935879144, + "grad_norm": 0.9424493908882141, + "learning_rate": 9.642965021446264e-05, + "loss": 2.8769, + "step": 17149 + }, + { + "epoch": 1.553758691762361, + "grad_norm": 0.9444913268089294, + "learning_rate": 9.642360901347188e-05, + "loss": 2.7257, + "step": 17150 + }, + { + "epoch": 1.553849289936808, + "grad_norm": 0.9232015013694763, + "learning_rate": 9.641756781248112e-05, + "loss": 2.6563, + "step": 17151 + }, + { + "epoch": 1.5539398881112545, + "grad_norm": 0.9771692156791687, + "learning_rate": 9.641152661149037e-05, + "loss": 2.7041, + "step": 17152 + }, + { + "epoch": 1.5540304862857015, + "grad_norm": 0.9213663935661316, + "learning_rate": 9.64054854104996e-05, + "loss": 2.9359, + "step": 17153 + }, + { + "epoch": 1.554121084460148, + "grad_norm": 0.9031211137771606, + "learning_rate": 9.639944420950885e-05, + "loss": 2.6624, + "step": 17154 + }, + { + "epoch": 1.5542116826345949, + "grad_norm": 0.9903860688209534, + "learning_rate": 9.63934030085181e-05, + "loss": 2.5617, + "step": 17155 + }, + { + "epoch": 1.5543022808090416, + "grad_norm": 0.907143235206604, + "learning_rate": 9.638736180752735e-05, + "loss": 2.5789, + "step": 17156 + }, + { + "epoch": 1.5543928789834884, + "grad_norm": 0.9835034012794495, + "learning_rate": 9.638132060653658e-05, + "loss": 2.6276, + "step": 17157 + }, + { + "epoch": 1.5544834771579352, + "grad_norm": 0.9150813817977905, + "learning_rate": 9.637527940554583e-05, + "loss": 2.6458, + "step": 17158 + }, + { + "epoch": 1.554574075332382, + "grad_norm": 0.8934159874916077, + "learning_rate": 9.636923820455506e-05, + "loss": 2.7757, + "step": 17159 + }, + { + "epoch": 1.5546646735068288, + "grad_norm": 0.9101525545120239, + "learning_rate": 9.636319700356431e-05, + "loss": 2.7951, + "step": 17160 + }, + { + "epoch": 1.5547552716812756, + "grad_norm": 0.8781415820121765, + "learning_rate": 9.635715580257355e-05, + "loss": 2.7518, + "step": 17161 + }, + { + "epoch": 1.5548458698557224, + "grad_norm": 0.8957802057266235, + "learning_rate": 9.635111460158281e-05, + "loss": 2.808, + "step": 17162 + }, + { + "epoch": 1.5549364680301692, + "grad_norm": 0.9561132788658142, + "learning_rate": 9.634507340059204e-05, + "loss": 2.7435, + "step": 17163 + }, + { + "epoch": 1.555027066204616, + "grad_norm": 0.8970696926116943, + "learning_rate": 9.633903219960129e-05, + "loss": 2.7059, + "step": 17164 + }, + { + "epoch": 1.5551176643790627, + "grad_norm": 0.8914760947227478, + "learning_rate": 9.633299099861052e-05, + "loss": 2.9182, + "step": 17165 + }, + { + "epoch": 1.5552082625535095, + "grad_norm": 0.9059017896652222, + "learning_rate": 9.632694979761977e-05, + "loss": 2.7137, + "step": 17166 + }, + { + "epoch": 1.5552988607279563, + "grad_norm": 0.9060317873954773, + "learning_rate": 9.6320908596629e-05, + "loss": 2.9469, + "step": 17167 + }, + { + "epoch": 1.555389458902403, + "grad_norm": 0.9231916666030884, + "learning_rate": 9.631486739563825e-05, + "loss": 2.5611, + "step": 17168 + }, + { + "epoch": 1.5554800570768499, + "grad_norm": 0.949616014957428, + "learning_rate": 9.63088261946475e-05, + "loss": 2.7938, + "step": 17169 + }, + { + "epoch": 1.5555706552512967, + "grad_norm": 0.948480486869812, + "learning_rate": 9.630278499365675e-05, + "loss": 2.699, + "step": 17170 + }, + { + "epoch": 1.5556612534257435, + "grad_norm": 0.9744041562080383, + "learning_rate": 9.6296743792666e-05, + "loss": 2.7624, + "step": 17171 + }, + { + "epoch": 1.5557518516001902, + "grad_norm": 0.883072018623352, + "learning_rate": 9.629070259167523e-05, + "loss": 2.975, + "step": 17172 + }, + { + "epoch": 1.555842449774637, + "grad_norm": 0.8706905245780945, + "learning_rate": 9.628466139068448e-05, + "loss": 2.7421, + "step": 17173 + }, + { + "epoch": 1.5559330479490838, + "grad_norm": 0.9681324362754822, + "learning_rate": 9.627862018969371e-05, + "loss": 2.4957, + "step": 17174 + }, + { + "epoch": 1.5560236461235306, + "grad_norm": 0.8474427461624146, + "learning_rate": 9.627257898870296e-05, + "loss": 2.6549, + "step": 17175 + }, + { + "epoch": 1.5561142442979774, + "grad_norm": 0.8031418323516846, + "learning_rate": 9.626653778771219e-05, + "loss": 2.0804, + "step": 17176 + }, + { + "epoch": 1.5562048424724242, + "grad_norm": 0.8840213418006897, + "learning_rate": 9.626049658672145e-05, + "loss": 2.676, + "step": 17177 + }, + { + "epoch": 1.556295440646871, + "grad_norm": 0.8459686040878296, + "learning_rate": 9.625445538573069e-05, + "loss": 2.7339, + "step": 17178 + }, + { + "epoch": 1.5563860388213178, + "grad_norm": 0.9452251195907593, + "learning_rate": 9.624841418473994e-05, + "loss": 2.7176, + "step": 17179 + }, + { + "epoch": 1.5564766369957646, + "grad_norm": 0.8637509942054749, + "learning_rate": 9.624237298374917e-05, + "loss": 1.962, + "step": 17180 + }, + { + "epoch": 1.5565672351702113, + "grad_norm": 0.6201047897338867, + "learning_rate": 9.623633178275842e-05, + "loss": 1.2515, + "step": 17181 + }, + { + "epoch": 1.5566578333446581, + "grad_norm": 0.9412020444869995, + "learning_rate": 9.623029058176765e-05, + "loss": 2.592, + "step": 17182 + }, + { + "epoch": 1.556748431519105, + "grad_norm": 0.9988900423049927, + "learning_rate": 9.62242493807769e-05, + "loss": 2.5527, + "step": 17183 + }, + { + "epoch": 1.5568390296935517, + "grad_norm": 0.9480759501457214, + "learning_rate": 9.621820817978615e-05, + "loss": 2.7907, + "step": 17184 + }, + { + "epoch": 1.5569296278679985, + "grad_norm": 0.8669121265411377, + "learning_rate": 9.62121669787954e-05, + "loss": 2.7612, + "step": 17185 + }, + { + "epoch": 1.5570202260424453, + "grad_norm": 0.8967902660369873, + "learning_rate": 9.620612577780463e-05, + "loss": 2.856, + "step": 17186 + }, + { + "epoch": 1.557110824216892, + "grad_norm": 0.8676226735115051, + "learning_rate": 9.620008457681388e-05, + "loss": 2.5524, + "step": 17187 + }, + { + "epoch": 1.5572014223913389, + "grad_norm": 0.8921180963516235, + "learning_rate": 9.619404337582312e-05, + "loss": 2.6272, + "step": 17188 + }, + { + "epoch": 1.5572920205657856, + "grad_norm": 1.026442050933838, + "learning_rate": 9.618800217483236e-05, + "loss": 2.8302, + "step": 17189 + }, + { + "epoch": 1.5573826187402324, + "grad_norm": 0.92808598279953, + "learning_rate": 9.61819609738416e-05, + "loss": 2.7273, + "step": 17190 + }, + { + "epoch": 1.5574732169146792, + "grad_norm": 0.7311328053474426, + "learning_rate": 9.617591977285084e-05, + "loss": 1.7978, + "step": 17191 + }, + { + "epoch": 1.557563815089126, + "grad_norm": 0.9949586987495422, + "learning_rate": 9.61698785718601e-05, + "loss": 2.9355, + "step": 17192 + }, + { + "epoch": 1.5576544132635728, + "grad_norm": 0.946540892124176, + "learning_rate": 9.616383737086933e-05, + "loss": 2.8442, + "step": 17193 + }, + { + "epoch": 1.5577450114380196, + "grad_norm": 0.9114975333213806, + "learning_rate": 9.615779616987858e-05, + "loss": 3.0253, + "step": 17194 + }, + { + "epoch": 1.5578356096124661, + "grad_norm": 0.901911199092865, + "learning_rate": 9.615175496888782e-05, + "loss": 3.0342, + "step": 17195 + }, + { + "epoch": 1.5579262077869132, + "grad_norm": 0.9162164926528931, + "learning_rate": 9.614571376789706e-05, + "loss": 2.643, + "step": 17196 + }, + { + "epoch": 1.5580168059613597, + "grad_norm": 0.8649075031280518, + "learning_rate": 9.61396725669063e-05, + "loss": 2.6252, + "step": 17197 + }, + { + "epoch": 1.5581074041358067, + "grad_norm": 0.8436648845672607, + "learning_rate": 9.613363136591555e-05, + "loss": 2.2651, + "step": 17198 + }, + { + "epoch": 1.5581980023102533, + "grad_norm": 0.9040085077285767, + "learning_rate": 9.612759016492479e-05, + "loss": 2.6212, + "step": 17199 + }, + { + "epoch": 1.5582886004847003, + "grad_norm": 0.9252174496650696, + "learning_rate": 9.612154896393404e-05, + "loss": 2.7369, + "step": 17200 + }, + { + "epoch": 1.5583791986591469, + "grad_norm": 0.8687570691108704, + "learning_rate": 9.611550776294327e-05, + "loss": 2.6871, + "step": 17201 + }, + { + "epoch": 1.5584697968335939, + "grad_norm": 0.8868851065635681, + "learning_rate": 9.610946656195252e-05, + "loss": 2.7074, + "step": 17202 + }, + { + "epoch": 1.5585603950080404, + "grad_norm": 0.908690333366394, + "learning_rate": 9.610342536096177e-05, + "loss": 2.7007, + "step": 17203 + }, + { + "epoch": 1.5586509931824875, + "grad_norm": 0.9296079874038696, + "learning_rate": 9.6097384159971e-05, + "loss": 2.7751, + "step": 17204 + }, + { + "epoch": 1.558741591356934, + "grad_norm": 0.8927004933357239, + "learning_rate": 9.609134295898025e-05, + "loss": 2.8428, + "step": 17205 + }, + { + "epoch": 1.558832189531381, + "grad_norm": 0.9056280851364136, + "learning_rate": 9.608530175798949e-05, + "loss": 2.6428, + "step": 17206 + }, + { + "epoch": 1.5589227877058276, + "grad_norm": 0.8752632141113281, + "learning_rate": 9.607926055699875e-05, + "loss": 2.6899, + "step": 17207 + }, + { + "epoch": 1.5590133858802746, + "grad_norm": 0.9302763938903809, + "learning_rate": 9.607321935600798e-05, + "loss": 2.7768, + "step": 17208 + }, + { + "epoch": 1.5591039840547212, + "grad_norm": 0.8994808793067932, + "learning_rate": 9.606717815501723e-05, + "loss": 1.9289, + "step": 17209 + }, + { + "epoch": 1.5591945822291682, + "grad_norm": 0.8909130096435547, + "learning_rate": 9.606113695402646e-05, + "loss": 2.7717, + "step": 17210 + }, + { + "epoch": 1.5592851804036147, + "grad_norm": 0.8651666641235352, + "learning_rate": 9.605509575303571e-05, + "loss": 2.3736, + "step": 17211 + }, + { + "epoch": 1.5593757785780618, + "grad_norm": 0.8571377396583557, + "learning_rate": 9.604905455204494e-05, + "loss": 2.542, + "step": 17212 + }, + { + "epoch": 1.5594663767525083, + "grad_norm": 1.0889347791671753, + "learning_rate": 9.604301335105419e-05, + "loss": 2.764, + "step": 17213 + }, + { + "epoch": 1.5595569749269553, + "grad_norm": 1.0049833059310913, + "learning_rate": 9.603697215006344e-05, + "loss": 2.9982, + "step": 17214 + }, + { + "epoch": 1.559647573101402, + "grad_norm": 0.5887945294380188, + "learning_rate": 9.603093094907269e-05, + "loss": 1.1989, + "step": 17215 + }, + { + "epoch": 1.559738171275849, + "grad_norm": 0.971591055393219, + "learning_rate": 9.602488974808192e-05, + "loss": 2.8686, + "step": 17216 + }, + { + "epoch": 1.5598287694502955, + "grad_norm": 0.7949687242507935, + "learning_rate": 9.601884854709117e-05, + "loss": 2.0753, + "step": 17217 + }, + { + "epoch": 1.5599193676247425, + "grad_norm": 0.8941996693611145, + "learning_rate": 9.60128073461004e-05, + "loss": 2.6612, + "step": 17218 + }, + { + "epoch": 1.560009965799189, + "grad_norm": 1.0006047487258911, + "learning_rate": 9.600676614510965e-05, + "loss": 2.6701, + "step": 17219 + }, + { + "epoch": 1.560100563973636, + "grad_norm": 0.88852858543396, + "learning_rate": 9.60007249441189e-05, + "loss": 2.7659, + "step": 17220 + }, + { + "epoch": 1.5601911621480826, + "grad_norm": 0.9160418510437012, + "learning_rate": 9.599468374312813e-05, + "loss": 2.6056, + "step": 17221 + }, + { + "epoch": 1.5602817603225296, + "grad_norm": 0.9553495645523071, + "learning_rate": 9.598864254213738e-05, + "loss": 2.7167, + "step": 17222 + }, + { + "epoch": 1.5603723584969762, + "grad_norm": 0.8827643990516663, + "learning_rate": 9.598260134114663e-05, + "loss": 2.3574, + "step": 17223 + }, + { + "epoch": 1.5604629566714232, + "grad_norm": 0.9957475662231445, + "learning_rate": 9.597656014015587e-05, + "loss": 3.0393, + "step": 17224 + }, + { + "epoch": 1.5605535548458698, + "grad_norm": 0.8897873163223267, + "learning_rate": 9.597051893916511e-05, + "loss": 2.592, + "step": 17225 + }, + { + "epoch": 1.5606441530203168, + "grad_norm": 0.8845698833465576, + "learning_rate": 9.596447773817436e-05, + "loss": 2.9377, + "step": 17226 + }, + { + "epoch": 1.5607347511947633, + "grad_norm": 0.8373962044715881, + "learning_rate": 9.595843653718359e-05, + "loss": 2.0721, + "step": 17227 + }, + { + "epoch": 1.5608253493692104, + "grad_norm": 0.895963191986084, + "learning_rate": 9.595239533619284e-05, + "loss": 2.6732, + "step": 17228 + }, + { + "epoch": 1.560915947543657, + "grad_norm": 0.9658252000808716, + "learning_rate": 9.594635413520209e-05, + "loss": 2.66, + "step": 17229 + }, + { + "epoch": 1.561006545718104, + "grad_norm": 0.862857460975647, + "learning_rate": 9.594031293421133e-05, + "loss": 2.5032, + "step": 17230 + }, + { + "epoch": 1.5610971438925505, + "grad_norm": 0.9277161955833435, + "learning_rate": 9.593427173322057e-05, + "loss": 2.7669, + "step": 17231 + }, + { + "epoch": 1.5611877420669975, + "grad_norm": 0.9111839532852173, + "learning_rate": 9.592823053222981e-05, + "loss": 2.7792, + "step": 17232 + }, + { + "epoch": 1.561278340241444, + "grad_norm": 0.990095853805542, + "learning_rate": 9.592218933123905e-05, + "loss": 2.8328, + "step": 17233 + }, + { + "epoch": 1.561368938415891, + "grad_norm": 0.7821797728538513, + "learning_rate": 9.59161481302483e-05, + "loss": 2.0964, + "step": 17234 + }, + { + "epoch": 1.5614595365903376, + "grad_norm": 0.9286513924598694, + "learning_rate": 9.591010692925754e-05, + "loss": 2.7094, + "step": 17235 + }, + { + "epoch": 1.5615501347647844, + "grad_norm": 0.8913218975067139, + "learning_rate": 9.590406572826678e-05, + "loss": 2.7798, + "step": 17236 + }, + { + "epoch": 1.5616407329392312, + "grad_norm": 0.9838734865188599, + "learning_rate": 9.589802452727603e-05, + "loss": 2.6927, + "step": 17237 + }, + { + "epoch": 1.561731331113678, + "grad_norm": 0.8525139689445496, + "learning_rate": 9.589198332628527e-05, + "loss": 2.5245, + "step": 17238 + }, + { + "epoch": 1.5618219292881248, + "grad_norm": 0.8777035474777222, + "learning_rate": 9.588594212529452e-05, + "loss": 2.5783, + "step": 17239 + }, + { + "epoch": 1.5619125274625716, + "grad_norm": 0.8789008259773254, + "learning_rate": 9.587990092430375e-05, + "loss": 2.8237, + "step": 17240 + }, + { + "epoch": 1.5620031256370184, + "grad_norm": 0.8569720387458801, + "learning_rate": 9.5873859723313e-05, + "loss": 2.6077, + "step": 17241 + }, + { + "epoch": 1.5620937238114652, + "grad_norm": 0.9273871183395386, + "learning_rate": 9.586781852232224e-05, + "loss": 2.5146, + "step": 17242 + }, + { + "epoch": 1.562184321985912, + "grad_norm": 0.9415690898895264, + "learning_rate": 9.586177732133148e-05, + "loss": 2.7136, + "step": 17243 + }, + { + "epoch": 1.5622749201603587, + "grad_norm": 0.7559627294540405, + "learning_rate": 9.585573612034073e-05, + "loss": 1.7964, + "step": 17244 + }, + { + "epoch": 1.5623655183348055, + "grad_norm": 0.9211739301681519, + "learning_rate": 9.584969491934998e-05, + "loss": 2.4825, + "step": 17245 + }, + { + "epoch": 1.5624561165092523, + "grad_norm": 0.8258457779884338, + "learning_rate": 9.584365371835921e-05, + "loss": 2.1653, + "step": 17246 + }, + { + "epoch": 1.562546714683699, + "grad_norm": 0.937039852142334, + "learning_rate": 9.583761251736846e-05, + "loss": 2.8236, + "step": 17247 + }, + { + "epoch": 1.5626373128581459, + "grad_norm": 0.8784468770027161, + "learning_rate": 9.58315713163777e-05, + "loss": 2.4092, + "step": 17248 + }, + { + "epoch": 1.5627279110325927, + "grad_norm": 0.791472315788269, + "learning_rate": 9.582553011538694e-05, + "loss": 1.8876, + "step": 17249 + }, + { + "epoch": 1.5628185092070395, + "grad_norm": 0.9001960754394531, + "learning_rate": 9.581948891439618e-05, + "loss": 2.662, + "step": 17250 + }, + { + "epoch": 1.5629091073814863, + "grad_norm": 0.9268090724945068, + "learning_rate": 9.581344771340542e-05, + "loss": 2.7129, + "step": 17251 + }, + { + "epoch": 1.562999705555933, + "grad_norm": 0.964580237865448, + "learning_rate": 9.580740651241467e-05, + "loss": 2.8847, + "step": 17252 + }, + { + "epoch": 1.5630903037303798, + "grad_norm": 0.8822961449623108, + "learning_rate": 9.580136531142392e-05, + "loss": 2.7205, + "step": 17253 + }, + { + "epoch": 1.5631809019048266, + "grad_norm": 0.7743932008743286, + "learning_rate": 9.579532411043315e-05, + "loss": 2.0925, + "step": 17254 + }, + { + "epoch": 1.5632715000792734, + "grad_norm": 0.8909611701965332, + "learning_rate": 9.57892829094424e-05, + "loss": 2.8274, + "step": 17255 + }, + { + "epoch": 1.5633620982537202, + "grad_norm": 0.9558739066123962, + "learning_rate": 9.578324170845165e-05, + "loss": 2.7835, + "step": 17256 + }, + { + "epoch": 1.563452696428167, + "grad_norm": 0.9494229555130005, + "learning_rate": 9.577720050746088e-05, + "loss": 2.542, + "step": 17257 + }, + { + "epoch": 1.5635432946026138, + "grad_norm": 0.9550375938415527, + "learning_rate": 9.577115930647013e-05, + "loss": 2.7544, + "step": 17258 + }, + { + "epoch": 1.5636338927770606, + "grad_norm": 0.8123276829719543, + "learning_rate": 9.576511810547938e-05, + "loss": 1.8181, + "step": 17259 + }, + { + "epoch": 1.5637244909515073, + "grad_norm": 0.9589495658874512, + "learning_rate": 9.575907690448863e-05, + "loss": 2.584, + "step": 17260 + }, + { + "epoch": 1.5638150891259541, + "grad_norm": 1.003684163093567, + "learning_rate": 9.575303570349786e-05, + "loss": 2.7396, + "step": 17261 + }, + { + "epoch": 1.563905687300401, + "grad_norm": 0.9789413809776306, + "learning_rate": 9.574699450250711e-05, + "loss": 2.9935, + "step": 17262 + }, + { + "epoch": 1.5639962854748477, + "grad_norm": 0.95533686876297, + "learning_rate": 9.574095330151634e-05, + "loss": 2.7528, + "step": 17263 + }, + { + "epoch": 1.5640868836492945, + "grad_norm": 0.9672768712043762, + "learning_rate": 9.573491210052559e-05, + "loss": 2.543, + "step": 17264 + }, + { + "epoch": 1.5641774818237413, + "grad_norm": 0.9804579019546509, + "learning_rate": 9.572887089953482e-05, + "loss": 2.6732, + "step": 17265 + }, + { + "epoch": 1.564268079998188, + "grad_norm": 1.0301330089569092, + "learning_rate": 9.572282969854407e-05, + "loss": 2.696, + "step": 17266 + }, + { + "epoch": 1.5643586781726349, + "grad_norm": 0.8902652859687805, + "learning_rate": 9.571678849755332e-05, + "loss": 2.8705, + "step": 17267 + }, + { + "epoch": 1.5644492763470816, + "grad_norm": 0.8999311327934265, + "learning_rate": 9.571074729656257e-05, + "loss": 2.9028, + "step": 17268 + }, + { + "epoch": 1.5645398745215284, + "grad_norm": 0.8727661371231079, + "learning_rate": 9.57047060955718e-05, + "loss": 2.5924, + "step": 17269 + }, + { + "epoch": 1.5646304726959752, + "grad_norm": 0.9316558241844177, + "learning_rate": 9.569866489458105e-05, + "loss": 2.5182, + "step": 17270 + }, + { + "epoch": 1.564721070870422, + "grad_norm": 0.8977721929550171, + "learning_rate": 9.56926236935903e-05, + "loss": 2.77, + "step": 17271 + }, + { + "epoch": 1.5648116690448688, + "grad_norm": 0.8761014938354492, + "learning_rate": 9.568658249259953e-05, + "loss": 2.661, + "step": 17272 + }, + { + "epoch": 1.5649022672193156, + "grad_norm": 0.9538341164588928, + "learning_rate": 9.568054129160878e-05, + "loss": 2.8055, + "step": 17273 + }, + { + "epoch": 1.5649928653937624, + "grad_norm": 0.9148719906806946, + "learning_rate": 9.567450009061802e-05, + "loss": 2.6569, + "step": 17274 + }, + { + "epoch": 1.5650834635682092, + "grad_norm": 0.8711931705474854, + "learning_rate": 9.566845888962727e-05, + "loss": 2.8989, + "step": 17275 + }, + { + "epoch": 1.5651740617426557, + "grad_norm": 0.8188814520835876, + "learning_rate": 9.56624176886365e-05, + "loss": 2.4979, + "step": 17276 + }, + { + "epoch": 1.5652646599171027, + "grad_norm": 0.8854779601097107, + "learning_rate": 9.565637648764575e-05, + "loss": 2.6442, + "step": 17277 + }, + { + "epoch": 1.5653552580915493, + "grad_norm": 0.9366462826728821, + "learning_rate": 9.565033528665499e-05, + "loss": 2.9126, + "step": 17278 + }, + { + "epoch": 1.5654458562659963, + "grad_norm": 0.9307314157485962, + "learning_rate": 9.564429408566424e-05, + "loss": 3.066, + "step": 17279 + }, + { + "epoch": 1.5655364544404429, + "grad_norm": 0.8491356372833252, + "learning_rate": 9.563825288467347e-05, + "loss": 2.548, + "step": 17280 + }, + { + "epoch": 1.5656270526148899, + "grad_norm": 0.7744339108467102, + "learning_rate": 9.563221168368272e-05, + "loss": 2.1213, + "step": 17281 + }, + { + "epoch": 1.5657176507893364, + "grad_norm": 0.917536735534668, + "learning_rate": 9.562617048269196e-05, + "loss": 2.7557, + "step": 17282 + }, + { + "epoch": 1.5658082489637835, + "grad_norm": 0.901792049407959, + "learning_rate": 9.562012928170121e-05, + "loss": 2.7562, + "step": 17283 + }, + { + "epoch": 1.56589884713823, + "grad_norm": 0.9356027841567993, + "learning_rate": 9.561408808071045e-05, + "loss": 2.6926, + "step": 17284 + }, + { + "epoch": 1.565989445312677, + "grad_norm": 0.9286466240882874, + "learning_rate": 9.56080468797197e-05, + "loss": 2.6248, + "step": 17285 + }, + { + "epoch": 1.5660800434871236, + "grad_norm": 0.9184763431549072, + "learning_rate": 9.560200567872893e-05, + "loss": 2.6632, + "step": 17286 + }, + { + "epoch": 1.5661706416615706, + "grad_norm": 0.918357253074646, + "learning_rate": 9.559596447773818e-05, + "loss": 2.6064, + "step": 17287 + }, + { + "epoch": 1.5662612398360172, + "grad_norm": 0.9453641176223755, + "learning_rate": 9.558992327674742e-05, + "loss": 2.8419, + "step": 17288 + }, + { + "epoch": 1.5663518380104642, + "grad_norm": 0.9711840152740479, + "learning_rate": 9.558388207575667e-05, + "loss": 2.542, + "step": 17289 + }, + { + "epoch": 1.5664424361849107, + "grad_norm": 0.7976774573326111, + "learning_rate": 9.557784087476592e-05, + "loss": 2.1204, + "step": 17290 + }, + { + "epoch": 1.5665330343593578, + "grad_norm": 0.9408089518547058, + "learning_rate": 9.557179967377515e-05, + "loss": 2.6705, + "step": 17291 + }, + { + "epoch": 1.5666236325338043, + "grad_norm": 0.9041121602058411, + "learning_rate": 9.55657584727844e-05, + "loss": 2.914, + "step": 17292 + }, + { + "epoch": 1.5667142307082513, + "grad_norm": 0.8916457295417786, + "learning_rate": 9.555971727179363e-05, + "loss": 2.8136, + "step": 17293 + }, + { + "epoch": 1.566804828882698, + "grad_norm": 0.8987795114517212, + "learning_rate": 9.555367607080288e-05, + "loss": 2.7307, + "step": 17294 + }, + { + "epoch": 1.566895427057145, + "grad_norm": 0.8976991176605225, + "learning_rate": 9.554763486981212e-05, + "loss": 2.7079, + "step": 17295 + }, + { + "epoch": 1.5669860252315915, + "grad_norm": 0.8905600309371948, + "learning_rate": 9.554159366882136e-05, + "loss": 2.5702, + "step": 17296 + }, + { + "epoch": 1.5670766234060385, + "grad_norm": 0.8728801608085632, + "learning_rate": 9.553555246783061e-05, + "loss": 2.9702, + "step": 17297 + }, + { + "epoch": 1.567167221580485, + "grad_norm": 0.9159106612205505, + "learning_rate": 9.552951126683986e-05, + "loss": 2.7925, + "step": 17298 + }, + { + "epoch": 1.567257819754932, + "grad_norm": 0.9557865262031555, + "learning_rate": 9.552347006584909e-05, + "loss": 2.9917, + "step": 17299 + }, + { + "epoch": 1.5673484179293786, + "grad_norm": 0.9457446336746216, + "learning_rate": 9.551742886485834e-05, + "loss": 2.892, + "step": 17300 + }, + { + "epoch": 1.5674390161038256, + "grad_norm": 0.9556406736373901, + "learning_rate": 9.551138766386757e-05, + "loss": 2.7228, + "step": 17301 + }, + { + "epoch": 1.5675296142782722, + "grad_norm": 0.8743106126785278, + "learning_rate": 9.550534646287682e-05, + "loss": 2.5113, + "step": 17302 + }, + { + "epoch": 1.5676202124527192, + "grad_norm": 0.9639087915420532, + "learning_rate": 9.549930526188607e-05, + "loss": 2.8026, + "step": 17303 + }, + { + "epoch": 1.5677108106271658, + "grad_norm": 0.8654579520225525, + "learning_rate": 9.549326406089532e-05, + "loss": 2.2736, + "step": 17304 + }, + { + "epoch": 1.5678014088016128, + "grad_norm": 0.8495532274246216, + "learning_rate": 9.548722285990455e-05, + "loss": 2.764, + "step": 17305 + }, + { + "epoch": 1.5678920069760593, + "grad_norm": 0.936550498008728, + "learning_rate": 9.54811816589138e-05, + "loss": 2.2143, + "step": 17306 + }, + { + "epoch": 1.5679826051505064, + "grad_norm": 0.8973893523216248, + "learning_rate": 9.547514045792305e-05, + "loss": 2.848, + "step": 17307 + }, + { + "epoch": 1.568073203324953, + "grad_norm": 0.9294204711914062, + "learning_rate": 9.546909925693228e-05, + "loss": 2.6659, + "step": 17308 + }, + { + "epoch": 1.5681638014994, + "grad_norm": 0.9135354161262512, + "learning_rate": 9.546305805594153e-05, + "loss": 2.4806, + "step": 17309 + }, + { + "epoch": 1.5682543996738465, + "grad_norm": 0.8933982253074646, + "learning_rate": 9.545701685495076e-05, + "loss": 1.8056, + "step": 17310 + }, + { + "epoch": 1.5683449978482935, + "grad_norm": 0.904650866985321, + "learning_rate": 9.545097565396001e-05, + "loss": 2.7154, + "step": 17311 + }, + { + "epoch": 1.56843559602274, + "grad_norm": 0.9865103960037231, + "learning_rate": 9.544493445296926e-05, + "loss": 3.2721, + "step": 17312 + }, + { + "epoch": 1.568526194197187, + "grad_norm": 0.8943601846694946, + "learning_rate": 9.54388932519785e-05, + "loss": 2.719, + "step": 17313 + }, + { + "epoch": 1.5686167923716337, + "grad_norm": 0.9099177718162537, + "learning_rate": 9.543285205098774e-05, + "loss": 2.7649, + "step": 17314 + }, + { + "epoch": 1.5687073905460807, + "grad_norm": 0.9411017894744873, + "learning_rate": 9.542681084999699e-05, + "loss": 2.7374, + "step": 17315 + }, + { + "epoch": 1.5687979887205272, + "grad_norm": 0.8722718954086304, + "learning_rate": 9.542076964900622e-05, + "loss": 2.5633, + "step": 17316 + }, + { + "epoch": 1.568888586894974, + "grad_norm": 0.8572062253952026, + "learning_rate": 9.541472844801547e-05, + "loss": 2.5539, + "step": 17317 + }, + { + "epoch": 1.5689791850694208, + "grad_norm": 0.8946864008903503, + "learning_rate": 9.54086872470247e-05, + "loss": 2.4873, + "step": 17318 + }, + { + "epoch": 1.5690697832438676, + "grad_norm": 0.8736240863800049, + "learning_rate": 9.540264604603396e-05, + "loss": 2.5822, + "step": 17319 + }, + { + "epoch": 1.5691603814183144, + "grad_norm": 0.7085603475570679, + "learning_rate": 9.53966048450432e-05, + "loss": 2.1724, + "step": 17320 + }, + { + "epoch": 1.5692509795927612, + "grad_norm": 0.984448254108429, + "learning_rate": 9.539056364405244e-05, + "loss": 2.7628, + "step": 17321 + }, + { + "epoch": 1.569341577767208, + "grad_norm": 0.86647629737854, + "learning_rate": 9.538452244306168e-05, + "loss": 2.714, + "step": 17322 + }, + { + "epoch": 1.5694321759416547, + "grad_norm": 0.8126024603843689, + "learning_rate": 9.537848124207093e-05, + "loss": 2.0256, + "step": 17323 + }, + { + "epoch": 1.5695227741161015, + "grad_norm": 0.9441698789596558, + "learning_rate": 9.537244004108017e-05, + "loss": 2.7002, + "step": 17324 + }, + { + "epoch": 1.5696133722905483, + "grad_norm": 0.9163732528686523, + "learning_rate": 9.536639884008941e-05, + "loss": 2.6625, + "step": 17325 + }, + { + "epoch": 1.569703970464995, + "grad_norm": 0.8701455593109131, + "learning_rate": 9.536035763909866e-05, + "loss": 2.7284, + "step": 17326 + }, + { + "epoch": 1.569794568639442, + "grad_norm": 0.915300190448761, + "learning_rate": 9.53543164381079e-05, + "loss": 2.5479, + "step": 17327 + }, + { + "epoch": 1.5698851668138887, + "grad_norm": 0.6228784322738647, + "learning_rate": 9.534827523711715e-05, + "loss": 1.3192, + "step": 17328 + }, + { + "epoch": 1.5699757649883355, + "grad_norm": 0.7886748909950256, + "learning_rate": 9.534223403612639e-05, + "loss": 1.9105, + "step": 17329 + }, + { + "epoch": 1.5700663631627823, + "grad_norm": 0.9151439666748047, + "learning_rate": 9.533619283513563e-05, + "loss": 2.6488, + "step": 17330 + }, + { + "epoch": 1.570156961337229, + "grad_norm": 0.772873044013977, + "learning_rate": 9.533015163414487e-05, + "loss": 2.0136, + "step": 17331 + }, + { + "epoch": 1.5702475595116758, + "grad_norm": 0.8923223614692688, + "learning_rate": 9.532411043315411e-05, + "loss": 2.5266, + "step": 17332 + }, + { + "epoch": 1.5703381576861226, + "grad_norm": 0.9113184213638306, + "learning_rate": 9.531806923216335e-05, + "loss": 2.6282, + "step": 17333 + }, + { + "epoch": 1.5704287558605694, + "grad_norm": 0.9802087545394897, + "learning_rate": 9.531202803117261e-05, + "loss": 2.521, + "step": 17334 + }, + { + "epoch": 1.5705193540350162, + "grad_norm": 0.7322333455085754, + "learning_rate": 9.530598683018184e-05, + "loss": 2.2041, + "step": 17335 + }, + { + "epoch": 1.570609952209463, + "grad_norm": 0.80330890417099, + "learning_rate": 9.529994562919109e-05, + "loss": 2.6147, + "step": 17336 + }, + { + "epoch": 1.5707005503839098, + "grad_norm": 0.8414239287376404, + "learning_rate": 9.529390442820033e-05, + "loss": 2.6384, + "step": 17337 + }, + { + "epoch": 1.5707911485583566, + "grad_norm": 0.8580455780029297, + "learning_rate": 9.528786322720957e-05, + "loss": 2.5736, + "step": 17338 + }, + { + "epoch": 1.5708817467328033, + "grad_norm": 0.928164005279541, + "learning_rate": 9.528182202621882e-05, + "loss": 2.8201, + "step": 17339 + }, + { + "epoch": 1.5709723449072501, + "grad_norm": 0.7664555907249451, + "learning_rate": 9.527578082522805e-05, + "loss": 1.9788, + "step": 17340 + }, + { + "epoch": 1.571062943081697, + "grad_norm": 0.746277391910553, + "learning_rate": 9.52697396242373e-05, + "loss": 1.8844, + "step": 17341 + }, + { + "epoch": 1.5711535412561437, + "grad_norm": 0.8669825196266174, + "learning_rate": 9.526369842324655e-05, + "loss": 2.2367, + "step": 17342 + }, + { + "epoch": 1.5712441394305905, + "grad_norm": 0.9002436995506287, + "learning_rate": 9.52576572222558e-05, + "loss": 2.5597, + "step": 17343 + }, + { + "epoch": 1.5713347376050373, + "grad_norm": 0.9430416226387024, + "learning_rate": 9.525161602126503e-05, + "loss": 2.8298, + "step": 17344 + }, + { + "epoch": 1.571425335779484, + "grad_norm": 0.9888560175895691, + "learning_rate": 9.524557482027428e-05, + "loss": 2.6378, + "step": 17345 + }, + { + "epoch": 1.5715159339539309, + "grad_norm": 0.8852373361587524, + "learning_rate": 9.523953361928351e-05, + "loss": 2.5787, + "step": 17346 + }, + { + "epoch": 1.5716065321283776, + "grad_norm": 0.5729936957359314, + "learning_rate": 9.523349241829276e-05, + "loss": 1.271, + "step": 17347 + }, + { + "epoch": 1.5716971303028244, + "grad_norm": 0.9671497941017151, + "learning_rate": 9.5227451217302e-05, + "loss": 2.7653, + "step": 17348 + }, + { + "epoch": 1.5717877284772712, + "grad_norm": 0.8889458179473877, + "learning_rate": 9.522141001631126e-05, + "loss": 2.4559, + "step": 17349 + }, + { + "epoch": 1.571878326651718, + "grad_norm": 0.8699650168418884, + "learning_rate": 9.521536881532049e-05, + "loss": 2.4213, + "step": 17350 + }, + { + "epoch": 1.5719689248261648, + "grad_norm": 0.8909333944320679, + "learning_rate": 9.520932761432974e-05, + "loss": 2.6274, + "step": 17351 + }, + { + "epoch": 1.5720595230006116, + "grad_norm": 0.8690886497497559, + "learning_rate": 9.520328641333897e-05, + "loss": 2.8401, + "step": 17352 + }, + { + "epoch": 1.5721501211750584, + "grad_norm": 0.8642981648445129, + "learning_rate": 9.519724521234822e-05, + "loss": 2.6434, + "step": 17353 + }, + { + "epoch": 1.5722407193495052, + "grad_norm": 0.8669374585151672, + "learning_rate": 9.519120401135745e-05, + "loss": 2.6354, + "step": 17354 + }, + { + "epoch": 1.572331317523952, + "grad_norm": 0.8668257594108582, + "learning_rate": 9.51851628103667e-05, + "loss": 1.9311, + "step": 17355 + }, + { + "epoch": 1.5724219156983987, + "grad_norm": 0.6345776319503784, + "learning_rate": 9.517912160937595e-05, + "loss": 1.489, + "step": 17356 + }, + { + "epoch": 1.5725125138728453, + "grad_norm": 0.900653064250946, + "learning_rate": 9.51730804083852e-05, + "loss": 2.4974, + "step": 17357 + }, + { + "epoch": 1.5726031120472923, + "grad_norm": 0.9256742596626282, + "learning_rate": 9.516703920739444e-05, + "loss": 2.6551, + "step": 17358 + }, + { + "epoch": 1.5726937102217389, + "grad_norm": 0.944088339805603, + "learning_rate": 9.516099800640368e-05, + "loss": 2.5477, + "step": 17359 + }, + { + "epoch": 1.5727843083961859, + "grad_norm": 0.8950543999671936, + "learning_rate": 9.515495680541293e-05, + "loss": 2.7059, + "step": 17360 + }, + { + "epoch": 1.5728749065706324, + "grad_norm": 0.7548168897628784, + "learning_rate": 9.514891560442216e-05, + "loss": 1.9284, + "step": 17361 + }, + { + "epoch": 1.5729655047450795, + "grad_norm": 0.9461868405342102, + "learning_rate": 9.514287440343141e-05, + "loss": 2.6002, + "step": 17362 + }, + { + "epoch": 1.573056102919526, + "grad_norm": 0.8675640821456909, + "learning_rate": 9.513683320244064e-05, + "loss": 2.6232, + "step": 17363 + }, + { + "epoch": 1.573146701093973, + "grad_norm": 0.8922026753425598, + "learning_rate": 9.51307920014499e-05, + "loss": 2.703, + "step": 17364 + }, + { + "epoch": 1.5732372992684196, + "grad_norm": 0.9072169065475464, + "learning_rate": 9.512475080045914e-05, + "loss": 2.8225, + "step": 17365 + }, + { + "epoch": 1.5733278974428666, + "grad_norm": 0.9346560835838318, + "learning_rate": 9.511870959946838e-05, + "loss": 2.594, + "step": 17366 + }, + { + "epoch": 1.5734184956173132, + "grad_norm": 0.8458805084228516, + "learning_rate": 9.511266839847762e-05, + "loss": 2.7508, + "step": 17367 + }, + { + "epoch": 1.5735090937917602, + "grad_norm": 0.8659564256668091, + "learning_rate": 9.510662719748687e-05, + "loss": 2.8655, + "step": 17368 + }, + { + "epoch": 1.5735996919662067, + "grad_norm": 0.9216242432594299, + "learning_rate": 9.51005859964961e-05, + "loss": 2.7852, + "step": 17369 + }, + { + "epoch": 1.5736902901406538, + "grad_norm": 0.9392908811569214, + "learning_rate": 9.509454479550535e-05, + "loss": 2.4052, + "step": 17370 + }, + { + "epoch": 1.5737808883151003, + "grad_norm": 0.8922399878501892, + "learning_rate": 9.50885035945146e-05, + "loss": 2.5302, + "step": 17371 + }, + { + "epoch": 1.5738714864895473, + "grad_norm": 0.8566911220550537, + "learning_rate": 9.508246239352384e-05, + "loss": 2.7654, + "step": 17372 + }, + { + "epoch": 1.573962084663994, + "grad_norm": 0.901738703250885, + "learning_rate": 9.507642119253308e-05, + "loss": 2.4634, + "step": 17373 + }, + { + "epoch": 1.574052682838441, + "grad_norm": 0.9127346873283386, + "learning_rate": 9.507037999154232e-05, + "loss": 2.6152, + "step": 17374 + }, + { + "epoch": 1.5741432810128875, + "grad_norm": 0.9200370907783508, + "learning_rate": 9.506433879055157e-05, + "loss": 2.5901, + "step": 17375 + }, + { + "epoch": 1.5742338791873345, + "grad_norm": 0.957054853439331, + "learning_rate": 9.50582975895608e-05, + "loss": 2.7237, + "step": 17376 + }, + { + "epoch": 1.574324477361781, + "grad_norm": 0.9472711682319641, + "learning_rate": 9.505225638857005e-05, + "loss": 2.4813, + "step": 17377 + }, + { + "epoch": 1.574415075536228, + "grad_norm": 0.8773202896118164, + "learning_rate": 9.504621518757929e-05, + "loss": 2.8829, + "step": 17378 + }, + { + "epoch": 1.5745056737106746, + "grad_norm": 0.948672890663147, + "learning_rate": 9.504017398658855e-05, + "loss": 2.7784, + "step": 17379 + }, + { + "epoch": 1.5745962718851216, + "grad_norm": 0.8703508973121643, + "learning_rate": 9.503413278559778e-05, + "loss": 2.5734, + "step": 17380 + }, + { + "epoch": 1.5746868700595682, + "grad_norm": 0.9679356217384338, + "learning_rate": 9.502809158460703e-05, + "loss": 2.5903, + "step": 17381 + }, + { + "epoch": 1.5747774682340152, + "grad_norm": 0.8785926699638367, + "learning_rate": 9.502205038361626e-05, + "loss": 2.5848, + "step": 17382 + }, + { + "epoch": 1.5748680664084618, + "grad_norm": 0.9246433973312378, + "learning_rate": 9.501600918262551e-05, + "loss": 2.7132, + "step": 17383 + }, + { + "epoch": 1.5749586645829088, + "grad_norm": 0.8712687492370605, + "learning_rate": 9.500996798163475e-05, + "loss": 2.4616, + "step": 17384 + }, + { + "epoch": 1.5750492627573554, + "grad_norm": 0.9962888956069946, + "learning_rate": 9.5003926780644e-05, + "loss": 2.4802, + "step": 17385 + }, + { + "epoch": 1.5751398609318024, + "grad_norm": 0.7522957921028137, + "learning_rate": 9.499788557965323e-05, + "loss": 2.071, + "step": 17386 + }, + { + "epoch": 1.575230459106249, + "grad_norm": 0.8563006520271301, + "learning_rate": 9.499184437866249e-05, + "loss": 2.7993, + "step": 17387 + }, + { + "epoch": 1.575321057280696, + "grad_norm": 0.8978300094604492, + "learning_rate": 9.498580317767172e-05, + "loss": 2.7462, + "step": 17388 + }, + { + "epoch": 1.5754116554551425, + "grad_norm": 0.9202252626419067, + "learning_rate": 9.497976197668097e-05, + "loss": 2.6583, + "step": 17389 + }, + { + "epoch": 1.5755022536295895, + "grad_norm": 0.8817426562309265, + "learning_rate": 9.497372077569022e-05, + "loss": 2.9595, + "step": 17390 + }, + { + "epoch": 1.575592851804036, + "grad_norm": 1.1174800395965576, + "learning_rate": 9.496767957469945e-05, + "loss": 2.6565, + "step": 17391 + }, + { + "epoch": 1.575683449978483, + "grad_norm": 0.923343300819397, + "learning_rate": 9.49616383737087e-05, + "loss": 2.7455, + "step": 17392 + }, + { + "epoch": 1.5757740481529297, + "grad_norm": 0.8740699887275696, + "learning_rate": 9.495559717271793e-05, + "loss": 2.8579, + "step": 17393 + }, + { + "epoch": 1.5758646463273767, + "grad_norm": 0.9111614227294922, + "learning_rate": 9.49495559717272e-05, + "loss": 2.8426, + "step": 17394 + }, + { + "epoch": 1.5759552445018232, + "grad_norm": 0.9046357274055481, + "learning_rate": 9.494351477073643e-05, + "loss": 2.6426, + "step": 17395 + }, + { + "epoch": 1.5760458426762702, + "grad_norm": 0.843614935874939, + "learning_rate": 9.493747356974568e-05, + "loss": 2.4925, + "step": 17396 + }, + { + "epoch": 1.5761364408507168, + "grad_norm": 1.0664691925048828, + "learning_rate": 9.493143236875491e-05, + "loss": 2.7276, + "step": 17397 + }, + { + "epoch": 1.5762270390251636, + "grad_norm": 0.8517662882804871, + "learning_rate": 9.492539116776416e-05, + "loss": 2.6856, + "step": 17398 + }, + { + "epoch": 1.5763176371996104, + "grad_norm": 0.9212304949760437, + "learning_rate": 9.491934996677339e-05, + "loss": 2.7583, + "step": 17399 + }, + { + "epoch": 1.5764082353740572, + "grad_norm": 0.8750777244567871, + "learning_rate": 9.491330876578264e-05, + "loss": 2.3958, + "step": 17400 + }, + { + "epoch": 1.576498833548504, + "grad_norm": 0.918040931224823, + "learning_rate": 9.490726756479187e-05, + "loss": 2.7669, + "step": 17401 + }, + { + "epoch": 1.5765894317229507, + "grad_norm": 0.8884965777397156, + "learning_rate": 9.490122636380114e-05, + "loss": 2.6696, + "step": 17402 + }, + { + "epoch": 1.5766800298973975, + "grad_norm": 0.9965223670005798, + "learning_rate": 9.489518516281037e-05, + "loss": 2.8653, + "step": 17403 + }, + { + "epoch": 1.5767706280718443, + "grad_norm": 0.8980855345726013, + "learning_rate": 9.488914396181962e-05, + "loss": 2.7927, + "step": 17404 + }, + { + "epoch": 1.576861226246291, + "grad_norm": 0.9415149688720703, + "learning_rate": 9.488310276082885e-05, + "loss": 2.8957, + "step": 17405 + }, + { + "epoch": 1.576951824420738, + "grad_norm": 0.871220052242279, + "learning_rate": 9.48770615598381e-05, + "loss": 2.4219, + "step": 17406 + }, + { + "epoch": 1.5770424225951847, + "grad_norm": 1.0385693311691284, + "learning_rate": 9.487102035884735e-05, + "loss": 2.6173, + "step": 17407 + }, + { + "epoch": 1.5771330207696315, + "grad_norm": 1.0444263219833374, + "learning_rate": 9.486497915785658e-05, + "loss": 2.6047, + "step": 17408 + }, + { + "epoch": 1.5772236189440783, + "grad_norm": 0.9607446193695068, + "learning_rate": 9.485893795686583e-05, + "loss": 2.5141, + "step": 17409 + }, + { + "epoch": 1.577314217118525, + "grad_norm": 0.8867365121841431, + "learning_rate": 9.485289675587508e-05, + "loss": 2.6081, + "step": 17410 + }, + { + "epoch": 1.5774048152929718, + "grad_norm": 0.8622238039970398, + "learning_rate": 9.484685555488432e-05, + "loss": 2.541, + "step": 17411 + }, + { + "epoch": 1.5774954134674186, + "grad_norm": 0.8680390119552612, + "learning_rate": 9.484081435389356e-05, + "loss": 2.7023, + "step": 17412 + }, + { + "epoch": 1.5775860116418654, + "grad_norm": 0.9290357828140259, + "learning_rate": 9.48347731529028e-05, + "loss": 2.6813, + "step": 17413 + }, + { + "epoch": 1.5776766098163122, + "grad_norm": 0.8961611390113831, + "learning_rate": 9.482873195191204e-05, + "loss": 2.2914, + "step": 17414 + }, + { + "epoch": 1.577767207990759, + "grad_norm": 0.9595559239387512, + "learning_rate": 9.482269075092129e-05, + "loss": 3.2001, + "step": 17415 + }, + { + "epoch": 1.5778578061652058, + "grad_norm": 0.8894730806350708, + "learning_rate": 9.481664954993053e-05, + "loss": 2.5242, + "step": 17416 + }, + { + "epoch": 1.5779484043396526, + "grad_norm": 0.9140087366104126, + "learning_rate": 9.481060834893978e-05, + "loss": 2.3258, + "step": 17417 + }, + { + "epoch": 1.5780390025140993, + "grad_norm": 0.9531443119049072, + "learning_rate": 9.480456714794902e-05, + "loss": 2.874, + "step": 17418 + }, + { + "epoch": 1.5781296006885461, + "grad_norm": 0.8365461826324463, + "learning_rate": 9.479852594695826e-05, + "loss": 2.0451, + "step": 17419 + }, + { + "epoch": 1.578220198862993, + "grad_norm": 0.9143983125686646, + "learning_rate": 9.47924847459675e-05, + "loss": 2.6902, + "step": 17420 + }, + { + "epoch": 1.5783107970374397, + "grad_norm": 0.9046006202697754, + "learning_rate": 9.478644354497674e-05, + "loss": 2.9923, + "step": 17421 + }, + { + "epoch": 1.5784013952118865, + "grad_norm": 0.9026309251785278, + "learning_rate": 9.478040234398598e-05, + "loss": 2.4969, + "step": 17422 + }, + { + "epoch": 1.5784919933863333, + "grad_norm": 0.7705323100090027, + "learning_rate": 9.477436114299523e-05, + "loss": 1.9963, + "step": 17423 + }, + { + "epoch": 1.57858259156078, + "grad_norm": 0.9173418879508972, + "learning_rate": 9.476831994200447e-05, + "loss": 2.6902, + "step": 17424 + }, + { + "epoch": 1.5786731897352269, + "grad_norm": 0.951797366142273, + "learning_rate": 9.476227874101372e-05, + "loss": 2.6732, + "step": 17425 + }, + { + "epoch": 1.5787637879096736, + "grad_norm": 1.0137157440185547, + "learning_rate": 9.475623754002297e-05, + "loss": 2.7272, + "step": 17426 + }, + { + "epoch": 1.5788543860841204, + "grad_norm": 0.8913120627403259, + "learning_rate": 9.47501963390322e-05, + "loss": 2.3989, + "step": 17427 + }, + { + "epoch": 1.5789449842585672, + "grad_norm": 0.8978005051612854, + "learning_rate": 9.474415513804145e-05, + "loss": 2.6407, + "step": 17428 + }, + { + "epoch": 1.579035582433014, + "grad_norm": 0.9370878338813782, + "learning_rate": 9.473811393705068e-05, + "loss": 2.7243, + "step": 17429 + }, + { + "epoch": 1.5791261806074608, + "grad_norm": 0.910105288028717, + "learning_rate": 9.473207273605993e-05, + "loss": 2.6683, + "step": 17430 + }, + { + "epoch": 1.5792167787819076, + "grad_norm": 0.9382012486457825, + "learning_rate": 9.472603153506918e-05, + "loss": 2.7157, + "step": 17431 + }, + { + "epoch": 1.5793073769563544, + "grad_norm": 0.8867807984352112, + "learning_rate": 9.471999033407843e-05, + "loss": 2.6881, + "step": 17432 + }, + { + "epoch": 1.5793979751308012, + "grad_norm": 0.8635627031326294, + "learning_rate": 9.471394913308766e-05, + "loss": 2.543, + "step": 17433 + }, + { + "epoch": 1.579488573305248, + "grad_norm": 0.914683997631073, + "learning_rate": 9.470790793209691e-05, + "loss": 2.5277, + "step": 17434 + }, + { + "epoch": 1.5795791714796947, + "grad_norm": 0.874502956867218, + "learning_rate": 9.470186673110614e-05, + "loss": 2.7729, + "step": 17435 + }, + { + "epoch": 1.5796697696541415, + "grad_norm": 0.8734977841377258, + "learning_rate": 9.469582553011539e-05, + "loss": 2.4545, + "step": 17436 + }, + { + "epoch": 1.5797603678285883, + "grad_norm": 0.8967622518539429, + "learning_rate": 9.468978432912463e-05, + "loss": 2.7837, + "step": 17437 + }, + { + "epoch": 1.5798509660030349, + "grad_norm": 0.9061062932014465, + "learning_rate": 9.468374312813387e-05, + "loss": 2.6673, + "step": 17438 + }, + { + "epoch": 1.5799415641774819, + "grad_norm": 0.7039234042167664, + "learning_rate": 9.467770192714312e-05, + "loss": 1.9002, + "step": 17439 + }, + { + "epoch": 1.5800321623519284, + "grad_norm": 0.7564327716827393, + "learning_rate": 9.467166072615237e-05, + "loss": 2.1669, + "step": 17440 + }, + { + "epoch": 1.5801227605263755, + "grad_norm": 0.9230878949165344, + "learning_rate": 9.46656195251616e-05, + "loss": 2.4993, + "step": 17441 + }, + { + "epoch": 1.580213358700822, + "grad_norm": 0.8933036923408508, + "learning_rate": 9.465957832417085e-05, + "loss": 2.6527, + "step": 17442 + }, + { + "epoch": 1.580303956875269, + "grad_norm": 0.924057126045227, + "learning_rate": 9.46535371231801e-05, + "loss": 3.0274, + "step": 17443 + }, + { + "epoch": 1.5803945550497156, + "grad_norm": 0.9231659173965454, + "learning_rate": 9.464749592218933e-05, + "loss": 2.4256, + "step": 17444 + }, + { + "epoch": 1.5804851532241626, + "grad_norm": 0.8593257665634155, + "learning_rate": 9.464145472119858e-05, + "loss": 2.7591, + "step": 17445 + }, + { + "epoch": 1.5805757513986092, + "grad_norm": 0.9819329380989075, + "learning_rate": 9.463541352020783e-05, + "loss": 2.6271, + "step": 17446 + }, + { + "epoch": 1.5806663495730562, + "grad_norm": 0.7983173131942749, + "learning_rate": 9.462937231921707e-05, + "loss": 1.841, + "step": 17447 + }, + { + "epoch": 1.5807569477475027, + "grad_norm": 0.9055704474449158, + "learning_rate": 9.462333111822631e-05, + "loss": 2.3365, + "step": 17448 + }, + { + "epoch": 1.5808475459219498, + "grad_norm": 0.9425842761993408, + "learning_rate": 9.461728991723556e-05, + "loss": 2.726, + "step": 17449 + }, + { + "epoch": 1.5809381440963963, + "grad_norm": 0.9594883918762207, + "learning_rate": 9.461124871624479e-05, + "loss": 2.8458, + "step": 17450 + }, + { + "epoch": 1.5810287422708433, + "grad_norm": 0.9032313227653503, + "learning_rate": 9.460520751525404e-05, + "loss": 3.0043, + "step": 17451 + }, + { + "epoch": 1.58111934044529, + "grad_norm": 0.8983295559883118, + "learning_rate": 9.459916631426327e-05, + "loss": 2.7184, + "step": 17452 + }, + { + "epoch": 1.581209938619737, + "grad_norm": 0.943135678768158, + "learning_rate": 9.459312511327252e-05, + "loss": 2.7159, + "step": 17453 + }, + { + "epoch": 1.5813005367941835, + "grad_norm": 0.9393490552902222, + "learning_rate": 9.458708391228177e-05, + "loss": 2.5077, + "step": 17454 + }, + { + "epoch": 1.5813911349686305, + "grad_norm": 0.956052303314209, + "learning_rate": 9.458104271129101e-05, + "loss": 2.7217, + "step": 17455 + }, + { + "epoch": 1.581481733143077, + "grad_norm": 0.9566819667816162, + "learning_rate": 9.457500151030025e-05, + "loss": 2.5246, + "step": 17456 + }, + { + "epoch": 1.581572331317524, + "grad_norm": 0.8962091207504272, + "learning_rate": 9.45689603093095e-05, + "loss": 2.7454, + "step": 17457 + }, + { + "epoch": 1.5816629294919706, + "grad_norm": 0.8740013241767883, + "learning_rate": 9.456291910831874e-05, + "loss": 2.5407, + "step": 17458 + }, + { + "epoch": 1.5817535276664176, + "grad_norm": 0.8621935844421387, + "learning_rate": 9.455687790732798e-05, + "loss": 2.8767, + "step": 17459 + }, + { + "epoch": 1.5818441258408642, + "grad_norm": 0.8810177445411682, + "learning_rate": 9.455083670633723e-05, + "loss": 2.6954, + "step": 17460 + }, + { + "epoch": 1.5819347240153112, + "grad_norm": 0.9062424302101135, + "learning_rate": 9.454479550534647e-05, + "loss": 3.0216, + "step": 17461 + }, + { + "epoch": 1.5820253221897578, + "grad_norm": 0.9454788565635681, + "learning_rate": 9.453875430435572e-05, + "loss": 2.7522, + "step": 17462 + }, + { + "epoch": 1.5821159203642048, + "grad_norm": 0.8891903162002563, + "learning_rate": 9.453271310336495e-05, + "loss": 2.8614, + "step": 17463 + }, + { + "epoch": 1.5822065185386514, + "grad_norm": 0.899540901184082, + "learning_rate": 9.45266719023742e-05, + "loss": 2.4695, + "step": 17464 + }, + { + "epoch": 1.5822971167130984, + "grad_norm": 0.8873907327651978, + "learning_rate": 9.452063070138344e-05, + "loss": 2.7638, + "step": 17465 + }, + { + "epoch": 1.582387714887545, + "grad_norm": 0.8893045783042908, + "learning_rate": 9.451458950039268e-05, + "loss": 3.0525, + "step": 17466 + }, + { + "epoch": 1.582478313061992, + "grad_norm": 0.8523391485214233, + "learning_rate": 9.450854829940192e-05, + "loss": 2.6889, + "step": 17467 + }, + { + "epoch": 1.5825689112364385, + "grad_norm": 0.8839149475097656, + "learning_rate": 9.450250709841117e-05, + "loss": 2.7263, + "step": 17468 + }, + { + "epoch": 1.5826595094108855, + "grad_norm": 0.9218348860740662, + "learning_rate": 9.449646589742041e-05, + "loss": 2.7086, + "step": 17469 + }, + { + "epoch": 1.582750107585332, + "grad_norm": 0.8931643962860107, + "learning_rate": 9.449042469642966e-05, + "loss": 2.7328, + "step": 17470 + }, + { + "epoch": 1.582840705759779, + "grad_norm": 0.9383055567741394, + "learning_rate": 9.44843834954389e-05, + "loss": 2.8628, + "step": 17471 + }, + { + "epoch": 1.5829313039342257, + "grad_norm": 0.8870400786399841, + "learning_rate": 9.447834229444814e-05, + "loss": 2.7464, + "step": 17472 + }, + { + "epoch": 1.5830219021086727, + "grad_norm": 0.896262526512146, + "learning_rate": 9.447230109345738e-05, + "loss": 2.8098, + "step": 17473 + }, + { + "epoch": 1.5831125002831192, + "grad_norm": 0.9054145216941833, + "learning_rate": 9.446625989246662e-05, + "loss": 2.836, + "step": 17474 + }, + { + "epoch": 1.5832030984575662, + "grad_norm": 0.8817854523658752, + "learning_rate": 9.446021869147587e-05, + "loss": 2.5918, + "step": 17475 + }, + { + "epoch": 1.5832936966320128, + "grad_norm": 0.9453157782554626, + "learning_rate": 9.445417749048512e-05, + "loss": 2.7627, + "step": 17476 + }, + { + "epoch": 1.5833842948064598, + "grad_norm": 0.9889433979988098, + "learning_rate": 9.444813628949437e-05, + "loss": 2.6373, + "step": 17477 + }, + { + "epoch": 1.5834748929809064, + "grad_norm": 0.9239698052406311, + "learning_rate": 9.44420950885036e-05, + "loss": 2.4784, + "step": 17478 + }, + { + "epoch": 1.5835654911553532, + "grad_norm": 0.9757924675941467, + "learning_rate": 9.443605388751285e-05, + "loss": 2.5502, + "step": 17479 + }, + { + "epoch": 1.5836560893298, + "grad_norm": 0.9068741202354431, + "learning_rate": 9.443001268652208e-05, + "loss": 2.9074, + "step": 17480 + }, + { + "epoch": 1.5837466875042467, + "grad_norm": 0.9120886325836182, + "learning_rate": 9.442397148553133e-05, + "loss": 2.8355, + "step": 17481 + }, + { + "epoch": 1.5838372856786935, + "grad_norm": 0.9344341158866882, + "learning_rate": 9.441793028454056e-05, + "loss": 3.0285, + "step": 17482 + }, + { + "epoch": 1.5839278838531403, + "grad_norm": 0.9174172878265381, + "learning_rate": 9.441188908354981e-05, + "loss": 2.6129, + "step": 17483 + }, + { + "epoch": 1.584018482027587, + "grad_norm": 0.8107998967170715, + "learning_rate": 9.440584788255906e-05, + "loss": 2.5571, + "step": 17484 + }, + { + "epoch": 1.584109080202034, + "grad_norm": 0.8905077576637268, + "learning_rate": 9.439980668156831e-05, + "loss": 2.9265, + "step": 17485 + }, + { + "epoch": 1.5841996783764807, + "grad_norm": 0.8791329264640808, + "learning_rate": 9.439376548057754e-05, + "loss": 2.6801, + "step": 17486 + }, + { + "epoch": 1.5842902765509275, + "grad_norm": 0.8238058090209961, + "learning_rate": 9.438772427958679e-05, + "loss": 2.1054, + "step": 17487 + }, + { + "epoch": 1.5843808747253743, + "grad_norm": 0.9096028208732605, + "learning_rate": 9.438168307859602e-05, + "loss": 2.4222, + "step": 17488 + }, + { + "epoch": 1.584471472899821, + "grad_norm": 0.9016973376274109, + "learning_rate": 9.437564187760527e-05, + "loss": 2.8404, + "step": 17489 + }, + { + "epoch": 1.5845620710742678, + "grad_norm": 0.9086077213287354, + "learning_rate": 9.436960067661452e-05, + "loss": 2.9055, + "step": 17490 + }, + { + "epoch": 1.5846526692487146, + "grad_norm": 0.9875266551971436, + "learning_rate": 9.436355947562377e-05, + "loss": 2.4013, + "step": 17491 + }, + { + "epoch": 1.5847432674231614, + "grad_norm": 0.9472048878669739, + "learning_rate": 9.4357518274633e-05, + "loss": 2.7289, + "step": 17492 + }, + { + "epoch": 1.5848338655976082, + "grad_norm": 0.8533011078834534, + "learning_rate": 9.435147707364225e-05, + "loss": 2.5275, + "step": 17493 + }, + { + "epoch": 1.584924463772055, + "grad_norm": 0.9418968558311462, + "learning_rate": 9.43454358726515e-05, + "loss": 2.8007, + "step": 17494 + }, + { + "epoch": 1.5850150619465018, + "grad_norm": 0.8723677396774292, + "learning_rate": 9.433939467166073e-05, + "loss": 2.7829, + "step": 17495 + }, + { + "epoch": 1.5851056601209486, + "grad_norm": 0.8756784796714783, + "learning_rate": 9.433335347066998e-05, + "loss": 2.5365, + "step": 17496 + }, + { + "epoch": 1.5851962582953953, + "grad_norm": 0.8809353113174438, + "learning_rate": 9.432731226967921e-05, + "loss": 2.7825, + "step": 17497 + }, + { + "epoch": 1.5852868564698421, + "grad_norm": 0.8672310709953308, + "learning_rate": 9.432127106868846e-05, + "loss": 2.6231, + "step": 17498 + }, + { + "epoch": 1.585377454644289, + "grad_norm": 0.9098407030105591, + "learning_rate": 9.43152298676977e-05, + "loss": 2.4241, + "step": 17499 + }, + { + "epoch": 1.5854680528187357, + "grad_norm": 0.9593366980552673, + "learning_rate": 9.430918866670695e-05, + "loss": 2.5722, + "step": 17500 + }, + { + "epoch": 1.5855586509931825, + "grad_norm": 0.8746485114097595, + "learning_rate": 9.430314746571619e-05, + "loss": 2.5549, + "step": 17501 + }, + { + "epoch": 1.5856492491676293, + "grad_norm": 0.9317200779914856, + "learning_rate": 9.429710626472543e-05, + "loss": 2.8174, + "step": 17502 + }, + { + "epoch": 1.585739847342076, + "grad_norm": 0.917698860168457, + "learning_rate": 9.429106506373467e-05, + "loss": 2.7404, + "step": 17503 + }, + { + "epoch": 1.5858304455165229, + "grad_norm": 0.9398153424263, + "learning_rate": 9.428502386274392e-05, + "loss": 2.4874, + "step": 17504 + }, + { + "epoch": 1.5859210436909696, + "grad_norm": 0.9024966359138489, + "learning_rate": 9.427898266175315e-05, + "loss": 2.6025, + "step": 17505 + }, + { + "epoch": 1.5860116418654164, + "grad_norm": 0.9253323078155518, + "learning_rate": 9.427294146076241e-05, + "loss": 2.9864, + "step": 17506 + }, + { + "epoch": 1.5861022400398632, + "grad_norm": 0.795944333076477, + "learning_rate": 9.426690025977165e-05, + "loss": 2.1771, + "step": 17507 + }, + { + "epoch": 1.58619283821431, + "grad_norm": 0.8907227516174316, + "learning_rate": 9.42608590587809e-05, + "loss": 2.5397, + "step": 17508 + }, + { + "epoch": 1.5862834363887568, + "grad_norm": 0.9033448696136475, + "learning_rate": 9.425481785779013e-05, + "loss": 2.7218, + "step": 17509 + }, + { + "epoch": 1.5863740345632036, + "grad_norm": 0.874609112739563, + "learning_rate": 9.424877665679938e-05, + "loss": 2.4621, + "step": 17510 + }, + { + "epoch": 1.5864646327376504, + "grad_norm": 0.88555508852005, + "learning_rate": 9.424273545580862e-05, + "loss": 2.777, + "step": 17511 + }, + { + "epoch": 1.5865552309120972, + "grad_norm": 0.8794095516204834, + "learning_rate": 9.423669425481786e-05, + "loss": 2.65, + "step": 17512 + }, + { + "epoch": 1.586645829086544, + "grad_norm": 0.9125868082046509, + "learning_rate": 9.42306530538271e-05, + "loss": 3.1545, + "step": 17513 + }, + { + "epoch": 1.5867364272609907, + "grad_norm": 0.960927426815033, + "learning_rate": 9.422461185283635e-05, + "loss": 2.7235, + "step": 17514 + }, + { + "epoch": 1.5868270254354375, + "grad_norm": 0.9176909923553467, + "learning_rate": 9.42185706518456e-05, + "loss": 2.5758, + "step": 17515 + }, + { + "epoch": 1.5869176236098843, + "grad_norm": 0.9054147005081177, + "learning_rate": 9.421252945085483e-05, + "loss": 2.6275, + "step": 17516 + }, + { + "epoch": 1.587008221784331, + "grad_norm": 0.9742613434791565, + "learning_rate": 9.420648824986408e-05, + "loss": 3.0092, + "step": 17517 + }, + { + "epoch": 1.5870988199587779, + "grad_norm": 0.891933262348175, + "learning_rate": 9.420044704887332e-05, + "loss": 2.6444, + "step": 17518 + }, + { + "epoch": 1.5871894181332245, + "grad_norm": 0.9003686904907227, + "learning_rate": 9.419440584788256e-05, + "loss": 2.6299, + "step": 17519 + }, + { + "epoch": 1.5872800163076715, + "grad_norm": 0.9020559787750244, + "learning_rate": 9.41883646468918e-05, + "loss": 2.7159, + "step": 17520 + }, + { + "epoch": 1.587370614482118, + "grad_norm": 0.8553079962730408, + "learning_rate": 9.418232344590106e-05, + "loss": 2.5805, + "step": 17521 + }, + { + "epoch": 1.587461212656565, + "grad_norm": 0.839425265789032, + "learning_rate": 9.417628224491029e-05, + "loss": 2.5767, + "step": 17522 + }, + { + "epoch": 1.5875518108310116, + "grad_norm": 0.921364963054657, + "learning_rate": 9.417024104391954e-05, + "loss": 2.7597, + "step": 17523 + }, + { + "epoch": 1.5876424090054586, + "grad_norm": 0.8814857006072998, + "learning_rate": 9.416419984292877e-05, + "loss": 2.6374, + "step": 17524 + }, + { + "epoch": 1.5877330071799052, + "grad_norm": 0.8977497816085815, + "learning_rate": 9.415815864193802e-05, + "loss": 2.5876, + "step": 17525 + }, + { + "epoch": 1.5878236053543522, + "grad_norm": 0.8677102327346802, + "learning_rate": 9.415211744094727e-05, + "loss": 2.7562, + "step": 17526 + }, + { + "epoch": 1.5879142035287988, + "grad_norm": 0.8974522948265076, + "learning_rate": 9.41460762399565e-05, + "loss": 2.8551, + "step": 17527 + }, + { + "epoch": 1.5880048017032458, + "grad_norm": 0.9887921214103699, + "learning_rate": 9.414003503896575e-05, + "loss": 2.7964, + "step": 17528 + }, + { + "epoch": 1.5880953998776923, + "grad_norm": 0.9549155235290527, + "learning_rate": 9.4133993837975e-05, + "loss": 2.776, + "step": 17529 + }, + { + "epoch": 1.5881859980521393, + "grad_norm": 0.926777720451355, + "learning_rate": 9.412795263698425e-05, + "loss": 2.2716, + "step": 17530 + }, + { + "epoch": 1.588276596226586, + "grad_norm": 0.8561556339263916, + "learning_rate": 9.412191143599348e-05, + "loss": 2.6529, + "step": 17531 + }, + { + "epoch": 1.588367194401033, + "grad_norm": 0.867419958114624, + "learning_rate": 9.411587023500273e-05, + "loss": 2.6775, + "step": 17532 + }, + { + "epoch": 1.5884577925754795, + "grad_norm": 0.9417465329170227, + "learning_rate": 9.410982903401196e-05, + "loss": 2.7568, + "step": 17533 + }, + { + "epoch": 1.5885483907499265, + "grad_norm": 0.8869995474815369, + "learning_rate": 9.410378783302121e-05, + "loss": 2.6673, + "step": 17534 + }, + { + "epoch": 1.588638988924373, + "grad_norm": 0.8829757571220398, + "learning_rate": 9.409774663203044e-05, + "loss": 2.8364, + "step": 17535 + }, + { + "epoch": 1.58872958709882, + "grad_norm": 0.9035511016845703, + "learning_rate": 9.40917054310397e-05, + "loss": 2.1431, + "step": 17536 + }, + { + "epoch": 1.5888201852732666, + "grad_norm": 0.8527433276176453, + "learning_rate": 9.408566423004894e-05, + "loss": 2.2951, + "step": 17537 + }, + { + "epoch": 1.5889107834477136, + "grad_norm": 0.9010955691337585, + "learning_rate": 9.407962302905819e-05, + "loss": 2.8347, + "step": 17538 + }, + { + "epoch": 1.5890013816221602, + "grad_norm": 0.86147141456604, + "learning_rate": 9.407358182806742e-05, + "loss": 2.847, + "step": 17539 + }, + { + "epoch": 1.5890919797966072, + "grad_norm": 0.8841040730476379, + "learning_rate": 9.406754062707667e-05, + "loss": 2.4616, + "step": 17540 + }, + { + "epoch": 1.5891825779710538, + "grad_norm": 0.9187616109848022, + "learning_rate": 9.40614994260859e-05, + "loss": 2.5172, + "step": 17541 + }, + { + "epoch": 1.5892731761455008, + "grad_norm": 0.8695111274719238, + "learning_rate": 9.405545822509515e-05, + "loss": 2.7192, + "step": 17542 + }, + { + "epoch": 1.5893637743199474, + "grad_norm": 0.9876024127006531, + "learning_rate": 9.40494170241044e-05, + "loss": 2.774, + "step": 17543 + }, + { + "epoch": 1.5894543724943944, + "grad_norm": 0.8373991250991821, + "learning_rate": 9.404337582311364e-05, + "loss": 2.6737, + "step": 17544 + }, + { + "epoch": 1.589544970668841, + "grad_norm": 0.8609810471534729, + "learning_rate": 9.403733462212289e-05, + "loss": 2.637, + "step": 17545 + }, + { + "epoch": 1.589635568843288, + "grad_norm": 0.8817815780639648, + "learning_rate": 9.403129342113213e-05, + "loss": 2.6808, + "step": 17546 + }, + { + "epoch": 1.5897261670177345, + "grad_norm": 0.9637230038642883, + "learning_rate": 9.402525222014137e-05, + "loss": 2.7413, + "step": 17547 + }, + { + "epoch": 1.5898167651921815, + "grad_norm": 0.9352288842201233, + "learning_rate": 9.401921101915061e-05, + "loss": 2.4407, + "step": 17548 + }, + { + "epoch": 1.589907363366628, + "grad_norm": 0.9798232316970825, + "learning_rate": 9.401316981815986e-05, + "loss": 2.639, + "step": 17549 + }, + { + "epoch": 1.589997961541075, + "grad_norm": 0.8423669338226318, + "learning_rate": 9.400712861716909e-05, + "loss": 2.6071, + "step": 17550 + }, + { + "epoch": 1.5900885597155217, + "grad_norm": 0.8765303492546082, + "learning_rate": 9.400108741617835e-05, + "loss": 2.6073, + "step": 17551 + }, + { + "epoch": 1.5901791578899687, + "grad_norm": 0.8768669962882996, + "learning_rate": 9.399504621518758e-05, + "loss": 2.5429, + "step": 17552 + }, + { + "epoch": 1.5902697560644152, + "grad_norm": 0.9716887474060059, + "learning_rate": 9.398900501419683e-05, + "loss": 2.6163, + "step": 17553 + }, + { + "epoch": 1.5903603542388622, + "grad_norm": 0.9620327353477478, + "learning_rate": 9.398296381320607e-05, + "loss": 2.7895, + "step": 17554 + }, + { + "epoch": 1.5904509524133088, + "grad_norm": 0.8714320659637451, + "learning_rate": 9.397692261221531e-05, + "loss": 2.7606, + "step": 17555 + }, + { + "epoch": 1.5905415505877558, + "grad_norm": 0.8859342932701111, + "learning_rate": 9.397088141122455e-05, + "loss": 2.7426, + "step": 17556 + }, + { + "epoch": 1.5906321487622024, + "grad_norm": 0.9105513095855713, + "learning_rate": 9.39648402102338e-05, + "loss": 2.8718, + "step": 17557 + }, + { + "epoch": 1.5907227469366494, + "grad_norm": 0.9050841927528381, + "learning_rate": 9.395879900924304e-05, + "loss": 2.7956, + "step": 17558 + }, + { + "epoch": 1.590813345111096, + "grad_norm": 0.9600034952163696, + "learning_rate": 9.395275780825229e-05, + "loss": 3.0556, + "step": 17559 + }, + { + "epoch": 1.5909039432855427, + "grad_norm": 0.8725451231002808, + "learning_rate": 9.394671660726153e-05, + "loss": 2.732, + "step": 17560 + }, + { + "epoch": 1.5909945414599895, + "grad_norm": 0.9281391501426697, + "learning_rate": 9.394067540627077e-05, + "loss": 2.7962, + "step": 17561 + }, + { + "epoch": 1.5910851396344363, + "grad_norm": 0.8381581902503967, + "learning_rate": 9.393463420528002e-05, + "loss": 2.4304, + "step": 17562 + }, + { + "epoch": 1.591175737808883, + "grad_norm": 0.8787360787391663, + "learning_rate": 9.392859300428925e-05, + "loss": 2.6107, + "step": 17563 + }, + { + "epoch": 1.59126633598333, + "grad_norm": 0.8589100241661072, + "learning_rate": 9.39225518032985e-05, + "loss": 2.6123, + "step": 17564 + }, + { + "epoch": 1.5913569341577767, + "grad_norm": 0.8929772973060608, + "learning_rate": 9.391651060230774e-05, + "loss": 2.8453, + "step": 17565 + }, + { + "epoch": 1.5914475323322235, + "grad_norm": 0.8785352110862732, + "learning_rate": 9.3910469401317e-05, + "loss": 2.6106, + "step": 17566 + }, + { + "epoch": 1.5915381305066703, + "grad_norm": 0.9334027171134949, + "learning_rate": 9.390442820032623e-05, + "loss": 2.6184, + "step": 17567 + }, + { + "epoch": 1.591628728681117, + "grad_norm": 0.9218865633010864, + "learning_rate": 9.389838699933548e-05, + "loss": 2.676, + "step": 17568 + }, + { + "epoch": 1.5917193268555638, + "grad_norm": 0.9661630988121033, + "learning_rate": 9.389234579834471e-05, + "loss": 2.6742, + "step": 17569 + }, + { + "epoch": 1.5918099250300106, + "grad_norm": 0.7447712421417236, + "learning_rate": 9.388630459735396e-05, + "loss": 2.0472, + "step": 17570 + }, + { + "epoch": 1.5919005232044574, + "grad_norm": 1.0667270421981812, + "learning_rate": 9.38802633963632e-05, + "loss": 2.6187, + "step": 17571 + }, + { + "epoch": 1.5919911213789042, + "grad_norm": 0.844812273979187, + "learning_rate": 9.387422219537244e-05, + "loss": 2.0987, + "step": 17572 + }, + { + "epoch": 1.592081719553351, + "grad_norm": 0.8647435903549194, + "learning_rate": 9.386818099438168e-05, + "loss": 2.6879, + "step": 17573 + }, + { + "epoch": 1.5921723177277978, + "grad_norm": 0.8877983689308167, + "learning_rate": 9.386213979339094e-05, + "loss": 2.7449, + "step": 17574 + }, + { + "epoch": 1.5922629159022446, + "grad_norm": 0.8624906539916992, + "learning_rate": 9.385609859240017e-05, + "loss": 2.5879, + "step": 17575 + }, + { + "epoch": 1.5923535140766913, + "grad_norm": 0.8974398970603943, + "learning_rate": 9.385005739140942e-05, + "loss": 2.7674, + "step": 17576 + }, + { + "epoch": 1.5924441122511381, + "grad_norm": 0.8647843599319458, + "learning_rate": 9.384401619041867e-05, + "loss": 2.3871, + "step": 17577 + }, + { + "epoch": 1.592534710425585, + "grad_norm": 0.9923949241638184, + "learning_rate": 9.38379749894279e-05, + "loss": 2.6989, + "step": 17578 + }, + { + "epoch": 1.5926253086000317, + "grad_norm": 0.8795757293701172, + "learning_rate": 9.383193378843715e-05, + "loss": 2.6319, + "step": 17579 + }, + { + "epoch": 1.5927159067744785, + "grad_norm": 0.8542953133583069, + "learning_rate": 9.382589258744638e-05, + "loss": 2.2943, + "step": 17580 + }, + { + "epoch": 1.5928065049489253, + "grad_norm": 0.9292035698890686, + "learning_rate": 9.381985138645564e-05, + "loss": 2.7774, + "step": 17581 + }, + { + "epoch": 1.592897103123372, + "grad_norm": 0.9360343217849731, + "learning_rate": 9.381381018546488e-05, + "loss": 2.8942, + "step": 17582 + }, + { + "epoch": 1.5929877012978189, + "grad_norm": 0.8928028345108032, + "learning_rate": 9.380776898447413e-05, + "loss": 2.8892, + "step": 17583 + }, + { + "epoch": 1.5930782994722656, + "grad_norm": 0.8996580243110657, + "learning_rate": 9.380172778348336e-05, + "loss": 2.8332, + "step": 17584 + }, + { + "epoch": 1.5931688976467124, + "grad_norm": 1.0629487037658691, + "learning_rate": 9.37956865824926e-05, + "loss": 2.9489, + "step": 17585 + }, + { + "epoch": 1.5932594958211592, + "grad_norm": 0.8555801510810852, + "learning_rate": 9.378964538150184e-05, + "loss": 2.7112, + "step": 17586 + }, + { + "epoch": 1.593350093995606, + "grad_norm": 0.8805655241012573, + "learning_rate": 9.378360418051109e-05, + "loss": 2.7045, + "step": 17587 + }, + { + "epoch": 1.5934406921700528, + "grad_norm": 0.931750476360321, + "learning_rate": 9.377756297952032e-05, + "loss": 2.669, + "step": 17588 + }, + { + "epoch": 1.5935312903444996, + "grad_norm": 0.9080677628517151, + "learning_rate": 9.377152177852958e-05, + "loss": 2.5776, + "step": 17589 + }, + { + "epoch": 1.5936218885189464, + "grad_norm": 0.902300238609314, + "learning_rate": 9.376548057753882e-05, + "loss": 2.7625, + "step": 17590 + }, + { + "epoch": 1.5937124866933932, + "grad_norm": 0.8715638518333435, + "learning_rate": 9.375943937654807e-05, + "loss": 2.5311, + "step": 17591 + }, + { + "epoch": 1.59380308486784, + "grad_norm": 0.878533661365509, + "learning_rate": 9.37533981755573e-05, + "loss": 2.9055, + "step": 17592 + }, + { + "epoch": 1.5938936830422867, + "grad_norm": 0.8932660222053528, + "learning_rate": 9.374735697456655e-05, + "loss": 2.7243, + "step": 17593 + }, + { + "epoch": 1.5939842812167335, + "grad_norm": 0.9340742230415344, + "learning_rate": 9.37413157735758e-05, + "loss": 2.7498, + "step": 17594 + }, + { + "epoch": 1.5940748793911803, + "grad_norm": 0.9062513113021851, + "learning_rate": 9.373527457258503e-05, + "loss": 2.5957, + "step": 17595 + }, + { + "epoch": 1.594165477565627, + "grad_norm": 1.0012589693069458, + "learning_rate": 9.372923337159428e-05, + "loss": 2.6293, + "step": 17596 + }, + { + "epoch": 1.5942560757400739, + "grad_norm": 0.9095675945281982, + "learning_rate": 9.372319217060352e-05, + "loss": 2.5976, + "step": 17597 + }, + { + "epoch": 1.5943466739145207, + "grad_norm": 0.940876305103302, + "learning_rate": 9.371715096961277e-05, + "loss": 2.6504, + "step": 17598 + }, + { + "epoch": 1.5944372720889675, + "grad_norm": 0.9582182765007019, + "learning_rate": 9.3711109768622e-05, + "loss": 2.9007, + "step": 17599 + }, + { + "epoch": 1.594527870263414, + "grad_norm": 0.8829522728919983, + "learning_rate": 9.370506856763125e-05, + "loss": 2.5309, + "step": 17600 + }, + { + "epoch": 1.594618468437861, + "grad_norm": 0.8865326642990112, + "learning_rate": 9.369902736664049e-05, + "loss": 2.7267, + "step": 17601 + }, + { + "epoch": 1.5947090666123076, + "grad_norm": 0.9027314782142639, + "learning_rate": 9.369298616564973e-05, + "loss": 2.8236, + "step": 17602 + }, + { + "epoch": 1.5947996647867546, + "grad_norm": 0.9407446980476379, + "learning_rate": 9.368694496465897e-05, + "loss": 2.6821, + "step": 17603 + }, + { + "epoch": 1.5948902629612012, + "grad_norm": 0.8596780300140381, + "learning_rate": 9.368090376366823e-05, + "loss": 2.552, + "step": 17604 + }, + { + "epoch": 1.5949808611356482, + "grad_norm": 0.954277515411377, + "learning_rate": 9.367486256267746e-05, + "loss": 2.775, + "step": 17605 + }, + { + "epoch": 1.5950714593100948, + "grad_norm": 0.971157431602478, + "learning_rate": 9.366882136168671e-05, + "loss": 3.013, + "step": 17606 + }, + { + "epoch": 1.5951620574845418, + "grad_norm": 0.8619102239608765, + "learning_rate": 9.366278016069595e-05, + "loss": 1.9437, + "step": 17607 + }, + { + "epoch": 1.5952526556589883, + "grad_norm": 0.8784334659576416, + "learning_rate": 9.36567389597052e-05, + "loss": 2.6492, + "step": 17608 + }, + { + "epoch": 1.5953432538334353, + "grad_norm": 0.8724831342697144, + "learning_rate": 9.365069775871443e-05, + "loss": 2.5706, + "step": 17609 + }, + { + "epoch": 1.595433852007882, + "grad_norm": 0.8910329341888428, + "learning_rate": 9.364465655772367e-05, + "loss": 2.6752, + "step": 17610 + }, + { + "epoch": 1.595524450182329, + "grad_norm": 0.8586281538009644, + "learning_rate": 9.363861535673292e-05, + "loss": 2.6033, + "step": 17611 + }, + { + "epoch": 1.5956150483567755, + "grad_norm": 0.9655230641365051, + "learning_rate": 9.363257415574217e-05, + "loss": 2.948, + "step": 17612 + }, + { + "epoch": 1.5957056465312225, + "grad_norm": 0.941001296043396, + "learning_rate": 9.362653295475142e-05, + "loss": 2.9151, + "step": 17613 + }, + { + "epoch": 1.595796244705669, + "grad_norm": 0.9724501371383667, + "learning_rate": 9.362049175376065e-05, + "loss": 2.6612, + "step": 17614 + }, + { + "epoch": 1.595886842880116, + "grad_norm": 0.9141797423362732, + "learning_rate": 9.36144505527699e-05, + "loss": 2.7657, + "step": 17615 + }, + { + "epoch": 1.5959774410545626, + "grad_norm": 0.8478700518608093, + "learning_rate": 9.360840935177913e-05, + "loss": 2.5605, + "step": 17616 + }, + { + "epoch": 1.5960680392290096, + "grad_norm": 1.071956753730774, + "learning_rate": 9.360236815078838e-05, + "loss": 2.6948, + "step": 17617 + }, + { + "epoch": 1.5961586374034562, + "grad_norm": 0.8997564315795898, + "learning_rate": 9.359632694979762e-05, + "loss": 1.9034, + "step": 17618 + }, + { + "epoch": 1.5962492355779032, + "grad_norm": 0.918794572353363, + "learning_rate": 9.359028574880688e-05, + "loss": 2.64, + "step": 17619 + }, + { + "epoch": 1.5963398337523498, + "grad_norm": 0.9262911677360535, + "learning_rate": 9.358424454781611e-05, + "loss": 2.8273, + "step": 17620 + }, + { + "epoch": 1.5964304319267968, + "grad_norm": 0.8510004878044128, + "learning_rate": 9.357820334682536e-05, + "loss": 2.3697, + "step": 17621 + }, + { + "epoch": 1.5965210301012434, + "grad_norm": 0.9012651443481445, + "learning_rate": 9.357216214583459e-05, + "loss": 2.698, + "step": 17622 + }, + { + "epoch": 1.5966116282756904, + "grad_norm": 0.9299004077911377, + "learning_rate": 9.356612094484384e-05, + "loss": 2.8845, + "step": 17623 + }, + { + "epoch": 1.596702226450137, + "grad_norm": 0.8689337372779846, + "learning_rate": 9.356007974385307e-05, + "loss": 2.4841, + "step": 17624 + }, + { + "epoch": 1.596792824624584, + "grad_norm": 0.8797716498374939, + "learning_rate": 9.355403854286232e-05, + "loss": 2.586, + "step": 17625 + }, + { + "epoch": 1.5968834227990305, + "grad_norm": 0.9282847046852112, + "learning_rate": 9.354799734187157e-05, + "loss": 2.5342, + "step": 17626 + }, + { + "epoch": 1.5969740209734775, + "grad_norm": 0.9029791951179504, + "learning_rate": 9.354195614088082e-05, + "loss": 2.5692, + "step": 17627 + }, + { + "epoch": 1.597064619147924, + "grad_norm": 0.9123587012290955, + "learning_rate": 9.353591493989005e-05, + "loss": 2.589, + "step": 17628 + }, + { + "epoch": 1.597155217322371, + "grad_norm": 0.8808429837226868, + "learning_rate": 9.35298737388993e-05, + "loss": 2.5191, + "step": 17629 + }, + { + "epoch": 1.5972458154968177, + "grad_norm": 0.9122250080108643, + "learning_rate": 9.352383253790855e-05, + "loss": 2.7268, + "step": 17630 + }, + { + "epoch": 1.5973364136712647, + "grad_norm": 0.8401623368263245, + "learning_rate": 9.351779133691778e-05, + "loss": 2.6792, + "step": 17631 + }, + { + "epoch": 1.5974270118457112, + "grad_norm": 0.9100736379623413, + "learning_rate": 9.351175013592703e-05, + "loss": 2.6584, + "step": 17632 + }, + { + "epoch": 1.5975176100201582, + "grad_norm": 0.970830500125885, + "learning_rate": 9.350570893493626e-05, + "loss": 2.7596, + "step": 17633 + }, + { + "epoch": 1.5976082081946048, + "grad_norm": 0.8983023762702942, + "learning_rate": 9.349966773394552e-05, + "loss": 2.7213, + "step": 17634 + }, + { + "epoch": 1.5976988063690518, + "grad_norm": 0.9161785244941711, + "learning_rate": 9.349362653295476e-05, + "loss": 2.8834, + "step": 17635 + }, + { + "epoch": 1.5977894045434984, + "grad_norm": 0.8814380764961243, + "learning_rate": 9.3487585331964e-05, + "loss": 2.6765, + "step": 17636 + }, + { + "epoch": 1.5978800027179454, + "grad_norm": 0.9134770035743713, + "learning_rate": 9.348154413097324e-05, + "loss": 2.8869, + "step": 17637 + }, + { + "epoch": 1.597970600892392, + "grad_norm": 0.8461428880691528, + "learning_rate": 9.347550292998249e-05, + "loss": 2.5589, + "step": 17638 + }, + { + "epoch": 1.598061199066839, + "grad_norm": 0.8964092135429382, + "learning_rate": 9.346946172899172e-05, + "loss": 2.8641, + "step": 17639 + }, + { + "epoch": 1.5981517972412855, + "grad_norm": 0.8995712995529175, + "learning_rate": 9.346342052800097e-05, + "loss": 2.7162, + "step": 17640 + }, + { + "epoch": 1.5982423954157323, + "grad_norm": 0.9115676283836365, + "learning_rate": 9.345737932701022e-05, + "loss": 2.5771, + "step": 17641 + }, + { + "epoch": 1.598332993590179, + "grad_norm": 0.8936452865600586, + "learning_rate": 9.345133812601946e-05, + "loss": 2.9086, + "step": 17642 + }, + { + "epoch": 1.598423591764626, + "grad_norm": 0.9514075517654419, + "learning_rate": 9.34452969250287e-05, + "loss": 2.6906, + "step": 17643 + }, + { + "epoch": 1.5985141899390727, + "grad_norm": 0.9157463312149048, + "learning_rate": 9.343925572403794e-05, + "loss": 2.6542, + "step": 17644 + }, + { + "epoch": 1.5986047881135195, + "grad_norm": 0.8998886942863464, + "learning_rate": 9.343321452304719e-05, + "loss": 2.6225, + "step": 17645 + }, + { + "epoch": 1.5986953862879663, + "grad_norm": 0.9332543611526489, + "learning_rate": 9.342717332205643e-05, + "loss": 2.5328, + "step": 17646 + }, + { + "epoch": 1.598785984462413, + "grad_norm": 0.8454679250717163, + "learning_rate": 9.342113212106567e-05, + "loss": 2.5213, + "step": 17647 + }, + { + "epoch": 1.5988765826368598, + "grad_norm": 0.8975241184234619, + "learning_rate": 9.341509092007491e-05, + "loss": 2.4479, + "step": 17648 + }, + { + "epoch": 1.5989671808113066, + "grad_norm": 0.9209701418876648, + "learning_rate": 9.340904971908417e-05, + "loss": 2.7318, + "step": 17649 + }, + { + "epoch": 1.5990577789857534, + "grad_norm": 0.796583890914917, + "learning_rate": 9.34030085180934e-05, + "loss": 2.2088, + "step": 17650 + }, + { + "epoch": 1.5991483771602002, + "grad_norm": 0.9540758728981018, + "learning_rate": 9.339696731710265e-05, + "loss": 2.9211, + "step": 17651 + }, + { + "epoch": 1.599238975334647, + "grad_norm": 0.8471081852912903, + "learning_rate": 9.339092611611188e-05, + "loss": 1.9652, + "step": 17652 + }, + { + "epoch": 1.5993295735090938, + "grad_norm": 0.9240389466285706, + "learning_rate": 9.338488491512113e-05, + "loss": 2.981, + "step": 17653 + }, + { + "epoch": 1.5994201716835406, + "grad_norm": 0.8639417290687561, + "learning_rate": 9.337884371413037e-05, + "loss": 2.6973, + "step": 17654 + }, + { + "epoch": 1.5995107698579873, + "grad_norm": 0.8743289113044739, + "learning_rate": 9.337280251313961e-05, + "loss": 2.8151, + "step": 17655 + }, + { + "epoch": 1.5996013680324341, + "grad_norm": 0.7071089148521423, + "learning_rate": 9.336676131214886e-05, + "loss": 1.4076, + "step": 17656 + }, + { + "epoch": 1.599691966206881, + "grad_norm": 0.9621722102165222, + "learning_rate": 9.336072011115811e-05, + "loss": 2.7263, + "step": 17657 + }, + { + "epoch": 1.5997825643813277, + "grad_norm": 1.0048229694366455, + "learning_rate": 9.335467891016734e-05, + "loss": 2.6224, + "step": 17658 + }, + { + "epoch": 1.5998731625557745, + "grad_norm": 0.9310932159423828, + "learning_rate": 9.334863770917659e-05, + "loss": 2.6052, + "step": 17659 + }, + { + "epoch": 1.5999637607302213, + "grad_norm": 0.9130824208259583, + "learning_rate": 9.334259650818582e-05, + "loss": 2.8673, + "step": 17660 + }, + { + "epoch": 1.600054358904668, + "grad_norm": 0.9091561436653137, + "learning_rate": 9.333655530719507e-05, + "loss": 2.8668, + "step": 17661 + }, + { + "epoch": 1.6001449570791149, + "grad_norm": 0.9785208106040955, + "learning_rate": 9.333051410620432e-05, + "loss": 2.4009, + "step": 17662 + }, + { + "epoch": 1.6002355552535616, + "grad_norm": 0.9284647107124329, + "learning_rate": 9.332447290521355e-05, + "loss": 3.0213, + "step": 17663 + }, + { + "epoch": 1.6003261534280084, + "grad_norm": 0.8677919507026672, + "learning_rate": 9.33184317042228e-05, + "loss": 2.5495, + "step": 17664 + }, + { + "epoch": 1.6004167516024552, + "grad_norm": 0.9545963406562805, + "learning_rate": 9.331239050323205e-05, + "loss": 2.8065, + "step": 17665 + }, + { + "epoch": 1.600507349776902, + "grad_norm": 0.872588574886322, + "learning_rate": 9.33063493022413e-05, + "loss": 2.5479, + "step": 17666 + }, + { + "epoch": 1.6005979479513488, + "grad_norm": 0.8792713284492493, + "learning_rate": 9.330030810125053e-05, + "loss": 2.4062, + "step": 17667 + }, + { + "epoch": 1.6006885461257956, + "grad_norm": 1.0171421766281128, + "learning_rate": 9.329426690025978e-05, + "loss": 2.7049, + "step": 17668 + }, + { + "epoch": 1.6007791443002424, + "grad_norm": 0.9274817705154419, + "learning_rate": 9.328822569926901e-05, + "loss": 2.6842, + "step": 17669 + }, + { + "epoch": 1.6008697424746892, + "grad_norm": 0.9203313589096069, + "learning_rate": 9.328218449827826e-05, + "loss": 2.8271, + "step": 17670 + }, + { + "epoch": 1.600960340649136, + "grad_norm": 0.9379491209983826, + "learning_rate": 9.327614329728751e-05, + "loss": 2.4486, + "step": 17671 + }, + { + "epoch": 1.6010509388235827, + "grad_norm": 0.8481637835502625, + "learning_rate": 9.327010209629676e-05, + "loss": 2.8315, + "step": 17672 + }, + { + "epoch": 1.6011415369980295, + "grad_norm": 0.9358983039855957, + "learning_rate": 9.326406089530599e-05, + "loss": 2.6063, + "step": 17673 + }, + { + "epoch": 1.6012321351724763, + "grad_norm": 0.7987032532691956, + "learning_rate": 9.325801969431524e-05, + "loss": 2.0556, + "step": 17674 + }, + { + "epoch": 1.601322733346923, + "grad_norm": 0.8861954212188721, + "learning_rate": 9.325197849332447e-05, + "loss": 2.6762, + "step": 17675 + }, + { + "epoch": 1.6014133315213699, + "grad_norm": 0.9608368873596191, + "learning_rate": 9.324593729233372e-05, + "loss": 2.7293, + "step": 17676 + }, + { + "epoch": 1.6015039296958167, + "grad_norm": 0.7872868776321411, + "learning_rate": 9.323989609134297e-05, + "loss": 2.0853, + "step": 17677 + }, + { + "epoch": 1.6015945278702635, + "grad_norm": 0.9487127065658569, + "learning_rate": 9.32338548903522e-05, + "loss": 2.5857, + "step": 17678 + }, + { + "epoch": 1.6016851260447103, + "grad_norm": 0.9422929286956787, + "learning_rate": 9.322781368936145e-05, + "loss": 2.7207, + "step": 17679 + }, + { + "epoch": 1.601775724219157, + "grad_norm": 0.8662245869636536, + "learning_rate": 9.32217724883707e-05, + "loss": 2.7205, + "step": 17680 + }, + { + "epoch": 1.6018663223936036, + "grad_norm": 0.9910593032836914, + "learning_rate": 9.321573128737994e-05, + "loss": 2.7338, + "step": 17681 + }, + { + "epoch": 1.6019569205680506, + "grad_norm": 1.2263377904891968, + "learning_rate": 9.320969008638918e-05, + "loss": 2.6467, + "step": 17682 + }, + { + "epoch": 1.6020475187424972, + "grad_norm": 0.7851386666297913, + "learning_rate": 9.320364888539842e-05, + "loss": 2.2559, + "step": 17683 + }, + { + "epoch": 1.6021381169169442, + "grad_norm": 0.9435672760009766, + "learning_rate": 9.319760768440766e-05, + "loss": 2.6574, + "step": 17684 + }, + { + "epoch": 1.6022287150913908, + "grad_norm": 1.0505335330963135, + "learning_rate": 9.31915664834169e-05, + "loss": 2.6762, + "step": 17685 + }, + { + "epoch": 1.6023193132658378, + "grad_norm": 0.9355161190032959, + "learning_rate": 9.318552528242615e-05, + "loss": 2.7525, + "step": 17686 + }, + { + "epoch": 1.6024099114402843, + "grad_norm": 0.9046921133995056, + "learning_rate": 9.31794840814354e-05, + "loss": 2.7088, + "step": 17687 + }, + { + "epoch": 1.6025005096147313, + "grad_norm": 0.8633254766464233, + "learning_rate": 9.317344288044464e-05, + "loss": 2.6533, + "step": 17688 + }, + { + "epoch": 1.602591107789178, + "grad_norm": 0.9677612781524658, + "learning_rate": 9.316740167945388e-05, + "loss": 3.027, + "step": 17689 + }, + { + "epoch": 1.602681705963625, + "grad_norm": 0.9374820590019226, + "learning_rate": 9.316136047846312e-05, + "loss": 2.5476, + "step": 17690 + }, + { + "epoch": 1.6027723041380715, + "grad_norm": 0.986644983291626, + "learning_rate": 9.315531927747237e-05, + "loss": 2.6279, + "step": 17691 + }, + { + "epoch": 1.6028629023125185, + "grad_norm": 1.064667820930481, + "learning_rate": 9.31492780764816e-05, + "loss": 2.8639, + "step": 17692 + }, + { + "epoch": 1.602953500486965, + "grad_norm": 1.035765528678894, + "learning_rate": 9.314323687549085e-05, + "loss": 2.8773, + "step": 17693 + }, + { + "epoch": 1.603044098661412, + "grad_norm": 0.9834886789321899, + "learning_rate": 9.31371956745001e-05, + "loss": 2.6499, + "step": 17694 + }, + { + "epoch": 1.6031346968358586, + "grad_norm": 0.8503183722496033, + "learning_rate": 9.313115447350934e-05, + "loss": 2.6005, + "step": 17695 + }, + { + "epoch": 1.6032252950103056, + "grad_norm": 0.924281120300293, + "learning_rate": 9.312511327251858e-05, + "loss": 2.5566, + "step": 17696 + }, + { + "epoch": 1.6033158931847522, + "grad_norm": 0.9537166357040405, + "learning_rate": 9.311907207152782e-05, + "loss": 2.6609, + "step": 17697 + }, + { + "epoch": 1.6034064913591992, + "grad_norm": 0.9159530997276306, + "learning_rate": 9.311303087053707e-05, + "loss": 2.655, + "step": 17698 + }, + { + "epoch": 1.6034970895336458, + "grad_norm": 0.8748213052749634, + "learning_rate": 9.31069896695463e-05, + "loss": 2.793, + "step": 17699 + }, + { + "epoch": 1.6035876877080928, + "grad_norm": 0.9291024208068848, + "learning_rate": 9.310094846855555e-05, + "loss": 2.7734, + "step": 17700 + }, + { + "epoch": 1.6036782858825394, + "grad_norm": 0.9232608079910278, + "learning_rate": 9.30949072675648e-05, + "loss": 2.9512, + "step": 17701 + }, + { + "epoch": 1.6037688840569864, + "grad_norm": 0.9427239298820496, + "learning_rate": 9.308886606657405e-05, + "loss": 2.7559, + "step": 17702 + }, + { + "epoch": 1.603859482231433, + "grad_norm": 0.8732020258903503, + "learning_rate": 9.308282486558328e-05, + "loss": 3.0466, + "step": 17703 + }, + { + "epoch": 1.60395008040588, + "grad_norm": 0.8793477416038513, + "learning_rate": 9.307678366459253e-05, + "loss": 2.5433, + "step": 17704 + }, + { + "epoch": 1.6040406785803265, + "grad_norm": 0.9240687489509583, + "learning_rate": 9.307074246360176e-05, + "loss": 2.68, + "step": 17705 + }, + { + "epoch": 1.6041312767547735, + "grad_norm": 0.787356972694397, + "learning_rate": 9.306470126261101e-05, + "loss": 2.0034, + "step": 17706 + }, + { + "epoch": 1.60422187492922, + "grad_norm": 0.992866039276123, + "learning_rate": 9.305866006162025e-05, + "loss": 2.7602, + "step": 17707 + }, + { + "epoch": 1.604312473103667, + "grad_norm": 0.9485662579536438, + "learning_rate": 9.305261886062949e-05, + "loss": 2.8399, + "step": 17708 + }, + { + "epoch": 1.6044030712781137, + "grad_norm": 0.890837550163269, + "learning_rate": 9.304657765963874e-05, + "loss": 2.6871, + "step": 17709 + }, + { + "epoch": 1.6044936694525607, + "grad_norm": 0.9499419927597046, + "learning_rate": 9.304053645864799e-05, + "loss": 2.7841, + "step": 17710 + }, + { + "epoch": 1.6045842676270072, + "grad_norm": 0.9001545906066895, + "learning_rate": 9.303449525765722e-05, + "loss": 2.5375, + "step": 17711 + }, + { + "epoch": 1.6046748658014542, + "grad_norm": 0.9261254668235779, + "learning_rate": 9.302845405666647e-05, + "loss": 2.9916, + "step": 17712 + }, + { + "epoch": 1.6047654639759008, + "grad_norm": 0.9033751487731934, + "learning_rate": 9.302241285567572e-05, + "loss": 2.961, + "step": 17713 + }, + { + "epoch": 1.6048560621503478, + "grad_norm": 0.8717748522758484, + "learning_rate": 9.301637165468495e-05, + "loss": 2.3995, + "step": 17714 + }, + { + "epoch": 1.6049466603247944, + "grad_norm": 0.8383679389953613, + "learning_rate": 9.30103304536942e-05, + "loss": 2.5193, + "step": 17715 + }, + { + "epoch": 1.6050372584992414, + "grad_norm": 0.8839095234870911, + "learning_rate": 9.300428925270345e-05, + "loss": 2.6452, + "step": 17716 + }, + { + "epoch": 1.605127856673688, + "grad_norm": 0.8411935567855835, + "learning_rate": 9.29982480517127e-05, + "loss": 2.0358, + "step": 17717 + }, + { + "epoch": 1.605218454848135, + "grad_norm": 0.844162106513977, + "learning_rate": 9.299220685072193e-05, + "loss": 2.8178, + "step": 17718 + }, + { + "epoch": 1.6053090530225815, + "grad_norm": 0.9549509286880493, + "learning_rate": 9.298616564973118e-05, + "loss": 2.7142, + "step": 17719 + }, + { + "epoch": 1.6053996511970285, + "grad_norm": 0.8736999034881592, + "learning_rate": 9.298012444874041e-05, + "loss": 2.7365, + "step": 17720 + }, + { + "epoch": 1.605490249371475, + "grad_norm": 0.8512074947357178, + "learning_rate": 9.297408324774966e-05, + "loss": 2.3875, + "step": 17721 + }, + { + "epoch": 1.605580847545922, + "grad_norm": 0.9473116993904114, + "learning_rate": 9.296804204675889e-05, + "loss": 3.0861, + "step": 17722 + }, + { + "epoch": 1.6056714457203687, + "grad_norm": 0.9138274788856506, + "learning_rate": 9.296200084576814e-05, + "loss": 2.7028, + "step": 17723 + }, + { + "epoch": 1.6057620438948155, + "grad_norm": 0.893629252910614, + "learning_rate": 9.295595964477739e-05, + "loss": 2.6972, + "step": 17724 + }, + { + "epoch": 1.6058526420692623, + "grad_norm": 0.9324570298194885, + "learning_rate": 9.294991844378663e-05, + "loss": 2.5627, + "step": 17725 + }, + { + "epoch": 1.605943240243709, + "grad_norm": 1.0650078058242798, + "learning_rate": 9.294387724279587e-05, + "loss": 2.8057, + "step": 17726 + }, + { + "epoch": 1.6060338384181558, + "grad_norm": 0.930749237537384, + "learning_rate": 9.293783604180512e-05, + "loss": 2.6871, + "step": 17727 + }, + { + "epoch": 1.6061244365926026, + "grad_norm": 0.8650894165039062, + "learning_rate": 9.293179484081435e-05, + "loss": 2.0774, + "step": 17728 + }, + { + "epoch": 1.6062150347670494, + "grad_norm": 0.8651999235153198, + "learning_rate": 9.29257536398236e-05, + "loss": 2.3967, + "step": 17729 + }, + { + "epoch": 1.6063056329414962, + "grad_norm": 0.9489815831184387, + "learning_rate": 9.291971243883285e-05, + "loss": 2.9916, + "step": 17730 + }, + { + "epoch": 1.606396231115943, + "grad_norm": 1.012642502784729, + "learning_rate": 9.291367123784209e-05, + "loss": 2.8446, + "step": 17731 + }, + { + "epoch": 1.6064868292903898, + "grad_norm": 0.9063881039619446, + "learning_rate": 9.290763003685134e-05, + "loss": 2.5052, + "step": 17732 + }, + { + "epoch": 1.6065774274648366, + "grad_norm": 0.9075023531913757, + "learning_rate": 9.290158883586057e-05, + "loss": 2.0888, + "step": 17733 + }, + { + "epoch": 1.6066680256392833, + "grad_norm": 0.9146521687507629, + "learning_rate": 9.289554763486982e-05, + "loss": 2.7228, + "step": 17734 + }, + { + "epoch": 1.6067586238137301, + "grad_norm": 0.8198142647743225, + "learning_rate": 9.288950643387906e-05, + "loss": 2.1495, + "step": 17735 + }, + { + "epoch": 1.606849221988177, + "grad_norm": 0.8720648288726807, + "learning_rate": 9.28834652328883e-05, + "loss": 2.5608, + "step": 17736 + }, + { + "epoch": 1.6069398201626237, + "grad_norm": 0.8485918045043945, + "learning_rate": 9.287742403189754e-05, + "loss": 2.0433, + "step": 17737 + }, + { + "epoch": 1.6070304183370705, + "grad_norm": 0.9230509996414185, + "learning_rate": 9.287138283090679e-05, + "loss": 2.7753, + "step": 17738 + }, + { + "epoch": 1.6071210165115173, + "grad_norm": 0.9044724702835083, + "learning_rate": 9.286534162991603e-05, + "loss": 2.6718, + "step": 17739 + }, + { + "epoch": 1.607211614685964, + "grad_norm": 0.9388160109519958, + "learning_rate": 9.285930042892528e-05, + "loss": 2.7323, + "step": 17740 + }, + { + "epoch": 1.6073022128604109, + "grad_norm": 0.9313284158706665, + "learning_rate": 9.285325922793451e-05, + "loss": 2.9141, + "step": 17741 + }, + { + "epoch": 1.6073928110348576, + "grad_norm": 0.9386330246925354, + "learning_rate": 9.284721802694376e-05, + "loss": 2.7698, + "step": 17742 + }, + { + "epoch": 1.6074834092093044, + "grad_norm": 0.8392333388328552, + "learning_rate": 9.2841176825953e-05, + "loss": 2.5287, + "step": 17743 + }, + { + "epoch": 1.6075740073837512, + "grad_norm": 0.9350263476371765, + "learning_rate": 9.283513562496224e-05, + "loss": 2.6231, + "step": 17744 + }, + { + "epoch": 1.607664605558198, + "grad_norm": 0.9354181289672852, + "learning_rate": 9.282909442397149e-05, + "loss": 2.8533, + "step": 17745 + }, + { + "epoch": 1.6077552037326448, + "grad_norm": 0.8847551941871643, + "learning_rate": 9.282305322298074e-05, + "loss": 2.6009, + "step": 17746 + }, + { + "epoch": 1.6078458019070916, + "grad_norm": 1.016322374343872, + "learning_rate": 9.281701202198997e-05, + "loss": 2.5955, + "step": 17747 + }, + { + "epoch": 1.6079364000815384, + "grad_norm": 0.9563291072845459, + "learning_rate": 9.281097082099922e-05, + "loss": 2.7089, + "step": 17748 + }, + { + "epoch": 1.6080269982559852, + "grad_norm": 0.9117429852485657, + "learning_rate": 9.280492962000847e-05, + "loss": 2.6122, + "step": 17749 + }, + { + "epoch": 1.608117596430432, + "grad_norm": 0.917113721370697, + "learning_rate": 9.27988884190177e-05, + "loss": 2.6618, + "step": 17750 + }, + { + "epoch": 1.6082081946048787, + "grad_norm": 0.9040265083312988, + "learning_rate": 9.279284721802695e-05, + "loss": 2.6926, + "step": 17751 + }, + { + "epoch": 1.6082987927793255, + "grad_norm": 0.9667934775352478, + "learning_rate": 9.278680601703618e-05, + "loss": 3.0562, + "step": 17752 + }, + { + "epoch": 1.6083893909537723, + "grad_norm": 0.8952207565307617, + "learning_rate": 9.278076481604543e-05, + "loss": 2.9681, + "step": 17753 + }, + { + "epoch": 1.608479989128219, + "grad_norm": 0.8986284136772156, + "learning_rate": 9.277472361505468e-05, + "loss": 2.5968, + "step": 17754 + }, + { + "epoch": 1.6085705873026659, + "grad_norm": 0.9168475270271301, + "learning_rate": 9.276868241406393e-05, + "loss": 2.9154, + "step": 17755 + }, + { + "epoch": 1.6086611854771127, + "grad_norm": 0.9357462525367737, + "learning_rate": 9.276264121307316e-05, + "loss": 2.5274, + "step": 17756 + }, + { + "epoch": 1.6087517836515595, + "grad_norm": 0.8981620073318481, + "learning_rate": 9.275660001208241e-05, + "loss": 2.508, + "step": 17757 + }, + { + "epoch": 1.6088423818260063, + "grad_norm": 0.8958845138549805, + "learning_rate": 9.275055881109164e-05, + "loss": 2.8267, + "step": 17758 + }, + { + "epoch": 1.608932980000453, + "grad_norm": 0.8742167949676514, + "learning_rate": 9.274451761010089e-05, + "loss": 2.8555, + "step": 17759 + }, + { + "epoch": 1.6090235781748998, + "grad_norm": 0.8661441802978516, + "learning_rate": 9.273847640911012e-05, + "loss": 2.6905, + "step": 17760 + }, + { + "epoch": 1.6091141763493466, + "grad_norm": 0.9252572655677795, + "learning_rate": 9.273243520811939e-05, + "loss": 2.8797, + "step": 17761 + }, + { + "epoch": 1.6092047745237932, + "grad_norm": 0.8639264106750488, + "learning_rate": 9.272639400712862e-05, + "loss": 2.5527, + "step": 17762 + }, + { + "epoch": 1.6092953726982402, + "grad_norm": 0.8878993391990662, + "learning_rate": 9.272035280613787e-05, + "loss": 2.7988, + "step": 17763 + }, + { + "epoch": 1.6093859708726868, + "grad_norm": 0.8794721961021423, + "learning_rate": 9.271431160514712e-05, + "loss": 2.7443, + "step": 17764 + }, + { + "epoch": 1.6094765690471338, + "grad_norm": 0.7375540137290955, + "learning_rate": 9.270827040415635e-05, + "loss": 2.0522, + "step": 17765 + }, + { + "epoch": 1.6095671672215803, + "grad_norm": 0.7776595950126648, + "learning_rate": 9.27022292031656e-05, + "loss": 2.0192, + "step": 17766 + }, + { + "epoch": 1.6096577653960273, + "grad_norm": 0.714682936668396, + "learning_rate": 9.269618800217483e-05, + "loss": 1.8395, + "step": 17767 + }, + { + "epoch": 1.609748363570474, + "grad_norm": 0.865774929523468, + "learning_rate": 9.269014680118408e-05, + "loss": 2.76, + "step": 17768 + }, + { + "epoch": 1.609838961744921, + "grad_norm": 0.8674358129501343, + "learning_rate": 9.268410560019333e-05, + "loss": 3.0077, + "step": 17769 + }, + { + "epoch": 1.6099295599193675, + "grad_norm": 0.8889656662940979, + "learning_rate": 9.267806439920257e-05, + "loss": 2.6964, + "step": 17770 + }, + { + "epoch": 1.6100201580938145, + "grad_norm": 0.9708667397499084, + "learning_rate": 9.267202319821181e-05, + "loss": 2.795, + "step": 17771 + }, + { + "epoch": 1.610110756268261, + "grad_norm": 0.9050554633140564, + "learning_rate": 9.266598199722106e-05, + "loss": 2.6668, + "step": 17772 + }, + { + "epoch": 1.610201354442708, + "grad_norm": 0.8997369408607483, + "learning_rate": 9.265994079623029e-05, + "loss": 2.5485, + "step": 17773 + }, + { + "epoch": 1.6102919526171546, + "grad_norm": 0.9511111378669739, + "learning_rate": 9.265389959523954e-05, + "loss": 2.7006, + "step": 17774 + }, + { + "epoch": 1.6103825507916016, + "grad_norm": 0.7407146692276001, + "learning_rate": 9.264785839424877e-05, + "loss": 1.9373, + "step": 17775 + }, + { + "epoch": 1.6104731489660482, + "grad_norm": 0.9607962369918823, + "learning_rate": 9.264181719325803e-05, + "loss": 3.139, + "step": 17776 + }, + { + "epoch": 1.6105637471404952, + "grad_norm": 0.9322160482406616, + "learning_rate": 9.263577599226727e-05, + "loss": 2.6136, + "step": 17777 + }, + { + "epoch": 1.6106543453149418, + "grad_norm": 0.930112898349762, + "learning_rate": 9.262973479127651e-05, + "loss": 2.7771, + "step": 17778 + }, + { + "epoch": 1.6107449434893888, + "grad_norm": 0.9550116658210754, + "learning_rate": 9.262369359028575e-05, + "loss": 2.5879, + "step": 17779 + }, + { + "epoch": 1.6108355416638354, + "grad_norm": 0.914970874786377, + "learning_rate": 9.2617652389295e-05, + "loss": 2.7227, + "step": 17780 + }, + { + "epoch": 1.6109261398382824, + "grad_norm": 0.9343112111091614, + "learning_rate": 9.261161118830424e-05, + "loss": 2.8159, + "step": 17781 + }, + { + "epoch": 1.611016738012729, + "grad_norm": 0.914800763130188, + "learning_rate": 9.260556998731348e-05, + "loss": 2.6427, + "step": 17782 + }, + { + "epoch": 1.611107336187176, + "grad_norm": 1.0022737979888916, + "learning_rate": 9.259952878632272e-05, + "loss": 1.9218, + "step": 17783 + }, + { + "epoch": 1.6111979343616225, + "grad_norm": 0.9253761768341064, + "learning_rate": 9.259348758533197e-05, + "loss": 2.8253, + "step": 17784 + }, + { + "epoch": 1.6112885325360695, + "grad_norm": 1.0063989162445068, + "learning_rate": 9.258744638434122e-05, + "loss": 2.7006, + "step": 17785 + }, + { + "epoch": 1.611379130710516, + "grad_norm": 0.9932718276977539, + "learning_rate": 9.258140518335045e-05, + "loss": 3.0987, + "step": 17786 + }, + { + "epoch": 1.611469728884963, + "grad_norm": 0.8805585503578186, + "learning_rate": 9.25753639823597e-05, + "loss": 2.7904, + "step": 17787 + }, + { + "epoch": 1.6115603270594097, + "grad_norm": 0.9063155651092529, + "learning_rate": 9.256932278136894e-05, + "loss": 2.4812, + "step": 17788 + }, + { + "epoch": 1.6116509252338567, + "grad_norm": 0.8806196451187134, + "learning_rate": 9.256328158037818e-05, + "loss": 2.6345, + "step": 17789 + }, + { + "epoch": 1.6117415234083032, + "grad_norm": 0.934314489364624, + "learning_rate": 9.255724037938742e-05, + "loss": 2.8606, + "step": 17790 + }, + { + "epoch": 1.6118321215827502, + "grad_norm": 0.8904823064804077, + "learning_rate": 9.255119917839668e-05, + "loss": 2.729, + "step": 17791 + }, + { + "epoch": 1.6119227197571968, + "grad_norm": 0.9073952436447144, + "learning_rate": 9.254515797740591e-05, + "loss": 2.7156, + "step": 17792 + }, + { + "epoch": 1.6120133179316438, + "grad_norm": 0.9094709753990173, + "learning_rate": 9.253911677641516e-05, + "loss": 2.8216, + "step": 17793 + }, + { + "epoch": 1.6121039161060904, + "grad_norm": 0.9557388424873352, + "learning_rate": 9.25330755754244e-05, + "loss": 2.7286, + "step": 17794 + }, + { + "epoch": 1.6121945142805374, + "grad_norm": 0.9408113360404968, + "learning_rate": 9.252703437443364e-05, + "loss": 2.8117, + "step": 17795 + }, + { + "epoch": 1.612285112454984, + "grad_norm": 0.9088544249534607, + "learning_rate": 9.252099317344288e-05, + "loss": 2.501, + "step": 17796 + }, + { + "epoch": 1.612375710629431, + "grad_norm": 0.8851742744445801, + "learning_rate": 9.251495197245212e-05, + "loss": 2.4659, + "step": 17797 + }, + { + "epoch": 1.6124663088038775, + "grad_norm": 0.8768048286437988, + "learning_rate": 9.250891077146137e-05, + "loss": 2.8221, + "step": 17798 + }, + { + "epoch": 1.6125569069783245, + "grad_norm": 0.9749795198440552, + "learning_rate": 9.250286957047062e-05, + "loss": 2.9603, + "step": 17799 + }, + { + "epoch": 1.612647505152771, + "grad_norm": 0.9019211530685425, + "learning_rate": 9.249682836947987e-05, + "loss": 2.7507, + "step": 17800 + }, + { + "epoch": 1.6127381033272181, + "grad_norm": 0.8787296414375305, + "learning_rate": 9.24907871684891e-05, + "loss": 2.6463, + "step": 17801 + }, + { + "epoch": 1.6128287015016647, + "grad_norm": 0.9428444504737854, + "learning_rate": 9.248474596749835e-05, + "loss": 2.7201, + "step": 17802 + }, + { + "epoch": 1.6129192996761115, + "grad_norm": 0.7567700147628784, + "learning_rate": 9.247870476650758e-05, + "loss": 1.9999, + "step": 17803 + }, + { + "epoch": 1.6130098978505583, + "grad_norm": 0.9229050278663635, + "learning_rate": 9.247266356551683e-05, + "loss": 2.5764, + "step": 17804 + }, + { + "epoch": 1.613100496025005, + "grad_norm": 0.9447479844093323, + "learning_rate": 9.246662236452606e-05, + "loss": 2.6667, + "step": 17805 + }, + { + "epoch": 1.6131910941994518, + "grad_norm": 0.886108934879303, + "learning_rate": 9.246058116353532e-05, + "loss": 2.6067, + "step": 17806 + }, + { + "epoch": 1.6132816923738986, + "grad_norm": 0.941328227519989, + "learning_rate": 9.245453996254456e-05, + "loss": 3.0753, + "step": 17807 + }, + { + "epoch": 1.6133722905483454, + "grad_norm": 0.9510447382926941, + "learning_rate": 9.24484987615538e-05, + "loss": 2.7475, + "step": 17808 + }, + { + "epoch": 1.6134628887227922, + "grad_norm": 0.8634048104286194, + "learning_rate": 9.244245756056304e-05, + "loss": 2.8557, + "step": 17809 + }, + { + "epoch": 1.613553486897239, + "grad_norm": 0.8560994267463684, + "learning_rate": 9.243641635957229e-05, + "loss": 1.8904, + "step": 17810 + }, + { + "epoch": 1.6136440850716858, + "grad_norm": 0.8698007464408875, + "learning_rate": 9.243037515858152e-05, + "loss": 2.589, + "step": 17811 + }, + { + "epoch": 1.6137346832461326, + "grad_norm": 1.0040816068649292, + "learning_rate": 9.242433395759077e-05, + "loss": 2.8068, + "step": 17812 + }, + { + "epoch": 1.6138252814205793, + "grad_norm": 0.8661791682243347, + "learning_rate": 9.241829275660002e-05, + "loss": 2.5675, + "step": 17813 + }, + { + "epoch": 1.6139158795950261, + "grad_norm": 0.8483428955078125, + "learning_rate": 9.241225155560926e-05, + "loss": 2.3069, + "step": 17814 + }, + { + "epoch": 1.614006477769473, + "grad_norm": 0.9035799503326416, + "learning_rate": 9.24062103546185e-05, + "loss": 2.4701, + "step": 17815 + }, + { + "epoch": 1.6140970759439197, + "grad_norm": 0.9448902010917664, + "learning_rate": 9.240016915362775e-05, + "loss": 2.7567, + "step": 17816 + }, + { + "epoch": 1.6141876741183665, + "grad_norm": 0.8628485202789307, + "learning_rate": 9.2394127952637e-05, + "loss": 2.646, + "step": 17817 + }, + { + "epoch": 1.6142782722928133, + "grad_norm": 0.971628725528717, + "learning_rate": 9.238808675164623e-05, + "loss": 2.9203, + "step": 17818 + }, + { + "epoch": 1.61436887046726, + "grad_norm": 0.8909587860107422, + "learning_rate": 9.238204555065548e-05, + "loss": 2.6556, + "step": 17819 + }, + { + "epoch": 1.6144594686417069, + "grad_norm": 0.8849179148674011, + "learning_rate": 9.237600434966471e-05, + "loss": 3.0675, + "step": 17820 + }, + { + "epoch": 1.6145500668161537, + "grad_norm": 0.9482736587524414, + "learning_rate": 9.236996314867397e-05, + "loss": 2.9998, + "step": 17821 + }, + { + "epoch": 1.6146406649906004, + "grad_norm": 0.9137590527534485, + "learning_rate": 9.23639219476832e-05, + "loss": 2.851, + "step": 17822 + }, + { + "epoch": 1.6147312631650472, + "grad_norm": 0.8712631464004517, + "learning_rate": 9.235788074669245e-05, + "loss": 2.6817, + "step": 17823 + }, + { + "epoch": 1.614821861339494, + "grad_norm": 0.8552423715591431, + "learning_rate": 9.235183954570169e-05, + "loss": 2.7504, + "step": 17824 + }, + { + "epoch": 1.6149124595139408, + "grad_norm": 0.9077909588813782, + "learning_rate": 9.234579834471093e-05, + "loss": 2.6947, + "step": 17825 + }, + { + "epoch": 1.6150030576883876, + "grad_norm": 0.9631067514419556, + "learning_rate": 9.233975714372017e-05, + "loss": 2.4836, + "step": 17826 + }, + { + "epoch": 1.6150936558628344, + "grad_norm": 0.9083793759346008, + "learning_rate": 9.233371594272942e-05, + "loss": 2.94, + "step": 17827 + }, + { + "epoch": 1.6151842540372812, + "grad_norm": 0.8884212374687195, + "learning_rate": 9.232767474173865e-05, + "loss": 2.941, + "step": 17828 + }, + { + "epoch": 1.615274852211728, + "grad_norm": 0.8500050902366638, + "learning_rate": 9.232163354074791e-05, + "loss": 2.5886, + "step": 17829 + }, + { + "epoch": 1.6153654503861747, + "grad_norm": 1.0030899047851562, + "learning_rate": 9.231559233975715e-05, + "loss": 2.7471, + "step": 17830 + }, + { + "epoch": 1.6154560485606215, + "grad_norm": 0.8491325974464417, + "learning_rate": 9.230955113876639e-05, + "loss": 2.5859, + "step": 17831 + }, + { + "epoch": 1.6155466467350683, + "grad_norm": 0.8801879286766052, + "learning_rate": 9.230350993777564e-05, + "loss": 2.75, + "step": 17832 + }, + { + "epoch": 1.615637244909515, + "grad_norm": 0.8471767902374268, + "learning_rate": 9.229746873678487e-05, + "loss": 2.6718, + "step": 17833 + }, + { + "epoch": 1.615727843083962, + "grad_norm": 0.8523404002189636, + "learning_rate": 9.229142753579412e-05, + "loss": 2.7064, + "step": 17834 + }, + { + "epoch": 1.6158184412584087, + "grad_norm": 0.867784857749939, + "learning_rate": 9.228538633480336e-05, + "loss": 2.7629, + "step": 17835 + }, + { + "epoch": 1.6159090394328555, + "grad_norm": 0.9091178774833679, + "learning_rate": 9.227934513381262e-05, + "loss": 3.0169, + "step": 17836 + }, + { + "epoch": 1.6159996376073023, + "grad_norm": 0.9541370272636414, + "learning_rate": 9.227330393282185e-05, + "loss": 2.6742, + "step": 17837 + }, + { + "epoch": 1.616090235781749, + "grad_norm": 0.7124577760696411, + "learning_rate": 9.22672627318311e-05, + "loss": 1.7785, + "step": 17838 + }, + { + "epoch": 1.6161808339561958, + "grad_norm": 0.9641091227531433, + "learning_rate": 9.226122153084033e-05, + "loss": 2.5977, + "step": 17839 + }, + { + "epoch": 1.6162714321306426, + "grad_norm": 0.9368218779563904, + "learning_rate": 9.225518032984958e-05, + "loss": 2.9091, + "step": 17840 + }, + { + "epoch": 1.6163620303050894, + "grad_norm": 0.9289238452911377, + "learning_rate": 9.224913912885881e-05, + "loss": 2.9963, + "step": 17841 + }, + { + "epoch": 1.6164526284795362, + "grad_norm": 0.9405772089958191, + "learning_rate": 9.224309792786806e-05, + "loss": 2.7339, + "step": 17842 + }, + { + "epoch": 1.6165432266539828, + "grad_norm": 0.9279703497886658, + "learning_rate": 9.22370567268773e-05, + "loss": 2.5485, + "step": 17843 + }, + { + "epoch": 1.6166338248284298, + "grad_norm": 0.9525802731513977, + "learning_rate": 9.223101552588656e-05, + "loss": 2.8422, + "step": 17844 + }, + { + "epoch": 1.6167244230028763, + "grad_norm": 0.8854608535766602, + "learning_rate": 9.222497432489579e-05, + "loss": 2.7478, + "step": 17845 + }, + { + "epoch": 1.6168150211773233, + "grad_norm": 0.9558382034301758, + "learning_rate": 9.221893312390504e-05, + "loss": 2.594, + "step": 17846 + }, + { + "epoch": 1.61690561935177, + "grad_norm": 0.8960711359977722, + "learning_rate": 9.221289192291427e-05, + "loss": 2.6741, + "step": 17847 + }, + { + "epoch": 1.616996217526217, + "grad_norm": 0.8581106662750244, + "learning_rate": 9.220685072192352e-05, + "loss": 2.7322, + "step": 17848 + }, + { + "epoch": 1.6170868157006635, + "grad_norm": 0.8659132122993469, + "learning_rate": 9.220080952093277e-05, + "loss": 2.5691, + "step": 17849 + }, + { + "epoch": 1.6171774138751105, + "grad_norm": 0.8845334649085999, + "learning_rate": 9.2194768319942e-05, + "loss": 2.7132, + "step": 17850 + }, + { + "epoch": 1.617268012049557, + "grad_norm": 0.8921311497688293, + "learning_rate": 9.218872711895125e-05, + "loss": 2.5387, + "step": 17851 + }, + { + "epoch": 1.617358610224004, + "grad_norm": 0.9448198676109314, + "learning_rate": 9.21826859179605e-05, + "loss": 2.7764, + "step": 17852 + }, + { + "epoch": 1.6174492083984506, + "grad_norm": 1.0117138624191284, + "learning_rate": 9.217664471696975e-05, + "loss": 2.9001, + "step": 17853 + }, + { + "epoch": 1.6175398065728976, + "grad_norm": 0.9494531154632568, + "learning_rate": 9.217060351597898e-05, + "loss": 2.5374, + "step": 17854 + }, + { + "epoch": 1.6176304047473442, + "grad_norm": 1.0113885402679443, + "learning_rate": 9.216456231498823e-05, + "loss": 2.8979, + "step": 17855 + }, + { + "epoch": 1.6177210029217912, + "grad_norm": 0.9607422351837158, + "learning_rate": 9.215852111399746e-05, + "loss": 2.8824, + "step": 17856 + }, + { + "epoch": 1.6178116010962378, + "grad_norm": 0.8406885862350464, + "learning_rate": 9.215247991300671e-05, + "loss": 2.616, + "step": 17857 + }, + { + "epoch": 1.6179021992706848, + "grad_norm": 0.9119359254837036, + "learning_rate": 9.214643871201594e-05, + "loss": 2.5632, + "step": 17858 + }, + { + "epoch": 1.6179927974451314, + "grad_norm": 0.8471242785453796, + "learning_rate": 9.21403975110252e-05, + "loss": 2.6414, + "step": 17859 + }, + { + "epoch": 1.6180833956195784, + "grad_norm": 0.8876606822013855, + "learning_rate": 9.213435631003444e-05, + "loss": 2.7657, + "step": 17860 + }, + { + "epoch": 1.618173993794025, + "grad_norm": 0.8593460321426392, + "learning_rate": 9.212831510904369e-05, + "loss": 2.6031, + "step": 17861 + }, + { + "epoch": 1.618264591968472, + "grad_norm": 0.9781151413917542, + "learning_rate": 9.212227390805292e-05, + "loss": 2.4651, + "step": 17862 + }, + { + "epoch": 1.6183551901429185, + "grad_norm": 0.9947316646575928, + "learning_rate": 9.211623270706217e-05, + "loss": 2.9407, + "step": 17863 + }, + { + "epoch": 1.6184457883173655, + "grad_norm": 0.7562419772148132, + "learning_rate": 9.211019150607141e-05, + "loss": 2.2578, + "step": 17864 + }, + { + "epoch": 1.618536386491812, + "grad_norm": 0.9209616780281067, + "learning_rate": 9.210415030508065e-05, + "loss": 2.7396, + "step": 17865 + }, + { + "epoch": 1.618626984666259, + "grad_norm": 0.7717037796974182, + "learning_rate": 9.20981091040899e-05, + "loss": 2.0662, + "step": 17866 + }, + { + "epoch": 1.6187175828407057, + "grad_norm": 0.8745123147964478, + "learning_rate": 9.209206790309914e-05, + "loss": 2.6166, + "step": 17867 + }, + { + "epoch": 1.6188081810151527, + "grad_norm": 0.9382268190383911, + "learning_rate": 9.208602670210839e-05, + "loss": 2.5611, + "step": 17868 + }, + { + "epoch": 1.6188987791895992, + "grad_norm": 0.8549655079841614, + "learning_rate": 9.207998550111763e-05, + "loss": 2.6527, + "step": 17869 + }, + { + "epoch": 1.6189893773640462, + "grad_norm": 0.7391670346260071, + "learning_rate": 9.207394430012687e-05, + "loss": 1.9986, + "step": 17870 + }, + { + "epoch": 1.6190799755384928, + "grad_norm": 0.9138028025627136, + "learning_rate": 9.206790309913611e-05, + "loss": 2.6428, + "step": 17871 + }, + { + "epoch": 1.6191705737129398, + "grad_norm": 0.8774232268333435, + "learning_rate": 9.206186189814536e-05, + "loss": 2.8812, + "step": 17872 + }, + { + "epoch": 1.6192611718873864, + "grad_norm": 0.8964352011680603, + "learning_rate": 9.205582069715459e-05, + "loss": 2.8794, + "step": 17873 + }, + { + "epoch": 1.6193517700618334, + "grad_norm": 0.9009150862693787, + "learning_rate": 9.204977949616385e-05, + "loss": 2.7051, + "step": 17874 + }, + { + "epoch": 1.61944236823628, + "grad_norm": 0.8760058879852295, + "learning_rate": 9.204373829517308e-05, + "loss": 2.8066, + "step": 17875 + }, + { + "epoch": 1.619532966410727, + "grad_norm": 1.019152283668518, + "learning_rate": 9.203769709418233e-05, + "loss": 2.6159, + "step": 17876 + }, + { + "epoch": 1.6196235645851735, + "grad_norm": 0.8930864334106445, + "learning_rate": 9.203165589319157e-05, + "loss": 2.6777, + "step": 17877 + }, + { + "epoch": 1.6197141627596205, + "grad_norm": 0.8783840537071228, + "learning_rate": 9.202561469220081e-05, + "loss": 2.8686, + "step": 17878 + }, + { + "epoch": 1.6198047609340671, + "grad_norm": 0.8807550668716431, + "learning_rate": 9.201957349121005e-05, + "loss": 2.971, + "step": 17879 + }, + { + "epoch": 1.6198953591085141, + "grad_norm": 0.8931401968002319, + "learning_rate": 9.20135322902193e-05, + "loss": 2.511, + "step": 17880 + }, + { + "epoch": 1.6199859572829607, + "grad_norm": 1.0580618381500244, + "learning_rate": 9.200749108922854e-05, + "loss": 2.7576, + "step": 17881 + }, + { + "epoch": 1.6200765554574077, + "grad_norm": 0.9281947016716003, + "learning_rate": 9.200144988823779e-05, + "loss": 2.8091, + "step": 17882 + }, + { + "epoch": 1.6201671536318543, + "grad_norm": 0.8747975826263428, + "learning_rate": 9.199540868724702e-05, + "loss": 2.5399, + "step": 17883 + }, + { + "epoch": 1.620257751806301, + "grad_norm": 0.8797442317008972, + "learning_rate": 9.198936748625627e-05, + "loss": 2.8134, + "step": 17884 + }, + { + "epoch": 1.6203483499807478, + "grad_norm": 0.8502287268638611, + "learning_rate": 9.198332628526552e-05, + "loss": 2.226, + "step": 17885 + }, + { + "epoch": 1.6204389481551946, + "grad_norm": 0.8491566181182861, + "learning_rate": 9.197728508427475e-05, + "loss": 2.5007, + "step": 17886 + }, + { + "epoch": 1.6205295463296414, + "grad_norm": 0.8435484170913696, + "learning_rate": 9.1971243883284e-05, + "loss": 2.2354, + "step": 17887 + }, + { + "epoch": 1.6206201445040882, + "grad_norm": 0.8732118010520935, + "learning_rate": 9.196520268229324e-05, + "loss": 2.6953, + "step": 17888 + }, + { + "epoch": 1.620710742678535, + "grad_norm": 0.9197609424591064, + "learning_rate": 9.19591614813025e-05, + "loss": 2.6443, + "step": 17889 + }, + { + "epoch": 1.6208013408529818, + "grad_norm": 0.9486446976661682, + "learning_rate": 9.195312028031173e-05, + "loss": 2.7639, + "step": 17890 + }, + { + "epoch": 1.6208919390274286, + "grad_norm": 0.8964388370513916, + "learning_rate": 9.194707907932098e-05, + "loss": 2.783, + "step": 17891 + }, + { + "epoch": 1.6209825372018754, + "grad_norm": 0.8403635025024414, + "learning_rate": 9.194103787833021e-05, + "loss": 2.642, + "step": 17892 + }, + { + "epoch": 1.6210731353763221, + "grad_norm": 0.8607802987098694, + "learning_rate": 9.193499667733946e-05, + "loss": 2.688, + "step": 17893 + }, + { + "epoch": 1.621163733550769, + "grad_norm": 0.8544458746910095, + "learning_rate": 9.19289554763487e-05, + "loss": 2.6091, + "step": 17894 + }, + { + "epoch": 1.6212543317252157, + "grad_norm": 0.9012106657028198, + "learning_rate": 9.192291427535794e-05, + "loss": 2.6934, + "step": 17895 + }, + { + "epoch": 1.6213449298996625, + "grad_norm": 0.8564023971557617, + "learning_rate": 9.191687307436719e-05, + "loss": 2.7319, + "step": 17896 + }, + { + "epoch": 1.6214355280741093, + "grad_norm": 0.9496495127677917, + "learning_rate": 9.191083187337644e-05, + "loss": 2.7484, + "step": 17897 + }, + { + "epoch": 1.621526126248556, + "grad_norm": 0.9083473682403564, + "learning_rate": 9.190479067238567e-05, + "loss": 2.8019, + "step": 17898 + }, + { + "epoch": 1.6216167244230029, + "grad_norm": 0.9177120327949524, + "learning_rate": 9.189874947139492e-05, + "loss": 2.8633, + "step": 17899 + }, + { + "epoch": 1.6217073225974497, + "grad_norm": 0.9187126755714417, + "learning_rate": 9.189270827040417e-05, + "loss": 2.7852, + "step": 17900 + }, + { + "epoch": 1.6217979207718964, + "grad_norm": 0.8756958246231079, + "learning_rate": 9.18866670694134e-05, + "loss": 2.7464, + "step": 17901 + }, + { + "epoch": 1.6218885189463432, + "grad_norm": 0.8386582732200623, + "learning_rate": 9.188062586842265e-05, + "loss": 2.4946, + "step": 17902 + }, + { + "epoch": 1.62197911712079, + "grad_norm": 0.8825865983963013, + "learning_rate": 9.187458466743188e-05, + "loss": 2.6474, + "step": 17903 + }, + { + "epoch": 1.6220697152952368, + "grad_norm": 0.8741570711135864, + "learning_rate": 9.186854346644114e-05, + "loss": 2.6731, + "step": 17904 + }, + { + "epoch": 1.6221603134696836, + "grad_norm": 0.8076184391975403, + "learning_rate": 9.186250226545038e-05, + "loss": 1.9484, + "step": 17905 + }, + { + "epoch": 1.6222509116441304, + "grad_norm": 0.8996043801307678, + "learning_rate": 9.185646106445962e-05, + "loss": 2.5703, + "step": 17906 + }, + { + "epoch": 1.6223415098185772, + "grad_norm": 0.9960467219352722, + "learning_rate": 9.185041986346886e-05, + "loss": 3.0436, + "step": 17907 + }, + { + "epoch": 1.622432107993024, + "grad_norm": 0.973169207572937, + "learning_rate": 9.18443786624781e-05, + "loss": 2.2101, + "step": 17908 + }, + { + "epoch": 1.6225227061674707, + "grad_norm": 0.9444350004196167, + "learning_rate": 9.183833746148734e-05, + "loss": 2.9482, + "step": 17909 + }, + { + "epoch": 1.6226133043419175, + "grad_norm": 0.8605048060417175, + "learning_rate": 9.183229626049659e-05, + "loss": 2.3612, + "step": 17910 + }, + { + "epoch": 1.6227039025163643, + "grad_norm": 0.6817775368690491, + "learning_rate": 9.182625505950584e-05, + "loss": 1.4781, + "step": 17911 + }, + { + "epoch": 1.622794500690811, + "grad_norm": 0.92464679479599, + "learning_rate": 9.182021385851508e-05, + "loss": 2.7841, + "step": 17912 + }, + { + "epoch": 1.622885098865258, + "grad_norm": 0.8895735144615173, + "learning_rate": 9.181417265752432e-05, + "loss": 2.6925, + "step": 17913 + }, + { + "epoch": 1.6229756970397047, + "grad_norm": 0.7221099138259888, + "learning_rate": 9.180813145653356e-05, + "loss": 1.9271, + "step": 17914 + }, + { + "epoch": 1.6230662952141515, + "grad_norm": 0.9108904600143433, + "learning_rate": 9.18020902555428e-05, + "loss": 2.7824, + "step": 17915 + }, + { + "epoch": 1.6231568933885983, + "grad_norm": 0.8973182439804077, + "learning_rate": 9.179604905455205e-05, + "loss": 2.9662, + "step": 17916 + }, + { + "epoch": 1.623247491563045, + "grad_norm": 0.7712409496307373, + "learning_rate": 9.17900078535613e-05, + "loss": 2.1593, + "step": 17917 + }, + { + "epoch": 1.6233380897374918, + "grad_norm": 0.9384790658950806, + "learning_rate": 9.178396665257053e-05, + "loss": 2.6713, + "step": 17918 + }, + { + "epoch": 1.6234286879119386, + "grad_norm": 0.8989589214324951, + "learning_rate": 9.177792545157979e-05, + "loss": 2.4911, + "step": 17919 + }, + { + "epoch": 1.6235192860863854, + "grad_norm": 0.8720753192901611, + "learning_rate": 9.177188425058902e-05, + "loss": 2.5423, + "step": 17920 + }, + { + "epoch": 1.6236098842608322, + "grad_norm": 0.7761460542678833, + "learning_rate": 9.176584304959827e-05, + "loss": 2.1213, + "step": 17921 + }, + { + "epoch": 1.623700482435279, + "grad_norm": 0.8921520709991455, + "learning_rate": 9.17598018486075e-05, + "loss": 2.6397, + "step": 17922 + }, + { + "epoch": 1.6237910806097258, + "grad_norm": 1.0099002122879028, + "learning_rate": 9.175376064761675e-05, + "loss": 2.6257, + "step": 17923 + }, + { + "epoch": 1.6238816787841723, + "grad_norm": 0.8502610325813293, + "learning_rate": 9.174771944662599e-05, + "loss": 2.5943, + "step": 17924 + }, + { + "epoch": 1.6239722769586193, + "grad_norm": 0.9332104325294495, + "learning_rate": 9.174167824563523e-05, + "loss": 3.0556, + "step": 17925 + }, + { + "epoch": 1.624062875133066, + "grad_norm": 0.9680899977684021, + "learning_rate": 9.173563704464448e-05, + "loss": 2.5777, + "step": 17926 + }, + { + "epoch": 1.624153473307513, + "grad_norm": 0.9528477787971497, + "learning_rate": 9.172959584365373e-05, + "loss": 2.8661, + "step": 17927 + }, + { + "epoch": 1.6242440714819595, + "grad_norm": 0.9391209483146667, + "learning_rate": 9.172355464266296e-05, + "loss": 2.8568, + "step": 17928 + }, + { + "epoch": 1.6243346696564065, + "grad_norm": 0.9905222058296204, + "learning_rate": 9.171751344167221e-05, + "loss": 2.6197, + "step": 17929 + }, + { + "epoch": 1.624425267830853, + "grad_norm": 1.0202468633651733, + "learning_rate": 9.171147224068145e-05, + "loss": 2.7303, + "step": 17930 + }, + { + "epoch": 1.6245158660053, + "grad_norm": 0.9412490129470825, + "learning_rate": 9.170543103969069e-05, + "loss": 2.6638, + "step": 17931 + }, + { + "epoch": 1.6246064641797466, + "grad_norm": 0.8977568745613098, + "learning_rate": 9.169938983869994e-05, + "loss": 2.9618, + "step": 17932 + }, + { + "epoch": 1.6246970623541936, + "grad_norm": 0.9005883932113647, + "learning_rate": 9.169334863770917e-05, + "loss": 2.5084, + "step": 17933 + }, + { + "epoch": 1.6247876605286402, + "grad_norm": 0.9554957151412964, + "learning_rate": 9.168730743671842e-05, + "loss": 2.6098, + "step": 17934 + }, + { + "epoch": 1.6248782587030872, + "grad_norm": 0.9470647573471069, + "learning_rate": 9.168126623572767e-05, + "loss": 2.6587, + "step": 17935 + }, + { + "epoch": 1.6249688568775338, + "grad_norm": 0.8995162844657898, + "learning_rate": 9.167522503473692e-05, + "loss": 2.8269, + "step": 17936 + }, + { + "epoch": 1.6250594550519808, + "grad_norm": 0.9165927767753601, + "learning_rate": 9.166918383374615e-05, + "loss": 2.846, + "step": 17937 + }, + { + "epoch": 1.6251500532264274, + "grad_norm": 0.8599014282226562, + "learning_rate": 9.16631426327554e-05, + "loss": 2.8606, + "step": 17938 + }, + { + "epoch": 1.6252406514008744, + "grad_norm": 0.8761343359947205, + "learning_rate": 9.165710143176463e-05, + "loss": 2.6873, + "step": 17939 + }, + { + "epoch": 1.625331249575321, + "grad_norm": 0.8472109436988831, + "learning_rate": 9.165106023077388e-05, + "loss": 2.7713, + "step": 17940 + }, + { + "epoch": 1.625421847749768, + "grad_norm": 0.7653132677078247, + "learning_rate": 9.164501902978313e-05, + "loss": 1.9718, + "step": 17941 + }, + { + "epoch": 1.6255124459242145, + "grad_norm": 0.8659104108810425, + "learning_rate": 9.163897782879238e-05, + "loss": 1.9057, + "step": 17942 + }, + { + "epoch": 1.6256030440986615, + "grad_norm": 0.8786661028862, + "learning_rate": 9.163293662780161e-05, + "loss": 2.3955, + "step": 17943 + }, + { + "epoch": 1.625693642273108, + "grad_norm": 0.8745163679122925, + "learning_rate": 9.162689542681086e-05, + "loss": 2.8185, + "step": 17944 + }, + { + "epoch": 1.625784240447555, + "grad_norm": 0.9365387558937073, + "learning_rate": 9.162085422582009e-05, + "loss": 2.6304, + "step": 17945 + }, + { + "epoch": 1.6258748386220017, + "grad_norm": 0.8925528526306152, + "learning_rate": 9.161481302482934e-05, + "loss": 2.7969, + "step": 17946 + }, + { + "epoch": 1.6259654367964487, + "grad_norm": 0.8990510702133179, + "learning_rate": 9.160877182383857e-05, + "loss": 2.5865, + "step": 17947 + }, + { + "epoch": 1.6260560349708952, + "grad_norm": 1.0263116359710693, + "learning_rate": 9.160273062284782e-05, + "loss": 2.7848, + "step": 17948 + }, + { + "epoch": 1.6261466331453422, + "grad_norm": 0.8772395253181458, + "learning_rate": 9.159668942185707e-05, + "loss": 2.6529, + "step": 17949 + }, + { + "epoch": 1.6262372313197888, + "grad_norm": 0.8979916572570801, + "learning_rate": 9.159064822086632e-05, + "loss": 2.6622, + "step": 17950 + }, + { + "epoch": 1.6263278294942358, + "grad_norm": 0.8812878131866455, + "learning_rate": 9.158460701987556e-05, + "loss": 2.569, + "step": 17951 + }, + { + "epoch": 1.6264184276686824, + "grad_norm": 0.7525177597999573, + "learning_rate": 9.15785658188848e-05, + "loss": 2.1274, + "step": 17952 + }, + { + "epoch": 1.6265090258431294, + "grad_norm": 0.897952139377594, + "learning_rate": 9.157252461789405e-05, + "loss": 2.6538, + "step": 17953 + }, + { + "epoch": 1.626599624017576, + "grad_norm": 0.9425663948059082, + "learning_rate": 9.156648341690328e-05, + "loss": 2.5857, + "step": 17954 + }, + { + "epoch": 1.626690222192023, + "grad_norm": 0.8673484921455383, + "learning_rate": 9.156044221591253e-05, + "loss": 2.8994, + "step": 17955 + }, + { + "epoch": 1.6267808203664695, + "grad_norm": 0.9165211319923401, + "learning_rate": 9.155440101492177e-05, + "loss": 2.6509, + "step": 17956 + }, + { + "epoch": 1.6268714185409165, + "grad_norm": 0.904985249042511, + "learning_rate": 9.154835981393102e-05, + "loss": 2.8336, + "step": 17957 + }, + { + "epoch": 1.6269620167153631, + "grad_norm": 0.9264861941337585, + "learning_rate": 9.154231861294026e-05, + "loss": 2.6527, + "step": 17958 + }, + { + "epoch": 1.6270526148898101, + "grad_norm": 0.9073176980018616, + "learning_rate": 9.15362774119495e-05, + "loss": 2.7609, + "step": 17959 + }, + { + "epoch": 1.6271432130642567, + "grad_norm": 0.8808057904243469, + "learning_rate": 9.153023621095874e-05, + "loss": 2.4476, + "step": 17960 + }, + { + "epoch": 1.6272338112387037, + "grad_norm": 0.8644388318061829, + "learning_rate": 9.152419500996799e-05, + "loss": 2.676, + "step": 17961 + }, + { + "epoch": 1.6273244094131503, + "grad_norm": 0.905716061592102, + "learning_rate": 9.151815380897722e-05, + "loss": 2.8273, + "step": 17962 + }, + { + "epoch": 1.6274150075875973, + "grad_norm": 0.8995169997215271, + "learning_rate": 9.151211260798647e-05, + "loss": 2.8522, + "step": 17963 + }, + { + "epoch": 1.6275056057620438, + "grad_norm": 0.837213397026062, + "learning_rate": 9.150607140699571e-05, + "loss": 2.5622, + "step": 17964 + }, + { + "epoch": 1.6275962039364906, + "grad_norm": 0.9011421203613281, + "learning_rate": 9.150003020600496e-05, + "loss": 2.7793, + "step": 17965 + }, + { + "epoch": 1.6276868021109374, + "grad_norm": 0.9207816123962402, + "learning_rate": 9.14939890050142e-05, + "loss": 2.5404, + "step": 17966 + }, + { + "epoch": 1.6277774002853842, + "grad_norm": 0.870858371257782, + "learning_rate": 9.148794780402344e-05, + "loss": 2.6919, + "step": 17967 + }, + { + "epoch": 1.627867998459831, + "grad_norm": 0.8781540393829346, + "learning_rate": 9.148190660303269e-05, + "loss": 2.4301, + "step": 17968 + }, + { + "epoch": 1.6279585966342778, + "grad_norm": 0.9247928261756897, + "learning_rate": 9.147586540204193e-05, + "loss": 2.7108, + "step": 17969 + }, + { + "epoch": 1.6280491948087246, + "grad_norm": 0.913131058216095, + "learning_rate": 9.146982420105117e-05, + "loss": 2.6468, + "step": 17970 + }, + { + "epoch": 1.6281397929831714, + "grad_norm": 0.8836131691932678, + "learning_rate": 9.146378300006042e-05, + "loss": 2.9749, + "step": 17971 + }, + { + "epoch": 1.6282303911576181, + "grad_norm": 0.8834976553916931, + "learning_rate": 9.145774179906967e-05, + "loss": 2.654, + "step": 17972 + }, + { + "epoch": 1.628320989332065, + "grad_norm": 0.9324581623077393, + "learning_rate": 9.14517005980789e-05, + "loss": 2.8108, + "step": 17973 + }, + { + "epoch": 1.6284115875065117, + "grad_norm": 0.8681703209877014, + "learning_rate": 9.144565939708815e-05, + "loss": 2.6342, + "step": 17974 + }, + { + "epoch": 1.6285021856809585, + "grad_norm": 0.87032151222229, + "learning_rate": 9.143961819609738e-05, + "loss": 2.748, + "step": 17975 + }, + { + "epoch": 1.6285927838554053, + "grad_norm": 0.9328641891479492, + "learning_rate": 9.143357699510663e-05, + "loss": 2.5787, + "step": 17976 + }, + { + "epoch": 1.628683382029852, + "grad_norm": 0.890734076499939, + "learning_rate": 9.142753579411587e-05, + "loss": 2.694, + "step": 17977 + }, + { + "epoch": 1.6287739802042989, + "grad_norm": 0.8944907188415527, + "learning_rate": 9.142149459312511e-05, + "loss": 2.8083, + "step": 17978 + }, + { + "epoch": 1.6288645783787457, + "grad_norm": 0.927912712097168, + "learning_rate": 9.141545339213436e-05, + "loss": 2.8279, + "step": 17979 + }, + { + "epoch": 1.6289551765531924, + "grad_norm": 0.8837327361106873, + "learning_rate": 9.140941219114361e-05, + "loss": 2.6448, + "step": 17980 + }, + { + "epoch": 1.6290457747276392, + "grad_norm": 0.9231458902359009, + "learning_rate": 9.140337099015284e-05, + "loss": 2.8215, + "step": 17981 + }, + { + "epoch": 1.629136372902086, + "grad_norm": 0.9248692393302917, + "learning_rate": 9.139732978916209e-05, + "loss": 2.8085, + "step": 17982 + }, + { + "epoch": 1.6292269710765328, + "grad_norm": 0.8726727962493896, + "learning_rate": 9.139128858817132e-05, + "loss": 2.8549, + "step": 17983 + }, + { + "epoch": 1.6293175692509796, + "grad_norm": 0.9139258861541748, + "learning_rate": 9.138524738718057e-05, + "loss": 2.8772, + "step": 17984 + }, + { + "epoch": 1.6294081674254264, + "grad_norm": 0.8087066411972046, + "learning_rate": 9.137920618618982e-05, + "loss": 2.1179, + "step": 17985 + }, + { + "epoch": 1.6294987655998732, + "grad_norm": 0.8809995055198669, + "learning_rate": 9.137316498519907e-05, + "loss": 2.5119, + "step": 17986 + }, + { + "epoch": 1.62958936377432, + "grad_norm": 0.8719996809959412, + "learning_rate": 9.136712378420831e-05, + "loss": 2.1647, + "step": 17987 + }, + { + "epoch": 1.6296799619487667, + "grad_norm": 0.9265745878219604, + "learning_rate": 9.136108258321755e-05, + "loss": 2.7868, + "step": 17988 + }, + { + "epoch": 1.6297705601232135, + "grad_norm": 0.7731362581253052, + "learning_rate": 9.13550413822268e-05, + "loss": 1.9675, + "step": 17989 + }, + { + "epoch": 1.6298611582976603, + "grad_norm": 0.8866428732872009, + "learning_rate": 9.134900018123603e-05, + "loss": 2.6857, + "step": 17990 + }, + { + "epoch": 1.629951756472107, + "grad_norm": 0.9383274912834167, + "learning_rate": 9.134295898024528e-05, + "loss": 2.7106, + "step": 17991 + }, + { + "epoch": 1.630042354646554, + "grad_norm": 0.9251987338066101, + "learning_rate": 9.133691777925451e-05, + "loss": 2.6905, + "step": 17992 + }, + { + "epoch": 1.6301329528210007, + "grad_norm": 0.8683757781982422, + "learning_rate": 9.133087657826377e-05, + "loss": 2.4759, + "step": 17993 + }, + { + "epoch": 1.6302235509954475, + "grad_norm": 0.8383153676986694, + "learning_rate": 9.132483537727301e-05, + "loss": 2.7154, + "step": 17994 + }, + { + "epoch": 1.6303141491698943, + "grad_norm": 0.8627203106880188, + "learning_rate": 9.131879417628225e-05, + "loss": 2.5472, + "step": 17995 + }, + { + "epoch": 1.630404747344341, + "grad_norm": 0.903915286064148, + "learning_rate": 9.131275297529149e-05, + "loss": 2.7043, + "step": 17996 + }, + { + "epoch": 1.6304953455187878, + "grad_norm": 0.9372531175613403, + "learning_rate": 9.130671177430074e-05, + "loss": 2.6634, + "step": 17997 + }, + { + "epoch": 1.6305859436932346, + "grad_norm": 0.872825026512146, + "learning_rate": 9.130067057330997e-05, + "loss": 2.784, + "step": 17998 + }, + { + "epoch": 1.6306765418676814, + "grad_norm": 0.9247845411300659, + "learning_rate": 9.129462937231922e-05, + "loss": 2.8959, + "step": 17999 + }, + { + "epoch": 1.6307671400421282, + "grad_norm": 0.9811525344848633, + "learning_rate": 9.128858817132847e-05, + "loss": 2.6386, + "step": 18000 + }, + { + "epoch": 1.630857738216575, + "grad_norm": 0.9141604900360107, + "learning_rate": 9.128254697033771e-05, + "loss": 2.7126, + "step": 18001 + }, + { + "epoch": 1.6309483363910218, + "grad_norm": 0.8936553597450256, + "learning_rate": 9.127650576934695e-05, + "loss": 2.8971, + "step": 18002 + }, + { + "epoch": 1.6310389345654686, + "grad_norm": 0.8579995036125183, + "learning_rate": 9.12704645683562e-05, + "loss": 2.8254, + "step": 18003 + }, + { + "epoch": 1.6311295327399153, + "grad_norm": 0.8906399011611938, + "learning_rate": 9.126442336736544e-05, + "loss": 2.5451, + "step": 18004 + }, + { + "epoch": 1.631220130914362, + "grad_norm": 0.9309455752372742, + "learning_rate": 9.125838216637468e-05, + "loss": 2.8794, + "step": 18005 + }, + { + "epoch": 1.631310729088809, + "grad_norm": 0.9791542887687683, + "learning_rate": 9.125234096538392e-05, + "loss": 2.5071, + "step": 18006 + }, + { + "epoch": 1.6314013272632555, + "grad_norm": 0.9388565421104431, + "learning_rate": 9.124629976439316e-05, + "loss": 2.7622, + "step": 18007 + }, + { + "epoch": 1.6314919254377025, + "grad_norm": 0.8815624117851257, + "learning_rate": 9.124025856340242e-05, + "loss": 2.7757, + "step": 18008 + }, + { + "epoch": 1.631582523612149, + "grad_norm": 0.7954680323600769, + "learning_rate": 9.123421736241165e-05, + "loss": 1.9008, + "step": 18009 + }, + { + "epoch": 1.631673121786596, + "grad_norm": 0.9250298142433167, + "learning_rate": 9.12281761614209e-05, + "loss": 2.6472, + "step": 18010 + }, + { + "epoch": 1.6317637199610426, + "grad_norm": 0.9340556263923645, + "learning_rate": 9.122213496043014e-05, + "loss": 2.9428, + "step": 18011 + }, + { + "epoch": 1.6318543181354896, + "grad_norm": 0.87044358253479, + "learning_rate": 9.121609375943938e-05, + "loss": 2.5932, + "step": 18012 + }, + { + "epoch": 1.6319449163099362, + "grad_norm": 0.8842295408248901, + "learning_rate": 9.121005255844862e-05, + "loss": 2.4328, + "step": 18013 + }, + { + "epoch": 1.6320355144843832, + "grad_norm": 0.893206775188446, + "learning_rate": 9.120401135745786e-05, + "loss": 2.7729, + "step": 18014 + }, + { + "epoch": 1.6321261126588298, + "grad_norm": 0.8622577786445618, + "learning_rate": 9.11979701564671e-05, + "loss": 2.6769, + "step": 18015 + }, + { + "epoch": 1.6322167108332768, + "grad_norm": 0.8508538603782654, + "learning_rate": 9.119192895547636e-05, + "loss": 2.5242, + "step": 18016 + }, + { + "epoch": 1.6323073090077234, + "grad_norm": 0.8854832053184509, + "learning_rate": 9.11858877544856e-05, + "loss": 2.8611, + "step": 18017 + }, + { + "epoch": 1.6323979071821704, + "grad_norm": 0.7827892899513245, + "learning_rate": 9.117984655349484e-05, + "loss": 2.2104, + "step": 18018 + }, + { + "epoch": 1.632488505356617, + "grad_norm": 0.8910443782806396, + "learning_rate": 9.117380535250409e-05, + "loss": 2.8117, + "step": 18019 + }, + { + "epoch": 1.632579103531064, + "grad_norm": 0.8657221794128418, + "learning_rate": 9.116776415151332e-05, + "loss": 2.6054, + "step": 18020 + }, + { + "epoch": 1.6326697017055105, + "grad_norm": 0.9554665088653564, + "learning_rate": 9.116172295052257e-05, + "loss": 2.7476, + "step": 18021 + }, + { + "epoch": 1.6327602998799575, + "grad_norm": 0.847847044467926, + "learning_rate": 9.11556817495318e-05, + "loss": 2.5583, + "step": 18022 + }, + { + "epoch": 1.632850898054404, + "grad_norm": 0.8270926475524902, + "learning_rate": 9.114964054854107e-05, + "loss": 2.0332, + "step": 18023 + }, + { + "epoch": 1.632941496228851, + "grad_norm": 0.928801953792572, + "learning_rate": 9.11435993475503e-05, + "loss": 2.6057, + "step": 18024 + }, + { + "epoch": 1.6330320944032977, + "grad_norm": 0.9108812808990479, + "learning_rate": 9.113755814655955e-05, + "loss": 2.6133, + "step": 18025 + }, + { + "epoch": 1.6331226925777447, + "grad_norm": 0.8296540975570679, + "learning_rate": 9.113151694556878e-05, + "loss": 2.7659, + "step": 18026 + }, + { + "epoch": 1.6332132907521912, + "grad_norm": 0.8595731854438782, + "learning_rate": 9.112547574457803e-05, + "loss": 2.7176, + "step": 18027 + }, + { + "epoch": 1.6333038889266382, + "grad_norm": 0.9358341693878174, + "learning_rate": 9.111943454358726e-05, + "loss": 2.7796, + "step": 18028 + }, + { + "epoch": 1.6333944871010848, + "grad_norm": 0.8745033740997314, + "learning_rate": 9.111339334259651e-05, + "loss": 2.1477, + "step": 18029 + }, + { + "epoch": 1.6334850852755318, + "grad_norm": 0.9185699224472046, + "learning_rate": 9.110735214160574e-05, + "loss": 2.9667, + "step": 18030 + }, + { + "epoch": 1.6335756834499784, + "grad_norm": 0.9734324812889099, + "learning_rate": 9.1101310940615e-05, + "loss": 2.712, + "step": 18031 + }, + { + "epoch": 1.6336662816244254, + "grad_norm": 0.9019016027450562, + "learning_rate": 9.109526973962424e-05, + "loss": 2.7693, + "step": 18032 + }, + { + "epoch": 1.633756879798872, + "grad_norm": 0.8647888898849487, + "learning_rate": 9.108922853863349e-05, + "loss": 2.6977, + "step": 18033 + }, + { + "epoch": 1.633847477973319, + "grad_norm": 0.9097010493278503, + "learning_rate": 9.108318733764272e-05, + "loss": 2.6164, + "step": 18034 + }, + { + "epoch": 1.6339380761477655, + "grad_norm": 0.9360483288764954, + "learning_rate": 9.107714613665197e-05, + "loss": 2.8477, + "step": 18035 + }, + { + "epoch": 1.6340286743222125, + "grad_norm": 1.0078880786895752, + "learning_rate": 9.107110493566122e-05, + "loss": 2.5756, + "step": 18036 + }, + { + "epoch": 1.6341192724966591, + "grad_norm": 0.9322060346603394, + "learning_rate": 9.106506373467045e-05, + "loss": 2.8747, + "step": 18037 + }, + { + "epoch": 1.6342098706711061, + "grad_norm": 0.7704019546508789, + "learning_rate": 9.10590225336797e-05, + "loss": 2.0342, + "step": 18038 + }, + { + "epoch": 1.6343004688455527, + "grad_norm": 0.8957656025886536, + "learning_rate": 9.105298133268895e-05, + "loss": 2.4509, + "step": 18039 + }, + { + "epoch": 1.6343910670199997, + "grad_norm": 0.913641631603241, + "learning_rate": 9.10469401316982e-05, + "loss": 2.5933, + "step": 18040 + }, + { + "epoch": 1.6344816651944463, + "grad_norm": 0.8809475898742676, + "learning_rate": 9.104089893070743e-05, + "loss": 2.8034, + "step": 18041 + }, + { + "epoch": 1.6345722633688933, + "grad_norm": 0.8623461723327637, + "learning_rate": 9.103485772971668e-05, + "loss": 2.8597, + "step": 18042 + }, + { + "epoch": 1.6346628615433398, + "grad_norm": 0.8363438248634338, + "learning_rate": 9.102881652872591e-05, + "loss": 2.4079, + "step": 18043 + }, + { + "epoch": 1.6347534597177868, + "grad_norm": 1.01633882522583, + "learning_rate": 9.102277532773516e-05, + "loss": 2.9794, + "step": 18044 + }, + { + "epoch": 1.6348440578922334, + "grad_norm": 0.925079345703125, + "learning_rate": 9.101673412674439e-05, + "loss": 2.7104, + "step": 18045 + }, + { + "epoch": 1.6349346560666802, + "grad_norm": 0.8522437214851379, + "learning_rate": 9.101069292575365e-05, + "loss": 2.6123, + "step": 18046 + }, + { + "epoch": 1.635025254241127, + "grad_norm": 0.9461352229118347, + "learning_rate": 9.100465172476289e-05, + "loss": 2.7384, + "step": 18047 + }, + { + "epoch": 1.6351158524155738, + "grad_norm": 0.7944126725196838, + "learning_rate": 9.099861052377213e-05, + "loss": 2.0487, + "step": 18048 + }, + { + "epoch": 1.6352064505900206, + "grad_norm": 0.7697715759277344, + "learning_rate": 9.099256932278137e-05, + "loss": 2.0893, + "step": 18049 + }, + { + "epoch": 1.6352970487644674, + "grad_norm": 0.9694502353668213, + "learning_rate": 9.098652812179062e-05, + "loss": 2.4437, + "step": 18050 + }, + { + "epoch": 1.6353876469389141, + "grad_norm": 0.8772050738334656, + "learning_rate": 9.098048692079986e-05, + "loss": 2.4214, + "step": 18051 + }, + { + "epoch": 1.635478245113361, + "grad_norm": 0.9331088066101074, + "learning_rate": 9.09744457198091e-05, + "loss": 2.9305, + "step": 18052 + }, + { + "epoch": 1.6355688432878077, + "grad_norm": 0.9534633159637451, + "learning_rate": 9.096840451881834e-05, + "loss": 2.8042, + "step": 18053 + }, + { + "epoch": 1.6356594414622545, + "grad_norm": 1.0447006225585938, + "learning_rate": 9.096236331782759e-05, + "loss": 2.6918, + "step": 18054 + }, + { + "epoch": 1.6357500396367013, + "grad_norm": 0.9348125457763672, + "learning_rate": 9.095632211683684e-05, + "loss": 2.6942, + "step": 18055 + }, + { + "epoch": 1.635840637811148, + "grad_norm": 0.9281875491142273, + "learning_rate": 9.095028091584607e-05, + "loss": 2.6069, + "step": 18056 + }, + { + "epoch": 1.6359312359855949, + "grad_norm": 0.9397428035736084, + "learning_rate": 9.094423971485532e-05, + "loss": 2.9433, + "step": 18057 + }, + { + "epoch": 1.6360218341600417, + "grad_norm": 0.9259713292121887, + "learning_rate": 9.093819851386456e-05, + "loss": 2.8695, + "step": 18058 + }, + { + "epoch": 1.6361124323344884, + "grad_norm": 1.020525336265564, + "learning_rate": 9.09321573128738e-05, + "loss": 2.8602, + "step": 18059 + }, + { + "epoch": 1.6362030305089352, + "grad_norm": 0.9153118133544922, + "learning_rate": 9.092611611188304e-05, + "loss": 2.5936, + "step": 18060 + }, + { + "epoch": 1.636293628683382, + "grad_norm": 0.9142446517944336, + "learning_rate": 9.09200749108923e-05, + "loss": 2.6798, + "step": 18061 + }, + { + "epoch": 1.6363842268578288, + "grad_norm": 0.9150940775871277, + "learning_rate": 9.091403370990153e-05, + "loss": 2.6588, + "step": 18062 + }, + { + "epoch": 1.6364748250322756, + "grad_norm": 0.912600576877594, + "learning_rate": 9.090799250891078e-05, + "loss": 2.7387, + "step": 18063 + }, + { + "epoch": 1.6365654232067224, + "grad_norm": 0.9422318339347839, + "learning_rate": 9.090195130792001e-05, + "loss": 3.0413, + "step": 18064 + }, + { + "epoch": 1.6366560213811692, + "grad_norm": 0.9327174425125122, + "learning_rate": 9.089591010692926e-05, + "loss": 2.6391, + "step": 18065 + }, + { + "epoch": 1.636746619555616, + "grad_norm": 0.8984633684158325, + "learning_rate": 9.08898689059385e-05, + "loss": 2.7496, + "step": 18066 + }, + { + "epoch": 1.6368372177300627, + "grad_norm": 0.8948565125465393, + "learning_rate": 9.088382770494774e-05, + "loss": 2.9527, + "step": 18067 + }, + { + "epoch": 1.6369278159045095, + "grad_norm": 0.975431501865387, + "learning_rate": 9.087778650395699e-05, + "loss": 2.8076, + "step": 18068 + }, + { + "epoch": 1.6370184140789563, + "grad_norm": 0.8682781457901001, + "learning_rate": 9.087174530296624e-05, + "loss": 2.1375, + "step": 18069 + }, + { + "epoch": 1.637109012253403, + "grad_norm": 0.8720269799232483, + "learning_rate": 9.086570410197547e-05, + "loss": 2.6818, + "step": 18070 + }, + { + "epoch": 1.63719961042785, + "grad_norm": 0.8419937491416931, + "learning_rate": 9.085966290098472e-05, + "loss": 2.4593, + "step": 18071 + }, + { + "epoch": 1.6372902086022967, + "grad_norm": 0.9172002077102661, + "learning_rate": 9.085362169999397e-05, + "loss": 2.058, + "step": 18072 + }, + { + "epoch": 1.6373808067767435, + "grad_norm": 0.9406450986862183, + "learning_rate": 9.08475804990032e-05, + "loss": 2.5034, + "step": 18073 + }, + { + "epoch": 1.6374714049511903, + "grad_norm": 0.7722863554954529, + "learning_rate": 9.084153929801245e-05, + "loss": 1.9177, + "step": 18074 + }, + { + "epoch": 1.637562003125637, + "grad_norm": 0.8731490969657898, + "learning_rate": 9.083549809702168e-05, + "loss": 2.8873, + "step": 18075 + }, + { + "epoch": 1.6376526013000838, + "grad_norm": 0.9210612773895264, + "learning_rate": 9.082945689603095e-05, + "loss": 2.5152, + "step": 18076 + }, + { + "epoch": 1.6377431994745306, + "grad_norm": 0.8284095525741577, + "learning_rate": 9.082341569504018e-05, + "loss": 2.6091, + "step": 18077 + }, + { + "epoch": 1.6378337976489774, + "grad_norm": 0.8877255916595459, + "learning_rate": 9.081737449404943e-05, + "loss": 2.6063, + "step": 18078 + }, + { + "epoch": 1.6379243958234242, + "grad_norm": 0.8800955414772034, + "learning_rate": 9.081133329305866e-05, + "loss": 2.9661, + "step": 18079 + }, + { + "epoch": 1.638014993997871, + "grad_norm": 1.0406115055084229, + "learning_rate": 9.080529209206791e-05, + "loss": 2.7719, + "step": 18080 + }, + { + "epoch": 1.6381055921723178, + "grad_norm": 0.8840624094009399, + "learning_rate": 9.079925089107714e-05, + "loss": 2.694, + "step": 18081 + }, + { + "epoch": 1.6381961903467646, + "grad_norm": 0.8794461488723755, + "learning_rate": 9.079320969008639e-05, + "loss": 2.7644, + "step": 18082 + }, + { + "epoch": 1.6382867885212113, + "grad_norm": 0.8427479863166809, + "learning_rate": 9.078716848909564e-05, + "loss": 2.8676, + "step": 18083 + }, + { + "epoch": 1.6383773866956581, + "grad_norm": 0.8509645462036133, + "learning_rate": 9.078112728810489e-05, + "loss": 2.5303, + "step": 18084 + }, + { + "epoch": 1.638467984870105, + "grad_norm": 0.8246362805366516, + "learning_rate": 9.077508608711412e-05, + "loss": 2.042, + "step": 18085 + }, + { + "epoch": 1.6385585830445515, + "grad_norm": 0.8733285665512085, + "learning_rate": 9.076904488612337e-05, + "loss": 2.7536, + "step": 18086 + }, + { + "epoch": 1.6386491812189985, + "grad_norm": 0.842009961605072, + "learning_rate": 9.076300368513261e-05, + "loss": 2.185, + "step": 18087 + }, + { + "epoch": 1.638739779393445, + "grad_norm": 0.8964748382568359, + "learning_rate": 9.075696248414185e-05, + "loss": 2.6919, + "step": 18088 + }, + { + "epoch": 1.638830377567892, + "grad_norm": 0.8854851722717285, + "learning_rate": 9.07509212831511e-05, + "loss": 2.37, + "step": 18089 + }, + { + "epoch": 1.6389209757423386, + "grad_norm": 0.8827214241027832, + "learning_rate": 9.074488008216033e-05, + "loss": 2.8033, + "step": 18090 + }, + { + "epoch": 1.6390115739167856, + "grad_norm": 0.9190190434455872, + "learning_rate": 9.073883888116959e-05, + "loss": 2.6481, + "step": 18091 + }, + { + "epoch": 1.6391021720912322, + "grad_norm": 1.029134750366211, + "learning_rate": 9.073279768017883e-05, + "loss": 2.8875, + "step": 18092 + }, + { + "epoch": 1.6391927702656792, + "grad_norm": 0.8370829224586487, + "learning_rate": 9.072675647918807e-05, + "loss": 2.6589, + "step": 18093 + }, + { + "epoch": 1.6392833684401258, + "grad_norm": 0.8954204320907593, + "learning_rate": 9.072071527819731e-05, + "loss": 2.7584, + "step": 18094 + }, + { + "epoch": 1.6393739666145728, + "grad_norm": 0.9637326598167419, + "learning_rate": 9.071467407720655e-05, + "loss": 2.5529, + "step": 18095 + }, + { + "epoch": 1.6394645647890194, + "grad_norm": 0.900023341178894, + "learning_rate": 9.070863287621579e-05, + "loss": 2.809, + "step": 18096 + }, + { + "epoch": 1.6395551629634664, + "grad_norm": 0.9681251645088196, + "learning_rate": 9.070259167522504e-05, + "loss": 2.6298, + "step": 18097 + }, + { + "epoch": 1.639645761137913, + "grad_norm": 0.909743070602417, + "learning_rate": 9.069655047423428e-05, + "loss": 2.6945, + "step": 18098 + }, + { + "epoch": 1.63973635931236, + "grad_norm": 0.9249989986419678, + "learning_rate": 9.069050927324353e-05, + "loss": 2.6956, + "step": 18099 + }, + { + "epoch": 1.6398269574868065, + "grad_norm": 0.9105225205421448, + "learning_rate": 9.068446807225277e-05, + "loss": 2.667, + "step": 18100 + }, + { + "epoch": 1.6399175556612535, + "grad_norm": 0.8311652541160583, + "learning_rate": 9.067842687126201e-05, + "loss": 2.5664, + "step": 18101 + }, + { + "epoch": 1.6400081538357, + "grad_norm": 0.8155735731124878, + "learning_rate": 9.067238567027125e-05, + "loss": 2.7225, + "step": 18102 + }, + { + "epoch": 1.640098752010147, + "grad_norm": 0.9229670763015747, + "learning_rate": 9.06663444692805e-05, + "loss": 2.8118, + "step": 18103 + }, + { + "epoch": 1.6401893501845937, + "grad_norm": 0.897759735584259, + "learning_rate": 9.066030326828974e-05, + "loss": 2.8223, + "step": 18104 + }, + { + "epoch": 1.6402799483590407, + "grad_norm": 0.92965167760849, + "learning_rate": 9.065426206729898e-05, + "loss": 2.8676, + "step": 18105 + }, + { + "epoch": 1.6403705465334872, + "grad_norm": 0.9508523941040039, + "learning_rate": 9.064822086630824e-05, + "loss": 2.7747, + "step": 18106 + }, + { + "epoch": 1.6404611447079342, + "grad_norm": 0.8897296190261841, + "learning_rate": 9.064217966531747e-05, + "loss": 2.5997, + "step": 18107 + }, + { + "epoch": 1.6405517428823808, + "grad_norm": 0.9368707537651062, + "learning_rate": 9.063613846432672e-05, + "loss": 2.7292, + "step": 18108 + }, + { + "epoch": 1.6406423410568278, + "grad_norm": 0.9517653584480286, + "learning_rate": 9.063009726333595e-05, + "loss": 2.806, + "step": 18109 + }, + { + "epoch": 1.6407329392312744, + "grad_norm": 0.9288533926010132, + "learning_rate": 9.06240560623452e-05, + "loss": 2.6732, + "step": 18110 + }, + { + "epoch": 1.6408235374057214, + "grad_norm": 0.902129054069519, + "learning_rate": 9.061801486135444e-05, + "loss": 2.5981, + "step": 18111 + }, + { + "epoch": 1.640914135580168, + "grad_norm": 0.8885258436203003, + "learning_rate": 9.061197366036368e-05, + "loss": 2.6937, + "step": 18112 + }, + { + "epoch": 1.641004733754615, + "grad_norm": 0.8660950660705566, + "learning_rate": 9.060593245937293e-05, + "loss": 2.6464, + "step": 18113 + }, + { + "epoch": 1.6410953319290615, + "grad_norm": 1.0830143690109253, + "learning_rate": 9.059989125838218e-05, + "loss": 2.6892, + "step": 18114 + }, + { + "epoch": 1.6411859301035086, + "grad_norm": 0.8982396721839905, + "learning_rate": 9.059385005739141e-05, + "loss": 2.8517, + "step": 18115 + }, + { + "epoch": 1.6412765282779551, + "grad_norm": 1.023343563079834, + "learning_rate": 9.058780885640066e-05, + "loss": 2.6327, + "step": 18116 + }, + { + "epoch": 1.6413671264524021, + "grad_norm": 0.8670898675918579, + "learning_rate": 9.05817676554099e-05, + "loss": 2.6002, + "step": 18117 + }, + { + "epoch": 1.6414577246268487, + "grad_norm": 0.837733268737793, + "learning_rate": 9.057572645441914e-05, + "loss": 2.516, + "step": 18118 + }, + { + "epoch": 1.6415483228012957, + "grad_norm": 0.9155449271202087, + "learning_rate": 9.056968525342839e-05, + "loss": 2.6495, + "step": 18119 + }, + { + "epoch": 1.6416389209757423, + "grad_norm": 0.9150014519691467, + "learning_rate": 9.056364405243762e-05, + "loss": 2.8295, + "step": 18120 + }, + { + "epoch": 1.6417295191501893, + "grad_norm": 0.9261459708213806, + "learning_rate": 9.055760285144687e-05, + "loss": 2.8872, + "step": 18121 + }, + { + "epoch": 1.6418201173246358, + "grad_norm": 0.8950507640838623, + "learning_rate": 9.055156165045612e-05, + "loss": 2.4885, + "step": 18122 + }, + { + "epoch": 1.6419107154990829, + "grad_norm": 0.9220141768455505, + "learning_rate": 9.054552044946537e-05, + "loss": 2.5852, + "step": 18123 + }, + { + "epoch": 1.6420013136735294, + "grad_norm": 0.9211923480033875, + "learning_rate": 9.05394792484746e-05, + "loss": 2.6001, + "step": 18124 + }, + { + "epoch": 1.6420919118479764, + "grad_norm": 0.8518920540809631, + "learning_rate": 9.053343804748385e-05, + "loss": 2.8429, + "step": 18125 + }, + { + "epoch": 1.642182510022423, + "grad_norm": 0.7977739572525024, + "learning_rate": 9.052739684649308e-05, + "loss": 2.0745, + "step": 18126 + }, + { + "epoch": 1.6422731081968698, + "grad_norm": 0.9323211312294006, + "learning_rate": 9.052135564550233e-05, + "loss": 2.7783, + "step": 18127 + }, + { + "epoch": 1.6423637063713166, + "grad_norm": 0.8648313283920288, + "learning_rate": 9.051531444451158e-05, + "loss": 2.7353, + "step": 18128 + }, + { + "epoch": 1.6424543045457634, + "grad_norm": 0.9221898317337036, + "learning_rate": 9.050927324352082e-05, + "loss": 2.8677, + "step": 18129 + }, + { + "epoch": 1.6425449027202101, + "grad_norm": 0.8893070220947266, + "learning_rate": 9.050323204253006e-05, + "loss": 2.745, + "step": 18130 + }, + { + "epoch": 1.642635500894657, + "grad_norm": 0.9229108691215515, + "learning_rate": 9.04971908415393e-05, + "loss": 2.8027, + "step": 18131 + }, + { + "epoch": 1.6427260990691037, + "grad_norm": 0.8543931841850281, + "learning_rate": 9.049114964054854e-05, + "loss": 2.6617, + "step": 18132 + }, + { + "epoch": 1.6428166972435505, + "grad_norm": 0.9468865394592285, + "learning_rate": 9.048510843955779e-05, + "loss": 2.3288, + "step": 18133 + }, + { + "epoch": 1.6429072954179973, + "grad_norm": 0.9006844758987427, + "learning_rate": 9.047906723856702e-05, + "loss": 2.7587, + "step": 18134 + }, + { + "epoch": 1.642997893592444, + "grad_norm": 0.8650259971618652, + "learning_rate": 9.047302603757627e-05, + "loss": 2.5884, + "step": 18135 + }, + { + "epoch": 1.6430884917668909, + "grad_norm": 0.8925493955612183, + "learning_rate": 9.046698483658552e-05, + "loss": 2.4538, + "step": 18136 + }, + { + "epoch": 1.6431790899413377, + "grad_norm": 0.8435426354408264, + "learning_rate": 9.046094363559476e-05, + "loss": 2.6304, + "step": 18137 + }, + { + "epoch": 1.6432696881157844, + "grad_norm": 0.9044862389564514, + "learning_rate": 9.045490243460401e-05, + "loss": 2.5032, + "step": 18138 + }, + { + "epoch": 1.6433602862902312, + "grad_norm": 0.9769571423530579, + "learning_rate": 9.044886123361325e-05, + "loss": 2.5958, + "step": 18139 + }, + { + "epoch": 1.643450884464678, + "grad_norm": 0.9177607297897339, + "learning_rate": 9.04428200326225e-05, + "loss": 2.8634, + "step": 18140 + }, + { + "epoch": 1.6435414826391248, + "grad_norm": 0.93782639503479, + "learning_rate": 9.043677883163173e-05, + "loss": 2.5463, + "step": 18141 + }, + { + "epoch": 1.6436320808135716, + "grad_norm": 0.8761042952537537, + "learning_rate": 9.043073763064098e-05, + "loss": 2.6161, + "step": 18142 + }, + { + "epoch": 1.6437226789880184, + "grad_norm": 0.9106640815734863, + "learning_rate": 9.042469642965022e-05, + "loss": 2.7441, + "step": 18143 + }, + { + "epoch": 1.6438132771624652, + "grad_norm": 1.0361779928207397, + "learning_rate": 9.041865522865947e-05, + "loss": 2.6571, + "step": 18144 + }, + { + "epoch": 1.643903875336912, + "grad_norm": 0.8799026012420654, + "learning_rate": 9.04126140276687e-05, + "loss": 2.6895, + "step": 18145 + }, + { + "epoch": 1.6439944735113587, + "grad_norm": 0.898806095123291, + "learning_rate": 9.040657282667795e-05, + "loss": 2.6728, + "step": 18146 + }, + { + "epoch": 1.6440850716858055, + "grad_norm": 0.8727999925613403, + "learning_rate": 9.040053162568719e-05, + "loss": 2.6738, + "step": 18147 + }, + { + "epoch": 1.6441756698602523, + "grad_norm": 0.9415304660797119, + "learning_rate": 9.039449042469643e-05, + "loss": 2.6719, + "step": 18148 + }, + { + "epoch": 1.644266268034699, + "grad_norm": 1.0761653184890747, + "learning_rate": 9.038844922370567e-05, + "loss": 3.0828, + "step": 18149 + }, + { + "epoch": 1.644356866209146, + "grad_norm": 1.0517741441726685, + "learning_rate": 9.038240802271492e-05, + "loss": 2.7269, + "step": 18150 + }, + { + "epoch": 1.6444474643835927, + "grad_norm": 0.9357236623764038, + "learning_rate": 9.037636682172416e-05, + "loss": 2.9138, + "step": 18151 + }, + { + "epoch": 1.6445380625580395, + "grad_norm": 0.9129374623298645, + "learning_rate": 9.037032562073341e-05, + "loss": 2.7641, + "step": 18152 + }, + { + "epoch": 1.6446286607324863, + "grad_norm": 0.8310191631317139, + "learning_rate": 9.036428441974264e-05, + "loss": 2.0603, + "step": 18153 + }, + { + "epoch": 1.644719258906933, + "grad_norm": 0.8940138220787048, + "learning_rate": 9.035824321875189e-05, + "loss": 2.5414, + "step": 18154 + }, + { + "epoch": 1.6448098570813798, + "grad_norm": 0.7609754204750061, + "learning_rate": 9.035220201776114e-05, + "loss": 2.0016, + "step": 18155 + }, + { + "epoch": 1.6449004552558266, + "grad_norm": 0.9381344318389893, + "learning_rate": 9.034616081677037e-05, + "loss": 2.7381, + "step": 18156 + }, + { + "epoch": 1.6449910534302734, + "grad_norm": 0.8352827429771423, + "learning_rate": 9.034011961577962e-05, + "loss": 2.7614, + "step": 18157 + }, + { + "epoch": 1.6450816516047202, + "grad_norm": 0.9309098720550537, + "learning_rate": 9.033407841478887e-05, + "loss": 2.7758, + "step": 18158 + }, + { + "epoch": 1.645172249779167, + "grad_norm": 0.9300452470779419, + "learning_rate": 9.032803721379812e-05, + "loss": 2.8636, + "step": 18159 + }, + { + "epoch": 1.6452628479536138, + "grad_norm": 0.8428595066070557, + "learning_rate": 9.032199601280735e-05, + "loss": 2.6122, + "step": 18160 + }, + { + "epoch": 1.6453534461280606, + "grad_norm": 0.8717901110649109, + "learning_rate": 9.03159548118166e-05, + "loss": 2.4919, + "step": 18161 + }, + { + "epoch": 1.6454440443025073, + "grad_norm": 0.8891830444335938, + "learning_rate": 9.030991361082583e-05, + "loss": 2.4796, + "step": 18162 + }, + { + "epoch": 1.6455346424769541, + "grad_norm": 0.9507037401199341, + "learning_rate": 9.030387240983508e-05, + "loss": 2.6384, + "step": 18163 + }, + { + "epoch": 1.645625240651401, + "grad_norm": 0.9019473791122437, + "learning_rate": 9.029783120884431e-05, + "loss": 2.8428, + "step": 18164 + }, + { + "epoch": 1.6457158388258477, + "grad_norm": 0.9886520504951477, + "learning_rate": 9.029179000785356e-05, + "loss": 2.843, + "step": 18165 + }, + { + "epoch": 1.6458064370002945, + "grad_norm": 0.8504136204719543, + "learning_rate": 9.028574880686281e-05, + "loss": 2.5739, + "step": 18166 + }, + { + "epoch": 1.645897035174741, + "grad_norm": 0.9501267075538635, + "learning_rate": 9.027970760587206e-05, + "loss": 3.0408, + "step": 18167 + }, + { + "epoch": 1.645987633349188, + "grad_norm": 0.9166481494903564, + "learning_rate": 9.027366640488129e-05, + "loss": 2.9826, + "step": 18168 + }, + { + "epoch": 1.6460782315236346, + "grad_norm": 0.9029484987258911, + "learning_rate": 9.026762520389054e-05, + "loss": 2.4385, + "step": 18169 + }, + { + "epoch": 1.6461688296980816, + "grad_norm": 0.9071301817893982, + "learning_rate": 9.026158400289977e-05, + "loss": 2.8337, + "step": 18170 + }, + { + "epoch": 1.6462594278725282, + "grad_norm": 0.8628717660903931, + "learning_rate": 9.025554280190902e-05, + "loss": 2.4586, + "step": 18171 + }, + { + "epoch": 1.6463500260469752, + "grad_norm": 0.891511857509613, + "learning_rate": 9.024950160091827e-05, + "loss": 2.5013, + "step": 18172 + }, + { + "epoch": 1.6464406242214218, + "grad_norm": 0.9349640607833862, + "learning_rate": 9.024346039992752e-05, + "loss": 2.7114, + "step": 18173 + }, + { + "epoch": 1.6465312223958688, + "grad_norm": 0.9569403529167175, + "learning_rate": 9.023741919893676e-05, + "loss": 2.5742, + "step": 18174 + }, + { + "epoch": 1.6466218205703154, + "grad_norm": 0.8346837759017944, + "learning_rate": 9.0231377997946e-05, + "loss": 2.6875, + "step": 18175 + }, + { + "epoch": 1.6467124187447624, + "grad_norm": 0.9427783489227295, + "learning_rate": 9.022533679695524e-05, + "loss": 2.5674, + "step": 18176 + }, + { + "epoch": 1.646803016919209, + "grad_norm": 0.9125744104385376, + "learning_rate": 9.021929559596448e-05, + "loss": 2.7826, + "step": 18177 + }, + { + "epoch": 1.646893615093656, + "grad_norm": 0.8527501821517944, + "learning_rate": 9.021325439497373e-05, + "loss": 2.5147, + "step": 18178 + }, + { + "epoch": 1.6469842132681025, + "grad_norm": 0.9372243881225586, + "learning_rate": 9.020721319398296e-05, + "loss": 2.9387, + "step": 18179 + }, + { + "epoch": 1.6470748114425495, + "grad_norm": 0.912359893321991, + "learning_rate": 9.020117199299221e-05, + "loss": 2.6923, + "step": 18180 + }, + { + "epoch": 1.647165409616996, + "grad_norm": 0.8825433850288391, + "learning_rate": 9.019513079200146e-05, + "loss": 2.7089, + "step": 18181 + }, + { + "epoch": 1.647256007791443, + "grad_norm": 0.9258890748023987, + "learning_rate": 9.01890895910107e-05, + "loss": 2.8501, + "step": 18182 + }, + { + "epoch": 1.6473466059658897, + "grad_norm": 0.9423608779907227, + "learning_rate": 9.018304839001994e-05, + "loss": 2.6617, + "step": 18183 + }, + { + "epoch": 1.6474372041403367, + "grad_norm": 0.9716705083847046, + "learning_rate": 9.017700718902918e-05, + "loss": 2.7524, + "step": 18184 + }, + { + "epoch": 1.6475278023147832, + "grad_norm": 0.9991719126701355, + "learning_rate": 9.017096598803842e-05, + "loss": 2.6142, + "step": 18185 + }, + { + "epoch": 1.6476184004892303, + "grad_norm": 0.9290167689323425, + "learning_rate": 9.016492478704767e-05, + "loss": 2.8936, + "step": 18186 + }, + { + "epoch": 1.6477089986636768, + "grad_norm": 0.9563291072845459, + "learning_rate": 9.015888358605691e-05, + "loss": 3.0019, + "step": 18187 + }, + { + "epoch": 1.6477995968381238, + "grad_norm": 0.9286016821861267, + "learning_rate": 9.015284238506616e-05, + "loss": 2.583, + "step": 18188 + }, + { + "epoch": 1.6478901950125704, + "grad_norm": 0.8869751691818237, + "learning_rate": 9.01468011840754e-05, + "loss": 2.5064, + "step": 18189 + }, + { + "epoch": 1.6479807931870174, + "grad_norm": 0.8410462737083435, + "learning_rate": 9.014075998308464e-05, + "loss": 2.6936, + "step": 18190 + }, + { + "epoch": 1.648071391361464, + "grad_norm": 0.9554753303527832, + "learning_rate": 9.013471878209389e-05, + "loss": 3.1504, + "step": 18191 + }, + { + "epoch": 1.648161989535911, + "grad_norm": 0.9309815168380737, + "learning_rate": 9.012867758110313e-05, + "loss": 2.5321, + "step": 18192 + }, + { + "epoch": 1.6482525877103575, + "grad_norm": 0.9543787837028503, + "learning_rate": 9.012263638011237e-05, + "loss": 2.6785, + "step": 18193 + }, + { + "epoch": 1.6483431858848046, + "grad_norm": 0.9655352234840393, + "learning_rate": 9.011659517912161e-05, + "loss": 2.783, + "step": 18194 + }, + { + "epoch": 1.6484337840592511, + "grad_norm": 0.8993216156959534, + "learning_rate": 9.011055397813085e-05, + "loss": 2.6346, + "step": 18195 + }, + { + "epoch": 1.6485243822336981, + "grad_norm": 0.9039348363876343, + "learning_rate": 9.01045127771401e-05, + "loss": 2.8424, + "step": 18196 + }, + { + "epoch": 1.6486149804081447, + "grad_norm": 1.0447582006454468, + "learning_rate": 9.009847157614935e-05, + "loss": 2.7273, + "step": 18197 + }, + { + "epoch": 1.6487055785825917, + "grad_norm": 0.7284020781517029, + "learning_rate": 9.009243037515858e-05, + "loss": 1.8534, + "step": 18198 + }, + { + "epoch": 1.6487961767570383, + "grad_norm": 0.8975841403007507, + "learning_rate": 9.008638917416783e-05, + "loss": 2.7537, + "step": 18199 + }, + { + "epoch": 1.6488867749314853, + "grad_norm": 0.8977112174034119, + "learning_rate": 9.008034797317707e-05, + "loss": 2.7592, + "step": 18200 + }, + { + "epoch": 1.6489773731059318, + "grad_norm": 0.9130078554153442, + "learning_rate": 9.007430677218631e-05, + "loss": 2.6537, + "step": 18201 + }, + { + "epoch": 1.6490679712803789, + "grad_norm": 0.8793750405311584, + "learning_rate": 9.006826557119555e-05, + "loss": 2.749, + "step": 18202 + }, + { + "epoch": 1.6491585694548254, + "grad_norm": 0.9326441884040833, + "learning_rate": 9.006222437020481e-05, + "loss": 2.8935, + "step": 18203 + }, + { + "epoch": 1.6492491676292724, + "grad_norm": 0.7533090114593506, + "learning_rate": 9.005618316921404e-05, + "loss": 1.7799, + "step": 18204 + }, + { + "epoch": 1.649339765803719, + "grad_norm": 1.036420464515686, + "learning_rate": 9.005014196822329e-05, + "loss": 3.0132, + "step": 18205 + }, + { + "epoch": 1.649430363978166, + "grad_norm": 0.810249388217926, + "learning_rate": 9.004410076723254e-05, + "loss": 2.1802, + "step": 18206 + }, + { + "epoch": 1.6495209621526126, + "grad_norm": 1.0821841955184937, + "learning_rate": 9.003805956624177e-05, + "loss": 2.6417, + "step": 18207 + }, + { + "epoch": 1.6496115603270594, + "grad_norm": 0.933910608291626, + "learning_rate": 9.003201836525102e-05, + "loss": 2.828, + "step": 18208 + }, + { + "epoch": 1.6497021585015061, + "grad_norm": 0.9127252697944641, + "learning_rate": 9.002597716426025e-05, + "loss": 2.7604, + "step": 18209 + }, + { + "epoch": 1.649792756675953, + "grad_norm": 0.9532564878463745, + "learning_rate": 9.00199359632695e-05, + "loss": 2.3568, + "step": 18210 + }, + { + "epoch": 1.6498833548503997, + "grad_norm": 0.9313064217567444, + "learning_rate": 9.001389476227875e-05, + "loss": 2.8504, + "step": 18211 + }, + { + "epoch": 1.6499739530248465, + "grad_norm": 0.8868657350540161, + "learning_rate": 9.0007853561288e-05, + "loss": 2.7972, + "step": 18212 + }, + { + "epoch": 1.6500645511992933, + "grad_norm": 0.9468491673469543, + "learning_rate": 9.000181236029723e-05, + "loss": 2.6845, + "step": 18213 + }, + { + "epoch": 1.65015514937374, + "grad_norm": 0.9163763523101807, + "learning_rate": 8.999577115930648e-05, + "loss": 2.6259, + "step": 18214 + }, + { + "epoch": 1.6502457475481869, + "grad_norm": 0.8871495127677917, + "learning_rate": 8.998972995831571e-05, + "loss": 2.6744, + "step": 18215 + }, + { + "epoch": 1.6503363457226337, + "grad_norm": 0.880790650844574, + "learning_rate": 8.998368875732496e-05, + "loss": 2.5121, + "step": 18216 + }, + { + "epoch": 1.6504269438970804, + "grad_norm": 0.8479815721511841, + "learning_rate": 8.99776475563342e-05, + "loss": 2.681, + "step": 18217 + }, + { + "epoch": 1.6505175420715272, + "grad_norm": 0.9165877103805542, + "learning_rate": 8.997160635534345e-05, + "loss": 2.713, + "step": 18218 + }, + { + "epoch": 1.650608140245974, + "grad_norm": 0.8567327260971069, + "learning_rate": 8.996556515435269e-05, + "loss": 2.652, + "step": 18219 + }, + { + "epoch": 1.6506987384204208, + "grad_norm": 0.8926439881324768, + "learning_rate": 8.995952395336194e-05, + "loss": 2.9087, + "step": 18220 + }, + { + "epoch": 1.6507893365948676, + "grad_norm": 0.9576330780982971, + "learning_rate": 8.995348275237117e-05, + "loss": 2.7186, + "step": 18221 + }, + { + "epoch": 1.6508799347693144, + "grad_norm": 0.911191999912262, + "learning_rate": 8.994744155138042e-05, + "loss": 2.8122, + "step": 18222 + }, + { + "epoch": 1.6509705329437612, + "grad_norm": 0.8997510075569153, + "learning_rate": 8.994140035038967e-05, + "loss": 2.7523, + "step": 18223 + }, + { + "epoch": 1.651061131118208, + "grad_norm": 0.8866646885871887, + "learning_rate": 8.99353591493989e-05, + "loss": 2.7139, + "step": 18224 + }, + { + "epoch": 1.6511517292926547, + "grad_norm": 0.6327499151229858, + "learning_rate": 8.992931794840815e-05, + "loss": 1.3688, + "step": 18225 + }, + { + "epoch": 1.6512423274671015, + "grad_norm": 0.9560464024543762, + "learning_rate": 8.99232767474174e-05, + "loss": 2.5594, + "step": 18226 + }, + { + "epoch": 1.6513329256415483, + "grad_norm": 0.9978243708610535, + "learning_rate": 8.991723554642664e-05, + "loss": 2.865, + "step": 18227 + }, + { + "epoch": 1.651423523815995, + "grad_norm": 0.9212357997894287, + "learning_rate": 8.991119434543588e-05, + "loss": 2.493, + "step": 18228 + }, + { + "epoch": 1.651514121990442, + "grad_norm": 0.9857481718063354, + "learning_rate": 8.990515314444512e-05, + "loss": 2.7785, + "step": 18229 + }, + { + "epoch": 1.6516047201648887, + "grad_norm": 0.9539110064506531, + "learning_rate": 8.989911194345436e-05, + "loss": 2.9184, + "step": 18230 + }, + { + "epoch": 1.6516953183393355, + "grad_norm": 0.8360929489135742, + "learning_rate": 8.98930707424636e-05, + "loss": 2.7078, + "step": 18231 + }, + { + "epoch": 1.6517859165137823, + "grad_norm": 0.8722622394561768, + "learning_rate": 8.988702954147284e-05, + "loss": 2.9214, + "step": 18232 + }, + { + "epoch": 1.651876514688229, + "grad_norm": 0.7969260215759277, + "learning_rate": 8.98809883404821e-05, + "loss": 1.9998, + "step": 18233 + }, + { + "epoch": 1.6519671128626758, + "grad_norm": 0.8690128326416016, + "learning_rate": 8.987494713949133e-05, + "loss": 2.6187, + "step": 18234 + }, + { + "epoch": 1.6520577110371226, + "grad_norm": 0.9408653974533081, + "learning_rate": 8.986890593850058e-05, + "loss": 2.7025, + "step": 18235 + }, + { + "epoch": 1.6521483092115694, + "grad_norm": 0.8932421803474426, + "learning_rate": 8.986286473750982e-05, + "loss": 2.6578, + "step": 18236 + }, + { + "epoch": 1.6522389073860162, + "grad_norm": 0.8768439888954163, + "learning_rate": 8.985682353651906e-05, + "loss": 2.8921, + "step": 18237 + }, + { + "epoch": 1.652329505560463, + "grad_norm": 0.8841222524642944, + "learning_rate": 8.985078233552831e-05, + "loss": 2.5368, + "step": 18238 + }, + { + "epoch": 1.6524201037349098, + "grad_norm": 1.0872241258621216, + "learning_rate": 8.984474113453755e-05, + "loss": 3.0978, + "step": 18239 + }, + { + "epoch": 1.6525107019093566, + "grad_norm": 0.9194790124893188, + "learning_rate": 8.98386999335468e-05, + "loss": 2.697, + "step": 18240 + }, + { + "epoch": 1.6526013000838033, + "grad_norm": 0.9844908118247986, + "learning_rate": 8.983265873255604e-05, + "loss": 2.7005, + "step": 18241 + }, + { + "epoch": 1.6526918982582501, + "grad_norm": 0.9639205932617188, + "learning_rate": 8.982661753156529e-05, + "loss": 2.9216, + "step": 18242 + }, + { + "epoch": 1.652782496432697, + "grad_norm": 1.0153392553329468, + "learning_rate": 8.982057633057452e-05, + "loss": 2.5227, + "step": 18243 + }, + { + "epoch": 1.6528730946071437, + "grad_norm": 0.9512273669242859, + "learning_rate": 8.981453512958377e-05, + "loss": 2.4906, + "step": 18244 + }, + { + "epoch": 1.6529636927815905, + "grad_norm": 0.9440514445304871, + "learning_rate": 8.9808493928593e-05, + "loss": 2.7604, + "step": 18245 + }, + { + "epoch": 1.6530542909560373, + "grad_norm": 0.8905896544456482, + "learning_rate": 8.980245272760225e-05, + "loss": 2.8001, + "step": 18246 + }, + { + "epoch": 1.653144889130484, + "grad_norm": 0.8847056031227112, + "learning_rate": 8.979641152661149e-05, + "loss": 2.8874, + "step": 18247 + }, + { + "epoch": 1.6532354873049306, + "grad_norm": 0.7422819137573242, + "learning_rate": 8.979037032562075e-05, + "loss": 1.8673, + "step": 18248 + }, + { + "epoch": 1.6533260854793776, + "grad_norm": 0.8758795857429504, + "learning_rate": 8.978432912462998e-05, + "loss": 2.7765, + "step": 18249 + }, + { + "epoch": 1.6534166836538242, + "grad_norm": 0.8646114468574524, + "learning_rate": 8.977828792363923e-05, + "loss": 1.9847, + "step": 18250 + }, + { + "epoch": 1.6535072818282712, + "grad_norm": 0.7608561515808105, + "learning_rate": 8.977224672264846e-05, + "loss": 2.1354, + "step": 18251 + }, + { + "epoch": 1.6535978800027178, + "grad_norm": 0.9168079495429993, + "learning_rate": 8.976620552165771e-05, + "loss": 2.6272, + "step": 18252 + }, + { + "epoch": 1.6536884781771648, + "grad_norm": 0.9063846468925476, + "learning_rate": 8.976016432066694e-05, + "loss": 2.4913, + "step": 18253 + }, + { + "epoch": 1.6537790763516114, + "grad_norm": 0.9428592920303345, + "learning_rate": 8.975412311967619e-05, + "loss": 2.7621, + "step": 18254 + }, + { + "epoch": 1.6538696745260584, + "grad_norm": 0.9246786236763, + "learning_rate": 8.974808191868544e-05, + "loss": 2.4436, + "step": 18255 + }, + { + "epoch": 1.653960272700505, + "grad_norm": 0.823756217956543, + "learning_rate": 8.974204071769469e-05, + "loss": 2.6212, + "step": 18256 + }, + { + "epoch": 1.654050870874952, + "grad_norm": 1.0268938541412354, + "learning_rate": 8.973599951670392e-05, + "loss": 2.805, + "step": 18257 + }, + { + "epoch": 1.6541414690493985, + "grad_norm": 0.8694432377815247, + "learning_rate": 8.972995831571317e-05, + "loss": 2.8824, + "step": 18258 + }, + { + "epoch": 1.6542320672238455, + "grad_norm": 0.9566718339920044, + "learning_rate": 8.972391711472242e-05, + "loss": 2.624, + "step": 18259 + }, + { + "epoch": 1.654322665398292, + "grad_norm": 0.8614005446434021, + "learning_rate": 8.971787591373165e-05, + "loss": 2.718, + "step": 18260 + }, + { + "epoch": 1.654413263572739, + "grad_norm": 0.9140866994857788, + "learning_rate": 8.97118347127409e-05, + "loss": 2.6711, + "step": 18261 + }, + { + "epoch": 1.6545038617471857, + "grad_norm": 0.8566284775733948, + "learning_rate": 8.970579351175013e-05, + "loss": 2.6094, + "step": 18262 + }, + { + "epoch": 1.6545944599216327, + "grad_norm": 0.9391880035400391, + "learning_rate": 8.96997523107594e-05, + "loss": 2.7244, + "step": 18263 + }, + { + "epoch": 1.6546850580960792, + "grad_norm": 0.9213687181472778, + "learning_rate": 8.969371110976863e-05, + "loss": 2.5809, + "step": 18264 + }, + { + "epoch": 1.6547756562705263, + "grad_norm": 0.9262059926986694, + "learning_rate": 8.968766990877788e-05, + "loss": 2.6881, + "step": 18265 + }, + { + "epoch": 1.6548662544449728, + "grad_norm": 0.9336791038513184, + "learning_rate": 8.968162870778711e-05, + "loss": 2.8602, + "step": 18266 + }, + { + "epoch": 1.6549568526194198, + "grad_norm": 0.8931263089179993, + "learning_rate": 8.967558750679636e-05, + "loss": 2.6888, + "step": 18267 + }, + { + "epoch": 1.6550474507938664, + "grad_norm": 0.8991877436637878, + "learning_rate": 8.966954630580559e-05, + "loss": 2.7785, + "step": 18268 + }, + { + "epoch": 1.6551380489683134, + "grad_norm": 0.9588441848754883, + "learning_rate": 8.966350510481484e-05, + "loss": 2.6276, + "step": 18269 + }, + { + "epoch": 1.65522864714276, + "grad_norm": 0.8517259359359741, + "learning_rate": 8.965746390382409e-05, + "loss": 2.5272, + "step": 18270 + }, + { + "epoch": 1.655319245317207, + "grad_norm": 0.905057966709137, + "learning_rate": 8.965142270283333e-05, + "loss": 2.8308, + "step": 18271 + }, + { + "epoch": 1.6554098434916535, + "grad_norm": 0.7956328988075256, + "learning_rate": 8.964538150184257e-05, + "loss": 2.2538, + "step": 18272 + }, + { + "epoch": 1.6555004416661006, + "grad_norm": 0.9279187917709351, + "learning_rate": 8.963934030085182e-05, + "loss": 2.7052, + "step": 18273 + }, + { + "epoch": 1.6555910398405471, + "grad_norm": 0.8651498556137085, + "learning_rate": 8.963329909986106e-05, + "loss": 1.8747, + "step": 18274 + }, + { + "epoch": 1.6556816380149941, + "grad_norm": 0.899602472782135, + "learning_rate": 8.96272578988703e-05, + "loss": 2.9807, + "step": 18275 + }, + { + "epoch": 1.6557722361894407, + "grad_norm": 0.9222041368484497, + "learning_rate": 8.962121669787954e-05, + "loss": 2.673, + "step": 18276 + }, + { + "epoch": 1.6558628343638877, + "grad_norm": 0.8872140049934387, + "learning_rate": 8.961517549688878e-05, + "loss": 2.5136, + "step": 18277 + }, + { + "epoch": 1.6559534325383343, + "grad_norm": 0.609466552734375, + "learning_rate": 8.960913429589804e-05, + "loss": 1.2055, + "step": 18278 + }, + { + "epoch": 1.6560440307127813, + "grad_norm": 0.9458768367767334, + "learning_rate": 8.960309309490727e-05, + "loss": 2.7915, + "step": 18279 + }, + { + "epoch": 1.6561346288872278, + "grad_norm": 0.9066201448440552, + "learning_rate": 8.959705189391652e-05, + "loss": 2.5812, + "step": 18280 + }, + { + "epoch": 1.6562252270616749, + "grad_norm": 0.8740645051002502, + "learning_rate": 8.959101069292576e-05, + "loss": 2.4618, + "step": 18281 + }, + { + "epoch": 1.6563158252361214, + "grad_norm": 0.9384488463401794, + "learning_rate": 8.9584969491935e-05, + "loss": 2.7335, + "step": 18282 + }, + { + "epoch": 1.6564064234105684, + "grad_norm": 0.8732643127441406, + "learning_rate": 8.957892829094424e-05, + "loss": 2.8056, + "step": 18283 + }, + { + "epoch": 1.656497021585015, + "grad_norm": 0.8837721943855286, + "learning_rate": 8.957288708995348e-05, + "loss": 2.7038, + "step": 18284 + }, + { + "epoch": 1.656587619759462, + "grad_norm": 0.9184442758560181, + "learning_rate": 8.956684588896272e-05, + "loss": 2.7828, + "step": 18285 + }, + { + "epoch": 1.6566782179339086, + "grad_norm": 0.9024496078491211, + "learning_rate": 8.956080468797198e-05, + "loss": 2.5757, + "step": 18286 + }, + { + "epoch": 1.6567688161083556, + "grad_norm": 0.9519844651222229, + "learning_rate": 8.955476348698121e-05, + "loss": 2.7829, + "step": 18287 + }, + { + "epoch": 1.6568594142828021, + "grad_norm": 0.9253730773925781, + "learning_rate": 8.954872228599046e-05, + "loss": 2.6593, + "step": 18288 + }, + { + "epoch": 1.656950012457249, + "grad_norm": 0.9876535534858704, + "learning_rate": 8.95426810849997e-05, + "loss": 2.6826, + "step": 18289 + }, + { + "epoch": 1.6570406106316957, + "grad_norm": 0.9083566069602966, + "learning_rate": 8.953663988400894e-05, + "loss": 2.6934, + "step": 18290 + }, + { + "epoch": 1.6571312088061425, + "grad_norm": 0.830508828163147, + "learning_rate": 8.953059868301819e-05, + "loss": 2.7101, + "step": 18291 + }, + { + "epoch": 1.6572218069805893, + "grad_norm": 0.8607782125473022, + "learning_rate": 8.952455748202742e-05, + "loss": 1.8402, + "step": 18292 + }, + { + "epoch": 1.657312405155036, + "grad_norm": 0.9711564183235168, + "learning_rate": 8.951851628103669e-05, + "loss": 2.8317, + "step": 18293 + }, + { + "epoch": 1.6574030033294829, + "grad_norm": 0.9792994856834412, + "learning_rate": 8.951247508004592e-05, + "loss": 2.5745, + "step": 18294 + }, + { + "epoch": 1.6574936015039297, + "grad_norm": 0.8602405190467834, + "learning_rate": 8.950643387905517e-05, + "loss": 1.9043, + "step": 18295 + }, + { + "epoch": 1.6575841996783764, + "grad_norm": 1.023780107498169, + "learning_rate": 8.95003926780644e-05, + "loss": 2.7828, + "step": 18296 + }, + { + "epoch": 1.6576747978528232, + "grad_norm": 0.801779568195343, + "learning_rate": 8.949435147707365e-05, + "loss": 2.0915, + "step": 18297 + }, + { + "epoch": 1.65776539602727, + "grad_norm": 0.911055326461792, + "learning_rate": 8.948831027608288e-05, + "loss": 2.7666, + "step": 18298 + }, + { + "epoch": 1.6578559942017168, + "grad_norm": 0.8334808945655823, + "learning_rate": 8.948226907509213e-05, + "loss": 2.0028, + "step": 18299 + }, + { + "epoch": 1.6579465923761636, + "grad_norm": 0.8638654351234436, + "learning_rate": 8.947622787410137e-05, + "loss": 2.6912, + "step": 18300 + }, + { + "epoch": 1.6580371905506104, + "grad_norm": 0.8715626001358032, + "learning_rate": 8.947018667311063e-05, + "loss": 2.713, + "step": 18301 + }, + { + "epoch": 1.6581277887250572, + "grad_norm": 0.9189971089363098, + "learning_rate": 8.946414547211986e-05, + "loss": 2.7703, + "step": 18302 + }, + { + "epoch": 1.658218386899504, + "grad_norm": 0.894425094127655, + "learning_rate": 8.945810427112911e-05, + "loss": 2.3439, + "step": 18303 + }, + { + "epoch": 1.6583089850739507, + "grad_norm": 0.94671630859375, + "learning_rate": 8.945206307013834e-05, + "loss": 2.7097, + "step": 18304 + }, + { + "epoch": 1.6583995832483975, + "grad_norm": 0.8989958167076111, + "learning_rate": 8.944602186914759e-05, + "loss": 2.7579, + "step": 18305 + }, + { + "epoch": 1.6584901814228443, + "grad_norm": 0.8928999304771423, + "learning_rate": 8.943998066815684e-05, + "loss": 2.6064, + "step": 18306 + }, + { + "epoch": 1.658580779597291, + "grad_norm": 0.9402540326118469, + "learning_rate": 8.943393946716607e-05, + "loss": 2.5926, + "step": 18307 + }, + { + "epoch": 1.658671377771738, + "grad_norm": 0.9786266088485718, + "learning_rate": 8.942789826617532e-05, + "loss": 2.6984, + "step": 18308 + }, + { + "epoch": 1.6587619759461847, + "grad_norm": 0.867085337638855, + "learning_rate": 8.942185706518457e-05, + "loss": 2.3761, + "step": 18309 + }, + { + "epoch": 1.6588525741206315, + "grad_norm": 0.801757276058197, + "learning_rate": 8.941581586419381e-05, + "loss": 1.8409, + "step": 18310 + }, + { + "epoch": 1.6589431722950783, + "grad_norm": 1.0029983520507812, + "learning_rate": 8.940977466320305e-05, + "loss": 2.6411, + "step": 18311 + }, + { + "epoch": 1.659033770469525, + "grad_norm": 0.7979591488838196, + "learning_rate": 8.94037334622123e-05, + "loss": 2.0706, + "step": 18312 + }, + { + "epoch": 1.6591243686439718, + "grad_norm": 0.8741567134857178, + "learning_rate": 8.939769226122153e-05, + "loss": 2.0067, + "step": 18313 + }, + { + "epoch": 1.6592149668184186, + "grad_norm": 0.9018563032150269, + "learning_rate": 8.939165106023078e-05, + "loss": 2.703, + "step": 18314 + }, + { + "epoch": 1.6593055649928654, + "grad_norm": 0.945678174495697, + "learning_rate": 8.938560985924001e-05, + "loss": 2.5829, + "step": 18315 + }, + { + "epoch": 1.6593961631673122, + "grad_norm": 0.9086164832115173, + "learning_rate": 8.937956865824927e-05, + "loss": 2.8992, + "step": 18316 + }, + { + "epoch": 1.659486761341759, + "grad_norm": 0.8727195262908936, + "learning_rate": 8.93735274572585e-05, + "loss": 2.7974, + "step": 18317 + }, + { + "epoch": 1.6595773595162058, + "grad_norm": 0.830090343952179, + "learning_rate": 8.936748625626775e-05, + "loss": 2.4402, + "step": 18318 + }, + { + "epoch": 1.6596679576906526, + "grad_norm": 1.0144177675247192, + "learning_rate": 8.936144505527699e-05, + "loss": 2.3897, + "step": 18319 + }, + { + "epoch": 1.6597585558650993, + "grad_norm": 0.9255249500274658, + "learning_rate": 8.935540385428624e-05, + "loss": 2.8736, + "step": 18320 + }, + { + "epoch": 1.6598491540395461, + "grad_norm": 0.839808464050293, + "learning_rate": 8.934936265329547e-05, + "loss": 2.32, + "step": 18321 + }, + { + "epoch": 1.659939752213993, + "grad_norm": 0.9018031358718872, + "learning_rate": 8.934332145230472e-05, + "loss": 2.6722, + "step": 18322 + }, + { + "epoch": 1.6600303503884397, + "grad_norm": 0.8818577527999878, + "learning_rate": 8.933728025131397e-05, + "loss": 2.6156, + "step": 18323 + }, + { + "epoch": 1.6601209485628865, + "grad_norm": 0.8100004196166992, + "learning_rate": 8.933123905032321e-05, + "loss": 2.0334, + "step": 18324 + }, + { + "epoch": 1.6602115467373333, + "grad_norm": 0.9023257493972778, + "learning_rate": 8.932519784933246e-05, + "loss": 2.6232, + "step": 18325 + }, + { + "epoch": 1.66030214491178, + "grad_norm": 0.8785620331764221, + "learning_rate": 8.93191566483417e-05, + "loss": 2.6316, + "step": 18326 + }, + { + "epoch": 1.6603927430862269, + "grad_norm": 0.8954155445098877, + "learning_rate": 8.931311544735094e-05, + "loss": 2.7928, + "step": 18327 + }, + { + "epoch": 1.6604833412606737, + "grad_norm": 0.8967413306236267, + "learning_rate": 8.930707424636018e-05, + "loss": 3.0472, + "step": 18328 + }, + { + "epoch": 1.6605739394351202, + "grad_norm": 0.9027884006500244, + "learning_rate": 8.930103304536942e-05, + "loss": 2.8133, + "step": 18329 + }, + { + "epoch": 1.6606645376095672, + "grad_norm": 0.8694092631340027, + "learning_rate": 8.929499184437866e-05, + "loss": 2.576, + "step": 18330 + }, + { + "epoch": 1.6607551357840138, + "grad_norm": 0.8981335163116455, + "learning_rate": 8.928895064338792e-05, + "loss": 2.8385, + "step": 18331 + }, + { + "epoch": 1.6608457339584608, + "grad_norm": 0.9346521496772766, + "learning_rate": 8.928290944239715e-05, + "loss": 2.7738, + "step": 18332 + }, + { + "epoch": 1.6609363321329074, + "grad_norm": 0.9246647357940674, + "learning_rate": 8.92768682414064e-05, + "loss": 2.5572, + "step": 18333 + }, + { + "epoch": 1.6610269303073544, + "grad_norm": 0.8665767908096313, + "learning_rate": 8.927082704041563e-05, + "loss": 2.4594, + "step": 18334 + }, + { + "epoch": 1.661117528481801, + "grad_norm": 0.9730363488197327, + "learning_rate": 8.926478583942488e-05, + "loss": 2.5678, + "step": 18335 + }, + { + "epoch": 1.661208126656248, + "grad_norm": 0.9229373931884766, + "learning_rate": 8.925874463843412e-05, + "loss": 2.8247, + "step": 18336 + }, + { + "epoch": 1.6612987248306945, + "grad_norm": 0.876155436038971, + "learning_rate": 8.925270343744336e-05, + "loss": 2.8642, + "step": 18337 + }, + { + "epoch": 1.6613893230051415, + "grad_norm": 0.842025101184845, + "learning_rate": 8.924666223645261e-05, + "loss": 2.9168, + "step": 18338 + }, + { + "epoch": 1.661479921179588, + "grad_norm": 0.9711382389068604, + "learning_rate": 8.924062103546186e-05, + "loss": 2.4826, + "step": 18339 + }, + { + "epoch": 1.661570519354035, + "grad_norm": 0.9300910830497742, + "learning_rate": 8.923457983447109e-05, + "loss": 2.8641, + "step": 18340 + }, + { + "epoch": 1.6616611175284817, + "grad_norm": 0.8540501594543457, + "learning_rate": 8.922853863348034e-05, + "loss": 2.5777, + "step": 18341 + }, + { + "epoch": 1.6617517157029287, + "grad_norm": 0.9076122641563416, + "learning_rate": 8.922249743248959e-05, + "loss": 2.5537, + "step": 18342 + }, + { + "epoch": 1.6618423138773752, + "grad_norm": 0.8900040984153748, + "learning_rate": 8.921645623149882e-05, + "loss": 2.9776, + "step": 18343 + }, + { + "epoch": 1.6619329120518223, + "grad_norm": 0.829642653465271, + "learning_rate": 8.921041503050807e-05, + "loss": 2.2662, + "step": 18344 + }, + { + "epoch": 1.6620235102262688, + "grad_norm": 0.8779907822608948, + "learning_rate": 8.92043738295173e-05, + "loss": 2.5882, + "step": 18345 + }, + { + "epoch": 1.6621141084007158, + "grad_norm": 0.751255989074707, + "learning_rate": 8.919833262852657e-05, + "loss": 2.0753, + "step": 18346 + }, + { + "epoch": 1.6622047065751624, + "grad_norm": 0.963580846786499, + "learning_rate": 8.91922914275358e-05, + "loss": 2.525, + "step": 18347 + }, + { + "epoch": 1.6622953047496094, + "grad_norm": 0.9239116907119751, + "learning_rate": 8.918625022654505e-05, + "loss": 2.684, + "step": 18348 + }, + { + "epoch": 1.662385902924056, + "grad_norm": 0.953765332698822, + "learning_rate": 8.918020902555428e-05, + "loss": 2.7083, + "step": 18349 + }, + { + "epoch": 1.662476501098503, + "grad_norm": 0.886603832244873, + "learning_rate": 8.917416782456353e-05, + "loss": 2.5609, + "step": 18350 + }, + { + "epoch": 1.6625670992729495, + "grad_norm": 0.9148468971252441, + "learning_rate": 8.916812662357276e-05, + "loss": 2.612, + "step": 18351 + }, + { + "epoch": 1.6626576974473966, + "grad_norm": 0.9573349952697754, + "learning_rate": 8.916208542258201e-05, + "loss": 2.9581, + "step": 18352 + }, + { + "epoch": 1.6627482956218431, + "grad_norm": 0.9173582196235657, + "learning_rate": 8.915604422159126e-05, + "loss": 2.8976, + "step": 18353 + }, + { + "epoch": 1.6628388937962901, + "grad_norm": 0.8901341557502747, + "learning_rate": 8.91500030206005e-05, + "loss": 2.4833, + "step": 18354 + }, + { + "epoch": 1.6629294919707367, + "grad_norm": 0.9340344071388245, + "learning_rate": 8.914396181960974e-05, + "loss": 2.8167, + "step": 18355 + }, + { + "epoch": 1.6630200901451837, + "grad_norm": 0.9101521968841553, + "learning_rate": 8.913792061861899e-05, + "loss": 2.6519, + "step": 18356 + }, + { + "epoch": 1.6631106883196303, + "grad_norm": 0.961995005607605, + "learning_rate": 8.913187941762822e-05, + "loss": 2.7295, + "step": 18357 + }, + { + "epoch": 1.6632012864940773, + "grad_norm": 0.8727536201477051, + "learning_rate": 8.912583821663747e-05, + "loss": 2.7372, + "step": 18358 + }, + { + "epoch": 1.6632918846685238, + "grad_norm": 0.9127344489097595, + "learning_rate": 8.911979701564672e-05, + "loss": 2.9276, + "step": 18359 + }, + { + "epoch": 1.6633824828429709, + "grad_norm": 0.7695741653442383, + "learning_rate": 8.911375581465595e-05, + "loss": 2.1214, + "step": 18360 + }, + { + "epoch": 1.6634730810174174, + "grad_norm": 0.7562899589538574, + "learning_rate": 8.910771461366521e-05, + "loss": 2.2675, + "step": 18361 + }, + { + "epoch": 1.6635636791918644, + "grad_norm": 0.8583160042762756, + "learning_rate": 8.910167341267445e-05, + "loss": 2.406, + "step": 18362 + }, + { + "epoch": 1.663654277366311, + "grad_norm": 0.8862966895103455, + "learning_rate": 8.90956322116837e-05, + "loss": 2.7077, + "step": 18363 + }, + { + "epoch": 1.663744875540758, + "grad_norm": 0.8911168575286865, + "learning_rate": 8.908959101069293e-05, + "loss": 2.4984, + "step": 18364 + }, + { + "epoch": 1.6638354737152046, + "grad_norm": 0.8831158876419067, + "learning_rate": 8.908354980970217e-05, + "loss": 2.4417, + "step": 18365 + }, + { + "epoch": 1.6639260718896516, + "grad_norm": 0.8224755525588989, + "learning_rate": 8.907750860871141e-05, + "loss": 2.0278, + "step": 18366 + }, + { + "epoch": 1.6640166700640981, + "grad_norm": 0.8734176754951477, + "learning_rate": 8.907146740772066e-05, + "loss": 2.7749, + "step": 18367 + }, + { + "epoch": 1.6641072682385452, + "grad_norm": 1.0198256969451904, + "learning_rate": 8.90654262067299e-05, + "loss": 2.5426, + "step": 18368 + }, + { + "epoch": 1.6641978664129917, + "grad_norm": 1.0031156539916992, + "learning_rate": 8.905938500573915e-05, + "loss": 2.6987, + "step": 18369 + }, + { + "epoch": 1.6642884645874385, + "grad_norm": 0.8994479179382324, + "learning_rate": 8.905334380474839e-05, + "loss": 2.741, + "step": 18370 + }, + { + "epoch": 1.6643790627618853, + "grad_norm": 0.9215230345726013, + "learning_rate": 8.904730260375763e-05, + "loss": 2.6556, + "step": 18371 + }, + { + "epoch": 1.664469660936332, + "grad_norm": 0.9802262783050537, + "learning_rate": 8.904126140276687e-05, + "loss": 2.8253, + "step": 18372 + }, + { + "epoch": 1.6645602591107789, + "grad_norm": 0.876674234867096, + "learning_rate": 8.903522020177612e-05, + "loss": 2.4243, + "step": 18373 + }, + { + "epoch": 1.6646508572852257, + "grad_norm": 0.8161757588386536, + "learning_rate": 8.902917900078536e-05, + "loss": 2.2484, + "step": 18374 + }, + { + "epoch": 1.6647414554596724, + "grad_norm": 0.938705325126648, + "learning_rate": 8.90231377997946e-05, + "loss": 2.7833, + "step": 18375 + }, + { + "epoch": 1.6648320536341192, + "grad_norm": 0.7504476308822632, + "learning_rate": 8.901709659880384e-05, + "loss": 1.8785, + "step": 18376 + }, + { + "epoch": 1.664922651808566, + "grad_norm": 0.9316332936286926, + "learning_rate": 8.901105539781309e-05, + "loss": 2.6991, + "step": 18377 + }, + { + "epoch": 1.6650132499830128, + "grad_norm": 0.9534742832183838, + "learning_rate": 8.900501419682234e-05, + "loss": 2.6937, + "step": 18378 + }, + { + "epoch": 1.6651038481574596, + "grad_norm": 0.8956654071807861, + "learning_rate": 8.899897299583157e-05, + "loss": 2.9119, + "step": 18379 + }, + { + "epoch": 1.6651944463319064, + "grad_norm": 0.9255574345588684, + "learning_rate": 8.899293179484082e-05, + "loss": 2.602, + "step": 18380 + }, + { + "epoch": 1.6652850445063532, + "grad_norm": 0.9189168810844421, + "learning_rate": 8.898689059385006e-05, + "loss": 2.7927, + "step": 18381 + }, + { + "epoch": 1.6653756426808, + "grad_norm": 0.902066707611084, + "learning_rate": 8.89808493928593e-05, + "loss": 2.805, + "step": 18382 + }, + { + "epoch": 1.6654662408552467, + "grad_norm": 0.9030341506004333, + "learning_rate": 8.897480819186855e-05, + "loss": 2.8193, + "step": 18383 + }, + { + "epoch": 1.6655568390296935, + "grad_norm": 0.9740949273109436, + "learning_rate": 8.89687669908778e-05, + "loss": 2.9798, + "step": 18384 + }, + { + "epoch": 1.6656474372041403, + "grad_norm": 0.9713550806045532, + "learning_rate": 8.896272578988703e-05, + "loss": 2.7672, + "step": 18385 + }, + { + "epoch": 1.6657380353785871, + "grad_norm": 0.8660795092582703, + "learning_rate": 8.895668458889628e-05, + "loss": 2.5796, + "step": 18386 + }, + { + "epoch": 1.665828633553034, + "grad_norm": 0.865691065788269, + "learning_rate": 8.895064338790551e-05, + "loss": 2.7765, + "step": 18387 + }, + { + "epoch": 1.6659192317274807, + "grad_norm": 0.8901170492172241, + "learning_rate": 8.894460218691476e-05, + "loss": 2.8412, + "step": 18388 + }, + { + "epoch": 1.6660098299019275, + "grad_norm": 0.8978349566459656, + "learning_rate": 8.8938560985924e-05, + "loss": 2.7267, + "step": 18389 + }, + { + "epoch": 1.6661004280763743, + "grad_norm": 0.9196301698684692, + "learning_rate": 8.893251978493324e-05, + "loss": 2.7607, + "step": 18390 + }, + { + "epoch": 1.666191026250821, + "grad_norm": 0.9113948941230774, + "learning_rate": 8.892647858394249e-05, + "loss": 2.78, + "step": 18391 + }, + { + "epoch": 1.6662816244252678, + "grad_norm": 0.8300904035568237, + "learning_rate": 8.892043738295174e-05, + "loss": 1.7559, + "step": 18392 + }, + { + "epoch": 1.6663722225997146, + "grad_norm": 0.9850376844406128, + "learning_rate": 8.891439618196099e-05, + "loss": 2.7642, + "step": 18393 + }, + { + "epoch": 1.6664628207741614, + "grad_norm": 0.9447972178459167, + "learning_rate": 8.890835498097022e-05, + "loss": 2.698, + "step": 18394 + }, + { + "epoch": 1.6665534189486082, + "grad_norm": 0.8603693246841431, + "learning_rate": 8.890231377997947e-05, + "loss": 2.4923, + "step": 18395 + }, + { + "epoch": 1.666644017123055, + "grad_norm": 0.8168054223060608, + "learning_rate": 8.88962725789887e-05, + "loss": 2.1931, + "step": 18396 + }, + { + "epoch": 1.6667346152975018, + "grad_norm": 0.9199009537696838, + "learning_rate": 8.889023137799795e-05, + "loss": 2.5803, + "step": 18397 + }, + { + "epoch": 1.6668252134719486, + "grad_norm": 0.8698669672012329, + "learning_rate": 8.88841901770072e-05, + "loss": 2.6282, + "step": 18398 + }, + { + "epoch": 1.6669158116463954, + "grad_norm": 0.8904764652252197, + "learning_rate": 8.887814897601644e-05, + "loss": 2.6534, + "step": 18399 + }, + { + "epoch": 1.6670064098208421, + "grad_norm": 0.9027024507522583, + "learning_rate": 8.887210777502568e-05, + "loss": 2.8205, + "step": 18400 + }, + { + "epoch": 1.667097007995289, + "grad_norm": 0.8345210552215576, + "learning_rate": 8.886606657403493e-05, + "loss": 2.6421, + "step": 18401 + }, + { + "epoch": 1.6671876061697357, + "grad_norm": 0.9798306226730347, + "learning_rate": 8.886002537304416e-05, + "loss": 2.7206, + "step": 18402 + }, + { + "epoch": 1.6672782043441825, + "grad_norm": 0.895845890045166, + "learning_rate": 8.885398417205341e-05, + "loss": 2.5552, + "step": 18403 + }, + { + "epoch": 1.6673688025186293, + "grad_norm": 0.8748459219932556, + "learning_rate": 8.884794297106264e-05, + "loss": 2.5313, + "step": 18404 + }, + { + "epoch": 1.667459400693076, + "grad_norm": 0.9216280579566956, + "learning_rate": 8.884190177007189e-05, + "loss": 2.4567, + "step": 18405 + }, + { + "epoch": 1.6675499988675229, + "grad_norm": 1.0004403591156006, + "learning_rate": 8.883586056908114e-05, + "loss": 2.5873, + "step": 18406 + }, + { + "epoch": 1.6676405970419697, + "grad_norm": 0.9369929432868958, + "learning_rate": 8.882981936809038e-05, + "loss": 2.6439, + "step": 18407 + }, + { + "epoch": 1.6677311952164164, + "grad_norm": 0.8970972299575806, + "learning_rate": 8.882377816709962e-05, + "loss": 2.6644, + "step": 18408 + }, + { + "epoch": 1.6678217933908632, + "grad_norm": 0.8255478143692017, + "learning_rate": 8.881773696610887e-05, + "loss": 2.6697, + "step": 18409 + }, + { + "epoch": 1.6679123915653098, + "grad_norm": 0.9689391255378723, + "learning_rate": 8.881169576511811e-05, + "loss": 2.5556, + "step": 18410 + }, + { + "epoch": 1.6680029897397568, + "grad_norm": 0.8888877630233765, + "learning_rate": 8.880565456412735e-05, + "loss": 2.3675, + "step": 18411 + }, + { + "epoch": 1.6680935879142034, + "grad_norm": 0.8752612471580505, + "learning_rate": 8.87996133631366e-05, + "loss": 2.5615, + "step": 18412 + }, + { + "epoch": 1.6681841860886504, + "grad_norm": 0.8692070245742798, + "learning_rate": 8.879357216214584e-05, + "loss": 2.3477, + "step": 18413 + }, + { + "epoch": 1.668274784263097, + "grad_norm": 0.9146534204483032, + "learning_rate": 8.878753096115509e-05, + "loss": 2.0519, + "step": 18414 + }, + { + "epoch": 1.668365382437544, + "grad_norm": 0.9245871901512146, + "learning_rate": 8.878148976016432e-05, + "loss": 2.686, + "step": 18415 + }, + { + "epoch": 1.6684559806119905, + "grad_norm": 1.0531154870986938, + "learning_rate": 8.877544855917357e-05, + "loss": 2.4542, + "step": 18416 + }, + { + "epoch": 1.6685465787864375, + "grad_norm": 0.9058739542961121, + "learning_rate": 8.87694073581828e-05, + "loss": 2.7529, + "step": 18417 + }, + { + "epoch": 1.668637176960884, + "grad_norm": 1.0206421613693237, + "learning_rate": 8.876336615719205e-05, + "loss": 2.495, + "step": 18418 + }, + { + "epoch": 1.668727775135331, + "grad_norm": 0.8912993669509888, + "learning_rate": 8.875732495620129e-05, + "loss": 2.7174, + "step": 18419 + }, + { + "epoch": 1.6688183733097777, + "grad_norm": 0.8130811452865601, + "learning_rate": 8.875128375521054e-05, + "loss": 2.0375, + "step": 18420 + }, + { + "epoch": 1.6689089714842247, + "grad_norm": 0.8329029679298401, + "learning_rate": 8.874524255421978e-05, + "loss": 2.0074, + "step": 18421 + }, + { + "epoch": 1.6689995696586712, + "grad_norm": 0.879607617855072, + "learning_rate": 8.873920135322903e-05, + "loss": 2.8252, + "step": 18422 + }, + { + "epoch": 1.6690901678331183, + "grad_norm": 0.8790045380592346, + "learning_rate": 8.873316015223827e-05, + "loss": 2.5491, + "step": 18423 + }, + { + "epoch": 1.6691807660075648, + "grad_norm": 0.8312674164772034, + "learning_rate": 8.872711895124751e-05, + "loss": 2.6272, + "step": 18424 + }, + { + "epoch": 1.6692713641820118, + "grad_norm": 1.0047016143798828, + "learning_rate": 8.872107775025676e-05, + "loss": 2.829, + "step": 18425 + }, + { + "epoch": 1.6693619623564584, + "grad_norm": 0.855423629283905, + "learning_rate": 8.8715036549266e-05, + "loss": 2.722, + "step": 18426 + }, + { + "epoch": 1.6694525605309054, + "grad_norm": 0.9346977472305298, + "learning_rate": 8.870899534827524e-05, + "loss": 2.7706, + "step": 18427 + }, + { + "epoch": 1.669543158705352, + "grad_norm": 0.8524986505508423, + "learning_rate": 8.870295414728449e-05, + "loss": 2.6174, + "step": 18428 + }, + { + "epoch": 1.669633756879799, + "grad_norm": 0.9072665572166443, + "learning_rate": 8.869691294629374e-05, + "loss": 2.8411, + "step": 18429 + }, + { + "epoch": 1.6697243550542455, + "grad_norm": 0.910940408706665, + "learning_rate": 8.869087174530297e-05, + "loss": 2.5955, + "step": 18430 + }, + { + "epoch": 1.6698149532286926, + "grad_norm": 0.9643276929855347, + "learning_rate": 8.868483054431222e-05, + "loss": 2.7169, + "step": 18431 + }, + { + "epoch": 1.6699055514031391, + "grad_norm": 0.8616435527801514, + "learning_rate": 8.867878934332145e-05, + "loss": 2.6761, + "step": 18432 + }, + { + "epoch": 1.6699961495775861, + "grad_norm": 0.8738482594490051, + "learning_rate": 8.86727481423307e-05, + "loss": 2.6758, + "step": 18433 + }, + { + "epoch": 1.6700867477520327, + "grad_norm": 0.9403178691864014, + "learning_rate": 8.866670694133993e-05, + "loss": 2.4526, + "step": 18434 + }, + { + "epoch": 1.6701773459264797, + "grad_norm": 0.9614265561103821, + "learning_rate": 8.866066574034918e-05, + "loss": 2.8063, + "step": 18435 + }, + { + "epoch": 1.6702679441009263, + "grad_norm": 0.8816977143287659, + "learning_rate": 8.865462453935843e-05, + "loss": 2.8704, + "step": 18436 + }, + { + "epoch": 1.6703585422753733, + "grad_norm": 0.9089474081993103, + "learning_rate": 8.864858333836768e-05, + "loss": 2.5065, + "step": 18437 + }, + { + "epoch": 1.6704491404498198, + "grad_norm": 0.8354084491729736, + "learning_rate": 8.864254213737691e-05, + "loss": 2.1259, + "step": 18438 + }, + { + "epoch": 1.6705397386242669, + "grad_norm": 0.9287251234054565, + "learning_rate": 8.863650093638616e-05, + "loss": 2.7514, + "step": 18439 + }, + { + "epoch": 1.6706303367987134, + "grad_norm": 0.7798693776130676, + "learning_rate": 8.863045973539539e-05, + "loss": 1.7633, + "step": 18440 + }, + { + "epoch": 1.6707209349731604, + "grad_norm": 0.7552372813224792, + "learning_rate": 8.862441853440464e-05, + "loss": 2.0458, + "step": 18441 + }, + { + "epoch": 1.670811533147607, + "grad_norm": 0.9784806966781616, + "learning_rate": 8.861837733341389e-05, + "loss": 2.8124, + "step": 18442 + }, + { + "epoch": 1.670902131322054, + "grad_norm": 0.8788432478904724, + "learning_rate": 8.861233613242314e-05, + "loss": 2.5815, + "step": 18443 + }, + { + "epoch": 1.6709927294965006, + "grad_norm": 0.9104172587394714, + "learning_rate": 8.860629493143237e-05, + "loss": 2.7326, + "step": 18444 + }, + { + "epoch": 1.6710833276709476, + "grad_norm": 0.9295267462730408, + "learning_rate": 8.860025373044162e-05, + "loss": 3.0039, + "step": 18445 + }, + { + "epoch": 1.6711739258453941, + "grad_norm": 0.9201570749282837, + "learning_rate": 8.859421252945087e-05, + "loss": 2.8159, + "step": 18446 + }, + { + "epoch": 1.6712645240198412, + "grad_norm": 0.8156833648681641, + "learning_rate": 8.85881713284601e-05, + "loss": 2.0384, + "step": 18447 + }, + { + "epoch": 1.6713551221942877, + "grad_norm": 0.903316080570221, + "learning_rate": 8.858213012746935e-05, + "loss": 2.7031, + "step": 18448 + }, + { + "epoch": 1.6714457203687347, + "grad_norm": 0.8836400508880615, + "learning_rate": 8.857608892647858e-05, + "loss": 2.6887, + "step": 18449 + }, + { + "epoch": 1.6715363185431813, + "grad_norm": 0.8837777376174927, + "learning_rate": 8.857004772548783e-05, + "loss": 2.626, + "step": 18450 + }, + { + "epoch": 1.671626916717628, + "grad_norm": 0.7957376837730408, + "learning_rate": 8.856400652449708e-05, + "loss": 1.8956, + "step": 18451 + }, + { + "epoch": 1.6717175148920749, + "grad_norm": 0.9436192512512207, + "learning_rate": 8.855796532350632e-05, + "loss": 2.6481, + "step": 18452 + }, + { + "epoch": 1.6718081130665217, + "grad_norm": 0.8652886152267456, + "learning_rate": 8.855192412251556e-05, + "loss": 2.941, + "step": 18453 + }, + { + "epoch": 1.6718987112409684, + "grad_norm": 0.9190033674240112, + "learning_rate": 8.85458829215248e-05, + "loss": 2.8585, + "step": 18454 + }, + { + "epoch": 1.6719893094154152, + "grad_norm": 0.9329369068145752, + "learning_rate": 8.853984172053404e-05, + "loss": 2.7396, + "step": 18455 + }, + { + "epoch": 1.672079907589862, + "grad_norm": 0.8899609446525574, + "learning_rate": 8.853380051954329e-05, + "loss": 2.7361, + "step": 18456 + }, + { + "epoch": 1.6721705057643088, + "grad_norm": 0.9297319650650024, + "learning_rate": 8.852775931855253e-05, + "loss": 2.71, + "step": 18457 + }, + { + "epoch": 1.6722611039387556, + "grad_norm": 0.8987589478492737, + "learning_rate": 8.852171811756178e-05, + "loss": 2.6467, + "step": 18458 + }, + { + "epoch": 1.6723517021132024, + "grad_norm": 0.9584594368934631, + "learning_rate": 8.851567691657102e-05, + "loss": 2.7663, + "step": 18459 + }, + { + "epoch": 1.6724423002876492, + "grad_norm": 1.0972609519958496, + "learning_rate": 8.850963571558026e-05, + "loss": 2.5537, + "step": 18460 + }, + { + "epoch": 1.672532898462096, + "grad_norm": 0.8442614674568176, + "learning_rate": 8.850359451458951e-05, + "loss": 2.0735, + "step": 18461 + }, + { + "epoch": 1.6726234966365428, + "grad_norm": 0.8509442806243896, + "learning_rate": 8.849755331359875e-05, + "loss": 2.5779, + "step": 18462 + }, + { + "epoch": 1.6727140948109895, + "grad_norm": 0.8962050080299377, + "learning_rate": 8.849151211260799e-05, + "loss": 2.5739, + "step": 18463 + }, + { + "epoch": 1.6728046929854363, + "grad_norm": 0.9454655647277832, + "learning_rate": 8.848547091161723e-05, + "loss": 2.7541, + "step": 18464 + }, + { + "epoch": 1.6728952911598831, + "grad_norm": 1.0044314861297607, + "learning_rate": 8.847942971062647e-05, + "loss": 2.772, + "step": 18465 + }, + { + "epoch": 1.67298588933433, + "grad_norm": 0.8645709753036499, + "learning_rate": 8.847338850963572e-05, + "loss": 2.6437, + "step": 18466 + }, + { + "epoch": 1.6730764875087767, + "grad_norm": 0.9280937314033508, + "learning_rate": 8.846734730864497e-05, + "loss": 2.8683, + "step": 18467 + }, + { + "epoch": 1.6731670856832235, + "grad_norm": 0.8553311824798584, + "learning_rate": 8.84613061076542e-05, + "loss": 2.1133, + "step": 18468 + }, + { + "epoch": 1.6732576838576703, + "grad_norm": 0.9231939911842346, + "learning_rate": 8.845526490666345e-05, + "loss": 2.7468, + "step": 18469 + }, + { + "epoch": 1.673348282032117, + "grad_norm": 0.9616873264312744, + "learning_rate": 8.844922370567269e-05, + "loss": 2.7049, + "step": 18470 + }, + { + "epoch": 1.6734388802065638, + "grad_norm": 1.0278455018997192, + "learning_rate": 8.844318250468193e-05, + "loss": 2.508, + "step": 18471 + }, + { + "epoch": 1.6735294783810106, + "grad_norm": 0.9292929768562317, + "learning_rate": 8.843714130369117e-05, + "loss": 2.7477, + "step": 18472 + }, + { + "epoch": 1.6736200765554574, + "grad_norm": 0.894249439239502, + "learning_rate": 8.843110010270043e-05, + "loss": 2.7285, + "step": 18473 + }, + { + "epoch": 1.6737106747299042, + "grad_norm": 0.9214165210723877, + "learning_rate": 8.842505890170966e-05, + "loss": 2.5716, + "step": 18474 + }, + { + "epoch": 1.673801272904351, + "grad_norm": 0.7987234592437744, + "learning_rate": 8.841901770071891e-05, + "loss": 2.222, + "step": 18475 + }, + { + "epoch": 1.6738918710787978, + "grad_norm": 0.9811480045318604, + "learning_rate": 8.841297649972814e-05, + "loss": 2.8628, + "step": 18476 + }, + { + "epoch": 1.6739824692532446, + "grad_norm": 0.9127530455589294, + "learning_rate": 8.840693529873739e-05, + "loss": 2.641, + "step": 18477 + }, + { + "epoch": 1.6740730674276914, + "grad_norm": 0.8810842037200928, + "learning_rate": 8.840089409774664e-05, + "loss": 2.8283, + "step": 18478 + }, + { + "epoch": 1.6741636656021381, + "grad_norm": 0.9211958050727844, + "learning_rate": 8.839485289675587e-05, + "loss": 2.7367, + "step": 18479 + }, + { + "epoch": 1.674254263776585, + "grad_norm": 0.7542299628257751, + "learning_rate": 8.838881169576512e-05, + "loss": 2.1069, + "step": 18480 + }, + { + "epoch": 1.6743448619510317, + "grad_norm": 0.8384354114532471, + "learning_rate": 8.838277049477437e-05, + "loss": 2.736, + "step": 18481 + }, + { + "epoch": 1.6744354601254785, + "grad_norm": 0.894424319267273, + "learning_rate": 8.837672929378362e-05, + "loss": 2.6418, + "step": 18482 + }, + { + "epoch": 1.6745260582999253, + "grad_norm": 0.8985494375228882, + "learning_rate": 8.837068809279285e-05, + "loss": 2.654, + "step": 18483 + }, + { + "epoch": 1.674616656474372, + "grad_norm": 1.0238800048828125, + "learning_rate": 8.83646468918021e-05, + "loss": 2.8941, + "step": 18484 + }, + { + "epoch": 1.6747072546488189, + "grad_norm": 0.9368466734886169, + "learning_rate": 8.835860569081133e-05, + "loss": 2.8824, + "step": 18485 + }, + { + "epoch": 1.6747978528232657, + "grad_norm": 0.9046407341957092, + "learning_rate": 8.835256448982058e-05, + "loss": 2.9476, + "step": 18486 + }, + { + "epoch": 1.6748884509977124, + "grad_norm": 0.9405519366264343, + "learning_rate": 8.834652328882981e-05, + "loss": 2.3881, + "step": 18487 + }, + { + "epoch": 1.6749790491721592, + "grad_norm": 0.8976496458053589, + "learning_rate": 8.834048208783907e-05, + "loss": 2.8201, + "step": 18488 + }, + { + "epoch": 1.675069647346606, + "grad_norm": 0.8967608213424683, + "learning_rate": 8.833444088684831e-05, + "loss": 2.9311, + "step": 18489 + }, + { + "epoch": 1.6751602455210528, + "grad_norm": 0.763492226600647, + "learning_rate": 8.832839968585756e-05, + "loss": 2.0259, + "step": 18490 + }, + { + "epoch": 1.6752508436954994, + "grad_norm": 0.7598515748977661, + "learning_rate": 8.832235848486679e-05, + "loss": 1.7525, + "step": 18491 + }, + { + "epoch": 1.6753414418699464, + "grad_norm": 0.7697839736938477, + "learning_rate": 8.831631728387604e-05, + "loss": 1.9302, + "step": 18492 + }, + { + "epoch": 1.675432040044393, + "grad_norm": 0.9478750824928284, + "learning_rate": 8.831027608288529e-05, + "loss": 3.0404, + "step": 18493 + }, + { + "epoch": 1.67552263821884, + "grad_norm": 0.9099804759025574, + "learning_rate": 8.830423488189452e-05, + "loss": 2.7262, + "step": 18494 + }, + { + "epoch": 1.6756132363932865, + "grad_norm": 0.878356397151947, + "learning_rate": 8.829819368090377e-05, + "loss": 2.4786, + "step": 18495 + }, + { + "epoch": 1.6757038345677335, + "grad_norm": 0.9284319877624512, + "learning_rate": 8.829215247991301e-05, + "loss": 2.6244, + "step": 18496 + }, + { + "epoch": 1.67579443274218, + "grad_norm": 0.8974910378456116, + "learning_rate": 8.828611127892226e-05, + "loss": 2.5566, + "step": 18497 + }, + { + "epoch": 1.675885030916627, + "grad_norm": 0.9887363314628601, + "learning_rate": 8.82800700779315e-05, + "loss": 2.7461, + "step": 18498 + }, + { + "epoch": 1.6759756290910737, + "grad_norm": 0.8901341557502747, + "learning_rate": 8.827402887694074e-05, + "loss": 2.7514, + "step": 18499 + }, + { + "epoch": 1.6760662272655207, + "grad_norm": 0.8846592307090759, + "learning_rate": 8.826798767594998e-05, + "loss": 2.7734, + "step": 18500 + }, + { + "epoch": 1.6761568254399672, + "grad_norm": 0.9464463591575623, + "learning_rate": 8.826194647495923e-05, + "loss": 2.6307, + "step": 18501 + }, + { + "epoch": 1.6762474236144143, + "grad_norm": 0.9108757376670837, + "learning_rate": 8.825590527396846e-05, + "loss": 2.6484, + "step": 18502 + }, + { + "epoch": 1.6763380217888608, + "grad_norm": 0.7951147556304932, + "learning_rate": 8.824986407297772e-05, + "loss": 2.3274, + "step": 18503 + }, + { + "epoch": 1.6764286199633078, + "grad_norm": 0.9224326014518738, + "learning_rate": 8.824382287198696e-05, + "loss": 2.6004, + "step": 18504 + }, + { + "epoch": 1.6765192181377544, + "grad_norm": 0.8417302370071411, + "learning_rate": 8.82377816709962e-05, + "loss": 2.6192, + "step": 18505 + }, + { + "epoch": 1.6766098163122014, + "grad_norm": 1.0002580881118774, + "learning_rate": 8.823174047000544e-05, + "loss": 2.7109, + "step": 18506 + }, + { + "epoch": 1.676700414486648, + "grad_norm": 0.9428223967552185, + "learning_rate": 8.822569926901468e-05, + "loss": 2.6533, + "step": 18507 + }, + { + "epoch": 1.676791012661095, + "grad_norm": 0.8857312202453613, + "learning_rate": 8.821965806802392e-05, + "loss": 2.8982, + "step": 18508 + }, + { + "epoch": 1.6768816108355415, + "grad_norm": 0.9319019913673401, + "learning_rate": 8.821361686703317e-05, + "loss": 2.6642, + "step": 18509 + }, + { + "epoch": 1.6769722090099886, + "grad_norm": 0.9573090672492981, + "learning_rate": 8.820757566604241e-05, + "loss": 2.7221, + "step": 18510 + }, + { + "epoch": 1.6770628071844351, + "grad_norm": 0.8921038508415222, + "learning_rate": 8.820153446505166e-05, + "loss": 2.6904, + "step": 18511 + }, + { + "epoch": 1.6771534053588821, + "grad_norm": 0.8927163481712341, + "learning_rate": 8.819549326406091e-05, + "loss": 2.7536, + "step": 18512 + }, + { + "epoch": 1.6772440035333287, + "grad_norm": 0.9337182641029358, + "learning_rate": 8.818945206307014e-05, + "loss": 2.7155, + "step": 18513 + }, + { + "epoch": 1.6773346017077757, + "grad_norm": 0.8622167706489563, + "learning_rate": 8.818341086207939e-05, + "loss": 2.9333, + "step": 18514 + }, + { + "epoch": 1.6774251998822223, + "grad_norm": 0.7073259949684143, + "learning_rate": 8.817736966108862e-05, + "loss": 1.9498, + "step": 18515 + }, + { + "epoch": 1.6775157980566693, + "grad_norm": 0.8859599232673645, + "learning_rate": 8.817132846009787e-05, + "loss": 2.7793, + "step": 18516 + }, + { + "epoch": 1.6776063962311158, + "grad_norm": 0.8937868475914001, + "learning_rate": 8.81652872591071e-05, + "loss": 2.7753, + "step": 18517 + }, + { + "epoch": 1.6776969944055629, + "grad_norm": 0.77039635181427, + "learning_rate": 8.815924605811637e-05, + "loss": 2.275, + "step": 18518 + }, + { + "epoch": 1.6777875925800094, + "grad_norm": 0.9105709195137024, + "learning_rate": 8.81532048571256e-05, + "loss": 2.9024, + "step": 18519 + }, + { + "epoch": 1.6778781907544564, + "grad_norm": 0.8675664663314819, + "learning_rate": 8.814716365613485e-05, + "loss": 2.5876, + "step": 18520 + }, + { + "epoch": 1.677968788928903, + "grad_norm": 0.8765612840652466, + "learning_rate": 8.814112245514408e-05, + "loss": 2.683, + "step": 18521 + }, + { + "epoch": 1.67805938710335, + "grad_norm": 0.930168092250824, + "learning_rate": 8.813508125415333e-05, + "loss": 2.7342, + "step": 18522 + }, + { + "epoch": 1.6781499852777966, + "grad_norm": 0.9589544534683228, + "learning_rate": 8.812904005316256e-05, + "loss": 2.7781, + "step": 18523 + }, + { + "epoch": 1.6782405834522436, + "grad_norm": 0.8824976682662964, + "learning_rate": 8.812299885217181e-05, + "loss": 2.5461, + "step": 18524 + }, + { + "epoch": 1.6783311816266901, + "grad_norm": 0.9068827629089355, + "learning_rate": 8.811695765118106e-05, + "loss": 2.7716, + "step": 18525 + }, + { + "epoch": 1.6784217798011372, + "grad_norm": 0.9209137558937073, + "learning_rate": 8.811091645019031e-05, + "loss": 2.8797, + "step": 18526 + }, + { + "epoch": 1.6785123779755837, + "grad_norm": 0.909557044506073, + "learning_rate": 8.810487524919954e-05, + "loss": 2.5921, + "step": 18527 + }, + { + "epoch": 1.6786029761500307, + "grad_norm": 0.8661126494407654, + "learning_rate": 8.809883404820879e-05, + "loss": 2.687, + "step": 18528 + }, + { + "epoch": 1.6786935743244773, + "grad_norm": 0.9590510129928589, + "learning_rate": 8.809279284721804e-05, + "loss": 2.6672, + "step": 18529 + }, + { + "epoch": 1.6787841724989243, + "grad_norm": 0.9036756157875061, + "learning_rate": 8.808675164622727e-05, + "loss": 2.7158, + "step": 18530 + }, + { + "epoch": 1.6788747706733709, + "grad_norm": 0.8224554657936096, + "learning_rate": 8.808071044523652e-05, + "loss": 2.5843, + "step": 18531 + }, + { + "epoch": 1.6789653688478177, + "grad_norm": 0.9334282279014587, + "learning_rate": 8.807466924424575e-05, + "loss": 2.7495, + "step": 18532 + }, + { + "epoch": 1.6790559670222645, + "grad_norm": 0.9191049337387085, + "learning_rate": 8.806862804325501e-05, + "loss": 2.7945, + "step": 18533 + }, + { + "epoch": 1.6791465651967112, + "grad_norm": 0.9575837850570679, + "learning_rate": 8.806258684226425e-05, + "loss": 2.3891, + "step": 18534 + }, + { + "epoch": 1.679237163371158, + "grad_norm": 0.9429097771644592, + "learning_rate": 8.80565456412735e-05, + "loss": 2.9563, + "step": 18535 + }, + { + "epoch": 1.6793277615456048, + "grad_norm": 0.9394159913063049, + "learning_rate": 8.805050444028273e-05, + "loss": 2.5721, + "step": 18536 + }, + { + "epoch": 1.6794183597200516, + "grad_norm": 0.9352813363075256, + "learning_rate": 8.804446323929198e-05, + "loss": 2.6515, + "step": 18537 + }, + { + "epoch": 1.6795089578944984, + "grad_norm": 0.9126171469688416, + "learning_rate": 8.803842203830121e-05, + "loss": 2.7109, + "step": 18538 + }, + { + "epoch": 1.6795995560689452, + "grad_norm": 0.891503095626831, + "learning_rate": 8.803238083731046e-05, + "loss": 2.6963, + "step": 18539 + }, + { + "epoch": 1.679690154243392, + "grad_norm": 0.9299917817115784, + "learning_rate": 8.802633963631969e-05, + "loss": 2.8085, + "step": 18540 + }, + { + "epoch": 1.6797807524178388, + "grad_norm": 0.8802416324615479, + "learning_rate": 8.802029843532895e-05, + "loss": 2.7624, + "step": 18541 + }, + { + "epoch": 1.6798713505922855, + "grad_norm": 1.1057785749435425, + "learning_rate": 8.801425723433819e-05, + "loss": 2.1997, + "step": 18542 + }, + { + "epoch": 1.6799619487667323, + "grad_norm": 0.8428349494934082, + "learning_rate": 8.800821603334744e-05, + "loss": 2.1374, + "step": 18543 + }, + { + "epoch": 1.6800525469411791, + "grad_norm": 0.9954579472541809, + "learning_rate": 8.800217483235667e-05, + "loss": 2.726, + "step": 18544 + }, + { + "epoch": 1.680143145115626, + "grad_norm": 0.9385818243026733, + "learning_rate": 8.799613363136592e-05, + "loss": 2.7573, + "step": 18545 + }, + { + "epoch": 1.6802337432900727, + "grad_norm": 0.986182451248169, + "learning_rate": 8.799009243037516e-05, + "loss": 2.7274, + "step": 18546 + }, + { + "epoch": 1.6803243414645195, + "grad_norm": 0.936642587184906, + "learning_rate": 8.79840512293844e-05, + "loss": 2.6978, + "step": 18547 + }, + { + "epoch": 1.6804149396389663, + "grad_norm": 0.934877336025238, + "learning_rate": 8.797801002839366e-05, + "loss": 2.8395, + "step": 18548 + }, + { + "epoch": 1.680505537813413, + "grad_norm": 0.9747503399848938, + "learning_rate": 8.79719688274029e-05, + "loss": 3.0415, + "step": 18549 + }, + { + "epoch": 1.6805961359878598, + "grad_norm": 0.9385760426521301, + "learning_rate": 8.796592762641214e-05, + "loss": 2.631, + "step": 18550 + }, + { + "epoch": 1.6806867341623066, + "grad_norm": 0.731115996837616, + "learning_rate": 8.795988642542138e-05, + "loss": 2.0149, + "step": 18551 + }, + { + "epoch": 1.6807773323367534, + "grad_norm": 0.8943241834640503, + "learning_rate": 8.795384522443062e-05, + "loss": 2.6063, + "step": 18552 + }, + { + "epoch": 1.6808679305112002, + "grad_norm": 0.9194218516349792, + "learning_rate": 8.794780402343986e-05, + "loss": 2.7428, + "step": 18553 + }, + { + "epoch": 1.680958528685647, + "grad_norm": 0.9243901371955872, + "learning_rate": 8.79417628224491e-05, + "loss": 2.712, + "step": 18554 + }, + { + "epoch": 1.6810491268600938, + "grad_norm": 0.9101425409317017, + "learning_rate": 8.793572162145834e-05, + "loss": 2.5319, + "step": 18555 + }, + { + "epoch": 1.6811397250345406, + "grad_norm": 0.9330378770828247, + "learning_rate": 8.79296804204676e-05, + "loss": 2.8251, + "step": 18556 + }, + { + "epoch": 1.6812303232089874, + "grad_norm": 0.943444013595581, + "learning_rate": 8.792363921947683e-05, + "loss": 2.5814, + "step": 18557 + }, + { + "epoch": 1.6813209213834341, + "grad_norm": 0.9105261564254761, + "learning_rate": 8.791759801848608e-05, + "loss": 2.6757, + "step": 18558 + }, + { + "epoch": 1.681411519557881, + "grad_norm": 0.9515185952186584, + "learning_rate": 8.791155681749532e-05, + "loss": 2.6033, + "step": 18559 + }, + { + "epoch": 1.6815021177323277, + "grad_norm": 0.8731868267059326, + "learning_rate": 8.790551561650456e-05, + "loss": 2.7483, + "step": 18560 + }, + { + "epoch": 1.6815927159067745, + "grad_norm": 0.8488654494285583, + "learning_rate": 8.789947441551381e-05, + "loss": 2.1882, + "step": 18561 + }, + { + "epoch": 1.6816833140812213, + "grad_norm": 0.8740465641021729, + "learning_rate": 8.789343321452305e-05, + "loss": 2.6458, + "step": 18562 + }, + { + "epoch": 1.681773912255668, + "grad_norm": 1.0285841226577759, + "learning_rate": 8.788739201353229e-05, + "loss": 2.7306, + "step": 18563 + }, + { + "epoch": 1.6818645104301149, + "grad_norm": 0.9234151840209961, + "learning_rate": 8.788135081254154e-05, + "loss": 2.8693, + "step": 18564 + }, + { + "epoch": 1.6819551086045617, + "grad_norm": 0.8859221935272217, + "learning_rate": 8.787530961155079e-05, + "loss": 2.6335, + "step": 18565 + }, + { + "epoch": 1.6820457067790084, + "grad_norm": 0.9683419466018677, + "learning_rate": 8.786926841056002e-05, + "loss": 2.6465, + "step": 18566 + }, + { + "epoch": 1.6821363049534552, + "grad_norm": 0.8785679340362549, + "learning_rate": 8.786322720956927e-05, + "loss": 2.8699, + "step": 18567 + }, + { + "epoch": 1.682226903127902, + "grad_norm": 0.8700917363166809, + "learning_rate": 8.78571860085785e-05, + "loss": 2.6778, + "step": 18568 + }, + { + "epoch": 1.6823175013023488, + "grad_norm": 0.9502460360527039, + "learning_rate": 8.785114480758775e-05, + "loss": 2.6519, + "step": 18569 + }, + { + "epoch": 1.6824080994767956, + "grad_norm": 0.9125015735626221, + "learning_rate": 8.7845103606597e-05, + "loss": 2.5797, + "step": 18570 + }, + { + "epoch": 1.6824986976512424, + "grad_norm": 0.950901210308075, + "learning_rate": 8.783906240560625e-05, + "loss": 3.1248, + "step": 18571 + }, + { + "epoch": 1.682589295825689, + "grad_norm": 0.9065220355987549, + "learning_rate": 8.783302120461548e-05, + "loss": 2.7375, + "step": 18572 + }, + { + "epoch": 1.682679894000136, + "grad_norm": 0.6496978402137756, + "learning_rate": 8.782698000362473e-05, + "loss": 1.3322, + "step": 18573 + }, + { + "epoch": 1.6827704921745825, + "grad_norm": 0.876873791217804, + "learning_rate": 8.782093880263396e-05, + "loss": 2.8645, + "step": 18574 + }, + { + "epoch": 1.6828610903490295, + "grad_norm": 0.9010580778121948, + "learning_rate": 8.781489760164321e-05, + "loss": 3.0472, + "step": 18575 + }, + { + "epoch": 1.682951688523476, + "grad_norm": 0.9426229000091553, + "learning_rate": 8.780885640065244e-05, + "loss": 2.7026, + "step": 18576 + }, + { + "epoch": 1.683042286697923, + "grad_norm": 0.9231821894645691, + "learning_rate": 8.780281519966169e-05, + "loss": 3.0217, + "step": 18577 + }, + { + "epoch": 1.6831328848723697, + "grad_norm": 0.7356603741645813, + "learning_rate": 8.779677399867094e-05, + "loss": 1.9155, + "step": 18578 + }, + { + "epoch": 1.6832234830468167, + "grad_norm": 0.9398048520088196, + "learning_rate": 8.779073279768019e-05, + "loss": 2.7816, + "step": 18579 + }, + { + "epoch": 1.6833140812212632, + "grad_norm": 0.797260582447052, + "learning_rate": 8.778469159668943e-05, + "loss": 2.132, + "step": 18580 + }, + { + "epoch": 1.6834046793957103, + "grad_norm": 0.8900602459907532, + "learning_rate": 8.777865039569867e-05, + "loss": 2.6115, + "step": 18581 + }, + { + "epoch": 1.6834952775701568, + "grad_norm": 0.8301871418952942, + "learning_rate": 8.777260919470792e-05, + "loss": 2.4772, + "step": 18582 + }, + { + "epoch": 1.6835858757446038, + "grad_norm": 0.8461264371871948, + "learning_rate": 8.776656799371715e-05, + "loss": 2.6323, + "step": 18583 + }, + { + "epoch": 1.6836764739190504, + "grad_norm": 0.9236810207366943, + "learning_rate": 8.77605267927264e-05, + "loss": 2.6153, + "step": 18584 + }, + { + "epoch": 1.6837670720934974, + "grad_norm": 1.0357190370559692, + "learning_rate": 8.775448559173565e-05, + "loss": 2.8511, + "step": 18585 + }, + { + "epoch": 1.683857670267944, + "grad_norm": 0.8741876482963562, + "learning_rate": 8.774844439074489e-05, + "loss": 2.687, + "step": 18586 + }, + { + "epoch": 1.683948268442391, + "grad_norm": 0.9540687203407288, + "learning_rate": 8.774240318975413e-05, + "loss": 2.6034, + "step": 18587 + }, + { + "epoch": 1.6840388666168375, + "grad_norm": 0.8537337779998779, + "learning_rate": 8.773636198876337e-05, + "loss": 2.7275, + "step": 18588 + }, + { + "epoch": 1.6841294647912846, + "grad_norm": 0.8699935078620911, + "learning_rate": 8.773032078777261e-05, + "loss": 2.4719, + "step": 18589 + }, + { + "epoch": 1.6842200629657311, + "grad_norm": 0.9011217951774597, + "learning_rate": 8.772427958678186e-05, + "loss": 2.7211, + "step": 18590 + }, + { + "epoch": 1.6843106611401781, + "grad_norm": 0.8332630395889282, + "learning_rate": 8.771823838579109e-05, + "loss": 2.6519, + "step": 18591 + }, + { + "epoch": 1.6844012593146247, + "grad_norm": 0.8863285183906555, + "learning_rate": 8.771219718480034e-05, + "loss": 2.4693, + "step": 18592 + }, + { + "epoch": 1.6844918574890717, + "grad_norm": 0.9896650314331055, + "learning_rate": 8.770615598380959e-05, + "loss": 2.8052, + "step": 18593 + }, + { + "epoch": 1.6845824556635183, + "grad_norm": 0.8710923790931702, + "learning_rate": 8.770011478281883e-05, + "loss": 2.557, + "step": 18594 + }, + { + "epoch": 1.6846730538379653, + "grad_norm": 0.9124618768692017, + "learning_rate": 8.769407358182807e-05, + "loss": 2.6485, + "step": 18595 + }, + { + "epoch": 1.6847636520124118, + "grad_norm": 0.9332256317138672, + "learning_rate": 8.768803238083731e-05, + "loss": 2.6183, + "step": 18596 + }, + { + "epoch": 1.6848542501868589, + "grad_norm": 0.9016361236572266, + "learning_rate": 8.768199117984656e-05, + "loss": 2.7266, + "step": 18597 + }, + { + "epoch": 1.6849448483613054, + "grad_norm": 0.9402879476547241, + "learning_rate": 8.76759499788558e-05, + "loss": 2.917, + "step": 18598 + }, + { + "epoch": 1.6850354465357524, + "grad_norm": 0.8951857686042786, + "learning_rate": 8.766990877786504e-05, + "loss": 2.5405, + "step": 18599 + }, + { + "epoch": 1.685126044710199, + "grad_norm": 0.6350014209747314, + "learning_rate": 8.766386757687429e-05, + "loss": 1.3929, + "step": 18600 + }, + { + "epoch": 1.685216642884646, + "grad_norm": 0.9286030530929565, + "learning_rate": 8.765782637588354e-05, + "loss": 2.6536, + "step": 18601 + }, + { + "epoch": 1.6853072410590926, + "grad_norm": 0.9038360118865967, + "learning_rate": 8.765178517489277e-05, + "loss": 2.9394, + "step": 18602 + }, + { + "epoch": 1.6853978392335396, + "grad_norm": 0.7329103350639343, + "learning_rate": 8.764574397390202e-05, + "loss": 1.8369, + "step": 18603 + }, + { + "epoch": 1.6854884374079862, + "grad_norm": 0.9454905986785889, + "learning_rate": 8.763970277291125e-05, + "loss": 2.8642, + "step": 18604 + }, + { + "epoch": 1.6855790355824332, + "grad_norm": 0.8784813284873962, + "learning_rate": 8.76336615719205e-05, + "loss": 2.7628, + "step": 18605 + }, + { + "epoch": 1.6856696337568797, + "grad_norm": 0.8849201798439026, + "learning_rate": 8.762762037092974e-05, + "loss": 2.9214, + "step": 18606 + }, + { + "epoch": 1.6857602319313267, + "grad_norm": 0.9723582863807678, + "learning_rate": 8.762157916993898e-05, + "loss": 2.5256, + "step": 18607 + }, + { + "epoch": 1.6858508301057733, + "grad_norm": 0.9629167318344116, + "learning_rate": 8.761553796894823e-05, + "loss": 2.7432, + "step": 18608 + }, + { + "epoch": 1.6859414282802203, + "grad_norm": 0.791355311870575, + "learning_rate": 8.760949676795748e-05, + "loss": 2.1223, + "step": 18609 + }, + { + "epoch": 1.6860320264546669, + "grad_norm": 0.920096755027771, + "learning_rate": 8.760345556696671e-05, + "loss": 2.8392, + "step": 18610 + }, + { + "epoch": 1.6861226246291139, + "grad_norm": 0.8158356547355652, + "learning_rate": 8.759741436597596e-05, + "loss": 1.9268, + "step": 18611 + }, + { + "epoch": 1.6862132228035605, + "grad_norm": 0.9311850070953369, + "learning_rate": 8.759137316498521e-05, + "loss": 2.7772, + "step": 18612 + }, + { + "epoch": 1.6863038209780072, + "grad_norm": 1.004091501235962, + "learning_rate": 8.758533196399444e-05, + "loss": 3.003, + "step": 18613 + }, + { + "epoch": 1.686394419152454, + "grad_norm": 0.8993408679962158, + "learning_rate": 8.757929076300369e-05, + "loss": 2.6639, + "step": 18614 + }, + { + "epoch": 1.6864850173269008, + "grad_norm": 0.8965812921524048, + "learning_rate": 8.757324956201294e-05, + "loss": 2.6409, + "step": 18615 + }, + { + "epoch": 1.6865756155013476, + "grad_norm": 0.8580872416496277, + "learning_rate": 8.756720836102219e-05, + "loss": 2.6343, + "step": 18616 + }, + { + "epoch": 1.6866662136757944, + "grad_norm": 0.8770912289619446, + "learning_rate": 8.756116716003142e-05, + "loss": 2.8439, + "step": 18617 + }, + { + "epoch": 1.6867568118502412, + "grad_norm": 1.0100558996200562, + "learning_rate": 8.755512595904067e-05, + "loss": 2.4948, + "step": 18618 + }, + { + "epoch": 1.686847410024688, + "grad_norm": 0.9528440237045288, + "learning_rate": 8.75490847580499e-05, + "loss": 2.766, + "step": 18619 + }, + { + "epoch": 1.6869380081991348, + "grad_norm": 0.9612616896629333, + "learning_rate": 8.754304355705915e-05, + "loss": 2.718, + "step": 18620 + }, + { + "epoch": 1.6870286063735815, + "grad_norm": 0.9024462699890137, + "learning_rate": 8.753700235606838e-05, + "loss": 2.6472, + "step": 18621 + }, + { + "epoch": 1.6871192045480283, + "grad_norm": 0.9150655269622803, + "learning_rate": 8.753096115507763e-05, + "loss": 2.8657, + "step": 18622 + }, + { + "epoch": 1.6872098027224751, + "grad_norm": 0.9550025463104248, + "learning_rate": 8.752491995408688e-05, + "loss": 2.9916, + "step": 18623 + }, + { + "epoch": 1.687300400896922, + "grad_norm": 0.7315872311592102, + "learning_rate": 8.751887875309613e-05, + "loss": 1.759, + "step": 18624 + }, + { + "epoch": 1.6873909990713687, + "grad_norm": 1.0544402599334717, + "learning_rate": 8.751283755210536e-05, + "loss": 2.7844, + "step": 18625 + }, + { + "epoch": 1.6874815972458155, + "grad_norm": 0.9062749147415161, + "learning_rate": 8.750679635111461e-05, + "loss": 2.6229, + "step": 18626 + }, + { + "epoch": 1.6875721954202623, + "grad_norm": 0.8519092798233032, + "learning_rate": 8.750075515012384e-05, + "loss": 2.2084, + "step": 18627 + }, + { + "epoch": 1.687662793594709, + "grad_norm": 0.9281721711158752, + "learning_rate": 8.749471394913309e-05, + "loss": 2.7661, + "step": 18628 + }, + { + "epoch": 1.6877533917691558, + "grad_norm": 0.8739771842956543, + "learning_rate": 8.748867274814234e-05, + "loss": 2.9505, + "step": 18629 + }, + { + "epoch": 1.6878439899436026, + "grad_norm": 0.8811063766479492, + "learning_rate": 8.748263154715158e-05, + "loss": 2.6483, + "step": 18630 + }, + { + "epoch": 1.6879345881180494, + "grad_norm": 0.7841736078262329, + "learning_rate": 8.747659034616082e-05, + "loss": 1.8357, + "step": 18631 + }, + { + "epoch": 1.6880251862924962, + "grad_norm": 0.8702060580253601, + "learning_rate": 8.747054914517007e-05, + "loss": 2.5208, + "step": 18632 + }, + { + "epoch": 1.688115784466943, + "grad_norm": 0.8984804749488831, + "learning_rate": 8.746450794417931e-05, + "loss": 2.8701, + "step": 18633 + }, + { + "epoch": 1.6882063826413898, + "grad_norm": 0.8601782321929932, + "learning_rate": 8.745846674318855e-05, + "loss": 2.5443, + "step": 18634 + }, + { + "epoch": 1.6882969808158366, + "grad_norm": 0.9217478036880493, + "learning_rate": 8.74524255421978e-05, + "loss": 2.8486, + "step": 18635 + }, + { + "epoch": 1.6883875789902834, + "grad_norm": 1.0142650604248047, + "learning_rate": 8.744638434120703e-05, + "loss": 2.5729, + "step": 18636 + }, + { + "epoch": 1.6884781771647301, + "grad_norm": 0.8907175064086914, + "learning_rate": 8.744034314021628e-05, + "loss": 2.8028, + "step": 18637 + }, + { + "epoch": 1.688568775339177, + "grad_norm": 1.039231538772583, + "learning_rate": 8.743430193922552e-05, + "loss": 2.5082, + "step": 18638 + }, + { + "epoch": 1.6886593735136237, + "grad_norm": 0.9839072823524475, + "learning_rate": 8.742826073823477e-05, + "loss": 2.7466, + "step": 18639 + }, + { + "epoch": 1.6887499716880705, + "grad_norm": 0.9277280569076538, + "learning_rate": 8.7422219537244e-05, + "loss": 2.836, + "step": 18640 + }, + { + "epoch": 1.6888405698625173, + "grad_norm": 0.9816229343414307, + "learning_rate": 8.741617833625325e-05, + "loss": 2.6676, + "step": 18641 + }, + { + "epoch": 1.688931168036964, + "grad_norm": 0.8992120027542114, + "learning_rate": 8.741013713526249e-05, + "loss": 2.9407, + "step": 18642 + }, + { + "epoch": 1.6890217662114109, + "grad_norm": 0.8711400032043457, + "learning_rate": 8.740409593427174e-05, + "loss": 2.6185, + "step": 18643 + }, + { + "epoch": 1.6891123643858577, + "grad_norm": 0.9203866720199585, + "learning_rate": 8.739805473328098e-05, + "loss": 2.7467, + "step": 18644 + }, + { + "epoch": 1.6892029625603044, + "grad_norm": 0.8144819140434265, + "learning_rate": 8.739201353229023e-05, + "loss": 2.6656, + "step": 18645 + }, + { + "epoch": 1.6892935607347512, + "grad_norm": 0.9017792344093323, + "learning_rate": 8.738597233129946e-05, + "loss": 2.7747, + "step": 18646 + }, + { + "epoch": 1.689384158909198, + "grad_norm": 0.9432948231697083, + "learning_rate": 8.737993113030871e-05, + "loss": 2.6863, + "step": 18647 + }, + { + "epoch": 1.6894747570836448, + "grad_norm": 0.9089299440383911, + "learning_rate": 8.737388992931796e-05, + "loss": 2.8792, + "step": 18648 + }, + { + "epoch": 1.6895653552580916, + "grad_norm": 0.8670492172241211, + "learning_rate": 8.73678487283272e-05, + "loss": 2.3232, + "step": 18649 + }, + { + "epoch": 1.6896559534325384, + "grad_norm": 0.8648967742919922, + "learning_rate": 8.736180752733644e-05, + "loss": 2.5736, + "step": 18650 + }, + { + "epoch": 1.6897465516069852, + "grad_norm": 0.924455463886261, + "learning_rate": 8.735576632634568e-05, + "loss": 2.6066, + "step": 18651 + }, + { + "epoch": 1.689837149781432, + "grad_norm": 0.8683279752731323, + "learning_rate": 8.734972512535492e-05, + "loss": 2.5935, + "step": 18652 + }, + { + "epoch": 1.6899277479558785, + "grad_norm": 0.9131736755371094, + "learning_rate": 8.734368392436417e-05, + "loss": 2.0037, + "step": 18653 + }, + { + "epoch": 1.6900183461303255, + "grad_norm": 0.9322067499160767, + "learning_rate": 8.733764272337342e-05, + "loss": 2.7614, + "step": 18654 + }, + { + "epoch": 1.690108944304772, + "grad_norm": 0.9354993104934692, + "learning_rate": 8.733160152238265e-05, + "loss": 2.9681, + "step": 18655 + }, + { + "epoch": 1.690199542479219, + "grad_norm": 0.9124139547348022, + "learning_rate": 8.73255603213919e-05, + "loss": 2.7876, + "step": 18656 + }, + { + "epoch": 1.6902901406536657, + "grad_norm": 0.8812891840934753, + "learning_rate": 8.731951912040113e-05, + "loss": 2.8265, + "step": 18657 + }, + { + "epoch": 1.6903807388281127, + "grad_norm": 0.8449689149856567, + "learning_rate": 8.731347791941038e-05, + "loss": 2.7837, + "step": 18658 + }, + { + "epoch": 1.6904713370025592, + "grad_norm": 0.8902340531349182, + "learning_rate": 8.730743671841962e-05, + "loss": 2.7987, + "step": 18659 + }, + { + "epoch": 1.6905619351770063, + "grad_norm": 0.919333815574646, + "learning_rate": 8.730139551742888e-05, + "loss": 2.5409, + "step": 18660 + }, + { + "epoch": 1.6906525333514528, + "grad_norm": 0.9376592040061951, + "learning_rate": 8.729535431643811e-05, + "loss": 2.63, + "step": 18661 + }, + { + "epoch": 1.6907431315258998, + "grad_norm": 0.8979858756065369, + "learning_rate": 8.728931311544736e-05, + "loss": 2.8635, + "step": 18662 + }, + { + "epoch": 1.6908337297003464, + "grad_norm": 0.8990286588668823, + "learning_rate": 8.728327191445659e-05, + "loss": 2.5405, + "step": 18663 + }, + { + "epoch": 1.6909243278747934, + "grad_norm": 0.8945300579071045, + "learning_rate": 8.727723071346584e-05, + "loss": 2.8953, + "step": 18664 + }, + { + "epoch": 1.69101492604924, + "grad_norm": 0.9959678053855896, + "learning_rate": 8.727118951247509e-05, + "loss": 2.529, + "step": 18665 + }, + { + "epoch": 1.691105524223687, + "grad_norm": 0.9565070271492004, + "learning_rate": 8.726514831148432e-05, + "loss": 2.5609, + "step": 18666 + }, + { + "epoch": 1.6911961223981336, + "grad_norm": 0.9162713289260864, + "learning_rate": 8.725910711049357e-05, + "loss": 2.2608, + "step": 18667 + }, + { + "epoch": 1.6912867205725806, + "grad_norm": 0.9441280961036682, + "learning_rate": 8.725306590950282e-05, + "loss": 2.5505, + "step": 18668 + }, + { + "epoch": 1.6913773187470271, + "grad_norm": 0.8881834745407104, + "learning_rate": 8.724702470851206e-05, + "loss": 2.462, + "step": 18669 + }, + { + "epoch": 1.6914679169214741, + "grad_norm": 0.8649636507034302, + "learning_rate": 8.72409835075213e-05, + "loss": 2.624, + "step": 18670 + }, + { + "epoch": 1.6915585150959207, + "grad_norm": 0.9188531637191772, + "learning_rate": 8.723494230653055e-05, + "loss": 2.6723, + "step": 18671 + }, + { + "epoch": 1.6916491132703677, + "grad_norm": 0.857244610786438, + "learning_rate": 8.722890110553978e-05, + "loss": 2.7912, + "step": 18672 + }, + { + "epoch": 1.6917397114448143, + "grad_norm": 0.9106013774871826, + "learning_rate": 8.722285990454903e-05, + "loss": 2.6884, + "step": 18673 + }, + { + "epoch": 1.6918303096192613, + "grad_norm": 0.8815662264823914, + "learning_rate": 8.721681870355826e-05, + "loss": 2.5874, + "step": 18674 + }, + { + "epoch": 1.6919209077937079, + "grad_norm": 0.8675123453140259, + "learning_rate": 8.721077750256752e-05, + "loss": 2.6841, + "step": 18675 + }, + { + "epoch": 1.6920115059681549, + "grad_norm": 0.7912418246269226, + "learning_rate": 8.720473630157676e-05, + "loss": 2.0667, + "step": 18676 + }, + { + "epoch": 1.6921021041426014, + "grad_norm": 0.879889190196991, + "learning_rate": 8.7198695100586e-05, + "loss": 2.6537, + "step": 18677 + }, + { + "epoch": 1.6921927023170484, + "grad_norm": 0.849799633026123, + "learning_rate": 8.719265389959524e-05, + "loss": 2.7907, + "step": 18678 + }, + { + "epoch": 1.692283300491495, + "grad_norm": 0.9398007392883301, + "learning_rate": 8.718661269860449e-05, + "loss": 2.8668, + "step": 18679 + }, + { + "epoch": 1.692373898665942, + "grad_norm": 0.980231761932373, + "learning_rate": 8.718057149761373e-05, + "loss": 2.6, + "step": 18680 + }, + { + "epoch": 1.6924644968403886, + "grad_norm": 0.9241411685943604, + "learning_rate": 8.717453029662297e-05, + "loss": 2.8719, + "step": 18681 + }, + { + "epoch": 1.6925550950148356, + "grad_norm": 0.9007735252380371, + "learning_rate": 8.716848909563222e-05, + "loss": 2.5525, + "step": 18682 + }, + { + "epoch": 1.6926456931892822, + "grad_norm": 0.8616564273834229, + "learning_rate": 8.716244789464146e-05, + "loss": 2.626, + "step": 18683 + }, + { + "epoch": 1.6927362913637292, + "grad_norm": 0.7784326672554016, + "learning_rate": 8.715640669365071e-05, + "loss": 2.0807, + "step": 18684 + }, + { + "epoch": 1.6928268895381757, + "grad_norm": 0.858666718006134, + "learning_rate": 8.715036549265995e-05, + "loss": 2.588, + "step": 18685 + }, + { + "epoch": 1.6929174877126227, + "grad_norm": 0.8970535397529602, + "learning_rate": 8.714432429166919e-05, + "loss": 2.518, + "step": 18686 + }, + { + "epoch": 1.6930080858870693, + "grad_norm": 1.0456275939941406, + "learning_rate": 8.713828309067843e-05, + "loss": 3.1603, + "step": 18687 + }, + { + "epoch": 1.6930986840615163, + "grad_norm": 0.911361038684845, + "learning_rate": 8.713224188968767e-05, + "loss": 2.605, + "step": 18688 + }, + { + "epoch": 1.6931892822359629, + "grad_norm": 0.9103113412857056, + "learning_rate": 8.712620068869691e-05, + "loss": 2.6677, + "step": 18689 + }, + { + "epoch": 1.6932798804104099, + "grad_norm": 0.948123037815094, + "learning_rate": 8.712015948770617e-05, + "loss": 2.6907, + "step": 18690 + }, + { + "epoch": 1.6933704785848565, + "grad_norm": 0.955207347869873, + "learning_rate": 8.71141182867154e-05, + "loss": 2.4364, + "step": 18691 + }, + { + "epoch": 1.6934610767593035, + "grad_norm": 0.8809656500816345, + "learning_rate": 8.710807708572465e-05, + "loss": 2.4941, + "step": 18692 + }, + { + "epoch": 1.69355167493375, + "grad_norm": 0.8940250873565674, + "learning_rate": 8.710203588473389e-05, + "loss": 2.6097, + "step": 18693 + }, + { + "epoch": 1.6936422731081968, + "grad_norm": 0.9484995603561401, + "learning_rate": 8.709599468374313e-05, + "loss": 2.6869, + "step": 18694 + }, + { + "epoch": 1.6937328712826436, + "grad_norm": 0.872945249080658, + "learning_rate": 8.708995348275237e-05, + "loss": 2.7235, + "step": 18695 + }, + { + "epoch": 1.6938234694570904, + "grad_norm": 0.906812846660614, + "learning_rate": 8.708391228176161e-05, + "loss": 2.4582, + "step": 18696 + }, + { + "epoch": 1.6939140676315372, + "grad_norm": 1.1398061513900757, + "learning_rate": 8.707787108077086e-05, + "loss": 2.7847, + "step": 18697 + }, + { + "epoch": 1.694004665805984, + "grad_norm": 0.9281225800514221, + "learning_rate": 8.707182987978011e-05, + "loss": 2.8403, + "step": 18698 + }, + { + "epoch": 1.6940952639804308, + "grad_norm": 0.8788741230964661, + "learning_rate": 8.706578867878936e-05, + "loss": 2.7498, + "step": 18699 + }, + { + "epoch": 1.6941858621548775, + "grad_norm": 0.8992636799812317, + "learning_rate": 8.705974747779859e-05, + "loss": 2.8282, + "step": 18700 + }, + { + "epoch": 1.6942764603293243, + "grad_norm": 0.9411607980728149, + "learning_rate": 8.705370627680784e-05, + "loss": 2.7187, + "step": 18701 + }, + { + "epoch": 1.6943670585037711, + "grad_norm": 0.909072995185852, + "learning_rate": 8.704766507581707e-05, + "loss": 2.2361, + "step": 18702 + }, + { + "epoch": 1.694457656678218, + "grad_norm": 0.898684024810791, + "learning_rate": 8.704162387482632e-05, + "loss": 2.7851, + "step": 18703 + }, + { + "epoch": 1.6945482548526647, + "grad_norm": 0.9027601480484009, + "learning_rate": 8.703558267383555e-05, + "loss": 2.7789, + "step": 18704 + }, + { + "epoch": 1.6946388530271115, + "grad_norm": 0.9772803783416748, + "learning_rate": 8.702954147284482e-05, + "loss": 2.8868, + "step": 18705 + }, + { + "epoch": 1.6947294512015583, + "grad_norm": 0.906249463558197, + "learning_rate": 8.702350027185405e-05, + "loss": 2.6292, + "step": 18706 + }, + { + "epoch": 1.694820049376005, + "grad_norm": 0.8723203539848328, + "learning_rate": 8.70174590708633e-05, + "loss": 2.4156, + "step": 18707 + }, + { + "epoch": 1.6949106475504518, + "grad_norm": 0.989725649356842, + "learning_rate": 8.701141786987253e-05, + "loss": 2.8774, + "step": 18708 + }, + { + "epoch": 1.6950012457248986, + "grad_norm": 0.8692582845687866, + "learning_rate": 8.700537666888178e-05, + "loss": 2.5259, + "step": 18709 + }, + { + "epoch": 1.6950918438993454, + "grad_norm": 0.8724292516708374, + "learning_rate": 8.699933546789101e-05, + "loss": 2.6231, + "step": 18710 + }, + { + "epoch": 1.6951824420737922, + "grad_norm": 0.9934160113334656, + "learning_rate": 8.699329426690026e-05, + "loss": 2.5172, + "step": 18711 + }, + { + "epoch": 1.695273040248239, + "grad_norm": 0.8262358903884888, + "learning_rate": 8.698725306590951e-05, + "loss": 2.1017, + "step": 18712 + }, + { + "epoch": 1.6953636384226858, + "grad_norm": 0.9250694513320923, + "learning_rate": 8.698121186491876e-05, + "loss": 2.4917, + "step": 18713 + }, + { + "epoch": 1.6954542365971326, + "grad_norm": 0.9071146845817566, + "learning_rate": 8.697517066392799e-05, + "loss": 2.6454, + "step": 18714 + }, + { + "epoch": 1.6955448347715794, + "grad_norm": 0.8824493885040283, + "learning_rate": 8.696912946293724e-05, + "loss": 2.661, + "step": 18715 + }, + { + "epoch": 1.6956354329460261, + "grad_norm": 0.9094998240470886, + "learning_rate": 8.696308826194649e-05, + "loss": 2.948, + "step": 18716 + }, + { + "epoch": 1.695726031120473, + "grad_norm": 0.9355564117431641, + "learning_rate": 8.695704706095572e-05, + "loss": 3.0475, + "step": 18717 + }, + { + "epoch": 1.6958166292949197, + "grad_norm": 0.90375155210495, + "learning_rate": 8.695100585996497e-05, + "loss": 2.7638, + "step": 18718 + }, + { + "epoch": 1.6959072274693665, + "grad_norm": 0.8401216864585876, + "learning_rate": 8.69449646589742e-05, + "loss": 2.5731, + "step": 18719 + }, + { + "epoch": 1.6959978256438133, + "grad_norm": 0.9842594265937805, + "learning_rate": 8.693892345798346e-05, + "loss": 2.5992, + "step": 18720 + }, + { + "epoch": 1.69608842381826, + "grad_norm": 0.937005877494812, + "learning_rate": 8.69328822569927e-05, + "loss": 2.9691, + "step": 18721 + }, + { + "epoch": 1.6961790219927069, + "grad_norm": 0.9364286661148071, + "learning_rate": 8.692684105600194e-05, + "loss": 2.7543, + "step": 18722 + }, + { + "epoch": 1.6962696201671537, + "grad_norm": 0.880551278591156, + "learning_rate": 8.692079985501118e-05, + "loss": 2.789, + "step": 18723 + }, + { + "epoch": 1.6963602183416004, + "grad_norm": 0.9061506986618042, + "learning_rate": 8.691475865402043e-05, + "loss": 2.7693, + "step": 18724 + }, + { + "epoch": 1.6964508165160472, + "grad_norm": 0.7661094069480896, + "learning_rate": 8.690871745302966e-05, + "loss": 2.0731, + "step": 18725 + }, + { + "epoch": 1.696541414690494, + "grad_norm": 0.887268602848053, + "learning_rate": 8.690267625203891e-05, + "loss": 2.7723, + "step": 18726 + }, + { + "epoch": 1.6966320128649408, + "grad_norm": 0.9349336624145508, + "learning_rate": 8.689663505104814e-05, + "loss": 2.6567, + "step": 18727 + }, + { + "epoch": 1.6967226110393876, + "grad_norm": 0.9065641164779663, + "learning_rate": 8.68905938500574e-05, + "loss": 2.466, + "step": 18728 + }, + { + "epoch": 1.6968132092138344, + "grad_norm": 0.8896023631095886, + "learning_rate": 8.688455264906664e-05, + "loss": 2.7248, + "step": 18729 + }, + { + "epoch": 1.6969038073882812, + "grad_norm": 0.8567166328430176, + "learning_rate": 8.687851144807588e-05, + "loss": 2.617, + "step": 18730 + }, + { + "epoch": 1.696994405562728, + "grad_norm": 1.0411150455474854, + "learning_rate": 8.687247024708512e-05, + "loss": 2.6317, + "step": 18731 + }, + { + "epoch": 1.6970850037371747, + "grad_norm": 0.9144918322563171, + "learning_rate": 8.686642904609437e-05, + "loss": 3.0466, + "step": 18732 + }, + { + "epoch": 1.6971756019116215, + "grad_norm": 0.8893457651138306, + "learning_rate": 8.686038784510361e-05, + "loss": 2.7337, + "step": 18733 + }, + { + "epoch": 1.697266200086068, + "grad_norm": 0.8075100779533386, + "learning_rate": 8.685434664411285e-05, + "loss": 2.0358, + "step": 18734 + }, + { + "epoch": 1.697356798260515, + "grad_norm": 0.7565149068832397, + "learning_rate": 8.684830544312211e-05, + "loss": 2.0247, + "step": 18735 + }, + { + "epoch": 1.6974473964349617, + "grad_norm": 0.8081187605857849, + "learning_rate": 8.684226424213134e-05, + "loss": 2.3924, + "step": 18736 + }, + { + "epoch": 1.6975379946094087, + "grad_norm": 0.9086578488349915, + "learning_rate": 8.683622304114059e-05, + "loss": 2.5534, + "step": 18737 + }, + { + "epoch": 1.6976285927838553, + "grad_norm": 0.8611649870872498, + "learning_rate": 8.683018184014982e-05, + "loss": 2.6042, + "step": 18738 + }, + { + "epoch": 1.6977191909583023, + "grad_norm": 0.8734367489814758, + "learning_rate": 8.682414063915907e-05, + "loss": 2.785, + "step": 18739 + }, + { + "epoch": 1.6978097891327488, + "grad_norm": 0.9110957384109497, + "learning_rate": 8.68180994381683e-05, + "loss": 2.8528, + "step": 18740 + }, + { + "epoch": 1.6979003873071958, + "grad_norm": 0.8534959554672241, + "learning_rate": 8.681205823717755e-05, + "loss": 2.8233, + "step": 18741 + }, + { + "epoch": 1.6979909854816424, + "grad_norm": 0.9042748808860779, + "learning_rate": 8.680601703618679e-05, + "loss": 2.7865, + "step": 18742 + }, + { + "epoch": 1.6980815836560894, + "grad_norm": 0.9663477540016174, + "learning_rate": 8.679997583519605e-05, + "loss": 2.784, + "step": 18743 + }, + { + "epoch": 1.698172181830536, + "grad_norm": 0.9174508452415466, + "learning_rate": 8.679393463420528e-05, + "loss": 2.6311, + "step": 18744 + }, + { + "epoch": 1.698262780004983, + "grad_norm": 0.8780522346496582, + "learning_rate": 8.678789343321453e-05, + "loss": 2.5194, + "step": 18745 + }, + { + "epoch": 1.6983533781794296, + "grad_norm": 0.8603352904319763, + "learning_rate": 8.678185223222376e-05, + "loss": 2.7203, + "step": 18746 + }, + { + "epoch": 1.6984439763538766, + "grad_norm": 0.8765100836753845, + "learning_rate": 8.677581103123301e-05, + "loss": 2.8471, + "step": 18747 + }, + { + "epoch": 1.6985345745283231, + "grad_norm": 0.8161816000938416, + "learning_rate": 8.676976983024226e-05, + "loss": 2.0459, + "step": 18748 + }, + { + "epoch": 1.6986251727027701, + "grad_norm": 0.8922753930091858, + "learning_rate": 8.67637286292515e-05, + "loss": 2.5644, + "step": 18749 + }, + { + "epoch": 1.6987157708772167, + "grad_norm": 0.9891538023948669, + "learning_rate": 8.675768742826074e-05, + "loss": 2.4473, + "step": 18750 + }, + { + "epoch": 1.6988063690516637, + "grad_norm": 0.8677719235420227, + "learning_rate": 8.675164622726999e-05, + "loss": 2.4888, + "step": 18751 + }, + { + "epoch": 1.6988969672261103, + "grad_norm": 0.916372537612915, + "learning_rate": 8.674560502627924e-05, + "loss": 2.971, + "step": 18752 + }, + { + "epoch": 1.6989875654005573, + "grad_norm": 0.9252551198005676, + "learning_rate": 8.673956382528847e-05, + "loss": 2.5368, + "step": 18753 + }, + { + "epoch": 1.6990781635750039, + "grad_norm": 0.7100439071655273, + "learning_rate": 8.673352262429772e-05, + "loss": 1.9253, + "step": 18754 + }, + { + "epoch": 1.6991687617494509, + "grad_norm": 0.9194141626358032, + "learning_rate": 8.672748142330695e-05, + "loss": 2.5074, + "step": 18755 + }, + { + "epoch": 1.6992593599238974, + "grad_norm": 0.850690484046936, + "learning_rate": 8.67214402223162e-05, + "loss": 2.7317, + "step": 18756 + }, + { + "epoch": 1.6993499580983444, + "grad_norm": 0.9237200021743774, + "learning_rate": 8.671539902132543e-05, + "loss": 2.5748, + "step": 18757 + }, + { + "epoch": 1.699440556272791, + "grad_norm": 0.8929405808448792, + "learning_rate": 8.67093578203347e-05, + "loss": 2.2666, + "step": 18758 + }, + { + "epoch": 1.699531154447238, + "grad_norm": 0.9283174872398376, + "learning_rate": 8.670331661934393e-05, + "loss": 2.8009, + "step": 18759 + }, + { + "epoch": 1.6996217526216846, + "grad_norm": 0.5933922529220581, + "learning_rate": 8.669727541835318e-05, + "loss": 1.4199, + "step": 18760 + }, + { + "epoch": 1.6997123507961316, + "grad_norm": 0.9294158816337585, + "learning_rate": 8.669123421736241e-05, + "loss": 2.6427, + "step": 18761 + }, + { + "epoch": 1.6998029489705782, + "grad_norm": 0.8866182565689087, + "learning_rate": 8.668519301637166e-05, + "loss": 2.7465, + "step": 18762 + }, + { + "epoch": 1.6998935471450252, + "grad_norm": 0.9325005412101746, + "learning_rate": 8.667915181538089e-05, + "loss": 2.952, + "step": 18763 + }, + { + "epoch": 1.6999841453194717, + "grad_norm": 0.7631505727767944, + "learning_rate": 8.667311061439014e-05, + "loss": 1.9686, + "step": 18764 + }, + { + "epoch": 1.7000747434939187, + "grad_norm": 1.2509698867797852, + "learning_rate": 8.666706941339939e-05, + "loss": 2.71, + "step": 18765 + }, + { + "epoch": 1.7001653416683653, + "grad_norm": 0.9403699040412903, + "learning_rate": 8.666102821240864e-05, + "loss": 2.61, + "step": 18766 + }, + { + "epoch": 1.7002559398428123, + "grad_norm": 0.8584901094436646, + "learning_rate": 8.665498701141788e-05, + "loss": 2.8328, + "step": 18767 + }, + { + "epoch": 1.7003465380172589, + "grad_norm": 0.9108468294143677, + "learning_rate": 8.664894581042712e-05, + "loss": 2.6593, + "step": 18768 + }, + { + "epoch": 1.700437136191706, + "grad_norm": 0.8824644684791565, + "learning_rate": 8.664290460943636e-05, + "loss": 2.6713, + "step": 18769 + }, + { + "epoch": 1.7005277343661525, + "grad_norm": 0.9199238419532776, + "learning_rate": 8.66368634084456e-05, + "loss": 2.7818, + "step": 18770 + }, + { + "epoch": 1.7006183325405995, + "grad_norm": 0.7725322842597961, + "learning_rate": 8.663082220745485e-05, + "loss": 2.0592, + "step": 18771 + }, + { + "epoch": 1.700708930715046, + "grad_norm": 0.8462106585502625, + "learning_rate": 8.662478100646408e-05, + "loss": 2.6285, + "step": 18772 + }, + { + "epoch": 1.700799528889493, + "grad_norm": 0.8910284042358398, + "learning_rate": 8.661873980547334e-05, + "loss": 2.7941, + "step": 18773 + }, + { + "epoch": 1.7008901270639396, + "grad_norm": 0.9297933578491211, + "learning_rate": 8.661269860448258e-05, + "loss": 2.748, + "step": 18774 + }, + { + "epoch": 1.7009807252383864, + "grad_norm": 0.9201915860176086, + "learning_rate": 8.660665740349182e-05, + "loss": 2.8162, + "step": 18775 + }, + { + "epoch": 1.7010713234128332, + "grad_norm": 0.9027858376502991, + "learning_rate": 8.660061620250106e-05, + "loss": 2.8017, + "step": 18776 + }, + { + "epoch": 1.70116192158728, + "grad_norm": 0.8920837640762329, + "learning_rate": 8.65945750015103e-05, + "loss": 2.8802, + "step": 18777 + }, + { + "epoch": 1.7012525197617268, + "grad_norm": 1.0690661668777466, + "learning_rate": 8.658853380051954e-05, + "loss": 2.4049, + "step": 18778 + }, + { + "epoch": 1.7013431179361735, + "grad_norm": 1.0512453317642212, + "learning_rate": 8.658249259952879e-05, + "loss": 2.6849, + "step": 18779 + }, + { + "epoch": 1.7014337161106203, + "grad_norm": 0.9874483942985535, + "learning_rate": 8.657645139853803e-05, + "loss": 2.6722, + "step": 18780 + }, + { + "epoch": 1.7015243142850671, + "grad_norm": 0.9132779240608215, + "learning_rate": 8.657041019754728e-05, + "loss": 2.8583, + "step": 18781 + }, + { + "epoch": 1.701614912459514, + "grad_norm": 0.8839556574821472, + "learning_rate": 8.656436899655652e-05, + "loss": 2.8899, + "step": 18782 + }, + { + "epoch": 1.7017055106339607, + "grad_norm": 0.7361336350440979, + "learning_rate": 8.655832779556576e-05, + "loss": 1.8707, + "step": 18783 + }, + { + "epoch": 1.7017961088084075, + "grad_norm": 0.7969521284103394, + "learning_rate": 8.655228659457501e-05, + "loss": 1.8826, + "step": 18784 + }, + { + "epoch": 1.7018867069828543, + "grad_norm": 0.8403065800666809, + "learning_rate": 8.654624539358424e-05, + "loss": 2.5072, + "step": 18785 + }, + { + "epoch": 1.701977305157301, + "grad_norm": 0.9076806902885437, + "learning_rate": 8.654020419259349e-05, + "loss": 2.6585, + "step": 18786 + }, + { + "epoch": 1.7020679033317478, + "grad_norm": 0.8451006412506104, + "learning_rate": 8.653416299160273e-05, + "loss": 2.5681, + "step": 18787 + }, + { + "epoch": 1.7021585015061946, + "grad_norm": 0.7796689867973328, + "learning_rate": 8.652812179061199e-05, + "loss": 1.887, + "step": 18788 + }, + { + "epoch": 1.7022490996806414, + "grad_norm": 0.8892112970352173, + "learning_rate": 8.652208058962122e-05, + "loss": 2.9323, + "step": 18789 + }, + { + "epoch": 1.7023396978550882, + "grad_norm": 0.9557266235351562, + "learning_rate": 8.651603938863047e-05, + "loss": 2.6973, + "step": 18790 + }, + { + "epoch": 1.702430296029535, + "grad_norm": 0.9701082706451416, + "learning_rate": 8.65099981876397e-05, + "loss": 2.7242, + "step": 18791 + }, + { + "epoch": 1.7025208942039818, + "grad_norm": 0.9932067394256592, + "learning_rate": 8.650395698664895e-05, + "loss": 2.631, + "step": 18792 + }, + { + "epoch": 1.7026114923784286, + "grad_norm": 0.9023438096046448, + "learning_rate": 8.649791578565819e-05, + "loss": 2.8189, + "step": 18793 + }, + { + "epoch": 1.7027020905528754, + "grad_norm": 0.872773289680481, + "learning_rate": 8.649187458466743e-05, + "loss": 2.4153, + "step": 18794 + }, + { + "epoch": 1.7027926887273221, + "grad_norm": 0.8543131947517395, + "learning_rate": 8.648583338367668e-05, + "loss": 2.932, + "step": 18795 + }, + { + "epoch": 1.702883286901769, + "grad_norm": 0.8676150441169739, + "learning_rate": 8.647979218268593e-05, + "loss": 2.7027, + "step": 18796 + }, + { + "epoch": 1.7029738850762157, + "grad_norm": 0.9307572841644287, + "learning_rate": 8.647375098169516e-05, + "loss": 2.9482, + "step": 18797 + }, + { + "epoch": 1.7030644832506625, + "grad_norm": 0.9044474959373474, + "learning_rate": 8.646770978070441e-05, + "loss": 2.8658, + "step": 18798 + }, + { + "epoch": 1.7031550814251093, + "grad_norm": 0.9569827318191528, + "learning_rate": 8.646166857971366e-05, + "loss": 2.5848, + "step": 18799 + }, + { + "epoch": 1.703245679599556, + "grad_norm": 0.9074748754501343, + "learning_rate": 8.645562737872289e-05, + "loss": 2.7849, + "step": 18800 + }, + { + "epoch": 1.7033362777740029, + "grad_norm": 0.9086644053459167, + "learning_rate": 8.644958617773214e-05, + "loss": 2.5638, + "step": 18801 + }, + { + "epoch": 1.7034268759484497, + "grad_norm": 0.9020550847053528, + "learning_rate": 8.644354497674137e-05, + "loss": 2.5725, + "step": 18802 + }, + { + "epoch": 1.7035174741228964, + "grad_norm": 0.9726545810699463, + "learning_rate": 8.643750377575063e-05, + "loss": 2.8882, + "step": 18803 + }, + { + "epoch": 1.7036080722973432, + "grad_norm": 0.7735524773597717, + "learning_rate": 8.643146257475987e-05, + "loss": 2.205, + "step": 18804 + }, + { + "epoch": 1.70369867047179, + "grad_norm": 0.9338355660438538, + "learning_rate": 8.642542137376912e-05, + "loss": 1.9493, + "step": 18805 + }, + { + "epoch": 1.7037892686462368, + "grad_norm": 0.7832440733909607, + "learning_rate": 8.641938017277835e-05, + "loss": 2.1265, + "step": 18806 + }, + { + "epoch": 1.7038798668206836, + "grad_norm": 0.9612724184989929, + "learning_rate": 8.64133389717876e-05, + "loss": 2.7003, + "step": 18807 + }, + { + "epoch": 1.7039704649951304, + "grad_norm": 0.9222022891044617, + "learning_rate": 8.640729777079683e-05, + "loss": 2.9925, + "step": 18808 + }, + { + "epoch": 1.7040610631695772, + "grad_norm": 0.856384813785553, + "learning_rate": 8.640125656980608e-05, + "loss": 2.6597, + "step": 18809 + }, + { + "epoch": 1.704151661344024, + "grad_norm": 0.8736075162887573, + "learning_rate": 8.639521536881533e-05, + "loss": 2.7458, + "step": 18810 + }, + { + "epoch": 1.7042422595184707, + "grad_norm": 0.958223283290863, + "learning_rate": 8.638917416782457e-05, + "loss": 2.5798, + "step": 18811 + }, + { + "epoch": 1.7043328576929175, + "grad_norm": 0.8894063830375671, + "learning_rate": 8.638313296683381e-05, + "loss": 2.697, + "step": 18812 + }, + { + "epoch": 1.7044234558673643, + "grad_norm": 0.8986214995384216, + "learning_rate": 8.637709176584306e-05, + "loss": 2.943, + "step": 18813 + }, + { + "epoch": 1.704514054041811, + "grad_norm": 0.8900374174118042, + "learning_rate": 8.637105056485229e-05, + "loss": 2.461, + "step": 18814 + }, + { + "epoch": 1.7046046522162577, + "grad_norm": 0.8843531608581543, + "learning_rate": 8.636500936386154e-05, + "loss": 2.7282, + "step": 18815 + }, + { + "epoch": 1.7046952503907047, + "grad_norm": 0.8845153450965881, + "learning_rate": 8.635896816287079e-05, + "loss": 2.5557, + "step": 18816 + }, + { + "epoch": 1.7047858485651513, + "grad_norm": 0.9672693610191345, + "learning_rate": 8.635292696188002e-05, + "loss": 2.5238, + "step": 18817 + }, + { + "epoch": 1.7048764467395983, + "grad_norm": 0.8795903921127319, + "learning_rate": 8.634688576088927e-05, + "loss": 2.549, + "step": 18818 + }, + { + "epoch": 1.7049670449140448, + "grad_norm": 0.9056599736213684, + "learning_rate": 8.634084455989851e-05, + "loss": 2.5372, + "step": 18819 + }, + { + "epoch": 1.7050576430884918, + "grad_norm": 0.906542181968689, + "learning_rate": 8.633480335890776e-05, + "loss": 2.6873, + "step": 18820 + }, + { + "epoch": 1.7051482412629384, + "grad_norm": 0.8598234057426453, + "learning_rate": 8.6328762157917e-05, + "loss": 2.645, + "step": 18821 + }, + { + "epoch": 1.7052388394373854, + "grad_norm": 0.9642475247383118, + "learning_rate": 8.632272095692624e-05, + "loss": 2.8555, + "step": 18822 + }, + { + "epoch": 1.705329437611832, + "grad_norm": 0.9024627804756165, + "learning_rate": 8.631667975593548e-05, + "loss": 2.8581, + "step": 18823 + }, + { + "epoch": 1.705420035786279, + "grad_norm": 0.7815133929252625, + "learning_rate": 8.631063855494473e-05, + "loss": 1.9884, + "step": 18824 + }, + { + "epoch": 1.7055106339607256, + "grad_norm": 0.9288280010223389, + "learning_rate": 8.630459735395397e-05, + "loss": 2.8927, + "step": 18825 + }, + { + "epoch": 1.7056012321351726, + "grad_norm": 0.875553548336029, + "learning_rate": 8.629855615296322e-05, + "loss": 2.7863, + "step": 18826 + }, + { + "epoch": 1.7056918303096191, + "grad_norm": 0.9373236298561096, + "learning_rate": 8.629251495197245e-05, + "loss": 2.5953, + "step": 18827 + }, + { + "epoch": 1.7057824284840661, + "grad_norm": 0.9035174250602722, + "learning_rate": 8.62864737509817e-05, + "loss": 2.8203, + "step": 18828 + }, + { + "epoch": 1.7058730266585127, + "grad_norm": 0.867781400680542, + "learning_rate": 8.628043254999094e-05, + "loss": 2.6606, + "step": 18829 + }, + { + "epoch": 1.7059636248329597, + "grad_norm": 0.8882527351379395, + "learning_rate": 8.627439134900018e-05, + "loss": 2.7569, + "step": 18830 + }, + { + "epoch": 1.7060542230074063, + "grad_norm": 0.898124635219574, + "learning_rate": 8.626835014800943e-05, + "loss": 2.6706, + "step": 18831 + }, + { + "epoch": 1.7061448211818533, + "grad_norm": 0.9383034110069275, + "learning_rate": 8.626230894701867e-05, + "loss": 2.7215, + "step": 18832 + }, + { + "epoch": 1.7062354193562999, + "grad_norm": 0.897682785987854, + "learning_rate": 8.625626774602791e-05, + "loss": 2.8535, + "step": 18833 + }, + { + "epoch": 1.7063260175307469, + "grad_norm": 0.8756290078163147, + "learning_rate": 8.625022654503716e-05, + "loss": 2.4264, + "step": 18834 + }, + { + "epoch": 1.7064166157051934, + "grad_norm": 0.8914374709129333, + "learning_rate": 8.624418534404641e-05, + "loss": 1.868, + "step": 18835 + }, + { + "epoch": 1.7065072138796404, + "grad_norm": 0.9305874705314636, + "learning_rate": 8.623814414305564e-05, + "loss": 2.8207, + "step": 18836 + }, + { + "epoch": 1.706597812054087, + "grad_norm": 0.9684446454048157, + "learning_rate": 8.623210294206489e-05, + "loss": 2.6516, + "step": 18837 + }, + { + "epoch": 1.706688410228534, + "grad_norm": 0.926422119140625, + "learning_rate": 8.622606174107412e-05, + "loss": 2.5344, + "step": 18838 + }, + { + "epoch": 1.7067790084029806, + "grad_norm": 0.8593525886535645, + "learning_rate": 8.622002054008337e-05, + "loss": 2.7666, + "step": 18839 + }, + { + "epoch": 1.7068696065774276, + "grad_norm": 0.9658313989639282, + "learning_rate": 8.621397933909262e-05, + "loss": 2.5323, + "step": 18840 + }, + { + "epoch": 1.7069602047518742, + "grad_norm": 0.8854548335075378, + "learning_rate": 8.620793813810187e-05, + "loss": 2.4552, + "step": 18841 + }, + { + "epoch": 1.7070508029263212, + "grad_norm": 1.0695406198501587, + "learning_rate": 8.62018969371111e-05, + "loss": 2.9743, + "step": 18842 + }, + { + "epoch": 1.7071414011007677, + "grad_norm": 0.9593316316604614, + "learning_rate": 8.619585573612035e-05, + "loss": 2.8648, + "step": 18843 + }, + { + "epoch": 1.7072319992752147, + "grad_norm": 0.8435819149017334, + "learning_rate": 8.618981453512958e-05, + "loss": 1.9729, + "step": 18844 + }, + { + "epoch": 1.7073225974496613, + "grad_norm": 0.8997443318367004, + "learning_rate": 8.618377333413883e-05, + "loss": 2.7111, + "step": 18845 + }, + { + "epoch": 1.7074131956241083, + "grad_norm": 0.9608314633369446, + "learning_rate": 8.617773213314806e-05, + "loss": 2.7487, + "step": 18846 + }, + { + "epoch": 1.7075037937985549, + "grad_norm": 0.8398642539978027, + "learning_rate": 8.617169093215731e-05, + "loss": 2.0256, + "step": 18847 + }, + { + "epoch": 1.707594391973002, + "grad_norm": 0.9728562235832214, + "learning_rate": 8.616564973116656e-05, + "loss": 3.0796, + "step": 18848 + }, + { + "epoch": 1.7076849901474485, + "grad_norm": 0.8520325422286987, + "learning_rate": 8.615960853017581e-05, + "loss": 2.7681, + "step": 18849 + }, + { + "epoch": 1.7077755883218955, + "grad_norm": 0.7882859706878662, + "learning_rate": 8.615356732918504e-05, + "loss": 2.0291, + "step": 18850 + }, + { + "epoch": 1.707866186496342, + "grad_norm": 0.9812688231468201, + "learning_rate": 8.614752612819429e-05, + "loss": 2.7843, + "step": 18851 + }, + { + "epoch": 1.707956784670789, + "grad_norm": 0.9068756699562073, + "learning_rate": 8.614148492720354e-05, + "loss": 2.4006, + "step": 18852 + }, + { + "epoch": 1.7080473828452356, + "grad_norm": 0.8562931418418884, + "learning_rate": 8.613544372621277e-05, + "loss": 2.9355, + "step": 18853 + }, + { + "epoch": 1.7081379810196826, + "grad_norm": 0.87293940782547, + "learning_rate": 8.612940252522202e-05, + "loss": 2.8593, + "step": 18854 + }, + { + "epoch": 1.7082285791941292, + "grad_norm": 0.9550366401672363, + "learning_rate": 8.612336132423127e-05, + "loss": 2.6152, + "step": 18855 + }, + { + "epoch": 1.708319177368576, + "grad_norm": 0.8455175161361694, + "learning_rate": 8.611732012324051e-05, + "loss": 2.0691, + "step": 18856 + }, + { + "epoch": 1.7084097755430228, + "grad_norm": 0.9670917987823486, + "learning_rate": 8.611127892224975e-05, + "loss": 2.9239, + "step": 18857 + }, + { + "epoch": 1.7085003737174695, + "grad_norm": 0.9661166667938232, + "learning_rate": 8.6105237721259e-05, + "loss": 2.7224, + "step": 18858 + }, + { + "epoch": 1.7085909718919163, + "grad_norm": 1.0094252824783325, + "learning_rate": 8.609919652026823e-05, + "loss": 2.7588, + "step": 18859 + }, + { + "epoch": 1.7086815700663631, + "grad_norm": 0.9085001349449158, + "learning_rate": 8.609315531927748e-05, + "loss": 2.7734, + "step": 18860 + }, + { + "epoch": 1.70877216824081, + "grad_norm": 0.9517122507095337, + "learning_rate": 8.608711411828671e-05, + "loss": 2.7439, + "step": 18861 + }, + { + "epoch": 1.7088627664152567, + "grad_norm": 0.7683156132698059, + "learning_rate": 8.608107291729596e-05, + "loss": 2.089, + "step": 18862 + }, + { + "epoch": 1.7089533645897035, + "grad_norm": 0.9496252536773682, + "learning_rate": 8.60750317163052e-05, + "loss": 2.9764, + "step": 18863 + }, + { + "epoch": 1.7090439627641503, + "grad_norm": 1.1289368867874146, + "learning_rate": 8.606899051531445e-05, + "loss": 2.6837, + "step": 18864 + }, + { + "epoch": 1.709134560938597, + "grad_norm": 0.9016065001487732, + "learning_rate": 8.606294931432369e-05, + "loss": 2.7604, + "step": 18865 + }, + { + "epoch": 1.7092251591130438, + "grad_norm": 0.9275996088981628, + "learning_rate": 8.605690811333294e-05, + "loss": 2.595, + "step": 18866 + }, + { + "epoch": 1.7093157572874906, + "grad_norm": 0.921175479888916, + "learning_rate": 8.605086691234218e-05, + "loss": 2.8832, + "step": 18867 + }, + { + "epoch": 1.7094063554619374, + "grad_norm": 0.8874607682228088, + "learning_rate": 8.604482571135142e-05, + "loss": 2.8152, + "step": 18868 + }, + { + "epoch": 1.7094969536363842, + "grad_norm": 0.7421913743019104, + "learning_rate": 8.603878451036066e-05, + "loss": 2.0926, + "step": 18869 + }, + { + "epoch": 1.709587551810831, + "grad_norm": 0.9317567348480225, + "learning_rate": 8.603274330936991e-05, + "loss": 2.5968, + "step": 18870 + }, + { + "epoch": 1.7096781499852778, + "grad_norm": 0.8971171379089355, + "learning_rate": 8.602670210837916e-05, + "loss": 2.71, + "step": 18871 + }, + { + "epoch": 1.7097687481597246, + "grad_norm": 1.0563137531280518, + "learning_rate": 8.60206609073884e-05, + "loss": 3.0604, + "step": 18872 + }, + { + "epoch": 1.7098593463341714, + "grad_norm": 0.9534181356430054, + "learning_rate": 8.601461970639764e-05, + "loss": 2.7977, + "step": 18873 + }, + { + "epoch": 1.7099499445086181, + "grad_norm": 0.9404094815254211, + "learning_rate": 8.600857850540688e-05, + "loss": 2.9229, + "step": 18874 + }, + { + "epoch": 1.710040542683065, + "grad_norm": 0.9303011298179626, + "learning_rate": 8.600253730441612e-05, + "loss": 2.5931, + "step": 18875 + }, + { + "epoch": 1.7101311408575117, + "grad_norm": 0.9127684235572815, + "learning_rate": 8.599649610342536e-05, + "loss": 2.7519, + "step": 18876 + }, + { + "epoch": 1.7102217390319585, + "grad_norm": 0.9374192953109741, + "learning_rate": 8.59904549024346e-05, + "loss": 2.8689, + "step": 18877 + }, + { + "epoch": 1.7103123372064053, + "grad_norm": 0.7603441476821899, + "learning_rate": 8.598441370144385e-05, + "loss": 1.948, + "step": 18878 + }, + { + "epoch": 1.710402935380852, + "grad_norm": 0.7927286028862, + "learning_rate": 8.59783725004531e-05, + "loss": 2.0516, + "step": 18879 + }, + { + "epoch": 1.7104935335552989, + "grad_norm": 0.9738894104957581, + "learning_rate": 8.597233129946233e-05, + "loss": 2.8872, + "step": 18880 + }, + { + "epoch": 1.7105841317297457, + "grad_norm": 0.7553644776344299, + "learning_rate": 8.596629009847158e-05, + "loss": 1.8996, + "step": 18881 + }, + { + "epoch": 1.7106747299041924, + "grad_norm": 0.9693710803985596, + "learning_rate": 8.596024889748082e-05, + "loss": 2.0253, + "step": 18882 + }, + { + "epoch": 1.7107653280786392, + "grad_norm": 0.9545847773551941, + "learning_rate": 8.595420769649006e-05, + "loss": 2.7829, + "step": 18883 + }, + { + "epoch": 1.710855926253086, + "grad_norm": 0.8685669302940369, + "learning_rate": 8.594816649549931e-05, + "loss": 2.6885, + "step": 18884 + }, + { + "epoch": 1.7109465244275328, + "grad_norm": 0.8967962265014648, + "learning_rate": 8.594212529450856e-05, + "loss": 2.6417, + "step": 18885 + }, + { + "epoch": 1.7110371226019796, + "grad_norm": 0.9585636854171753, + "learning_rate": 8.59360840935178e-05, + "loss": 2.7426, + "step": 18886 + }, + { + "epoch": 1.7111277207764264, + "grad_norm": 0.9208478927612305, + "learning_rate": 8.593004289252704e-05, + "loss": 2.6751, + "step": 18887 + }, + { + "epoch": 1.7112183189508732, + "grad_norm": 0.9148968458175659, + "learning_rate": 8.592400169153629e-05, + "loss": 2.6478, + "step": 18888 + }, + { + "epoch": 1.71130891712532, + "grad_norm": 0.9388518333435059, + "learning_rate": 8.591796049054552e-05, + "loss": 2.8716, + "step": 18889 + }, + { + "epoch": 1.7113995152997667, + "grad_norm": 0.9372444152832031, + "learning_rate": 8.591191928955477e-05, + "loss": 2.4933, + "step": 18890 + }, + { + "epoch": 1.7114901134742135, + "grad_norm": 0.9234726428985596, + "learning_rate": 8.5905878088564e-05, + "loss": 2.7313, + "step": 18891 + }, + { + "epoch": 1.7115807116486603, + "grad_norm": 0.8989801406860352, + "learning_rate": 8.589983688757325e-05, + "loss": 2.6493, + "step": 18892 + }, + { + "epoch": 1.7116713098231071, + "grad_norm": 0.9034523367881775, + "learning_rate": 8.58937956865825e-05, + "loss": 2.9274, + "step": 18893 + }, + { + "epoch": 1.711761907997554, + "grad_norm": 0.9956997632980347, + "learning_rate": 8.588775448559175e-05, + "loss": 3.0037, + "step": 18894 + }, + { + "epoch": 1.7118525061720007, + "grad_norm": 0.851225733757019, + "learning_rate": 8.588171328460098e-05, + "loss": 2.6303, + "step": 18895 + }, + { + "epoch": 1.7119431043464473, + "grad_norm": 0.8233171105384827, + "learning_rate": 8.587567208361023e-05, + "loss": 2.3109, + "step": 18896 + }, + { + "epoch": 1.7120337025208943, + "grad_norm": 0.9131065607070923, + "learning_rate": 8.586963088261946e-05, + "loss": 2.5149, + "step": 18897 + }, + { + "epoch": 1.7121243006953408, + "grad_norm": 0.9439956545829773, + "learning_rate": 8.586358968162871e-05, + "loss": 3.0647, + "step": 18898 + }, + { + "epoch": 1.7122148988697878, + "grad_norm": 0.8541439771652222, + "learning_rate": 8.585754848063796e-05, + "loss": 2.6892, + "step": 18899 + }, + { + "epoch": 1.7123054970442344, + "grad_norm": 0.8361426591873169, + "learning_rate": 8.58515072796472e-05, + "loss": 2.621, + "step": 18900 + }, + { + "epoch": 1.7123960952186814, + "grad_norm": 0.8919748067855835, + "learning_rate": 8.584546607865644e-05, + "loss": 2.508, + "step": 18901 + }, + { + "epoch": 1.712486693393128, + "grad_norm": 0.9295383095741272, + "learning_rate": 8.583942487766569e-05, + "loss": 2.727, + "step": 18902 + }, + { + "epoch": 1.712577291567575, + "grad_norm": 0.8784781694412231, + "learning_rate": 8.583338367667493e-05, + "loss": 2.7876, + "step": 18903 + }, + { + "epoch": 1.7126678897420216, + "grad_norm": 0.9292201399803162, + "learning_rate": 8.582734247568417e-05, + "loss": 2.582, + "step": 18904 + }, + { + "epoch": 1.7127584879164686, + "grad_norm": 0.9187555909156799, + "learning_rate": 8.582130127469342e-05, + "loss": 2.5566, + "step": 18905 + }, + { + "epoch": 1.7128490860909151, + "grad_norm": 0.8967837691307068, + "learning_rate": 8.581526007370265e-05, + "loss": 2.7931, + "step": 18906 + }, + { + "epoch": 1.7129396842653621, + "grad_norm": 1.043785810470581, + "learning_rate": 8.58092188727119e-05, + "loss": 2.7567, + "step": 18907 + }, + { + "epoch": 1.7130302824398087, + "grad_norm": 0.8967852592468262, + "learning_rate": 8.580317767172114e-05, + "loss": 2.862, + "step": 18908 + }, + { + "epoch": 1.7131208806142557, + "grad_norm": 0.9656515121459961, + "learning_rate": 8.579713647073039e-05, + "loss": 3.0157, + "step": 18909 + }, + { + "epoch": 1.7132114787887023, + "grad_norm": 0.8892914652824402, + "learning_rate": 8.579109526973963e-05, + "loss": 2.8289, + "step": 18910 + }, + { + "epoch": 1.7133020769631493, + "grad_norm": 1.0256623029708862, + "learning_rate": 8.578505406874887e-05, + "loss": 2.8587, + "step": 18911 + }, + { + "epoch": 1.7133926751375959, + "grad_norm": 0.9746643304824829, + "learning_rate": 8.577901286775811e-05, + "loss": 2.5225, + "step": 18912 + }, + { + "epoch": 1.7134832733120429, + "grad_norm": 0.8589649796485901, + "learning_rate": 8.577297166676736e-05, + "loss": 2.8465, + "step": 18913 + }, + { + "epoch": 1.7135738714864894, + "grad_norm": 0.9913033246994019, + "learning_rate": 8.576693046577659e-05, + "loss": 2.8267, + "step": 18914 + }, + { + "epoch": 1.7136644696609364, + "grad_norm": 1.0073589086532593, + "learning_rate": 8.576088926478585e-05, + "loss": 2.0416, + "step": 18915 + }, + { + "epoch": 1.713755067835383, + "grad_norm": 0.9118175506591797, + "learning_rate": 8.575484806379508e-05, + "loss": 2.7917, + "step": 18916 + }, + { + "epoch": 1.71384566600983, + "grad_norm": 0.9644820094108582, + "learning_rate": 8.574880686280433e-05, + "loss": 2.6883, + "step": 18917 + }, + { + "epoch": 1.7139362641842766, + "grad_norm": 0.8213871121406555, + "learning_rate": 8.574276566181357e-05, + "loss": 2.6758, + "step": 18918 + }, + { + "epoch": 1.7140268623587236, + "grad_norm": 0.9092422127723694, + "learning_rate": 8.573672446082281e-05, + "loss": 2.7512, + "step": 18919 + }, + { + "epoch": 1.7141174605331702, + "grad_norm": 0.8786671757698059, + "learning_rate": 8.573068325983206e-05, + "loss": 2.8052, + "step": 18920 + }, + { + "epoch": 1.7142080587076172, + "grad_norm": 0.9166659712791443, + "learning_rate": 8.57246420588413e-05, + "loss": 2.6576, + "step": 18921 + }, + { + "epoch": 1.7142986568820637, + "grad_norm": 0.9489235281944275, + "learning_rate": 8.571860085785054e-05, + "loss": 2.8159, + "step": 18922 + }, + { + "epoch": 1.7143892550565107, + "grad_norm": 0.7798995971679688, + "learning_rate": 8.571255965685979e-05, + "loss": 1.8276, + "step": 18923 + }, + { + "epoch": 1.7144798532309573, + "grad_norm": 0.8811794519424438, + "learning_rate": 8.570651845586904e-05, + "loss": 2.628, + "step": 18924 + }, + { + "epoch": 1.7145704514054043, + "grad_norm": 0.7842978239059448, + "learning_rate": 8.570047725487827e-05, + "loss": 2.0445, + "step": 18925 + }, + { + "epoch": 1.7146610495798509, + "grad_norm": 1.035503625869751, + "learning_rate": 8.569443605388752e-05, + "loss": 2.7701, + "step": 18926 + }, + { + "epoch": 1.714751647754298, + "grad_norm": 0.9007681608200073, + "learning_rate": 8.568839485289675e-05, + "loss": 2.7882, + "step": 18927 + }, + { + "epoch": 1.7148422459287445, + "grad_norm": 0.897336483001709, + "learning_rate": 8.5682353651906e-05, + "loss": 2.6854, + "step": 18928 + }, + { + "epoch": 1.7149328441031915, + "grad_norm": 0.9328346848487854, + "learning_rate": 8.567631245091524e-05, + "loss": 2.4985, + "step": 18929 + }, + { + "epoch": 1.715023442277638, + "grad_norm": 0.8424553871154785, + "learning_rate": 8.56702712499245e-05, + "loss": 2.5571, + "step": 18930 + }, + { + "epoch": 1.715114040452085, + "grad_norm": 0.9627186059951782, + "learning_rate": 8.566423004893373e-05, + "loss": 2.747, + "step": 18931 + }, + { + "epoch": 1.7152046386265316, + "grad_norm": 0.9803282618522644, + "learning_rate": 8.565818884794298e-05, + "loss": 2.7712, + "step": 18932 + }, + { + "epoch": 1.7152952368009786, + "grad_norm": 0.8998266458511353, + "learning_rate": 8.565214764695221e-05, + "loss": 2.7621, + "step": 18933 + }, + { + "epoch": 1.7153858349754252, + "grad_norm": 0.8561972975730896, + "learning_rate": 8.564610644596146e-05, + "loss": 2.5972, + "step": 18934 + }, + { + "epoch": 1.7154764331498722, + "grad_norm": 0.9093216061592102, + "learning_rate": 8.564006524497071e-05, + "loss": 2.8866, + "step": 18935 + }, + { + "epoch": 1.7155670313243188, + "grad_norm": 0.8583789467811584, + "learning_rate": 8.563402404397994e-05, + "loss": 2.5306, + "step": 18936 + }, + { + "epoch": 1.7156576294987655, + "grad_norm": 0.851784884929657, + "learning_rate": 8.562798284298919e-05, + "loss": 2.768, + "step": 18937 + }, + { + "epoch": 1.7157482276732123, + "grad_norm": 0.8811962604522705, + "learning_rate": 8.562194164199844e-05, + "loss": 2.7531, + "step": 18938 + }, + { + "epoch": 1.7158388258476591, + "grad_norm": 0.931989312171936, + "learning_rate": 8.561590044100769e-05, + "loss": 2.5233, + "step": 18939 + }, + { + "epoch": 1.715929424022106, + "grad_norm": 0.923224151134491, + "learning_rate": 8.560985924001692e-05, + "loss": 2.6743, + "step": 18940 + }, + { + "epoch": 1.7160200221965527, + "grad_norm": 0.9451169371604919, + "learning_rate": 8.560381803902617e-05, + "loss": 2.685, + "step": 18941 + }, + { + "epoch": 1.7161106203709995, + "grad_norm": 0.9562526941299438, + "learning_rate": 8.55977768380354e-05, + "loss": 2.5371, + "step": 18942 + }, + { + "epoch": 1.7162012185454463, + "grad_norm": 1.0266019105911255, + "learning_rate": 8.559173563704465e-05, + "loss": 2.745, + "step": 18943 + }, + { + "epoch": 1.716291816719893, + "grad_norm": 0.8203122019767761, + "learning_rate": 8.558569443605388e-05, + "loss": 2.7805, + "step": 18944 + }, + { + "epoch": 1.7163824148943398, + "grad_norm": 0.9295710921287537, + "learning_rate": 8.557965323506314e-05, + "loss": 2.7847, + "step": 18945 + }, + { + "epoch": 1.7164730130687866, + "grad_norm": 0.8483569622039795, + "learning_rate": 8.557361203407238e-05, + "loss": 2.4694, + "step": 18946 + }, + { + "epoch": 1.7165636112432334, + "grad_norm": 0.965757429599762, + "learning_rate": 8.556757083308163e-05, + "loss": 2.714, + "step": 18947 + }, + { + "epoch": 1.7166542094176802, + "grad_norm": 0.874838650226593, + "learning_rate": 8.556152963209086e-05, + "loss": 2.7997, + "step": 18948 + }, + { + "epoch": 1.716744807592127, + "grad_norm": 0.9136267304420471, + "learning_rate": 8.555548843110011e-05, + "loss": 3.0864, + "step": 18949 + }, + { + "epoch": 1.7168354057665738, + "grad_norm": 0.8358362913131714, + "learning_rate": 8.554944723010934e-05, + "loss": 1.6811, + "step": 18950 + }, + { + "epoch": 1.7169260039410206, + "grad_norm": 0.9015701413154602, + "learning_rate": 8.554340602911859e-05, + "loss": 2.6155, + "step": 18951 + }, + { + "epoch": 1.7170166021154674, + "grad_norm": 0.76667720079422, + "learning_rate": 8.553736482812784e-05, + "loss": 2.1247, + "step": 18952 + }, + { + "epoch": 1.7171072002899141, + "grad_norm": 0.8612238764762878, + "learning_rate": 8.553132362713708e-05, + "loss": 2.6943, + "step": 18953 + }, + { + "epoch": 1.717197798464361, + "grad_norm": 0.9360568523406982, + "learning_rate": 8.552528242614633e-05, + "loss": 2.5863, + "step": 18954 + }, + { + "epoch": 1.7172883966388077, + "grad_norm": 0.778134286403656, + "learning_rate": 8.551924122515557e-05, + "loss": 2.1362, + "step": 18955 + }, + { + "epoch": 1.7173789948132545, + "grad_norm": 0.9452617168426514, + "learning_rate": 8.551320002416481e-05, + "loss": 2.6917, + "step": 18956 + }, + { + "epoch": 1.7174695929877013, + "grad_norm": 0.8799728155136108, + "learning_rate": 8.550715882317405e-05, + "loss": 2.7385, + "step": 18957 + }, + { + "epoch": 1.717560191162148, + "grad_norm": 0.8722020983695984, + "learning_rate": 8.55011176221833e-05, + "loss": 2.5665, + "step": 18958 + }, + { + "epoch": 1.7176507893365949, + "grad_norm": 0.9405189752578735, + "learning_rate": 8.549507642119253e-05, + "loss": 2.6382, + "step": 18959 + }, + { + "epoch": 1.7177413875110417, + "grad_norm": 0.883031964302063, + "learning_rate": 8.548903522020179e-05, + "loss": 2.7047, + "step": 18960 + }, + { + "epoch": 1.7178319856854884, + "grad_norm": 0.9080924391746521, + "learning_rate": 8.548299401921102e-05, + "loss": 2.8854, + "step": 18961 + }, + { + "epoch": 1.7179225838599352, + "grad_norm": 0.9714654684066772, + "learning_rate": 8.547695281822027e-05, + "loss": 2.7269, + "step": 18962 + }, + { + "epoch": 1.718013182034382, + "grad_norm": 0.874775230884552, + "learning_rate": 8.54709116172295e-05, + "loss": 2.6737, + "step": 18963 + }, + { + "epoch": 1.7181037802088288, + "grad_norm": 0.9411298036575317, + "learning_rate": 8.546487041623875e-05, + "loss": 2.6536, + "step": 18964 + }, + { + "epoch": 1.7181943783832756, + "grad_norm": 0.8444939851760864, + "learning_rate": 8.545882921524799e-05, + "loss": 2.7917, + "step": 18965 + }, + { + "epoch": 1.7182849765577224, + "grad_norm": 0.9311535358428955, + "learning_rate": 8.545278801425723e-05, + "loss": 2.9937, + "step": 18966 + }, + { + "epoch": 1.7183755747321692, + "grad_norm": 0.8843961954116821, + "learning_rate": 8.544674681326648e-05, + "loss": 2.7982, + "step": 18967 + }, + { + "epoch": 1.718466172906616, + "grad_norm": 1.1433886289596558, + "learning_rate": 8.544070561227573e-05, + "loss": 2.7504, + "step": 18968 + }, + { + "epoch": 1.7185567710810628, + "grad_norm": 0.8395654559135437, + "learning_rate": 8.543466441128496e-05, + "loss": 2.4958, + "step": 18969 + }, + { + "epoch": 1.7186473692555095, + "grad_norm": 0.8221590518951416, + "learning_rate": 8.542862321029421e-05, + "loss": 2.607, + "step": 18970 + }, + { + "epoch": 1.7187379674299563, + "grad_norm": 0.8850284814834595, + "learning_rate": 8.542258200930346e-05, + "loss": 2.5496, + "step": 18971 + }, + { + "epoch": 1.7188285656044031, + "grad_norm": 0.8706067800521851, + "learning_rate": 8.54165408083127e-05, + "loss": 2.569, + "step": 18972 + }, + { + "epoch": 1.71891916377885, + "grad_norm": 0.8927939534187317, + "learning_rate": 8.541049960732194e-05, + "loss": 2.8174, + "step": 18973 + }, + { + "epoch": 1.7190097619532967, + "grad_norm": 0.9168726205825806, + "learning_rate": 8.540445840633118e-05, + "loss": 2.9047, + "step": 18974 + }, + { + "epoch": 1.7191003601277435, + "grad_norm": 0.9033050537109375, + "learning_rate": 8.539841720534044e-05, + "loss": 2.522, + "step": 18975 + }, + { + "epoch": 1.7191909583021903, + "grad_norm": 0.8875633478164673, + "learning_rate": 8.539237600434967e-05, + "loss": 2.0085, + "step": 18976 + }, + { + "epoch": 1.7192815564766368, + "grad_norm": 0.8759881854057312, + "learning_rate": 8.538633480335892e-05, + "loss": 2.3835, + "step": 18977 + }, + { + "epoch": 1.7193721546510838, + "grad_norm": 0.805854856967926, + "learning_rate": 8.538029360236815e-05, + "loss": 2.3103, + "step": 18978 + }, + { + "epoch": 1.7194627528255304, + "grad_norm": 0.8923512101173401, + "learning_rate": 8.53742524013774e-05, + "loss": 2.6912, + "step": 18979 + }, + { + "epoch": 1.7195533509999774, + "grad_norm": 0.9581510424613953, + "learning_rate": 8.536821120038663e-05, + "loss": 2.6736, + "step": 18980 + }, + { + "epoch": 1.719643949174424, + "grad_norm": 0.8790852427482605, + "learning_rate": 8.536216999939588e-05, + "loss": 2.7194, + "step": 18981 + }, + { + "epoch": 1.719734547348871, + "grad_norm": 0.9209879040718079, + "learning_rate": 8.535612879840512e-05, + "loss": 2.7041, + "step": 18982 + }, + { + "epoch": 1.7198251455233176, + "grad_norm": 0.8849728107452393, + "learning_rate": 8.535008759741438e-05, + "loss": 2.552, + "step": 18983 + }, + { + "epoch": 1.7199157436977646, + "grad_norm": 0.9461813569068909, + "learning_rate": 8.534404639642361e-05, + "loss": 2.5888, + "step": 18984 + }, + { + "epoch": 1.7200063418722111, + "grad_norm": 0.9475606083869934, + "learning_rate": 8.533800519543286e-05, + "loss": 2.6361, + "step": 18985 + }, + { + "epoch": 1.7200969400466581, + "grad_norm": 1.0407235622406006, + "learning_rate": 8.53319639944421e-05, + "loss": 2.8663, + "step": 18986 + }, + { + "epoch": 1.7201875382211047, + "grad_norm": 0.8974904417991638, + "learning_rate": 8.532592279345134e-05, + "loss": 2.9608, + "step": 18987 + }, + { + "epoch": 1.7202781363955517, + "grad_norm": 0.9443309903144836, + "learning_rate": 8.531988159246059e-05, + "loss": 2.7465, + "step": 18988 + }, + { + "epoch": 1.7203687345699983, + "grad_norm": 0.9415462017059326, + "learning_rate": 8.531384039146982e-05, + "loss": 2.5854, + "step": 18989 + }, + { + "epoch": 1.7204593327444453, + "grad_norm": 0.9003146290779114, + "learning_rate": 8.530779919047908e-05, + "loss": 2.7154, + "step": 18990 + }, + { + "epoch": 1.7205499309188919, + "grad_norm": 0.9039657711982727, + "learning_rate": 8.530175798948832e-05, + "loss": 2.6784, + "step": 18991 + }, + { + "epoch": 1.7206405290933389, + "grad_norm": 0.8863382339477539, + "learning_rate": 8.529571678849756e-05, + "loss": 3.0576, + "step": 18992 + }, + { + "epoch": 1.7207311272677854, + "grad_norm": 0.8791058659553528, + "learning_rate": 8.52896755875068e-05, + "loss": 2.7233, + "step": 18993 + }, + { + "epoch": 1.7208217254422324, + "grad_norm": 0.9502424001693726, + "learning_rate": 8.528363438651605e-05, + "loss": 2.6173, + "step": 18994 + }, + { + "epoch": 1.720912323616679, + "grad_norm": 0.900592565536499, + "learning_rate": 8.527759318552528e-05, + "loss": 2.9037, + "step": 18995 + }, + { + "epoch": 1.721002921791126, + "grad_norm": 0.8412489295005798, + "learning_rate": 8.527155198453453e-05, + "loss": 2.611, + "step": 18996 + }, + { + "epoch": 1.7210935199655726, + "grad_norm": 0.8751377463340759, + "learning_rate": 8.526551078354376e-05, + "loss": 2.8493, + "step": 18997 + }, + { + "epoch": 1.7211841181400196, + "grad_norm": 0.8597036004066467, + "learning_rate": 8.525946958255302e-05, + "loss": 2.572, + "step": 18998 + }, + { + "epoch": 1.7212747163144662, + "grad_norm": 0.9182726144790649, + "learning_rate": 8.525342838156226e-05, + "loss": 2.7844, + "step": 18999 + }, + { + "epoch": 1.7213653144889132, + "grad_norm": 0.8603104948997498, + "learning_rate": 8.52473871805715e-05, + "loss": 2.7311, + "step": 19000 + }, + { + "epoch": 1.7214559126633597, + "grad_norm": 0.9451298117637634, + "learning_rate": 8.524134597958074e-05, + "loss": 2.474, + "step": 19001 + }, + { + "epoch": 1.7215465108378067, + "grad_norm": 0.8617708683013916, + "learning_rate": 8.523530477858999e-05, + "loss": 2.7616, + "step": 19002 + }, + { + "epoch": 1.7216371090122533, + "grad_norm": 0.8855604529380798, + "learning_rate": 8.522926357759923e-05, + "loss": 2.633, + "step": 19003 + }, + { + "epoch": 1.7217277071867003, + "grad_norm": 0.833337664604187, + "learning_rate": 8.522322237660847e-05, + "loss": 2.5427, + "step": 19004 + }, + { + "epoch": 1.7218183053611469, + "grad_norm": 0.8668698072433472, + "learning_rate": 8.521718117561772e-05, + "loss": 2.6281, + "step": 19005 + }, + { + "epoch": 1.721908903535594, + "grad_norm": 0.894802987575531, + "learning_rate": 8.521113997462696e-05, + "loss": 2.6769, + "step": 19006 + }, + { + "epoch": 1.7219995017100405, + "grad_norm": 0.831740140914917, + "learning_rate": 8.520509877363621e-05, + "loss": 1.9871, + "step": 19007 + }, + { + "epoch": 1.7220900998844875, + "grad_norm": 0.8844682574272156, + "learning_rate": 8.519905757264544e-05, + "loss": 2.8205, + "step": 19008 + }, + { + "epoch": 1.722180698058934, + "grad_norm": 0.9011998176574707, + "learning_rate": 8.519301637165469e-05, + "loss": 2.7241, + "step": 19009 + }, + { + "epoch": 1.722271296233381, + "grad_norm": 0.8869423866271973, + "learning_rate": 8.518697517066393e-05, + "loss": 2.7196, + "step": 19010 + }, + { + "epoch": 1.7223618944078276, + "grad_norm": 0.9472286701202393, + "learning_rate": 8.518093396967317e-05, + "loss": 2.6047, + "step": 19011 + }, + { + "epoch": 1.7224524925822746, + "grad_norm": 0.9483063220977783, + "learning_rate": 8.517489276868241e-05, + "loss": 2.5321, + "step": 19012 + }, + { + "epoch": 1.7225430907567212, + "grad_norm": 0.8935244083404541, + "learning_rate": 8.516885156769167e-05, + "loss": 2.5151, + "step": 19013 + }, + { + "epoch": 1.7226336889311682, + "grad_norm": 0.5864728689193726, + "learning_rate": 8.51628103667009e-05, + "loss": 1.3114, + "step": 19014 + }, + { + "epoch": 1.7227242871056148, + "grad_norm": 0.7455282211303711, + "learning_rate": 8.515676916571015e-05, + "loss": 1.8348, + "step": 19015 + }, + { + "epoch": 1.7228148852800618, + "grad_norm": 0.8625308871269226, + "learning_rate": 8.515072796471938e-05, + "loss": 2.8905, + "step": 19016 + }, + { + "epoch": 1.7229054834545083, + "grad_norm": 0.8108717203140259, + "learning_rate": 8.514468676372863e-05, + "loss": 2.1318, + "step": 19017 + }, + { + "epoch": 1.7229960816289551, + "grad_norm": 0.9505404829978943, + "learning_rate": 8.513864556273788e-05, + "loss": 2.8045, + "step": 19018 + }, + { + "epoch": 1.723086679803402, + "grad_norm": 0.9648869037628174, + "learning_rate": 8.513260436174711e-05, + "loss": 2.5873, + "step": 19019 + }, + { + "epoch": 1.7231772779778487, + "grad_norm": 0.9043778777122498, + "learning_rate": 8.512656316075636e-05, + "loss": 2.8872, + "step": 19020 + }, + { + "epoch": 1.7232678761522955, + "grad_norm": 0.7784208655357361, + "learning_rate": 8.512052195976561e-05, + "loss": 2.0274, + "step": 19021 + }, + { + "epoch": 1.7233584743267423, + "grad_norm": 0.8597776889801025, + "learning_rate": 8.511448075877486e-05, + "loss": 2.821, + "step": 19022 + }, + { + "epoch": 1.723449072501189, + "grad_norm": 0.9169141054153442, + "learning_rate": 8.510843955778409e-05, + "loss": 2.8179, + "step": 19023 + }, + { + "epoch": 1.7235396706756358, + "grad_norm": 0.9128818511962891, + "learning_rate": 8.510239835679334e-05, + "loss": 2.6482, + "step": 19024 + }, + { + "epoch": 1.7236302688500826, + "grad_norm": 1.0245037078857422, + "learning_rate": 8.509635715580257e-05, + "loss": 2.4712, + "step": 19025 + }, + { + "epoch": 1.7237208670245294, + "grad_norm": 0.9598806500434875, + "learning_rate": 8.509031595481182e-05, + "loss": 2.6779, + "step": 19026 + }, + { + "epoch": 1.7238114651989762, + "grad_norm": 0.7874058485031128, + "learning_rate": 8.508427475382105e-05, + "loss": 2.0296, + "step": 19027 + }, + { + "epoch": 1.723902063373423, + "grad_norm": 0.8545730113983154, + "learning_rate": 8.507823355283032e-05, + "loss": 2.9163, + "step": 19028 + }, + { + "epoch": 1.7239926615478698, + "grad_norm": 0.8385167717933655, + "learning_rate": 8.507219235183955e-05, + "loss": 2.0656, + "step": 19029 + }, + { + "epoch": 1.7240832597223166, + "grad_norm": 1.035715103149414, + "learning_rate": 8.50661511508488e-05, + "loss": 2.8402, + "step": 19030 + }, + { + "epoch": 1.7241738578967634, + "grad_norm": 0.8742390871047974, + "learning_rate": 8.506010994985803e-05, + "loss": 2.5356, + "step": 19031 + }, + { + "epoch": 1.7242644560712101, + "grad_norm": 0.9089169502258301, + "learning_rate": 8.505406874886728e-05, + "loss": 2.718, + "step": 19032 + }, + { + "epoch": 1.724355054245657, + "grad_norm": 0.9756463766098022, + "learning_rate": 8.504802754787651e-05, + "loss": 2.8257, + "step": 19033 + }, + { + "epoch": 1.7244456524201037, + "grad_norm": 0.9141187071800232, + "learning_rate": 8.504198634688576e-05, + "loss": 2.9278, + "step": 19034 + }, + { + "epoch": 1.7245362505945505, + "grad_norm": 0.9246835708618164, + "learning_rate": 8.503594514589501e-05, + "loss": 2.7672, + "step": 19035 + }, + { + "epoch": 1.7246268487689973, + "grad_norm": 0.8852809071540833, + "learning_rate": 8.502990394490426e-05, + "loss": 2.5483, + "step": 19036 + }, + { + "epoch": 1.724717446943444, + "grad_norm": 0.8932013511657715, + "learning_rate": 8.502386274391349e-05, + "loss": 2.8756, + "step": 19037 + }, + { + "epoch": 1.7248080451178909, + "grad_norm": 0.8786904811859131, + "learning_rate": 8.501782154292274e-05, + "loss": 2.6458, + "step": 19038 + }, + { + "epoch": 1.7248986432923377, + "grad_norm": 0.9147391319274902, + "learning_rate": 8.501178034193198e-05, + "loss": 2.566, + "step": 19039 + }, + { + "epoch": 1.7249892414667845, + "grad_norm": 0.9639135599136353, + "learning_rate": 8.500573914094122e-05, + "loss": 2.7056, + "step": 19040 + }, + { + "epoch": 1.7250798396412312, + "grad_norm": 0.956576943397522, + "learning_rate": 8.499969793995047e-05, + "loss": 2.59, + "step": 19041 + }, + { + "epoch": 1.725170437815678, + "grad_norm": 0.9286782145500183, + "learning_rate": 8.49936567389597e-05, + "loss": 2.7933, + "step": 19042 + }, + { + "epoch": 1.7252610359901248, + "grad_norm": 0.8798651695251465, + "learning_rate": 8.498761553796896e-05, + "loss": 2.7861, + "step": 19043 + }, + { + "epoch": 1.7253516341645716, + "grad_norm": 0.956134557723999, + "learning_rate": 8.49815743369782e-05, + "loss": 2.5679, + "step": 19044 + }, + { + "epoch": 1.7254422323390184, + "grad_norm": 0.7770836353302002, + "learning_rate": 8.497553313598744e-05, + "loss": 2.1363, + "step": 19045 + }, + { + "epoch": 1.7255328305134652, + "grad_norm": 0.9020747542381287, + "learning_rate": 8.496949193499668e-05, + "loss": 2.547, + "step": 19046 + }, + { + "epoch": 1.725623428687912, + "grad_norm": 0.9072487950325012, + "learning_rate": 8.496345073400592e-05, + "loss": 2.8397, + "step": 19047 + }, + { + "epoch": 1.7257140268623588, + "grad_norm": 0.9584402441978455, + "learning_rate": 8.495740953301516e-05, + "loss": 3.0407, + "step": 19048 + }, + { + "epoch": 1.7258046250368055, + "grad_norm": 0.964225172996521, + "learning_rate": 8.49513683320244e-05, + "loss": 2.5053, + "step": 19049 + }, + { + "epoch": 1.7258952232112523, + "grad_norm": 1.0366582870483398, + "learning_rate": 8.494532713103365e-05, + "loss": 2.874, + "step": 19050 + }, + { + "epoch": 1.7259858213856991, + "grad_norm": 0.8330137729644775, + "learning_rate": 8.49392859300429e-05, + "loss": 1.765, + "step": 19051 + }, + { + "epoch": 1.726076419560146, + "grad_norm": 0.8924700021743774, + "learning_rate": 8.493324472905214e-05, + "loss": 2.6084, + "step": 19052 + }, + { + "epoch": 1.7261670177345927, + "grad_norm": 0.8483251929283142, + "learning_rate": 8.492720352806138e-05, + "loss": 2.6253, + "step": 19053 + }, + { + "epoch": 1.7262576159090395, + "grad_norm": 0.929094135761261, + "learning_rate": 8.492116232707063e-05, + "loss": 2.7895, + "step": 19054 + }, + { + "epoch": 1.7263482140834863, + "grad_norm": 0.8693047761917114, + "learning_rate": 8.491512112607987e-05, + "loss": 2.7041, + "step": 19055 + }, + { + "epoch": 1.726438812257933, + "grad_norm": 0.9048235416412354, + "learning_rate": 8.490907992508911e-05, + "loss": 2.5535, + "step": 19056 + }, + { + "epoch": 1.7265294104323798, + "grad_norm": 0.9081422686576843, + "learning_rate": 8.490303872409835e-05, + "loss": 2.6394, + "step": 19057 + }, + { + "epoch": 1.7266200086068264, + "grad_norm": 0.8485438227653503, + "learning_rate": 8.489699752310761e-05, + "loss": 2.6304, + "step": 19058 + }, + { + "epoch": 1.7267106067812734, + "grad_norm": 1.0258077383041382, + "learning_rate": 8.489095632211684e-05, + "loss": 2.92, + "step": 19059 + }, + { + "epoch": 1.72680120495572, + "grad_norm": 0.886188268661499, + "learning_rate": 8.488491512112609e-05, + "loss": 2.6219, + "step": 19060 + }, + { + "epoch": 1.726891803130167, + "grad_norm": 0.7748475074768066, + "learning_rate": 8.487887392013532e-05, + "loss": 2.099, + "step": 19061 + }, + { + "epoch": 1.7269824013046136, + "grad_norm": 0.8088141083717346, + "learning_rate": 8.487283271914457e-05, + "loss": 2.1127, + "step": 19062 + }, + { + "epoch": 1.7270729994790606, + "grad_norm": 0.9144259691238403, + "learning_rate": 8.48667915181538e-05, + "loss": 2.5214, + "step": 19063 + }, + { + "epoch": 1.7271635976535071, + "grad_norm": 0.9579052925109863, + "learning_rate": 8.486075031716305e-05, + "loss": 3.086, + "step": 19064 + }, + { + "epoch": 1.7272541958279541, + "grad_norm": 0.8726304173469543, + "learning_rate": 8.48547091161723e-05, + "loss": 2.8655, + "step": 19065 + }, + { + "epoch": 1.7273447940024007, + "grad_norm": 0.8978464603424072, + "learning_rate": 8.484866791518155e-05, + "loss": 2.7113, + "step": 19066 + }, + { + "epoch": 1.7274353921768477, + "grad_norm": 0.897230863571167, + "learning_rate": 8.484262671419078e-05, + "loss": 2.637, + "step": 19067 + }, + { + "epoch": 1.7275259903512943, + "grad_norm": 0.9525938630104065, + "learning_rate": 8.483658551320003e-05, + "loss": 2.5869, + "step": 19068 + }, + { + "epoch": 1.7276165885257413, + "grad_norm": 0.8976401686668396, + "learning_rate": 8.483054431220926e-05, + "loss": 2.6808, + "step": 19069 + }, + { + "epoch": 1.7277071867001879, + "grad_norm": 0.9378364682197571, + "learning_rate": 8.482450311121851e-05, + "loss": 2.7588, + "step": 19070 + }, + { + "epoch": 1.7277977848746349, + "grad_norm": 0.9019719958305359, + "learning_rate": 8.481846191022776e-05, + "loss": 2.5935, + "step": 19071 + }, + { + "epoch": 1.7278883830490814, + "grad_norm": 0.8697265982627869, + "learning_rate": 8.481242070923699e-05, + "loss": 2.7694, + "step": 19072 + }, + { + "epoch": 1.7279789812235284, + "grad_norm": 0.9771739840507507, + "learning_rate": 8.480637950824625e-05, + "loss": 2.7611, + "step": 19073 + }, + { + "epoch": 1.728069579397975, + "grad_norm": 0.748912513256073, + "learning_rate": 8.480033830725549e-05, + "loss": 1.9155, + "step": 19074 + }, + { + "epoch": 1.728160177572422, + "grad_norm": 0.963290810585022, + "learning_rate": 8.479429710626474e-05, + "loss": 2.7351, + "step": 19075 + }, + { + "epoch": 1.7282507757468686, + "grad_norm": 0.8608541488647461, + "learning_rate": 8.478825590527397e-05, + "loss": 2.6419, + "step": 19076 + }, + { + "epoch": 1.7283413739213156, + "grad_norm": 0.752019464969635, + "learning_rate": 8.478221470428322e-05, + "loss": 1.9519, + "step": 19077 + }, + { + "epoch": 1.7284319720957622, + "grad_norm": 0.9257081151008606, + "learning_rate": 8.477617350329245e-05, + "loss": 2.6752, + "step": 19078 + }, + { + "epoch": 1.7285225702702092, + "grad_norm": 0.9553324580192566, + "learning_rate": 8.47701323023017e-05, + "loss": 2.559, + "step": 19079 + }, + { + "epoch": 1.7286131684446557, + "grad_norm": 0.8549805283546448, + "learning_rate": 8.476409110131095e-05, + "loss": 2.4407, + "step": 19080 + }, + { + "epoch": 1.7287037666191027, + "grad_norm": 0.8622103333473206, + "learning_rate": 8.47580499003202e-05, + "loss": 2.2237, + "step": 19081 + }, + { + "epoch": 1.7287943647935493, + "grad_norm": 0.9191164374351501, + "learning_rate": 8.475200869932943e-05, + "loss": 2.6632, + "step": 19082 + }, + { + "epoch": 1.7288849629679963, + "grad_norm": 0.8921335339546204, + "learning_rate": 8.474596749833868e-05, + "loss": 2.6266, + "step": 19083 + }, + { + "epoch": 1.7289755611424429, + "grad_norm": 0.8943267464637756, + "learning_rate": 8.473992629734791e-05, + "loss": 2.6519, + "step": 19084 + }, + { + "epoch": 1.72906615931689, + "grad_norm": 0.9106718301773071, + "learning_rate": 8.473388509635716e-05, + "loss": 2.7546, + "step": 19085 + }, + { + "epoch": 1.7291567574913365, + "grad_norm": 0.8579716682434082, + "learning_rate": 8.47278438953664e-05, + "loss": 1.9781, + "step": 19086 + }, + { + "epoch": 1.7292473556657835, + "grad_norm": 0.9210829734802246, + "learning_rate": 8.472180269437564e-05, + "loss": 2.6373, + "step": 19087 + }, + { + "epoch": 1.72933795384023, + "grad_norm": 0.9607811570167542, + "learning_rate": 8.471576149338489e-05, + "loss": 2.8384, + "step": 19088 + }, + { + "epoch": 1.729428552014677, + "grad_norm": 0.8885219097137451, + "learning_rate": 8.470972029239413e-05, + "loss": 2.6832, + "step": 19089 + }, + { + "epoch": 1.7295191501891236, + "grad_norm": 0.7665426135063171, + "learning_rate": 8.470367909140338e-05, + "loss": 2.1189, + "step": 19090 + }, + { + "epoch": 1.7296097483635706, + "grad_norm": 0.9441961646080017, + "learning_rate": 8.469763789041262e-05, + "loss": 2.7242, + "step": 19091 + }, + { + "epoch": 1.7297003465380172, + "grad_norm": 0.8241105675697327, + "learning_rate": 8.469159668942186e-05, + "loss": 2.0503, + "step": 19092 + }, + { + "epoch": 1.7297909447124642, + "grad_norm": 0.928878128528595, + "learning_rate": 8.46855554884311e-05, + "loss": 2.5705, + "step": 19093 + }, + { + "epoch": 1.7298815428869108, + "grad_norm": 0.8836233615875244, + "learning_rate": 8.467951428744035e-05, + "loss": 2.4669, + "step": 19094 + }, + { + "epoch": 1.7299721410613578, + "grad_norm": 0.9353368878364563, + "learning_rate": 8.467347308644959e-05, + "loss": 2.6679, + "step": 19095 + }, + { + "epoch": 1.7300627392358043, + "grad_norm": 0.9229956269264221, + "learning_rate": 8.466743188545884e-05, + "loss": 2.6079, + "step": 19096 + }, + { + "epoch": 1.7301533374102513, + "grad_norm": 0.9507061839103699, + "learning_rate": 8.466139068446807e-05, + "loss": 2.5866, + "step": 19097 + }, + { + "epoch": 1.730243935584698, + "grad_norm": 1.0538820028305054, + "learning_rate": 8.465534948347732e-05, + "loss": 2.6854, + "step": 19098 + }, + { + "epoch": 1.7303345337591447, + "grad_norm": 0.8101008534431458, + "learning_rate": 8.464930828248656e-05, + "loss": 1.8921, + "step": 19099 + }, + { + "epoch": 1.7304251319335915, + "grad_norm": 0.8852002024650574, + "learning_rate": 8.46432670814958e-05, + "loss": 3.0545, + "step": 19100 + }, + { + "epoch": 1.7305157301080383, + "grad_norm": 0.8744781017303467, + "learning_rate": 8.463722588050504e-05, + "loss": 2.7202, + "step": 19101 + }, + { + "epoch": 1.730606328282485, + "grad_norm": 0.9660572409629822, + "learning_rate": 8.463118467951429e-05, + "loss": 2.6944, + "step": 19102 + }, + { + "epoch": 1.7306969264569319, + "grad_norm": 0.9278613328933716, + "learning_rate": 8.462514347852353e-05, + "loss": 2.6908, + "step": 19103 + }, + { + "epoch": 1.7307875246313786, + "grad_norm": 0.8747604489326477, + "learning_rate": 8.461910227753278e-05, + "loss": 2.712, + "step": 19104 + }, + { + "epoch": 1.7308781228058254, + "grad_norm": 0.83293616771698, + "learning_rate": 8.461306107654202e-05, + "loss": 2.5842, + "step": 19105 + }, + { + "epoch": 1.7309687209802722, + "grad_norm": 0.8638909459114075, + "learning_rate": 8.460701987555126e-05, + "loss": 2.9089, + "step": 19106 + }, + { + "epoch": 1.731059319154719, + "grad_norm": 0.9167358875274658, + "learning_rate": 8.460097867456051e-05, + "loss": 2.9397, + "step": 19107 + }, + { + "epoch": 1.7311499173291658, + "grad_norm": 0.9180126190185547, + "learning_rate": 8.459493747356974e-05, + "loss": 2.7485, + "step": 19108 + }, + { + "epoch": 1.7312405155036126, + "grad_norm": 0.8919090628623962, + "learning_rate": 8.458889627257899e-05, + "loss": 2.9148, + "step": 19109 + }, + { + "epoch": 1.7313311136780594, + "grad_norm": 0.9184460043907166, + "learning_rate": 8.458285507158824e-05, + "loss": 2.7802, + "step": 19110 + }, + { + "epoch": 1.7314217118525062, + "grad_norm": 0.994452178478241, + "learning_rate": 8.457681387059749e-05, + "loss": 2.6373, + "step": 19111 + }, + { + "epoch": 1.731512310026953, + "grad_norm": 0.8847805261611938, + "learning_rate": 8.457077266960672e-05, + "loss": 2.8365, + "step": 19112 + }, + { + "epoch": 1.7316029082013997, + "grad_norm": 0.9827089309692383, + "learning_rate": 8.456473146861597e-05, + "loss": 2.3818, + "step": 19113 + }, + { + "epoch": 1.7316935063758465, + "grad_norm": 0.8713309168815613, + "learning_rate": 8.45586902676252e-05, + "loss": 2.53, + "step": 19114 + }, + { + "epoch": 1.7317841045502933, + "grad_norm": 0.8587559461593628, + "learning_rate": 8.455264906663445e-05, + "loss": 3.1286, + "step": 19115 + }, + { + "epoch": 1.73187470272474, + "grad_norm": 0.8877401947975159, + "learning_rate": 8.454660786564368e-05, + "loss": 2.5627, + "step": 19116 + }, + { + "epoch": 1.7319653008991869, + "grad_norm": 0.8683608174324036, + "learning_rate": 8.454056666465293e-05, + "loss": 2.7936, + "step": 19117 + }, + { + "epoch": 1.7320558990736337, + "grad_norm": 0.8837193846702576, + "learning_rate": 8.453452546366218e-05, + "loss": 2.9982, + "step": 19118 + }, + { + "epoch": 1.7321464972480805, + "grad_norm": 0.859865128993988, + "learning_rate": 8.452848426267143e-05, + "loss": 2.8393, + "step": 19119 + }, + { + "epoch": 1.7322370954225272, + "grad_norm": 0.9184340834617615, + "learning_rate": 8.452244306168066e-05, + "loss": 2.8045, + "step": 19120 + }, + { + "epoch": 1.732327693596974, + "grad_norm": 0.9741985201835632, + "learning_rate": 8.451640186068991e-05, + "loss": 3.0965, + "step": 19121 + }, + { + "epoch": 1.7324182917714208, + "grad_norm": 0.9460785388946533, + "learning_rate": 8.451036065969916e-05, + "loss": 2.8156, + "step": 19122 + }, + { + "epoch": 1.7325088899458676, + "grad_norm": 0.9122779965400696, + "learning_rate": 8.450431945870839e-05, + "loss": 2.7909, + "step": 19123 + }, + { + "epoch": 1.7325994881203144, + "grad_norm": 0.9066150784492493, + "learning_rate": 8.449827825771764e-05, + "loss": 2.7119, + "step": 19124 + }, + { + "epoch": 1.7326900862947612, + "grad_norm": 0.894981861114502, + "learning_rate": 8.449223705672689e-05, + "loss": 2.7893, + "step": 19125 + }, + { + "epoch": 1.732780684469208, + "grad_norm": 0.8378793001174927, + "learning_rate": 8.448619585573613e-05, + "loss": 2.6384, + "step": 19126 + }, + { + "epoch": 1.7328712826436548, + "grad_norm": 0.8818342685699463, + "learning_rate": 8.448015465474537e-05, + "loss": 2.8602, + "step": 19127 + }, + { + "epoch": 1.7329618808181015, + "grad_norm": 0.7732452750205994, + "learning_rate": 8.447411345375462e-05, + "loss": 2.0621, + "step": 19128 + }, + { + "epoch": 1.7330524789925483, + "grad_norm": 0.9482911229133606, + "learning_rate": 8.446807225276385e-05, + "loss": 2.8055, + "step": 19129 + }, + { + "epoch": 1.7331430771669951, + "grad_norm": 0.916545033454895, + "learning_rate": 8.44620310517731e-05, + "loss": 2.7032, + "step": 19130 + }, + { + "epoch": 1.733233675341442, + "grad_norm": 1.057155728340149, + "learning_rate": 8.445598985078233e-05, + "loss": 2.829, + "step": 19131 + }, + { + "epoch": 1.7333242735158887, + "grad_norm": 0.8368772268295288, + "learning_rate": 8.444994864979159e-05, + "loss": 2.9104, + "step": 19132 + }, + { + "epoch": 1.7334148716903355, + "grad_norm": 0.8539777398109436, + "learning_rate": 8.444390744880083e-05, + "loss": 2.65, + "step": 19133 + }, + { + "epoch": 1.7335054698647823, + "grad_norm": 0.8087371587753296, + "learning_rate": 8.443786624781007e-05, + "loss": 2.5653, + "step": 19134 + }, + { + "epoch": 1.733596068039229, + "grad_norm": 0.8563439846038818, + "learning_rate": 8.443182504681931e-05, + "loss": 2.7451, + "step": 19135 + }, + { + "epoch": 1.7336866662136758, + "grad_norm": 0.7816891670227051, + "learning_rate": 8.442578384582856e-05, + "loss": 2.2735, + "step": 19136 + }, + { + "epoch": 1.7337772643881226, + "grad_norm": 0.8842403292655945, + "learning_rate": 8.441974264483779e-05, + "loss": 2.7527, + "step": 19137 + }, + { + "epoch": 1.7338678625625694, + "grad_norm": 0.8719128370285034, + "learning_rate": 8.441370144384704e-05, + "loss": 2.7582, + "step": 19138 + }, + { + "epoch": 1.733958460737016, + "grad_norm": 0.9628984928131104, + "learning_rate": 8.440766024285628e-05, + "loss": 2.7013, + "step": 19139 + }, + { + "epoch": 1.734049058911463, + "grad_norm": 0.9103395342826843, + "learning_rate": 8.440161904186553e-05, + "loss": 2.8234, + "step": 19140 + }, + { + "epoch": 1.7341396570859096, + "grad_norm": 1.068960189819336, + "learning_rate": 8.439557784087478e-05, + "loss": 1.4414, + "step": 19141 + }, + { + "epoch": 1.7342302552603566, + "grad_norm": 0.7567023634910583, + "learning_rate": 8.438953663988401e-05, + "loss": 2.1277, + "step": 19142 + }, + { + "epoch": 1.7343208534348031, + "grad_norm": 0.8846742510795593, + "learning_rate": 8.438349543889326e-05, + "loss": 2.8478, + "step": 19143 + }, + { + "epoch": 1.7344114516092501, + "grad_norm": 0.9442623853683472, + "learning_rate": 8.43774542379025e-05, + "loss": 2.6842, + "step": 19144 + }, + { + "epoch": 1.7345020497836967, + "grad_norm": 0.9287198781967163, + "learning_rate": 8.437141303691174e-05, + "loss": 2.8155, + "step": 19145 + }, + { + "epoch": 1.7345926479581437, + "grad_norm": 1.012826681137085, + "learning_rate": 8.436537183592098e-05, + "loss": 2.6894, + "step": 19146 + }, + { + "epoch": 1.7346832461325903, + "grad_norm": 0.9623587131500244, + "learning_rate": 8.435933063493024e-05, + "loss": 2.8004, + "step": 19147 + }, + { + "epoch": 1.7347738443070373, + "grad_norm": 0.9256280064582825, + "learning_rate": 8.435328943393947e-05, + "loss": 3.0303, + "step": 19148 + }, + { + "epoch": 1.7348644424814839, + "grad_norm": 0.8737202286720276, + "learning_rate": 8.434724823294872e-05, + "loss": 2.8359, + "step": 19149 + }, + { + "epoch": 1.7349550406559309, + "grad_norm": 0.9106942415237427, + "learning_rate": 8.434120703195795e-05, + "loss": 2.8537, + "step": 19150 + }, + { + "epoch": 1.7350456388303774, + "grad_norm": 0.7978126406669617, + "learning_rate": 8.43351658309672e-05, + "loss": 2.4753, + "step": 19151 + }, + { + "epoch": 1.7351362370048244, + "grad_norm": 0.9652724266052246, + "learning_rate": 8.432912462997644e-05, + "loss": 2.449, + "step": 19152 + }, + { + "epoch": 1.735226835179271, + "grad_norm": 0.9131665229797363, + "learning_rate": 8.432308342898568e-05, + "loss": 2.7828, + "step": 19153 + }, + { + "epoch": 1.735317433353718, + "grad_norm": 0.9007995128631592, + "learning_rate": 8.431704222799493e-05, + "loss": 2.7006, + "step": 19154 + }, + { + "epoch": 1.7354080315281646, + "grad_norm": 0.8340973854064941, + "learning_rate": 8.431100102700418e-05, + "loss": 2.7351, + "step": 19155 + }, + { + "epoch": 1.7354986297026116, + "grad_norm": 0.9264453649520874, + "learning_rate": 8.430495982601341e-05, + "loss": 2.8166, + "step": 19156 + }, + { + "epoch": 1.7355892278770582, + "grad_norm": 0.8788025379180908, + "learning_rate": 8.429891862502266e-05, + "loss": 2.5151, + "step": 19157 + }, + { + "epoch": 1.7356798260515052, + "grad_norm": 0.8284144997596741, + "learning_rate": 8.429287742403191e-05, + "loss": 2.6414, + "step": 19158 + }, + { + "epoch": 1.7357704242259517, + "grad_norm": 0.8978042602539062, + "learning_rate": 8.428683622304114e-05, + "loss": 2.967, + "step": 19159 + }, + { + "epoch": 1.7358610224003987, + "grad_norm": 0.8697168827056885, + "learning_rate": 8.428079502205039e-05, + "loss": 2.6811, + "step": 19160 + }, + { + "epoch": 1.7359516205748453, + "grad_norm": 0.9363920092582703, + "learning_rate": 8.427475382105962e-05, + "loss": 2.9006, + "step": 19161 + }, + { + "epoch": 1.7360422187492923, + "grad_norm": 0.9355846047401428, + "learning_rate": 8.426871262006888e-05, + "loss": 2.7286, + "step": 19162 + }, + { + "epoch": 1.7361328169237389, + "grad_norm": 0.8906271457672119, + "learning_rate": 8.426267141907812e-05, + "loss": 2.3926, + "step": 19163 + }, + { + "epoch": 1.736223415098186, + "grad_norm": 0.9063444137573242, + "learning_rate": 8.425663021808737e-05, + "loss": 2.6991, + "step": 19164 + }, + { + "epoch": 1.7363140132726325, + "grad_norm": 0.8576631546020508, + "learning_rate": 8.42505890170966e-05, + "loss": 2.6526, + "step": 19165 + }, + { + "epoch": 1.7364046114470795, + "grad_norm": 0.757361114025116, + "learning_rate": 8.424454781610585e-05, + "loss": 1.9889, + "step": 19166 + }, + { + "epoch": 1.736495209621526, + "grad_norm": 0.8729048371315002, + "learning_rate": 8.423850661511508e-05, + "loss": 2.4705, + "step": 19167 + }, + { + "epoch": 1.736585807795973, + "grad_norm": 0.8454176783561707, + "learning_rate": 8.423246541412433e-05, + "loss": 2.481, + "step": 19168 + }, + { + "epoch": 1.7366764059704196, + "grad_norm": 0.8279584646224976, + "learning_rate": 8.422642421313356e-05, + "loss": 2.1013, + "step": 19169 + }, + { + "epoch": 1.7367670041448666, + "grad_norm": 0.88612961769104, + "learning_rate": 8.422038301214282e-05, + "loss": 2.8068, + "step": 19170 + }, + { + "epoch": 1.7368576023193132, + "grad_norm": 0.9126024842262268, + "learning_rate": 8.421434181115206e-05, + "loss": 2.6846, + "step": 19171 + }, + { + "epoch": 1.7369482004937602, + "grad_norm": 0.9236650466918945, + "learning_rate": 8.42083006101613e-05, + "loss": 2.8716, + "step": 19172 + }, + { + "epoch": 1.7370387986682068, + "grad_norm": 0.8871752619743347, + "learning_rate": 8.420225940917055e-05, + "loss": 2.5611, + "step": 19173 + }, + { + "epoch": 1.7371293968426538, + "grad_norm": 0.866802990436554, + "learning_rate": 8.419621820817979e-05, + "loss": 2.8066, + "step": 19174 + }, + { + "epoch": 1.7372199950171003, + "grad_norm": 0.8931326866149902, + "learning_rate": 8.419017700718904e-05, + "loss": 2.8999, + "step": 19175 + }, + { + "epoch": 1.7373105931915473, + "grad_norm": 0.9059023261070251, + "learning_rate": 8.418413580619827e-05, + "loss": 2.9347, + "step": 19176 + }, + { + "epoch": 1.737401191365994, + "grad_norm": 0.8953938484191895, + "learning_rate": 8.417809460520753e-05, + "loss": 2.5252, + "step": 19177 + }, + { + "epoch": 1.737491789540441, + "grad_norm": 0.9136295914649963, + "learning_rate": 8.417205340421677e-05, + "loss": 2.5172, + "step": 19178 + }, + { + "epoch": 1.7375823877148875, + "grad_norm": 0.9120106101036072, + "learning_rate": 8.416601220322601e-05, + "loss": 2.6813, + "step": 19179 + }, + { + "epoch": 1.7376729858893343, + "grad_norm": 0.9404539465904236, + "learning_rate": 8.415997100223525e-05, + "loss": 2.7433, + "step": 19180 + }, + { + "epoch": 1.737763584063781, + "grad_norm": 0.8971822261810303, + "learning_rate": 8.41539298012445e-05, + "loss": 2.725, + "step": 19181 + }, + { + "epoch": 1.7378541822382279, + "grad_norm": 0.8768846392631531, + "learning_rate": 8.414788860025373e-05, + "loss": 2.6152, + "step": 19182 + }, + { + "epoch": 1.7379447804126746, + "grad_norm": 0.7770742177963257, + "learning_rate": 8.414184739926298e-05, + "loss": 1.928, + "step": 19183 + }, + { + "epoch": 1.7380353785871214, + "grad_norm": 0.6471853256225586, + "learning_rate": 8.413580619827221e-05, + "loss": 1.2982, + "step": 19184 + }, + { + "epoch": 1.7381259767615682, + "grad_norm": 0.8910969495773315, + "learning_rate": 8.412976499728147e-05, + "loss": 2.6956, + "step": 19185 + }, + { + "epoch": 1.738216574936015, + "grad_norm": 0.8388199210166931, + "learning_rate": 8.41237237962907e-05, + "loss": 2.5829, + "step": 19186 + }, + { + "epoch": 1.7383071731104618, + "grad_norm": 0.8989675641059875, + "learning_rate": 8.411768259529995e-05, + "loss": 2.6469, + "step": 19187 + }, + { + "epoch": 1.7383977712849086, + "grad_norm": 0.7320691347122192, + "learning_rate": 8.411164139430919e-05, + "loss": 1.8986, + "step": 19188 + }, + { + "epoch": 1.7384883694593554, + "grad_norm": 0.9076601266860962, + "learning_rate": 8.410560019331843e-05, + "loss": 2.7594, + "step": 19189 + }, + { + "epoch": 1.7385789676338022, + "grad_norm": 0.9844849705696106, + "learning_rate": 8.409955899232768e-05, + "loss": 2.8208, + "step": 19190 + }, + { + "epoch": 1.738669565808249, + "grad_norm": 0.8771894574165344, + "learning_rate": 8.409351779133692e-05, + "loss": 2.9565, + "step": 19191 + }, + { + "epoch": 1.7387601639826957, + "grad_norm": 1.0234041213989258, + "learning_rate": 8.408747659034616e-05, + "loss": 2.7563, + "step": 19192 + }, + { + "epoch": 1.7388507621571425, + "grad_norm": 0.9062701463699341, + "learning_rate": 8.408143538935541e-05, + "loss": 2.7018, + "step": 19193 + }, + { + "epoch": 1.7389413603315893, + "grad_norm": 0.935956597328186, + "learning_rate": 8.407539418836466e-05, + "loss": 2.6761, + "step": 19194 + }, + { + "epoch": 1.739031958506036, + "grad_norm": 0.9122586846351624, + "learning_rate": 8.406935298737389e-05, + "loss": 2.666, + "step": 19195 + }, + { + "epoch": 1.7391225566804829, + "grad_norm": 0.9247958660125732, + "learning_rate": 8.406331178638314e-05, + "loss": 2.8604, + "step": 19196 + }, + { + "epoch": 1.7392131548549297, + "grad_norm": 0.8723556399345398, + "learning_rate": 8.405727058539237e-05, + "loss": 2.7338, + "step": 19197 + }, + { + "epoch": 1.7393037530293765, + "grad_norm": 0.8607945442199707, + "learning_rate": 8.405122938440162e-05, + "loss": 2.6376, + "step": 19198 + }, + { + "epoch": 1.7393943512038232, + "grad_norm": 0.9485868811607361, + "learning_rate": 8.404518818341086e-05, + "loss": 2.7758, + "step": 19199 + }, + { + "epoch": 1.73948494937827, + "grad_norm": 0.9619538187980652, + "learning_rate": 8.403914698242012e-05, + "loss": 2.7012, + "step": 19200 + }, + { + "epoch": 1.7395755475527168, + "grad_norm": 0.9005190134048462, + "learning_rate": 8.403310578142935e-05, + "loss": 2.8399, + "step": 19201 + }, + { + "epoch": 1.7396661457271636, + "grad_norm": 0.9543351531028748, + "learning_rate": 8.40270645804386e-05, + "loss": 2.6195, + "step": 19202 + }, + { + "epoch": 1.7397567439016104, + "grad_norm": 0.927223265171051, + "learning_rate": 8.402102337944783e-05, + "loss": 2.6176, + "step": 19203 + }, + { + "epoch": 1.7398473420760572, + "grad_norm": 0.896538257598877, + "learning_rate": 8.401498217845708e-05, + "loss": 2.5429, + "step": 19204 + }, + { + "epoch": 1.739937940250504, + "grad_norm": 0.8950377702713013, + "learning_rate": 8.400894097746633e-05, + "loss": 2.4376, + "step": 19205 + }, + { + "epoch": 1.7400285384249508, + "grad_norm": 0.883423924446106, + "learning_rate": 8.400289977647556e-05, + "loss": 2.6468, + "step": 19206 + }, + { + "epoch": 1.7401191365993975, + "grad_norm": 0.9300658106803894, + "learning_rate": 8.399685857548481e-05, + "loss": 2.7467, + "step": 19207 + }, + { + "epoch": 1.7402097347738443, + "grad_norm": 0.8618428707122803, + "learning_rate": 8.399081737449406e-05, + "loss": 2.6435, + "step": 19208 + }, + { + "epoch": 1.7403003329482911, + "grad_norm": 0.9185193777084351, + "learning_rate": 8.39847761735033e-05, + "loss": 2.8302, + "step": 19209 + }, + { + "epoch": 1.740390931122738, + "grad_norm": 0.912300705909729, + "learning_rate": 8.397873497251254e-05, + "loss": 2.6595, + "step": 19210 + }, + { + "epoch": 1.7404815292971847, + "grad_norm": 0.8821260333061218, + "learning_rate": 8.397269377152179e-05, + "loss": 2.689, + "step": 19211 + }, + { + "epoch": 1.7405721274716315, + "grad_norm": 0.9009373188018799, + "learning_rate": 8.396665257053102e-05, + "loss": 2.9567, + "step": 19212 + }, + { + "epoch": 1.7406627256460783, + "grad_norm": 0.9355631470680237, + "learning_rate": 8.396061136954027e-05, + "loss": 2.3891, + "step": 19213 + }, + { + "epoch": 1.740753323820525, + "grad_norm": 0.9540493488311768, + "learning_rate": 8.39545701685495e-05, + "loss": 2.4971, + "step": 19214 + }, + { + "epoch": 1.7408439219949718, + "grad_norm": 0.9153216481208801, + "learning_rate": 8.394852896755876e-05, + "loss": 2.8601, + "step": 19215 + }, + { + "epoch": 1.7409345201694186, + "grad_norm": 0.9057450294494629, + "learning_rate": 8.3942487766568e-05, + "loss": 2.7546, + "step": 19216 + }, + { + "epoch": 1.7410251183438654, + "grad_norm": 0.8746894598007202, + "learning_rate": 8.393644656557725e-05, + "loss": 2.8386, + "step": 19217 + }, + { + "epoch": 1.7411157165183122, + "grad_norm": 0.95604407787323, + "learning_rate": 8.393040536458648e-05, + "loss": 2.779, + "step": 19218 + }, + { + "epoch": 1.741206314692759, + "grad_norm": 0.8368790149688721, + "learning_rate": 8.392436416359573e-05, + "loss": 2.7374, + "step": 19219 + }, + { + "epoch": 1.7412969128672056, + "grad_norm": 0.8985276818275452, + "learning_rate": 8.391832296260496e-05, + "loss": 2.6037, + "step": 19220 + }, + { + "epoch": 1.7413875110416526, + "grad_norm": 0.8526079654693604, + "learning_rate": 8.391228176161421e-05, + "loss": 2.5284, + "step": 19221 + }, + { + "epoch": 1.7414781092160991, + "grad_norm": 0.8901728391647339, + "learning_rate": 8.390624056062346e-05, + "loss": 2.6729, + "step": 19222 + }, + { + "epoch": 1.7415687073905461, + "grad_norm": 0.8238480091094971, + "learning_rate": 8.39001993596327e-05, + "loss": 2.1725, + "step": 19223 + }, + { + "epoch": 1.7416593055649927, + "grad_norm": 0.8067852854728699, + "learning_rate": 8.389415815864194e-05, + "loss": 2.5891, + "step": 19224 + }, + { + "epoch": 1.7417499037394397, + "grad_norm": 1.0133583545684814, + "learning_rate": 8.388811695765119e-05, + "loss": 2.6811, + "step": 19225 + }, + { + "epoch": 1.7418405019138863, + "grad_norm": 0.9197369813919067, + "learning_rate": 8.388207575666043e-05, + "loss": 2.6866, + "step": 19226 + }, + { + "epoch": 1.7419311000883333, + "grad_norm": 0.8177459239959717, + "learning_rate": 8.387603455566967e-05, + "loss": 2.4456, + "step": 19227 + }, + { + "epoch": 1.7420216982627799, + "grad_norm": 0.9390717148780823, + "learning_rate": 8.386999335467891e-05, + "loss": 2.7345, + "step": 19228 + }, + { + "epoch": 1.7421122964372269, + "grad_norm": 0.9125722646713257, + "learning_rate": 8.386395215368815e-05, + "loss": 2.9634, + "step": 19229 + }, + { + "epoch": 1.7422028946116734, + "grad_norm": 0.7872886061668396, + "learning_rate": 8.385791095269741e-05, + "loss": 1.872, + "step": 19230 + }, + { + "epoch": 1.7422934927861204, + "grad_norm": 0.8309394717216492, + "learning_rate": 8.385186975170664e-05, + "loss": 1.9143, + "step": 19231 + }, + { + "epoch": 1.742384090960567, + "grad_norm": 0.8019067049026489, + "learning_rate": 8.384582855071589e-05, + "loss": 2.1266, + "step": 19232 + }, + { + "epoch": 1.742474689135014, + "grad_norm": 0.8978244066238403, + "learning_rate": 8.383978734972513e-05, + "loss": 2.6451, + "step": 19233 + }, + { + "epoch": 1.7425652873094606, + "grad_norm": 0.8646270036697388, + "learning_rate": 8.383374614873437e-05, + "loss": 2.7138, + "step": 19234 + }, + { + "epoch": 1.7426558854839076, + "grad_norm": 0.9384584426879883, + "learning_rate": 8.382770494774361e-05, + "loss": 2.5896, + "step": 19235 + }, + { + "epoch": 1.7427464836583542, + "grad_norm": 0.9100878238677979, + "learning_rate": 8.382166374675286e-05, + "loss": 2.5763, + "step": 19236 + }, + { + "epoch": 1.7428370818328012, + "grad_norm": 0.9292876124382019, + "learning_rate": 8.38156225457621e-05, + "loss": 2.6962, + "step": 19237 + }, + { + "epoch": 1.7429276800072477, + "grad_norm": 0.9231335520744324, + "learning_rate": 8.380958134477135e-05, + "loss": 2.8436, + "step": 19238 + }, + { + "epoch": 1.7430182781816947, + "grad_norm": 0.9459805488586426, + "learning_rate": 8.380354014378058e-05, + "loss": 2.8149, + "step": 19239 + }, + { + "epoch": 1.7431088763561413, + "grad_norm": 0.9823189377784729, + "learning_rate": 8.379749894278983e-05, + "loss": 2.725, + "step": 19240 + }, + { + "epoch": 1.7431994745305883, + "grad_norm": 0.8285547494888306, + "learning_rate": 8.379145774179908e-05, + "loss": 1.94, + "step": 19241 + }, + { + "epoch": 1.7432900727050349, + "grad_norm": 0.871583104133606, + "learning_rate": 8.378541654080831e-05, + "loss": 2.776, + "step": 19242 + }, + { + "epoch": 1.743380670879482, + "grad_norm": 0.8755044937133789, + "learning_rate": 8.377937533981756e-05, + "loss": 2.5449, + "step": 19243 + }, + { + "epoch": 1.7434712690539285, + "grad_norm": 0.8874847292900085, + "learning_rate": 8.37733341388268e-05, + "loss": 2.5617, + "step": 19244 + }, + { + "epoch": 1.7435618672283755, + "grad_norm": 0.9491962790489197, + "learning_rate": 8.376729293783606e-05, + "loss": 2.766, + "step": 19245 + }, + { + "epoch": 1.743652465402822, + "grad_norm": 0.9165169596672058, + "learning_rate": 8.376125173684529e-05, + "loss": 2.7549, + "step": 19246 + }, + { + "epoch": 1.743743063577269, + "grad_norm": 0.9537692666053772, + "learning_rate": 8.375521053585454e-05, + "loss": 3.0953, + "step": 19247 + }, + { + "epoch": 1.7438336617517156, + "grad_norm": 0.8087518215179443, + "learning_rate": 8.374916933486377e-05, + "loss": 1.9721, + "step": 19248 + }, + { + "epoch": 1.7439242599261626, + "grad_norm": 0.9176576733589172, + "learning_rate": 8.374312813387302e-05, + "loss": 3.2416, + "step": 19249 + }, + { + "epoch": 1.7440148581006092, + "grad_norm": 0.9141443967819214, + "learning_rate": 8.373708693288225e-05, + "loss": 2.683, + "step": 19250 + }, + { + "epoch": 1.7441054562750562, + "grad_norm": 0.9458569288253784, + "learning_rate": 8.37310457318915e-05, + "loss": 2.7551, + "step": 19251 + }, + { + "epoch": 1.7441960544495028, + "grad_norm": 0.9221270680427551, + "learning_rate": 8.372500453090075e-05, + "loss": 2.9545, + "step": 19252 + }, + { + "epoch": 1.7442866526239498, + "grad_norm": 0.8971509337425232, + "learning_rate": 8.371896332991e-05, + "loss": 2.7768, + "step": 19253 + }, + { + "epoch": 1.7443772507983963, + "grad_norm": 0.8814634084701538, + "learning_rate": 8.371292212891923e-05, + "loss": 2.5856, + "step": 19254 + }, + { + "epoch": 1.7444678489728433, + "grad_norm": 1.0218563079833984, + "learning_rate": 8.370688092792848e-05, + "loss": 2.5651, + "step": 19255 + }, + { + "epoch": 1.74455844714729, + "grad_norm": 0.9411182999610901, + "learning_rate": 8.370083972693771e-05, + "loss": 2.7216, + "step": 19256 + }, + { + "epoch": 1.744649045321737, + "grad_norm": 0.9210218191146851, + "learning_rate": 8.369479852594696e-05, + "loss": 2.8335, + "step": 19257 + }, + { + "epoch": 1.7447396434961835, + "grad_norm": 1.0173320770263672, + "learning_rate": 8.368875732495621e-05, + "loss": 2.7501, + "step": 19258 + }, + { + "epoch": 1.7448302416706305, + "grad_norm": 0.8855067491531372, + "learning_rate": 8.368271612396544e-05, + "loss": 2.6054, + "step": 19259 + }, + { + "epoch": 1.744920839845077, + "grad_norm": 0.9160159230232239, + "learning_rate": 8.36766749229747e-05, + "loss": 2.5239, + "step": 19260 + }, + { + "epoch": 1.7450114380195239, + "grad_norm": 0.8357824087142944, + "learning_rate": 8.367063372198394e-05, + "loss": 2.5849, + "step": 19261 + }, + { + "epoch": 1.7451020361939706, + "grad_norm": 1.1027522087097168, + "learning_rate": 8.366459252099318e-05, + "loss": 2.9088, + "step": 19262 + }, + { + "epoch": 1.7451926343684174, + "grad_norm": 0.8473795652389526, + "learning_rate": 8.365855132000242e-05, + "loss": 2.1895, + "step": 19263 + }, + { + "epoch": 1.7452832325428642, + "grad_norm": 0.8776654005050659, + "learning_rate": 8.365251011901167e-05, + "loss": 2.8479, + "step": 19264 + }, + { + "epoch": 1.745373830717311, + "grad_norm": 0.8956475853919983, + "learning_rate": 8.36464689180209e-05, + "loss": 2.5706, + "step": 19265 + }, + { + "epoch": 1.7454644288917578, + "grad_norm": 0.8110536932945251, + "learning_rate": 8.364042771703015e-05, + "loss": 2.0381, + "step": 19266 + }, + { + "epoch": 1.7455550270662046, + "grad_norm": 0.8886701464653015, + "learning_rate": 8.36343865160394e-05, + "loss": 2.7349, + "step": 19267 + }, + { + "epoch": 1.7456456252406514, + "grad_norm": 0.8036133050918579, + "learning_rate": 8.362834531504864e-05, + "loss": 1.9987, + "step": 19268 + }, + { + "epoch": 1.7457362234150982, + "grad_norm": 0.8796784281730652, + "learning_rate": 8.362230411405788e-05, + "loss": 2.8033, + "step": 19269 + }, + { + "epoch": 1.745826821589545, + "grad_norm": 0.95008385181427, + "learning_rate": 8.361626291306712e-05, + "loss": 3.3251, + "step": 19270 + }, + { + "epoch": 1.7459174197639917, + "grad_norm": 0.8990117311477661, + "learning_rate": 8.361022171207636e-05, + "loss": 2.8704, + "step": 19271 + }, + { + "epoch": 1.7460080179384385, + "grad_norm": 0.9117347002029419, + "learning_rate": 8.36041805110856e-05, + "loss": 2.6676, + "step": 19272 + }, + { + "epoch": 1.7460986161128853, + "grad_norm": 0.9047205448150635, + "learning_rate": 8.359813931009485e-05, + "loss": 2.7625, + "step": 19273 + }, + { + "epoch": 1.746189214287332, + "grad_norm": 0.919909656047821, + "learning_rate": 8.359209810910409e-05, + "loss": 2.7662, + "step": 19274 + }, + { + "epoch": 1.7462798124617789, + "grad_norm": 0.9003329277038574, + "learning_rate": 8.358605690811334e-05, + "loss": 2.842, + "step": 19275 + }, + { + "epoch": 1.7463704106362257, + "grad_norm": 1.1111700534820557, + "learning_rate": 8.358001570712258e-05, + "loss": 3.0209, + "step": 19276 + }, + { + "epoch": 1.7464610088106725, + "grad_norm": 0.8869321346282959, + "learning_rate": 8.357397450613183e-05, + "loss": 2.7237, + "step": 19277 + }, + { + "epoch": 1.7465516069851192, + "grad_norm": 0.7371549010276794, + "learning_rate": 8.356793330514106e-05, + "loss": 2.0293, + "step": 19278 + }, + { + "epoch": 1.746642205159566, + "grad_norm": 0.9197916388511658, + "learning_rate": 8.356189210415031e-05, + "loss": 2.8453, + "step": 19279 + }, + { + "epoch": 1.7467328033340128, + "grad_norm": 0.7504376769065857, + "learning_rate": 8.355585090315955e-05, + "loss": 1.8751, + "step": 19280 + }, + { + "epoch": 1.7468234015084596, + "grad_norm": 0.9112528562545776, + "learning_rate": 8.35498097021688e-05, + "loss": 2.7336, + "step": 19281 + }, + { + "epoch": 1.7469139996829064, + "grad_norm": 0.8972108960151672, + "learning_rate": 8.354376850117804e-05, + "loss": 2.6508, + "step": 19282 + }, + { + "epoch": 1.7470045978573532, + "grad_norm": 0.9492946267127991, + "learning_rate": 8.353772730018729e-05, + "loss": 2.9995, + "step": 19283 + }, + { + "epoch": 1.7470951960318, + "grad_norm": 0.9025574326515198, + "learning_rate": 8.353168609919652e-05, + "loss": 2.6327, + "step": 19284 + }, + { + "epoch": 1.7471857942062468, + "grad_norm": 0.9113127589225769, + "learning_rate": 8.352564489820577e-05, + "loss": 2.6426, + "step": 19285 + }, + { + "epoch": 1.7472763923806935, + "grad_norm": 0.8432759642601013, + "learning_rate": 8.3519603697215e-05, + "loss": 2.5803, + "step": 19286 + }, + { + "epoch": 1.7473669905551403, + "grad_norm": 0.9162805676460266, + "learning_rate": 8.351356249622425e-05, + "loss": 2.772, + "step": 19287 + }, + { + "epoch": 1.7474575887295871, + "grad_norm": 0.8424472808837891, + "learning_rate": 8.350752129523349e-05, + "loss": 2.5639, + "step": 19288 + }, + { + "epoch": 1.747548186904034, + "grad_norm": 0.8589915037155151, + "learning_rate": 8.350148009424273e-05, + "loss": 2.7325, + "step": 19289 + }, + { + "epoch": 1.7476387850784807, + "grad_norm": 0.8663289546966553, + "learning_rate": 8.349543889325198e-05, + "loss": 2.6616, + "step": 19290 + }, + { + "epoch": 1.7477293832529275, + "grad_norm": 0.8623829483985901, + "learning_rate": 8.348939769226123e-05, + "loss": 2.5489, + "step": 19291 + }, + { + "epoch": 1.7478199814273743, + "grad_norm": 0.8843762874603271, + "learning_rate": 8.348335649127046e-05, + "loss": 1.8376, + "step": 19292 + }, + { + "epoch": 1.747910579601821, + "grad_norm": 1.1315410137176514, + "learning_rate": 8.347731529027971e-05, + "loss": 2.7529, + "step": 19293 + }, + { + "epoch": 1.7480011777762678, + "grad_norm": 0.9278745055198669, + "learning_rate": 8.347127408928896e-05, + "loss": 2.8159, + "step": 19294 + }, + { + "epoch": 1.7480917759507146, + "grad_norm": 0.9365713000297546, + "learning_rate": 8.346523288829819e-05, + "loss": 2.8058, + "step": 19295 + }, + { + "epoch": 1.7481823741251614, + "grad_norm": 0.8480716943740845, + "learning_rate": 8.345919168730744e-05, + "loss": 2.739, + "step": 19296 + }, + { + "epoch": 1.7482729722996082, + "grad_norm": 0.8885935544967651, + "learning_rate": 8.345315048631669e-05, + "loss": 2.7152, + "step": 19297 + }, + { + "epoch": 1.748363570474055, + "grad_norm": 1.0261192321777344, + "learning_rate": 8.344710928532594e-05, + "loss": 2.6002, + "step": 19298 + }, + { + "epoch": 1.7484541686485018, + "grad_norm": 0.8519473075866699, + "learning_rate": 8.344106808433517e-05, + "loss": 2.7973, + "step": 19299 + }, + { + "epoch": 1.7485447668229486, + "grad_norm": 0.8827699422836304, + "learning_rate": 8.343502688334442e-05, + "loss": 2.5078, + "step": 19300 + }, + { + "epoch": 1.7486353649973951, + "grad_norm": 0.9004347324371338, + "learning_rate": 8.342898568235365e-05, + "loss": 2.6614, + "step": 19301 + }, + { + "epoch": 1.7487259631718421, + "grad_norm": 0.9375503659248352, + "learning_rate": 8.34229444813629e-05, + "loss": 2.7208, + "step": 19302 + }, + { + "epoch": 1.7488165613462887, + "grad_norm": 0.845129668712616, + "learning_rate": 8.341690328037213e-05, + "loss": 2.757, + "step": 19303 + }, + { + "epoch": 1.7489071595207357, + "grad_norm": 0.8420230150222778, + "learning_rate": 8.341086207938138e-05, + "loss": 2.3873, + "step": 19304 + }, + { + "epoch": 1.7489977576951823, + "grad_norm": 0.8700177669525146, + "learning_rate": 8.340482087839063e-05, + "loss": 2.765, + "step": 19305 + }, + { + "epoch": 1.7490883558696293, + "grad_norm": 0.973498523235321, + "learning_rate": 8.339877967739988e-05, + "loss": 2.831, + "step": 19306 + }, + { + "epoch": 1.7491789540440759, + "grad_norm": 0.8951279520988464, + "learning_rate": 8.339273847640911e-05, + "loss": 2.669, + "step": 19307 + }, + { + "epoch": 1.7492695522185229, + "grad_norm": 1.0679785013198853, + "learning_rate": 8.338669727541836e-05, + "loss": 2.7473, + "step": 19308 + }, + { + "epoch": 1.7493601503929694, + "grad_norm": 0.9243155121803284, + "learning_rate": 8.33806560744276e-05, + "loss": 2.1408, + "step": 19309 + }, + { + "epoch": 1.7494507485674164, + "grad_norm": 0.9021610617637634, + "learning_rate": 8.337461487343684e-05, + "loss": 2.6115, + "step": 19310 + }, + { + "epoch": 1.749541346741863, + "grad_norm": 0.9103171825408936, + "learning_rate": 8.336857367244609e-05, + "loss": 2.6911, + "step": 19311 + }, + { + "epoch": 1.74963194491631, + "grad_norm": 0.9098116755485535, + "learning_rate": 8.336253247145533e-05, + "loss": 2.6023, + "step": 19312 + }, + { + "epoch": 1.7497225430907566, + "grad_norm": 0.8961266875267029, + "learning_rate": 8.335649127046458e-05, + "loss": 2.432, + "step": 19313 + }, + { + "epoch": 1.7498131412652036, + "grad_norm": 0.9848847985267639, + "learning_rate": 8.335045006947382e-05, + "loss": 2.682, + "step": 19314 + }, + { + "epoch": 1.7499037394396502, + "grad_norm": 0.9128042459487915, + "learning_rate": 8.334440886848306e-05, + "loss": 2.4755, + "step": 19315 + }, + { + "epoch": 1.7499943376140972, + "grad_norm": 0.9310892820358276, + "learning_rate": 8.33383676674923e-05, + "loss": 2.624, + "step": 19316 + }, + { + "epoch": 1.7500849357885437, + "grad_norm": 0.9036229252815247, + "learning_rate": 8.333232646650155e-05, + "loss": 2.5942, + "step": 19317 + }, + { + "epoch": 1.7501755339629907, + "grad_norm": 0.911467432975769, + "learning_rate": 8.332628526551078e-05, + "loss": 2.7132, + "step": 19318 + }, + { + "epoch": 1.7502661321374373, + "grad_norm": 0.9256231784820557, + "learning_rate": 8.332024406452003e-05, + "loss": 2.8424, + "step": 19319 + }, + { + "epoch": 1.7503567303118843, + "grad_norm": 0.8575384616851807, + "learning_rate": 8.331420286352927e-05, + "loss": 2.5707, + "step": 19320 + }, + { + "epoch": 1.750447328486331, + "grad_norm": 0.8596173524856567, + "learning_rate": 8.330816166253852e-05, + "loss": 2.7576, + "step": 19321 + }, + { + "epoch": 1.750537926660778, + "grad_norm": 0.6198755502700806, + "learning_rate": 8.330212046154776e-05, + "loss": 1.2994, + "step": 19322 + }, + { + "epoch": 1.7506285248352245, + "grad_norm": 0.7852420806884766, + "learning_rate": 8.3296079260557e-05, + "loss": 2.0343, + "step": 19323 + }, + { + "epoch": 1.7507191230096715, + "grad_norm": 0.8953802585601807, + "learning_rate": 8.329003805956624e-05, + "loss": 2.8149, + "step": 19324 + }, + { + "epoch": 1.750809721184118, + "grad_norm": 0.9230308532714844, + "learning_rate": 8.328399685857549e-05, + "loss": 2.8085, + "step": 19325 + }, + { + "epoch": 1.750900319358565, + "grad_norm": 0.7680663466453552, + "learning_rate": 8.327795565758473e-05, + "loss": 2.0827, + "step": 19326 + }, + { + "epoch": 1.7509909175330116, + "grad_norm": 1.06729257106781, + "learning_rate": 8.327191445659398e-05, + "loss": 2.305, + "step": 19327 + }, + { + "epoch": 1.7510815157074586, + "grad_norm": 0.9513075947761536, + "learning_rate": 8.326587325560323e-05, + "loss": 2.5735, + "step": 19328 + }, + { + "epoch": 1.7511721138819052, + "grad_norm": 0.8566990494728088, + "learning_rate": 8.325983205461246e-05, + "loss": 2.5764, + "step": 19329 + }, + { + "epoch": 1.7512627120563522, + "grad_norm": 0.8879427909851074, + "learning_rate": 8.325379085362171e-05, + "loss": 2.5086, + "step": 19330 + }, + { + "epoch": 1.7513533102307988, + "grad_norm": 0.8716111183166504, + "learning_rate": 8.324774965263094e-05, + "loss": 2.6627, + "step": 19331 + }, + { + "epoch": 1.7514439084052458, + "grad_norm": 1.0608159303665161, + "learning_rate": 8.324170845164019e-05, + "loss": 2.7908, + "step": 19332 + }, + { + "epoch": 1.7515345065796923, + "grad_norm": 1.0214136838912964, + "learning_rate": 8.323566725064943e-05, + "loss": 2.5766, + "step": 19333 + }, + { + "epoch": 1.7516251047541394, + "grad_norm": 0.9733409881591797, + "learning_rate": 8.322962604965867e-05, + "loss": 2.5782, + "step": 19334 + }, + { + "epoch": 1.751715702928586, + "grad_norm": 0.9099740982055664, + "learning_rate": 8.322358484866792e-05, + "loss": 2.4988, + "step": 19335 + }, + { + "epoch": 1.751806301103033, + "grad_norm": 0.980668306350708, + "learning_rate": 8.321754364767717e-05, + "loss": 2.6928, + "step": 19336 + }, + { + "epoch": 1.7518968992774795, + "grad_norm": 0.8743113279342651, + "learning_rate": 8.32115024466864e-05, + "loss": 2.5823, + "step": 19337 + }, + { + "epoch": 1.7519874974519265, + "grad_norm": 0.8833303451538086, + "learning_rate": 8.320546124569565e-05, + "loss": 2.7361, + "step": 19338 + }, + { + "epoch": 1.752078095626373, + "grad_norm": 0.9264092445373535, + "learning_rate": 8.319942004470488e-05, + "loss": 2.8265, + "step": 19339 + }, + { + "epoch": 1.75216869380082, + "grad_norm": 0.8734321594238281, + "learning_rate": 8.319337884371413e-05, + "loss": 2.5732, + "step": 19340 + }, + { + "epoch": 1.7522592919752666, + "grad_norm": 0.9607085585594177, + "learning_rate": 8.318733764272338e-05, + "loss": 2.6316, + "step": 19341 + }, + { + "epoch": 1.7523498901497134, + "grad_norm": 0.9786383509635925, + "learning_rate": 8.318129644173263e-05, + "loss": 2.7101, + "step": 19342 + }, + { + "epoch": 1.7524404883241602, + "grad_norm": 0.9944911599159241, + "learning_rate": 8.317525524074186e-05, + "loss": 2.8221, + "step": 19343 + }, + { + "epoch": 1.752531086498607, + "grad_norm": 1.039011001586914, + "learning_rate": 8.316921403975111e-05, + "loss": 2.5135, + "step": 19344 + }, + { + "epoch": 1.7526216846730538, + "grad_norm": 0.9239693284034729, + "learning_rate": 8.316317283876036e-05, + "loss": 2.53, + "step": 19345 + }, + { + "epoch": 1.7527122828475006, + "grad_norm": 0.9252746105194092, + "learning_rate": 8.315713163776959e-05, + "loss": 2.9813, + "step": 19346 + }, + { + "epoch": 1.7528028810219474, + "grad_norm": 0.9104175567626953, + "learning_rate": 8.315109043677884e-05, + "loss": 2.4386, + "step": 19347 + }, + { + "epoch": 1.7528934791963942, + "grad_norm": 0.9634670615196228, + "learning_rate": 8.314504923578807e-05, + "loss": 2.5658, + "step": 19348 + }, + { + "epoch": 1.752984077370841, + "grad_norm": 0.8657625913619995, + "learning_rate": 8.313900803479732e-05, + "loss": 2.7415, + "step": 19349 + }, + { + "epoch": 1.7530746755452877, + "grad_norm": 1.022563099861145, + "learning_rate": 8.313296683380657e-05, + "loss": 2.8295, + "step": 19350 + }, + { + "epoch": 1.7531652737197345, + "grad_norm": 0.9496790170669556, + "learning_rate": 8.312692563281581e-05, + "loss": 2.7022, + "step": 19351 + }, + { + "epoch": 1.7532558718941813, + "grad_norm": 0.955254852771759, + "learning_rate": 8.312088443182505e-05, + "loss": 2.6782, + "step": 19352 + }, + { + "epoch": 1.753346470068628, + "grad_norm": 0.8642305731773376, + "learning_rate": 8.31148432308343e-05, + "loss": 2.6872, + "step": 19353 + }, + { + "epoch": 1.7534370682430749, + "grad_norm": 0.8606187105178833, + "learning_rate": 8.310880202984353e-05, + "loss": 2.8143, + "step": 19354 + }, + { + "epoch": 1.7535276664175217, + "grad_norm": 0.980409562587738, + "learning_rate": 8.310276082885278e-05, + "loss": 2.6452, + "step": 19355 + }, + { + "epoch": 1.7536182645919685, + "grad_norm": 0.9192613363265991, + "learning_rate": 8.309671962786201e-05, + "loss": 2.7252, + "step": 19356 + }, + { + "epoch": 1.7537088627664152, + "grad_norm": 0.9419431686401367, + "learning_rate": 8.309067842687127e-05, + "loss": 2.5117, + "step": 19357 + }, + { + "epoch": 1.753799460940862, + "grad_norm": 0.9119380712509155, + "learning_rate": 8.308463722588051e-05, + "loss": 2.4219, + "step": 19358 + }, + { + "epoch": 1.7538900591153088, + "grad_norm": 0.8891183733940125, + "learning_rate": 8.307859602488975e-05, + "loss": 2.5131, + "step": 19359 + }, + { + "epoch": 1.7539806572897556, + "grad_norm": 0.9200453162193298, + "learning_rate": 8.3072554823899e-05, + "loss": 2.7177, + "step": 19360 + }, + { + "epoch": 1.7540712554642024, + "grad_norm": 0.9140604138374329, + "learning_rate": 8.306651362290824e-05, + "loss": 2.7054, + "step": 19361 + }, + { + "epoch": 1.7541618536386492, + "grad_norm": 0.9486719369888306, + "learning_rate": 8.306047242191748e-05, + "loss": 2.8964, + "step": 19362 + }, + { + "epoch": 1.754252451813096, + "grad_norm": 0.9161674380302429, + "learning_rate": 8.305443122092672e-05, + "loss": 2.7429, + "step": 19363 + }, + { + "epoch": 1.7543430499875428, + "grad_norm": 0.8961835503578186, + "learning_rate": 8.304839001993597e-05, + "loss": 2.6403, + "step": 19364 + }, + { + "epoch": 1.7544336481619895, + "grad_norm": 0.9373151063919067, + "learning_rate": 8.304234881894521e-05, + "loss": 3.0535, + "step": 19365 + }, + { + "epoch": 1.7545242463364363, + "grad_norm": 0.8838963508605957, + "learning_rate": 8.303630761795446e-05, + "loss": 2.7846, + "step": 19366 + }, + { + "epoch": 1.7546148445108831, + "grad_norm": 0.8731338977813721, + "learning_rate": 8.30302664169637e-05, + "loss": 2.5329, + "step": 19367 + }, + { + "epoch": 1.75470544268533, + "grad_norm": 0.9488006830215454, + "learning_rate": 8.302422521597294e-05, + "loss": 2.8017, + "step": 19368 + }, + { + "epoch": 1.7547960408597767, + "grad_norm": 0.8194495439529419, + "learning_rate": 8.301818401498218e-05, + "loss": 2.2407, + "step": 19369 + }, + { + "epoch": 1.7548866390342235, + "grad_norm": 0.9656870365142822, + "learning_rate": 8.301214281399142e-05, + "loss": 2.6491, + "step": 19370 + }, + { + "epoch": 1.7549772372086703, + "grad_norm": 0.9534590244293213, + "learning_rate": 8.300610161300066e-05, + "loss": 2.6898, + "step": 19371 + }, + { + "epoch": 1.755067835383117, + "grad_norm": 0.92641282081604, + "learning_rate": 8.300006041200992e-05, + "loss": 2.5882, + "step": 19372 + }, + { + "epoch": 1.7551584335575638, + "grad_norm": 0.8893155455589294, + "learning_rate": 8.299401921101915e-05, + "loss": 2.726, + "step": 19373 + }, + { + "epoch": 1.7552490317320106, + "grad_norm": 0.9494348168373108, + "learning_rate": 8.29879780100284e-05, + "loss": 2.6758, + "step": 19374 + }, + { + "epoch": 1.7553396299064574, + "grad_norm": 0.8016531467437744, + "learning_rate": 8.298193680903764e-05, + "loss": 2.1236, + "step": 19375 + }, + { + "epoch": 1.7554302280809042, + "grad_norm": 0.7881389260292053, + "learning_rate": 8.297589560804688e-05, + "loss": 2.2123, + "step": 19376 + }, + { + "epoch": 1.755520826255351, + "grad_norm": 0.9373083710670471, + "learning_rate": 8.296985440705613e-05, + "loss": 2.677, + "step": 19377 + }, + { + "epoch": 1.7556114244297978, + "grad_norm": 0.823043942451477, + "learning_rate": 8.296381320606536e-05, + "loss": 2.5932, + "step": 19378 + }, + { + "epoch": 1.7557020226042446, + "grad_norm": 0.9238923788070679, + "learning_rate": 8.295777200507461e-05, + "loss": 2.7443, + "step": 19379 + }, + { + "epoch": 1.7557926207786914, + "grad_norm": 0.7554842829704285, + "learning_rate": 8.295173080408386e-05, + "loss": 2.1132, + "step": 19380 + }, + { + "epoch": 1.7558832189531381, + "grad_norm": 0.900806725025177, + "learning_rate": 8.294568960309311e-05, + "loss": 2.8459, + "step": 19381 + }, + { + "epoch": 1.7559738171275847, + "grad_norm": 0.8620060086250305, + "learning_rate": 8.293964840210234e-05, + "loss": 2.6665, + "step": 19382 + }, + { + "epoch": 1.7560644153020317, + "grad_norm": 0.8909385800361633, + "learning_rate": 8.293360720111159e-05, + "loss": 2.812, + "step": 19383 + }, + { + "epoch": 1.7561550134764783, + "grad_norm": 0.885611891746521, + "learning_rate": 8.292756600012082e-05, + "loss": 2.7476, + "step": 19384 + }, + { + "epoch": 1.7562456116509253, + "grad_norm": 0.9221195578575134, + "learning_rate": 8.292152479913007e-05, + "loss": 2.636, + "step": 19385 + }, + { + "epoch": 1.7563362098253719, + "grad_norm": 0.924767255783081, + "learning_rate": 8.29154835981393e-05, + "loss": 2.4655, + "step": 19386 + }, + { + "epoch": 1.7564268079998189, + "grad_norm": 0.8777778744697571, + "learning_rate": 8.290944239714857e-05, + "loss": 2.3512, + "step": 19387 + }, + { + "epoch": 1.7565174061742654, + "grad_norm": 0.9163035154342651, + "learning_rate": 8.29034011961578e-05, + "loss": 2.4813, + "step": 19388 + }, + { + "epoch": 1.7566080043487124, + "grad_norm": 0.9044856429100037, + "learning_rate": 8.289735999516705e-05, + "loss": 2.525, + "step": 19389 + }, + { + "epoch": 1.756698602523159, + "grad_norm": 0.8958684206008911, + "learning_rate": 8.289131879417628e-05, + "loss": 2.7408, + "step": 19390 + }, + { + "epoch": 1.756789200697606, + "grad_norm": 0.9229834675788879, + "learning_rate": 8.288527759318553e-05, + "loss": 2.8458, + "step": 19391 + }, + { + "epoch": 1.7568797988720526, + "grad_norm": 0.7583656311035156, + "learning_rate": 8.287923639219476e-05, + "loss": 2.0183, + "step": 19392 + }, + { + "epoch": 1.7569703970464996, + "grad_norm": 0.9436946511268616, + "learning_rate": 8.287319519120401e-05, + "loss": 2.8351, + "step": 19393 + }, + { + "epoch": 1.7570609952209462, + "grad_norm": 0.9394080638885498, + "learning_rate": 8.286715399021326e-05, + "loss": 2.5941, + "step": 19394 + }, + { + "epoch": 1.7571515933953932, + "grad_norm": 0.9690269827842712, + "learning_rate": 8.28611127892225e-05, + "loss": 2.8999, + "step": 19395 + }, + { + "epoch": 1.7572421915698397, + "grad_norm": 0.8989764451980591, + "learning_rate": 8.285507158823175e-05, + "loss": 2.6337, + "step": 19396 + }, + { + "epoch": 1.7573327897442867, + "grad_norm": 0.8993691205978394, + "learning_rate": 8.284903038724099e-05, + "loss": 2.6829, + "step": 19397 + }, + { + "epoch": 1.7574233879187333, + "grad_norm": 0.8902035355567932, + "learning_rate": 8.284298918625024e-05, + "loss": 1.867, + "step": 19398 + }, + { + "epoch": 1.7575139860931803, + "grad_norm": 0.9062471985816956, + "learning_rate": 8.283694798525947e-05, + "loss": 2.6009, + "step": 19399 + }, + { + "epoch": 1.757604584267627, + "grad_norm": 0.9779696464538574, + "learning_rate": 8.283090678426872e-05, + "loss": 2.9496, + "step": 19400 + }, + { + "epoch": 1.757695182442074, + "grad_norm": 0.9362868666648865, + "learning_rate": 8.282486558327795e-05, + "loss": 2.5449, + "step": 19401 + }, + { + "epoch": 1.7577857806165205, + "grad_norm": 0.9388024210929871, + "learning_rate": 8.281882438228721e-05, + "loss": 2.9026, + "step": 19402 + }, + { + "epoch": 1.7578763787909675, + "grad_norm": 0.8510270714759827, + "learning_rate": 8.281278318129645e-05, + "loss": 2.5689, + "step": 19403 + }, + { + "epoch": 1.757966976965414, + "grad_norm": 0.8979925513267517, + "learning_rate": 8.28067419803057e-05, + "loss": 2.6659, + "step": 19404 + }, + { + "epoch": 1.758057575139861, + "grad_norm": 0.8549314737319946, + "learning_rate": 8.280070077931493e-05, + "loss": 2.7721, + "step": 19405 + }, + { + "epoch": 1.7581481733143076, + "grad_norm": 0.9792516231536865, + "learning_rate": 8.279465957832418e-05, + "loss": 2.8152, + "step": 19406 + }, + { + "epoch": 1.7582387714887546, + "grad_norm": 0.9019548892974854, + "learning_rate": 8.278861837733341e-05, + "loss": 2.5686, + "step": 19407 + }, + { + "epoch": 1.7583293696632012, + "grad_norm": 0.9454085826873779, + "learning_rate": 8.278257717634266e-05, + "loss": 2.8008, + "step": 19408 + }, + { + "epoch": 1.7584199678376482, + "grad_norm": 0.9166638851165771, + "learning_rate": 8.27765359753519e-05, + "loss": 2.4649, + "step": 19409 + }, + { + "epoch": 1.7585105660120948, + "grad_norm": 0.9147728681564331, + "learning_rate": 8.277049477436115e-05, + "loss": 2.7757, + "step": 19410 + }, + { + "epoch": 1.7586011641865418, + "grad_norm": 0.7628579139709473, + "learning_rate": 8.276445357337039e-05, + "loss": 2.0861, + "step": 19411 + }, + { + "epoch": 1.7586917623609883, + "grad_norm": 0.9219827055931091, + "learning_rate": 8.275841237237963e-05, + "loss": 2.5359, + "step": 19412 + }, + { + "epoch": 1.7587823605354354, + "grad_norm": 0.9212744235992432, + "learning_rate": 8.275237117138888e-05, + "loss": 2.6065, + "step": 19413 + }, + { + "epoch": 1.758872958709882, + "grad_norm": 0.8649815917015076, + "learning_rate": 8.274632997039812e-05, + "loss": 2.6815, + "step": 19414 + }, + { + "epoch": 1.758963556884329, + "grad_norm": 0.8866795301437378, + "learning_rate": 8.274028876940736e-05, + "loss": 2.6183, + "step": 19415 + }, + { + "epoch": 1.7590541550587755, + "grad_norm": 0.8278993964195251, + "learning_rate": 8.27342475684166e-05, + "loss": 2.5024, + "step": 19416 + }, + { + "epoch": 1.7591447532332225, + "grad_norm": 0.8622792363166809, + "learning_rate": 8.272820636742586e-05, + "loss": 2.5883, + "step": 19417 + }, + { + "epoch": 1.759235351407669, + "grad_norm": 0.8593022227287292, + "learning_rate": 8.272216516643509e-05, + "loss": 2.6093, + "step": 19418 + }, + { + "epoch": 1.759325949582116, + "grad_norm": 0.9693800210952759, + "learning_rate": 8.271612396544434e-05, + "loss": 2.7411, + "step": 19419 + }, + { + "epoch": 1.7594165477565626, + "grad_norm": 0.9450018405914307, + "learning_rate": 8.271008276445357e-05, + "loss": 2.793, + "step": 19420 + }, + { + "epoch": 1.7595071459310097, + "grad_norm": 0.9542829990386963, + "learning_rate": 8.270404156346282e-05, + "loss": 2.7781, + "step": 19421 + }, + { + "epoch": 1.7595977441054562, + "grad_norm": 0.9439447522163391, + "learning_rate": 8.269800036247206e-05, + "loss": 2.6964, + "step": 19422 + }, + { + "epoch": 1.759688342279903, + "grad_norm": 0.9144842624664307, + "learning_rate": 8.26919591614813e-05, + "loss": 2.7957, + "step": 19423 + }, + { + "epoch": 1.7597789404543498, + "grad_norm": 0.8995895981788635, + "learning_rate": 8.268591796049054e-05, + "loss": 2.8425, + "step": 19424 + }, + { + "epoch": 1.7598695386287966, + "grad_norm": 0.9600659012794495, + "learning_rate": 8.26798767594998e-05, + "loss": 2.1587, + "step": 19425 + }, + { + "epoch": 1.7599601368032434, + "grad_norm": 0.9095194339752197, + "learning_rate": 8.267383555850903e-05, + "loss": 2.668, + "step": 19426 + }, + { + "epoch": 1.7600507349776902, + "grad_norm": 1.0649762153625488, + "learning_rate": 8.266779435751828e-05, + "loss": 2.8307, + "step": 19427 + }, + { + "epoch": 1.760141333152137, + "grad_norm": 0.7877419590950012, + "learning_rate": 8.266175315652753e-05, + "loss": 1.9742, + "step": 19428 + }, + { + "epoch": 1.7602319313265837, + "grad_norm": 0.6217471957206726, + "learning_rate": 8.265571195553676e-05, + "loss": 1.2557, + "step": 19429 + }, + { + "epoch": 1.7603225295010305, + "grad_norm": 0.8814419507980347, + "learning_rate": 8.264967075454601e-05, + "loss": 2.6223, + "step": 19430 + }, + { + "epoch": 1.7604131276754773, + "grad_norm": 0.9519923329353333, + "learning_rate": 8.264362955355524e-05, + "loss": 2.6781, + "step": 19431 + }, + { + "epoch": 1.760503725849924, + "grad_norm": 0.9211328625679016, + "learning_rate": 8.26375883525645e-05, + "loss": 2.9539, + "step": 19432 + }, + { + "epoch": 1.7605943240243709, + "grad_norm": 0.8676775097846985, + "learning_rate": 8.263154715157374e-05, + "loss": 2.6373, + "step": 19433 + }, + { + "epoch": 1.7606849221988177, + "grad_norm": 0.8991796970367432, + "learning_rate": 8.262550595058299e-05, + "loss": 2.8682, + "step": 19434 + }, + { + "epoch": 1.7607755203732645, + "grad_norm": 0.8001048564910889, + "learning_rate": 8.261946474959222e-05, + "loss": 2.0006, + "step": 19435 + }, + { + "epoch": 1.7608661185477112, + "grad_norm": 0.9407731294631958, + "learning_rate": 8.261342354860147e-05, + "loss": 2.5237, + "step": 19436 + }, + { + "epoch": 1.760956716722158, + "grad_norm": 0.8412814140319824, + "learning_rate": 8.26073823476107e-05, + "loss": 2.7004, + "step": 19437 + }, + { + "epoch": 1.7610473148966048, + "grad_norm": 0.9045721888542175, + "learning_rate": 8.260134114661995e-05, + "loss": 2.9433, + "step": 19438 + }, + { + "epoch": 1.7611379130710516, + "grad_norm": 0.8655075430870056, + "learning_rate": 8.259529994562918e-05, + "loss": 2.6167, + "step": 19439 + }, + { + "epoch": 1.7612285112454984, + "grad_norm": 0.8785018920898438, + "learning_rate": 8.258925874463845e-05, + "loss": 2.9592, + "step": 19440 + }, + { + "epoch": 1.7613191094199452, + "grad_norm": 0.8959497213363647, + "learning_rate": 8.258321754364768e-05, + "loss": 2.466, + "step": 19441 + }, + { + "epoch": 1.761409707594392, + "grad_norm": 0.8754498362541199, + "learning_rate": 8.257717634265693e-05, + "loss": 2.608, + "step": 19442 + }, + { + "epoch": 1.7615003057688388, + "grad_norm": 0.8758057355880737, + "learning_rate": 8.257113514166616e-05, + "loss": 2.8153, + "step": 19443 + }, + { + "epoch": 1.7615909039432855, + "grad_norm": 0.9153103828430176, + "learning_rate": 8.256509394067541e-05, + "loss": 2.7993, + "step": 19444 + }, + { + "epoch": 1.7616815021177323, + "grad_norm": 0.9578746557235718, + "learning_rate": 8.255905273968466e-05, + "loss": 2.6402, + "step": 19445 + }, + { + "epoch": 1.7617721002921791, + "grad_norm": 0.8617240786552429, + "learning_rate": 8.255301153869389e-05, + "loss": 2.5897, + "step": 19446 + }, + { + "epoch": 1.761862698466626, + "grad_norm": 0.8726919889450073, + "learning_rate": 8.254697033770314e-05, + "loss": 2.5661, + "step": 19447 + }, + { + "epoch": 1.7619532966410727, + "grad_norm": 0.8844044804573059, + "learning_rate": 8.254092913671239e-05, + "loss": 2.6759, + "step": 19448 + }, + { + "epoch": 1.7620438948155195, + "grad_norm": 0.9706103205680847, + "learning_rate": 8.253488793572163e-05, + "loss": 2.6588, + "step": 19449 + }, + { + "epoch": 1.7621344929899663, + "grad_norm": 0.8776695132255554, + "learning_rate": 8.252884673473087e-05, + "loss": 2.7494, + "step": 19450 + }, + { + "epoch": 1.762225091164413, + "grad_norm": 0.941166877746582, + "learning_rate": 8.252280553374011e-05, + "loss": 2.7335, + "step": 19451 + }, + { + "epoch": 1.7623156893388598, + "grad_norm": 0.8882907629013062, + "learning_rate": 8.251676433274935e-05, + "loss": 2.6155, + "step": 19452 + }, + { + "epoch": 1.7624062875133066, + "grad_norm": 0.872061550617218, + "learning_rate": 8.25107231317586e-05, + "loss": 2.6222, + "step": 19453 + }, + { + "epoch": 1.7624968856877534, + "grad_norm": 0.964751124382019, + "learning_rate": 8.250468193076783e-05, + "loss": 2.6842, + "step": 19454 + }, + { + "epoch": 1.7625874838622002, + "grad_norm": 0.9128377437591553, + "learning_rate": 8.249864072977709e-05, + "loss": 2.8657, + "step": 19455 + }, + { + "epoch": 1.762678082036647, + "grad_norm": 0.9476401209831238, + "learning_rate": 8.249259952878633e-05, + "loss": 2.8304, + "step": 19456 + }, + { + "epoch": 1.7627686802110938, + "grad_norm": 0.9079055190086365, + "learning_rate": 8.248655832779557e-05, + "loss": 2.7419, + "step": 19457 + }, + { + "epoch": 1.7628592783855406, + "grad_norm": 0.996536374092102, + "learning_rate": 8.248051712680481e-05, + "loss": 2.4469, + "step": 19458 + }, + { + "epoch": 1.7629498765599874, + "grad_norm": 0.767732560634613, + "learning_rate": 8.247447592581405e-05, + "loss": 1.9986, + "step": 19459 + }, + { + "epoch": 1.7630404747344341, + "grad_norm": 0.9004416465759277, + "learning_rate": 8.24684347248233e-05, + "loss": 2.913, + "step": 19460 + }, + { + "epoch": 1.763131072908881, + "grad_norm": 0.9347500801086426, + "learning_rate": 8.246239352383254e-05, + "loss": 3.2454, + "step": 19461 + }, + { + "epoch": 1.7632216710833277, + "grad_norm": 0.9488748908042908, + "learning_rate": 8.245635232284178e-05, + "loss": 3.0201, + "step": 19462 + }, + { + "epoch": 1.7633122692577743, + "grad_norm": 0.8989860415458679, + "learning_rate": 8.245031112185103e-05, + "loss": 2.7794, + "step": 19463 + }, + { + "epoch": 1.7634028674322213, + "grad_norm": 0.8504040837287903, + "learning_rate": 8.244426992086028e-05, + "loss": 2.615, + "step": 19464 + }, + { + "epoch": 1.7634934656066679, + "grad_norm": 0.9017173051834106, + "learning_rate": 8.243822871986951e-05, + "loss": 2.6047, + "step": 19465 + }, + { + "epoch": 1.7635840637811149, + "grad_norm": 1.1402945518493652, + "learning_rate": 8.243218751887876e-05, + "loss": 2.6003, + "step": 19466 + }, + { + "epoch": 1.7636746619555614, + "grad_norm": 0.756359338760376, + "learning_rate": 8.2426146317888e-05, + "loss": 2.0501, + "step": 19467 + }, + { + "epoch": 1.7637652601300084, + "grad_norm": 0.9969084858894348, + "learning_rate": 8.242010511689724e-05, + "loss": 2.5396, + "step": 19468 + }, + { + "epoch": 1.763855858304455, + "grad_norm": 0.8899930119514465, + "learning_rate": 8.241406391590648e-05, + "loss": 2.6286, + "step": 19469 + }, + { + "epoch": 1.763946456478902, + "grad_norm": 0.9074265360832214, + "learning_rate": 8.240802271491574e-05, + "loss": 2.6974, + "step": 19470 + }, + { + "epoch": 1.7640370546533486, + "grad_norm": 0.9098271727561951, + "learning_rate": 8.240198151392497e-05, + "loss": 2.8087, + "step": 19471 + }, + { + "epoch": 1.7641276528277956, + "grad_norm": 0.9841477274894714, + "learning_rate": 8.239594031293422e-05, + "loss": 2.7041, + "step": 19472 + }, + { + "epoch": 1.7642182510022422, + "grad_norm": 0.87688809633255, + "learning_rate": 8.238989911194345e-05, + "loss": 2.5022, + "step": 19473 + }, + { + "epoch": 1.7643088491766892, + "grad_norm": 0.9254142045974731, + "learning_rate": 8.23838579109527e-05, + "loss": 2.6244, + "step": 19474 + }, + { + "epoch": 1.7643994473511357, + "grad_norm": 0.8636718988418579, + "learning_rate": 8.237781670996194e-05, + "loss": 2.6572, + "step": 19475 + }, + { + "epoch": 1.7644900455255828, + "grad_norm": 0.8998365998268127, + "learning_rate": 8.237177550897118e-05, + "loss": 1.8414, + "step": 19476 + }, + { + "epoch": 1.7645806437000293, + "grad_norm": 0.8672518730163574, + "learning_rate": 8.236573430798043e-05, + "loss": 2.782, + "step": 19477 + }, + { + "epoch": 1.7646712418744763, + "grad_norm": 0.8640868067741394, + "learning_rate": 8.235969310698968e-05, + "loss": 2.8421, + "step": 19478 + }, + { + "epoch": 1.764761840048923, + "grad_norm": 0.8484343886375427, + "learning_rate": 8.235365190599891e-05, + "loss": 2.6241, + "step": 19479 + }, + { + "epoch": 1.76485243822337, + "grad_norm": 0.8907312154769897, + "learning_rate": 8.234761070500816e-05, + "loss": 2.6982, + "step": 19480 + }, + { + "epoch": 1.7649430363978165, + "grad_norm": 0.8961678147315979, + "learning_rate": 8.234156950401741e-05, + "loss": 2.6818, + "step": 19481 + }, + { + "epoch": 1.7650336345722635, + "grad_norm": 0.9094061851501465, + "learning_rate": 8.233552830302664e-05, + "loss": 2.4989, + "step": 19482 + }, + { + "epoch": 1.76512423274671, + "grad_norm": 0.9055162668228149, + "learning_rate": 8.232948710203589e-05, + "loss": 2.5606, + "step": 19483 + }, + { + "epoch": 1.765214830921157, + "grad_norm": 0.84820157289505, + "learning_rate": 8.232344590104512e-05, + "loss": 2.4375, + "step": 19484 + }, + { + "epoch": 1.7653054290956036, + "grad_norm": 0.9369449019432068, + "learning_rate": 8.231740470005438e-05, + "loss": 2.7665, + "step": 19485 + }, + { + "epoch": 1.7653960272700506, + "grad_norm": 0.9220300912857056, + "learning_rate": 8.231136349906362e-05, + "loss": 2.6858, + "step": 19486 + }, + { + "epoch": 1.7654866254444972, + "grad_norm": 0.9327909350395203, + "learning_rate": 8.230532229807287e-05, + "loss": 2.551, + "step": 19487 + }, + { + "epoch": 1.7655772236189442, + "grad_norm": 0.8731947541236877, + "learning_rate": 8.22992810970821e-05, + "loss": 2.6857, + "step": 19488 + }, + { + "epoch": 1.7656678217933908, + "grad_norm": 0.8672605156898499, + "learning_rate": 8.229323989609135e-05, + "loss": 2.8479, + "step": 19489 + }, + { + "epoch": 1.7657584199678378, + "grad_norm": 0.8508649468421936, + "learning_rate": 8.228719869510058e-05, + "loss": 2.0731, + "step": 19490 + }, + { + "epoch": 1.7658490181422843, + "grad_norm": 0.9003800749778748, + "learning_rate": 8.228115749410983e-05, + "loss": 2.8975, + "step": 19491 + }, + { + "epoch": 1.7659396163167314, + "grad_norm": 0.9349711537361145, + "learning_rate": 8.227511629311908e-05, + "loss": 2.7353, + "step": 19492 + }, + { + "epoch": 1.766030214491178, + "grad_norm": 0.8811445832252502, + "learning_rate": 8.226907509212832e-05, + "loss": 2.9905, + "step": 19493 + }, + { + "epoch": 1.766120812665625, + "grad_norm": 0.8654109239578247, + "learning_rate": 8.226303389113756e-05, + "loss": 2.4857, + "step": 19494 + }, + { + "epoch": 1.7662114108400715, + "grad_norm": 0.9041765332221985, + "learning_rate": 8.22569926901468e-05, + "loss": 2.6152, + "step": 19495 + }, + { + "epoch": 1.7663020090145185, + "grad_norm": 0.8846322894096375, + "learning_rate": 8.225095148915605e-05, + "loss": 2.5903, + "step": 19496 + }, + { + "epoch": 1.766392607188965, + "grad_norm": 0.8379100561141968, + "learning_rate": 8.224491028816529e-05, + "loss": 2.6209, + "step": 19497 + }, + { + "epoch": 1.766483205363412, + "grad_norm": 0.9812373518943787, + "learning_rate": 8.223886908717454e-05, + "loss": 2.7729, + "step": 19498 + }, + { + "epoch": 1.7665738035378586, + "grad_norm": 0.9197331070899963, + "learning_rate": 8.223282788618377e-05, + "loss": 2.8429, + "step": 19499 + }, + { + "epoch": 1.7666644017123057, + "grad_norm": 0.8957871198654175, + "learning_rate": 8.222678668519303e-05, + "loss": 2.6393, + "step": 19500 + }, + { + "epoch": 1.7667549998867522, + "grad_norm": 0.8016147017478943, + "learning_rate": 8.222074548420226e-05, + "loss": 2.1515, + "step": 19501 + }, + { + "epoch": 1.7668455980611992, + "grad_norm": 0.8715692162513733, + "learning_rate": 8.221470428321151e-05, + "loss": 2.7917, + "step": 19502 + }, + { + "epoch": 1.7669361962356458, + "grad_norm": 0.866324245929718, + "learning_rate": 8.220866308222075e-05, + "loss": 2.3711, + "step": 19503 + }, + { + "epoch": 1.7670267944100926, + "grad_norm": 0.8996441960334778, + "learning_rate": 8.220262188123e-05, + "loss": 2.6789, + "step": 19504 + }, + { + "epoch": 1.7671173925845394, + "grad_norm": 0.8831196427345276, + "learning_rate": 8.219658068023923e-05, + "loss": 2.4405, + "step": 19505 + }, + { + "epoch": 1.7672079907589862, + "grad_norm": 0.9580673575401306, + "learning_rate": 8.219053947924848e-05, + "loss": 2.8996, + "step": 19506 + }, + { + "epoch": 1.767298588933433, + "grad_norm": 0.8619243502616882, + "learning_rate": 8.218449827825772e-05, + "loss": 2.6181, + "step": 19507 + }, + { + "epoch": 1.7673891871078797, + "grad_norm": 0.9398897290229797, + "learning_rate": 8.217845707726697e-05, + "loss": 2.6066, + "step": 19508 + }, + { + "epoch": 1.7674797852823265, + "grad_norm": 0.9298999905586243, + "learning_rate": 8.21724158762762e-05, + "loss": 2.7041, + "step": 19509 + }, + { + "epoch": 1.7675703834567733, + "grad_norm": 0.895423948764801, + "learning_rate": 8.216637467528545e-05, + "loss": 2.7909, + "step": 19510 + }, + { + "epoch": 1.76766098163122, + "grad_norm": 0.8755496144294739, + "learning_rate": 8.216033347429469e-05, + "loss": 2.7242, + "step": 19511 + }, + { + "epoch": 1.7677515798056669, + "grad_norm": 0.9894078969955444, + "learning_rate": 8.215429227330393e-05, + "loss": 2.7693, + "step": 19512 + }, + { + "epoch": 1.7678421779801137, + "grad_norm": 0.8396396636962891, + "learning_rate": 8.214825107231318e-05, + "loss": 2.1019, + "step": 19513 + }, + { + "epoch": 1.7679327761545605, + "grad_norm": 0.8737626671791077, + "learning_rate": 8.214220987132242e-05, + "loss": 2.0379, + "step": 19514 + }, + { + "epoch": 1.7680233743290072, + "grad_norm": 0.8392212986946106, + "learning_rate": 8.213616867033168e-05, + "loss": 2.5507, + "step": 19515 + }, + { + "epoch": 1.768113972503454, + "grad_norm": 0.9117357134819031, + "learning_rate": 8.213012746934091e-05, + "loss": 2.7637, + "step": 19516 + }, + { + "epoch": 1.7682045706779008, + "grad_norm": 0.9147598147392273, + "learning_rate": 8.212408626835016e-05, + "loss": 2.6952, + "step": 19517 + }, + { + "epoch": 1.7682951688523476, + "grad_norm": 0.8665197491645813, + "learning_rate": 8.211804506735939e-05, + "loss": 2.6469, + "step": 19518 + }, + { + "epoch": 1.7683857670267944, + "grad_norm": 0.9406129121780396, + "learning_rate": 8.211200386636864e-05, + "loss": 2.8098, + "step": 19519 + }, + { + "epoch": 1.7684763652012412, + "grad_norm": 0.9101338982582092, + "learning_rate": 8.210596266537787e-05, + "loss": 2.5754, + "step": 19520 + }, + { + "epoch": 1.768566963375688, + "grad_norm": 0.8343985676765442, + "learning_rate": 8.209992146438712e-05, + "loss": 2.5624, + "step": 19521 + }, + { + "epoch": 1.7686575615501348, + "grad_norm": 0.9159219861030579, + "learning_rate": 8.209388026339637e-05, + "loss": 2.5479, + "step": 19522 + }, + { + "epoch": 1.7687481597245815, + "grad_norm": 0.8887324333190918, + "learning_rate": 8.208783906240562e-05, + "loss": 2.7512, + "step": 19523 + }, + { + "epoch": 1.7688387578990283, + "grad_norm": 0.9149633646011353, + "learning_rate": 8.208179786141485e-05, + "loss": 2.8263, + "step": 19524 + }, + { + "epoch": 1.7689293560734751, + "grad_norm": 0.9351266622543335, + "learning_rate": 8.20757566604241e-05, + "loss": 2.856, + "step": 19525 + }, + { + "epoch": 1.769019954247922, + "grad_norm": 0.9217565059661865, + "learning_rate": 8.206971545943333e-05, + "loss": 2.8452, + "step": 19526 + }, + { + "epoch": 1.7691105524223687, + "grad_norm": 0.9475778341293335, + "learning_rate": 8.206367425844258e-05, + "loss": 2.5232, + "step": 19527 + }, + { + "epoch": 1.7692011505968155, + "grad_norm": 1.0147227048873901, + "learning_rate": 8.205763305745183e-05, + "loss": 2.8998, + "step": 19528 + }, + { + "epoch": 1.7692917487712623, + "grad_norm": 0.9581873416900635, + "learning_rate": 8.205159185646106e-05, + "loss": 2.4979, + "step": 19529 + }, + { + "epoch": 1.769382346945709, + "grad_norm": 0.7733221054077148, + "learning_rate": 8.204555065547031e-05, + "loss": 2.2757, + "step": 19530 + }, + { + "epoch": 1.7694729451201558, + "grad_norm": 0.8442001342773438, + "learning_rate": 8.203950945447956e-05, + "loss": 2.7737, + "step": 19531 + }, + { + "epoch": 1.7695635432946026, + "grad_norm": 0.874258816242218, + "learning_rate": 8.20334682534888e-05, + "loss": 2.5905, + "step": 19532 + }, + { + "epoch": 1.7696541414690494, + "grad_norm": 0.8998083472251892, + "learning_rate": 8.202742705249804e-05, + "loss": 2.6391, + "step": 19533 + }, + { + "epoch": 1.7697447396434962, + "grad_norm": 0.9064455032348633, + "learning_rate": 8.202138585150729e-05, + "loss": 2.6708, + "step": 19534 + }, + { + "epoch": 1.769835337817943, + "grad_norm": 0.8558913469314575, + "learning_rate": 8.201534465051652e-05, + "loss": 2.5529, + "step": 19535 + }, + { + "epoch": 1.7699259359923898, + "grad_norm": 0.8448553681373596, + "learning_rate": 8.200930344952577e-05, + "loss": 2.5429, + "step": 19536 + }, + { + "epoch": 1.7700165341668366, + "grad_norm": 0.8788304328918457, + "learning_rate": 8.200326224853502e-05, + "loss": 2.5674, + "step": 19537 + }, + { + "epoch": 1.7701071323412834, + "grad_norm": 0.9343538880348206, + "learning_rate": 8.199722104754426e-05, + "loss": 2.7958, + "step": 19538 + }, + { + "epoch": 1.7701977305157302, + "grad_norm": 0.9489685297012329, + "learning_rate": 8.19911798465535e-05, + "loss": 2.8051, + "step": 19539 + }, + { + "epoch": 1.770288328690177, + "grad_norm": 1.058539867401123, + "learning_rate": 8.198513864556274e-05, + "loss": 2.7846, + "step": 19540 + }, + { + "epoch": 1.7703789268646237, + "grad_norm": 0.9600950479507446, + "learning_rate": 8.197909744457198e-05, + "loss": 2.6787, + "step": 19541 + }, + { + "epoch": 1.7704695250390705, + "grad_norm": 0.8672740459442139, + "learning_rate": 8.197305624358123e-05, + "loss": 2.7185, + "step": 19542 + }, + { + "epoch": 1.7705601232135173, + "grad_norm": 0.7692965865135193, + "learning_rate": 8.196701504259046e-05, + "loss": 1.8288, + "step": 19543 + }, + { + "epoch": 1.7706507213879639, + "grad_norm": 0.9247640371322632, + "learning_rate": 8.196097384159971e-05, + "loss": 2.7524, + "step": 19544 + }, + { + "epoch": 1.7707413195624109, + "grad_norm": 0.9177553057670593, + "learning_rate": 8.195493264060896e-05, + "loss": 2.3491, + "step": 19545 + }, + { + "epoch": 1.7708319177368574, + "grad_norm": 0.7980698347091675, + "learning_rate": 8.19488914396182e-05, + "loss": 2.3574, + "step": 19546 + }, + { + "epoch": 1.7709225159113045, + "grad_norm": 0.8784685730934143, + "learning_rate": 8.194285023862745e-05, + "loss": 2.8454, + "step": 19547 + }, + { + "epoch": 1.771013114085751, + "grad_norm": 0.8541979789733887, + "learning_rate": 8.193680903763669e-05, + "loss": 2.6648, + "step": 19548 + }, + { + "epoch": 1.771103712260198, + "grad_norm": 0.6925917267799377, + "learning_rate": 8.193076783664593e-05, + "loss": 1.301, + "step": 19549 + }, + { + "epoch": 1.7711943104346446, + "grad_norm": 0.9358913898468018, + "learning_rate": 8.192472663565517e-05, + "loss": 2.6254, + "step": 19550 + }, + { + "epoch": 1.7712849086090916, + "grad_norm": 0.8627835512161255, + "learning_rate": 8.191868543466441e-05, + "loss": 2.8575, + "step": 19551 + }, + { + "epoch": 1.7713755067835382, + "grad_norm": 0.9737862348556519, + "learning_rate": 8.191264423367366e-05, + "loss": 2.5351, + "step": 19552 + }, + { + "epoch": 1.7714661049579852, + "grad_norm": 0.8369135856628418, + "learning_rate": 8.190660303268291e-05, + "loss": 2.9287, + "step": 19553 + }, + { + "epoch": 1.7715567031324317, + "grad_norm": 0.9494174718856812, + "learning_rate": 8.190056183169214e-05, + "loss": 2.8278, + "step": 19554 + }, + { + "epoch": 1.7716473013068788, + "grad_norm": 0.9109101295471191, + "learning_rate": 8.189452063070139e-05, + "loss": 2.7518, + "step": 19555 + }, + { + "epoch": 1.7717378994813253, + "grad_norm": 0.8044883012771606, + "learning_rate": 8.188847942971063e-05, + "loss": 1.9809, + "step": 19556 + }, + { + "epoch": 1.7718284976557723, + "grad_norm": 0.8990906476974487, + "learning_rate": 8.188243822871987e-05, + "loss": 2.8616, + "step": 19557 + }, + { + "epoch": 1.771919095830219, + "grad_norm": 0.979174017906189, + "learning_rate": 8.187639702772911e-05, + "loss": 2.7479, + "step": 19558 + }, + { + "epoch": 1.772009694004666, + "grad_norm": 0.9228441715240479, + "learning_rate": 8.187035582673835e-05, + "loss": 2.7356, + "step": 19559 + }, + { + "epoch": 1.7721002921791125, + "grad_norm": 0.8255552053451538, + "learning_rate": 8.18643146257476e-05, + "loss": 2.1397, + "step": 19560 + }, + { + "epoch": 1.7721908903535595, + "grad_norm": 0.911835789680481, + "learning_rate": 8.185827342475685e-05, + "loss": 2.6536, + "step": 19561 + }, + { + "epoch": 1.772281488528006, + "grad_norm": 0.8914241194725037, + "learning_rate": 8.185223222376608e-05, + "loss": 2.5302, + "step": 19562 + }, + { + "epoch": 1.772372086702453, + "grad_norm": 0.901167631149292, + "learning_rate": 8.184619102277533e-05, + "loss": 2.6794, + "step": 19563 + }, + { + "epoch": 1.7724626848768996, + "grad_norm": 0.8975508809089661, + "learning_rate": 8.184014982178458e-05, + "loss": 2.669, + "step": 19564 + }, + { + "epoch": 1.7725532830513466, + "grad_norm": 0.9427306652069092, + "learning_rate": 8.183410862079381e-05, + "loss": 2.4876, + "step": 19565 + }, + { + "epoch": 1.7726438812257932, + "grad_norm": 0.9513353109359741, + "learning_rate": 8.182806741980306e-05, + "loss": 2.7561, + "step": 19566 + }, + { + "epoch": 1.7727344794002402, + "grad_norm": 0.8655354380607605, + "learning_rate": 8.182202621881231e-05, + "loss": 2.5952, + "step": 19567 + }, + { + "epoch": 1.7728250775746868, + "grad_norm": 0.9422406554222107, + "learning_rate": 8.181598501782156e-05, + "loss": 2.5078, + "step": 19568 + }, + { + "epoch": 1.7729156757491338, + "grad_norm": 0.9571396708488464, + "learning_rate": 8.180994381683079e-05, + "loss": 2.6981, + "step": 19569 + }, + { + "epoch": 1.7730062739235803, + "grad_norm": 0.9072791934013367, + "learning_rate": 8.180390261584004e-05, + "loss": 2.564, + "step": 19570 + }, + { + "epoch": 1.7730968720980274, + "grad_norm": 0.904443621635437, + "learning_rate": 8.179786141484927e-05, + "loss": 2.4161, + "step": 19571 + }, + { + "epoch": 1.773187470272474, + "grad_norm": 0.8577370047569275, + "learning_rate": 8.179182021385852e-05, + "loss": 2.6002, + "step": 19572 + }, + { + "epoch": 1.773278068446921, + "grad_norm": 1.019465684890747, + "learning_rate": 8.178577901286775e-05, + "loss": 2.9403, + "step": 19573 + }, + { + "epoch": 1.7733686666213675, + "grad_norm": 0.8698573112487793, + "learning_rate": 8.1779737811877e-05, + "loss": 2.6047, + "step": 19574 + }, + { + "epoch": 1.7734592647958145, + "grad_norm": 0.94648677110672, + "learning_rate": 8.177369661088625e-05, + "loss": 2.7758, + "step": 19575 + }, + { + "epoch": 1.773549862970261, + "grad_norm": 1.0489498376846313, + "learning_rate": 8.17676554098955e-05, + "loss": 2.5088, + "step": 19576 + }, + { + "epoch": 1.773640461144708, + "grad_norm": 0.8243321776390076, + "learning_rate": 8.176161420890473e-05, + "loss": 2.0537, + "step": 19577 + }, + { + "epoch": 1.7737310593191546, + "grad_norm": 0.8817712664604187, + "learning_rate": 8.175557300791398e-05, + "loss": 2.5686, + "step": 19578 + }, + { + "epoch": 1.7738216574936017, + "grad_norm": 0.8701996207237244, + "learning_rate": 8.174953180692321e-05, + "loss": 2.6419, + "step": 19579 + }, + { + "epoch": 1.7739122556680482, + "grad_norm": 0.9143155813217163, + "learning_rate": 8.174349060593246e-05, + "loss": 2.7458, + "step": 19580 + }, + { + "epoch": 1.7740028538424952, + "grad_norm": 0.8561567664146423, + "learning_rate": 8.173744940494171e-05, + "loss": 2.8158, + "step": 19581 + }, + { + "epoch": 1.7740934520169418, + "grad_norm": 0.8446936011314392, + "learning_rate": 8.173140820395095e-05, + "loss": 2.6783, + "step": 19582 + }, + { + "epoch": 1.7741840501913888, + "grad_norm": 0.8940356969833374, + "learning_rate": 8.17253670029602e-05, + "loss": 2.6268, + "step": 19583 + }, + { + "epoch": 1.7742746483658354, + "grad_norm": 0.9468141794204712, + "learning_rate": 8.171932580196944e-05, + "loss": 2.5422, + "step": 19584 + }, + { + "epoch": 1.7743652465402822, + "grad_norm": 0.8537949919700623, + "learning_rate": 8.171328460097868e-05, + "loss": 2.5717, + "step": 19585 + }, + { + "epoch": 1.774455844714729, + "grad_norm": 0.7906335592269897, + "learning_rate": 8.170724339998792e-05, + "loss": 2.4138, + "step": 19586 + }, + { + "epoch": 1.7745464428891757, + "grad_norm": 0.9354702830314636, + "learning_rate": 8.170120219899717e-05, + "loss": 2.7513, + "step": 19587 + }, + { + "epoch": 1.7746370410636225, + "grad_norm": 0.8840936422348022, + "learning_rate": 8.16951609980064e-05, + "loss": 2.7677, + "step": 19588 + }, + { + "epoch": 1.7747276392380693, + "grad_norm": 0.9286210536956787, + "learning_rate": 8.168911979701565e-05, + "loss": 2.656, + "step": 19589 + }, + { + "epoch": 1.774818237412516, + "grad_norm": 0.8706139326095581, + "learning_rate": 8.16830785960249e-05, + "loss": 2.7247, + "step": 19590 + }, + { + "epoch": 1.7749088355869629, + "grad_norm": 0.9376037120819092, + "learning_rate": 8.167703739503414e-05, + "loss": 2.7566, + "step": 19591 + }, + { + "epoch": 1.7749994337614097, + "grad_norm": 0.8418271541595459, + "learning_rate": 8.167099619404338e-05, + "loss": 2.4501, + "step": 19592 + }, + { + "epoch": 1.7750900319358565, + "grad_norm": 0.9588034152984619, + "learning_rate": 8.166495499305262e-05, + "loss": 2.4477, + "step": 19593 + }, + { + "epoch": 1.7751806301103032, + "grad_norm": 0.7600575089454651, + "learning_rate": 8.165891379206186e-05, + "loss": 1.8386, + "step": 19594 + }, + { + "epoch": 1.77527122828475, + "grad_norm": 0.8768284916877747, + "learning_rate": 8.16528725910711e-05, + "loss": 2.5948, + "step": 19595 + }, + { + "epoch": 1.7753618264591968, + "grad_norm": 0.7644971013069153, + "learning_rate": 8.164683139008035e-05, + "loss": 2.0925, + "step": 19596 + }, + { + "epoch": 1.7754524246336436, + "grad_norm": 0.931972861289978, + "learning_rate": 8.16407901890896e-05, + "loss": 2.6997, + "step": 19597 + }, + { + "epoch": 1.7755430228080904, + "grad_norm": 0.9227539300918579, + "learning_rate": 8.163474898809883e-05, + "loss": 2.7481, + "step": 19598 + }, + { + "epoch": 1.7756336209825372, + "grad_norm": 0.8937901854515076, + "learning_rate": 8.162870778710808e-05, + "loss": 2.6666, + "step": 19599 + }, + { + "epoch": 1.775724219156984, + "grad_norm": 0.8410772085189819, + "learning_rate": 8.162266658611733e-05, + "loss": 2.4643, + "step": 19600 + }, + { + "epoch": 1.7758148173314308, + "grad_norm": 0.9998024106025696, + "learning_rate": 8.161662538512656e-05, + "loss": 2.6637, + "step": 19601 + }, + { + "epoch": 1.7759054155058775, + "grad_norm": 0.9275742769241333, + "learning_rate": 8.161058418413581e-05, + "loss": 3.0477, + "step": 19602 + }, + { + "epoch": 1.7759960136803243, + "grad_norm": 0.8785709738731384, + "learning_rate": 8.160454298314505e-05, + "loss": 2.6807, + "step": 19603 + }, + { + "epoch": 1.7760866118547711, + "grad_norm": 0.9178439974784851, + "learning_rate": 8.15985017821543e-05, + "loss": 2.5536, + "step": 19604 + }, + { + "epoch": 1.776177210029218, + "grad_norm": 0.9052304625511169, + "learning_rate": 8.159246058116354e-05, + "loss": 2.6542, + "step": 19605 + }, + { + "epoch": 1.7762678082036647, + "grad_norm": 0.9749343395233154, + "learning_rate": 8.158641938017279e-05, + "loss": 2.5213, + "step": 19606 + }, + { + "epoch": 1.7763584063781115, + "grad_norm": 0.9396253824234009, + "learning_rate": 8.158037817918202e-05, + "loss": 2.4308, + "step": 19607 + }, + { + "epoch": 1.7764490045525583, + "grad_norm": 0.8774400353431702, + "learning_rate": 8.157433697819127e-05, + "loss": 2.4756, + "step": 19608 + }, + { + "epoch": 1.776539602727005, + "grad_norm": 0.8623829483985901, + "learning_rate": 8.15682957772005e-05, + "loss": 2.7579, + "step": 19609 + }, + { + "epoch": 1.7766302009014519, + "grad_norm": 0.926905632019043, + "learning_rate": 8.156225457620975e-05, + "loss": 2.686, + "step": 19610 + }, + { + "epoch": 1.7767207990758986, + "grad_norm": 0.8705319166183472, + "learning_rate": 8.155621337521899e-05, + "loss": 2.6039, + "step": 19611 + }, + { + "epoch": 1.7768113972503454, + "grad_norm": 0.9270986318588257, + "learning_rate": 8.155017217422825e-05, + "loss": 2.8992, + "step": 19612 + }, + { + "epoch": 1.7769019954247922, + "grad_norm": 0.8846521973609924, + "learning_rate": 8.154413097323748e-05, + "loss": 2.759, + "step": 19613 + }, + { + "epoch": 1.776992593599239, + "grad_norm": 0.8774383068084717, + "learning_rate": 8.153808977224673e-05, + "loss": 2.7388, + "step": 19614 + }, + { + "epoch": 1.7770831917736858, + "grad_norm": 0.9329836964607239, + "learning_rate": 8.153204857125598e-05, + "loss": 3.0132, + "step": 19615 + }, + { + "epoch": 1.7771737899481326, + "grad_norm": 0.909386396408081, + "learning_rate": 8.152600737026521e-05, + "loss": 2.76, + "step": 19616 + }, + { + "epoch": 1.7772643881225794, + "grad_norm": 0.8861119747161865, + "learning_rate": 8.151996616927446e-05, + "loss": 2.8432, + "step": 19617 + }, + { + "epoch": 1.7773549862970262, + "grad_norm": 0.8953719735145569, + "learning_rate": 8.151392496828369e-05, + "loss": 2.6613, + "step": 19618 + }, + { + "epoch": 1.777445584471473, + "grad_norm": 0.8074958324432373, + "learning_rate": 8.150788376729294e-05, + "loss": 1.7938, + "step": 19619 + }, + { + "epoch": 1.7775361826459197, + "grad_norm": 0.8626850247383118, + "learning_rate": 8.150184256630219e-05, + "loss": 2.6364, + "step": 19620 + }, + { + "epoch": 1.7776267808203665, + "grad_norm": 0.9123237729072571, + "learning_rate": 8.149580136531144e-05, + "loss": 2.5361, + "step": 19621 + }, + { + "epoch": 1.7777173789948133, + "grad_norm": 0.998664140701294, + "learning_rate": 8.148976016432067e-05, + "loss": 2.9309, + "step": 19622 + }, + { + "epoch": 1.77780797716926, + "grad_norm": 0.9767919778823853, + "learning_rate": 8.148371896332992e-05, + "loss": 2.864, + "step": 19623 + }, + { + "epoch": 1.7778985753437069, + "grad_norm": 0.996992290019989, + "learning_rate": 8.147767776233915e-05, + "loss": 2.736, + "step": 19624 + }, + { + "epoch": 1.7779891735181534, + "grad_norm": 0.8967661261558533, + "learning_rate": 8.14716365613484e-05, + "loss": 2.7554, + "step": 19625 + }, + { + "epoch": 1.7780797716926005, + "grad_norm": 0.8800758719444275, + "learning_rate": 8.146559536035763e-05, + "loss": 2.8843, + "step": 19626 + }, + { + "epoch": 1.778170369867047, + "grad_norm": 0.9215776920318604, + "learning_rate": 8.14595541593669e-05, + "loss": 2.7418, + "step": 19627 + }, + { + "epoch": 1.778260968041494, + "grad_norm": 0.8746472597122192, + "learning_rate": 8.145351295837613e-05, + "loss": 2.6264, + "step": 19628 + }, + { + "epoch": 1.7783515662159406, + "grad_norm": 1.0204259157180786, + "learning_rate": 8.144747175738538e-05, + "loss": 2.99, + "step": 19629 + }, + { + "epoch": 1.7784421643903876, + "grad_norm": 0.9744011759757996, + "learning_rate": 8.144143055639461e-05, + "loss": 2.6036, + "step": 19630 + }, + { + "epoch": 1.7785327625648342, + "grad_norm": 0.8919046521186829, + "learning_rate": 8.143538935540386e-05, + "loss": 2.7638, + "step": 19631 + }, + { + "epoch": 1.7786233607392812, + "grad_norm": 0.9209610819816589, + "learning_rate": 8.14293481544131e-05, + "loss": 2.6644, + "step": 19632 + }, + { + "epoch": 1.7787139589137277, + "grad_norm": 0.9023592472076416, + "learning_rate": 8.142330695342234e-05, + "loss": 2.8297, + "step": 19633 + }, + { + "epoch": 1.7788045570881748, + "grad_norm": 0.8907949328422546, + "learning_rate": 8.141726575243159e-05, + "loss": 2.7325, + "step": 19634 + }, + { + "epoch": 1.7788951552626213, + "grad_norm": 0.9759233593940735, + "learning_rate": 8.141122455144083e-05, + "loss": 2.7514, + "step": 19635 + }, + { + "epoch": 1.7789857534370683, + "grad_norm": 0.9358410835266113, + "learning_rate": 8.140518335045008e-05, + "loss": 2.6372, + "step": 19636 + }, + { + "epoch": 1.779076351611515, + "grad_norm": 0.7136829495429993, + "learning_rate": 8.139914214945932e-05, + "loss": 1.8961, + "step": 19637 + }, + { + "epoch": 1.779166949785962, + "grad_norm": 0.9374995827674866, + "learning_rate": 8.139310094846856e-05, + "loss": 2.8107, + "step": 19638 + }, + { + "epoch": 1.7792575479604085, + "grad_norm": 0.8953163623809814, + "learning_rate": 8.13870597474778e-05, + "loss": 2.6605, + "step": 19639 + }, + { + "epoch": 1.7793481461348555, + "grad_norm": 0.955460786819458, + "learning_rate": 8.138101854648704e-05, + "loss": 2.6817, + "step": 19640 + }, + { + "epoch": 1.779438744309302, + "grad_norm": 0.8604552149772644, + "learning_rate": 8.137497734549628e-05, + "loss": 2.1031, + "step": 19641 + }, + { + "epoch": 1.779529342483749, + "grad_norm": 0.9202382564544678, + "learning_rate": 8.136893614450554e-05, + "loss": 2.8174, + "step": 19642 + }, + { + "epoch": 1.7796199406581956, + "grad_norm": 1.000631332397461, + "learning_rate": 8.136289494351477e-05, + "loss": 2.7726, + "step": 19643 + }, + { + "epoch": 1.7797105388326426, + "grad_norm": 0.9700411558151245, + "learning_rate": 8.135685374252402e-05, + "loss": 2.8088, + "step": 19644 + }, + { + "epoch": 1.7798011370070892, + "grad_norm": 0.9009131193161011, + "learning_rate": 8.135081254153326e-05, + "loss": 2.7273, + "step": 19645 + }, + { + "epoch": 1.7798917351815362, + "grad_norm": 0.9220311641693115, + "learning_rate": 8.13447713405425e-05, + "loss": 2.7012, + "step": 19646 + }, + { + "epoch": 1.7799823333559828, + "grad_norm": 0.9161410927772522, + "learning_rate": 8.133873013955175e-05, + "loss": 2.699, + "step": 19647 + }, + { + "epoch": 1.7800729315304298, + "grad_norm": 0.8774468302726746, + "learning_rate": 8.133268893856098e-05, + "loss": 2.6205, + "step": 19648 + }, + { + "epoch": 1.7801635297048763, + "grad_norm": 0.8908450603485107, + "learning_rate": 8.132664773757023e-05, + "loss": 2.5734, + "step": 19649 + }, + { + "epoch": 1.7802541278793234, + "grad_norm": 0.9782586097717285, + "learning_rate": 8.132060653657948e-05, + "loss": 2.9559, + "step": 19650 + }, + { + "epoch": 1.78034472605377, + "grad_norm": 0.8492426872253418, + "learning_rate": 8.131456533558873e-05, + "loss": 2.6788, + "step": 19651 + }, + { + "epoch": 1.780435324228217, + "grad_norm": 0.9044932723045349, + "learning_rate": 8.130852413459796e-05, + "loss": 3.0027, + "step": 19652 + }, + { + "epoch": 1.7805259224026635, + "grad_norm": 0.8932974934577942, + "learning_rate": 8.130248293360721e-05, + "loss": 2.6197, + "step": 19653 + }, + { + "epoch": 1.7806165205771105, + "grad_norm": 0.9369896054267883, + "learning_rate": 8.129644173261644e-05, + "loss": 2.9406, + "step": 19654 + }, + { + "epoch": 1.780707118751557, + "grad_norm": 0.8747892379760742, + "learning_rate": 8.129040053162569e-05, + "loss": 2.6633, + "step": 19655 + }, + { + "epoch": 1.780797716926004, + "grad_norm": 0.788303792476654, + "learning_rate": 8.128435933063493e-05, + "loss": 2.2002, + "step": 19656 + }, + { + "epoch": 1.7808883151004506, + "grad_norm": 0.769682765007019, + "learning_rate": 8.127831812964419e-05, + "loss": 2.0084, + "step": 19657 + }, + { + "epoch": 1.7809789132748977, + "grad_norm": 0.9569039940834045, + "learning_rate": 8.127227692865342e-05, + "loss": 2.7212, + "step": 19658 + }, + { + "epoch": 1.7810695114493442, + "grad_norm": 0.8825284838676453, + "learning_rate": 8.126623572766267e-05, + "loss": 2.6279, + "step": 19659 + }, + { + "epoch": 1.7811601096237912, + "grad_norm": 0.8995934724807739, + "learning_rate": 8.12601945266719e-05, + "loss": 3.0294, + "step": 19660 + }, + { + "epoch": 1.7812507077982378, + "grad_norm": 0.9174348711967468, + "learning_rate": 8.125415332568115e-05, + "loss": 3.0147, + "step": 19661 + }, + { + "epoch": 1.7813413059726848, + "grad_norm": 0.8568856120109558, + "learning_rate": 8.124811212469038e-05, + "loss": 2.6275, + "step": 19662 + }, + { + "epoch": 1.7814319041471314, + "grad_norm": 0.9774879813194275, + "learning_rate": 8.124207092369963e-05, + "loss": 2.8454, + "step": 19663 + }, + { + "epoch": 1.7815225023215784, + "grad_norm": 0.8607246279716492, + "learning_rate": 8.123602972270888e-05, + "loss": 2.5296, + "step": 19664 + }, + { + "epoch": 1.781613100496025, + "grad_norm": 0.9497876763343811, + "learning_rate": 8.122998852171813e-05, + "loss": 2.8131, + "step": 19665 + }, + { + "epoch": 1.7817036986704717, + "grad_norm": 0.8903675079345703, + "learning_rate": 8.122394732072736e-05, + "loss": 2.6734, + "step": 19666 + }, + { + "epoch": 1.7817942968449185, + "grad_norm": 0.9056803584098816, + "learning_rate": 8.121790611973661e-05, + "loss": 2.5736, + "step": 19667 + }, + { + "epoch": 1.7818848950193653, + "grad_norm": 1.0086009502410889, + "learning_rate": 8.121186491874586e-05, + "loss": 2.588, + "step": 19668 + }, + { + "epoch": 1.781975493193812, + "grad_norm": 0.8641682267189026, + "learning_rate": 8.120582371775509e-05, + "loss": 2.3759, + "step": 19669 + }, + { + "epoch": 1.7820660913682589, + "grad_norm": 0.9461573362350464, + "learning_rate": 8.119978251676434e-05, + "loss": 2.675, + "step": 19670 + }, + { + "epoch": 1.7821566895427057, + "grad_norm": 0.7591676712036133, + "learning_rate": 8.119374131577357e-05, + "loss": 1.7995, + "step": 19671 + }, + { + "epoch": 1.7822472877171525, + "grad_norm": 0.8782507181167603, + "learning_rate": 8.118770011478283e-05, + "loss": 2.9478, + "step": 19672 + }, + { + "epoch": 1.7823378858915992, + "grad_norm": 0.9458013772964478, + "learning_rate": 8.118165891379207e-05, + "loss": 2.8322, + "step": 19673 + }, + { + "epoch": 1.782428484066046, + "grad_norm": 0.8684398531913757, + "learning_rate": 8.117561771280131e-05, + "loss": 2.7141, + "step": 19674 + }, + { + "epoch": 1.7825190822404928, + "grad_norm": 0.9578033685684204, + "learning_rate": 8.116957651181055e-05, + "loss": 2.6057, + "step": 19675 + }, + { + "epoch": 1.7826096804149396, + "grad_norm": 0.9111964106559753, + "learning_rate": 8.11635353108198e-05, + "loss": 2.8669, + "step": 19676 + }, + { + "epoch": 1.7827002785893864, + "grad_norm": 0.9193137288093567, + "learning_rate": 8.115749410982903e-05, + "loss": 2.8169, + "step": 19677 + }, + { + "epoch": 1.7827908767638332, + "grad_norm": 0.8988669514656067, + "learning_rate": 8.115145290883828e-05, + "loss": 2.6579, + "step": 19678 + }, + { + "epoch": 1.78288147493828, + "grad_norm": 0.8169118762016296, + "learning_rate": 8.114541170784753e-05, + "loss": 2.4257, + "step": 19679 + }, + { + "epoch": 1.7829720731127268, + "grad_norm": 0.8434300422668457, + "learning_rate": 8.113937050685677e-05, + "loss": 1.9464, + "step": 19680 + }, + { + "epoch": 1.7830626712871736, + "grad_norm": 0.863307774066925, + "learning_rate": 8.113332930586601e-05, + "loss": 2.6964, + "step": 19681 + }, + { + "epoch": 1.7831532694616203, + "grad_norm": 0.9210168123245239, + "learning_rate": 8.112728810487525e-05, + "loss": 2.7465, + "step": 19682 + }, + { + "epoch": 1.7832438676360671, + "grad_norm": 0.9046961069107056, + "learning_rate": 8.11212469038845e-05, + "loss": 3.0658, + "step": 19683 + }, + { + "epoch": 1.783334465810514, + "grad_norm": 0.7368083596229553, + "learning_rate": 8.111520570289374e-05, + "loss": 1.3987, + "step": 19684 + }, + { + "epoch": 1.7834250639849607, + "grad_norm": 0.982388973236084, + "learning_rate": 8.110916450190298e-05, + "loss": 2.8802, + "step": 19685 + }, + { + "epoch": 1.7835156621594075, + "grad_norm": 0.8819802403450012, + "learning_rate": 8.110312330091222e-05, + "loss": 2.7366, + "step": 19686 + }, + { + "epoch": 1.7836062603338543, + "grad_norm": 0.960934579372406, + "learning_rate": 8.109708209992148e-05, + "loss": 2.5691, + "step": 19687 + }, + { + "epoch": 1.783696858508301, + "grad_norm": 0.8613075017929077, + "learning_rate": 8.109104089893071e-05, + "loss": 2.9213, + "step": 19688 + }, + { + "epoch": 1.7837874566827479, + "grad_norm": 0.951096773147583, + "learning_rate": 8.108499969793996e-05, + "loss": 2.4214, + "step": 19689 + }, + { + "epoch": 1.7838780548571946, + "grad_norm": 0.9074843525886536, + "learning_rate": 8.10789584969492e-05, + "loss": 2.8014, + "step": 19690 + }, + { + "epoch": 1.7839686530316414, + "grad_norm": 0.9244731068611145, + "learning_rate": 8.107291729595844e-05, + "loss": 2.7555, + "step": 19691 + }, + { + "epoch": 1.7840592512060882, + "grad_norm": 1.030596137046814, + "learning_rate": 8.106687609496768e-05, + "loss": 2.8402, + "step": 19692 + }, + { + "epoch": 1.784149849380535, + "grad_norm": 0.748628556728363, + "learning_rate": 8.106083489397692e-05, + "loss": 2.0174, + "step": 19693 + }, + { + "epoch": 1.7842404475549818, + "grad_norm": 1.0428632497787476, + "learning_rate": 8.105479369298616e-05, + "loss": 2.3891, + "step": 19694 + }, + { + "epoch": 1.7843310457294286, + "grad_norm": 0.9431568384170532, + "learning_rate": 8.104875249199542e-05, + "loss": 2.8488, + "step": 19695 + }, + { + "epoch": 1.7844216439038754, + "grad_norm": 0.8680261969566345, + "learning_rate": 8.104271129100465e-05, + "loss": 2.7092, + "step": 19696 + }, + { + "epoch": 1.7845122420783222, + "grad_norm": 0.9394903779029846, + "learning_rate": 8.10366700900139e-05, + "loss": 2.8847, + "step": 19697 + }, + { + "epoch": 1.784602840252769, + "grad_norm": 0.9331144094467163, + "learning_rate": 8.103062888902313e-05, + "loss": 2.6499, + "step": 19698 + }, + { + "epoch": 1.7846934384272157, + "grad_norm": 0.830708920955658, + "learning_rate": 8.102458768803238e-05, + "loss": 2.111, + "step": 19699 + }, + { + "epoch": 1.7847840366016625, + "grad_norm": 0.8821356296539307, + "learning_rate": 8.101854648704163e-05, + "loss": 2.6998, + "step": 19700 + }, + { + "epoch": 1.7848746347761093, + "grad_norm": 0.8279032707214355, + "learning_rate": 8.101250528605086e-05, + "loss": 1.9397, + "step": 19701 + }, + { + "epoch": 1.784965232950556, + "grad_norm": 0.9767627716064453, + "learning_rate": 8.100646408506013e-05, + "loss": 2.8001, + "step": 19702 + }, + { + "epoch": 1.7850558311250029, + "grad_norm": 0.8540474772453308, + "learning_rate": 8.100042288406936e-05, + "loss": 2.662, + "step": 19703 + }, + { + "epoch": 1.7851464292994497, + "grad_norm": 0.9408717155456543, + "learning_rate": 8.099438168307861e-05, + "loss": 2.7488, + "step": 19704 + }, + { + "epoch": 1.7852370274738965, + "grad_norm": 0.9622148275375366, + "learning_rate": 8.098834048208784e-05, + "loss": 2.7964, + "step": 19705 + }, + { + "epoch": 1.785327625648343, + "grad_norm": 0.9090145826339722, + "learning_rate": 8.098229928109709e-05, + "loss": 2.5981, + "step": 19706 + }, + { + "epoch": 1.78541822382279, + "grad_norm": 0.853467583656311, + "learning_rate": 8.097625808010632e-05, + "loss": 2.5289, + "step": 19707 + }, + { + "epoch": 1.7855088219972366, + "grad_norm": 0.8701715469360352, + "learning_rate": 8.097021687911557e-05, + "loss": 2.7278, + "step": 19708 + }, + { + "epoch": 1.7855994201716836, + "grad_norm": 0.8911206126213074, + "learning_rate": 8.096417567812482e-05, + "loss": 2.7513, + "step": 19709 + }, + { + "epoch": 1.7856900183461302, + "grad_norm": 0.8643510937690735, + "learning_rate": 8.095813447713407e-05, + "loss": 2.4141, + "step": 19710 + }, + { + "epoch": 1.7857806165205772, + "grad_norm": 0.9288781881332397, + "learning_rate": 8.09520932761433e-05, + "loss": 2.844, + "step": 19711 + }, + { + "epoch": 1.7858712146950237, + "grad_norm": 0.9439629912376404, + "learning_rate": 8.094605207515255e-05, + "loss": 2.6894, + "step": 19712 + }, + { + "epoch": 1.7859618128694708, + "grad_norm": 0.8653723001480103, + "learning_rate": 8.094001087416178e-05, + "loss": 2.6904, + "step": 19713 + }, + { + "epoch": 1.7860524110439173, + "grad_norm": 0.8637730479240417, + "learning_rate": 8.093396967317103e-05, + "loss": 2.7684, + "step": 19714 + }, + { + "epoch": 1.7861430092183643, + "grad_norm": 0.9048565030097961, + "learning_rate": 8.092792847218028e-05, + "loss": 2.7696, + "step": 19715 + }, + { + "epoch": 1.786233607392811, + "grad_norm": 0.9004048109054565, + "learning_rate": 8.092188727118951e-05, + "loss": 2.888, + "step": 19716 + }, + { + "epoch": 1.786324205567258, + "grad_norm": 0.9488416910171509, + "learning_rate": 8.091584607019876e-05, + "loss": 2.6203, + "step": 19717 + }, + { + "epoch": 1.7864148037417045, + "grad_norm": 0.9171770811080933, + "learning_rate": 8.0909804869208e-05, + "loss": 2.5854, + "step": 19718 + }, + { + "epoch": 1.7865054019161515, + "grad_norm": 0.9349489212036133, + "learning_rate": 8.090376366821725e-05, + "loss": 2.7545, + "step": 19719 + }, + { + "epoch": 1.786596000090598, + "grad_norm": 0.8785011768341064, + "learning_rate": 8.089772246722649e-05, + "loss": 2.7345, + "step": 19720 + }, + { + "epoch": 1.786686598265045, + "grad_norm": 0.9557050466537476, + "learning_rate": 8.089168126623573e-05, + "loss": 2.6362, + "step": 19721 + }, + { + "epoch": 1.7867771964394916, + "grad_norm": 0.8210780024528503, + "learning_rate": 8.088564006524497e-05, + "loss": 2.0641, + "step": 19722 + }, + { + "epoch": 1.7868677946139386, + "grad_norm": 0.7922274470329285, + "learning_rate": 8.087959886425422e-05, + "loss": 2.1497, + "step": 19723 + }, + { + "epoch": 1.7869583927883852, + "grad_norm": 0.8925907611846924, + "learning_rate": 8.087355766326346e-05, + "loss": 2.6766, + "step": 19724 + }, + { + "epoch": 1.7870489909628322, + "grad_norm": 0.8914843201637268, + "learning_rate": 8.086751646227271e-05, + "loss": 2.224, + "step": 19725 + }, + { + "epoch": 1.7871395891372788, + "grad_norm": 0.8863329887390137, + "learning_rate": 8.086147526128195e-05, + "loss": 2.7939, + "step": 19726 + }, + { + "epoch": 1.7872301873117258, + "grad_norm": 0.9581995010375977, + "learning_rate": 8.08554340602912e-05, + "loss": 2.6156, + "step": 19727 + }, + { + "epoch": 1.7873207854861723, + "grad_norm": 0.9670870900154114, + "learning_rate": 8.084939285930043e-05, + "loss": 2.6206, + "step": 19728 + }, + { + "epoch": 1.7874113836606194, + "grad_norm": 0.9736208319664001, + "learning_rate": 8.084335165830968e-05, + "loss": 2.7515, + "step": 19729 + }, + { + "epoch": 1.787501981835066, + "grad_norm": 0.8778705596923828, + "learning_rate": 8.083731045731891e-05, + "loss": 2.6981, + "step": 19730 + }, + { + "epoch": 1.787592580009513, + "grad_norm": 0.8957769870758057, + "learning_rate": 8.083126925632816e-05, + "loss": 2.7004, + "step": 19731 + }, + { + "epoch": 1.7876831781839595, + "grad_norm": 0.917412519454956, + "learning_rate": 8.08252280553374e-05, + "loss": 2.7876, + "step": 19732 + }, + { + "epoch": 1.7877737763584065, + "grad_norm": 0.9539158344268799, + "learning_rate": 8.081918685434665e-05, + "loss": 2.6823, + "step": 19733 + }, + { + "epoch": 1.787864374532853, + "grad_norm": 1.058030366897583, + "learning_rate": 8.08131456533559e-05, + "loss": 2.6458, + "step": 19734 + }, + { + "epoch": 1.7879549727073, + "grad_norm": 0.8534111380577087, + "learning_rate": 8.080710445236513e-05, + "loss": 2.7836, + "step": 19735 + }, + { + "epoch": 1.7880455708817466, + "grad_norm": 0.8757511973381042, + "learning_rate": 8.080106325137438e-05, + "loss": 2.5509, + "step": 19736 + }, + { + "epoch": 1.7881361690561937, + "grad_norm": 0.9588481783866882, + "learning_rate": 8.079502205038362e-05, + "loss": 2.6118, + "step": 19737 + }, + { + "epoch": 1.7882267672306402, + "grad_norm": 0.7526991367340088, + "learning_rate": 8.078898084939286e-05, + "loss": 2.0351, + "step": 19738 + }, + { + "epoch": 1.7883173654050872, + "grad_norm": 0.9748169183731079, + "learning_rate": 8.078293964840211e-05, + "loss": 2.7977, + "step": 19739 + }, + { + "epoch": 1.7884079635795338, + "grad_norm": 0.9526834487915039, + "learning_rate": 8.077689844741136e-05, + "loss": 2.7098, + "step": 19740 + }, + { + "epoch": 1.7884985617539808, + "grad_norm": 0.8737282752990723, + "learning_rate": 8.077085724642059e-05, + "loss": 2.7973, + "step": 19741 + }, + { + "epoch": 1.7885891599284274, + "grad_norm": 0.9000438451766968, + "learning_rate": 8.076481604542984e-05, + "loss": 2.5003, + "step": 19742 + }, + { + "epoch": 1.7886797581028744, + "grad_norm": 0.9636933207511902, + "learning_rate": 8.075877484443907e-05, + "loss": 2.7619, + "step": 19743 + }, + { + "epoch": 1.788770356277321, + "grad_norm": 0.9706093668937683, + "learning_rate": 8.075273364344832e-05, + "loss": 2.7746, + "step": 19744 + }, + { + "epoch": 1.788860954451768, + "grad_norm": 0.840020477771759, + "learning_rate": 8.074669244245756e-05, + "loss": 1.9973, + "step": 19745 + }, + { + "epoch": 1.7889515526262145, + "grad_norm": 0.8951588273048401, + "learning_rate": 8.07406512414668e-05, + "loss": 2.4421, + "step": 19746 + }, + { + "epoch": 1.7890421508006613, + "grad_norm": 0.8088259696960449, + "learning_rate": 8.073461004047605e-05, + "loss": 2.5985, + "step": 19747 + }, + { + "epoch": 1.789132748975108, + "grad_norm": 0.9532963633537292, + "learning_rate": 8.07285688394853e-05, + "loss": 2.7467, + "step": 19748 + }, + { + "epoch": 1.7892233471495549, + "grad_norm": 0.9333350658416748, + "learning_rate": 8.072252763849453e-05, + "loss": 3.3558, + "step": 19749 + }, + { + "epoch": 1.7893139453240017, + "grad_norm": 0.942086398601532, + "learning_rate": 8.071648643750378e-05, + "loss": 2.7834, + "step": 19750 + }, + { + "epoch": 1.7894045434984485, + "grad_norm": 0.8497042655944824, + "learning_rate": 8.071044523651303e-05, + "loss": 2.7331, + "step": 19751 + }, + { + "epoch": 1.7894951416728953, + "grad_norm": 0.901066243648529, + "learning_rate": 8.070440403552226e-05, + "loss": 2.5922, + "step": 19752 + }, + { + "epoch": 1.789585739847342, + "grad_norm": 1.0345962047576904, + "learning_rate": 8.069836283453151e-05, + "loss": 2.9125, + "step": 19753 + }, + { + "epoch": 1.7896763380217888, + "grad_norm": 0.9368999600410461, + "learning_rate": 8.069232163354076e-05, + "loss": 2.6152, + "step": 19754 + }, + { + "epoch": 1.7897669361962356, + "grad_norm": 0.8804991841316223, + "learning_rate": 8.068628043255e-05, + "loss": 2.6626, + "step": 19755 + }, + { + "epoch": 1.7898575343706824, + "grad_norm": 1.0046852827072144, + "learning_rate": 8.068023923155924e-05, + "loss": 2.6131, + "step": 19756 + }, + { + "epoch": 1.7899481325451292, + "grad_norm": 0.8563573956489563, + "learning_rate": 8.067419803056849e-05, + "loss": 2.7742, + "step": 19757 + }, + { + "epoch": 1.790038730719576, + "grad_norm": 0.8408615589141846, + "learning_rate": 8.066815682957772e-05, + "loss": 2.8692, + "step": 19758 + }, + { + "epoch": 1.7901293288940228, + "grad_norm": 0.9847007989883423, + "learning_rate": 8.066211562858697e-05, + "loss": 2.5417, + "step": 19759 + }, + { + "epoch": 1.7902199270684696, + "grad_norm": 0.8657693266868591, + "learning_rate": 8.06560744275962e-05, + "loss": 2.8421, + "step": 19760 + }, + { + "epoch": 1.7903105252429163, + "grad_norm": 0.922189474105835, + "learning_rate": 8.065003322660545e-05, + "loss": 2.7727, + "step": 19761 + }, + { + "epoch": 1.7904011234173631, + "grad_norm": 0.7762869000434875, + "learning_rate": 8.06439920256147e-05, + "loss": 2.0217, + "step": 19762 + }, + { + "epoch": 1.79049172159181, + "grad_norm": 1.0066486597061157, + "learning_rate": 8.063795082462394e-05, + "loss": 2.6752, + "step": 19763 + }, + { + "epoch": 1.7905823197662567, + "grad_norm": 0.8990219235420227, + "learning_rate": 8.063190962363318e-05, + "loss": 2.882, + "step": 19764 + }, + { + "epoch": 1.7906729179407035, + "grad_norm": 0.8701414465904236, + "learning_rate": 8.062586842264243e-05, + "loss": 2.6925, + "step": 19765 + }, + { + "epoch": 1.7907635161151503, + "grad_norm": 0.7220498323440552, + "learning_rate": 8.061982722165166e-05, + "loss": 1.8996, + "step": 19766 + }, + { + "epoch": 1.790854114289597, + "grad_norm": 0.9768568873405457, + "learning_rate": 8.061378602066091e-05, + "loss": 2.6958, + "step": 19767 + }, + { + "epoch": 1.7909447124640439, + "grad_norm": 0.8639654517173767, + "learning_rate": 8.060774481967016e-05, + "loss": 2.7756, + "step": 19768 + }, + { + "epoch": 1.7910353106384906, + "grad_norm": 0.9222977161407471, + "learning_rate": 8.06017036186794e-05, + "loss": 2.9058, + "step": 19769 + }, + { + "epoch": 1.7911259088129374, + "grad_norm": 0.7305577993392944, + "learning_rate": 8.059566241768865e-05, + "loss": 1.8017, + "step": 19770 + }, + { + "epoch": 1.7912165069873842, + "grad_norm": 1.0473480224609375, + "learning_rate": 8.058962121669788e-05, + "loss": 2.6333, + "step": 19771 + }, + { + "epoch": 1.791307105161831, + "grad_norm": 0.871393084526062, + "learning_rate": 8.058358001570713e-05, + "loss": 2.4529, + "step": 19772 + }, + { + "epoch": 1.7913977033362778, + "grad_norm": 1.0104193687438965, + "learning_rate": 8.057753881471637e-05, + "loss": 2.4642, + "step": 19773 + }, + { + "epoch": 1.7914883015107246, + "grad_norm": 0.9130479693412781, + "learning_rate": 8.057149761372561e-05, + "loss": 2.844, + "step": 19774 + }, + { + "epoch": 1.7915788996851714, + "grad_norm": 0.9136279225349426, + "learning_rate": 8.056545641273485e-05, + "loss": 2.81, + "step": 19775 + }, + { + "epoch": 1.7916694978596182, + "grad_norm": 0.8587570190429688, + "learning_rate": 8.05594152117441e-05, + "loss": 2.5075, + "step": 19776 + }, + { + "epoch": 1.791760096034065, + "grad_norm": 1.0002840757369995, + "learning_rate": 8.055337401075334e-05, + "loss": 2.8345, + "step": 19777 + }, + { + "epoch": 1.7918506942085117, + "grad_norm": 0.9498633742332458, + "learning_rate": 8.054733280976259e-05, + "loss": 2.7504, + "step": 19778 + }, + { + "epoch": 1.7919412923829585, + "grad_norm": 0.8948879837989807, + "learning_rate": 8.054129160877182e-05, + "loss": 2.6752, + "step": 19779 + }, + { + "epoch": 1.7920318905574053, + "grad_norm": 0.8671194911003113, + "learning_rate": 8.053525040778107e-05, + "loss": 2.4564, + "step": 19780 + }, + { + "epoch": 1.792122488731852, + "grad_norm": 0.8577576279640198, + "learning_rate": 8.05292092067903e-05, + "loss": 2.4444, + "step": 19781 + }, + { + "epoch": 1.7922130869062989, + "grad_norm": 1.0229629278182983, + "learning_rate": 8.052316800579955e-05, + "loss": 2.7842, + "step": 19782 + }, + { + "epoch": 1.7923036850807457, + "grad_norm": 0.9790910482406616, + "learning_rate": 8.05171268048088e-05, + "loss": 2.8844, + "step": 19783 + }, + { + "epoch": 1.7923942832551925, + "grad_norm": 0.9945358633995056, + "learning_rate": 8.051108560381805e-05, + "loss": 2.7873, + "step": 19784 + }, + { + "epoch": 1.7924848814296392, + "grad_norm": 0.8777796030044556, + "learning_rate": 8.050504440282728e-05, + "loss": 2.7322, + "step": 19785 + }, + { + "epoch": 1.792575479604086, + "grad_norm": 0.9667484164237976, + "learning_rate": 8.049900320183653e-05, + "loss": 2.5856, + "step": 19786 + }, + { + "epoch": 1.7926660777785326, + "grad_norm": 0.9581465125083923, + "learning_rate": 8.049296200084578e-05, + "loss": 2.6728, + "step": 19787 + }, + { + "epoch": 1.7927566759529796, + "grad_norm": 0.9172707200050354, + "learning_rate": 8.048692079985501e-05, + "loss": 2.8909, + "step": 19788 + }, + { + "epoch": 1.7928472741274262, + "grad_norm": 0.9701022505760193, + "learning_rate": 8.048087959886426e-05, + "loss": 2.8995, + "step": 19789 + }, + { + "epoch": 1.7929378723018732, + "grad_norm": 0.943630576133728, + "learning_rate": 8.04748383978735e-05, + "loss": 2.7346, + "step": 19790 + }, + { + "epoch": 1.7930284704763197, + "grad_norm": 1.0080926418304443, + "learning_rate": 8.046879719688274e-05, + "loss": 2.5558, + "step": 19791 + }, + { + "epoch": 1.7931190686507668, + "grad_norm": 0.8297812938690186, + "learning_rate": 8.046275599589199e-05, + "loss": 2.1597, + "step": 19792 + }, + { + "epoch": 1.7932096668252133, + "grad_norm": 0.9262785911560059, + "learning_rate": 8.045671479490124e-05, + "loss": 2.7911, + "step": 19793 + }, + { + "epoch": 1.7933002649996603, + "grad_norm": 0.8834097981452942, + "learning_rate": 8.045067359391047e-05, + "loss": 2.5978, + "step": 19794 + }, + { + "epoch": 1.793390863174107, + "grad_norm": 0.9041386842727661, + "learning_rate": 8.044463239291972e-05, + "loss": 2.6123, + "step": 19795 + }, + { + "epoch": 1.793481461348554, + "grad_norm": 1.0295443534851074, + "learning_rate": 8.043859119192895e-05, + "loss": 2.7122, + "step": 19796 + }, + { + "epoch": 1.7935720595230005, + "grad_norm": 0.8820552229881287, + "learning_rate": 8.04325499909382e-05, + "loss": 2.5524, + "step": 19797 + }, + { + "epoch": 1.7936626576974475, + "grad_norm": 0.9327291250228882, + "learning_rate": 8.042650878994743e-05, + "loss": 2.6542, + "step": 19798 + }, + { + "epoch": 1.793753255871894, + "grad_norm": 0.9092989563941956, + "learning_rate": 8.04204675889567e-05, + "loss": 2.8866, + "step": 19799 + }, + { + "epoch": 1.793843854046341, + "grad_norm": 0.8815040588378906, + "learning_rate": 8.041442638796593e-05, + "loss": 2.6544, + "step": 19800 + }, + { + "epoch": 1.7939344522207876, + "grad_norm": 0.9467434287071228, + "learning_rate": 8.040838518697518e-05, + "loss": 2.6406, + "step": 19801 + }, + { + "epoch": 1.7940250503952346, + "grad_norm": 0.8928407430648804, + "learning_rate": 8.040234398598442e-05, + "loss": 2.7016, + "step": 19802 + }, + { + "epoch": 1.7941156485696812, + "grad_norm": 0.8825139999389648, + "learning_rate": 8.039630278499366e-05, + "loss": 2.77, + "step": 19803 + }, + { + "epoch": 1.7942062467441282, + "grad_norm": 0.7214400768280029, + "learning_rate": 8.03902615840029e-05, + "loss": 1.8876, + "step": 19804 + }, + { + "epoch": 1.7942968449185748, + "grad_norm": 0.8723080158233643, + "learning_rate": 8.038422038301214e-05, + "loss": 2.9083, + "step": 19805 + }, + { + "epoch": 1.7943874430930218, + "grad_norm": 0.9094799160957336, + "learning_rate": 8.037817918202139e-05, + "loss": 2.8569, + "step": 19806 + }, + { + "epoch": 1.7944780412674683, + "grad_norm": 0.9170792698860168, + "learning_rate": 8.037213798103064e-05, + "loss": 2.9116, + "step": 19807 + }, + { + "epoch": 1.7945686394419154, + "grad_norm": 0.9614048600196838, + "learning_rate": 8.036609678003988e-05, + "loss": 2.6575, + "step": 19808 + }, + { + "epoch": 1.794659237616362, + "grad_norm": 0.8703429102897644, + "learning_rate": 8.036005557904912e-05, + "loss": 2.7583, + "step": 19809 + }, + { + "epoch": 1.794749835790809, + "grad_norm": 0.8465467095375061, + "learning_rate": 8.035401437805837e-05, + "loss": 2.1655, + "step": 19810 + }, + { + "epoch": 1.7948404339652555, + "grad_norm": 0.7513996958732605, + "learning_rate": 8.03479731770676e-05, + "loss": 2.0916, + "step": 19811 + }, + { + "epoch": 1.7949310321397025, + "grad_norm": 0.8952657580375671, + "learning_rate": 8.034193197607685e-05, + "loss": 2.8189, + "step": 19812 + }, + { + "epoch": 1.795021630314149, + "grad_norm": 0.9331550002098083, + "learning_rate": 8.033589077508608e-05, + "loss": 2.9603, + "step": 19813 + }, + { + "epoch": 1.795112228488596, + "grad_norm": 0.8915163278579712, + "learning_rate": 8.032984957409534e-05, + "loss": 2.4864, + "step": 19814 + }, + { + "epoch": 1.7952028266630427, + "grad_norm": 0.9637637734413147, + "learning_rate": 8.032380837310458e-05, + "loss": 2.7568, + "step": 19815 + }, + { + "epoch": 1.7952934248374897, + "grad_norm": 0.9570146203041077, + "learning_rate": 8.031776717211382e-05, + "loss": 2.7973, + "step": 19816 + }, + { + "epoch": 1.7953840230119362, + "grad_norm": 0.8796288371086121, + "learning_rate": 8.031172597112306e-05, + "loss": 2.9946, + "step": 19817 + }, + { + "epoch": 1.7954746211863832, + "grad_norm": 0.9198752641677856, + "learning_rate": 8.03056847701323e-05, + "loss": 2.7537, + "step": 19818 + }, + { + "epoch": 1.7955652193608298, + "grad_norm": 0.8917800784111023, + "learning_rate": 8.029964356914155e-05, + "loss": 2.5742, + "step": 19819 + }, + { + "epoch": 1.7956558175352768, + "grad_norm": 0.9746752381324768, + "learning_rate": 8.029360236815079e-05, + "loss": 2.5159, + "step": 19820 + }, + { + "epoch": 1.7957464157097234, + "grad_norm": 0.8912858366966248, + "learning_rate": 8.028756116716003e-05, + "loss": 2.5291, + "step": 19821 + }, + { + "epoch": 1.7958370138841704, + "grad_norm": 0.947095513343811, + "learning_rate": 8.028151996616928e-05, + "loss": 2.7284, + "step": 19822 + }, + { + "epoch": 1.795927612058617, + "grad_norm": 0.909668505191803, + "learning_rate": 8.027547876517853e-05, + "loss": 2.7471, + "step": 19823 + }, + { + "epoch": 1.796018210233064, + "grad_norm": 0.7852030992507935, + "learning_rate": 8.026943756418776e-05, + "loss": 2.3476, + "step": 19824 + }, + { + "epoch": 1.7961088084075105, + "grad_norm": 0.9211668372154236, + "learning_rate": 8.026339636319701e-05, + "loss": 2.477, + "step": 19825 + }, + { + "epoch": 1.7961994065819575, + "grad_norm": 0.8855127096176147, + "learning_rate": 8.025735516220625e-05, + "loss": 2.8972, + "step": 19826 + }, + { + "epoch": 1.796290004756404, + "grad_norm": 0.9469332098960876, + "learning_rate": 8.025131396121549e-05, + "loss": 2.9208, + "step": 19827 + }, + { + "epoch": 1.796380602930851, + "grad_norm": 0.8681029677391052, + "learning_rate": 8.024527276022473e-05, + "loss": 2.7154, + "step": 19828 + }, + { + "epoch": 1.7964712011052977, + "grad_norm": 0.9029445052146912, + "learning_rate": 8.023923155923399e-05, + "loss": 2.6418, + "step": 19829 + }, + { + "epoch": 1.7965617992797445, + "grad_norm": 0.8505279421806335, + "learning_rate": 8.023319035824322e-05, + "loss": 2.5799, + "step": 19830 + }, + { + "epoch": 1.7966523974541913, + "grad_norm": 0.9283725023269653, + "learning_rate": 8.022714915725247e-05, + "loss": 2.8656, + "step": 19831 + }, + { + "epoch": 1.796742995628638, + "grad_norm": 0.8582577705383301, + "learning_rate": 8.02211079562617e-05, + "loss": 2.7836, + "step": 19832 + }, + { + "epoch": 1.7968335938030848, + "grad_norm": 0.9831733703613281, + "learning_rate": 8.021506675527095e-05, + "loss": 2.6015, + "step": 19833 + }, + { + "epoch": 1.7969241919775316, + "grad_norm": 0.8769025206565857, + "learning_rate": 8.02090255542802e-05, + "loss": 2.6171, + "step": 19834 + }, + { + "epoch": 1.7970147901519784, + "grad_norm": 0.8868414759635925, + "learning_rate": 8.020298435328943e-05, + "loss": 2.6714, + "step": 19835 + }, + { + "epoch": 1.7971053883264252, + "grad_norm": 0.9427586793899536, + "learning_rate": 8.019694315229868e-05, + "loss": 2.7659, + "step": 19836 + }, + { + "epoch": 1.797195986500872, + "grad_norm": 0.9204096794128418, + "learning_rate": 8.019090195130793e-05, + "loss": 2.8973, + "step": 19837 + }, + { + "epoch": 1.7972865846753188, + "grad_norm": 0.9396561980247498, + "learning_rate": 8.018486075031718e-05, + "loss": 2.6307, + "step": 19838 + }, + { + "epoch": 1.7973771828497656, + "grad_norm": 0.6233600974082947, + "learning_rate": 8.017881954932641e-05, + "loss": 1.3625, + "step": 19839 + }, + { + "epoch": 1.7974677810242123, + "grad_norm": 0.9038007855415344, + "learning_rate": 8.017277834833566e-05, + "loss": 2.847, + "step": 19840 + }, + { + "epoch": 1.7975583791986591, + "grad_norm": 0.9397311210632324, + "learning_rate": 8.016673714734489e-05, + "loss": 2.7416, + "step": 19841 + }, + { + "epoch": 1.797648977373106, + "grad_norm": 0.8707376718521118, + "learning_rate": 8.016069594635414e-05, + "loss": 2.7327, + "step": 19842 + }, + { + "epoch": 1.7977395755475527, + "grad_norm": 0.8572217226028442, + "learning_rate": 8.015465474536337e-05, + "loss": 2.5212, + "step": 19843 + }, + { + "epoch": 1.7978301737219995, + "grad_norm": 1.0750044584274292, + "learning_rate": 8.014861354437263e-05, + "loss": 2.5265, + "step": 19844 + }, + { + "epoch": 1.7979207718964463, + "grad_norm": 0.9853305220603943, + "learning_rate": 8.014257234338187e-05, + "loss": 2.8751, + "step": 19845 + }, + { + "epoch": 1.798011370070893, + "grad_norm": 0.8761340975761414, + "learning_rate": 8.013653114239112e-05, + "loss": 2.6355, + "step": 19846 + }, + { + "epoch": 1.7981019682453399, + "grad_norm": 1.0231187343597412, + "learning_rate": 8.013048994140035e-05, + "loss": 2.4821, + "step": 19847 + }, + { + "epoch": 1.7981925664197866, + "grad_norm": 0.8722389340400696, + "learning_rate": 8.01244487404096e-05, + "loss": 2.5824, + "step": 19848 + }, + { + "epoch": 1.7982831645942334, + "grad_norm": 0.9261764883995056, + "learning_rate": 8.011840753941883e-05, + "loss": 2.5239, + "step": 19849 + }, + { + "epoch": 1.7983737627686802, + "grad_norm": 0.9281926155090332, + "learning_rate": 8.011236633842808e-05, + "loss": 2.6878, + "step": 19850 + }, + { + "epoch": 1.798464360943127, + "grad_norm": 0.8547356724739075, + "learning_rate": 8.010632513743733e-05, + "loss": 2.1302, + "step": 19851 + }, + { + "epoch": 1.7985549591175738, + "grad_norm": 0.8476301431655884, + "learning_rate": 8.010028393644657e-05, + "loss": 2.5412, + "step": 19852 + }, + { + "epoch": 1.7986455572920206, + "grad_norm": 0.7343735098838806, + "learning_rate": 8.009424273545581e-05, + "loss": 2.084, + "step": 19853 + }, + { + "epoch": 1.7987361554664674, + "grad_norm": 0.7986831068992615, + "learning_rate": 8.008820153446506e-05, + "loss": 1.9791, + "step": 19854 + }, + { + "epoch": 1.7988267536409142, + "grad_norm": 1.0122369527816772, + "learning_rate": 8.00821603334743e-05, + "loss": 2.6889, + "step": 19855 + }, + { + "epoch": 1.798917351815361, + "grad_norm": 0.9233565926551819, + "learning_rate": 8.007611913248354e-05, + "loss": 2.7431, + "step": 19856 + }, + { + "epoch": 1.7990079499898077, + "grad_norm": 0.8717123866081238, + "learning_rate": 8.007007793149279e-05, + "loss": 2.6354, + "step": 19857 + }, + { + "epoch": 1.7990985481642545, + "grad_norm": 0.895219087600708, + "learning_rate": 8.006403673050202e-05, + "loss": 2.8214, + "step": 19858 + }, + { + "epoch": 1.7991891463387013, + "grad_norm": 0.8317356705665588, + "learning_rate": 8.005799552951128e-05, + "loss": 2.0423, + "step": 19859 + }, + { + "epoch": 1.799279744513148, + "grad_norm": 0.9077824950218201, + "learning_rate": 8.005195432852052e-05, + "loss": 2.6792, + "step": 19860 + }, + { + "epoch": 1.7993703426875949, + "grad_norm": 0.8530577421188354, + "learning_rate": 8.004591312752976e-05, + "loss": 2.5943, + "step": 19861 + }, + { + "epoch": 1.7994609408620417, + "grad_norm": 0.8825093507766724, + "learning_rate": 8.0039871926539e-05, + "loss": 2.661, + "step": 19862 + }, + { + "epoch": 1.7995515390364885, + "grad_norm": 0.9085451364517212, + "learning_rate": 8.003383072554824e-05, + "loss": 2.7284, + "step": 19863 + }, + { + "epoch": 1.7996421372109352, + "grad_norm": 0.9453433752059937, + "learning_rate": 8.002778952455748e-05, + "loss": 2.672, + "step": 19864 + }, + { + "epoch": 1.799732735385382, + "grad_norm": 1.0456981658935547, + "learning_rate": 8.002174832356673e-05, + "loss": 3.056, + "step": 19865 + }, + { + "epoch": 1.7998233335598288, + "grad_norm": 0.9087267518043518, + "learning_rate": 8.001570712257597e-05, + "loss": 2.7076, + "step": 19866 + }, + { + "epoch": 1.7999139317342756, + "grad_norm": 0.9996299743652344, + "learning_rate": 8.000966592158522e-05, + "loss": 2.5701, + "step": 19867 + }, + { + "epoch": 1.8000045299087222, + "grad_norm": 0.9525749087333679, + "learning_rate": 8.000362472059446e-05, + "loss": 2.5581, + "step": 19868 + }, + { + "epoch": 1.8000951280831692, + "grad_norm": 0.8563581109046936, + "learning_rate": 7.99975835196037e-05, + "loss": 2.6244, + "step": 19869 + }, + { + "epoch": 1.8001857262576157, + "grad_norm": 0.8891896605491638, + "learning_rate": 7.999154231861295e-05, + "loss": 2.5936, + "step": 19870 + }, + { + "epoch": 1.8002763244320628, + "grad_norm": 0.8772788047790527, + "learning_rate": 7.998550111762218e-05, + "loss": 2.554, + "step": 19871 + }, + { + "epoch": 1.8003669226065093, + "grad_norm": 0.956807553768158, + "learning_rate": 7.997945991663143e-05, + "loss": 3.0272, + "step": 19872 + }, + { + "epoch": 1.8004575207809563, + "grad_norm": 1.027179479598999, + "learning_rate": 7.997341871564067e-05, + "loss": 2.7485, + "step": 19873 + }, + { + "epoch": 1.800548118955403, + "grad_norm": 0.9106263518333435, + "learning_rate": 7.996737751464993e-05, + "loss": 2.5932, + "step": 19874 + }, + { + "epoch": 1.80063871712985, + "grad_norm": 0.9282930493354797, + "learning_rate": 7.996133631365916e-05, + "loss": 2.7003, + "step": 19875 + }, + { + "epoch": 1.8007293153042965, + "grad_norm": 0.9397226572036743, + "learning_rate": 7.995529511266841e-05, + "loss": 2.6595, + "step": 19876 + }, + { + "epoch": 1.8008199134787435, + "grad_norm": 0.9200889468193054, + "learning_rate": 7.994925391167764e-05, + "loss": 2.6687, + "step": 19877 + }, + { + "epoch": 1.80091051165319, + "grad_norm": 0.6140425801277161, + "learning_rate": 7.994321271068689e-05, + "loss": 1.2713, + "step": 19878 + }, + { + "epoch": 1.801001109827637, + "grad_norm": 0.8863111734390259, + "learning_rate": 7.993717150969612e-05, + "loss": 2.0061, + "step": 19879 + }, + { + "epoch": 1.8010917080020836, + "grad_norm": 0.913133442401886, + "learning_rate": 7.993113030870537e-05, + "loss": 2.5492, + "step": 19880 + }, + { + "epoch": 1.8011823061765306, + "grad_norm": 0.9997656345367432, + "learning_rate": 7.99250891077146e-05, + "loss": 2.5545, + "step": 19881 + }, + { + "epoch": 1.8012729043509772, + "grad_norm": 0.9374469518661499, + "learning_rate": 7.991904790672387e-05, + "loss": 2.8941, + "step": 19882 + }, + { + "epoch": 1.8013635025254242, + "grad_norm": 0.8288390636444092, + "learning_rate": 7.99130067057331e-05, + "loss": 2.3842, + "step": 19883 + }, + { + "epoch": 1.8014541006998708, + "grad_norm": 0.9079241752624512, + "learning_rate": 7.990696550474235e-05, + "loss": 2.8641, + "step": 19884 + }, + { + "epoch": 1.8015446988743178, + "grad_norm": 0.861506998538971, + "learning_rate": 7.990092430375158e-05, + "loss": 2.7059, + "step": 19885 + }, + { + "epoch": 1.8016352970487644, + "grad_norm": 0.8795350790023804, + "learning_rate": 7.989488310276083e-05, + "loss": 2.7704, + "step": 19886 + }, + { + "epoch": 1.8017258952232114, + "grad_norm": 0.915363073348999, + "learning_rate": 7.988884190177008e-05, + "loss": 2.7149, + "step": 19887 + }, + { + "epoch": 1.801816493397658, + "grad_norm": 0.9311078786849976, + "learning_rate": 7.988280070077931e-05, + "loss": 2.6718, + "step": 19888 + }, + { + "epoch": 1.801907091572105, + "grad_norm": 0.8725493550300598, + "learning_rate": 7.987675949978857e-05, + "loss": 2.5327, + "step": 19889 + }, + { + "epoch": 1.8019976897465515, + "grad_norm": 0.7288026809692383, + "learning_rate": 7.987071829879781e-05, + "loss": 2.0147, + "step": 19890 + }, + { + "epoch": 1.8020882879209985, + "grad_norm": 0.7604244947433472, + "learning_rate": 7.986467709780706e-05, + "loss": 2.1818, + "step": 19891 + }, + { + "epoch": 1.802178886095445, + "grad_norm": 0.8932091593742371, + "learning_rate": 7.985863589681629e-05, + "loss": 2.7393, + "step": 19892 + }, + { + "epoch": 1.802269484269892, + "grad_norm": 0.8638725876808167, + "learning_rate": 7.985259469582554e-05, + "loss": 2.587, + "step": 19893 + }, + { + "epoch": 1.8023600824443387, + "grad_norm": 1.0486233234405518, + "learning_rate": 7.984655349483477e-05, + "loss": 2.6637, + "step": 19894 + }, + { + "epoch": 1.8024506806187857, + "grad_norm": 0.919373631477356, + "learning_rate": 7.984051229384402e-05, + "loss": 2.9668, + "step": 19895 + }, + { + "epoch": 1.8025412787932322, + "grad_norm": 0.8904073238372803, + "learning_rate": 7.983447109285325e-05, + "loss": 2.7684, + "step": 19896 + }, + { + "epoch": 1.8026318769676792, + "grad_norm": 0.9708479642868042, + "learning_rate": 7.982842989186251e-05, + "loss": 2.8035, + "step": 19897 + }, + { + "epoch": 1.8027224751421258, + "grad_norm": 0.908381998538971, + "learning_rate": 7.982238869087175e-05, + "loss": 3.0502, + "step": 19898 + }, + { + "epoch": 1.8028130733165728, + "grad_norm": 0.8748654723167419, + "learning_rate": 7.9816347489881e-05, + "loss": 2.5164, + "step": 19899 + }, + { + "epoch": 1.8029036714910194, + "grad_norm": 0.9799836277961731, + "learning_rate": 7.981030628889023e-05, + "loss": 2.513, + "step": 19900 + }, + { + "epoch": 1.8029942696654664, + "grad_norm": 0.9047328233718872, + "learning_rate": 7.980426508789948e-05, + "loss": 2.6298, + "step": 19901 + }, + { + "epoch": 1.803084867839913, + "grad_norm": 0.9488750100135803, + "learning_rate": 7.979822388690872e-05, + "loss": 2.6446, + "step": 19902 + }, + { + "epoch": 1.80317546601436, + "grad_norm": 0.8895475268363953, + "learning_rate": 7.979218268591796e-05, + "loss": 2.5714, + "step": 19903 + }, + { + "epoch": 1.8032660641888065, + "grad_norm": 0.9025459885597229, + "learning_rate": 7.97861414849272e-05, + "loss": 2.7659, + "step": 19904 + }, + { + "epoch": 1.8033566623632535, + "grad_norm": 0.9254783987998962, + "learning_rate": 7.978010028393645e-05, + "loss": 2.7582, + "step": 19905 + }, + { + "epoch": 1.8034472605377, + "grad_norm": 0.8945161700248718, + "learning_rate": 7.97740590829457e-05, + "loss": 2.6517, + "step": 19906 + }, + { + "epoch": 1.8035378587121471, + "grad_norm": 0.9186192750930786, + "learning_rate": 7.976801788195494e-05, + "loss": 2.4342, + "step": 19907 + }, + { + "epoch": 1.8036284568865937, + "grad_norm": 0.9741571545600891, + "learning_rate": 7.976197668096418e-05, + "loss": 2.8664, + "step": 19908 + }, + { + "epoch": 1.8037190550610405, + "grad_norm": 0.8757257461547852, + "learning_rate": 7.975593547997342e-05, + "loss": 2.5657, + "step": 19909 + }, + { + "epoch": 1.8038096532354873, + "grad_norm": 0.9185802936553955, + "learning_rate": 7.974989427898266e-05, + "loss": 3.0928, + "step": 19910 + }, + { + "epoch": 1.803900251409934, + "grad_norm": 0.9029408097267151, + "learning_rate": 7.97438530779919e-05, + "loss": 2.6558, + "step": 19911 + }, + { + "epoch": 1.8039908495843808, + "grad_norm": 0.9639163613319397, + "learning_rate": 7.973781187700116e-05, + "loss": 2.8031, + "step": 19912 + }, + { + "epoch": 1.8040814477588276, + "grad_norm": 0.9864394068717957, + "learning_rate": 7.97317706760104e-05, + "loss": 2.8387, + "step": 19913 + }, + { + "epoch": 1.8041720459332744, + "grad_norm": 0.8930122256278992, + "learning_rate": 7.972572947501964e-05, + "loss": 2.7838, + "step": 19914 + }, + { + "epoch": 1.8042626441077212, + "grad_norm": 0.9311510920524597, + "learning_rate": 7.971968827402888e-05, + "loss": 2.7029, + "step": 19915 + }, + { + "epoch": 1.804353242282168, + "grad_norm": 0.9906938672065735, + "learning_rate": 7.971364707303812e-05, + "loss": 2.7054, + "step": 19916 + }, + { + "epoch": 1.8044438404566148, + "grad_norm": 0.9282276034355164, + "learning_rate": 7.970760587204736e-05, + "loss": 2.7539, + "step": 19917 + }, + { + "epoch": 1.8045344386310616, + "grad_norm": 0.7419955134391785, + "learning_rate": 7.97015646710566e-05, + "loss": 2.0553, + "step": 19918 + }, + { + "epoch": 1.8046250368055083, + "grad_norm": 1.0553631782531738, + "learning_rate": 7.969552347006585e-05, + "loss": 3.1471, + "step": 19919 + }, + { + "epoch": 1.8047156349799551, + "grad_norm": 0.8567253351211548, + "learning_rate": 7.96894822690751e-05, + "loss": 2.6325, + "step": 19920 + }, + { + "epoch": 1.804806233154402, + "grad_norm": 0.921349048614502, + "learning_rate": 7.968344106808435e-05, + "loss": 2.6422, + "step": 19921 + }, + { + "epoch": 1.8048968313288487, + "grad_norm": 0.8680830001831055, + "learning_rate": 7.967739986709358e-05, + "loss": 2.1138, + "step": 19922 + }, + { + "epoch": 1.8049874295032955, + "grad_norm": 0.9072445631027222, + "learning_rate": 7.967135866610283e-05, + "loss": 2.6738, + "step": 19923 + }, + { + "epoch": 1.8050780276777423, + "grad_norm": 0.9351055026054382, + "learning_rate": 7.966531746511206e-05, + "loss": 2.4284, + "step": 19924 + }, + { + "epoch": 1.805168625852189, + "grad_norm": 0.8921867609024048, + "learning_rate": 7.965927626412131e-05, + "loss": 2.6171, + "step": 19925 + }, + { + "epoch": 1.8052592240266359, + "grad_norm": 0.9398729205131531, + "learning_rate": 7.965323506313055e-05, + "loss": 3.004, + "step": 19926 + }, + { + "epoch": 1.8053498222010826, + "grad_norm": 0.9724181890487671, + "learning_rate": 7.96471938621398e-05, + "loss": 2.7904, + "step": 19927 + }, + { + "epoch": 1.8054404203755294, + "grad_norm": 0.8724191188812256, + "learning_rate": 7.964115266114904e-05, + "loss": 2.6694, + "step": 19928 + }, + { + "epoch": 1.8055310185499762, + "grad_norm": 0.7328202724456787, + "learning_rate": 7.963511146015829e-05, + "loss": 1.962, + "step": 19929 + }, + { + "epoch": 1.805621616724423, + "grad_norm": 0.9142529964447021, + "learning_rate": 7.962907025916752e-05, + "loss": 2.6148, + "step": 19930 + }, + { + "epoch": 1.8057122148988698, + "grad_norm": 0.7951529622077942, + "learning_rate": 7.962302905817677e-05, + "loss": 2.0001, + "step": 19931 + }, + { + "epoch": 1.8058028130733166, + "grad_norm": 0.8765657544136047, + "learning_rate": 7.9616987857186e-05, + "loss": 2.6728, + "step": 19932 + }, + { + "epoch": 1.8058934112477634, + "grad_norm": 1.009568214416504, + "learning_rate": 7.961094665619525e-05, + "loss": 2.8247, + "step": 19933 + }, + { + "epoch": 1.8059840094222102, + "grad_norm": 0.8029274940490723, + "learning_rate": 7.96049054552045e-05, + "loss": 2.2156, + "step": 19934 + }, + { + "epoch": 1.806074607596657, + "grad_norm": 0.8787333369255066, + "learning_rate": 7.959886425421375e-05, + "loss": 2.7625, + "step": 19935 + }, + { + "epoch": 1.8061652057711037, + "grad_norm": 0.9904034733772278, + "learning_rate": 7.959282305322298e-05, + "loss": 3.0339, + "step": 19936 + }, + { + "epoch": 1.8062558039455505, + "grad_norm": 0.8575915098190308, + "learning_rate": 7.958678185223223e-05, + "loss": 2.5565, + "step": 19937 + }, + { + "epoch": 1.8063464021199973, + "grad_norm": 0.9444987773895264, + "learning_rate": 7.958074065124148e-05, + "loss": 2.5871, + "step": 19938 + }, + { + "epoch": 1.806437000294444, + "grad_norm": 0.9003795981407166, + "learning_rate": 7.957469945025071e-05, + "loss": 2.7806, + "step": 19939 + }, + { + "epoch": 1.8065275984688909, + "grad_norm": 0.8783276677131653, + "learning_rate": 7.956865824925996e-05, + "loss": 2.5985, + "step": 19940 + }, + { + "epoch": 1.8066181966433377, + "grad_norm": 0.9368207454681396, + "learning_rate": 7.956261704826919e-05, + "loss": 2.9047, + "step": 19941 + }, + { + "epoch": 1.8067087948177845, + "grad_norm": 0.8446524739265442, + "learning_rate": 7.955657584727845e-05, + "loss": 2.6636, + "step": 19942 + }, + { + "epoch": 1.8067993929922312, + "grad_norm": 0.8916694521903992, + "learning_rate": 7.955053464628769e-05, + "loss": 2.8502, + "step": 19943 + }, + { + "epoch": 1.806889991166678, + "grad_norm": 0.8832552433013916, + "learning_rate": 7.954449344529693e-05, + "loss": 2.9674, + "step": 19944 + }, + { + "epoch": 1.8069805893411248, + "grad_norm": 0.8980756998062134, + "learning_rate": 7.953845224430617e-05, + "loss": 2.2877, + "step": 19945 + }, + { + "epoch": 1.8070711875155716, + "grad_norm": 0.9365697503089905, + "learning_rate": 7.953241104331542e-05, + "loss": 2.7469, + "step": 19946 + }, + { + "epoch": 1.8071617856900184, + "grad_norm": 0.861371636390686, + "learning_rate": 7.952636984232465e-05, + "loss": 2.5237, + "step": 19947 + }, + { + "epoch": 1.8072523838644652, + "grad_norm": 0.9517042636871338, + "learning_rate": 7.95203286413339e-05, + "loss": 2.7961, + "step": 19948 + }, + { + "epoch": 1.8073429820389117, + "grad_norm": 0.9097279906272888, + "learning_rate": 7.951428744034315e-05, + "loss": 2.6745, + "step": 19949 + }, + { + "epoch": 1.8074335802133588, + "grad_norm": 0.7925310730934143, + "learning_rate": 7.950824623935239e-05, + "loss": 2.3938, + "step": 19950 + }, + { + "epoch": 1.8075241783878053, + "grad_norm": 0.9843773245811462, + "learning_rate": 7.950220503836163e-05, + "loss": 2.6355, + "step": 19951 + }, + { + "epoch": 1.8076147765622523, + "grad_norm": 0.8882350921630859, + "learning_rate": 7.949616383737087e-05, + "loss": 2.7056, + "step": 19952 + }, + { + "epoch": 1.807705374736699, + "grad_norm": 0.9875028133392334, + "learning_rate": 7.949012263638011e-05, + "loss": 2.6646, + "step": 19953 + }, + { + "epoch": 1.807795972911146, + "grad_norm": 0.9358144402503967, + "learning_rate": 7.948408143538936e-05, + "loss": 2.8657, + "step": 19954 + }, + { + "epoch": 1.8078865710855925, + "grad_norm": 0.8550472259521484, + "learning_rate": 7.94780402343986e-05, + "loss": 2.694, + "step": 19955 + }, + { + "epoch": 1.8079771692600395, + "grad_norm": 0.7949036359786987, + "learning_rate": 7.947199903340784e-05, + "loss": 2.2271, + "step": 19956 + }, + { + "epoch": 1.808067767434486, + "grad_norm": 0.7566214203834534, + "learning_rate": 7.94659578324171e-05, + "loss": 1.9819, + "step": 19957 + }, + { + "epoch": 1.808158365608933, + "grad_norm": 0.822529673576355, + "learning_rate": 7.945991663142633e-05, + "loss": 2.1, + "step": 19958 + }, + { + "epoch": 1.8082489637833796, + "grad_norm": 0.936005711555481, + "learning_rate": 7.945387543043558e-05, + "loss": 2.8024, + "step": 19959 + }, + { + "epoch": 1.8083395619578266, + "grad_norm": 0.8692055940628052, + "learning_rate": 7.944783422944481e-05, + "loss": 2.6634, + "step": 19960 + }, + { + "epoch": 1.8084301601322732, + "grad_norm": 0.8678922653198242, + "learning_rate": 7.944179302845406e-05, + "loss": 2.6512, + "step": 19961 + }, + { + "epoch": 1.8085207583067202, + "grad_norm": 0.9042878746986389, + "learning_rate": 7.94357518274633e-05, + "loss": 2.6974, + "step": 19962 + }, + { + "epoch": 1.8086113564811668, + "grad_norm": 0.8521676063537598, + "learning_rate": 7.942971062647254e-05, + "loss": 2.4484, + "step": 19963 + }, + { + "epoch": 1.8087019546556138, + "grad_norm": 0.8549389243125916, + "learning_rate": 7.942366942548179e-05, + "loss": 2.9367, + "step": 19964 + }, + { + "epoch": 1.8087925528300604, + "grad_norm": 0.7005032300949097, + "learning_rate": 7.941762822449104e-05, + "loss": 1.2201, + "step": 19965 + }, + { + "epoch": 1.8088831510045074, + "grad_norm": 0.9162421226501465, + "learning_rate": 7.941158702350027e-05, + "loss": 2.8843, + "step": 19966 + }, + { + "epoch": 1.808973749178954, + "grad_norm": 0.8980793952941895, + "learning_rate": 7.940554582250952e-05, + "loss": 2.6716, + "step": 19967 + }, + { + "epoch": 1.809064347353401, + "grad_norm": 0.8391587138175964, + "learning_rate": 7.939950462151876e-05, + "loss": 2.4515, + "step": 19968 + }, + { + "epoch": 1.8091549455278475, + "grad_norm": 0.9536582231521606, + "learning_rate": 7.9393463420528e-05, + "loss": 2.7491, + "step": 19969 + }, + { + "epoch": 1.8092455437022945, + "grad_norm": 0.8489549160003662, + "learning_rate": 7.938742221953725e-05, + "loss": 2.5247, + "step": 19970 + }, + { + "epoch": 1.809336141876741, + "grad_norm": 0.9805177450180054, + "learning_rate": 7.938138101854648e-05, + "loss": 2.6481, + "step": 19971 + }, + { + "epoch": 1.809426740051188, + "grad_norm": 0.9286538362503052, + "learning_rate": 7.937533981755573e-05, + "loss": 2.6188, + "step": 19972 + }, + { + "epoch": 1.8095173382256347, + "grad_norm": 0.9291961789131165, + "learning_rate": 7.936929861656498e-05, + "loss": 2.6351, + "step": 19973 + }, + { + "epoch": 1.8096079364000817, + "grad_norm": 1.0224705934524536, + "learning_rate": 7.936325741557423e-05, + "loss": 2.5786, + "step": 19974 + }, + { + "epoch": 1.8096985345745282, + "grad_norm": 0.7854882478713989, + "learning_rate": 7.935721621458346e-05, + "loss": 2.0241, + "step": 19975 + }, + { + "epoch": 1.8097891327489752, + "grad_norm": 0.9278445839881897, + "learning_rate": 7.935117501359271e-05, + "loss": 2.8173, + "step": 19976 + }, + { + "epoch": 1.8098797309234218, + "grad_norm": 0.7518685460090637, + "learning_rate": 7.934513381260194e-05, + "loss": 2.1417, + "step": 19977 + }, + { + "epoch": 1.8099703290978688, + "grad_norm": 0.9830196499824524, + "learning_rate": 7.933909261161119e-05, + "loss": 2.7991, + "step": 19978 + }, + { + "epoch": 1.8100609272723154, + "grad_norm": 0.8879215717315674, + "learning_rate": 7.933305141062044e-05, + "loss": 2.9774, + "step": 19979 + }, + { + "epoch": 1.8101515254467624, + "grad_norm": 0.916167140007019, + "learning_rate": 7.932701020962969e-05, + "loss": 2.7579, + "step": 19980 + }, + { + "epoch": 1.810242123621209, + "grad_norm": 0.8430920839309692, + "learning_rate": 7.932096900863892e-05, + "loss": 2.8065, + "step": 19981 + }, + { + "epoch": 1.810332721795656, + "grad_norm": 0.9127697348594666, + "learning_rate": 7.931492780764817e-05, + "loss": 2.6792, + "step": 19982 + }, + { + "epoch": 1.8104233199701025, + "grad_norm": 1.061435341835022, + "learning_rate": 7.93088866066574e-05, + "loss": 3.0907, + "step": 19983 + }, + { + "epoch": 1.8105139181445495, + "grad_norm": 0.7839395999908447, + "learning_rate": 7.930284540566665e-05, + "loss": 1.958, + "step": 19984 + }, + { + "epoch": 1.810604516318996, + "grad_norm": 0.934509813785553, + "learning_rate": 7.929680420467588e-05, + "loss": 2.7274, + "step": 19985 + }, + { + "epoch": 1.8106951144934431, + "grad_norm": 0.897650420665741, + "learning_rate": 7.929076300368513e-05, + "loss": 2.713, + "step": 19986 + }, + { + "epoch": 1.8107857126678897, + "grad_norm": 0.9037002921104431, + "learning_rate": 7.928472180269438e-05, + "loss": 2.7006, + "step": 19987 + }, + { + "epoch": 1.8108763108423367, + "grad_norm": 0.9229339957237244, + "learning_rate": 7.927868060170363e-05, + "loss": 2.9389, + "step": 19988 + }, + { + "epoch": 1.8109669090167833, + "grad_norm": 0.8743513226509094, + "learning_rate": 7.927263940071287e-05, + "loss": 2.5786, + "step": 19989 + }, + { + "epoch": 1.81105750719123, + "grad_norm": 0.9236484169960022, + "learning_rate": 7.926659819972211e-05, + "loss": 2.9722, + "step": 19990 + }, + { + "epoch": 1.8111481053656768, + "grad_norm": 0.8806544542312622, + "learning_rate": 7.926055699873136e-05, + "loss": 2.6927, + "step": 19991 + }, + { + "epoch": 1.8112387035401236, + "grad_norm": 0.8861164450645447, + "learning_rate": 7.925451579774059e-05, + "loss": 2.5567, + "step": 19992 + }, + { + "epoch": 1.8113293017145704, + "grad_norm": 0.9319732785224915, + "learning_rate": 7.924847459674984e-05, + "loss": 2.5612, + "step": 19993 + }, + { + "epoch": 1.8114198998890172, + "grad_norm": 0.904719352722168, + "learning_rate": 7.924243339575908e-05, + "loss": 2.6019, + "step": 19994 + }, + { + "epoch": 1.811510498063464, + "grad_norm": 0.7560587525367737, + "learning_rate": 7.923639219476833e-05, + "loss": 2.0477, + "step": 19995 + }, + { + "epoch": 1.8116010962379108, + "grad_norm": 0.9328410625457764, + "learning_rate": 7.923035099377757e-05, + "loss": 2.7107, + "step": 19996 + }, + { + "epoch": 1.8116916944123576, + "grad_norm": 0.8929027915000916, + "learning_rate": 7.922430979278681e-05, + "loss": 2.5387, + "step": 19997 + }, + { + "epoch": 1.8117822925868043, + "grad_norm": 0.9096964597702026, + "learning_rate": 7.921826859179605e-05, + "loss": 2.5421, + "step": 19998 + }, + { + "epoch": 1.8118728907612511, + "grad_norm": 0.9683943390846252, + "learning_rate": 7.92122273908053e-05, + "loss": 2.6067, + "step": 19999 + }, + { + "epoch": 1.811963488935698, + "grad_norm": 0.8861603736877441, + "learning_rate": 7.920618618981453e-05, + "loss": 3.0542, + "step": 20000 + }, + { + "epoch": 1.8120540871101447, + "grad_norm": 0.9754856824874878, + "learning_rate": 7.920014498882378e-05, + "loss": 2.8808, + "step": 20001 + }, + { + "epoch": 1.8121446852845915, + "grad_norm": 0.9188331961631775, + "learning_rate": 7.919410378783302e-05, + "loss": 2.7394, + "step": 20002 + }, + { + "epoch": 1.8122352834590383, + "grad_norm": 0.9213197231292725, + "learning_rate": 7.918806258684227e-05, + "loss": 2.8953, + "step": 20003 + }, + { + "epoch": 1.812325881633485, + "grad_norm": 0.974711537361145, + "learning_rate": 7.91820213858515e-05, + "loss": 2.9206, + "step": 20004 + }, + { + "epoch": 1.8124164798079319, + "grad_norm": 0.8761358261108398, + "learning_rate": 7.917598018486075e-05, + "loss": 2.7436, + "step": 20005 + }, + { + "epoch": 1.8125070779823786, + "grad_norm": 0.8818124532699585, + "learning_rate": 7.916993898387e-05, + "loss": 2.5025, + "step": 20006 + }, + { + "epoch": 1.8125976761568254, + "grad_norm": 0.8604636192321777, + "learning_rate": 7.916389778287924e-05, + "loss": 2.9176, + "step": 20007 + }, + { + "epoch": 1.8126882743312722, + "grad_norm": 1.0304378271102905, + "learning_rate": 7.915785658188848e-05, + "loss": 2.5943, + "step": 20008 + }, + { + "epoch": 1.812778872505719, + "grad_norm": 0.78641676902771, + "learning_rate": 7.915181538089773e-05, + "loss": 2.0616, + "step": 20009 + }, + { + "epoch": 1.8128694706801658, + "grad_norm": 0.9596242308616638, + "learning_rate": 7.914577417990698e-05, + "loss": 2.518, + "step": 20010 + }, + { + "epoch": 1.8129600688546126, + "grad_norm": 0.9162099957466125, + "learning_rate": 7.913973297891621e-05, + "loss": 2.5601, + "step": 20011 + }, + { + "epoch": 1.8130506670290594, + "grad_norm": 0.8907425403594971, + "learning_rate": 7.913369177792546e-05, + "loss": 2.6501, + "step": 20012 + }, + { + "epoch": 1.8131412652035062, + "grad_norm": 0.7396141886711121, + "learning_rate": 7.91276505769347e-05, + "loss": 1.9255, + "step": 20013 + }, + { + "epoch": 1.813231863377953, + "grad_norm": 0.8859812617301941, + "learning_rate": 7.912160937594394e-05, + "loss": 2.6184, + "step": 20014 + }, + { + "epoch": 1.8133224615523997, + "grad_norm": 0.8299242258071899, + "learning_rate": 7.911556817495318e-05, + "loss": 2.6519, + "step": 20015 + }, + { + "epoch": 1.8134130597268465, + "grad_norm": 0.9745957851409912, + "learning_rate": 7.910952697396242e-05, + "loss": 2.6849, + "step": 20016 + }, + { + "epoch": 1.8135036579012933, + "grad_norm": 1.105072021484375, + "learning_rate": 7.910348577297167e-05, + "loss": 2.7199, + "step": 20017 + }, + { + "epoch": 1.81359425607574, + "grad_norm": 0.9249234795570374, + "learning_rate": 7.909744457198092e-05, + "loss": 2.8716, + "step": 20018 + }, + { + "epoch": 1.8136848542501869, + "grad_norm": 0.8695465326309204, + "learning_rate": 7.909140337099015e-05, + "loss": 2.5902, + "step": 20019 + }, + { + "epoch": 1.8137754524246337, + "grad_norm": 0.928646445274353, + "learning_rate": 7.90853621699994e-05, + "loss": 2.7423, + "step": 20020 + }, + { + "epoch": 1.8138660505990805, + "grad_norm": 0.8403438329696655, + "learning_rate": 7.907932096900865e-05, + "loss": 2.4296, + "step": 20021 + }, + { + "epoch": 1.8139566487735272, + "grad_norm": 0.9025550484657288, + "learning_rate": 7.907327976801788e-05, + "loss": 2.6992, + "step": 20022 + }, + { + "epoch": 1.814047246947974, + "grad_norm": 0.9443486332893372, + "learning_rate": 7.906723856702713e-05, + "loss": 2.6355, + "step": 20023 + }, + { + "epoch": 1.8141378451224208, + "grad_norm": 0.8744177222251892, + "learning_rate": 7.906119736603638e-05, + "loss": 2.7234, + "step": 20024 + }, + { + "epoch": 1.8142284432968676, + "grad_norm": 0.9098483920097351, + "learning_rate": 7.905515616504562e-05, + "loss": 2.7516, + "step": 20025 + }, + { + "epoch": 1.8143190414713144, + "grad_norm": 0.8778315186500549, + "learning_rate": 7.904911496405486e-05, + "loss": 2.8331, + "step": 20026 + }, + { + "epoch": 1.8144096396457612, + "grad_norm": 0.9178867936134338, + "learning_rate": 7.90430737630641e-05, + "loss": 2.4643, + "step": 20027 + }, + { + "epoch": 1.814500237820208, + "grad_norm": 0.8978434205055237, + "learning_rate": 7.903703256207334e-05, + "loss": 2.5261, + "step": 20028 + }, + { + "epoch": 1.8145908359946548, + "grad_norm": 0.9819051027297974, + "learning_rate": 7.903099136108259e-05, + "loss": 2.6869, + "step": 20029 + }, + { + "epoch": 1.8146814341691013, + "grad_norm": 0.8084960579872131, + "learning_rate": 7.902495016009182e-05, + "loss": 2.2984, + "step": 20030 + }, + { + "epoch": 1.8147720323435483, + "grad_norm": 0.9205712080001831, + "learning_rate": 7.901890895910107e-05, + "loss": 2.4931, + "step": 20031 + }, + { + "epoch": 1.814862630517995, + "grad_norm": 0.8987858891487122, + "learning_rate": 7.901286775811032e-05, + "loss": 2.5459, + "step": 20032 + }, + { + "epoch": 1.814953228692442, + "grad_norm": 0.9179820418357849, + "learning_rate": 7.900682655711956e-05, + "loss": 2.9278, + "step": 20033 + }, + { + "epoch": 1.8150438268668885, + "grad_norm": 0.9760302901268005, + "learning_rate": 7.90007853561288e-05, + "loss": 2.8812, + "step": 20034 + }, + { + "epoch": 1.8151344250413355, + "grad_norm": 0.8518596887588501, + "learning_rate": 7.899474415513805e-05, + "loss": 2.5951, + "step": 20035 + }, + { + "epoch": 1.815225023215782, + "grad_norm": 0.7943060398101807, + "learning_rate": 7.898870295414728e-05, + "loss": 2.2685, + "step": 20036 + }, + { + "epoch": 1.815315621390229, + "grad_norm": 0.9699404835700989, + "learning_rate": 7.898266175315653e-05, + "loss": 2.7983, + "step": 20037 + }, + { + "epoch": 1.8154062195646756, + "grad_norm": 0.887169361114502, + "learning_rate": 7.897662055216578e-05, + "loss": 2.6559, + "step": 20038 + }, + { + "epoch": 1.8154968177391226, + "grad_norm": 0.929145097732544, + "learning_rate": 7.897057935117502e-05, + "loss": 2.9962, + "step": 20039 + }, + { + "epoch": 1.8155874159135692, + "grad_norm": 1.0068098306655884, + "learning_rate": 7.896453815018426e-05, + "loss": 2.8315, + "step": 20040 + }, + { + "epoch": 1.8156780140880162, + "grad_norm": 0.8741114735603333, + "learning_rate": 7.89584969491935e-05, + "loss": 2.75, + "step": 20041 + }, + { + "epoch": 1.8157686122624628, + "grad_norm": 0.864861249923706, + "learning_rate": 7.895245574820275e-05, + "loss": 2.8128, + "step": 20042 + }, + { + "epoch": 1.8158592104369098, + "grad_norm": 0.9590873122215271, + "learning_rate": 7.894641454721199e-05, + "loss": 2.7525, + "step": 20043 + }, + { + "epoch": 1.8159498086113564, + "grad_norm": 1.0299104452133179, + "learning_rate": 7.894037334622123e-05, + "loss": 2.5677, + "step": 20044 + }, + { + "epoch": 1.8160404067858034, + "grad_norm": 0.9044857621192932, + "learning_rate": 7.893433214523047e-05, + "loss": 2.7576, + "step": 20045 + }, + { + "epoch": 1.81613100496025, + "grad_norm": 0.8822203278541565, + "learning_rate": 7.892829094423972e-05, + "loss": 2.6708, + "step": 20046 + }, + { + "epoch": 1.816221603134697, + "grad_norm": 0.9032922387123108, + "learning_rate": 7.892224974324896e-05, + "loss": 2.6999, + "step": 20047 + }, + { + "epoch": 1.8163122013091435, + "grad_norm": 1.0070592164993286, + "learning_rate": 7.891620854225821e-05, + "loss": 2.8252, + "step": 20048 + }, + { + "epoch": 1.8164027994835905, + "grad_norm": 0.7922534942626953, + "learning_rate": 7.891016734126745e-05, + "loss": 2.1156, + "step": 20049 + }, + { + "epoch": 1.816493397658037, + "grad_norm": 0.8735122680664062, + "learning_rate": 7.890412614027669e-05, + "loss": 2.6901, + "step": 20050 + }, + { + "epoch": 1.816583995832484, + "grad_norm": 0.93192058801651, + "learning_rate": 7.889808493928593e-05, + "loss": 2.7394, + "step": 20051 + }, + { + "epoch": 1.8166745940069307, + "grad_norm": 0.9717953205108643, + "learning_rate": 7.889204373829517e-05, + "loss": 2.8479, + "step": 20052 + }, + { + "epoch": 1.8167651921813777, + "grad_norm": 0.6670577526092529, + "learning_rate": 7.888600253730442e-05, + "loss": 1.4501, + "step": 20053 + }, + { + "epoch": 1.8168557903558242, + "grad_norm": 0.8552561402320862, + "learning_rate": 7.887996133631367e-05, + "loss": 2.6178, + "step": 20054 + }, + { + "epoch": 1.8169463885302712, + "grad_norm": 0.8948311805725098, + "learning_rate": 7.88739201353229e-05, + "loss": 2.7616, + "step": 20055 + }, + { + "epoch": 1.8170369867047178, + "grad_norm": 0.9302178025245667, + "learning_rate": 7.886787893433215e-05, + "loss": 2.5703, + "step": 20056 + }, + { + "epoch": 1.8171275848791648, + "grad_norm": 0.8187023997306824, + "learning_rate": 7.88618377333414e-05, + "loss": 2.5415, + "step": 20057 + }, + { + "epoch": 1.8172181830536114, + "grad_norm": 0.9148251414299011, + "learning_rate": 7.885579653235063e-05, + "loss": 2.6765, + "step": 20058 + }, + { + "epoch": 1.8173087812280584, + "grad_norm": 0.8891935348510742, + "learning_rate": 7.884975533135988e-05, + "loss": 2.4334, + "step": 20059 + }, + { + "epoch": 1.817399379402505, + "grad_norm": 0.9435514807701111, + "learning_rate": 7.884371413036911e-05, + "loss": 2.8272, + "step": 20060 + }, + { + "epoch": 1.817489977576952, + "grad_norm": 0.8905547857284546, + "learning_rate": 7.883767292937836e-05, + "loss": 2.5111, + "step": 20061 + }, + { + "epoch": 1.8175805757513985, + "grad_norm": 0.9618327021598816, + "learning_rate": 7.883163172838761e-05, + "loss": 2.5276, + "step": 20062 + }, + { + "epoch": 1.8176711739258455, + "grad_norm": 0.8977437615394592, + "learning_rate": 7.882559052739686e-05, + "loss": 2.4407, + "step": 20063 + }, + { + "epoch": 1.817761772100292, + "grad_norm": 0.8745532631874084, + "learning_rate": 7.881954932640609e-05, + "loss": 2.6008, + "step": 20064 + }, + { + "epoch": 1.8178523702747391, + "grad_norm": 0.7682298421859741, + "learning_rate": 7.881350812541534e-05, + "loss": 1.9598, + "step": 20065 + }, + { + "epoch": 1.8179429684491857, + "grad_norm": 0.8641358613967896, + "learning_rate": 7.880746692442457e-05, + "loss": 2.5774, + "step": 20066 + }, + { + "epoch": 1.8180335666236327, + "grad_norm": 0.9155133962631226, + "learning_rate": 7.880142572343382e-05, + "loss": 2.6686, + "step": 20067 + }, + { + "epoch": 1.8181241647980793, + "grad_norm": 0.7864473462104797, + "learning_rate": 7.879538452244305e-05, + "loss": 2.1334, + "step": 20068 + }, + { + "epoch": 1.8182147629725263, + "grad_norm": 0.8209555149078369, + "learning_rate": 7.878934332145232e-05, + "loss": 2.6068, + "step": 20069 + }, + { + "epoch": 1.8183053611469728, + "grad_norm": 0.9273419976234436, + "learning_rate": 7.878330212046155e-05, + "loss": 2.7511, + "step": 20070 + }, + { + "epoch": 1.8183959593214196, + "grad_norm": 0.8901598453521729, + "learning_rate": 7.87772609194708e-05, + "loss": 2.75, + "step": 20071 + }, + { + "epoch": 1.8184865574958664, + "grad_norm": 0.7836031913757324, + "learning_rate": 7.877121971848003e-05, + "loss": 2.0975, + "step": 20072 + }, + { + "epoch": 1.8185771556703132, + "grad_norm": 0.8508947491645813, + "learning_rate": 7.876517851748928e-05, + "loss": 2.6841, + "step": 20073 + }, + { + "epoch": 1.81866775384476, + "grad_norm": 0.8940457105636597, + "learning_rate": 7.875913731649853e-05, + "loss": 2.6315, + "step": 20074 + }, + { + "epoch": 1.8187583520192068, + "grad_norm": 0.8268910050392151, + "learning_rate": 7.875309611550776e-05, + "loss": 1.9869, + "step": 20075 + }, + { + "epoch": 1.8188489501936536, + "grad_norm": 0.8802278637886047, + "learning_rate": 7.874705491451701e-05, + "loss": 2.4596, + "step": 20076 + }, + { + "epoch": 1.8189395483681003, + "grad_norm": 0.870719850063324, + "learning_rate": 7.874101371352626e-05, + "loss": 2.6173, + "step": 20077 + }, + { + "epoch": 1.8190301465425471, + "grad_norm": 0.8853182196617126, + "learning_rate": 7.87349725125355e-05, + "loss": 2.7231, + "step": 20078 + }, + { + "epoch": 1.819120744716994, + "grad_norm": 0.8977349400520325, + "learning_rate": 7.872893131154474e-05, + "loss": 2.5282, + "step": 20079 + }, + { + "epoch": 1.8192113428914407, + "grad_norm": 0.938753604888916, + "learning_rate": 7.872289011055399e-05, + "loss": 2.948, + "step": 20080 + }, + { + "epoch": 1.8193019410658875, + "grad_norm": 0.9599813222885132, + "learning_rate": 7.871684890956322e-05, + "loss": 2.5919, + "step": 20081 + }, + { + "epoch": 1.8193925392403343, + "grad_norm": 0.9378184080123901, + "learning_rate": 7.871080770857247e-05, + "loss": 2.6435, + "step": 20082 + }, + { + "epoch": 1.819483137414781, + "grad_norm": 0.6235406994819641, + "learning_rate": 7.87047665075817e-05, + "loss": 1.2108, + "step": 20083 + }, + { + "epoch": 1.8195737355892279, + "grad_norm": 0.8928853869438171, + "learning_rate": 7.869872530659096e-05, + "loss": 2.4932, + "step": 20084 + }, + { + "epoch": 1.8196643337636746, + "grad_norm": 0.8225389719009399, + "learning_rate": 7.86926841056002e-05, + "loss": 2.7346, + "step": 20085 + }, + { + "epoch": 1.8197549319381214, + "grad_norm": 0.9359320998191833, + "learning_rate": 7.868664290460944e-05, + "loss": 2.6276, + "step": 20086 + }, + { + "epoch": 1.8198455301125682, + "grad_norm": 0.8760091066360474, + "learning_rate": 7.868060170361868e-05, + "loss": 2.6427, + "step": 20087 + }, + { + "epoch": 1.819936128287015, + "grad_norm": 0.8630682229995728, + "learning_rate": 7.867456050262793e-05, + "loss": 1.8637, + "step": 20088 + }, + { + "epoch": 1.8200267264614618, + "grad_norm": 0.8865365982055664, + "learning_rate": 7.866851930163717e-05, + "loss": 2.5367, + "step": 20089 + }, + { + "epoch": 1.8201173246359086, + "grad_norm": 0.8611730933189392, + "learning_rate": 7.866247810064641e-05, + "loss": 2.7651, + "step": 20090 + }, + { + "epoch": 1.8202079228103554, + "grad_norm": 0.8238025307655334, + "learning_rate": 7.865643689965565e-05, + "loss": 1.9665, + "step": 20091 + }, + { + "epoch": 1.8202985209848022, + "grad_norm": 0.9155856966972351, + "learning_rate": 7.86503956986649e-05, + "loss": 2.5625, + "step": 20092 + }, + { + "epoch": 1.820389119159249, + "grad_norm": 0.776448130607605, + "learning_rate": 7.864435449767415e-05, + "loss": 2.5633, + "step": 20093 + }, + { + "epoch": 1.8204797173336957, + "grad_norm": 0.9805244207382202, + "learning_rate": 7.863831329668338e-05, + "loss": 2.7018, + "step": 20094 + }, + { + "epoch": 1.8205703155081425, + "grad_norm": 1.0712523460388184, + "learning_rate": 7.863227209569263e-05, + "loss": 2.8923, + "step": 20095 + }, + { + "epoch": 1.8206609136825893, + "grad_norm": 0.7516217231750488, + "learning_rate": 7.862623089470187e-05, + "loss": 1.9463, + "step": 20096 + }, + { + "epoch": 1.820751511857036, + "grad_norm": 0.9226735830307007, + "learning_rate": 7.862018969371111e-05, + "loss": 2.5201, + "step": 20097 + }, + { + "epoch": 1.8208421100314829, + "grad_norm": 0.8627856373786926, + "learning_rate": 7.861414849272035e-05, + "loss": 2.5504, + "step": 20098 + }, + { + "epoch": 1.8209327082059297, + "grad_norm": 0.9020451903343201, + "learning_rate": 7.860810729172961e-05, + "loss": 2.6588, + "step": 20099 + }, + { + "epoch": 1.8210233063803765, + "grad_norm": 0.9053206443786621, + "learning_rate": 7.860206609073884e-05, + "loss": 2.5955, + "step": 20100 + }, + { + "epoch": 1.8211139045548232, + "grad_norm": 0.9657183885574341, + "learning_rate": 7.859602488974809e-05, + "loss": 2.4633, + "step": 20101 + }, + { + "epoch": 1.82120450272927, + "grad_norm": 0.8823796510696411, + "learning_rate": 7.858998368875732e-05, + "loss": 2.7489, + "step": 20102 + }, + { + "epoch": 1.8212951009037168, + "grad_norm": 0.9432483911514282, + "learning_rate": 7.858394248776657e-05, + "loss": 2.5243, + "step": 20103 + }, + { + "epoch": 1.8213856990781636, + "grad_norm": 0.8579074740409851, + "learning_rate": 7.85779012867758e-05, + "loss": 2.6038, + "step": 20104 + }, + { + "epoch": 1.8214762972526104, + "grad_norm": 1.0050530433654785, + "learning_rate": 7.857186008578505e-05, + "loss": 2.8621, + "step": 20105 + }, + { + "epoch": 1.8215668954270572, + "grad_norm": 0.8866049647331238, + "learning_rate": 7.85658188847943e-05, + "loss": 2.7463, + "step": 20106 + }, + { + "epoch": 1.821657493601504, + "grad_norm": 0.8575589656829834, + "learning_rate": 7.855977768380355e-05, + "loss": 2.5546, + "step": 20107 + }, + { + "epoch": 1.8217480917759508, + "grad_norm": 0.7722617387771606, + "learning_rate": 7.85537364828128e-05, + "loss": 1.9567, + "step": 20108 + }, + { + "epoch": 1.8218386899503975, + "grad_norm": 0.9150044322013855, + "learning_rate": 7.854769528182203e-05, + "loss": 2.8224, + "step": 20109 + }, + { + "epoch": 1.8219292881248443, + "grad_norm": 0.9137762784957886, + "learning_rate": 7.854165408083128e-05, + "loss": 2.4925, + "step": 20110 + }, + { + "epoch": 1.822019886299291, + "grad_norm": 0.9322357773780823, + "learning_rate": 7.853561287984051e-05, + "loss": 2.5656, + "step": 20111 + }, + { + "epoch": 1.822110484473738, + "grad_norm": 0.8695874810218811, + "learning_rate": 7.852957167884976e-05, + "loss": 2.7776, + "step": 20112 + }, + { + "epoch": 1.8222010826481845, + "grad_norm": 0.9795105457305908, + "learning_rate": 7.8523530477859e-05, + "loss": 2.9589, + "step": 20113 + }, + { + "epoch": 1.8222916808226315, + "grad_norm": 0.9140171408653259, + "learning_rate": 7.851748927686825e-05, + "loss": 2.7186, + "step": 20114 + }, + { + "epoch": 1.822382278997078, + "grad_norm": 0.9382168054580688, + "learning_rate": 7.851144807587749e-05, + "loss": 2.6224, + "step": 20115 + }, + { + "epoch": 1.822472877171525, + "grad_norm": 0.9222667217254639, + "learning_rate": 7.850540687488674e-05, + "loss": 2.6973, + "step": 20116 + }, + { + "epoch": 1.8225634753459716, + "grad_norm": 0.8401442766189575, + "learning_rate": 7.849936567389597e-05, + "loss": 2.4067, + "step": 20117 + }, + { + "epoch": 1.8226540735204186, + "grad_norm": 0.8700619339942932, + "learning_rate": 7.849332447290522e-05, + "loss": 2.6376, + "step": 20118 + }, + { + "epoch": 1.8227446716948652, + "grad_norm": 0.9084345698356628, + "learning_rate": 7.848728327191445e-05, + "loss": 2.7336, + "step": 20119 + }, + { + "epoch": 1.8228352698693122, + "grad_norm": 0.9078928232192993, + "learning_rate": 7.84812420709237e-05, + "loss": 2.8334, + "step": 20120 + }, + { + "epoch": 1.8229258680437588, + "grad_norm": 1.068235993385315, + "learning_rate": 7.847520086993295e-05, + "loss": 2.4969, + "step": 20121 + }, + { + "epoch": 1.8230164662182058, + "grad_norm": 0.9526100158691406, + "learning_rate": 7.84691596689422e-05, + "loss": 2.6815, + "step": 20122 + }, + { + "epoch": 1.8231070643926524, + "grad_norm": 0.7634152770042419, + "learning_rate": 7.846311846795143e-05, + "loss": 2.1162, + "step": 20123 + }, + { + "epoch": 1.8231976625670994, + "grad_norm": 0.8756105303764343, + "learning_rate": 7.845707726696068e-05, + "loss": 2.7062, + "step": 20124 + }, + { + "epoch": 1.823288260741546, + "grad_norm": 0.8808054327964783, + "learning_rate": 7.845103606596992e-05, + "loss": 2.8013, + "step": 20125 + }, + { + "epoch": 1.823378858915993, + "grad_norm": 0.8328356146812439, + "learning_rate": 7.844499486497916e-05, + "loss": 2.6301, + "step": 20126 + }, + { + "epoch": 1.8234694570904395, + "grad_norm": 0.8452497720718384, + "learning_rate": 7.84389536639884e-05, + "loss": 2.798, + "step": 20127 + }, + { + "epoch": 1.8235600552648865, + "grad_norm": 0.8522842526435852, + "learning_rate": 7.843291246299764e-05, + "loss": 2.6289, + "step": 20128 + }, + { + "epoch": 1.823650653439333, + "grad_norm": 0.8685544729232788, + "learning_rate": 7.84268712620069e-05, + "loss": 2.9216, + "step": 20129 + }, + { + "epoch": 1.82374125161378, + "grad_norm": 0.912041425704956, + "learning_rate": 7.842083006101614e-05, + "loss": 2.6388, + "step": 20130 + }, + { + "epoch": 1.8238318497882267, + "grad_norm": 0.7714132070541382, + "learning_rate": 7.841478886002538e-05, + "loss": 1.1822, + "step": 20131 + }, + { + "epoch": 1.8239224479626737, + "grad_norm": 0.8927156925201416, + "learning_rate": 7.840874765903462e-05, + "loss": 2.9976, + "step": 20132 + }, + { + "epoch": 1.8240130461371202, + "grad_norm": 0.8943359851837158, + "learning_rate": 7.840270645804386e-05, + "loss": 2.6385, + "step": 20133 + }, + { + "epoch": 1.8241036443115672, + "grad_norm": 0.7792040109634399, + "learning_rate": 7.83966652570531e-05, + "loss": 1.9249, + "step": 20134 + }, + { + "epoch": 1.8241942424860138, + "grad_norm": 0.8684461116790771, + "learning_rate": 7.839062405606235e-05, + "loss": 2.7029, + "step": 20135 + }, + { + "epoch": 1.8242848406604608, + "grad_norm": 0.8543504476547241, + "learning_rate": 7.838458285507158e-05, + "loss": 2.3809, + "step": 20136 + }, + { + "epoch": 1.8243754388349074, + "grad_norm": 0.9204977750778198, + "learning_rate": 7.837854165408084e-05, + "loss": 2.8469, + "step": 20137 + }, + { + "epoch": 1.8244660370093544, + "grad_norm": 0.9171838760375977, + "learning_rate": 7.837250045309008e-05, + "loss": 2.5074, + "step": 20138 + }, + { + "epoch": 1.824556635183801, + "grad_norm": 0.9293959736824036, + "learning_rate": 7.836645925209932e-05, + "loss": 3.2578, + "step": 20139 + }, + { + "epoch": 1.824647233358248, + "grad_norm": 0.894504189491272, + "learning_rate": 7.836041805110856e-05, + "loss": 2.8115, + "step": 20140 + }, + { + "epoch": 1.8247378315326945, + "grad_norm": 0.9102955460548401, + "learning_rate": 7.83543768501178e-05, + "loss": 2.9474, + "step": 20141 + }, + { + "epoch": 1.8248284297071415, + "grad_norm": 0.9791390895843506, + "learning_rate": 7.834833564912705e-05, + "loss": 2.5388, + "step": 20142 + }, + { + "epoch": 1.824919027881588, + "grad_norm": 0.9340082406997681, + "learning_rate": 7.834229444813629e-05, + "loss": 2.7688, + "step": 20143 + }, + { + "epoch": 1.8250096260560351, + "grad_norm": 0.8401660323143005, + "learning_rate": 7.833625324714555e-05, + "loss": 2.5175, + "step": 20144 + }, + { + "epoch": 1.8251002242304817, + "grad_norm": 0.8726222515106201, + "learning_rate": 7.833021204615478e-05, + "loss": 2.4509, + "step": 20145 + }, + { + "epoch": 1.8251908224049287, + "grad_norm": 0.8760557174682617, + "learning_rate": 7.832417084516403e-05, + "loss": 2.5338, + "step": 20146 + }, + { + "epoch": 1.8252814205793753, + "grad_norm": 0.9225988984107971, + "learning_rate": 7.831812964417326e-05, + "loss": 2.8266, + "step": 20147 + }, + { + "epoch": 1.8253720187538223, + "grad_norm": 0.9097449779510498, + "learning_rate": 7.831208844318251e-05, + "loss": 2.7732, + "step": 20148 + }, + { + "epoch": 1.8254626169282688, + "grad_norm": 0.9519768953323364, + "learning_rate": 7.830604724219175e-05, + "loss": 2.636, + "step": 20149 + }, + { + "epoch": 1.8255532151027158, + "grad_norm": 0.8878796100616455, + "learning_rate": 7.830000604120099e-05, + "loss": 2.7807, + "step": 20150 + }, + { + "epoch": 1.8256438132771624, + "grad_norm": 0.8907794952392578, + "learning_rate": 7.829396484021023e-05, + "loss": 2.8576, + "step": 20151 + }, + { + "epoch": 1.8257344114516092, + "grad_norm": 0.7784291505813599, + "learning_rate": 7.828792363921949e-05, + "loss": 2.1329, + "step": 20152 + }, + { + "epoch": 1.825825009626056, + "grad_norm": 0.9302776455879211, + "learning_rate": 7.828188243822872e-05, + "loss": 2.768, + "step": 20153 + }, + { + "epoch": 1.8259156078005028, + "grad_norm": 0.9294601082801819, + "learning_rate": 7.827584123723797e-05, + "loss": 2.5666, + "step": 20154 + }, + { + "epoch": 1.8260062059749496, + "grad_norm": 0.9521052837371826, + "learning_rate": 7.82698000362472e-05, + "loss": 2.3685, + "step": 20155 + }, + { + "epoch": 1.8260968041493963, + "grad_norm": 0.9132661819458008, + "learning_rate": 7.826375883525645e-05, + "loss": 2.893, + "step": 20156 + }, + { + "epoch": 1.8261874023238431, + "grad_norm": 0.9353191256523132, + "learning_rate": 7.82577176342657e-05, + "loss": 2.8304, + "step": 20157 + }, + { + "epoch": 1.82627800049829, + "grad_norm": 0.9051839113235474, + "learning_rate": 7.825167643327493e-05, + "loss": 2.9507, + "step": 20158 + }, + { + "epoch": 1.8263685986727367, + "grad_norm": 1.0099538564682007, + "learning_rate": 7.824563523228418e-05, + "loss": 2.7581, + "step": 20159 + }, + { + "epoch": 1.8264591968471835, + "grad_norm": 0.8761767148971558, + "learning_rate": 7.823959403129343e-05, + "loss": 2.5167, + "step": 20160 + }, + { + "epoch": 1.8265497950216303, + "grad_norm": 0.9841244220733643, + "learning_rate": 7.823355283030268e-05, + "loss": 3.1895, + "step": 20161 + }, + { + "epoch": 1.826640393196077, + "grad_norm": 0.8935104608535767, + "learning_rate": 7.822751162931191e-05, + "loss": 2.7153, + "step": 20162 + }, + { + "epoch": 1.8267309913705239, + "grad_norm": 0.8779062032699585, + "learning_rate": 7.822147042832116e-05, + "loss": 2.6755, + "step": 20163 + }, + { + "epoch": 1.8268215895449706, + "grad_norm": 1.042120099067688, + "learning_rate": 7.821542922733039e-05, + "loss": 2.8873, + "step": 20164 + }, + { + "epoch": 1.8269121877194174, + "grad_norm": 0.9356328248977661, + "learning_rate": 7.820938802633964e-05, + "loss": 2.701, + "step": 20165 + }, + { + "epoch": 1.8270027858938642, + "grad_norm": 0.9682297110557556, + "learning_rate": 7.820334682534887e-05, + "loss": 3.1971, + "step": 20166 + }, + { + "epoch": 1.827093384068311, + "grad_norm": 0.9470559358596802, + "learning_rate": 7.819730562435813e-05, + "loss": 2.8157, + "step": 20167 + }, + { + "epoch": 1.8271839822427578, + "grad_norm": 0.8403677344322205, + "learning_rate": 7.819126442336737e-05, + "loss": 2.5929, + "step": 20168 + }, + { + "epoch": 1.8272745804172046, + "grad_norm": 0.9183537364006042, + "learning_rate": 7.818522322237662e-05, + "loss": 2.6072, + "step": 20169 + }, + { + "epoch": 1.8273651785916514, + "grad_norm": 0.9016404747962952, + "learning_rate": 7.817918202138585e-05, + "loss": 2.8538, + "step": 20170 + }, + { + "epoch": 1.8274557767660982, + "grad_norm": 0.9354453682899475, + "learning_rate": 7.81731408203951e-05, + "loss": 2.4859, + "step": 20171 + }, + { + "epoch": 1.827546374940545, + "grad_norm": 0.9141800403594971, + "learning_rate": 7.816709961940433e-05, + "loss": 2.6377, + "step": 20172 + }, + { + "epoch": 1.8276369731149917, + "grad_norm": 0.870181143283844, + "learning_rate": 7.816105841841358e-05, + "loss": 2.7629, + "step": 20173 + }, + { + "epoch": 1.8277275712894385, + "grad_norm": 0.876093327999115, + "learning_rate": 7.815501721742283e-05, + "loss": 2.7183, + "step": 20174 + }, + { + "epoch": 1.8278181694638853, + "grad_norm": 0.8546983599662781, + "learning_rate": 7.814897601643207e-05, + "loss": 2.5295, + "step": 20175 + }, + { + "epoch": 1.827908767638332, + "grad_norm": 0.785535454750061, + "learning_rate": 7.814293481544132e-05, + "loss": 2.3235, + "step": 20176 + }, + { + "epoch": 1.8279993658127789, + "grad_norm": 0.852470338344574, + "learning_rate": 7.813689361445056e-05, + "loss": 2.5448, + "step": 20177 + }, + { + "epoch": 1.8280899639872257, + "grad_norm": 0.8646169900894165, + "learning_rate": 7.81308524134598e-05, + "loss": 2.6869, + "step": 20178 + }, + { + "epoch": 1.8281805621616725, + "grad_norm": 0.8475267291069031, + "learning_rate": 7.812481121246904e-05, + "loss": 2.4923, + "step": 20179 + }, + { + "epoch": 1.8282711603361192, + "grad_norm": 0.927301287651062, + "learning_rate": 7.811877001147829e-05, + "loss": 2.4646, + "step": 20180 + }, + { + "epoch": 1.828361758510566, + "grad_norm": 0.892938494682312, + "learning_rate": 7.811272881048752e-05, + "loss": 2.6885, + "step": 20181 + }, + { + "epoch": 1.8284523566850128, + "grad_norm": 0.9812962412834167, + "learning_rate": 7.810668760949678e-05, + "loss": 2.782, + "step": 20182 + }, + { + "epoch": 1.8285429548594596, + "grad_norm": 0.8122937083244324, + "learning_rate": 7.810064640850601e-05, + "loss": 2.5682, + "step": 20183 + }, + { + "epoch": 1.8286335530339064, + "grad_norm": 1.0272977352142334, + "learning_rate": 7.809460520751526e-05, + "loss": 2.5598, + "step": 20184 + }, + { + "epoch": 1.8287241512083532, + "grad_norm": 0.9166468381881714, + "learning_rate": 7.80885640065245e-05, + "loss": 2.8228, + "step": 20185 + }, + { + "epoch": 1.8288147493828, + "grad_norm": 0.970853328704834, + "learning_rate": 7.808252280553374e-05, + "loss": 2.6595, + "step": 20186 + }, + { + "epoch": 1.8289053475572468, + "grad_norm": 0.8532607555389404, + "learning_rate": 7.807648160454298e-05, + "loss": 2.6166, + "step": 20187 + }, + { + "epoch": 1.8289959457316936, + "grad_norm": 0.953767716884613, + "learning_rate": 7.807044040355223e-05, + "loss": 2.929, + "step": 20188 + }, + { + "epoch": 1.8290865439061403, + "grad_norm": 0.8937206864356995, + "learning_rate": 7.806439920256147e-05, + "loss": 2.6191, + "step": 20189 + }, + { + "epoch": 1.8291771420805871, + "grad_norm": 0.927531898021698, + "learning_rate": 7.805835800157072e-05, + "loss": 2.9098, + "step": 20190 + }, + { + "epoch": 1.829267740255034, + "grad_norm": 0.8563929200172424, + "learning_rate": 7.805231680057995e-05, + "loss": 2.7236, + "step": 20191 + }, + { + "epoch": 1.8293583384294805, + "grad_norm": 0.8841241002082825, + "learning_rate": 7.80462755995892e-05, + "loss": 2.6515, + "step": 20192 + }, + { + "epoch": 1.8294489366039275, + "grad_norm": 0.9071182608604431, + "learning_rate": 7.804023439859845e-05, + "loss": 2.653, + "step": 20193 + }, + { + "epoch": 1.829539534778374, + "grad_norm": 0.8934647440910339, + "learning_rate": 7.803419319760768e-05, + "loss": 2.5663, + "step": 20194 + }, + { + "epoch": 1.829630132952821, + "grad_norm": 0.8993472456932068, + "learning_rate": 7.802815199661693e-05, + "loss": 2.7435, + "step": 20195 + }, + { + "epoch": 1.8297207311272676, + "grad_norm": 0.9532661437988281, + "learning_rate": 7.802211079562617e-05, + "loss": 2.5567, + "step": 20196 + }, + { + "epoch": 1.8298113293017146, + "grad_norm": 0.736790657043457, + "learning_rate": 7.801606959463543e-05, + "loss": 2.0894, + "step": 20197 + }, + { + "epoch": 1.8299019274761612, + "grad_norm": 0.9574112892150879, + "learning_rate": 7.801002839364466e-05, + "loss": 2.6933, + "step": 20198 + }, + { + "epoch": 1.8299925256506082, + "grad_norm": 0.9091559648513794, + "learning_rate": 7.800398719265391e-05, + "loss": 2.7839, + "step": 20199 + }, + { + "epoch": 1.8300831238250548, + "grad_norm": 0.9418073892593384, + "learning_rate": 7.799794599166314e-05, + "loss": 2.6901, + "step": 20200 + }, + { + "epoch": 1.8301737219995018, + "grad_norm": 0.942821741104126, + "learning_rate": 7.799190479067239e-05, + "loss": 2.8381, + "step": 20201 + }, + { + "epoch": 1.8302643201739484, + "grad_norm": 0.8721308708190918, + "learning_rate": 7.798586358968162e-05, + "loss": 2.7399, + "step": 20202 + }, + { + "epoch": 1.8303549183483954, + "grad_norm": 0.9573378562927246, + "learning_rate": 7.797982238869087e-05, + "loss": 2.8723, + "step": 20203 + }, + { + "epoch": 1.830445516522842, + "grad_norm": 0.8671541810035706, + "learning_rate": 7.797378118770012e-05, + "loss": 2.7038, + "step": 20204 + }, + { + "epoch": 1.830536114697289, + "grad_norm": 0.7726173996925354, + "learning_rate": 7.796773998670937e-05, + "loss": 1.9031, + "step": 20205 + }, + { + "epoch": 1.8306267128717355, + "grad_norm": 0.9816793203353882, + "learning_rate": 7.79616987857186e-05, + "loss": 2.7697, + "step": 20206 + }, + { + "epoch": 1.8307173110461825, + "grad_norm": 1.0003993511199951, + "learning_rate": 7.795565758472785e-05, + "loss": 2.7503, + "step": 20207 + }, + { + "epoch": 1.830807909220629, + "grad_norm": 0.830001175403595, + "learning_rate": 7.79496163837371e-05, + "loss": 2.4671, + "step": 20208 + }, + { + "epoch": 1.830898507395076, + "grad_norm": 0.9731109738349915, + "learning_rate": 7.794357518274633e-05, + "loss": 2.4967, + "step": 20209 + }, + { + "epoch": 1.8309891055695227, + "grad_norm": 0.9808710217475891, + "learning_rate": 7.793753398175558e-05, + "loss": 2.483, + "step": 20210 + }, + { + "epoch": 1.8310797037439697, + "grad_norm": 0.9735525250434875, + "learning_rate": 7.793149278076481e-05, + "loss": 2.7109, + "step": 20211 + }, + { + "epoch": 1.8311703019184162, + "grad_norm": 0.9315820932388306, + "learning_rate": 7.792545157977407e-05, + "loss": 2.9305, + "step": 20212 + }, + { + "epoch": 1.8312609000928632, + "grad_norm": 1.0268539190292358, + "learning_rate": 7.791941037878331e-05, + "loss": 2.9082, + "step": 20213 + }, + { + "epoch": 1.8313514982673098, + "grad_norm": 0.9435291290283203, + "learning_rate": 7.791336917779255e-05, + "loss": 2.6491, + "step": 20214 + }, + { + "epoch": 1.8314420964417568, + "grad_norm": 0.9181504249572754, + "learning_rate": 7.790732797680179e-05, + "loss": 2.9095, + "step": 20215 + }, + { + "epoch": 1.8315326946162034, + "grad_norm": 0.8534542322158813, + "learning_rate": 7.790128677581104e-05, + "loss": 2.7938, + "step": 20216 + }, + { + "epoch": 1.8316232927906504, + "grad_norm": 0.8935950994491577, + "learning_rate": 7.789524557482027e-05, + "loss": 2.6984, + "step": 20217 + }, + { + "epoch": 1.831713890965097, + "grad_norm": 0.7259282469749451, + "learning_rate": 7.788920437382952e-05, + "loss": 1.6234, + "step": 20218 + }, + { + "epoch": 1.831804489139544, + "grad_norm": 0.925908088684082, + "learning_rate": 7.788316317283877e-05, + "loss": 2.5138, + "step": 20219 + }, + { + "epoch": 1.8318950873139905, + "grad_norm": 0.9129888415336609, + "learning_rate": 7.787712197184801e-05, + "loss": 2.7241, + "step": 20220 + }, + { + "epoch": 1.8319856854884375, + "grad_norm": 0.9029122591018677, + "learning_rate": 7.787108077085725e-05, + "loss": 2.8044, + "step": 20221 + }, + { + "epoch": 1.832076283662884, + "grad_norm": 0.8827618360519409, + "learning_rate": 7.78650395698665e-05, + "loss": 2.7094, + "step": 20222 + }, + { + "epoch": 1.8321668818373311, + "grad_norm": 0.9115000367164612, + "learning_rate": 7.785899836887573e-05, + "loss": 2.7125, + "step": 20223 + }, + { + "epoch": 1.8322574800117777, + "grad_norm": 0.8864482641220093, + "learning_rate": 7.785295716788498e-05, + "loss": 2.7612, + "step": 20224 + }, + { + "epoch": 1.8323480781862247, + "grad_norm": 0.882919430732727, + "learning_rate": 7.784691596689422e-05, + "loss": 2.843, + "step": 20225 + }, + { + "epoch": 1.8324386763606713, + "grad_norm": 0.7718276977539062, + "learning_rate": 7.784087476590346e-05, + "loss": 2.109, + "step": 20226 + }, + { + "epoch": 1.8325292745351183, + "grad_norm": 0.7797431945800781, + "learning_rate": 7.78348335649127e-05, + "loss": 2.383, + "step": 20227 + }, + { + "epoch": 1.8326198727095648, + "grad_norm": 0.9084407687187195, + "learning_rate": 7.782879236392195e-05, + "loss": 2.7009, + "step": 20228 + }, + { + "epoch": 1.8327104708840118, + "grad_norm": 0.9108140468597412, + "learning_rate": 7.78227511629312e-05, + "loss": 2.8932, + "step": 20229 + }, + { + "epoch": 1.8328010690584584, + "grad_norm": 0.7704602479934692, + "learning_rate": 7.781670996194044e-05, + "loss": 1.9888, + "step": 20230 + }, + { + "epoch": 1.8328916672329054, + "grad_norm": 0.8437477946281433, + "learning_rate": 7.781066876094968e-05, + "loss": 2.6622, + "step": 20231 + }, + { + "epoch": 1.832982265407352, + "grad_norm": 0.7815380692481995, + "learning_rate": 7.780462755995892e-05, + "loss": 2.1018, + "step": 20232 + }, + { + "epoch": 1.8330728635817988, + "grad_norm": 0.928504467010498, + "learning_rate": 7.779858635896816e-05, + "loss": 2.6306, + "step": 20233 + }, + { + "epoch": 1.8331634617562456, + "grad_norm": 0.9115841388702393, + "learning_rate": 7.779254515797741e-05, + "loss": 2.6309, + "step": 20234 + }, + { + "epoch": 1.8332540599306923, + "grad_norm": 0.9073794484138489, + "learning_rate": 7.778650395698666e-05, + "loss": 2.9901, + "step": 20235 + }, + { + "epoch": 1.8333446581051391, + "grad_norm": 0.8581316471099854, + "learning_rate": 7.77804627559959e-05, + "loss": 2.755, + "step": 20236 + }, + { + "epoch": 1.833435256279586, + "grad_norm": 0.8885548114776611, + "learning_rate": 7.777442155500514e-05, + "loss": 2.6456, + "step": 20237 + }, + { + "epoch": 1.8335258544540327, + "grad_norm": 0.8793916702270508, + "learning_rate": 7.776838035401438e-05, + "loss": 2.798, + "step": 20238 + }, + { + "epoch": 1.8336164526284795, + "grad_norm": 0.7496606707572937, + "learning_rate": 7.776233915302362e-05, + "loss": 1.935, + "step": 20239 + }, + { + "epoch": 1.8337070508029263, + "grad_norm": 0.9273364543914795, + "learning_rate": 7.775629795203287e-05, + "loss": 2.8609, + "step": 20240 + }, + { + "epoch": 1.833797648977373, + "grad_norm": 0.9004775881767273, + "learning_rate": 7.77502567510421e-05, + "loss": 2.5397, + "step": 20241 + }, + { + "epoch": 1.8338882471518199, + "grad_norm": 0.977106511592865, + "learning_rate": 7.774421555005135e-05, + "loss": 2.875, + "step": 20242 + }, + { + "epoch": 1.8339788453262666, + "grad_norm": 0.8942633867263794, + "learning_rate": 7.77381743490606e-05, + "loss": 2.8369, + "step": 20243 + }, + { + "epoch": 1.8340694435007134, + "grad_norm": 0.9725558757781982, + "learning_rate": 7.773213314806985e-05, + "loss": 2.5628, + "step": 20244 + }, + { + "epoch": 1.8341600416751602, + "grad_norm": 0.959102213382721, + "learning_rate": 7.772609194707908e-05, + "loss": 2.7752, + "step": 20245 + }, + { + "epoch": 1.834250639849607, + "grad_norm": 0.9413577318191528, + "learning_rate": 7.772005074608833e-05, + "loss": 3.0987, + "step": 20246 + }, + { + "epoch": 1.8343412380240538, + "grad_norm": 0.9789476990699768, + "learning_rate": 7.771400954509756e-05, + "loss": 2.7836, + "step": 20247 + }, + { + "epoch": 1.8344318361985006, + "grad_norm": 0.8679420948028564, + "learning_rate": 7.770796834410681e-05, + "loss": 2.4494, + "step": 20248 + }, + { + "epoch": 1.8345224343729474, + "grad_norm": 0.9462974667549133, + "learning_rate": 7.770192714311606e-05, + "loss": 2.5056, + "step": 20249 + }, + { + "epoch": 1.8346130325473942, + "grad_norm": 0.9053322672843933, + "learning_rate": 7.76958859421253e-05, + "loss": 2.6866, + "step": 20250 + }, + { + "epoch": 1.834703630721841, + "grad_norm": 0.8868676424026489, + "learning_rate": 7.768984474113454e-05, + "loss": 2.9139, + "step": 20251 + }, + { + "epoch": 1.8347942288962877, + "grad_norm": 0.8557708859443665, + "learning_rate": 7.768380354014379e-05, + "loss": 2.582, + "step": 20252 + }, + { + "epoch": 1.8348848270707345, + "grad_norm": 0.8864154815673828, + "learning_rate": 7.767776233915302e-05, + "loss": 2.6232, + "step": 20253 + }, + { + "epoch": 1.8349754252451813, + "grad_norm": 0.9356675148010254, + "learning_rate": 7.767172113816227e-05, + "loss": 2.7292, + "step": 20254 + }, + { + "epoch": 1.835066023419628, + "grad_norm": 0.8840067982673645, + "learning_rate": 7.76656799371715e-05, + "loss": 2.471, + "step": 20255 + }, + { + "epoch": 1.8351566215940749, + "grad_norm": 0.9170788526535034, + "learning_rate": 7.765963873618075e-05, + "loss": 2.7145, + "step": 20256 + }, + { + "epoch": 1.8352472197685217, + "grad_norm": 0.9017168879508972, + "learning_rate": 7.765359753519e-05, + "loss": 2.8446, + "step": 20257 + }, + { + "epoch": 1.8353378179429685, + "grad_norm": 0.9960169792175293, + "learning_rate": 7.764755633419925e-05, + "loss": 2.7609, + "step": 20258 + }, + { + "epoch": 1.8354284161174153, + "grad_norm": 0.9518468379974365, + "learning_rate": 7.764151513320848e-05, + "loss": 2.7068, + "step": 20259 + }, + { + "epoch": 1.835519014291862, + "grad_norm": 0.9005335569381714, + "learning_rate": 7.763547393221773e-05, + "loss": 2.6395, + "step": 20260 + }, + { + "epoch": 1.8356096124663088, + "grad_norm": 0.7734286189079285, + "learning_rate": 7.762943273122698e-05, + "loss": 2.1596, + "step": 20261 + }, + { + "epoch": 1.8357002106407556, + "grad_norm": 0.7654691338539124, + "learning_rate": 7.762339153023621e-05, + "loss": 1.9046, + "step": 20262 + }, + { + "epoch": 1.8357908088152024, + "grad_norm": 0.8141324520111084, + "learning_rate": 7.761735032924546e-05, + "loss": 1.9139, + "step": 20263 + }, + { + "epoch": 1.8358814069896492, + "grad_norm": 1.0430039167404175, + "learning_rate": 7.76113091282547e-05, + "loss": 2.8919, + "step": 20264 + }, + { + "epoch": 1.835972005164096, + "grad_norm": 0.962896466255188, + "learning_rate": 7.760526792726395e-05, + "loss": 2.4759, + "step": 20265 + }, + { + "epoch": 1.8360626033385428, + "grad_norm": 0.914374053478241, + "learning_rate": 7.759922672627319e-05, + "loss": 2.7933, + "step": 20266 + }, + { + "epoch": 1.8361532015129896, + "grad_norm": 0.8101989030838013, + "learning_rate": 7.759318552528243e-05, + "loss": 1.9238, + "step": 20267 + }, + { + "epoch": 1.8362437996874363, + "grad_norm": 0.8603246212005615, + "learning_rate": 7.758714432429167e-05, + "loss": 2.6299, + "step": 20268 + }, + { + "epoch": 1.8363343978618831, + "grad_norm": 0.895560085773468, + "learning_rate": 7.758110312330092e-05, + "loss": 2.847, + "step": 20269 + }, + { + "epoch": 1.83642499603633, + "grad_norm": 0.9429414868354797, + "learning_rate": 7.757506192231015e-05, + "loss": 2.9247, + "step": 20270 + }, + { + "epoch": 1.8365155942107767, + "grad_norm": 0.9469118118286133, + "learning_rate": 7.756902072131941e-05, + "loss": 2.5773, + "step": 20271 + }, + { + "epoch": 1.8366061923852235, + "grad_norm": 0.9212376475334167, + "learning_rate": 7.756297952032864e-05, + "loss": 2.9204, + "step": 20272 + }, + { + "epoch": 1.8366967905596703, + "grad_norm": 0.8603307008743286, + "learning_rate": 7.755693831933789e-05, + "loss": 2.716, + "step": 20273 + }, + { + "epoch": 1.836787388734117, + "grad_norm": 0.9123177528381348, + "learning_rate": 7.755089711834713e-05, + "loss": 2.6498, + "step": 20274 + }, + { + "epoch": 1.8368779869085636, + "grad_norm": 0.7584738731384277, + "learning_rate": 7.754485591735637e-05, + "loss": 1.9091, + "step": 20275 + }, + { + "epoch": 1.8369685850830106, + "grad_norm": 0.9158599376678467, + "learning_rate": 7.753881471636562e-05, + "loss": 2.9966, + "step": 20276 + }, + { + "epoch": 1.8370591832574572, + "grad_norm": 1.0016597509384155, + "learning_rate": 7.753277351537486e-05, + "loss": 3.2091, + "step": 20277 + }, + { + "epoch": 1.8371497814319042, + "grad_norm": 0.9148194789886475, + "learning_rate": 7.75267323143841e-05, + "loss": 2.7228, + "step": 20278 + }, + { + "epoch": 1.8372403796063508, + "grad_norm": 0.8986095190048218, + "learning_rate": 7.752069111339335e-05, + "loss": 2.7706, + "step": 20279 + }, + { + "epoch": 1.8373309777807978, + "grad_norm": 0.977319598197937, + "learning_rate": 7.75146499124026e-05, + "loss": 2.6789, + "step": 20280 + }, + { + "epoch": 1.8374215759552444, + "grad_norm": 0.8810192942619324, + "learning_rate": 7.750860871141183e-05, + "loss": 2.6365, + "step": 20281 + }, + { + "epoch": 1.8375121741296914, + "grad_norm": 0.992836594581604, + "learning_rate": 7.750256751042108e-05, + "loss": 2.7355, + "step": 20282 + }, + { + "epoch": 1.837602772304138, + "grad_norm": 0.9166913628578186, + "learning_rate": 7.749652630943031e-05, + "loss": 2.746, + "step": 20283 + }, + { + "epoch": 1.837693370478585, + "grad_norm": 0.9565890431404114, + "learning_rate": 7.749048510843956e-05, + "loss": 2.8651, + "step": 20284 + }, + { + "epoch": 1.8377839686530315, + "grad_norm": 0.9333193302154541, + "learning_rate": 7.74844439074488e-05, + "loss": 2.4235, + "step": 20285 + }, + { + "epoch": 1.8378745668274785, + "grad_norm": 0.8823670744895935, + "learning_rate": 7.747840270645806e-05, + "loss": 2.7216, + "step": 20286 + }, + { + "epoch": 1.837965165001925, + "grad_norm": 0.9014944434165955, + "learning_rate": 7.747236150546729e-05, + "loss": 2.9122, + "step": 20287 + }, + { + "epoch": 1.838055763176372, + "grad_norm": 0.8922752141952515, + "learning_rate": 7.746632030447654e-05, + "loss": 2.6712, + "step": 20288 + }, + { + "epoch": 1.8381463613508187, + "grad_norm": 0.9129081964492798, + "learning_rate": 7.746027910348577e-05, + "loss": 2.6382, + "step": 20289 + }, + { + "epoch": 1.8382369595252657, + "grad_norm": 0.9882758855819702, + "learning_rate": 7.745423790249502e-05, + "loss": 2.5457, + "step": 20290 + }, + { + "epoch": 1.8383275576997122, + "grad_norm": 0.9228031039237976, + "learning_rate": 7.744819670150425e-05, + "loss": 2.8962, + "step": 20291 + }, + { + "epoch": 1.8384181558741592, + "grad_norm": 0.9742048978805542, + "learning_rate": 7.74421555005135e-05, + "loss": 2.6291, + "step": 20292 + }, + { + "epoch": 1.8385087540486058, + "grad_norm": 0.781738817691803, + "learning_rate": 7.743611429952275e-05, + "loss": 2.0991, + "step": 20293 + }, + { + "epoch": 1.8385993522230528, + "grad_norm": 0.929104745388031, + "learning_rate": 7.7430073098532e-05, + "loss": 2.6948, + "step": 20294 + }, + { + "epoch": 1.8386899503974994, + "grad_norm": 0.9285093545913696, + "learning_rate": 7.742403189754124e-05, + "loss": 2.5151, + "step": 20295 + }, + { + "epoch": 1.8387805485719464, + "grad_norm": 0.9646064639091492, + "learning_rate": 7.741799069655048e-05, + "loss": 2.6415, + "step": 20296 + }, + { + "epoch": 1.838871146746393, + "grad_norm": 0.9107156991958618, + "learning_rate": 7.741194949555973e-05, + "loss": 2.7514, + "step": 20297 + }, + { + "epoch": 1.83896174492084, + "grad_norm": 0.9366807341575623, + "learning_rate": 7.740590829456896e-05, + "loss": 2.7921, + "step": 20298 + }, + { + "epoch": 1.8390523430952865, + "grad_norm": 0.8996151089668274, + "learning_rate": 7.739986709357821e-05, + "loss": 2.9008, + "step": 20299 + }, + { + "epoch": 1.8391429412697335, + "grad_norm": 0.912574052810669, + "learning_rate": 7.739382589258744e-05, + "loss": 2.9359, + "step": 20300 + }, + { + "epoch": 1.83923353944418, + "grad_norm": 0.8998012542724609, + "learning_rate": 7.73877846915967e-05, + "loss": 2.7087, + "step": 20301 + }, + { + "epoch": 1.8393241376186271, + "grad_norm": 0.9007163047790527, + "learning_rate": 7.738174349060594e-05, + "loss": 2.5928, + "step": 20302 + }, + { + "epoch": 1.8394147357930737, + "grad_norm": 0.8987355828285217, + "learning_rate": 7.737570228961519e-05, + "loss": 2.777, + "step": 20303 + }, + { + "epoch": 1.8395053339675207, + "grad_norm": 0.9354403614997864, + "learning_rate": 7.736966108862442e-05, + "loss": 2.6484, + "step": 20304 + }, + { + "epoch": 1.8395959321419673, + "grad_norm": 0.9675593972206116, + "learning_rate": 7.736361988763367e-05, + "loss": 2.6092, + "step": 20305 + }, + { + "epoch": 1.8396865303164143, + "grad_norm": 0.9261701107025146, + "learning_rate": 7.73575786866429e-05, + "loss": 2.6219, + "step": 20306 + }, + { + "epoch": 1.8397771284908608, + "grad_norm": 0.9336337447166443, + "learning_rate": 7.735153748565215e-05, + "loss": 2.7891, + "step": 20307 + }, + { + "epoch": 1.8398677266653078, + "grad_norm": 0.8555988073348999, + "learning_rate": 7.73454962846614e-05, + "loss": 2.5689, + "step": 20308 + }, + { + "epoch": 1.8399583248397544, + "grad_norm": 0.8788047432899475, + "learning_rate": 7.733945508367064e-05, + "loss": 2.716, + "step": 20309 + }, + { + "epoch": 1.8400489230142014, + "grad_norm": 0.9080169200897217, + "learning_rate": 7.733341388267988e-05, + "loss": 2.808, + "step": 20310 + }, + { + "epoch": 1.840139521188648, + "grad_norm": 0.9271611571311951, + "learning_rate": 7.732737268168913e-05, + "loss": 2.8533, + "step": 20311 + }, + { + "epoch": 1.840230119363095, + "grad_norm": 0.8891882300376892, + "learning_rate": 7.732133148069837e-05, + "loss": 2.7025, + "step": 20312 + }, + { + "epoch": 1.8403207175375416, + "grad_norm": 0.8583112359046936, + "learning_rate": 7.731529027970761e-05, + "loss": 2.7815, + "step": 20313 + }, + { + "epoch": 1.8404113157119883, + "grad_norm": 0.9053146243095398, + "learning_rate": 7.730924907871685e-05, + "loss": 2.7372, + "step": 20314 + }, + { + "epoch": 1.8405019138864351, + "grad_norm": 0.7611249685287476, + "learning_rate": 7.730320787772609e-05, + "loss": 2.0365, + "step": 20315 + }, + { + "epoch": 1.840592512060882, + "grad_norm": 0.8137850165367126, + "learning_rate": 7.729716667673535e-05, + "loss": 2.6265, + "step": 20316 + }, + { + "epoch": 1.8406831102353287, + "grad_norm": 0.9071595668792725, + "learning_rate": 7.729112547574458e-05, + "loss": 2.5, + "step": 20317 + }, + { + "epoch": 1.8407737084097755, + "grad_norm": 0.899834394454956, + "learning_rate": 7.728508427475383e-05, + "loss": 2.8375, + "step": 20318 + }, + { + "epoch": 1.8408643065842223, + "grad_norm": 0.8955226540565491, + "learning_rate": 7.727904307376307e-05, + "loss": 2.5496, + "step": 20319 + }, + { + "epoch": 1.840954904758669, + "grad_norm": 0.9553337693214417, + "learning_rate": 7.727300187277231e-05, + "loss": 2.616, + "step": 20320 + }, + { + "epoch": 1.8410455029331159, + "grad_norm": 0.861609697341919, + "learning_rate": 7.726696067178155e-05, + "loss": 2.7649, + "step": 20321 + }, + { + "epoch": 1.8411361011075627, + "grad_norm": 0.7666820287704468, + "learning_rate": 7.72609194707908e-05, + "loss": 2.0479, + "step": 20322 + }, + { + "epoch": 1.8412266992820094, + "grad_norm": 0.9160669445991516, + "learning_rate": 7.725487826980003e-05, + "loss": 2.9339, + "step": 20323 + }, + { + "epoch": 1.8413172974564562, + "grad_norm": 0.8709301352500916, + "learning_rate": 7.724883706880929e-05, + "loss": 2.9238, + "step": 20324 + }, + { + "epoch": 1.841407895630903, + "grad_norm": 0.9285378456115723, + "learning_rate": 7.724279586781852e-05, + "loss": 2.7901, + "step": 20325 + }, + { + "epoch": 1.8414984938053498, + "grad_norm": 0.819087564945221, + "learning_rate": 7.723675466682777e-05, + "loss": 2.1624, + "step": 20326 + }, + { + "epoch": 1.8415890919797966, + "grad_norm": 0.9151854515075684, + "learning_rate": 7.7230713465837e-05, + "loss": 2.6296, + "step": 20327 + }, + { + "epoch": 1.8416796901542434, + "grad_norm": 0.9092082977294922, + "learning_rate": 7.722467226484625e-05, + "loss": 2.9765, + "step": 20328 + }, + { + "epoch": 1.8417702883286902, + "grad_norm": 0.8731589913368225, + "learning_rate": 7.72186310638555e-05, + "loss": 2.7426, + "step": 20329 + }, + { + "epoch": 1.841860886503137, + "grad_norm": 0.745587944984436, + "learning_rate": 7.721258986286473e-05, + "loss": 2.0932, + "step": 20330 + }, + { + "epoch": 1.8419514846775837, + "grad_norm": 0.9719732403755188, + "learning_rate": 7.7206548661874e-05, + "loss": 2.6045, + "step": 20331 + }, + { + "epoch": 1.8420420828520305, + "grad_norm": 0.9045608639717102, + "learning_rate": 7.720050746088323e-05, + "loss": 2.8702, + "step": 20332 + }, + { + "epoch": 1.8421326810264773, + "grad_norm": 0.9492810368537903, + "learning_rate": 7.719446625989248e-05, + "loss": 2.5596, + "step": 20333 + }, + { + "epoch": 1.842223279200924, + "grad_norm": 0.9121752381324768, + "learning_rate": 7.718842505890171e-05, + "loss": 2.6112, + "step": 20334 + }, + { + "epoch": 1.842313877375371, + "grad_norm": 0.8778383731842041, + "learning_rate": 7.718238385791096e-05, + "loss": 2.6504, + "step": 20335 + }, + { + "epoch": 1.8424044755498177, + "grad_norm": 0.9335923194885254, + "learning_rate": 7.71763426569202e-05, + "loss": 2.6314, + "step": 20336 + }, + { + "epoch": 1.8424950737242645, + "grad_norm": 0.9933277368545532, + "learning_rate": 7.717030145592944e-05, + "loss": 2.444, + "step": 20337 + }, + { + "epoch": 1.8425856718987113, + "grad_norm": 0.833389163017273, + "learning_rate": 7.716426025493868e-05, + "loss": 2.3872, + "step": 20338 + }, + { + "epoch": 1.842676270073158, + "grad_norm": 1.0055652856826782, + "learning_rate": 7.715821905394794e-05, + "loss": 2.6082, + "step": 20339 + }, + { + "epoch": 1.8427668682476048, + "grad_norm": 0.9831059575080872, + "learning_rate": 7.715217785295717e-05, + "loss": 2.6146, + "step": 20340 + }, + { + "epoch": 1.8428574664220516, + "grad_norm": 0.9934636950492859, + "learning_rate": 7.714613665196642e-05, + "loss": 2.5503, + "step": 20341 + }, + { + "epoch": 1.8429480645964984, + "grad_norm": 0.8661195039749146, + "learning_rate": 7.714009545097565e-05, + "loss": 2.61, + "step": 20342 + }, + { + "epoch": 1.8430386627709452, + "grad_norm": 0.9685442447662354, + "learning_rate": 7.71340542499849e-05, + "loss": 2.6845, + "step": 20343 + }, + { + "epoch": 1.843129260945392, + "grad_norm": 0.881770133972168, + "learning_rate": 7.712801304899415e-05, + "loss": 2.9275, + "step": 20344 + }, + { + "epoch": 1.8432198591198388, + "grad_norm": 0.8577285408973694, + "learning_rate": 7.712197184800338e-05, + "loss": 2.5047, + "step": 20345 + }, + { + "epoch": 1.8433104572942856, + "grad_norm": 0.8256192803382874, + "learning_rate": 7.711593064701263e-05, + "loss": 2.6713, + "step": 20346 + }, + { + "epoch": 1.8434010554687323, + "grad_norm": 1.0033740997314453, + "learning_rate": 7.710988944602188e-05, + "loss": 2.8083, + "step": 20347 + }, + { + "epoch": 1.8434916536431791, + "grad_norm": 0.8726778626441956, + "learning_rate": 7.710384824503112e-05, + "loss": 1.9774, + "step": 20348 + }, + { + "epoch": 1.843582251817626, + "grad_norm": 0.9053054451942444, + "learning_rate": 7.709780704404036e-05, + "loss": 2.4715, + "step": 20349 + }, + { + "epoch": 1.8436728499920727, + "grad_norm": 1.0130168199539185, + "learning_rate": 7.70917658430496e-05, + "loss": 2.7937, + "step": 20350 + }, + { + "epoch": 1.8437634481665195, + "grad_norm": 0.8560870885848999, + "learning_rate": 7.708572464205884e-05, + "loss": 2.6169, + "step": 20351 + }, + { + "epoch": 1.8438540463409663, + "grad_norm": 0.8408955335617065, + "learning_rate": 7.707968344106809e-05, + "loss": 2.6145, + "step": 20352 + }, + { + "epoch": 1.843944644515413, + "grad_norm": 0.966546356678009, + "learning_rate": 7.707364224007732e-05, + "loss": 2.8849, + "step": 20353 + }, + { + "epoch": 1.8440352426898599, + "grad_norm": 0.7485771179199219, + "learning_rate": 7.706760103908658e-05, + "loss": 2.0282, + "step": 20354 + }, + { + "epoch": 1.8441258408643066, + "grad_norm": 0.9335477948188782, + "learning_rate": 7.706155983809582e-05, + "loss": 2.7303, + "step": 20355 + }, + { + "epoch": 1.8442164390387532, + "grad_norm": 0.9218019247055054, + "learning_rate": 7.705551863710506e-05, + "loss": 2.7589, + "step": 20356 + }, + { + "epoch": 1.8443070372132002, + "grad_norm": 0.8953384757041931, + "learning_rate": 7.70494774361143e-05, + "loss": 2.6718, + "step": 20357 + }, + { + "epoch": 1.8443976353876468, + "grad_norm": 0.9160562753677368, + "learning_rate": 7.704343623512355e-05, + "loss": 2.8839, + "step": 20358 + }, + { + "epoch": 1.8444882335620938, + "grad_norm": 0.8596379160881042, + "learning_rate": 7.703739503413278e-05, + "loss": 2.5992, + "step": 20359 + }, + { + "epoch": 1.8445788317365404, + "grad_norm": 1.035328984260559, + "learning_rate": 7.703135383314203e-05, + "loss": 2.9624, + "step": 20360 + }, + { + "epoch": 1.8446694299109874, + "grad_norm": 0.9067130088806152, + "learning_rate": 7.702531263215128e-05, + "loss": 2.61, + "step": 20361 + }, + { + "epoch": 1.844760028085434, + "grad_norm": 0.8646628856658936, + "learning_rate": 7.701927143116052e-05, + "loss": 2.7532, + "step": 20362 + }, + { + "epoch": 1.844850626259881, + "grad_norm": 0.9748340845108032, + "learning_rate": 7.701323023016977e-05, + "loss": 2.6969, + "step": 20363 + }, + { + "epoch": 1.8449412244343275, + "grad_norm": 0.847591757774353, + "learning_rate": 7.7007189029179e-05, + "loss": 2.6214, + "step": 20364 + }, + { + "epoch": 1.8450318226087745, + "grad_norm": 0.852816104888916, + "learning_rate": 7.700114782818825e-05, + "loss": 2.7057, + "step": 20365 + }, + { + "epoch": 1.845122420783221, + "grad_norm": 0.9477347135543823, + "learning_rate": 7.699510662719749e-05, + "loss": 2.9325, + "step": 20366 + }, + { + "epoch": 1.845213018957668, + "grad_norm": 0.7367287278175354, + "learning_rate": 7.698906542620673e-05, + "loss": 2.1021, + "step": 20367 + }, + { + "epoch": 1.8453036171321147, + "grad_norm": 0.882357120513916, + "learning_rate": 7.698302422521597e-05, + "loss": 2.7092, + "step": 20368 + }, + { + "epoch": 1.8453942153065617, + "grad_norm": 0.7903696894645691, + "learning_rate": 7.697698302422523e-05, + "loss": 2.0295, + "step": 20369 + }, + { + "epoch": 1.8454848134810082, + "grad_norm": 0.7526810169219971, + "learning_rate": 7.697094182323446e-05, + "loss": 2.0588, + "step": 20370 + }, + { + "epoch": 1.8455754116554552, + "grad_norm": 0.9397837519645691, + "learning_rate": 7.696490062224371e-05, + "loss": 2.6759, + "step": 20371 + }, + { + "epoch": 1.8456660098299018, + "grad_norm": 0.9565214514732361, + "learning_rate": 7.695885942125294e-05, + "loss": 2.7297, + "step": 20372 + }, + { + "epoch": 1.8457566080043488, + "grad_norm": 0.9024198055267334, + "learning_rate": 7.695281822026219e-05, + "loss": 2.7586, + "step": 20373 + }, + { + "epoch": 1.8458472061787954, + "grad_norm": 0.9451749324798584, + "learning_rate": 7.694677701927143e-05, + "loss": 2.9019, + "step": 20374 + }, + { + "epoch": 1.8459378043532424, + "grad_norm": 0.9170210361480713, + "learning_rate": 7.694073581828067e-05, + "loss": 2.8129, + "step": 20375 + }, + { + "epoch": 1.846028402527689, + "grad_norm": 0.9928160905838013, + "learning_rate": 7.693469461728992e-05, + "loss": 2.6115, + "step": 20376 + }, + { + "epoch": 1.846119000702136, + "grad_norm": 0.9194918870925903, + "learning_rate": 7.692865341629917e-05, + "loss": 2.6608, + "step": 20377 + }, + { + "epoch": 1.8462095988765825, + "grad_norm": 0.8907580971717834, + "learning_rate": 7.69226122153084e-05, + "loss": 2.762, + "step": 20378 + }, + { + "epoch": 1.8463001970510295, + "grad_norm": 0.8448914289474487, + "learning_rate": 7.691657101431765e-05, + "loss": 2.6578, + "step": 20379 + }, + { + "epoch": 1.8463907952254761, + "grad_norm": 0.9515262246131897, + "learning_rate": 7.69105298133269e-05, + "loss": 2.6943, + "step": 20380 + }, + { + "epoch": 1.8464813933999231, + "grad_norm": 0.8364024758338928, + "learning_rate": 7.690448861233613e-05, + "loss": 2.5776, + "step": 20381 + }, + { + "epoch": 1.8465719915743697, + "grad_norm": 0.8710337281227112, + "learning_rate": 7.689844741134538e-05, + "loss": 2.5964, + "step": 20382 + }, + { + "epoch": 1.8466625897488167, + "grad_norm": 1.0372296571731567, + "learning_rate": 7.689240621035461e-05, + "loss": 2.6331, + "step": 20383 + }, + { + "epoch": 1.8467531879232633, + "grad_norm": 0.7775468826293945, + "learning_rate": 7.688636500936388e-05, + "loss": 2.0362, + "step": 20384 + }, + { + "epoch": 1.8468437860977103, + "grad_norm": 0.8444197177886963, + "learning_rate": 7.688032380837311e-05, + "loss": 2.2417, + "step": 20385 + }, + { + "epoch": 1.8469343842721568, + "grad_norm": 0.8994075655937195, + "learning_rate": 7.687428260738236e-05, + "loss": 2.651, + "step": 20386 + }, + { + "epoch": 1.8470249824466038, + "grad_norm": 0.816493034362793, + "learning_rate": 7.686824140639159e-05, + "loss": 2.6013, + "step": 20387 + }, + { + "epoch": 1.8471155806210504, + "grad_norm": 0.9143253564834595, + "learning_rate": 7.686220020540084e-05, + "loss": 2.8388, + "step": 20388 + }, + { + "epoch": 1.8472061787954974, + "grad_norm": 0.9190605282783508, + "learning_rate": 7.685615900441007e-05, + "loss": 2.7684, + "step": 20389 + }, + { + "epoch": 1.847296776969944, + "grad_norm": 0.935136616230011, + "learning_rate": 7.685011780341932e-05, + "loss": 2.5544, + "step": 20390 + }, + { + "epoch": 1.847387375144391, + "grad_norm": 0.9464049935340881, + "learning_rate": 7.684407660242857e-05, + "loss": 3.1677, + "step": 20391 + }, + { + "epoch": 1.8474779733188376, + "grad_norm": 0.6623664498329163, + "learning_rate": 7.683803540143782e-05, + "loss": 1.3006, + "step": 20392 + }, + { + "epoch": 1.8475685714932846, + "grad_norm": 1.019702672958374, + "learning_rate": 7.683199420044705e-05, + "loss": 2.7317, + "step": 20393 + }, + { + "epoch": 1.8476591696677311, + "grad_norm": 0.9985131621360779, + "learning_rate": 7.68259529994563e-05, + "loss": 2.7596, + "step": 20394 + }, + { + "epoch": 1.847749767842178, + "grad_norm": 0.9078385233879089, + "learning_rate": 7.681991179846554e-05, + "loss": 3.3272, + "step": 20395 + }, + { + "epoch": 1.8478403660166247, + "grad_norm": 0.8897879719734192, + "learning_rate": 7.681387059747478e-05, + "loss": 2.4978, + "step": 20396 + }, + { + "epoch": 1.8479309641910715, + "grad_norm": 0.9498252272605896, + "learning_rate": 7.680782939648403e-05, + "loss": 2.6362, + "step": 20397 + }, + { + "epoch": 1.8480215623655183, + "grad_norm": 0.8792654275894165, + "learning_rate": 7.680178819549326e-05, + "loss": 2.7196, + "step": 20398 + }, + { + "epoch": 1.848112160539965, + "grad_norm": 0.9197742938995361, + "learning_rate": 7.679574699450252e-05, + "loss": 2.6249, + "step": 20399 + }, + { + "epoch": 1.8482027587144119, + "grad_norm": 0.7569084167480469, + "learning_rate": 7.678970579351176e-05, + "loss": 1.8653, + "step": 20400 + }, + { + "epoch": 1.8482933568888587, + "grad_norm": 0.8719643354415894, + "learning_rate": 7.6783664592521e-05, + "loss": 2.4944, + "step": 20401 + }, + { + "epoch": 1.8483839550633054, + "grad_norm": 0.9514539241790771, + "learning_rate": 7.677762339153024e-05, + "loss": 2.8521, + "step": 20402 + }, + { + "epoch": 1.8484745532377522, + "grad_norm": 0.9272658228874207, + "learning_rate": 7.677158219053948e-05, + "loss": 2.6194, + "step": 20403 + }, + { + "epoch": 1.848565151412199, + "grad_norm": 0.8274535536766052, + "learning_rate": 7.676554098954872e-05, + "loss": 2.0293, + "step": 20404 + }, + { + "epoch": 1.8486557495866458, + "grad_norm": 0.9391127824783325, + "learning_rate": 7.675949978855797e-05, + "loss": 2.917, + "step": 20405 + }, + { + "epoch": 1.8487463477610926, + "grad_norm": 0.8603679537773132, + "learning_rate": 7.675345858756721e-05, + "loss": 2.8415, + "step": 20406 + }, + { + "epoch": 1.8488369459355394, + "grad_norm": 0.8639307618141174, + "learning_rate": 7.674741738657646e-05, + "loss": 2.584, + "step": 20407 + }, + { + "epoch": 1.8489275441099862, + "grad_norm": 0.9417500495910645, + "learning_rate": 7.67413761855857e-05, + "loss": 2.6949, + "step": 20408 + }, + { + "epoch": 1.849018142284433, + "grad_norm": 0.8605596423149109, + "learning_rate": 7.673533498459494e-05, + "loss": 2.5776, + "step": 20409 + }, + { + "epoch": 1.8491087404588797, + "grad_norm": 0.930277943611145, + "learning_rate": 7.672929378360418e-05, + "loss": 2.7206, + "step": 20410 + }, + { + "epoch": 1.8491993386333265, + "grad_norm": 0.8818120360374451, + "learning_rate": 7.672325258261343e-05, + "loss": 2.7083, + "step": 20411 + }, + { + "epoch": 1.8492899368077733, + "grad_norm": 0.9285097122192383, + "learning_rate": 7.671721138162267e-05, + "loss": 2.8905, + "step": 20412 + }, + { + "epoch": 1.84938053498222, + "grad_norm": 0.8897932171821594, + "learning_rate": 7.67111701806319e-05, + "loss": 2.6915, + "step": 20413 + }, + { + "epoch": 1.849471133156667, + "grad_norm": 0.8794203400611877, + "learning_rate": 7.670512897964115e-05, + "loss": 2.7318, + "step": 20414 + }, + { + "epoch": 1.8495617313311137, + "grad_norm": 0.9443454742431641, + "learning_rate": 7.66990877786504e-05, + "loss": 2.9267, + "step": 20415 + }, + { + "epoch": 1.8496523295055605, + "grad_norm": 0.9007145166397095, + "learning_rate": 7.669304657765965e-05, + "loss": 2.5657, + "step": 20416 + }, + { + "epoch": 1.8497429276800073, + "grad_norm": 0.9867387413978577, + "learning_rate": 7.668700537666888e-05, + "loss": 2.5897, + "step": 20417 + }, + { + "epoch": 1.849833525854454, + "grad_norm": 0.8808964490890503, + "learning_rate": 7.668096417567813e-05, + "loss": 2.8941, + "step": 20418 + }, + { + "epoch": 1.8499241240289008, + "grad_norm": 0.8050032258033752, + "learning_rate": 7.667492297468737e-05, + "loss": 2.0185, + "step": 20419 + }, + { + "epoch": 1.8500147222033476, + "grad_norm": 0.9052850008010864, + "learning_rate": 7.666888177369661e-05, + "loss": 2.8004, + "step": 20420 + }, + { + "epoch": 1.8501053203777944, + "grad_norm": 0.8518828749656677, + "learning_rate": 7.666284057270586e-05, + "loss": 2.7567, + "step": 20421 + }, + { + "epoch": 1.8501959185522412, + "grad_norm": 0.8192505836486816, + "learning_rate": 7.665679937171511e-05, + "loss": 2.4748, + "step": 20422 + }, + { + "epoch": 1.850286516726688, + "grad_norm": 0.828520655632019, + "learning_rate": 7.665075817072434e-05, + "loss": 1.9808, + "step": 20423 + }, + { + "epoch": 1.8503771149011348, + "grad_norm": 0.9049224853515625, + "learning_rate": 7.664471696973359e-05, + "loss": 2.7948, + "step": 20424 + }, + { + "epoch": 1.8504677130755816, + "grad_norm": 0.8896938562393188, + "learning_rate": 7.663867576874282e-05, + "loss": 2.6897, + "step": 20425 + }, + { + "epoch": 1.8505583112500283, + "grad_norm": 0.9058800935745239, + "learning_rate": 7.663263456775207e-05, + "loss": 2.6624, + "step": 20426 + }, + { + "epoch": 1.8506489094244751, + "grad_norm": 0.9501200318336487, + "learning_rate": 7.662659336676132e-05, + "loss": 2.7553, + "step": 20427 + }, + { + "epoch": 1.850739507598922, + "grad_norm": 0.9340964555740356, + "learning_rate": 7.662055216577055e-05, + "loss": 2.9148, + "step": 20428 + }, + { + "epoch": 1.8508301057733687, + "grad_norm": 0.8456510305404663, + "learning_rate": 7.66145109647798e-05, + "loss": 2.5522, + "step": 20429 + }, + { + "epoch": 1.8509207039478155, + "grad_norm": 0.9895932078361511, + "learning_rate": 7.660846976378905e-05, + "loss": 2.6628, + "step": 20430 + }, + { + "epoch": 1.8510113021222623, + "grad_norm": 0.9653523564338684, + "learning_rate": 7.66024285627983e-05, + "loss": 2.8049, + "step": 20431 + }, + { + "epoch": 1.851101900296709, + "grad_norm": 0.908409595489502, + "learning_rate": 7.659638736180753e-05, + "loss": 2.5959, + "step": 20432 + }, + { + "epoch": 1.8511924984711559, + "grad_norm": 0.9270759224891663, + "learning_rate": 7.659034616081678e-05, + "loss": 2.9798, + "step": 20433 + }, + { + "epoch": 1.8512830966456026, + "grad_norm": 0.7736039161682129, + "learning_rate": 7.658430495982601e-05, + "loss": 1.9307, + "step": 20434 + }, + { + "epoch": 1.8513736948200494, + "grad_norm": 0.8640376925468445, + "learning_rate": 7.657826375883526e-05, + "loss": 2.5644, + "step": 20435 + }, + { + "epoch": 1.8514642929944962, + "grad_norm": 0.9736324548721313, + "learning_rate": 7.657222255784451e-05, + "loss": 2.7539, + "step": 20436 + }, + { + "epoch": 1.8515548911689428, + "grad_norm": 0.8823409080505371, + "learning_rate": 7.656618135685375e-05, + "loss": 2.5874, + "step": 20437 + }, + { + "epoch": 1.8516454893433898, + "grad_norm": 0.9262875914573669, + "learning_rate": 7.656014015586299e-05, + "loss": 2.6484, + "step": 20438 + }, + { + "epoch": 1.8517360875178364, + "grad_norm": 0.8707300424575806, + "learning_rate": 7.655409895487224e-05, + "loss": 2.4492, + "step": 20439 + }, + { + "epoch": 1.8518266856922834, + "grad_norm": 0.9419215321540833, + "learning_rate": 7.654805775388147e-05, + "loss": 2.6628, + "step": 20440 + }, + { + "epoch": 1.85191728386673, + "grad_norm": 0.9120450019836426, + "learning_rate": 7.654201655289072e-05, + "loss": 2.8354, + "step": 20441 + }, + { + "epoch": 1.852007882041177, + "grad_norm": 0.9662636518478394, + "learning_rate": 7.653597535189995e-05, + "loss": 2.6617, + "step": 20442 + }, + { + "epoch": 1.8520984802156235, + "grad_norm": 0.8711923360824585, + "learning_rate": 7.65299341509092e-05, + "loss": 2.8081, + "step": 20443 + }, + { + "epoch": 1.8521890783900705, + "grad_norm": 0.9989628195762634, + "learning_rate": 7.652389294991845e-05, + "loss": 2.3143, + "step": 20444 + }, + { + "epoch": 1.852279676564517, + "grad_norm": 0.8926323056221008, + "learning_rate": 7.65178517489277e-05, + "loss": 2.8656, + "step": 20445 + }, + { + "epoch": 1.852370274738964, + "grad_norm": 1.0364506244659424, + "learning_rate": 7.651181054793693e-05, + "loss": 2.9433, + "step": 20446 + }, + { + "epoch": 1.8524608729134107, + "grad_norm": 0.8899058103561401, + "learning_rate": 7.650576934694618e-05, + "loss": 2.7797, + "step": 20447 + }, + { + "epoch": 1.8525514710878577, + "grad_norm": 0.9818093180656433, + "learning_rate": 7.649972814595542e-05, + "loss": 2.4531, + "step": 20448 + }, + { + "epoch": 1.8526420692623042, + "grad_norm": 0.9370747804641724, + "learning_rate": 7.649368694496466e-05, + "loss": 2.5192, + "step": 20449 + }, + { + "epoch": 1.8527326674367512, + "grad_norm": 0.8685317039489746, + "learning_rate": 7.64876457439739e-05, + "loss": 2.7495, + "step": 20450 + }, + { + "epoch": 1.8528232656111978, + "grad_norm": 0.9611332416534424, + "learning_rate": 7.648160454298315e-05, + "loss": 2.9114, + "step": 20451 + }, + { + "epoch": 1.8529138637856448, + "grad_norm": 0.9586687684059143, + "learning_rate": 7.64755633419924e-05, + "loss": 2.5472, + "step": 20452 + }, + { + "epoch": 1.8530044619600914, + "grad_norm": 0.9434218406677246, + "learning_rate": 7.646952214100163e-05, + "loss": 2.8122, + "step": 20453 + }, + { + "epoch": 1.8530950601345384, + "grad_norm": 1.0464094877243042, + "learning_rate": 7.646348094001088e-05, + "loss": 2.7012, + "step": 20454 + }, + { + "epoch": 1.853185658308985, + "grad_norm": 0.9073173999786377, + "learning_rate": 7.645743973902012e-05, + "loss": 2.7024, + "step": 20455 + }, + { + "epoch": 1.853276256483432, + "grad_norm": 0.8812392354011536, + "learning_rate": 7.645139853802936e-05, + "loss": 2.6199, + "step": 20456 + }, + { + "epoch": 1.8533668546578785, + "grad_norm": 0.7426113486289978, + "learning_rate": 7.64453573370386e-05, + "loss": 1.8314, + "step": 20457 + }, + { + "epoch": 1.8534574528323255, + "grad_norm": 0.9565780758857727, + "learning_rate": 7.643931613604785e-05, + "loss": 2.8225, + "step": 20458 + }, + { + "epoch": 1.8535480510067721, + "grad_norm": 0.8605374693870544, + "learning_rate": 7.64332749350571e-05, + "loss": 2.6699, + "step": 20459 + }, + { + "epoch": 1.8536386491812191, + "grad_norm": 1.0083671808242798, + "learning_rate": 7.642723373406634e-05, + "loss": 2.7206, + "step": 20460 + }, + { + "epoch": 1.8537292473556657, + "grad_norm": 0.9469422698020935, + "learning_rate": 7.642119253307557e-05, + "loss": 2.4075, + "step": 20461 + }, + { + "epoch": 1.8538198455301127, + "grad_norm": 0.8486142158508301, + "learning_rate": 7.641515133208482e-05, + "loss": 2.4833, + "step": 20462 + }, + { + "epoch": 1.8539104437045593, + "grad_norm": 0.8992189168930054, + "learning_rate": 7.640911013109407e-05, + "loss": 2.658, + "step": 20463 + }, + { + "epoch": 1.8540010418790063, + "grad_norm": 0.8796189427375793, + "learning_rate": 7.64030689301033e-05, + "loss": 2.6229, + "step": 20464 + }, + { + "epoch": 1.8540916400534528, + "grad_norm": 0.8798068761825562, + "learning_rate": 7.639702772911255e-05, + "loss": 2.7656, + "step": 20465 + }, + { + "epoch": 1.8541822382278998, + "grad_norm": 0.9257946014404297, + "learning_rate": 7.63909865281218e-05, + "loss": 2.8315, + "step": 20466 + }, + { + "epoch": 1.8542728364023464, + "grad_norm": 0.8814901113510132, + "learning_rate": 7.638494532713105e-05, + "loss": 2.6853, + "step": 20467 + }, + { + "epoch": 1.8543634345767934, + "grad_norm": 0.88615882396698, + "learning_rate": 7.637890412614028e-05, + "loss": 2.6174, + "step": 20468 + }, + { + "epoch": 1.85445403275124, + "grad_norm": 0.906890332698822, + "learning_rate": 7.637286292514953e-05, + "loss": 2.6443, + "step": 20469 + }, + { + "epoch": 1.854544630925687, + "grad_norm": 0.8671799302101135, + "learning_rate": 7.636682172415876e-05, + "loss": 2.3897, + "step": 20470 + }, + { + "epoch": 1.8546352291001336, + "grad_norm": 0.9602862596511841, + "learning_rate": 7.636078052316801e-05, + "loss": 2.988, + "step": 20471 + }, + { + "epoch": 1.8547258272745806, + "grad_norm": 0.8442705869674683, + "learning_rate": 7.635473932217724e-05, + "loss": 2.4297, + "step": 20472 + }, + { + "epoch": 1.8548164254490271, + "grad_norm": 0.9753922820091248, + "learning_rate": 7.634869812118649e-05, + "loss": 2.6417, + "step": 20473 + }, + { + "epoch": 1.8549070236234741, + "grad_norm": 0.9071880578994751, + "learning_rate": 7.634265692019574e-05, + "loss": 2.869, + "step": 20474 + }, + { + "epoch": 1.8549976217979207, + "grad_norm": 0.9273914694786072, + "learning_rate": 7.633661571920499e-05, + "loss": 2.3989, + "step": 20475 + }, + { + "epoch": 1.8550882199723675, + "grad_norm": 0.8965575695037842, + "learning_rate": 7.633057451821422e-05, + "loss": 2.84, + "step": 20476 + }, + { + "epoch": 1.8551788181468143, + "grad_norm": 0.8671168684959412, + "learning_rate": 7.632453331722347e-05, + "loss": 2.689, + "step": 20477 + }, + { + "epoch": 1.855269416321261, + "grad_norm": 0.8949119448661804, + "learning_rate": 7.63184921162327e-05, + "loss": 2.814, + "step": 20478 + }, + { + "epoch": 1.8553600144957079, + "grad_norm": 0.845125675201416, + "learning_rate": 7.631245091524195e-05, + "loss": 2.6353, + "step": 20479 + }, + { + "epoch": 1.8554506126701547, + "grad_norm": 0.9263250231742859, + "learning_rate": 7.63064097142512e-05, + "loss": 2.8458, + "step": 20480 + }, + { + "epoch": 1.8555412108446014, + "grad_norm": 0.9231727719306946, + "learning_rate": 7.630036851326045e-05, + "loss": 2.7258, + "step": 20481 + }, + { + "epoch": 1.8556318090190482, + "grad_norm": 0.8627766370773315, + "learning_rate": 7.62943273122697e-05, + "loss": 2.849, + "step": 20482 + }, + { + "epoch": 1.855722407193495, + "grad_norm": 0.8978521823883057, + "learning_rate": 7.628828611127893e-05, + "loss": 2.7823, + "step": 20483 + }, + { + "epoch": 1.8558130053679418, + "grad_norm": 0.9082027673721313, + "learning_rate": 7.628224491028818e-05, + "loss": 2.5595, + "step": 20484 + }, + { + "epoch": 1.8559036035423886, + "grad_norm": 0.7791251540184021, + "learning_rate": 7.627620370929741e-05, + "loss": 1.9141, + "step": 20485 + }, + { + "epoch": 1.8559942017168354, + "grad_norm": 0.9323210716247559, + "learning_rate": 7.627016250830666e-05, + "loss": 2.7222, + "step": 20486 + }, + { + "epoch": 1.8560847998912822, + "grad_norm": 0.9698770046234131, + "learning_rate": 7.626412130731589e-05, + "loss": 2.5751, + "step": 20487 + }, + { + "epoch": 1.856175398065729, + "grad_norm": 0.8993678092956543, + "learning_rate": 7.625808010632514e-05, + "loss": 2.7785, + "step": 20488 + }, + { + "epoch": 1.8562659962401757, + "grad_norm": 0.8877344131469727, + "learning_rate": 7.625203890533439e-05, + "loss": 2.6652, + "step": 20489 + }, + { + "epoch": 1.8563565944146225, + "grad_norm": 0.9327599406242371, + "learning_rate": 7.624599770434363e-05, + "loss": 2.7846, + "step": 20490 + }, + { + "epoch": 1.8564471925890693, + "grad_norm": 0.9657633304595947, + "learning_rate": 7.623995650335287e-05, + "loss": 2.6052, + "step": 20491 + }, + { + "epoch": 1.856537790763516, + "grad_norm": 0.8872758150100708, + "learning_rate": 7.623391530236212e-05, + "loss": 2.8126, + "step": 20492 + }, + { + "epoch": 1.856628388937963, + "grad_norm": 0.8706942200660706, + "learning_rate": 7.622787410137135e-05, + "loss": 2.7714, + "step": 20493 + }, + { + "epoch": 1.8567189871124097, + "grad_norm": 0.8679236173629761, + "learning_rate": 7.62218329003806e-05, + "loss": 3.0076, + "step": 20494 + }, + { + "epoch": 1.8568095852868565, + "grad_norm": 0.9625681042671204, + "learning_rate": 7.621579169938984e-05, + "loss": 2.6842, + "step": 20495 + }, + { + "epoch": 1.8569001834613033, + "grad_norm": 0.7714703679084778, + "learning_rate": 7.620975049839909e-05, + "loss": 1.9671, + "step": 20496 + }, + { + "epoch": 1.85699078163575, + "grad_norm": 0.9321379661560059, + "learning_rate": 7.620370929740833e-05, + "loss": 2.7163, + "step": 20497 + }, + { + "epoch": 1.8570813798101968, + "grad_norm": 0.8434170484542847, + "learning_rate": 7.619766809641757e-05, + "loss": 1.754, + "step": 20498 + }, + { + "epoch": 1.8571719779846436, + "grad_norm": 0.9314574599266052, + "learning_rate": 7.619162689542682e-05, + "loss": 2.7555, + "step": 20499 + }, + { + "epoch": 1.8572625761590904, + "grad_norm": 0.9237228631973267, + "learning_rate": 7.618558569443606e-05, + "loss": 2.6304, + "step": 20500 + }, + { + "epoch": 1.8573531743335372, + "grad_norm": 0.9376498460769653, + "learning_rate": 7.61795444934453e-05, + "loss": 2.6144, + "step": 20501 + }, + { + "epoch": 1.857443772507984, + "grad_norm": 0.9219529628753662, + "learning_rate": 7.617350329245454e-05, + "loss": 2.8496, + "step": 20502 + }, + { + "epoch": 1.8575343706824308, + "grad_norm": 0.9154138565063477, + "learning_rate": 7.616746209146378e-05, + "loss": 2.7306, + "step": 20503 + }, + { + "epoch": 1.8576249688568776, + "grad_norm": 0.8963912129402161, + "learning_rate": 7.616142089047303e-05, + "loss": 2.8941, + "step": 20504 + }, + { + "epoch": 1.8577155670313243, + "grad_norm": 0.9544962644577026, + "learning_rate": 7.615537968948228e-05, + "loss": 2.6713, + "step": 20505 + }, + { + "epoch": 1.8578061652057711, + "grad_norm": 0.8960745334625244, + "learning_rate": 7.614933848849151e-05, + "loss": 2.5631, + "step": 20506 + }, + { + "epoch": 1.857896763380218, + "grad_norm": 0.9316712617874146, + "learning_rate": 7.614329728750076e-05, + "loss": 2.6903, + "step": 20507 + }, + { + "epoch": 1.8579873615546647, + "grad_norm": 0.7869904637336731, + "learning_rate": 7.613725608651e-05, + "loss": 2.0087, + "step": 20508 + }, + { + "epoch": 1.8580779597291115, + "grad_norm": 0.9401962757110596, + "learning_rate": 7.613121488551924e-05, + "loss": 2.6752, + "step": 20509 + }, + { + "epoch": 1.8581685579035583, + "grad_norm": 0.8823017477989197, + "learning_rate": 7.612517368452848e-05, + "loss": 2.7637, + "step": 20510 + }, + { + "epoch": 1.858259156078005, + "grad_norm": 0.8958636522293091, + "learning_rate": 7.611913248353774e-05, + "loss": 2.825, + "step": 20511 + }, + { + "epoch": 1.8583497542524519, + "grad_norm": 1.2394161224365234, + "learning_rate": 7.611309128254697e-05, + "loss": 2.5461, + "step": 20512 + }, + { + "epoch": 1.8584403524268986, + "grad_norm": 0.9292541146278381, + "learning_rate": 7.610705008155622e-05, + "loss": 2.7995, + "step": 20513 + }, + { + "epoch": 1.8585309506013454, + "grad_norm": 0.9525678157806396, + "learning_rate": 7.610100888056545e-05, + "loss": 2.8493, + "step": 20514 + }, + { + "epoch": 1.8586215487757922, + "grad_norm": 0.9657670855522156, + "learning_rate": 7.60949676795747e-05, + "loss": 2.9127, + "step": 20515 + }, + { + "epoch": 1.858712146950239, + "grad_norm": 0.9396757483482361, + "learning_rate": 7.608892647858395e-05, + "loss": 2.632, + "step": 20516 + }, + { + "epoch": 1.8588027451246858, + "grad_norm": 0.9539279341697693, + "learning_rate": 7.608288527759318e-05, + "loss": 2.4823, + "step": 20517 + }, + { + "epoch": 1.8588933432991324, + "grad_norm": 0.9195917844772339, + "learning_rate": 7.607684407660243e-05, + "loss": 2.947, + "step": 20518 + }, + { + "epoch": 1.8589839414735794, + "grad_norm": 0.9026752710342407, + "learning_rate": 7.607080287561168e-05, + "loss": 2.7648, + "step": 20519 + }, + { + "epoch": 1.859074539648026, + "grad_norm": 0.9336991310119629, + "learning_rate": 7.606476167462093e-05, + "loss": 3.0889, + "step": 20520 + }, + { + "epoch": 1.859165137822473, + "grad_norm": 0.893718421459198, + "learning_rate": 7.605872047363016e-05, + "loss": 2.5325, + "step": 20521 + }, + { + "epoch": 1.8592557359969195, + "grad_norm": 0.8499588966369629, + "learning_rate": 7.605267927263941e-05, + "loss": 2.5532, + "step": 20522 + }, + { + "epoch": 1.8593463341713665, + "grad_norm": 0.7738664150238037, + "learning_rate": 7.604663807164864e-05, + "loss": 1.8531, + "step": 20523 + }, + { + "epoch": 1.859436932345813, + "grad_norm": 0.9440516829490662, + "learning_rate": 7.604059687065789e-05, + "loss": 2.4788, + "step": 20524 + }, + { + "epoch": 1.85952753052026, + "grad_norm": 0.8615112900733948, + "learning_rate": 7.603455566966712e-05, + "loss": 2.7315, + "step": 20525 + }, + { + "epoch": 1.8596181286947067, + "grad_norm": 0.8696357607841492, + "learning_rate": 7.602851446867638e-05, + "loss": 2.1693, + "step": 20526 + }, + { + "epoch": 1.8597087268691537, + "grad_norm": 0.884162425994873, + "learning_rate": 7.602247326768562e-05, + "loss": 2.673, + "step": 20527 + }, + { + "epoch": 1.8597993250436002, + "grad_norm": 0.8699331283569336, + "learning_rate": 7.601643206669487e-05, + "loss": 2.7909, + "step": 20528 + }, + { + "epoch": 1.8598899232180472, + "grad_norm": 0.878807008266449, + "learning_rate": 7.60103908657041e-05, + "loss": 2.5883, + "step": 20529 + }, + { + "epoch": 1.8599805213924938, + "grad_norm": 0.9131700396537781, + "learning_rate": 7.600434966471335e-05, + "loss": 2.4817, + "step": 20530 + }, + { + "epoch": 1.8600711195669408, + "grad_norm": 0.7898720502853394, + "learning_rate": 7.59983084637226e-05, + "loss": 1.9832, + "step": 20531 + }, + { + "epoch": 1.8601617177413874, + "grad_norm": 0.8458994030952454, + "learning_rate": 7.599226726273183e-05, + "loss": 2.5926, + "step": 20532 + }, + { + "epoch": 1.8602523159158344, + "grad_norm": 0.8676538467407227, + "learning_rate": 7.598622606174108e-05, + "loss": 2.7598, + "step": 20533 + }, + { + "epoch": 1.860342914090281, + "grad_norm": 0.8836081624031067, + "learning_rate": 7.598018486075032e-05, + "loss": 2.6487, + "step": 20534 + }, + { + "epoch": 1.860433512264728, + "grad_norm": 0.829328715801239, + "learning_rate": 7.597414365975957e-05, + "loss": 2.1155, + "step": 20535 + }, + { + "epoch": 1.8605241104391745, + "grad_norm": 0.8928012847900391, + "learning_rate": 7.59681024587688e-05, + "loss": 2.8005, + "step": 20536 + }, + { + "epoch": 1.8606147086136215, + "grad_norm": 0.8573111295700073, + "learning_rate": 7.596206125777805e-05, + "loss": 2.6672, + "step": 20537 + }, + { + "epoch": 1.8607053067880681, + "grad_norm": 0.9506421685218811, + "learning_rate": 7.595602005678729e-05, + "loss": 2.744, + "step": 20538 + }, + { + "epoch": 1.8607959049625151, + "grad_norm": 0.8088353276252747, + "learning_rate": 7.594997885579654e-05, + "loss": 1.8375, + "step": 20539 + }, + { + "epoch": 1.8608865031369617, + "grad_norm": 0.91485595703125, + "learning_rate": 7.594393765480577e-05, + "loss": 2.8133, + "step": 20540 + }, + { + "epoch": 1.8609771013114087, + "grad_norm": 0.8570718765258789, + "learning_rate": 7.593789645381503e-05, + "loss": 2.5329, + "step": 20541 + }, + { + "epoch": 1.8610676994858553, + "grad_norm": 0.8851035833358765, + "learning_rate": 7.593185525282427e-05, + "loss": 1.9784, + "step": 20542 + }, + { + "epoch": 1.8611582976603023, + "grad_norm": 0.8736917972564697, + "learning_rate": 7.592581405183351e-05, + "loss": 2.57, + "step": 20543 + }, + { + "epoch": 1.8612488958347488, + "grad_norm": 0.9179188013076782, + "learning_rate": 7.591977285084275e-05, + "loss": 2.8662, + "step": 20544 + }, + { + "epoch": 1.8613394940091958, + "grad_norm": 0.9308531284332275, + "learning_rate": 7.5913731649852e-05, + "loss": 2.8719, + "step": 20545 + }, + { + "epoch": 1.8614300921836424, + "grad_norm": 0.9475685358047485, + "learning_rate": 7.590769044886123e-05, + "loss": 2.6395, + "step": 20546 + }, + { + "epoch": 1.8615206903580894, + "grad_norm": 0.8613458275794983, + "learning_rate": 7.590164924787048e-05, + "loss": 2.7578, + "step": 20547 + }, + { + "epoch": 1.861611288532536, + "grad_norm": 0.9171109795570374, + "learning_rate": 7.589560804687972e-05, + "loss": 2.639, + "step": 20548 + }, + { + "epoch": 1.861701886706983, + "grad_norm": 1.0654081106185913, + "learning_rate": 7.588956684588897e-05, + "loss": 2.6821, + "step": 20549 + }, + { + "epoch": 1.8617924848814296, + "grad_norm": 0.8998258709907532, + "learning_rate": 7.588352564489822e-05, + "loss": 2.7093, + "step": 20550 + }, + { + "epoch": 1.8618830830558766, + "grad_norm": 0.9151889085769653, + "learning_rate": 7.587748444390745e-05, + "loss": 2.449, + "step": 20551 + }, + { + "epoch": 1.8619736812303231, + "grad_norm": 0.7925155758857727, + "learning_rate": 7.58714432429167e-05, + "loss": 1.9303, + "step": 20552 + }, + { + "epoch": 1.8620642794047702, + "grad_norm": 0.7929262518882751, + "learning_rate": 7.586540204192593e-05, + "loss": 2.0465, + "step": 20553 + }, + { + "epoch": 1.8621548775792167, + "grad_norm": 0.8404635190963745, + "learning_rate": 7.585936084093518e-05, + "loss": 2.5424, + "step": 20554 + }, + { + "epoch": 1.8622454757536637, + "grad_norm": 0.8087645173072815, + "learning_rate": 7.585331963994442e-05, + "loss": 1.9612, + "step": 20555 + }, + { + "epoch": 1.8623360739281103, + "grad_norm": 0.8847969770431519, + "learning_rate": 7.584727843895368e-05, + "loss": 2.754, + "step": 20556 + }, + { + "epoch": 1.862426672102557, + "grad_norm": 0.897747278213501, + "learning_rate": 7.584123723796291e-05, + "loss": 2.7067, + "step": 20557 + }, + { + "epoch": 1.8625172702770039, + "grad_norm": 0.9786962270736694, + "learning_rate": 7.583519603697216e-05, + "loss": 2.7843, + "step": 20558 + }, + { + "epoch": 1.8626078684514507, + "grad_norm": 0.8646273016929626, + "learning_rate": 7.582915483598139e-05, + "loss": 2.7045, + "step": 20559 + }, + { + "epoch": 1.8626984666258974, + "grad_norm": 0.9308980703353882, + "learning_rate": 7.582311363499064e-05, + "loss": 3.0103, + "step": 20560 + }, + { + "epoch": 1.8627890648003442, + "grad_norm": 0.9253168106079102, + "learning_rate": 7.581707243399987e-05, + "loss": 2.7855, + "step": 20561 + }, + { + "epoch": 1.862879662974791, + "grad_norm": 0.8630519509315491, + "learning_rate": 7.581103123300912e-05, + "loss": 2.1884, + "step": 20562 + }, + { + "epoch": 1.8629702611492378, + "grad_norm": 0.906859278678894, + "learning_rate": 7.580499003201837e-05, + "loss": 2.5344, + "step": 20563 + }, + { + "epoch": 1.8630608593236846, + "grad_norm": 1.010217547416687, + "learning_rate": 7.579894883102762e-05, + "loss": 2.653, + "step": 20564 + }, + { + "epoch": 1.8631514574981314, + "grad_norm": 0.8781803250312805, + "learning_rate": 7.579290763003685e-05, + "loss": 2.1569, + "step": 20565 + }, + { + "epoch": 1.8632420556725782, + "grad_norm": 0.9520281553268433, + "learning_rate": 7.57868664290461e-05, + "loss": 2.7981, + "step": 20566 + }, + { + "epoch": 1.863332653847025, + "grad_norm": 0.9220349192619324, + "learning_rate": 7.578082522805535e-05, + "loss": 2.6683, + "step": 20567 + }, + { + "epoch": 1.8634232520214717, + "grad_norm": 0.9635587930679321, + "learning_rate": 7.577478402706458e-05, + "loss": 2.9095, + "step": 20568 + }, + { + "epoch": 1.8635138501959185, + "grad_norm": 0.9141726493835449, + "learning_rate": 7.576874282607383e-05, + "loss": 2.5865, + "step": 20569 + }, + { + "epoch": 1.8636044483703653, + "grad_norm": 0.8897383213043213, + "learning_rate": 7.576270162508306e-05, + "loss": 2.6346, + "step": 20570 + }, + { + "epoch": 1.863695046544812, + "grad_norm": 0.9947674870491028, + "learning_rate": 7.575666042409232e-05, + "loss": 2.9941, + "step": 20571 + }, + { + "epoch": 1.863785644719259, + "grad_norm": 0.922577440738678, + "learning_rate": 7.575061922310156e-05, + "loss": 2.363, + "step": 20572 + }, + { + "epoch": 1.8638762428937057, + "grad_norm": 0.9326639771461487, + "learning_rate": 7.57445780221108e-05, + "loss": 2.8478, + "step": 20573 + }, + { + "epoch": 1.8639668410681525, + "grad_norm": 0.9502148032188416, + "learning_rate": 7.573853682112004e-05, + "loss": 2.5204, + "step": 20574 + }, + { + "epoch": 1.8640574392425993, + "grad_norm": 0.8826220035552979, + "learning_rate": 7.573249562012929e-05, + "loss": 2.653, + "step": 20575 + }, + { + "epoch": 1.864148037417046, + "grad_norm": 0.9098939895629883, + "learning_rate": 7.572645441913852e-05, + "loss": 2.7086, + "step": 20576 + }, + { + "epoch": 1.8642386355914928, + "grad_norm": 0.8417253494262695, + "learning_rate": 7.572041321814777e-05, + "loss": 2.605, + "step": 20577 + }, + { + "epoch": 1.8643292337659396, + "grad_norm": 0.9579405188560486, + "learning_rate": 7.5714372017157e-05, + "loss": 2.7827, + "step": 20578 + }, + { + "epoch": 1.8644198319403864, + "grad_norm": 0.9855954051017761, + "learning_rate": 7.570833081616626e-05, + "loss": 2.644, + "step": 20579 + }, + { + "epoch": 1.8645104301148332, + "grad_norm": 0.9856294393539429, + "learning_rate": 7.57022896151755e-05, + "loss": 2.7639, + "step": 20580 + }, + { + "epoch": 1.86460102828928, + "grad_norm": 0.9511736035346985, + "learning_rate": 7.569624841418475e-05, + "loss": 2.7489, + "step": 20581 + }, + { + "epoch": 1.8646916264637268, + "grad_norm": 0.7887997627258301, + "learning_rate": 7.569020721319399e-05, + "loss": 2.0385, + "step": 20582 + }, + { + "epoch": 1.8647822246381736, + "grad_norm": 0.8938313126564026, + "learning_rate": 7.568416601220323e-05, + "loss": 2.6712, + "step": 20583 + }, + { + "epoch": 1.8648728228126203, + "grad_norm": 0.9458194971084595, + "learning_rate": 7.567812481121247e-05, + "loss": 2.6646, + "step": 20584 + }, + { + "epoch": 1.8649634209870671, + "grad_norm": 0.7588551044464111, + "learning_rate": 7.567208361022171e-05, + "loss": 2.0556, + "step": 20585 + }, + { + "epoch": 1.865054019161514, + "grad_norm": 0.9608599543571472, + "learning_rate": 7.566604240923097e-05, + "loss": 2.7301, + "step": 20586 + }, + { + "epoch": 1.8651446173359607, + "grad_norm": 0.9462727904319763, + "learning_rate": 7.56600012082402e-05, + "loss": 2.6997, + "step": 20587 + }, + { + "epoch": 1.8652352155104075, + "grad_norm": 0.8813378214836121, + "learning_rate": 7.565396000724945e-05, + "loss": 2.732, + "step": 20588 + }, + { + "epoch": 1.8653258136848543, + "grad_norm": 0.8854078054428101, + "learning_rate": 7.564791880625869e-05, + "loss": 2.7962, + "step": 20589 + }, + { + "epoch": 1.865416411859301, + "grad_norm": 0.8715134859085083, + "learning_rate": 7.564187760526793e-05, + "loss": 2.6558, + "step": 20590 + }, + { + "epoch": 1.8655070100337479, + "grad_norm": 0.9782679080963135, + "learning_rate": 7.563583640427717e-05, + "loss": 2.608, + "step": 20591 + }, + { + "epoch": 1.8655976082081946, + "grad_norm": 1.190470576286316, + "learning_rate": 7.562979520328642e-05, + "loss": 2.6257, + "step": 20592 + }, + { + "epoch": 1.8656882063826414, + "grad_norm": 0.9218496084213257, + "learning_rate": 7.562375400229565e-05, + "loss": 2.8395, + "step": 20593 + }, + { + "epoch": 1.8657788045570882, + "grad_norm": 0.8611630797386169, + "learning_rate": 7.561771280130491e-05, + "loss": 2.7375, + "step": 20594 + }, + { + "epoch": 1.865869402731535, + "grad_norm": 0.9102029800415039, + "learning_rate": 7.561167160031414e-05, + "loss": 3.1113, + "step": 20595 + }, + { + "epoch": 1.8659600009059818, + "grad_norm": 0.9570385813713074, + "learning_rate": 7.560563039932339e-05, + "loss": 2.8104, + "step": 20596 + }, + { + "epoch": 1.8660505990804286, + "grad_norm": 0.9272493720054626, + "learning_rate": 7.559958919833263e-05, + "loss": 2.5224, + "step": 20597 + }, + { + "epoch": 1.8661411972548754, + "grad_norm": 0.7534897923469543, + "learning_rate": 7.559354799734187e-05, + "loss": 1.9801, + "step": 20598 + }, + { + "epoch": 1.866231795429322, + "grad_norm": 0.8932461142539978, + "learning_rate": 7.558750679635112e-05, + "loss": 2.5014, + "step": 20599 + }, + { + "epoch": 1.866322393603769, + "grad_norm": 0.8503133654594421, + "learning_rate": 7.558146559536036e-05, + "loss": 3.0302, + "step": 20600 + }, + { + "epoch": 1.8664129917782155, + "grad_norm": 0.8979883193969727, + "learning_rate": 7.55754243943696e-05, + "loss": 2.5405, + "step": 20601 + }, + { + "epoch": 1.8665035899526625, + "grad_norm": 0.8636007905006409, + "learning_rate": 7.556938319337885e-05, + "loss": 2.6133, + "step": 20602 + }, + { + "epoch": 1.866594188127109, + "grad_norm": 0.8936758041381836, + "learning_rate": 7.55633419923881e-05, + "loss": 2.6695, + "step": 20603 + }, + { + "epoch": 1.866684786301556, + "grad_norm": 0.7425009608268738, + "learning_rate": 7.555730079139733e-05, + "loss": 1.7016, + "step": 20604 + }, + { + "epoch": 1.8667753844760027, + "grad_norm": 0.9176557064056396, + "learning_rate": 7.555125959040658e-05, + "loss": 2.6741, + "step": 20605 + }, + { + "epoch": 1.8668659826504497, + "grad_norm": 0.9023463726043701, + "learning_rate": 7.554521838941581e-05, + "loss": 2.5953, + "step": 20606 + }, + { + "epoch": 1.8669565808248962, + "grad_norm": 0.9280257821083069, + "learning_rate": 7.553917718842506e-05, + "loss": 2.4901, + "step": 20607 + }, + { + "epoch": 1.8670471789993432, + "grad_norm": 0.8578869104385376, + "learning_rate": 7.55331359874343e-05, + "loss": 2.3729, + "step": 20608 + }, + { + "epoch": 1.8671377771737898, + "grad_norm": 0.8324829339981079, + "learning_rate": 7.552709478644356e-05, + "loss": 2.6414, + "step": 20609 + }, + { + "epoch": 1.8672283753482368, + "grad_norm": 0.8880448341369629, + "learning_rate": 7.552105358545279e-05, + "loss": 3.0027, + "step": 20610 + }, + { + "epoch": 1.8673189735226834, + "grad_norm": 0.9310717582702637, + "learning_rate": 7.551501238446204e-05, + "loss": 2.4163, + "step": 20611 + }, + { + "epoch": 1.8674095716971304, + "grad_norm": 0.8903889060020447, + "learning_rate": 7.550897118347127e-05, + "loss": 2.6382, + "step": 20612 + }, + { + "epoch": 1.867500169871577, + "grad_norm": 0.9290569424629211, + "learning_rate": 7.550292998248052e-05, + "loss": 2.6561, + "step": 20613 + }, + { + "epoch": 1.867590768046024, + "grad_norm": 0.988174557685852, + "learning_rate": 7.549688878148977e-05, + "loss": 2.7808, + "step": 20614 + }, + { + "epoch": 1.8676813662204705, + "grad_norm": 0.9084984064102173, + "learning_rate": 7.5490847580499e-05, + "loss": 2.8344, + "step": 20615 + }, + { + "epoch": 1.8677719643949175, + "grad_norm": 0.9022164344787598, + "learning_rate": 7.548480637950825e-05, + "loss": 2.6968, + "step": 20616 + }, + { + "epoch": 1.8678625625693641, + "grad_norm": 0.9624186754226685, + "learning_rate": 7.54787651785175e-05, + "loss": 2.5735, + "step": 20617 + }, + { + "epoch": 1.8679531607438111, + "grad_norm": 0.9179377555847168, + "learning_rate": 7.547272397752674e-05, + "loss": 2.7303, + "step": 20618 + }, + { + "epoch": 1.8680437589182577, + "grad_norm": 0.787907063961029, + "learning_rate": 7.546668277653598e-05, + "loss": 2.0624, + "step": 20619 + }, + { + "epoch": 1.8681343570927047, + "grad_norm": 0.9176207780838013, + "learning_rate": 7.546064157554523e-05, + "loss": 2.7602, + "step": 20620 + }, + { + "epoch": 1.8682249552671513, + "grad_norm": 0.8865274786949158, + "learning_rate": 7.545460037455446e-05, + "loss": 2.9136, + "step": 20621 + }, + { + "epoch": 1.8683155534415983, + "grad_norm": 0.9467405080795288, + "learning_rate": 7.544855917356371e-05, + "loss": 2.6845, + "step": 20622 + }, + { + "epoch": 1.8684061516160448, + "grad_norm": 0.8759433031082153, + "learning_rate": 7.544251797257294e-05, + "loss": 2.5939, + "step": 20623 + }, + { + "epoch": 1.8684967497904919, + "grad_norm": 0.8377286195755005, + "learning_rate": 7.54364767715822e-05, + "loss": 2.817, + "step": 20624 + }, + { + "epoch": 1.8685873479649384, + "grad_norm": 0.9273084402084351, + "learning_rate": 7.543043557059144e-05, + "loss": 2.0001, + "step": 20625 + }, + { + "epoch": 1.8686779461393854, + "grad_norm": 0.7773498296737671, + "learning_rate": 7.542439436960068e-05, + "loss": 2.1688, + "step": 20626 + }, + { + "epoch": 1.868768544313832, + "grad_norm": 0.9439284205436707, + "learning_rate": 7.541835316860992e-05, + "loss": 2.829, + "step": 20627 + }, + { + "epoch": 1.868859142488279, + "grad_norm": 0.9191653728485107, + "learning_rate": 7.541231196761917e-05, + "loss": 2.8772, + "step": 20628 + }, + { + "epoch": 1.8689497406627256, + "grad_norm": 0.9655677080154419, + "learning_rate": 7.54062707666284e-05, + "loss": 2.5468, + "step": 20629 + }, + { + "epoch": 1.8690403388371726, + "grad_norm": 0.9126403331756592, + "learning_rate": 7.540022956563765e-05, + "loss": 2.5306, + "step": 20630 + }, + { + "epoch": 1.8691309370116191, + "grad_norm": 1.0225191116333008, + "learning_rate": 7.53941883646469e-05, + "loss": 2.7642, + "step": 20631 + }, + { + "epoch": 1.8692215351860662, + "grad_norm": 0.8511524200439453, + "learning_rate": 7.538814716365614e-05, + "loss": 2.4707, + "step": 20632 + }, + { + "epoch": 1.8693121333605127, + "grad_norm": 0.9617327451705933, + "learning_rate": 7.538210596266538e-05, + "loss": 2.3448, + "step": 20633 + }, + { + "epoch": 1.8694027315349597, + "grad_norm": 0.8616581559181213, + "learning_rate": 7.537606476167462e-05, + "loss": 2.692, + "step": 20634 + }, + { + "epoch": 1.8694933297094063, + "grad_norm": 0.8572224974632263, + "learning_rate": 7.537002356068387e-05, + "loss": 2.6271, + "step": 20635 + }, + { + "epoch": 1.8695839278838533, + "grad_norm": 0.9031741619110107, + "learning_rate": 7.53639823596931e-05, + "loss": 2.9844, + "step": 20636 + }, + { + "epoch": 1.8696745260582999, + "grad_norm": 0.7771698236465454, + "learning_rate": 7.535794115870235e-05, + "loss": 2.2687, + "step": 20637 + }, + { + "epoch": 1.8697651242327467, + "grad_norm": 0.9633823037147522, + "learning_rate": 7.535189995771159e-05, + "loss": 2.5237, + "step": 20638 + }, + { + "epoch": 1.8698557224071934, + "grad_norm": 0.9094836115837097, + "learning_rate": 7.534585875672085e-05, + "loss": 2.8092, + "step": 20639 + }, + { + "epoch": 1.8699463205816402, + "grad_norm": 0.8367407917976379, + "learning_rate": 7.533981755573008e-05, + "loss": 1.7697, + "step": 20640 + }, + { + "epoch": 1.870036918756087, + "grad_norm": 0.7622293829917908, + "learning_rate": 7.533377635473933e-05, + "loss": 2.0479, + "step": 20641 + }, + { + "epoch": 1.8701275169305338, + "grad_norm": 0.8589248657226562, + "learning_rate": 7.532773515374856e-05, + "loss": 2.7168, + "step": 20642 + }, + { + "epoch": 1.8702181151049806, + "grad_norm": 0.9010306000709534, + "learning_rate": 7.532169395275781e-05, + "loss": 2.4921, + "step": 20643 + }, + { + "epoch": 1.8703087132794274, + "grad_norm": 0.8984118103981018, + "learning_rate": 7.531565275176705e-05, + "loss": 2.734, + "step": 20644 + }, + { + "epoch": 1.8703993114538742, + "grad_norm": 0.8959285616874695, + "learning_rate": 7.53096115507763e-05, + "loss": 2.4935, + "step": 20645 + }, + { + "epoch": 1.870489909628321, + "grad_norm": 0.8517422676086426, + "learning_rate": 7.530357034978554e-05, + "loss": 2.6361, + "step": 20646 + }, + { + "epoch": 1.8705805078027677, + "grad_norm": 0.9353243112564087, + "learning_rate": 7.529752914879479e-05, + "loss": 2.8496, + "step": 20647 + }, + { + "epoch": 1.8706711059772145, + "grad_norm": 0.8087140917778015, + "learning_rate": 7.529148794780402e-05, + "loss": 2.5535, + "step": 20648 + }, + { + "epoch": 1.8707617041516613, + "grad_norm": 0.9102082848548889, + "learning_rate": 7.528544674681327e-05, + "loss": 2.6824, + "step": 20649 + }, + { + "epoch": 1.870852302326108, + "grad_norm": 0.9467134475708008, + "learning_rate": 7.527940554582252e-05, + "loss": 2.5059, + "step": 20650 + }, + { + "epoch": 1.870942900500555, + "grad_norm": 0.9434103965759277, + "learning_rate": 7.527336434483175e-05, + "loss": 2.7189, + "step": 20651 + }, + { + "epoch": 1.8710334986750017, + "grad_norm": 0.9452081322669983, + "learning_rate": 7.5267323143841e-05, + "loss": 2.7019, + "step": 20652 + }, + { + "epoch": 1.8711240968494485, + "grad_norm": 0.8674378395080566, + "learning_rate": 7.526128194285023e-05, + "loss": 2.6681, + "step": 20653 + }, + { + "epoch": 1.8712146950238953, + "grad_norm": 0.8626528978347778, + "learning_rate": 7.52552407418595e-05, + "loss": 2.6806, + "step": 20654 + }, + { + "epoch": 1.871305293198342, + "grad_norm": 0.8244199156761169, + "learning_rate": 7.524919954086873e-05, + "loss": 1.8844, + "step": 20655 + }, + { + "epoch": 1.8713958913727888, + "grad_norm": 0.8929108381271362, + "learning_rate": 7.524315833987798e-05, + "loss": 2.7133, + "step": 20656 + }, + { + "epoch": 1.8714864895472356, + "grad_norm": 0.8398464918136597, + "learning_rate": 7.523711713888721e-05, + "loss": 2.646, + "step": 20657 + }, + { + "epoch": 1.8715770877216824, + "grad_norm": 1.0237754583358765, + "learning_rate": 7.523107593789646e-05, + "loss": 2.6458, + "step": 20658 + }, + { + "epoch": 1.8716676858961292, + "grad_norm": 0.9993488192558289, + "learning_rate": 7.522503473690569e-05, + "loss": 2.7676, + "step": 20659 + }, + { + "epoch": 1.871758284070576, + "grad_norm": 0.909079372882843, + "learning_rate": 7.521899353591494e-05, + "loss": 2.7939, + "step": 20660 + }, + { + "epoch": 1.8718488822450228, + "grad_norm": 0.9182084202766418, + "learning_rate": 7.521295233492419e-05, + "loss": 2.6314, + "step": 20661 + }, + { + "epoch": 1.8719394804194696, + "grad_norm": 0.8800373673439026, + "learning_rate": 7.520691113393344e-05, + "loss": 3.1536, + "step": 20662 + }, + { + "epoch": 1.8720300785939163, + "grad_norm": 0.9025242328643799, + "learning_rate": 7.520086993294267e-05, + "loss": 2.5272, + "step": 20663 + }, + { + "epoch": 1.8721206767683631, + "grad_norm": 0.8615577220916748, + "learning_rate": 7.519482873195192e-05, + "loss": 2.8656, + "step": 20664 + }, + { + "epoch": 1.87221127494281, + "grad_norm": 0.8523828983306885, + "learning_rate": 7.518878753096115e-05, + "loss": 2.6192, + "step": 20665 + }, + { + "epoch": 1.8723018731172567, + "grad_norm": 0.9127860069274902, + "learning_rate": 7.51827463299704e-05, + "loss": 2.8759, + "step": 20666 + }, + { + "epoch": 1.8723924712917035, + "grad_norm": 0.9121459722518921, + "learning_rate": 7.517670512897965e-05, + "loss": 2.9481, + "step": 20667 + }, + { + "epoch": 1.8724830694661503, + "grad_norm": 0.8963260650634766, + "learning_rate": 7.517066392798888e-05, + "loss": 2.7635, + "step": 20668 + }, + { + "epoch": 1.872573667640597, + "grad_norm": 0.9351972341537476, + "learning_rate": 7.516462272699814e-05, + "loss": 2.7075, + "step": 20669 + }, + { + "epoch": 1.8726642658150439, + "grad_norm": 0.9358455538749695, + "learning_rate": 7.515858152600738e-05, + "loss": 2.7971, + "step": 20670 + }, + { + "epoch": 1.8727548639894906, + "grad_norm": 0.9935629963874817, + "learning_rate": 7.515254032501662e-05, + "loss": 2.8508, + "step": 20671 + }, + { + "epoch": 1.8728454621639374, + "grad_norm": 0.8639551401138306, + "learning_rate": 7.514649912402586e-05, + "loss": 2.4921, + "step": 20672 + }, + { + "epoch": 1.8729360603383842, + "grad_norm": 0.7589502930641174, + "learning_rate": 7.51404579230351e-05, + "loss": 2.2114, + "step": 20673 + }, + { + "epoch": 1.873026658512831, + "grad_norm": 0.9420322775840759, + "learning_rate": 7.513441672204434e-05, + "loss": 2.7311, + "step": 20674 + }, + { + "epoch": 1.8731172566872778, + "grad_norm": 0.8902769088745117, + "learning_rate": 7.512837552105359e-05, + "loss": 2.7373, + "step": 20675 + }, + { + "epoch": 1.8732078548617246, + "grad_norm": 0.990742027759552, + "learning_rate": 7.512233432006283e-05, + "loss": 2.6094, + "step": 20676 + }, + { + "epoch": 1.8732984530361714, + "grad_norm": 0.9475538730621338, + "learning_rate": 7.511629311907208e-05, + "loss": 2.931, + "step": 20677 + }, + { + "epoch": 1.8733890512106182, + "grad_norm": 0.8972119092941284, + "learning_rate": 7.511025191808132e-05, + "loss": 2.5432, + "step": 20678 + }, + { + "epoch": 1.873479649385065, + "grad_norm": 0.8530491590499878, + "learning_rate": 7.510421071709056e-05, + "loss": 2.5673, + "step": 20679 + }, + { + "epoch": 1.8735702475595115, + "grad_norm": 0.8996081948280334, + "learning_rate": 7.50981695160998e-05, + "loss": 2.7512, + "step": 20680 + }, + { + "epoch": 1.8736608457339585, + "grad_norm": 0.9554506540298462, + "learning_rate": 7.509212831510905e-05, + "loss": 2.6746, + "step": 20681 + }, + { + "epoch": 1.873751443908405, + "grad_norm": 0.7903851866722107, + "learning_rate": 7.508608711411829e-05, + "loss": 1.9681, + "step": 20682 + }, + { + "epoch": 1.873842042082852, + "grad_norm": 0.9215217232704163, + "learning_rate": 7.508004591312753e-05, + "loss": 2.739, + "step": 20683 + }, + { + "epoch": 1.8739326402572987, + "grad_norm": 0.8058003783226013, + "learning_rate": 7.507400471213677e-05, + "loss": 2.0006, + "step": 20684 + }, + { + "epoch": 1.8740232384317457, + "grad_norm": 0.908415675163269, + "learning_rate": 7.506796351114602e-05, + "loss": 2.6961, + "step": 20685 + }, + { + "epoch": 1.8741138366061922, + "grad_norm": 0.951839029788971, + "learning_rate": 7.506192231015527e-05, + "loss": 2.7643, + "step": 20686 + }, + { + "epoch": 1.8742044347806393, + "grad_norm": 0.9414897561073303, + "learning_rate": 7.50558811091645e-05, + "loss": 2.8975, + "step": 20687 + }, + { + "epoch": 1.8742950329550858, + "grad_norm": 0.935900866985321, + "learning_rate": 7.504983990817375e-05, + "loss": 2.6088, + "step": 20688 + }, + { + "epoch": 1.8743856311295328, + "grad_norm": 0.9001939296722412, + "learning_rate": 7.504379870718299e-05, + "loss": 2.861, + "step": 20689 + }, + { + "epoch": 1.8744762293039794, + "grad_norm": 0.9364535212516785, + "learning_rate": 7.503775750619223e-05, + "loss": 2.8969, + "step": 20690 + }, + { + "epoch": 1.8745668274784264, + "grad_norm": 0.8696790337562561, + "learning_rate": 7.503171630520148e-05, + "loss": 2.7353, + "step": 20691 + }, + { + "epoch": 1.874657425652873, + "grad_norm": 0.7507516145706177, + "learning_rate": 7.502567510421073e-05, + "loss": 1.9001, + "step": 20692 + }, + { + "epoch": 1.87474802382732, + "grad_norm": 0.909278392791748, + "learning_rate": 7.501963390321996e-05, + "loss": 2.601, + "step": 20693 + }, + { + "epoch": 1.8748386220017665, + "grad_norm": 0.8429240584373474, + "learning_rate": 7.501359270222921e-05, + "loss": 2.0903, + "step": 20694 + }, + { + "epoch": 1.8749292201762136, + "grad_norm": 0.8801158666610718, + "learning_rate": 7.500755150123844e-05, + "loss": 2.6048, + "step": 20695 + }, + { + "epoch": 1.8750198183506601, + "grad_norm": 0.8936107754707336, + "learning_rate": 7.500151030024769e-05, + "loss": 2.5753, + "step": 20696 + }, + { + "epoch": 1.8751104165251071, + "grad_norm": 0.9598868489265442, + "learning_rate": 7.499546909925693e-05, + "loss": 2.649, + "step": 20697 + }, + { + "epoch": 1.8752010146995537, + "grad_norm": 0.8906314373016357, + "learning_rate": 7.498942789826617e-05, + "loss": 2.7123, + "step": 20698 + }, + { + "epoch": 1.8752916128740007, + "grad_norm": 0.8856261968612671, + "learning_rate": 7.498338669727542e-05, + "loss": 2.5202, + "step": 20699 + }, + { + "epoch": 1.8753822110484473, + "grad_norm": 0.8535234928131104, + "learning_rate": 7.497734549628467e-05, + "loss": 2.5477, + "step": 20700 + }, + { + "epoch": 1.8754728092228943, + "grad_norm": 0.988138735294342, + "learning_rate": 7.49713042952939e-05, + "loss": 2.5692, + "step": 20701 + }, + { + "epoch": 1.8755634073973408, + "grad_norm": 0.9267114996910095, + "learning_rate": 7.496526309430315e-05, + "loss": 2.8155, + "step": 20702 + }, + { + "epoch": 1.8756540055717879, + "grad_norm": 0.9409618377685547, + "learning_rate": 7.49592218933124e-05, + "loss": 2.5601, + "step": 20703 + }, + { + "epoch": 1.8757446037462344, + "grad_norm": 0.9163870215415955, + "learning_rate": 7.495318069232163e-05, + "loss": 2.649, + "step": 20704 + }, + { + "epoch": 1.8758352019206814, + "grad_norm": 0.8855432868003845, + "learning_rate": 7.494713949133088e-05, + "loss": 2.2614, + "step": 20705 + }, + { + "epoch": 1.875925800095128, + "grad_norm": 0.8657905459403992, + "learning_rate": 7.494109829034013e-05, + "loss": 2.4583, + "step": 20706 + }, + { + "epoch": 1.876016398269575, + "grad_norm": 0.8717722296714783, + "learning_rate": 7.493505708934937e-05, + "loss": 2.7392, + "step": 20707 + }, + { + "epoch": 1.8761069964440216, + "grad_norm": 0.776357114315033, + "learning_rate": 7.492901588835861e-05, + "loss": 1.9699, + "step": 20708 + }, + { + "epoch": 1.8761975946184686, + "grad_norm": 0.887251615524292, + "learning_rate": 7.492297468736786e-05, + "loss": 2.4809, + "step": 20709 + }, + { + "epoch": 1.8762881927929151, + "grad_norm": 0.8989970088005066, + "learning_rate": 7.491693348637709e-05, + "loss": 2.8915, + "step": 20710 + }, + { + "epoch": 1.8763787909673622, + "grad_norm": 0.920628547668457, + "learning_rate": 7.491089228538634e-05, + "loss": 2.7769, + "step": 20711 + }, + { + "epoch": 1.8764693891418087, + "grad_norm": 0.8781797885894775, + "learning_rate": 7.490485108439557e-05, + "loss": 2.5952, + "step": 20712 + }, + { + "epoch": 1.8765599873162557, + "grad_norm": 0.8202977776527405, + "learning_rate": 7.489880988340482e-05, + "loss": 2.4419, + "step": 20713 + }, + { + "epoch": 1.8766505854907023, + "grad_norm": 0.8782743811607361, + "learning_rate": 7.489276868241407e-05, + "loss": 2.7942, + "step": 20714 + }, + { + "epoch": 1.8767411836651493, + "grad_norm": 0.930004358291626, + "learning_rate": 7.488672748142331e-05, + "loss": 2.6659, + "step": 20715 + }, + { + "epoch": 1.8768317818395959, + "grad_norm": 0.8985806703567505, + "learning_rate": 7.488068628043255e-05, + "loss": 2.6306, + "step": 20716 + }, + { + "epoch": 1.8769223800140429, + "grad_norm": 0.8183773159980774, + "learning_rate": 7.48746450794418e-05, + "loss": 2.6785, + "step": 20717 + }, + { + "epoch": 1.8770129781884894, + "grad_norm": 0.7467184066772461, + "learning_rate": 7.486860387845104e-05, + "loss": 2.0487, + "step": 20718 + }, + { + "epoch": 1.8771035763629362, + "grad_norm": 1.0107358694076538, + "learning_rate": 7.486256267746028e-05, + "loss": 2.9002, + "step": 20719 + }, + { + "epoch": 1.877194174537383, + "grad_norm": 0.8130970001220703, + "learning_rate": 7.485652147646953e-05, + "loss": 2.5611, + "step": 20720 + }, + { + "epoch": 1.8772847727118298, + "grad_norm": 0.8654338717460632, + "learning_rate": 7.485048027547877e-05, + "loss": 2.6404, + "step": 20721 + }, + { + "epoch": 1.8773753708862766, + "grad_norm": 0.9242512583732605, + "learning_rate": 7.484443907448802e-05, + "loss": 2.8771, + "step": 20722 + }, + { + "epoch": 1.8774659690607234, + "grad_norm": 0.929704487323761, + "learning_rate": 7.483839787349726e-05, + "loss": 2.7162, + "step": 20723 + }, + { + "epoch": 1.8775565672351702, + "grad_norm": 0.9782583713531494, + "learning_rate": 7.48323566725065e-05, + "loss": 2.8746, + "step": 20724 + }, + { + "epoch": 1.877647165409617, + "grad_norm": 0.9237008094787598, + "learning_rate": 7.482631547151574e-05, + "loss": 3.0447, + "step": 20725 + }, + { + "epoch": 1.8777377635840637, + "grad_norm": 0.7880338430404663, + "learning_rate": 7.482027427052498e-05, + "loss": 2.2691, + "step": 20726 + }, + { + "epoch": 1.8778283617585105, + "grad_norm": 0.9296515583992004, + "learning_rate": 7.481423306953422e-05, + "loss": 2.7415, + "step": 20727 + }, + { + "epoch": 1.8779189599329573, + "grad_norm": 0.8627997636795044, + "learning_rate": 7.480819186854347e-05, + "loss": 2.7096, + "step": 20728 + }, + { + "epoch": 1.878009558107404, + "grad_norm": 0.7869163751602173, + "learning_rate": 7.480215066755271e-05, + "loss": 2.2062, + "step": 20729 + }, + { + "epoch": 1.878100156281851, + "grad_norm": 0.9262455105781555, + "learning_rate": 7.479610946656196e-05, + "loss": 2.6068, + "step": 20730 + }, + { + "epoch": 1.8781907544562977, + "grad_norm": 0.907701313495636, + "learning_rate": 7.47900682655712e-05, + "loss": 2.8405, + "step": 20731 + }, + { + "epoch": 1.8782813526307445, + "grad_norm": 1.0652027130126953, + "learning_rate": 7.478402706458044e-05, + "loss": 2.5696, + "step": 20732 + }, + { + "epoch": 1.8783719508051913, + "grad_norm": 0.9343970417976379, + "learning_rate": 7.477798586358968e-05, + "loss": 2.8256, + "step": 20733 + }, + { + "epoch": 1.878462548979638, + "grad_norm": 1.0390098094940186, + "learning_rate": 7.477194466259892e-05, + "loss": 2.591, + "step": 20734 + }, + { + "epoch": 1.8785531471540848, + "grad_norm": 0.9479806423187256, + "learning_rate": 7.476590346160817e-05, + "loss": 2.6281, + "step": 20735 + }, + { + "epoch": 1.8786437453285316, + "grad_norm": 1.0664376020431519, + "learning_rate": 7.475986226061742e-05, + "loss": 2.82, + "step": 20736 + }, + { + "epoch": 1.8787343435029784, + "grad_norm": 0.8958094120025635, + "learning_rate": 7.475382105962667e-05, + "loss": 2.7229, + "step": 20737 + }, + { + "epoch": 1.8788249416774252, + "grad_norm": 0.9290590286254883, + "learning_rate": 7.47477798586359e-05, + "loss": 2.8171, + "step": 20738 + }, + { + "epoch": 1.878915539851872, + "grad_norm": 0.9415106773376465, + "learning_rate": 7.474173865764515e-05, + "loss": 2.7719, + "step": 20739 + }, + { + "epoch": 1.8790061380263188, + "grad_norm": 0.859859824180603, + "learning_rate": 7.473569745665438e-05, + "loss": 2.6597, + "step": 20740 + }, + { + "epoch": 1.8790967362007656, + "grad_norm": 0.9579901695251465, + "learning_rate": 7.472965625566363e-05, + "loss": 2.5824, + "step": 20741 + }, + { + "epoch": 1.8791873343752123, + "grad_norm": 0.8898912072181702, + "learning_rate": 7.472361505467286e-05, + "loss": 2.8104, + "step": 20742 + }, + { + "epoch": 1.8792779325496591, + "grad_norm": 0.9215490221977234, + "learning_rate": 7.471757385368211e-05, + "loss": 2.6836, + "step": 20743 + }, + { + "epoch": 1.879368530724106, + "grad_norm": 0.9113911986351013, + "learning_rate": 7.471153265269136e-05, + "loss": 2.7752, + "step": 20744 + }, + { + "epoch": 1.8794591288985527, + "grad_norm": 0.9873340725898743, + "learning_rate": 7.470549145170061e-05, + "loss": 2.7162, + "step": 20745 + }, + { + "epoch": 1.8795497270729995, + "grad_norm": 1.0455520153045654, + "learning_rate": 7.469945025070984e-05, + "loss": 2.8915, + "step": 20746 + }, + { + "epoch": 1.8796403252474463, + "grad_norm": 0.957438588142395, + "learning_rate": 7.469340904971909e-05, + "loss": 2.9183, + "step": 20747 + }, + { + "epoch": 1.879730923421893, + "grad_norm": 0.9321348071098328, + "learning_rate": 7.468736784872832e-05, + "loss": 2.6474, + "step": 20748 + }, + { + "epoch": 1.8798215215963399, + "grad_norm": 0.7774617671966553, + "learning_rate": 7.468132664773757e-05, + "loss": 2.054, + "step": 20749 + }, + { + "epoch": 1.8799121197707866, + "grad_norm": 0.8453069925308228, + "learning_rate": 7.467528544674682e-05, + "loss": 2.5839, + "step": 20750 + }, + { + "epoch": 1.8800027179452334, + "grad_norm": 0.9213162660598755, + "learning_rate": 7.466924424575607e-05, + "loss": 2.7162, + "step": 20751 + }, + { + "epoch": 1.8800933161196802, + "grad_norm": 0.6853042840957642, + "learning_rate": 7.46632030447653e-05, + "loss": 1.3165, + "step": 20752 + }, + { + "epoch": 1.880183914294127, + "grad_norm": 0.9593327045440674, + "learning_rate": 7.465716184377455e-05, + "loss": 2.5246, + "step": 20753 + }, + { + "epoch": 1.8802745124685738, + "grad_norm": 0.8893232941627502, + "learning_rate": 7.46511206427838e-05, + "loss": 2.7346, + "step": 20754 + }, + { + "epoch": 1.8803651106430206, + "grad_norm": 0.8653284311294556, + "learning_rate": 7.464507944179303e-05, + "loss": 2.755, + "step": 20755 + }, + { + "epoch": 1.8804557088174674, + "grad_norm": 0.9434458017349243, + "learning_rate": 7.463903824080228e-05, + "loss": 2.7515, + "step": 20756 + }, + { + "epoch": 1.8805463069919142, + "grad_norm": 0.9962179064750671, + "learning_rate": 7.463299703981151e-05, + "loss": 2.4905, + "step": 20757 + }, + { + "epoch": 1.880636905166361, + "grad_norm": 0.7292430400848389, + "learning_rate": 7.462695583882076e-05, + "loss": 1.8334, + "step": 20758 + }, + { + "epoch": 1.8807275033408077, + "grad_norm": 0.9165195226669312, + "learning_rate": 7.462091463783e-05, + "loss": 2.7404, + "step": 20759 + }, + { + "epoch": 1.8808181015152545, + "grad_norm": 0.8883291482925415, + "learning_rate": 7.461487343683925e-05, + "loss": 2.5334, + "step": 20760 + }, + { + "epoch": 1.880908699689701, + "grad_norm": 0.8670195937156677, + "learning_rate": 7.460883223584849e-05, + "loss": 2.6161, + "step": 20761 + }, + { + "epoch": 1.880999297864148, + "grad_norm": 0.90550696849823, + "learning_rate": 7.460279103485774e-05, + "loss": 2.9242, + "step": 20762 + }, + { + "epoch": 1.8810898960385947, + "grad_norm": 0.8289362192153931, + "learning_rate": 7.459674983386697e-05, + "loss": 2.4904, + "step": 20763 + }, + { + "epoch": 1.8811804942130417, + "grad_norm": 0.888378381729126, + "learning_rate": 7.459070863287622e-05, + "loss": 2.6108, + "step": 20764 + }, + { + "epoch": 1.8812710923874882, + "grad_norm": 0.914753794670105, + "learning_rate": 7.458466743188545e-05, + "loss": 2.7239, + "step": 20765 + }, + { + "epoch": 1.8813616905619353, + "grad_norm": 0.75483638048172, + "learning_rate": 7.457862623089471e-05, + "loss": 2.2243, + "step": 20766 + }, + { + "epoch": 1.8814522887363818, + "grad_norm": 0.967326283454895, + "learning_rate": 7.457258502990395e-05, + "loss": 2.518, + "step": 20767 + }, + { + "epoch": 1.8815428869108288, + "grad_norm": 0.8748853206634521, + "learning_rate": 7.45665438289132e-05, + "loss": 2.6956, + "step": 20768 + }, + { + "epoch": 1.8816334850852754, + "grad_norm": 0.8657510280609131, + "learning_rate": 7.456050262792244e-05, + "loss": 2.5759, + "step": 20769 + }, + { + "epoch": 1.8817240832597224, + "grad_norm": 0.8228154182434082, + "learning_rate": 7.455446142693168e-05, + "loss": 1.9396, + "step": 20770 + }, + { + "epoch": 1.881814681434169, + "grad_norm": 0.8809016942977905, + "learning_rate": 7.454842022594092e-05, + "loss": 2.7033, + "step": 20771 + }, + { + "epoch": 1.881905279608616, + "grad_norm": 0.8759298324584961, + "learning_rate": 7.454237902495016e-05, + "loss": 2.6742, + "step": 20772 + }, + { + "epoch": 1.8819958777830625, + "grad_norm": 0.9445110559463501, + "learning_rate": 7.45363378239594e-05, + "loss": 2.7362, + "step": 20773 + }, + { + "epoch": 1.8820864759575096, + "grad_norm": 0.953280508518219, + "learning_rate": 7.453029662296865e-05, + "loss": 2.7409, + "step": 20774 + }, + { + "epoch": 1.8821770741319561, + "grad_norm": 0.871025800704956, + "learning_rate": 7.45242554219779e-05, + "loss": 2.66, + "step": 20775 + }, + { + "epoch": 1.8822676723064031, + "grad_norm": 0.8873557448387146, + "learning_rate": 7.451821422098713e-05, + "loss": 2.7082, + "step": 20776 + }, + { + "epoch": 1.8823582704808497, + "grad_norm": 0.7197635173797607, + "learning_rate": 7.451217301999638e-05, + "loss": 1.5026, + "step": 20777 + }, + { + "epoch": 1.8824488686552967, + "grad_norm": 0.8800001740455627, + "learning_rate": 7.450613181900562e-05, + "loss": 2.7405, + "step": 20778 + }, + { + "epoch": 1.8825394668297433, + "grad_norm": 1.02481210231781, + "learning_rate": 7.450009061801486e-05, + "loss": 3.1328, + "step": 20779 + }, + { + "epoch": 1.8826300650041903, + "grad_norm": 0.9225991368293762, + "learning_rate": 7.44940494170241e-05, + "loss": 2.5118, + "step": 20780 + }, + { + "epoch": 1.8827206631786368, + "grad_norm": 0.920170783996582, + "learning_rate": 7.448800821603336e-05, + "loss": 2.8196, + "step": 20781 + }, + { + "epoch": 1.8828112613530839, + "grad_norm": 0.902776300907135, + "learning_rate": 7.448196701504259e-05, + "loss": 2.6498, + "step": 20782 + }, + { + "epoch": 1.8829018595275304, + "grad_norm": 0.9204795956611633, + "learning_rate": 7.447592581405184e-05, + "loss": 2.6441, + "step": 20783 + }, + { + "epoch": 1.8829924577019774, + "grad_norm": 0.8443091511726379, + "learning_rate": 7.446988461306107e-05, + "loss": 2.5189, + "step": 20784 + }, + { + "epoch": 1.883083055876424, + "grad_norm": 0.9062831401824951, + "learning_rate": 7.446384341207032e-05, + "loss": 2.4832, + "step": 20785 + }, + { + "epoch": 1.883173654050871, + "grad_norm": 0.7692003846168518, + "learning_rate": 7.445780221107957e-05, + "loss": 2.1895, + "step": 20786 + }, + { + "epoch": 1.8832642522253176, + "grad_norm": 0.9007678031921387, + "learning_rate": 7.44517610100888e-05, + "loss": 2.7777, + "step": 20787 + }, + { + "epoch": 1.8833548503997646, + "grad_norm": 0.8560895919799805, + "learning_rate": 7.444571980909805e-05, + "loss": 2.4287, + "step": 20788 + }, + { + "epoch": 1.8834454485742111, + "grad_norm": 0.897391140460968, + "learning_rate": 7.44396786081073e-05, + "loss": 2.627, + "step": 20789 + }, + { + "epoch": 1.8835360467486582, + "grad_norm": 0.8910406231880188, + "learning_rate": 7.443363740711655e-05, + "loss": 2.7359, + "step": 20790 + }, + { + "epoch": 1.8836266449231047, + "grad_norm": 0.8819050788879395, + "learning_rate": 7.442759620612578e-05, + "loss": 2.431, + "step": 20791 + }, + { + "epoch": 1.8837172430975517, + "grad_norm": 0.7696633338928223, + "learning_rate": 7.442155500513503e-05, + "loss": 2.0386, + "step": 20792 + }, + { + "epoch": 1.8838078412719983, + "grad_norm": 0.8546013236045837, + "learning_rate": 7.441551380414426e-05, + "loss": 1.9431, + "step": 20793 + }, + { + "epoch": 1.8838984394464453, + "grad_norm": 0.9396781921386719, + "learning_rate": 7.440947260315351e-05, + "loss": 2.6408, + "step": 20794 + }, + { + "epoch": 1.8839890376208919, + "grad_norm": 0.9673017859458923, + "learning_rate": 7.440343140216274e-05, + "loss": 2.7114, + "step": 20795 + }, + { + "epoch": 1.8840796357953389, + "grad_norm": 0.9022378325462341, + "learning_rate": 7.4397390201172e-05, + "loss": 2.7726, + "step": 20796 + }, + { + "epoch": 1.8841702339697854, + "grad_norm": 0.9066103100776672, + "learning_rate": 7.439134900018124e-05, + "loss": 2.8678, + "step": 20797 + }, + { + "epoch": 1.8842608321442325, + "grad_norm": 0.8995541930198669, + "learning_rate": 7.438530779919049e-05, + "loss": 2.6907, + "step": 20798 + }, + { + "epoch": 1.884351430318679, + "grad_norm": 0.7654350399971008, + "learning_rate": 7.437926659819972e-05, + "loss": 2.0705, + "step": 20799 + }, + { + "epoch": 1.8844420284931258, + "grad_norm": 0.8850207924842834, + "learning_rate": 7.437322539720897e-05, + "loss": 2.6489, + "step": 20800 + }, + { + "epoch": 1.8845326266675726, + "grad_norm": 0.8678693175315857, + "learning_rate": 7.436718419621822e-05, + "loss": 2.6673, + "step": 20801 + }, + { + "epoch": 1.8846232248420194, + "grad_norm": 0.9821422696113586, + "learning_rate": 7.436114299522745e-05, + "loss": 2.6397, + "step": 20802 + }, + { + "epoch": 1.8847138230164662, + "grad_norm": 0.9040132164955139, + "learning_rate": 7.43551017942367e-05, + "loss": 2.7821, + "step": 20803 + }, + { + "epoch": 1.884804421190913, + "grad_norm": 1.006536602973938, + "learning_rate": 7.434906059324595e-05, + "loss": 2.9066, + "step": 20804 + }, + { + "epoch": 1.8848950193653597, + "grad_norm": 0.9437253475189209, + "learning_rate": 7.434301939225519e-05, + "loss": 2.7348, + "step": 20805 + }, + { + "epoch": 1.8849856175398065, + "grad_norm": 0.9925093650817871, + "learning_rate": 7.433697819126443e-05, + "loss": 2.4275, + "step": 20806 + }, + { + "epoch": 1.8850762157142533, + "grad_norm": 0.9103530049324036, + "learning_rate": 7.433093699027367e-05, + "loss": 2.6602, + "step": 20807 + }, + { + "epoch": 1.8851668138887, + "grad_norm": 0.8860085606575012, + "learning_rate": 7.432489578928291e-05, + "loss": 2.652, + "step": 20808 + }, + { + "epoch": 1.885257412063147, + "grad_norm": 0.8743497729301453, + "learning_rate": 7.431885458829216e-05, + "loss": 2.5247, + "step": 20809 + }, + { + "epoch": 1.8853480102375937, + "grad_norm": 0.944260835647583, + "learning_rate": 7.431281338730139e-05, + "loss": 2.5465, + "step": 20810 + }, + { + "epoch": 1.8854386084120405, + "grad_norm": 0.8927987217903137, + "learning_rate": 7.430677218631065e-05, + "loss": 2.7123, + "step": 20811 + }, + { + "epoch": 1.8855292065864873, + "grad_norm": 0.9464439749717712, + "learning_rate": 7.430073098531989e-05, + "loss": 3.0246, + "step": 20812 + }, + { + "epoch": 1.885619804760934, + "grad_norm": 0.7988377213478088, + "learning_rate": 7.429468978432913e-05, + "loss": 2.647, + "step": 20813 + }, + { + "epoch": 1.8857104029353808, + "grad_norm": 0.8997238278388977, + "learning_rate": 7.428864858333837e-05, + "loss": 2.9166, + "step": 20814 + }, + { + "epoch": 1.8858010011098276, + "grad_norm": 0.8974297642707825, + "learning_rate": 7.428260738234761e-05, + "loss": 2.9267, + "step": 20815 + }, + { + "epoch": 1.8858915992842744, + "grad_norm": 0.9239491820335388, + "learning_rate": 7.427656618135685e-05, + "loss": 2.8494, + "step": 20816 + }, + { + "epoch": 1.8859821974587212, + "grad_norm": 0.8463714122772217, + "learning_rate": 7.42705249803661e-05, + "loss": 2.1313, + "step": 20817 + }, + { + "epoch": 1.886072795633168, + "grad_norm": 0.9129440784454346, + "learning_rate": 7.426448377937534e-05, + "loss": 2.614, + "step": 20818 + }, + { + "epoch": 1.8861633938076148, + "grad_norm": 0.8567416667938232, + "learning_rate": 7.425844257838459e-05, + "loss": 2.4557, + "step": 20819 + }, + { + "epoch": 1.8862539919820616, + "grad_norm": 0.9305804371833801, + "learning_rate": 7.425240137739383e-05, + "loss": 2.6915, + "step": 20820 + }, + { + "epoch": 1.8863445901565083, + "grad_norm": 0.8635857105255127, + "learning_rate": 7.424636017640307e-05, + "loss": 2.5932, + "step": 20821 + }, + { + "epoch": 1.8864351883309551, + "grad_norm": 0.896864652633667, + "learning_rate": 7.424031897541232e-05, + "loss": 2.7203, + "step": 20822 + }, + { + "epoch": 1.886525786505402, + "grad_norm": 0.7700363993644714, + "learning_rate": 7.423427777442155e-05, + "loss": 2.0098, + "step": 20823 + }, + { + "epoch": 1.8866163846798487, + "grad_norm": 0.9608995914459229, + "learning_rate": 7.42282365734308e-05, + "loss": 2.6465, + "step": 20824 + }, + { + "epoch": 1.8867069828542955, + "grad_norm": 0.8411058187484741, + "learning_rate": 7.422219537244004e-05, + "loss": 2.637, + "step": 20825 + }, + { + "epoch": 1.8867975810287423, + "grad_norm": 0.8953222036361694, + "learning_rate": 7.42161541714493e-05, + "loss": 2.7545, + "step": 20826 + }, + { + "epoch": 1.886888179203189, + "grad_norm": 0.903070330619812, + "learning_rate": 7.421011297045853e-05, + "loss": 2.8017, + "step": 20827 + }, + { + "epoch": 1.8869787773776359, + "grad_norm": 0.87238609790802, + "learning_rate": 7.420407176946778e-05, + "loss": 2.6814, + "step": 20828 + }, + { + "epoch": 1.8870693755520827, + "grad_norm": 0.8990077972412109, + "learning_rate": 7.419803056847701e-05, + "loss": 2.8707, + "step": 20829 + }, + { + "epoch": 1.8871599737265294, + "grad_norm": 1.0137414932250977, + "learning_rate": 7.419198936748626e-05, + "loss": 2.5895, + "step": 20830 + }, + { + "epoch": 1.8872505719009762, + "grad_norm": 0.9286887049674988, + "learning_rate": 7.41859481664955e-05, + "loss": 2.676, + "step": 20831 + }, + { + "epoch": 1.887341170075423, + "grad_norm": 0.87664794921875, + "learning_rate": 7.417990696550474e-05, + "loss": 2.7969, + "step": 20832 + }, + { + "epoch": 1.8874317682498698, + "grad_norm": 0.9631295204162598, + "learning_rate": 7.417386576451398e-05, + "loss": 2.6258, + "step": 20833 + }, + { + "epoch": 1.8875223664243166, + "grad_norm": 0.9661415815353394, + "learning_rate": 7.416782456352324e-05, + "loss": 2.4315, + "step": 20834 + }, + { + "epoch": 1.8876129645987634, + "grad_norm": 0.7591650485992432, + "learning_rate": 7.416178336253247e-05, + "loss": 1.9767, + "step": 20835 + }, + { + "epoch": 1.8877035627732102, + "grad_norm": 0.9750538468360901, + "learning_rate": 7.415574216154172e-05, + "loss": 2.8313, + "step": 20836 + }, + { + "epoch": 1.887794160947657, + "grad_norm": 0.9165111780166626, + "learning_rate": 7.414970096055097e-05, + "loss": 2.6432, + "step": 20837 + }, + { + "epoch": 1.8878847591221037, + "grad_norm": 0.9102511405944824, + "learning_rate": 7.41436597595602e-05, + "loss": 2.6722, + "step": 20838 + }, + { + "epoch": 1.8879753572965505, + "grad_norm": 0.8928884863853455, + "learning_rate": 7.413761855856945e-05, + "loss": 2.6973, + "step": 20839 + }, + { + "epoch": 1.8880659554709973, + "grad_norm": 0.9826366901397705, + "learning_rate": 7.413157735757868e-05, + "loss": 2.496, + "step": 20840 + }, + { + "epoch": 1.888156553645444, + "grad_norm": 0.8951637148857117, + "learning_rate": 7.412553615658794e-05, + "loss": 2.1295, + "step": 20841 + }, + { + "epoch": 1.8882471518198907, + "grad_norm": 0.8996809124946594, + "learning_rate": 7.411949495559718e-05, + "loss": 2.7808, + "step": 20842 + }, + { + "epoch": 1.8883377499943377, + "grad_norm": 0.9036409258842468, + "learning_rate": 7.411345375460643e-05, + "loss": 2.7021, + "step": 20843 + }, + { + "epoch": 1.8884283481687842, + "grad_norm": 0.8742799758911133, + "learning_rate": 7.410741255361566e-05, + "loss": 2.3685, + "step": 20844 + }, + { + "epoch": 1.8885189463432313, + "grad_norm": 0.8131303191184998, + "learning_rate": 7.410137135262491e-05, + "loss": 2.1734, + "step": 20845 + }, + { + "epoch": 1.8886095445176778, + "grad_norm": 0.9662442803382874, + "learning_rate": 7.409533015163414e-05, + "loss": 2.7683, + "step": 20846 + }, + { + "epoch": 1.8887001426921248, + "grad_norm": 0.931475818157196, + "learning_rate": 7.408928895064339e-05, + "loss": 2.4435, + "step": 20847 + }, + { + "epoch": 1.8887907408665714, + "grad_norm": 0.9148204922676086, + "learning_rate": 7.408324774965264e-05, + "loss": 2.562, + "step": 20848 + }, + { + "epoch": 1.8888813390410184, + "grad_norm": 0.8685190677642822, + "learning_rate": 7.407720654866188e-05, + "loss": 2.7306, + "step": 20849 + }, + { + "epoch": 1.888971937215465, + "grad_norm": 0.9744651317596436, + "learning_rate": 7.407116534767112e-05, + "loss": 3.0064, + "step": 20850 + }, + { + "epoch": 1.889062535389912, + "grad_norm": 0.8938658237457275, + "learning_rate": 7.406512414668037e-05, + "loss": 2.7967, + "step": 20851 + }, + { + "epoch": 1.8891531335643585, + "grad_norm": 0.927815318107605, + "learning_rate": 7.40590829456896e-05, + "loss": 2.3576, + "step": 20852 + }, + { + "epoch": 1.8892437317388056, + "grad_norm": 0.924933910369873, + "learning_rate": 7.405304174469885e-05, + "loss": 2.7475, + "step": 20853 + }, + { + "epoch": 1.8893343299132521, + "grad_norm": 0.9176977872848511, + "learning_rate": 7.40470005437081e-05, + "loss": 2.8731, + "step": 20854 + }, + { + "epoch": 1.8894249280876991, + "grad_norm": 0.9240277409553528, + "learning_rate": 7.404095934271733e-05, + "loss": 2.6873, + "step": 20855 + }, + { + "epoch": 1.8895155262621457, + "grad_norm": 0.9388186931610107, + "learning_rate": 7.403491814172659e-05, + "loss": 2.6543, + "step": 20856 + }, + { + "epoch": 1.8896061244365927, + "grad_norm": 0.9363269805908203, + "learning_rate": 7.402887694073582e-05, + "loss": 3.063, + "step": 20857 + }, + { + "epoch": 1.8896967226110393, + "grad_norm": 0.8200960159301758, + "learning_rate": 7.402283573974507e-05, + "loss": 1.9845, + "step": 20858 + }, + { + "epoch": 1.8897873207854863, + "grad_norm": 0.9528302550315857, + "learning_rate": 7.40167945387543e-05, + "loss": 2.7269, + "step": 20859 + }, + { + "epoch": 1.8898779189599328, + "grad_norm": 0.9758461117744446, + "learning_rate": 7.401075333776355e-05, + "loss": 2.705, + "step": 20860 + }, + { + "epoch": 1.8899685171343799, + "grad_norm": 0.8010510206222534, + "learning_rate": 7.400471213677279e-05, + "loss": 2.0506, + "step": 20861 + }, + { + "epoch": 1.8900591153088264, + "grad_norm": 0.9936564564704895, + "learning_rate": 7.399867093578204e-05, + "loss": 2.7356, + "step": 20862 + }, + { + "epoch": 1.8901497134832734, + "grad_norm": 0.9127338528633118, + "learning_rate": 7.399262973479128e-05, + "loss": 2.5153, + "step": 20863 + }, + { + "epoch": 1.89024031165772, + "grad_norm": 0.9309182167053223, + "learning_rate": 7.398658853380053e-05, + "loss": 2.5745, + "step": 20864 + }, + { + "epoch": 1.890330909832167, + "grad_norm": 0.8765797019004822, + "learning_rate": 7.398054733280976e-05, + "loss": 2.8157, + "step": 20865 + }, + { + "epoch": 1.8904215080066136, + "grad_norm": 0.8645979762077332, + "learning_rate": 7.397450613181901e-05, + "loss": 2.6677, + "step": 20866 + }, + { + "epoch": 1.8905121061810606, + "grad_norm": 0.9552477598190308, + "learning_rate": 7.396846493082825e-05, + "loss": 2.7599, + "step": 20867 + }, + { + "epoch": 1.8906027043555071, + "grad_norm": 1.013226866722107, + "learning_rate": 7.39624237298375e-05, + "loss": 2.8013, + "step": 20868 + }, + { + "epoch": 1.8906933025299542, + "grad_norm": 0.7935774326324463, + "learning_rate": 7.395638252884674e-05, + "loss": 2.024, + "step": 20869 + }, + { + "epoch": 1.8907839007044007, + "grad_norm": 0.9813008904457092, + "learning_rate": 7.395034132785598e-05, + "loss": 2.6453, + "step": 20870 + }, + { + "epoch": 1.8908744988788477, + "grad_norm": 0.8243908882141113, + "learning_rate": 7.394430012686522e-05, + "loss": 2.0699, + "step": 20871 + }, + { + "epoch": 1.8909650970532943, + "grad_norm": 0.8897958993911743, + "learning_rate": 7.393825892587447e-05, + "loss": 2.5519, + "step": 20872 + }, + { + "epoch": 1.8910556952277413, + "grad_norm": 0.877030074596405, + "learning_rate": 7.393221772488372e-05, + "loss": 2.1928, + "step": 20873 + }, + { + "epoch": 1.8911462934021879, + "grad_norm": 0.9082607626914978, + "learning_rate": 7.392617652389295e-05, + "loss": 2.659, + "step": 20874 + }, + { + "epoch": 1.8912368915766349, + "grad_norm": 0.7857770919799805, + "learning_rate": 7.39201353229022e-05, + "loss": 2.235, + "step": 20875 + }, + { + "epoch": 1.8913274897510814, + "grad_norm": 0.8926247954368591, + "learning_rate": 7.391409412191143e-05, + "loss": 2.6056, + "step": 20876 + }, + { + "epoch": 1.8914180879255285, + "grad_norm": 1.0198748111724854, + "learning_rate": 7.390805292092068e-05, + "loss": 2.5955, + "step": 20877 + }, + { + "epoch": 1.891508686099975, + "grad_norm": 0.8846321702003479, + "learning_rate": 7.390201171992993e-05, + "loss": 2.6388, + "step": 20878 + }, + { + "epoch": 1.891599284274422, + "grad_norm": 0.9055754542350769, + "learning_rate": 7.389597051893918e-05, + "loss": 2.8349, + "step": 20879 + }, + { + "epoch": 1.8916898824488686, + "grad_norm": 0.9111806750297546, + "learning_rate": 7.388992931794841e-05, + "loss": 2.5582, + "step": 20880 + }, + { + "epoch": 1.8917804806233154, + "grad_norm": 0.9092007875442505, + "learning_rate": 7.388388811695766e-05, + "loss": 2.5241, + "step": 20881 + }, + { + "epoch": 1.8918710787977622, + "grad_norm": 0.9348793029785156, + "learning_rate": 7.387784691596689e-05, + "loss": 2.6645, + "step": 20882 + }, + { + "epoch": 1.891961676972209, + "grad_norm": 1.008825421333313, + "learning_rate": 7.387180571497614e-05, + "loss": 2.7892, + "step": 20883 + }, + { + "epoch": 1.8920522751466557, + "grad_norm": 0.8802333474159241, + "learning_rate": 7.386576451398537e-05, + "loss": 2.492, + "step": 20884 + }, + { + "epoch": 1.8921428733211025, + "grad_norm": 0.8767416477203369, + "learning_rate": 7.385972331299462e-05, + "loss": 2.8371, + "step": 20885 + }, + { + "epoch": 1.8922334714955493, + "grad_norm": 0.9240865111351013, + "learning_rate": 7.385368211200387e-05, + "loss": 2.6723, + "step": 20886 + }, + { + "epoch": 1.8923240696699961, + "grad_norm": 0.9543725848197937, + "learning_rate": 7.384764091101312e-05, + "loss": 2.6536, + "step": 20887 + }, + { + "epoch": 1.892414667844443, + "grad_norm": 1.0165269374847412, + "learning_rate": 7.384159971002235e-05, + "loss": 2.4971, + "step": 20888 + }, + { + "epoch": 1.8925052660188897, + "grad_norm": 1.1622358560562134, + "learning_rate": 7.38355585090316e-05, + "loss": 2.7267, + "step": 20889 + }, + { + "epoch": 1.8925958641933365, + "grad_norm": 0.8802496194839478, + "learning_rate": 7.382951730804085e-05, + "loss": 2.4766, + "step": 20890 + }, + { + "epoch": 1.8926864623677833, + "grad_norm": 0.7895940542221069, + "learning_rate": 7.382347610705008e-05, + "loss": 1.9109, + "step": 20891 + }, + { + "epoch": 1.89277706054223, + "grad_norm": 0.8758484125137329, + "learning_rate": 7.381743490605933e-05, + "loss": 2.6216, + "step": 20892 + }, + { + "epoch": 1.8928676587166768, + "grad_norm": 0.9951639175415039, + "learning_rate": 7.381139370506858e-05, + "loss": 2.449, + "step": 20893 + }, + { + "epoch": 1.8929582568911236, + "grad_norm": 0.9039151072502136, + "learning_rate": 7.380535250407782e-05, + "loss": 2.9189, + "step": 20894 + }, + { + "epoch": 1.8930488550655704, + "grad_norm": 0.8889598250389099, + "learning_rate": 7.379931130308706e-05, + "loss": 2.832, + "step": 20895 + }, + { + "epoch": 1.8931394532400172, + "grad_norm": 1.0734717845916748, + "learning_rate": 7.37932701020963e-05, + "loss": 2.6864, + "step": 20896 + }, + { + "epoch": 1.893230051414464, + "grad_norm": 0.8955008387565613, + "learning_rate": 7.378722890110554e-05, + "loss": 2.7801, + "step": 20897 + }, + { + "epoch": 1.8933206495889108, + "grad_norm": 0.8183462619781494, + "learning_rate": 7.378118770011479e-05, + "loss": 2.7149, + "step": 20898 + }, + { + "epoch": 1.8934112477633576, + "grad_norm": 0.8744916319847107, + "learning_rate": 7.377514649912402e-05, + "loss": 2.5915, + "step": 20899 + }, + { + "epoch": 1.8935018459378044, + "grad_norm": 0.9047444462776184, + "learning_rate": 7.376910529813327e-05, + "loss": 2.5851, + "step": 20900 + }, + { + "epoch": 1.8935924441122511, + "grad_norm": 0.8785221576690674, + "learning_rate": 7.376306409714252e-05, + "loss": 2.776, + "step": 20901 + }, + { + "epoch": 1.893683042286698, + "grad_norm": 0.8787136673927307, + "learning_rate": 7.375702289615176e-05, + "loss": 2.645, + "step": 20902 + }, + { + "epoch": 1.8937736404611447, + "grad_norm": 0.9449939727783203, + "learning_rate": 7.3750981695161e-05, + "loss": 2.6567, + "step": 20903 + }, + { + "epoch": 1.8938642386355915, + "grad_norm": 0.9075304865837097, + "learning_rate": 7.374494049417025e-05, + "loss": 2.7451, + "step": 20904 + }, + { + "epoch": 1.8939548368100383, + "grad_norm": 1.0484390258789062, + "learning_rate": 7.373889929317949e-05, + "loss": 2.959, + "step": 20905 + }, + { + "epoch": 1.894045434984485, + "grad_norm": 0.9045038819313049, + "learning_rate": 7.373285809218873e-05, + "loss": 2.6381, + "step": 20906 + }, + { + "epoch": 1.8941360331589319, + "grad_norm": 1.028199315071106, + "learning_rate": 7.372681689119797e-05, + "loss": 2.7132, + "step": 20907 + }, + { + "epoch": 1.8942266313333787, + "grad_norm": 0.949894905090332, + "learning_rate": 7.372077569020722e-05, + "loss": 2.8425, + "step": 20908 + }, + { + "epoch": 1.8943172295078254, + "grad_norm": 0.9052489995956421, + "learning_rate": 7.371473448921647e-05, + "loss": 2.7712, + "step": 20909 + }, + { + "epoch": 1.8944078276822722, + "grad_norm": 0.7366258502006531, + "learning_rate": 7.37086932882257e-05, + "loss": 1.9343, + "step": 20910 + }, + { + "epoch": 1.894498425856719, + "grad_norm": 0.9151061773300171, + "learning_rate": 7.370265208723495e-05, + "loss": 2.6091, + "step": 20911 + }, + { + "epoch": 1.8945890240311658, + "grad_norm": 0.9148080945014954, + "learning_rate": 7.369661088624419e-05, + "loss": 2.8021, + "step": 20912 + }, + { + "epoch": 1.8946796222056126, + "grad_norm": 0.9312249422073364, + "learning_rate": 7.369056968525343e-05, + "loss": 2.3273, + "step": 20913 + }, + { + "epoch": 1.8947702203800594, + "grad_norm": 1.001359224319458, + "learning_rate": 7.368452848426267e-05, + "loss": 2.9087, + "step": 20914 + }, + { + "epoch": 1.8948608185545062, + "grad_norm": 0.8710689544677734, + "learning_rate": 7.367848728327191e-05, + "loss": 2.675, + "step": 20915 + }, + { + "epoch": 1.894951416728953, + "grad_norm": 0.8709700107574463, + "learning_rate": 7.367244608228116e-05, + "loss": 2.6899, + "step": 20916 + }, + { + "epoch": 1.8950420149033997, + "grad_norm": 0.8353473544120789, + "learning_rate": 7.366640488129041e-05, + "loss": 2.6669, + "step": 20917 + }, + { + "epoch": 1.8951326130778465, + "grad_norm": 0.9175320863723755, + "learning_rate": 7.366036368029964e-05, + "loss": 2.5944, + "step": 20918 + }, + { + "epoch": 1.8952232112522933, + "grad_norm": 0.9254857301712036, + "learning_rate": 7.365432247930889e-05, + "loss": 2.744, + "step": 20919 + }, + { + "epoch": 1.89531380942674, + "grad_norm": 0.8427469730377197, + "learning_rate": 7.364828127831813e-05, + "loss": 2.6177, + "step": 20920 + }, + { + "epoch": 1.895404407601187, + "grad_norm": 0.8852584362030029, + "learning_rate": 7.364224007732737e-05, + "loss": 2.5453, + "step": 20921 + }, + { + "epoch": 1.8954950057756337, + "grad_norm": 0.9430511593818665, + "learning_rate": 7.363619887633662e-05, + "loss": 2.7224, + "step": 20922 + }, + { + "epoch": 1.8955856039500802, + "grad_norm": 0.8916782140731812, + "learning_rate": 7.363015767534587e-05, + "loss": 2.7143, + "step": 20923 + }, + { + "epoch": 1.8956762021245273, + "grad_norm": 0.7635695934295654, + "learning_rate": 7.362411647435512e-05, + "loss": 2.0048, + "step": 20924 + }, + { + "epoch": 1.8957668002989738, + "grad_norm": 0.8976249694824219, + "learning_rate": 7.361807527336435e-05, + "loss": 2.7667, + "step": 20925 + }, + { + "epoch": 1.8958573984734208, + "grad_norm": 0.8656861186027527, + "learning_rate": 7.36120340723736e-05, + "loss": 2.7984, + "step": 20926 + }, + { + "epoch": 1.8959479966478674, + "grad_norm": 0.9123824834823608, + "learning_rate": 7.360599287138283e-05, + "loss": 2.7301, + "step": 20927 + }, + { + "epoch": 1.8960385948223144, + "grad_norm": 0.9524587988853455, + "learning_rate": 7.359995167039208e-05, + "loss": 2.7485, + "step": 20928 + }, + { + "epoch": 1.896129192996761, + "grad_norm": 0.8406687378883362, + "learning_rate": 7.359391046940131e-05, + "loss": 2.545, + "step": 20929 + }, + { + "epoch": 1.896219791171208, + "grad_norm": 0.9938035011291504, + "learning_rate": 7.358786926841056e-05, + "loss": 2.4888, + "step": 20930 + }, + { + "epoch": 1.8963103893456545, + "grad_norm": 0.9036288261413574, + "learning_rate": 7.358182806741981e-05, + "loss": 2.5765, + "step": 20931 + }, + { + "epoch": 1.8964009875201016, + "grad_norm": 0.9715306758880615, + "learning_rate": 7.357578686642906e-05, + "loss": 2.7474, + "step": 20932 + }, + { + "epoch": 1.8964915856945481, + "grad_norm": 0.9083801507949829, + "learning_rate": 7.356974566543829e-05, + "loss": 2.4449, + "step": 20933 + }, + { + "epoch": 1.8965821838689951, + "grad_norm": 0.781195342540741, + "learning_rate": 7.356370446444754e-05, + "loss": 1.8321, + "step": 20934 + }, + { + "epoch": 1.8966727820434417, + "grad_norm": 0.90357905626297, + "learning_rate": 7.355766326345677e-05, + "loss": 2.7933, + "step": 20935 + }, + { + "epoch": 1.8967633802178887, + "grad_norm": 0.8402625322341919, + "learning_rate": 7.355162206246602e-05, + "loss": 2.06, + "step": 20936 + }, + { + "epoch": 1.8968539783923353, + "grad_norm": 0.8514797687530518, + "learning_rate": 7.354558086147527e-05, + "loss": 2.4846, + "step": 20937 + }, + { + "epoch": 1.8969445765667823, + "grad_norm": 0.883826494216919, + "learning_rate": 7.353953966048451e-05, + "loss": 2.7024, + "step": 20938 + }, + { + "epoch": 1.8970351747412288, + "grad_norm": 0.9000139236450195, + "learning_rate": 7.353349845949375e-05, + "loss": 2.6262, + "step": 20939 + }, + { + "epoch": 1.8971257729156759, + "grad_norm": 0.9385125637054443, + "learning_rate": 7.3527457258503e-05, + "loss": 2.8679, + "step": 20940 + }, + { + "epoch": 1.8972163710901224, + "grad_norm": 0.8542750477790833, + "learning_rate": 7.352141605751224e-05, + "loss": 2.6857, + "step": 20941 + }, + { + "epoch": 1.8973069692645694, + "grad_norm": 0.8480224013328552, + "learning_rate": 7.351537485652148e-05, + "loss": 2.7042, + "step": 20942 + }, + { + "epoch": 1.897397567439016, + "grad_norm": 0.839003324508667, + "learning_rate": 7.350933365553073e-05, + "loss": 2.5452, + "step": 20943 + }, + { + "epoch": 1.897488165613463, + "grad_norm": 0.923685610294342, + "learning_rate": 7.350329245453996e-05, + "loss": 2.6173, + "step": 20944 + }, + { + "epoch": 1.8975787637879096, + "grad_norm": 0.916562557220459, + "learning_rate": 7.349725125354921e-05, + "loss": 2.8871, + "step": 20945 + }, + { + "epoch": 1.8976693619623566, + "grad_norm": 0.9168766140937805, + "learning_rate": 7.349121005255845e-05, + "loss": 2.6509, + "step": 20946 + }, + { + "epoch": 1.8977599601368031, + "grad_norm": 0.9302178025245667, + "learning_rate": 7.34851688515677e-05, + "loss": 2.6852, + "step": 20947 + }, + { + "epoch": 1.8978505583112502, + "grad_norm": 0.9307301044464111, + "learning_rate": 7.347912765057694e-05, + "loss": 2.8253, + "step": 20948 + }, + { + "epoch": 1.8979411564856967, + "grad_norm": 1.0565375089645386, + "learning_rate": 7.347308644958618e-05, + "loss": 3.0247, + "step": 20949 + }, + { + "epoch": 1.8980317546601437, + "grad_norm": 0.822937548160553, + "learning_rate": 7.346704524859542e-05, + "loss": 2.4406, + "step": 20950 + }, + { + "epoch": 1.8981223528345903, + "grad_norm": 1.0804429054260254, + "learning_rate": 7.346100404760467e-05, + "loss": 3.0519, + "step": 20951 + }, + { + "epoch": 1.8982129510090373, + "grad_norm": 0.9878852367401123, + "learning_rate": 7.34549628466139e-05, + "loss": 2.6057, + "step": 20952 + }, + { + "epoch": 1.8983035491834839, + "grad_norm": 0.9322752952575684, + "learning_rate": 7.344892164562316e-05, + "loss": 2.5297, + "step": 20953 + }, + { + "epoch": 1.8983941473579309, + "grad_norm": 0.8951143026351929, + "learning_rate": 7.34428804446324e-05, + "loss": 2.591, + "step": 20954 + }, + { + "epoch": 1.8984847455323774, + "grad_norm": 0.6842237114906311, + "learning_rate": 7.343683924364164e-05, + "loss": 1.184, + "step": 20955 + }, + { + "epoch": 1.8985753437068245, + "grad_norm": 0.9867695569992065, + "learning_rate": 7.343079804265089e-05, + "loss": 2.122, + "step": 20956 + }, + { + "epoch": 1.898665941881271, + "grad_norm": 0.957048773765564, + "learning_rate": 7.342475684166012e-05, + "loss": 2.9126, + "step": 20957 + }, + { + "epoch": 1.898756540055718, + "grad_norm": 0.8754870295524597, + "learning_rate": 7.341871564066937e-05, + "loss": 2.9039, + "step": 20958 + }, + { + "epoch": 1.8988471382301646, + "grad_norm": 0.8970413208007812, + "learning_rate": 7.34126744396786e-05, + "loss": 2.7206, + "step": 20959 + }, + { + "epoch": 1.8989377364046116, + "grad_norm": 0.8526992797851562, + "learning_rate": 7.340663323868785e-05, + "loss": 2.6776, + "step": 20960 + }, + { + "epoch": 1.8990283345790582, + "grad_norm": 0.7640036940574646, + "learning_rate": 7.34005920376971e-05, + "loss": 2.1046, + "step": 20961 + }, + { + "epoch": 1.899118932753505, + "grad_norm": 0.8898575305938721, + "learning_rate": 7.339455083670635e-05, + "loss": 2.6755, + "step": 20962 + }, + { + "epoch": 1.8992095309279518, + "grad_norm": 0.9451842904090881, + "learning_rate": 7.338850963571558e-05, + "loss": 2.793, + "step": 20963 + }, + { + "epoch": 1.8993001291023985, + "grad_norm": 0.7700482606887817, + "learning_rate": 7.338246843472483e-05, + "loss": 1.9861, + "step": 20964 + }, + { + "epoch": 1.8993907272768453, + "grad_norm": 0.8418443202972412, + "learning_rate": 7.337642723373406e-05, + "loss": 2.6189, + "step": 20965 + }, + { + "epoch": 1.8994813254512921, + "grad_norm": 1.0017577409744263, + "learning_rate": 7.337038603274331e-05, + "loss": 2.5807, + "step": 20966 + }, + { + "epoch": 1.899571923625739, + "grad_norm": 1.00525963306427, + "learning_rate": 7.336434483175255e-05, + "loss": 2.4815, + "step": 20967 + }, + { + "epoch": 1.8996625218001857, + "grad_norm": 0.9370409250259399, + "learning_rate": 7.335830363076181e-05, + "loss": 2.6895, + "step": 20968 + }, + { + "epoch": 1.8997531199746325, + "grad_norm": 0.8376080393791199, + "learning_rate": 7.335226242977104e-05, + "loss": 2.9376, + "step": 20969 + }, + { + "epoch": 1.8998437181490793, + "grad_norm": 0.928785502910614, + "learning_rate": 7.334622122878029e-05, + "loss": 2.6396, + "step": 20970 + }, + { + "epoch": 1.899934316323526, + "grad_norm": 0.8922337889671326, + "learning_rate": 7.334018002778952e-05, + "loss": 2.55, + "step": 20971 + }, + { + "epoch": 1.9000249144979728, + "grad_norm": 0.9367621541023254, + "learning_rate": 7.333413882679877e-05, + "loss": 2.7307, + "step": 20972 + }, + { + "epoch": 1.9001155126724196, + "grad_norm": 0.9512344002723694, + "learning_rate": 7.332809762580802e-05, + "loss": 2.6305, + "step": 20973 + }, + { + "epoch": 1.9002061108468664, + "grad_norm": 0.8876590132713318, + "learning_rate": 7.332205642481725e-05, + "loss": 2.5126, + "step": 20974 + }, + { + "epoch": 1.9002967090213132, + "grad_norm": 0.7650489807128906, + "learning_rate": 7.33160152238265e-05, + "loss": 1.8945, + "step": 20975 + }, + { + "epoch": 1.90038730719576, + "grad_norm": 0.8944228291511536, + "learning_rate": 7.330997402283575e-05, + "loss": 2.4979, + "step": 20976 + }, + { + "epoch": 1.9004779053702068, + "grad_norm": 0.8553958535194397, + "learning_rate": 7.3303932821845e-05, + "loss": 1.9339, + "step": 20977 + }, + { + "epoch": 1.9005685035446536, + "grad_norm": 0.8981584906578064, + "learning_rate": 7.329789162085423e-05, + "loss": 2.6546, + "step": 20978 + }, + { + "epoch": 1.9006591017191004, + "grad_norm": 0.9316869378089905, + "learning_rate": 7.329185041986348e-05, + "loss": 2.5722, + "step": 20979 + }, + { + "epoch": 1.9007496998935471, + "grad_norm": 0.914092481136322, + "learning_rate": 7.328580921887271e-05, + "loss": 2.7598, + "step": 20980 + }, + { + "epoch": 1.900840298067994, + "grad_norm": 0.889792263507843, + "learning_rate": 7.327976801788196e-05, + "loss": 2.5899, + "step": 20981 + }, + { + "epoch": 1.9009308962424407, + "grad_norm": 0.8398778438568115, + "learning_rate": 7.327372681689119e-05, + "loss": 2.0008, + "step": 20982 + }, + { + "epoch": 1.9010214944168875, + "grad_norm": 0.8262958526611328, + "learning_rate": 7.326768561590045e-05, + "loss": 2.4349, + "step": 20983 + }, + { + "epoch": 1.9011120925913343, + "grad_norm": 0.7461556792259216, + "learning_rate": 7.326164441490969e-05, + "loss": 2.1453, + "step": 20984 + }, + { + "epoch": 1.901202690765781, + "grad_norm": 0.9224506616592407, + "learning_rate": 7.325560321391894e-05, + "loss": 2.7951, + "step": 20985 + }, + { + "epoch": 1.9012932889402279, + "grad_norm": 0.9048070311546326, + "learning_rate": 7.324956201292817e-05, + "loss": 2.7949, + "step": 20986 + }, + { + "epoch": 1.9013838871146747, + "grad_norm": 0.8819717764854431, + "learning_rate": 7.324352081193742e-05, + "loss": 2.5433, + "step": 20987 + }, + { + "epoch": 1.9014744852891214, + "grad_norm": 0.9025039672851562, + "learning_rate": 7.323747961094666e-05, + "loss": 2.7229, + "step": 20988 + }, + { + "epoch": 1.9015650834635682, + "grad_norm": 0.8877102136611938, + "learning_rate": 7.32314384099559e-05, + "loss": 2.8172, + "step": 20989 + }, + { + "epoch": 1.901655681638015, + "grad_norm": 0.9174376130104065, + "learning_rate": 7.322539720896515e-05, + "loss": 2.5285, + "step": 20990 + }, + { + "epoch": 1.9017462798124618, + "grad_norm": 0.8428361415863037, + "learning_rate": 7.32193560079744e-05, + "loss": 2.5195, + "step": 20991 + }, + { + "epoch": 1.9018368779869086, + "grad_norm": 0.9302067756652832, + "learning_rate": 7.321331480698364e-05, + "loss": 2.6073, + "step": 20992 + }, + { + "epoch": 1.9019274761613554, + "grad_norm": 0.8771026730537415, + "learning_rate": 7.320727360599288e-05, + "loss": 2.7279, + "step": 20993 + }, + { + "epoch": 1.9020180743358022, + "grad_norm": 0.8992894887924194, + "learning_rate": 7.320123240500212e-05, + "loss": 2.6901, + "step": 20994 + }, + { + "epoch": 1.902108672510249, + "grad_norm": 0.9360191822052002, + "learning_rate": 7.319519120401136e-05, + "loss": 2.752, + "step": 20995 + }, + { + "epoch": 1.9021992706846957, + "grad_norm": 0.9133129715919495, + "learning_rate": 7.31891500030206e-05, + "loss": 2.6848, + "step": 20996 + }, + { + "epoch": 1.9022898688591425, + "grad_norm": 0.9134761095046997, + "learning_rate": 7.318310880202984e-05, + "loss": 2.6493, + "step": 20997 + }, + { + "epoch": 1.9023804670335893, + "grad_norm": 0.8381057977676392, + "learning_rate": 7.31770676010391e-05, + "loss": 2.0724, + "step": 20998 + }, + { + "epoch": 1.902471065208036, + "grad_norm": 0.8868497610092163, + "learning_rate": 7.317102640004833e-05, + "loss": 2.5807, + "step": 20999 + }, + { + "epoch": 1.902561663382483, + "grad_norm": 0.8886556625366211, + "learning_rate": 7.316498519905758e-05, + "loss": 2.817, + "step": 21000 + }, + { + "epoch": 1.9026522615569297, + "grad_norm": 0.9024673104286194, + "learning_rate": 7.315894399806682e-05, + "loss": 2.7861, + "step": 21001 + }, + { + "epoch": 1.9027428597313765, + "grad_norm": 0.8999277949333191, + "learning_rate": 7.315290279707606e-05, + "loss": 2.6151, + "step": 21002 + }, + { + "epoch": 1.9028334579058233, + "grad_norm": 0.8634093999862671, + "learning_rate": 7.31468615960853e-05, + "loss": 2.6132, + "step": 21003 + }, + { + "epoch": 1.9029240560802698, + "grad_norm": 0.8887990713119507, + "learning_rate": 7.314082039509454e-05, + "loss": 2.6835, + "step": 21004 + }, + { + "epoch": 1.9030146542547168, + "grad_norm": 0.8759530782699585, + "learning_rate": 7.313477919410379e-05, + "loss": 2.7657, + "step": 21005 + }, + { + "epoch": 1.9031052524291634, + "grad_norm": 0.9568186402320862, + "learning_rate": 7.312873799311304e-05, + "loss": 2.7278, + "step": 21006 + }, + { + "epoch": 1.9031958506036104, + "grad_norm": 0.8384071588516235, + "learning_rate": 7.312269679212227e-05, + "loss": 2.43, + "step": 21007 + }, + { + "epoch": 1.903286448778057, + "grad_norm": 0.99925696849823, + "learning_rate": 7.311665559113152e-05, + "loss": 2.6956, + "step": 21008 + }, + { + "epoch": 1.903377046952504, + "grad_norm": 0.8732210993766785, + "learning_rate": 7.311061439014077e-05, + "loss": 2.8882, + "step": 21009 + }, + { + "epoch": 1.9034676451269505, + "grad_norm": 0.8883938193321228, + "learning_rate": 7.310457318915e-05, + "loss": 2.6674, + "step": 21010 + }, + { + "epoch": 1.9035582433013976, + "grad_norm": 0.9305460453033447, + "learning_rate": 7.309853198815925e-05, + "loss": 2.925, + "step": 21011 + }, + { + "epoch": 1.9036488414758441, + "grad_norm": 0.9183872938156128, + "learning_rate": 7.309249078716848e-05, + "loss": 2.6941, + "step": 21012 + }, + { + "epoch": 1.9037394396502911, + "grad_norm": 0.9200016260147095, + "learning_rate": 7.308644958617775e-05, + "loss": 2.7157, + "step": 21013 + }, + { + "epoch": 1.9038300378247377, + "grad_norm": 0.8873023390769958, + "learning_rate": 7.308040838518698e-05, + "loss": 2.6468, + "step": 21014 + }, + { + "epoch": 1.9039206359991847, + "grad_norm": 1.0048316717147827, + "learning_rate": 7.307436718419623e-05, + "loss": 2.7917, + "step": 21015 + }, + { + "epoch": 1.9040112341736313, + "grad_norm": 0.9797114133834839, + "learning_rate": 7.306832598320546e-05, + "loss": 3.0217, + "step": 21016 + }, + { + "epoch": 1.9041018323480783, + "grad_norm": 0.9186366200447083, + "learning_rate": 7.306228478221471e-05, + "loss": 2.7966, + "step": 21017 + }, + { + "epoch": 1.9041924305225248, + "grad_norm": 1.0071773529052734, + "learning_rate": 7.305624358122394e-05, + "loss": 2.7827, + "step": 21018 + }, + { + "epoch": 1.9042830286969719, + "grad_norm": 0.8854652047157288, + "learning_rate": 7.305020238023319e-05, + "loss": 2.7152, + "step": 21019 + }, + { + "epoch": 1.9043736268714184, + "grad_norm": 0.9393354654312134, + "learning_rate": 7.304416117924243e-05, + "loss": 2.5887, + "step": 21020 + }, + { + "epoch": 1.9044642250458654, + "grad_norm": 0.914412260055542, + "learning_rate": 7.303811997825169e-05, + "loss": 2.9314, + "step": 21021 + }, + { + "epoch": 1.904554823220312, + "grad_norm": 0.9449445605278015, + "learning_rate": 7.303207877726092e-05, + "loss": 2.6962, + "step": 21022 + }, + { + "epoch": 1.904645421394759, + "grad_norm": 0.9121960401535034, + "learning_rate": 7.302603757627017e-05, + "loss": 2.6291, + "step": 21023 + }, + { + "epoch": 1.9047360195692056, + "grad_norm": 0.8602733612060547, + "learning_rate": 7.301999637527942e-05, + "loss": 2.4613, + "step": 21024 + }, + { + "epoch": 1.9048266177436526, + "grad_norm": 0.944913923740387, + "learning_rate": 7.301395517428865e-05, + "loss": 2.7639, + "step": 21025 + }, + { + "epoch": 1.9049172159180991, + "grad_norm": 0.8307715654373169, + "learning_rate": 7.30079139732979e-05, + "loss": 2.5817, + "step": 21026 + }, + { + "epoch": 1.9050078140925462, + "grad_norm": 0.9528908729553223, + "learning_rate": 7.300187277230713e-05, + "loss": 2.5377, + "step": 21027 + }, + { + "epoch": 1.9050984122669927, + "grad_norm": 0.9252762198448181, + "learning_rate": 7.299583157131639e-05, + "loss": 2.7356, + "step": 21028 + }, + { + "epoch": 1.9051890104414397, + "grad_norm": 0.8756992220878601, + "learning_rate": 7.298979037032563e-05, + "loss": 2.638, + "step": 21029 + }, + { + "epoch": 1.9052796086158863, + "grad_norm": 0.8887617588043213, + "learning_rate": 7.298374916933487e-05, + "loss": 2.5674, + "step": 21030 + }, + { + "epoch": 1.9053702067903333, + "grad_norm": 0.8547356724739075, + "learning_rate": 7.297770796834411e-05, + "loss": 2.5092, + "step": 21031 + }, + { + "epoch": 1.9054608049647799, + "grad_norm": 0.8975184559822083, + "learning_rate": 7.297166676735336e-05, + "loss": 2.7977, + "step": 21032 + }, + { + "epoch": 1.9055514031392269, + "grad_norm": 0.9130718111991882, + "learning_rate": 7.296562556636259e-05, + "loss": 2.7043, + "step": 21033 + }, + { + "epoch": 1.9056420013136735, + "grad_norm": 0.9711509346961975, + "learning_rate": 7.295958436537184e-05, + "loss": 3.1119, + "step": 21034 + }, + { + "epoch": 1.9057325994881205, + "grad_norm": 0.8174846172332764, + "learning_rate": 7.295354316438107e-05, + "loss": 2.334, + "step": 21035 + }, + { + "epoch": 1.905823197662567, + "grad_norm": 0.976460337638855, + "learning_rate": 7.294750196339033e-05, + "loss": 3.0319, + "step": 21036 + }, + { + "epoch": 1.905913795837014, + "grad_norm": 0.7240404486656189, + "learning_rate": 7.294146076239957e-05, + "loss": 1.8455, + "step": 21037 + }, + { + "epoch": 1.9060043940114606, + "grad_norm": 0.8779725432395935, + "learning_rate": 7.293541956140881e-05, + "loss": 2.7002, + "step": 21038 + }, + { + "epoch": 1.9060949921859076, + "grad_norm": 1.0632193088531494, + "learning_rate": 7.292937836041805e-05, + "loss": 2.8922, + "step": 21039 + }, + { + "epoch": 1.9061855903603542, + "grad_norm": 0.9767159223556519, + "learning_rate": 7.29233371594273e-05, + "loss": 2.7491, + "step": 21040 + }, + { + "epoch": 1.9062761885348012, + "grad_norm": 0.8495371341705322, + "learning_rate": 7.291729595843654e-05, + "loss": 2.5371, + "step": 21041 + }, + { + "epoch": 1.9063667867092478, + "grad_norm": 0.9233307242393494, + "learning_rate": 7.291125475744578e-05, + "loss": 2.6029, + "step": 21042 + }, + { + "epoch": 1.9064573848836945, + "grad_norm": 0.9255993962287903, + "learning_rate": 7.290521355645504e-05, + "loss": 2.8881, + "step": 21043 + }, + { + "epoch": 1.9065479830581413, + "grad_norm": 0.811370849609375, + "learning_rate": 7.289917235546427e-05, + "loss": 2.0786, + "step": 21044 + }, + { + "epoch": 1.9066385812325881, + "grad_norm": 0.9265390038490295, + "learning_rate": 7.289313115447352e-05, + "loss": 2.6559, + "step": 21045 + }, + { + "epoch": 1.906729179407035, + "grad_norm": 0.880262553691864, + "learning_rate": 7.288708995348275e-05, + "loss": 2.5072, + "step": 21046 + }, + { + "epoch": 1.9068197775814817, + "grad_norm": 0.9174660444259644, + "learning_rate": 7.2881048752492e-05, + "loss": 2.6413, + "step": 21047 + }, + { + "epoch": 1.9069103757559285, + "grad_norm": 0.8557831048965454, + "learning_rate": 7.287500755150124e-05, + "loss": 2.6362, + "step": 21048 + }, + { + "epoch": 1.9070009739303753, + "grad_norm": 0.8792663812637329, + "learning_rate": 7.286896635051048e-05, + "loss": 2.7873, + "step": 21049 + }, + { + "epoch": 1.907091572104822, + "grad_norm": 0.7316179871559143, + "learning_rate": 7.286292514951972e-05, + "loss": 2.0769, + "step": 21050 + }, + { + "epoch": 1.9071821702792688, + "grad_norm": 0.9314950108528137, + "learning_rate": 7.285688394852898e-05, + "loss": 2.6761, + "step": 21051 + }, + { + "epoch": 1.9072727684537156, + "grad_norm": 0.7971512079238892, + "learning_rate": 7.285084274753821e-05, + "loss": 2.2788, + "step": 21052 + }, + { + "epoch": 1.9073633666281624, + "grad_norm": 0.9926621317863464, + "learning_rate": 7.284480154654746e-05, + "loss": 2.6422, + "step": 21053 + }, + { + "epoch": 1.9074539648026092, + "grad_norm": 0.9017044305801392, + "learning_rate": 7.28387603455567e-05, + "loss": 2.6321, + "step": 21054 + }, + { + "epoch": 1.907544562977056, + "grad_norm": 0.9692980051040649, + "learning_rate": 7.283271914456594e-05, + "loss": 2.8275, + "step": 21055 + }, + { + "epoch": 1.9076351611515028, + "grad_norm": 0.9458670616149902, + "learning_rate": 7.282667794357519e-05, + "loss": 2.8336, + "step": 21056 + }, + { + "epoch": 1.9077257593259496, + "grad_norm": 0.7679195404052734, + "learning_rate": 7.282063674258442e-05, + "loss": 1.9803, + "step": 21057 + }, + { + "epoch": 1.9078163575003964, + "grad_norm": 0.877784788608551, + "learning_rate": 7.281459554159367e-05, + "loss": 2.8156, + "step": 21058 + }, + { + "epoch": 1.9079069556748431, + "grad_norm": 0.950771689414978, + "learning_rate": 7.280855434060292e-05, + "loss": 2.8031, + "step": 21059 + }, + { + "epoch": 1.90799755384929, + "grad_norm": 0.8872076272964478, + "learning_rate": 7.280251313961217e-05, + "loss": 2.7087, + "step": 21060 + }, + { + "epoch": 1.9080881520237367, + "grad_norm": 0.9262461066246033, + "learning_rate": 7.27964719386214e-05, + "loss": 2.6352, + "step": 21061 + }, + { + "epoch": 1.9081787501981835, + "grad_norm": 0.9019664525985718, + "learning_rate": 7.279043073763065e-05, + "loss": 2.7561, + "step": 21062 + }, + { + "epoch": 1.9082693483726303, + "grad_norm": 0.9673509001731873, + "learning_rate": 7.278438953663988e-05, + "loss": 2.6516, + "step": 21063 + }, + { + "epoch": 1.908359946547077, + "grad_norm": 0.9597764015197754, + "learning_rate": 7.277834833564913e-05, + "loss": 2.8706, + "step": 21064 + }, + { + "epoch": 1.9084505447215239, + "grad_norm": 0.9245177507400513, + "learning_rate": 7.277230713465836e-05, + "loss": 2.8593, + "step": 21065 + }, + { + "epoch": 1.9085411428959707, + "grad_norm": 0.8479037880897522, + "learning_rate": 7.276626593366763e-05, + "loss": 2.7674, + "step": 21066 + }, + { + "epoch": 1.9086317410704174, + "grad_norm": 0.8216056227684021, + "learning_rate": 7.276022473267686e-05, + "loss": 2.6581, + "step": 21067 + }, + { + "epoch": 1.9087223392448642, + "grad_norm": 0.9524850249290466, + "learning_rate": 7.275418353168611e-05, + "loss": 2.5874, + "step": 21068 + }, + { + "epoch": 1.908812937419311, + "grad_norm": 0.9451521039009094, + "learning_rate": 7.274814233069534e-05, + "loss": 3.0812, + "step": 21069 + }, + { + "epoch": 1.9089035355937578, + "grad_norm": 0.8867204189300537, + "learning_rate": 7.274210112970459e-05, + "loss": 2.5386, + "step": 21070 + }, + { + "epoch": 1.9089941337682046, + "grad_norm": 0.8252332210540771, + "learning_rate": 7.273605992871382e-05, + "loss": 2.8459, + "step": 21071 + }, + { + "epoch": 1.9090847319426514, + "grad_norm": 0.9099574089050293, + "learning_rate": 7.273001872772307e-05, + "loss": 2.9797, + "step": 21072 + }, + { + "epoch": 1.9091753301170982, + "grad_norm": 0.7633792757987976, + "learning_rate": 7.272397752673232e-05, + "loss": 1.9737, + "step": 21073 + }, + { + "epoch": 1.909265928291545, + "grad_norm": 0.9609772562980652, + "learning_rate": 7.271793632574157e-05, + "loss": 2.5761, + "step": 21074 + }, + { + "epoch": 1.9093565264659917, + "grad_norm": 0.8715193271636963, + "learning_rate": 7.27118951247508e-05, + "loss": 2.1775, + "step": 21075 + }, + { + "epoch": 1.9094471246404385, + "grad_norm": 0.797870934009552, + "learning_rate": 7.270585392376005e-05, + "loss": 2.2711, + "step": 21076 + }, + { + "epoch": 1.9095377228148853, + "grad_norm": 0.8540139198303223, + "learning_rate": 7.26998127227693e-05, + "loss": 2.5593, + "step": 21077 + }, + { + "epoch": 1.909628320989332, + "grad_norm": 0.9273702502250671, + "learning_rate": 7.269377152177853e-05, + "loss": 2.5664, + "step": 21078 + }, + { + "epoch": 1.909718919163779, + "grad_norm": 0.9193525314331055, + "learning_rate": 7.268773032078778e-05, + "loss": 2.6477, + "step": 21079 + }, + { + "epoch": 1.9098095173382257, + "grad_norm": 0.9119673371315002, + "learning_rate": 7.268168911979701e-05, + "loss": 2.6629, + "step": 21080 + }, + { + "epoch": 1.9099001155126725, + "grad_norm": 0.9910651445388794, + "learning_rate": 7.267564791880627e-05, + "loss": 2.6616, + "step": 21081 + }, + { + "epoch": 1.9099907136871193, + "grad_norm": 1.0084643363952637, + "learning_rate": 7.26696067178155e-05, + "loss": 2.6517, + "step": 21082 + }, + { + "epoch": 1.910081311861566, + "grad_norm": 0.9061895608901978, + "learning_rate": 7.266356551682475e-05, + "loss": 2.9942, + "step": 21083 + }, + { + "epoch": 1.9101719100360128, + "grad_norm": 0.8886370062828064, + "learning_rate": 7.265752431583399e-05, + "loss": 2.6215, + "step": 21084 + }, + { + "epoch": 1.9102625082104594, + "grad_norm": 0.9225432276725769, + "learning_rate": 7.265148311484323e-05, + "loss": 2.5356, + "step": 21085 + }, + { + "epoch": 1.9103531063849064, + "grad_norm": 0.8996452689170837, + "learning_rate": 7.264544191385247e-05, + "loss": 2.6482, + "step": 21086 + }, + { + "epoch": 1.910443704559353, + "grad_norm": 0.8885630965232849, + "learning_rate": 7.263940071286172e-05, + "loss": 2.6177, + "step": 21087 + }, + { + "epoch": 1.9105343027338, + "grad_norm": 0.790282666683197, + "learning_rate": 7.263335951187096e-05, + "loss": 2.0221, + "step": 21088 + }, + { + "epoch": 1.9106249009082465, + "grad_norm": 0.9403713345527649, + "learning_rate": 7.262731831088021e-05, + "loss": 2.6744, + "step": 21089 + }, + { + "epoch": 1.9107154990826936, + "grad_norm": 0.9419604539871216, + "learning_rate": 7.262127710988945e-05, + "loss": 2.4555, + "step": 21090 + }, + { + "epoch": 1.9108060972571401, + "grad_norm": 1.039170503616333, + "learning_rate": 7.26152359088987e-05, + "loss": 2.6414, + "step": 21091 + }, + { + "epoch": 1.9108966954315871, + "grad_norm": 0.9234788417816162, + "learning_rate": 7.260919470790794e-05, + "loss": 2.6864, + "step": 21092 + }, + { + "epoch": 1.9109872936060337, + "grad_norm": 0.8277661800384521, + "learning_rate": 7.260315350691718e-05, + "loss": 2.1581, + "step": 21093 + }, + { + "epoch": 1.9110778917804807, + "grad_norm": 0.8976879715919495, + "learning_rate": 7.259711230592642e-05, + "loss": 2.4994, + "step": 21094 + }, + { + "epoch": 1.9111684899549273, + "grad_norm": 0.9327512383460999, + "learning_rate": 7.259107110493566e-05, + "loss": 2.5726, + "step": 21095 + }, + { + "epoch": 1.9112590881293743, + "grad_norm": 0.8085076808929443, + "learning_rate": 7.258502990394492e-05, + "loss": 2.1584, + "step": 21096 + }, + { + "epoch": 1.9113496863038208, + "grad_norm": 0.8432108759880066, + "learning_rate": 7.257898870295415e-05, + "loss": 2.6561, + "step": 21097 + }, + { + "epoch": 1.9114402844782679, + "grad_norm": 0.9807536602020264, + "learning_rate": 7.25729475019634e-05, + "loss": 2.8859, + "step": 21098 + }, + { + "epoch": 1.9115308826527144, + "grad_norm": 0.9161026477813721, + "learning_rate": 7.256690630097263e-05, + "loss": 2.6748, + "step": 21099 + }, + { + "epoch": 1.9116214808271614, + "grad_norm": 1.0492771863937378, + "learning_rate": 7.256086509998188e-05, + "loss": 2.934, + "step": 21100 + }, + { + "epoch": 1.911712079001608, + "grad_norm": 0.8957008719444275, + "learning_rate": 7.255482389899112e-05, + "loss": 2.5878, + "step": 21101 + }, + { + "epoch": 1.911802677176055, + "grad_norm": 0.9469310641288757, + "learning_rate": 7.254878269800036e-05, + "loss": 2.645, + "step": 21102 + }, + { + "epoch": 1.9118932753505016, + "grad_norm": 0.8953062295913696, + "learning_rate": 7.254274149700961e-05, + "loss": 2.7069, + "step": 21103 + }, + { + "epoch": 1.9119838735249486, + "grad_norm": 0.9301618337631226, + "learning_rate": 7.253670029601886e-05, + "loss": 2.701, + "step": 21104 + }, + { + "epoch": 1.9120744716993952, + "grad_norm": 0.8936797976493835, + "learning_rate": 7.253065909502809e-05, + "loss": 2.5142, + "step": 21105 + }, + { + "epoch": 1.9121650698738422, + "grad_norm": 0.8863322138786316, + "learning_rate": 7.252461789403734e-05, + "loss": 2.8527, + "step": 21106 + }, + { + "epoch": 1.9122556680482887, + "grad_norm": 0.8735069036483765, + "learning_rate": 7.251857669304657e-05, + "loss": 2.4664, + "step": 21107 + }, + { + "epoch": 1.9123462662227357, + "grad_norm": 0.8868123292922974, + "learning_rate": 7.251253549205582e-05, + "loss": 2.7427, + "step": 21108 + }, + { + "epoch": 1.9124368643971823, + "grad_norm": 0.9208387732505798, + "learning_rate": 7.250649429106507e-05, + "loss": 2.7295, + "step": 21109 + }, + { + "epoch": 1.9125274625716293, + "grad_norm": 0.904309868812561, + "learning_rate": 7.25004530900743e-05, + "loss": 2.9053, + "step": 21110 + }, + { + "epoch": 1.9126180607460759, + "grad_norm": 0.8795705437660217, + "learning_rate": 7.249441188908356e-05, + "loss": 2.8275, + "step": 21111 + }, + { + "epoch": 1.9127086589205229, + "grad_norm": 0.9865974187850952, + "learning_rate": 7.24883706880928e-05, + "loss": 2.6496, + "step": 21112 + }, + { + "epoch": 1.9127992570949695, + "grad_norm": 0.9455968141555786, + "learning_rate": 7.248232948710205e-05, + "loss": 2.7638, + "step": 21113 + }, + { + "epoch": 1.9128898552694165, + "grad_norm": 0.9441313743591309, + "learning_rate": 7.247628828611128e-05, + "loss": 2.5351, + "step": 21114 + }, + { + "epoch": 1.912980453443863, + "grad_norm": 0.9353181719779968, + "learning_rate": 7.247024708512053e-05, + "loss": 2.8452, + "step": 21115 + }, + { + "epoch": 1.91307105161831, + "grad_norm": 0.9410268664360046, + "learning_rate": 7.246420588412976e-05, + "loss": 2.7332, + "step": 21116 + }, + { + "epoch": 1.9131616497927566, + "grad_norm": 0.9680608510971069, + "learning_rate": 7.245816468313901e-05, + "loss": 2.6584, + "step": 21117 + }, + { + "epoch": 1.9132522479672036, + "grad_norm": 0.902754545211792, + "learning_rate": 7.245212348214826e-05, + "loss": 2.7327, + "step": 21118 + }, + { + "epoch": 1.9133428461416502, + "grad_norm": 0.9890328645706177, + "learning_rate": 7.24460822811575e-05, + "loss": 2.4928, + "step": 21119 + }, + { + "epoch": 1.9134334443160972, + "grad_norm": 0.8759199380874634, + "learning_rate": 7.244004108016674e-05, + "loss": 2.8026, + "step": 21120 + }, + { + "epoch": 1.9135240424905438, + "grad_norm": 0.9132920503616333, + "learning_rate": 7.243399987917599e-05, + "loss": 2.6844, + "step": 21121 + }, + { + "epoch": 1.9136146406649908, + "grad_norm": 0.7933570146560669, + "learning_rate": 7.242795867818522e-05, + "loss": 2.0592, + "step": 21122 + }, + { + "epoch": 1.9137052388394373, + "grad_norm": 0.7117796540260315, + "learning_rate": 7.242191747719447e-05, + "loss": 1.8286, + "step": 21123 + }, + { + "epoch": 1.9137958370138841, + "grad_norm": 0.977120041847229, + "learning_rate": 7.241587627620372e-05, + "loss": 2.6784, + "step": 21124 + }, + { + "epoch": 1.913886435188331, + "grad_norm": 0.8698854446411133, + "learning_rate": 7.240983507521295e-05, + "loss": 2.5113, + "step": 21125 + }, + { + "epoch": 1.9139770333627777, + "grad_norm": 0.9631557464599609, + "learning_rate": 7.24037938742222e-05, + "loss": 2.6865, + "step": 21126 + }, + { + "epoch": 1.9140676315372245, + "grad_norm": 0.9705899357795715, + "learning_rate": 7.239775267323144e-05, + "loss": 2.6386, + "step": 21127 + }, + { + "epoch": 1.9141582297116713, + "grad_norm": 0.8597540855407715, + "learning_rate": 7.239171147224069e-05, + "loss": 2.739, + "step": 21128 + }, + { + "epoch": 1.914248827886118, + "grad_norm": 0.9234615564346313, + "learning_rate": 7.238567027124993e-05, + "loss": 2.7514, + "step": 21129 + }, + { + "epoch": 1.9143394260605648, + "grad_norm": 0.9180841445922852, + "learning_rate": 7.237962907025917e-05, + "loss": 2.2681, + "step": 21130 + }, + { + "epoch": 1.9144300242350116, + "grad_norm": 0.9423999190330505, + "learning_rate": 7.237358786926841e-05, + "loss": 2.7765, + "step": 21131 + }, + { + "epoch": 1.9145206224094584, + "grad_norm": 0.9131466150283813, + "learning_rate": 7.236754666827766e-05, + "loss": 2.6606, + "step": 21132 + }, + { + "epoch": 1.9146112205839052, + "grad_norm": 0.8639022707939148, + "learning_rate": 7.23615054672869e-05, + "loss": 2.6922, + "step": 21133 + }, + { + "epoch": 1.914701818758352, + "grad_norm": 0.9660391211509705, + "learning_rate": 7.235546426629615e-05, + "loss": 2.632, + "step": 21134 + }, + { + "epoch": 1.9147924169327988, + "grad_norm": 0.9152865409851074, + "learning_rate": 7.234942306530538e-05, + "loss": 2.8752, + "step": 21135 + }, + { + "epoch": 1.9148830151072456, + "grad_norm": 0.9442715644836426, + "learning_rate": 7.234338186431463e-05, + "loss": 2.7909, + "step": 21136 + }, + { + "epoch": 1.9149736132816924, + "grad_norm": 0.8893885016441345, + "learning_rate": 7.233734066332387e-05, + "loss": 2.5754, + "step": 21137 + }, + { + "epoch": 1.9150642114561391, + "grad_norm": 0.9536647796630859, + "learning_rate": 7.233129946233311e-05, + "loss": 2.5808, + "step": 21138 + }, + { + "epoch": 1.915154809630586, + "grad_norm": 1.0108805894851685, + "learning_rate": 7.232525826134235e-05, + "loss": 2.7299, + "step": 21139 + }, + { + "epoch": 1.9152454078050327, + "grad_norm": 0.9782835841178894, + "learning_rate": 7.23192170603516e-05, + "loss": 2.5789, + "step": 21140 + }, + { + "epoch": 1.9153360059794795, + "grad_norm": 0.941855788230896, + "learning_rate": 7.231317585936084e-05, + "loss": 2.7586, + "step": 21141 + }, + { + "epoch": 1.9154266041539263, + "grad_norm": 0.8933389782905579, + "learning_rate": 7.230713465837009e-05, + "loss": 2.5686, + "step": 21142 + }, + { + "epoch": 1.915517202328373, + "grad_norm": 0.989151656627655, + "learning_rate": 7.230109345737934e-05, + "loss": 2.6193, + "step": 21143 + }, + { + "epoch": 1.9156078005028199, + "grad_norm": 1.000336766242981, + "learning_rate": 7.229505225638857e-05, + "loss": 2.3833, + "step": 21144 + }, + { + "epoch": 1.9156983986772667, + "grad_norm": 0.899821937084198, + "learning_rate": 7.228901105539782e-05, + "loss": 2.5887, + "step": 21145 + }, + { + "epoch": 1.9157889968517134, + "grad_norm": 0.8995721936225891, + "learning_rate": 7.228296985440705e-05, + "loss": 2.5046, + "step": 21146 + }, + { + "epoch": 1.9158795950261602, + "grad_norm": 0.8423988819122314, + "learning_rate": 7.22769286534163e-05, + "loss": 2.6568, + "step": 21147 + }, + { + "epoch": 1.915970193200607, + "grad_norm": 0.9318632483482361, + "learning_rate": 7.227088745242555e-05, + "loss": 2.6063, + "step": 21148 + }, + { + "epoch": 1.9160607913750538, + "grad_norm": 0.8319920897483826, + "learning_rate": 7.22648462514348e-05, + "loss": 2.6408, + "step": 21149 + }, + { + "epoch": 1.9161513895495006, + "grad_norm": 0.9076424837112427, + "learning_rate": 7.225880505044403e-05, + "loss": 2.9123, + "step": 21150 + }, + { + "epoch": 1.9162419877239474, + "grad_norm": 0.7533502578735352, + "learning_rate": 7.225276384945328e-05, + "loss": 2.043, + "step": 21151 + }, + { + "epoch": 1.9163325858983942, + "grad_norm": 0.90687096118927, + "learning_rate": 7.224672264846251e-05, + "loss": 2.7018, + "step": 21152 + }, + { + "epoch": 1.916423184072841, + "grad_norm": 0.7921266555786133, + "learning_rate": 7.224068144747176e-05, + "loss": 1.8989, + "step": 21153 + }, + { + "epoch": 1.9165137822472877, + "grad_norm": 0.9284470081329346, + "learning_rate": 7.2234640246481e-05, + "loss": 2.7607, + "step": 21154 + }, + { + "epoch": 1.9166043804217345, + "grad_norm": 0.962561845779419, + "learning_rate": 7.222859904549024e-05, + "loss": 2.4929, + "step": 21155 + }, + { + "epoch": 1.9166949785961813, + "grad_norm": 0.9318045973777771, + "learning_rate": 7.222255784449949e-05, + "loss": 2.9301, + "step": 21156 + }, + { + "epoch": 1.916785576770628, + "grad_norm": 0.868480384349823, + "learning_rate": 7.221651664350874e-05, + "loss": 2.5635, + "step": 21157 + }, + { + "epoch": 1.916876174945075, + "grad_norm": 0.9178460836410522, + "learning_rate": 7.221047544251797e-05, + "loss": 2.4809, + "step": 21158 + }, + { + "epoch": 1.9169667731195217, + "grad_norm": 0.9684503674507141, + "learning_rate": 7.220443424152722e-05, + "loss": 2.6432, + "step": 21159 + }, + { + "epoch": 1.9170573712939685, + "grad_norm": 0.9858421683311462, + "learning_rate": 7.219839304053647e-05, + "loss": 2.8223, + "step": 21160 + }, + { + "epoch": 1.9171479694684153, + "grad_norm": 0.8872658610343933, + "learning_rate": 7.21923518395457e-05, + "loss": 2.7945, + "step": 21161 + }, + { + "epoch": 1.917238567642862, + "grad_norm": 0.8078083395957947, + "learning_rate": 7.218631063855495e-05, + "loss": 2.2894, + "step": 21162 + }, + { + "epoch": 1.9173291658173088, + "grad_norm": 0.8455821871757507, + "learning_rate": 7.21802694375642e-05, + "loss": 2.6078, + "step": 21163 + }, + { + "epoch": 1.9174197639917556, + "grad_norm": 0.8241410851478577, + "learning_rate": 7.217422823657344e-05, + "loss": 2.1602, + "step": 21164 + }, + { + "epoch": 1.9175103621662024, + "grad_norm": 0.8749362230300903, + "learning_rate": 7.216818703558268e-05, + "loss": 2.6921, + "step": 21165 + }, + { + "epoch": 1.917600960340649, + "grad_norm": 0.9481250047683716, + "learning_rate": 7.216214583459193e-05, + "loss": 2.7054, + "step": 21166 + }, + { + "epoch": 1.917691558515096, + "grad_norm": 0.933709979057312, + "learning_rate": 7.215610463360116e-05, + "loss": 2.773, + "step": 21167 + }, + { + "epoch": 1.9177821566895425, + "grad_norm": 0.9124207496643066, + "learning_rate": 7.21500634326104e-05, + "loss": 2.6375, + "step": 21168 + }, + { + "epoch": 1.9178727548639896, + "grad_norm": 0.9998805522918701, + "learning_rate": 7.214402223161964e-05, + "loss": 2.754, + "step": 21169 + }, + { + "epoch": 1.9179633530384361, + "grad_norm": 0.8937065005302429, + "learning_rate": 7.213798103062889e-05, + "loss": 2.5303, + "step": 21170 + }, + { + "epoch": 1.9180539512128831, + "grad_norm": 0.8578945994377136, + "learning_rate": 7.213193982963814e-05, + "loss": 2.6207, + "step": 21171 + }, + { + "epoch": 1.9181445493873297, + "grad_norm": 0.9598652720451355, + "learning_rate": 7.212589862864738e-05, + "loss": 2.8319, + "step": 21172 + }, + { + "epoch": 1.9182351475617767, + "grad_norm": 0.8918821215629578, + "learning_rate": 7.211985742765662e-05, + "loss": 2.9055, + "step": 21173 + }, + { + "epoch": 1.9183257457362233, + "grad_norm": 0.8804032802581787, + "learning_rate": 7.211381622666587e-05, + "loss": 3.031, + "step": 21174 + }, + { + "epoch": 1.9184163439106703, + "grad_norm": 0.7508083581924438, + "learning_rate": 7.21077750256751e-05, + "loss": 1.9897, + "step": 21175 + }, + { + "epoch": 1.9185069420851169, + "grad_norm": 0.7857128381729126, + "learning_rate": 7.210173382468435e-05, + "loss": 2.0792, + "step": 21176 + }, + { + "epoch": 1.9185975402595639, + "grad_norm": 0.8584805727005005, + "learning_rate": 7.20956926236936e-05, + "loss": 2.6637, + "step": 21177 + }, + { + "epoch": 1.9186881384340104, + "grad_norm": 0.942674458026886, + "learning_rate": 7.208965142270284e-05, + "loss": 2.9203, + "step": 21178 + }, + { + "epoch": 1.9187787366084574, + "grad_norm": 0.9439936876296997, + "learning_rate": 7.208361022171209e-05, + "loss": 2.8017, + "step": 21179 + }, + { + "epoch": 1.918869334782904, + "grad_norm": 0.9840463995933533, + "learning_rate": 7.207756902072132e-05, + "loss": 2.6562, + "step": 21180 + }, + { + "epoch": 1.918959932957351, + "grad_norm": 0.8949111104011536, + "learning_rate": 7.207152781973057e-05, + "loss": 2.8215, + "step": 21181 + }, + { + "epoch": 1.9190505311317976, + "grad_norm": 0.9049927592277527, + "learning_rate": 7.20654866187398e-05, + "loss": 2.7428, + "step": 21182 + }, + { + "epoch": 1.9191411293062446, + "grad_norm": 0.8788699507713318, + "learning_rate": 7.205944541774905e-05, + "loss": 2.6323, + "step": 21183 + }, + { + "epoch": 1.9192317274806912, + "grad_norm": 0.9303457140922546, + "learning_rate": 7.205340421675829e-05, + "loss": 2.7737, + "step": 21184 + }, + { + "epoch": 1.9193223256551382, + "grad_norm": 0.93577641248703, + "learning_rate": 7.204736301576753e-05, + "loss": 2.918, + "step": 21185 + }, + { + "epoch": 1.9194129238295847, + "grad_norm": 0.8980421423912048, + "learning_rate": 7.204132181477678e-05, + "loss": 2.0687, + "step": 21186 + }, + { + "epoch": 1.9195035220040317, + "grad_norm": 0.852232038974762, + "learning_rate": 7.203528061378603e-05, + "loss": 2.3032, + "step": 21187 + }, + { + "epoch": 1.9195941201784783, + "grad_norm": 0.8484058976173401, + "learning_rate": 7.202923941279526e-05, + "loss": 2.5115, + "step": 21188 + }, + { + "epoch": 1.9196847183529253, + "grad_norm": 0.8691514134407043, + "learning_rate": 7.202319821180451e-05, + "loss": 2.6625, + "step": 21189 + }, + { + "epoch": 1.9197753165273719, + "grad_norm": 0.9424632787704468, + "learning_rate": 7.201715701081375e-05, + "loss": 2.7986, + "step": 21190 + }, + { + "epoch": 1.9198659147018189, + "grad_norm": 0.7917196750640869, + "learning_rate": 7.2011115809823e-05, + "loss": 2.1794, + "step": 21191 + }, + { + "epoch": 1.9199565128762655, + "grad_norm": 0.8996807336807251, + "learning_rate": 7.200507460883224e-05, + "loss": 2.6818, + "step": 21192 + }, + { + "epoch": 1.9200471110507125, + "grad_norm": 0.8709964156150818, + "learning_rate": 7.199903340784149e-05, + "loss": 2.5492, + "step": 21193 + }, + { + "epoch": 1.920137709225159, + "grad_norm": 0.8715016841888428, + "learning_rate": 7.199299220685072e-05, + "loss": 2.8555, + "step": 21194 + }, + { + "epoch": 1.920228307399606, + "grad_norm": 0.9005587697029114, + "learning_rate": 7.198695100585997e-05, + "loss": 2.9586, + "step": 21195 + }, + { + "epoch": 1.9203189055740526, + "grad_norm": 0.9432859420776367, + "learning_rate": 7.198090980486922e-05, + "loss": 2.7102, + "step": 21196 + }, + { + "epoch": 1.9204095037484996, + "grad_norm": 0.8880987167358398, + "learning_rate": 7.197486860387845e-05, + "loss": 2.7353, + "step": 21197 + }, + { + "epoch": 1.9205001019229462, + "grad_norm": 1.0359699726104736, + "learning_rate": 7.19688274028877e-05, + "loss": 2.8245, + "step": 21198 + }, + { + "epoch": 1.9205907000973932, + "grad_norm": 0.9212369918823242, + "learning_rate": 7.196278620189693e-05, + "loss": 2.5877, + "step": 21199 + }, + { + "epoch": 1.9206812982718398, + "grad_norm": 0.9342612028121948, + "learning_rate": 7.195674500090618e-05, + "loss": 2.9825, + "step": 21200 + }, + { + "epoch": 1.9207718964462868, + "grad_norm": 0.8905466794967651, + "learning_rate": 7.195070379991543e-05, + "loss": 2.7361, + "step": 21201 + }, + { + "epoch": 1.9208624946207333, + "grad_norm": 0.89704430103302, + "learning_rate": 7.194466259892468e-05, + "loss": 2.7541, + "step": 21202 + }, + { + "epoch": 1.9209530927951803, + "grad_norm": 0.8696275353431702, + "learning_rate": 7.193862139793391e-05, + "loss": 2.735, + "step": 21203 + }, + { + "epoch": 1.921043690969627, + "grad_norm": 0.9823219776153564, + "learning_rate": 7.193258019694316e-05, + "loss": 2.6664, + "step": 21204 + }, + { + "epoch": 1.9211342891440737, + "grad_norm": 0.9271928668022156, + "learning_rate": 7.192653899595239e-05, + "loss": 2.6274, + "step": 21205 + }, + { + "epoch": 1.9212248873185205, + "grad_norm": 0.9573991298675537, + "learning_rate": 7.192049779496164e-05, + "loss": 2.7763, + "step": 21206 + }, + { + "epoch": 1.9213154854929673, + "grad_norm": 0.903875470161438, + "learning_rate": 7.191445659397087e-05, + "loss": 2.91, + "step": 21207 + }, + { + "epoch": 1.921406083667414, + "grad_norm": 0.8217737078666687, + "learning_rate": 7.190841539298013e-05, + "loss": 1.9203, + "step": 21208 + }, + { + "epoch": 1.9214966818418608, + "grad_norm": 0.892660915851593, + "learning_rate": 7.190237419198937e-05, + "loss": 2.7562, + "step": 21209 + }, + { + "epoch": 1.9215872800163076, + "grad_norm": 0.8579511642456055, + "learning_rate": 7.189633299099862e-05, + "loss": 2.6384, + "step": 21210 + }, + { + "epoch": 1.9216778781907544, + "grad_norm": 0.9325515627861023, + "learning_rate": 7.189029179000786e-05, + "loss": 3.1261, + "step": 21211 + }, + { + "epoch": 1.9217684763652012, + "grad_norm": 0.8536458611488342, + "learning_rate": 7.18842505890171e-05, + "loss": 2.6509, + "step": 21212 + }, + { + "epoch": 1.921859074539648, + "grad_norm": 0.83998703956604, + "learning_rate": 7.187820938802635e-05, + "loss": 2.7098, + "step": 21213 + }, + { + "epoch": 1.9219496727140948, + "grad_norm": 0.9229088425636292, + "learning_rate": 7.187216818703558e-05, + "loss": 2.5674, + "step": 21214 + }, + { + "epoch": 1.9220402708885416, + "grad_norm": 0.925140917301178, + "learning_rate": 7.186612698604483e-05, + "loss": 2.7399, + "step": 21215 + }, + { + "epoch": 1.9221308690629884, + "grad_norm": 0.9429259896278381, + "learning_rate": 7.186008578505408e-05, + "loss": 2.6752, + "step": 21216 + }, + { + "epoch": 1.9222214672374351, + "grad_norm": 0.9403401613235474, + "learning_rate": 7.185404458406332e-05, + "loss": 2.6495, + "step": 21217 + }, + { + "epoch": 1.922312065411882, + "grad_norm": 0.9799927473068237, + "learning_rate": 7.184800338307256e-05, + "loss": 2.8164, + "step": 21218 + }, + { + "epoch": 1.9224026635863287, + "grad_norm": 0.7809609770774841, + "learning_rate": 7.18419621820818e-05, + "loss": 2.0972, + "step": 21219 + }, + { + "epoch": 1.9224932617607755, + "grad_norm": 0.9445986151695251, + "learning_rate": 7.183592098109104e-05, + "loss": 2.5928, + "step": 21220 + }, + { + "epoch": 1.9225838599352223, + "grad_norm": 0.8469727635383606, + "learning_rate": 7.182987978010029e-05, + "loss": 2.6312, + "step": 21221 + }, + { + "epoch": 1.922674458109669, + "grad_norm": 0.7905257344245911, + "learning_rate": 7.182383857910952e-05, + "loss": 1.9997, + "step": 21222 + }, + { + "epoch": 1.9227650562841159, + "grad_norm": 0.9562363028526306, + "learning_rate": 7.181779737811878e-05, + "loss": 3.0826, + "step": 21223 + }, + { + "epoch": 1.9228556544585627, + "grad_norm": 0.9116189479827881, + "learning_rate": 7.181175617712802e-05, + "loss": 2.7618, + "step": 21224 + }, + { + "epoch": 1.9229462526330094, + "grad_norm": 0.9221009612083435, + "learning_rate": 7.180571497613726e-05, + "loss": 2.6329, + "step": 21225 + }, + { + "epoch": 1.9230368508074562, + "grad_norm": 0.9435914158821106, + "learning_rate": 7.17996737751465e-05, + "loss": 2.8315, + "step": 21226 + }, + { + "epoch": 1.923127448981903, + "grad_norm": 0.6634804010391235, + "learning_rate": 7.179363257415574e-05, + "loss": 1.2656, + "step": 21227 + }, + { + "epoch": 1.9232180471563498, + "grad_norm": 0.9797996282577515, + "learning_rate": 7.178759137316499e-05, + "loss": 2.489, + "step": 21228 + }, + { + "epoch": 1.9233086453307966, + "grad_norm": 0.8168291449546814, + "learning_rate": 7.178155017217423e-05, + "loss": 2.0836, + "step": 21229 + }, + { + "epoch": 1.9233992435052434, + "grad_norm": 0.8880752325057983, + "learning_rate": 7.177550897118347e-05, + "loss": 2.5656, + "step": 21230 + }, + { + "epoch": 1.9234898416796902, + "grad_norm": 0.9075810313224792, + "learning_rate": 7.176946777019272e-05, + "loss": 2.703, + "step": 21231 + }, + { + "epoch": 1.923580439854137, + "grad_norm": 0.874496579170227, + "learning_rate": 7.176342656920197e-05, + "loss": 2.4872, + "step": 21232 + }, + { + "epoch": 1.9236710380285837, + "grad_norm": 0.8710805177688599, + "learning_rate": 7.17573853682112e-05, + "loss": 2.6634, + "step": 21233 + }, + { + "epoch": 1.9237616362030305, + "grad_norm": 0.8576568961143494, + "learning_rate": 7.175134416722045e-05, + "loss": 2.0314, + "step": 21234 + }, + { + "epoch": 1.9238522343774773, + "grad_norm": 0.8774252533912659, + "learning_rate": 7.174530296622968e-05, + "loss": 2.7829, + "step": 21235 + }, + { + "epoch": 1.923942832551924, + "grad_norm": 0.9146538972854614, + "learning_rate": 7.173926176523893e-05, + "loss": 2.649, + "step": 21236 + }, + { + "epoch": 1.924033430726371, + "grad_norm": 0.9173739552497864, + "learning_rate": 7.173322056424817e-05, + "loss": 2.6501, + "step": 21237 + }, + { + "epoch": 1.9241240289008177, + "grad_norm": 0.8958026766777039, + "learning_rate": 7.172717936325743e-05, + "loss": 2.4237, + "step": 21238 + }, + { + "epoch": 1.9242146270752645, + "grad_norm": 0.7626826763153076, + "learning_rate": 7.172113816226666e-05, + "loss": 1.9143, + "step": 21239 + }, + { + "epoch": 1.9243052252497113, + "grad_norm": 0.9008208513259888, + "learning_rate": 7.171509696127591e-05, + "loss": 2.7004, + "step": 21240 + }, + { + "epoch": 1.924395823424158, + "grad_norm": 0.896770179271698, + "learning_rate": 7.170905576028514e-05, + "loss": 2.8493, + "step": 21241 + }, + { + "epoch": 1.9244864215986048, + "grad_norm": 0.9740801453590393, + "learning_rate": 7.170301455929439e-05, + "loss": 2.7302, + "step": 21242 + }, + { + "epoch": 1.9245770197730516, + "grad_norm": 0.777015745639801, + "learning_rate": 7.169697335830364e-05, + "loss": 2.0362, + "step": 21243 + }, + { + "epoch": 1.9246676179474984, + "grad_norm": 0.9409393072128296, + "learning_rate": 7.169093215731287e-05, + "loss": 2.7408, + "step": 21244 + }, + { + "epoch": 1.9247582161219452, + "grad_norm": 0.8755160570144653, + "learning_rate": 7.168489095632212e-05, + "loss": 2.9201, + "step": 21245 + }, + { + "epoch": 1.924848814296392, + "grad_norm": 0.9147483110427856, + "learning_rate": 7.167884975533137e-05, + "loss": 2.6531, + "step": 21246 + }, + { + "epoch": 1.9249394124708386, + "grad_norm": 0.7765094637870789, + "learning_rate": 7.167280855434062e-05, + "loss": 2.0265, + "step": 21247 + }, + { + "epoch": 1.9250300106452856, + "grad_norm": 0.8675411939620972, + "learning_rate": 7.166676735334985e-05, + "loss": 3.0294, + "step": 21248 + }, + { + "epoch": 1.9251206088197321, + "grad_norm": 0.8885800242424011, + "learning_rate": 7.16607261523591e-05, + "loss": 2.4423, + "step": 21249 + }, + { + "epoch": 1.9252112069941791, + "grad_norm": 0.8469780683517456, + "learning_rate": 7.165468495136833e-05, + "loss": 2.621, + "step": 21250 + }, + { + "epoch": 1.9253018051686257, + "grad_norm": 0.848438024520874, + "learning_rate": 7.164864375037758e-05, + "loss": 2.8238, + "step": 21251 + }, + { + "epoch": 1.9253924033430727, + "grad_norm": 0.9949032068252563, + "learning_rate": 7.164260254938681e-05, + "loss": 2.6748, + "step": 21252 + }, + { + "epoch": 1.9254830015175193, + "grad_norm": 0.9420497417449951, + "learning_rate": 7.163656134839607e-05, + "loss": 2.7801, + "step": 21253 + }, + { + "epoch": 1.9255735996919663, + "grad_norm": 0.8130595684051514, + "learning_rate": 7.163052014740531e-05, + "loss": 2.0879, + "step": 21254 + }, + { + "epoch": 1.9256641978664129, + "grad_norm": 0.7445350885391235, + "learning_rate": 7.162447894641456e-05, + "loss": 1.9169, + "step": 21255 + }, + { + "epoch": 1.9257547960408599, + "grad_norm": 0.9177160859107971, + "learning_rate": 7.161843774542379e-05, + "loss": 2.5624, + "step": 21256 + }, + { + "epoch": 1.9258453942153064, + "grad_norm": 0.8769016265869141, + "learning_rate": 7.161239654443304e-05, + "loss": 2.6415, + "step": 21257 + }, + { + "epoch": 1.9259359923897534, + "grad_norm": 1.0065209865570068, + "learning_rate": 7.160635534344227e-05, + "loss": 2.6239, + "step": 21258 + }, + { + "epoch": 1.9260265905642, + "grad_norm": 0.9516480565071106, + "learning_rate": 7.160031414245152e-05, + "loss": 2.7394, + "step": 21259 + }, + { + "epoch": 1.926117188738647, + "grad_norm": 0.9523574709892273, + "learning_rate": 7.159427294146077e-05, + "loss": 3.0805, + "step": 21260 + }, + { + "epoch": 1.9262077869130936, + "grad_norm": 0.900451123714447, + "learning_rate": 7.158823174047001e-05, + "loss": 2.5668, + "step": 21261 + }, + { + "epoch": 1.9262983850875406, + "grad_norm": 0.9103401899337769, + "learning_rate": 7.158219053947925e-05, + "loss": 2.8534, + "step": 21262 + }, + { + "epoch": 1.9263889832619872, + "grad_norm": 0.8852167129516602, + "learning_rate": 7.15761493384885e-05, + "loss": 2.6929, + "step": 21263 + }, + { + "epoch": 1.9264795814364342, + "grad_norm": 0.9349122047424316, + "learning_rate": 7.157010813749774e-05, + "loss": 2.8636, + "step": 21264 + }, + { + "epoch": 1.9265701796108807, + "grad_norm": 0.9020360708236694, + "learning_rate": 7.156406693650698e-05, + "loss": 2.5508, + "step": 21265 + }, + { + "epoch": 1.9266607777853277, + "grad_norm": 0.9815864562988281, + "learning_rate": 7.155802573551622e-05, + "loss": 2.8664, + "step": 21266 + }, + { + "epoch": 1.9267513759597743, + "grad_norm": 0.954453706741333, + "learning_rate": 7.155198453452546e-05, + "loss": 2.5692, + "step": 21267 + }, + { + "epoch": 1.9268419741342213, + "grad_norm": 0.9249866604804993, + "learning_rate": 7.154594333353472e-05, + "loss": 2.5719, + "step": 21268 + }, + { + "epoch": 1.9269325723086679, + "grad_norm": 0.8837669491767883, + "learning_rate": 7.153990213254395e-05, + "loss": 2.8357, + "step": 21269 + }, + { + "epoch": 1.9270231704831149, + "grad_norm": 0.8070549368858337, + "learning_rate": 7.15338609315532e-05, + "loss": 2.246, + "step": 21270 + }, + { + "epoch": 1.9271137686575615, + "grad_norm": 0.8424904346466064, + "learning_rate": 7.152781973056244e-05, + "loss": 2.7367, + "step": 21271 + }, + { + "epoch": 1.9272043668320085, + "grad_norm": 0.9153093695640564, + "learning_rate": 7.152177852957168e-05, + "loss": 2.6209, + "step": 21272 + }, + { + "epoch": 1.927294965006455, + "grad_norm": 0.9378952980041504, + "learning_rate": 7.151573732858092e-05, + "loss": 2.6397, + "step": 21273 + }, + { + "epoch": 1.927385563180902, + "grad_norm": 0.9775789380073547, + "learning_rate": 7.150969612759017e-05, + "loss": 2.6563, + "step": 21274 + }, + { + "epoch": 1.9274761613553486, + "grad_norm": 0.932372510433197, + "learning_rate": 7.150365492659941e-05, + "loss": 2.821, + "step": 21275 + }, + { + "epoch": 1.9275667595297956, + "grad_norm": 0.935834527015686, + "learning_rate": 7.149761372560866e-05, + "loss": 2.9152, + "step": 21276 + }, + { + "epoch": 1.9276573577042422, + "grad_norm": 0.8445051908493042, + "learning_rate": 7.14915725246179e-05, + "loss": 2.5728, + "step": 21277 + }, + { + "epoch": 1.9277479558786892, + "grad_norm": 0.881183922290802, + "learning_rate": 7.148553132362714e-05, + "loss": 2.6899, + "step": 21278 + }, + { + "epoch": 1.9278385540531358, + "grad_norm": 0.9241178631782532, + "learning_rate": 7.147949012263639e-05, + "loss": 2.9246, + "step": 21279 + }, + { + "epoch": 1.9279291522275828, + "grad_norm": 0.9430112838745117, + "learning_rate": 7.147344892164562e-05, + "loss": 2.4944, + "step": 21280 + }, + { + "epoch": 1.9280197504020293, + "grad_norm": 0.8668832778930664, + "learning_rate": 7.146740772065487e-05, + "loss": 2.344, + "step": 21281 + }, + { + "epoch": 1.9281103485764763, + "grad_norm": 0.8387733101844788, + "learning_rate": 7.14613665196641e-05, + "loss": 2.6832, + "step": 21282 + }, + { + "epoch": 1.928200946750923, + "grad_norm": 0.9595854878425598, + "learning_rate": 7.145532531867337e-05, + "loss": 2.9668, + "step": 21283 + }, + { + "epoch": 1.92829154492537, + "grad_norm": 0.8483068346977234, + "learning_rate": 7.14492841176826e-05, + "loss": 1.9965, + "step": 21284 + }, + { + "epoch": 1.9283821430998165, + "grad_norm": 0.8767257928848267, + "learning_rate": 7.144324291669185e-05, + "loss": 2.8731, + "step": 21285 + }, + { + "epoch": 1.9284727412742633, + "grad_norm": 0.8388242721557617, + "learning_rate": 7.143720171570108e-05, + "loss": 2.6384, + "step": 21286 + }, + { + "epoch": 1.92856333944871, + "grad_norm": 0.8621494174003601, + "learning_rate": 7.143116051471033e-05, + "loss": 2.5156, + "step": 21287 + }, + { + "epoch": 1.9286539376231568, + "grad_norm": 0.876083254814148, + "learning_rate": 7.142511931371956e-05, + "loss": 2.8098, + "step": 21288 + }, + { + "epoch": 1.9287445357976036, + "grad_norm": 0.9384536147117615, + "learning_rate": 7.141907811272881e-05, + "loss": 2.6788, + "step": 21289 + }, + { + "epoch": 1.9288351339720504, + "grad_norm": 0.9063676595687866, + "learning_rate": 7.141303691173805e-05, + "loss": 2.855, + "step": 21290 + }, + { + "epoch": 1.9289257321464972, + "grad_norm": 0.8935407996177673, + "learning_rate": 7.14069957107473e-05, + "loss": 2.8366, + "step": 21291 + }, + { + "epoch": 1.929016330320944, + "grad_norm": 0.931136429309845, + "learning_rate": 7.140095450975654e-05, + "loss": 2.5936, + "step": 21292 + }, + { + "epoch": 1.9291069284953908, + "grad_norm": 0.8956623077392578, + "learning_rate": 7.139491330876579e-05, + "loss": 2.8328, + "step": 21293 + }, + { + "epoch": 1.9291975266698376, + "grad_norm": 0.9321392178535461, + "learning_rate": 7.138887210777502e-05, + "loss": 2.803, + "step": 21294 + }, + { + "epoch": 1.9292881248442844, + "grad_norm": 0.8598942756652832, + "learning_rate": 7.138283090678427e-05, + "loss": 2.7003, + "step": 21295 + }, + { + "epoch": 1.9293787230187311, + "grad_norm": 0.8457760810852051, + "learning_rate": 7.137678970579352e-05, + "loss": 1.9115, + "step": 21296 + }, + { + "epoch": 1.929469321193178, + "grad_norm": 0.9321707487106323, + "learning_rate": 7.137074850480275e-05, + "loss": 3.0341, + "step": 21297 + }, + { + "epoch": 1.9295599193676247, + "grad_norm": 0.8669061064720154, + "learning_rate": 7.136470730381201e-05, + "loss": 2.8027, + "step": 21298 + }, + { + "epoch": 1.9296505175420715, + "grad_norm": 0.8393198251724243, + "learning_rate": 7.135866610282125e-05, + "loss": 2.7012, + "step": 21299 + }, + { + "epoch": 1.9297411157165183, + "grad_norm": 0.861809492111206, + "learning_rate": 7.13526249018305e-05, + "loss": 2.6858, + "step": 21300 + }, + { + "epoch": 1.929831713890965, + "grad_norm": 0.8665444850921631, + "learning_rate": 7.134658370083973e-05, + "loss": 2.6034, + "step": 21301 + }, + { + "epoch": 1.9299223120654119, + "grad_norm": 0.8497835397720337, + "learning_rate": 7.134054249984898e-05, + "loss": 2.8261, + "step": 21302 + }, + { + "epoch": 1.9300129102398587, + "grad_norm": 0.8874651193618774, + "learning_rate": 7.133450129885821e-05, + "loss": 2.6423, + "step": 21303 + }, + { + "epoch": 1.9301035084143054, + "grad_norm": 0.9315003156661987, + "learning_rate": 7.132846009786746e-05, + "loss": 2.6272, + "step": 21304 + }, + { + "epoch": 1.9301941065887522, + "grad_norm": 0.958508312702179, + "learning_rate": 7.132241889687669e-05, + "loss": 2.7677, + "step": 21305 + }, + { + "epoch": 1.930284704763199, + "grad_norm": 1.0374410152435303, + "learning_rate": 7.131637769588595e-05, + "loss": 2.6858, + "step": 21306 + }, + { + "epoch": 1.9303753029376458, + "grad_norm": 1.0440093278884888, + "learning_rate": 7.131033649489519e-05, + "loss": 2.671, + "step": 21307 + }, + { + "epoch": 1.9304659011120926, + "grad_norm": 0.9392465353012085, + "learning_rate": 7.130429529390443e-05, + "loss": 2.7855, + "step": 21308 + }, + { + "epoch": 1.9305564992865394, + "grad_norm": 0.8151550889015198, + "learning_rate": 7.129825409291367e-05, + "loss": 2.1772, + "step": 21309 + }, + { + "epoch": 1.9306470974609862, + "grad_norm": 0.9076350331306458, + "learning_rate": 7.129221289192292e-05, + "loss": 2.7478, + "step": 21310 + }, + { + "epoch": 1.930737695635433, + "grad_norm": 0.9976247549057007, + "learning_rate": 7.128617169093216e-05, + "loss": 2.7162, + "step": 21311 + }, + { + "epoch": 1.9308282938098797, + "grad_norm": 0.8881407380104065, + "learning_rate": 7.12801304899414e-05, + "loss": 2.852, + "step": 21312 + }, + { + "epoch": 1.9309188919843265, + "grad_norm": 0.7737486362457275, + "learning_rate": 7.127408928895065e-05, + "loss": 2.0729, + "step": 21313 + }, + { + "epoch": 1.9310094901587733, + "grad_norm": 0.8918362259864807, + "learning_rate": 7.126804808795989e-05, + "loss": 2.718, + "step": 21314 + }, + { + "epoch": 1.93110008833322, + "grad_norm": 0.928361177444458, + "learning_rate": 7.126200688696914e-05, + "loss": 2.7083, + "step": 21315 + }, + { + "epoch": 1.931190686507667, + "grad_norm": 0.9344384074211121, + "learning_rate": 7.125596568597837e-05, + "loss": 2.7254, + "step": 21316 + }, + { + "epoch": 1.9312812846821137, + "grad_norm": 0.8628073930740356, + "learning_rate": 7.124992448498762e-05, + "loss": 2.2003, + "step": 21317 + }, + { + "epoch": 1.9313718828565605, + "grad_norm": 0.9359415173530579, + "learning_rate": 7.124388328399686e-05, + "loss": 2.8308, + "step": 21318 + }, + { + "epoch": 1.9314624810310073, + "grad_norm": 0.9388156533241272, + "learning_rate": 7.12378420830061e-05, + "loss": 2.6786, + "step": 21319 + }, + { + "epoch": 1.931553079205454, + "grad_norm": 0.8870775699615479, + "learning_rate": 7.123180088201534e-05, + "loss": 2.6541, + "step": 21320 + }, + { + "epoch": 1.9316436773799008, + "grad_norm": 0.9402648210525513, + "learning_rate": 7.12257596810246e-05, + "loss": 2.8304, + "step": 21321 + }, + { + "epoch": 1.9317342755543476, + "grad_norm": 0.9936944246292114, + "learning_rate": 7.121971848003383e-05, + "loss": 2.7285, + "step": 21322 + }, + { + "epoch": 1.9318248737287944, + "grad_norm": 0.9694564938545227, + "learning_rate": 7.121367727904308e-05, + "loss": 2.777, + "step": 21323 + }, + { + "epoch": 1.9319154719032412, + "grad_norm": 1.001037359237671, + "learning_rate": 7.120763607805231e-05, + "loss": 2.5013, + "step": 21324 + }, + { + "epoch": 1.932006070077688, + "grad_norm": 0.8885409832000732, + "learning_rate": 7.120159487706156e-05, + "loss": 2.6744, + "step": 21325 + }, + { + "epoch": 1.9320966682521348, + "grad_norm": 0.9482621550559998, + "learning_rate": 7.11955536760708e-05, + "loss": 2.7654, + "step": 21326 + }, + { + "epoch": 1.9321872664265816, + "grad_norm": 0.90577632188797, + "learning_rate": 7.118951247508004e-05, + "loss": 2.6841, + "step": 21327 + }, + { + "epoch": 1.9322778646010281, + "grad_norm": 0.8854788541793823, + "learning_rate": 7.118347127408929e-05, + "loss": 2.8738, + "step": 21328 + }, + { + "epoch": 1.9323684627754751, + "grad_norm": 0.8994242548942566, + "learning_rate": 7.117743007309854e-05, + "loss": 2.5926, + "step": 21329 + }, + { + "epoch": 1.9324590609499217, + "grad_norm": 0.865674614906311, + "learning_rate": 7.117138887210779e-05, + "loss": 2.6988, + "step": 21330 + }, + { + "epoch": 1.9325496591243687, + "grad_norm": 0.840247392654419, + "learning_rate": 7.116534767111702e-05, + "loss": 2.6045, + "step": 21331 + }, + { + "epoch": 1.9326402572988153, + "grad_norm": 0.9084627032279968, + "learning_rate": 7.115930647012627e-05, + "loss": 2.6195, + "step": 21332 + }, + { + "epoch": 1.9327308554732623, + "grad_norm": 0.8896356821060181, + "learning_rate": 7.11532652691355e-05, + "loss": 2.6154, + "step": 21333 + }, + { + "epoch": 1.9328214536477089, + "grad_norm": 0.9728823900222778, + "learning_rate": 7.114722406814475e-05, + "loss": 2.4797, + "step": 21334 + }, + { + "epoch": 1.9329120518221559, + "grad_norm": 0.9380890130996704, + "learning_rate": 7.114118286715398e-05, + "loss": 2.9261, + "step": 21335 + }, + { + "epoch": 1.9330026499966024, + "grad_norm": 0.8638208508491516, + "learning_rate": 7.113514166616325e-05, + "loss": 2.6311, + "step": 21336 + }, + { + "epoch": 1.9330932481710494, + "grad_norm": 0.8397929072380066, + "learning_rate": 7.112910046517248e-05, + "loss": 2.4148, + "step": 21337 + }, + { + "epoch": 1.933183846345496, + "grad_norm": 0.8735815286636353, + "learning_rate": 7.112305926418173e-05, + "loss": 2.597, + "step": 21338 + }, + { + "epoch": 1.933274444519943, + "grad_norm": 0.8567140698432922, + "learning_rate": 7.111701806319096e-05, + "loss": 2.767, + "step": 21339 + }, + { + "epoch": 1.9333650426943896, + "grad_norm": 0.8873973488807678, + "learning_rate": 7.111097686220021e-05, + "loss": 2.8064, + "step": 21340 + }, + { + "epoch": 1.9334556408688366, + "grad_norm": 0.887466311454773, + "learning_rate": 7.110493566120944e-05, + "loss": 2.7545, + "step": 21341 + }, + { + "epoch": 1.9335462390432832, + "grad_norm": 0.9424914717674255, + "learning_rate": 7.109889446021869e-05, + "loss": 2.7147, + "step": 21342 + }, + { + "epoch": 1.9336368372177302, + "grad_norm": 0.8709929585456848, + "learning_rate": 7.109285325922794e-05, + "loss": 2.8453, + "step": 21343 + }, + { + "epoch": 1.9337274353921767, + "grad_norm": 0.8652706742286682, + "learning_rate": 7.108681205823719e-05, + "loss": 2.972, + "step": 21344 + }, + { + "epoch": 1.9338180335666237, + "grad_norm": 0.9211491346359253, + "learning_rate": 7.108077085724642e-05, + "loss": 2.8533, + "step": 21345 + }, + { + "epoch": 1.9339086317410703, + "grad_norm": 1.0402369499206543, + "learning_rate": 7.107472965625567e-05, + "loss": 2.6475, + "step": 21346 + }, + { + "epoch": 1.9339992299155173, + "grad_norm": 0.9256344437599182, + "learning_rate": 7.106868845526492e-05, + "loss": 2.1182, + "step": 21347 + }, + { + "epoch": 1.9340898280899639, + "grad_norm": 0.9045554399490356, + "learning_rate": 7.106264725427415e-05, + "loss": 2.7926, + "step": 21348 + }, + { + "epoch": 1.934180426264411, + "grad_norm": 0.8851720690727234, + "learning_rate": 7.10566060532834e-05, + "loss": 2.7147, + "step": 21349 + }, + { + "epoch": 1.9342710244388575, + "grad_norm": 0.8849565386772156, + "learning_rate": 7.105056485229263e-05, + "loss": 2.6687, + "step": 21350 + }, + { + "epoch": 1.9343616226133045, + "grad_norm": 0.8391456604003906, + "learning_rate": 7.104452365130189e-05, + "loss": 2.3813, + "step": 21351 + }, + { + "epoch": 1.934452220787751, + "grad_norm": 0.8745900392532349, + "learning_rate": 7.103848245031113e-05, + "loss": 2.6556, + "step": 21352 + }, + { + "epoch": 1.934542818962198, + "grad_norm": 0.8675061464309692, + "learning_rate": 7.103244124932037e-05, + "loss": 2.6916, + "step": 21353 + }, + { + "epoch": 1.9346334171366446, + "grad_norm": 0.8572285175323486, + "learning_rate": 7.102640004832961e-05, + "loss": 2.6374, + "step": 21354 + }, + { + "epoch": 1.9347240153110916, + "grad_norm": 0.9203149676322937, + "learning_rate": 7.102035884733886e-05, + "loss": 2.7093, + "step": 21355 + }, + { + "epoch": 1.9348146134855382, + "grad_norm": 0.8515616655349731, + "learning_rate": 7.101431764634809e-05, + "loss": 2.0557, + "step": 21356 + }, + { + "epoch": 1.9349052116599852, + "grad_norm": 0.9434120655059814, + "learning_rate": 7.100827644535734e-05, + "loss": 2.6112, + "step": 21357 + }, + { + "epoch": 1.9349958098344318, + "grad_norm": 0.9333325624465942, + "learning_rate": 7.100223524436658e-05, + "loss": 2.5193, + "step": 21358 + }, + { + "epoch": 1.9350864080088788, + "grad_norm": 0.9828762412071228, + "learning_rate": 7.099619404337583e-05, + "loss": 2.6035, + "step": 21359 + }, + { + "epoch": 1.9351770061833253, + "grad_norm": 0.9533191323280334, + "learning_rate": 7.099015284238507e-05, + "loss": 2.8972, + "step": 21360 + }, + { + "epoch": 1.9352676043577723, + "grad_norm": 0.8665292859077454, + "learning_rate": 7.098411164139431e-05, + "loss": 2.5438, + "step": 21361 + }, + { + "epoch": 1.935358202532219, + "grad_norm": 0.9272143840789795, + "learning_rate": 7.097807044040355e-05, + "loss": 2.92, + "step": 21362 + }, + { + "epoch": 1.935448800706666, + "grad_norm": 0.8996096253395081, + "learning_rate": 7.09720292394128e-05, + "loss": 2.6926, + "step": 21363 + }, + { + "epoch": 1.9355393988811125, + "grad_norm": 0.8554601073265076, + "learning_rate": 7.096598803842204e-05, + "loss": 2.4862, + "step": 21364 + }, + { + "epoch": 1.9356299970555595, + "grad_norm": 0.8049933314323425, + "learning_rate": 7.095994683743128e-05, + "loss": 1.9182, + "step": 21365 + }, + { + "epoch": 1.935720595230006, + "grad_norm": 1.009906530380249, + "learning_rate": 7.095390563644054e-05, + "loss": 2.6024, + "step": 21366 + }, + { + "epoch": 1.9358111934044528, + "grad_norm": 0.8859551548957825, + "learning_rate": 7.094786443544977e-05, + "loss": 2.8596, + "step": 21367 + }, + { + "epoch": 1.9359017915788996, + "grad_norm": 0.9429709911346436, + "learning_rate": 7.094182323445902e-05, + "loss": 2.6834, + "step": 21368 + }, + { + "epoch": 1.9359923897533464, + "grad_norm": 1.0627959966659546, + "learning_rate": 7.093578203346825e-05, + "loss": 2.7982, + "step": 21369 + }, + { + "epoch": 1.9360829879277932, + "grad_norm": 0.8865575194358826, + "learning_rate": 7.09297408324775e-05, + "loss": 2.7947, + "step": 21370 + }, + { + "epoch": 1.93617358610224, + "grad_norm": 0.9483733177185059, + "learning_rate": 7.092369963148674e-05, + "loss": 2.7618, + "step": 21371 + }, + { + "epoch": 1.9362641842766868, + "grad_norm": 0.7902035713195801, + "learning_rate": 7.091765843049598e-05, + "loss": 1.9495, + "step": 21372 + }, + { + "epoch": 1.9363547824511336, + "grad_norm": 0.8846361041069031, + "learning_rate": 7.091161722950523e-05, + "loss": 2.6718, + "step": 21373 + }, + { + "epoch": 1.9364453806255804, + "grad_norm": 0.7651477456092834, + "learning_rate": 7.090557602851448e-05, + "loss": 2.0256, + "step": 21374 + }, + { + "epoch": 1.9365359788000271, + "grad_norm": 0.8645923137664795, + "learning_rate": 7.089953482752371e-05, + "loss": 2.9843, + "step": 21375 + }, + { + "epoch": 1.936626576974474, + "grad_norm": 0.9062924981117249, + "learning_rate": 7.089349362653296e-05, + "loss": 2.8684, + "step": 21376 + }, + { + "epoch": 1.9367171751489207, + "grad_norm": 0.8640046715736389, + "learning_rate": 7.08874524255422e-05, + "loss": 2.652, + "step": 21377 + }, + { + "epoch": 1.9368077733233675, + "grad_norm": 0.9432845115661621, + "learning_rate": 7.088141122455144e-05, + "loss": 2.6365, + "step": 21378 + }, + { + "epoch": 1.9368983714978143, + "grad_norm": 0.8522267937660217, + "learning_rate": 7.087537002356069e-05, + "loss": 2.864, + "step": 21379 + }, + { + "epoch": 1.936988969672261, + "grad_norm": 0.895315945148468, + "learning_rate": 7.086932882256992e-05, + "loss": 2.5732, + "step": 21380 + }, + { + "epoch": 1.9370795678467079, + "grad_norm": 0.855515718460083, + "learning_rate": 7.086328762157917e-05, + "loss": 2.6627, + "step": 21381 + }, + { + "epoch": 1.9371701660211547, + "grad_norm": 0.8708662986755371, + "learning_rate": 7.085724642058842e-05, + "loss": 2.9314, + "step": 21382 + }, + { + "epoch": 1.9372607641956014, + "grad_norm": 0.8225528001785278, + "learning_rate": 7.085120521959767e-05, + "loss": 2.2275, + "step": 21383 + }, + { + "epoch": 1.9373513623700482, + "grad_norm": 0.8703046441078186, + "learning_rate": 7.08451640186069e-05, + "loss": 2.4683, + "step": 21384 + }, + { + "epoch": 1.937441960544495, + "grad_norm": 0.9151303768157959, + "learning_rate": 7.083912281761615e-05, + "loss": 2.5415, + "step": 21385 + }, + { + "epoch": 1.9375325587189418, + "grad_norm": 0.9024950265884399, + "learning_rate": 7.083308161662538e-05, + "loss": 2.7036, + "step": 21386 + }, + { + "epoch": 1.9376231568933886, + "grad_norm": 0.7311999797821045, + "learning_rate": 7.082704041563463e-05, + "loss": 2.0708, + "step": 21387 + }, + { + "epoch": 1.9377137550678354, + "grad_norm": 0.9047617316246033, + "learning_rate": 7.082099921464388e-05, + "loss": 2.1067, + "step": 21388 + }, + { + "epoch": 1.9378043532422822, + "grad_norm": 0.9018579125404358, + "learning_rate": 7.081495801365312e-05, + "loss": 2.7379, + "step": 21389 + }, + { + "epoch": 1.937894951416729, + "grad_norm": 0.940284252166748, + "learning_rate": 7.080891681266236e-05, + "loss": 2.9155, + "step": 21390 + }, + { + "epoch": 1.9379855495911757, + "grad_norm": 0.8981332182884216, + "learning_rate": 7.08028756116716e-05, + "loss": 2.8123, + "step": 21391 + }, + { + "epoch": 1.9380761477656225, + "grad_norm": 0.8742023706436157, + "learning_rate": 7.079683441068084e-05, + "loss": 2.7376, + "step": 21392 + }, + { + "epoch": 1.9381667459400693, + "grad_norm": 0.9269112348556519, + "learning_rate": 7.079079320969009e-05, + "loss": 2.7282, + "step": 21393 + }, + { + "epoch": 1.9382573441145161, + "grad_norm": 0.8950915336608887, + "learning_rate": 7.078475200869932e-05, + "loss": 2.8423, + "step": 21394 + }, + { + "epoch": 1.938347942288963, + "grad_norm": 0.8729124665260315, + "learning_rate": 7.077871080770857e-05, + "loss": 2.965, + "step": 21395 + }, + { + "epoch": 1.9384385404634097, + "grad_norm": 0.9066458940505981, + "learning_rate": 7.077266960671782e-05, + "loss": 2.7452, + "step": 21396 + }, + { + "epoch": 1.9385291386378565, + "grad_norm": 0.7679815888404846, + "learning_rate": 7.076662840572706e-05, + "loss": 2.2214, + "step": 21397 + }, + { + "epoch": 1.9386197368123033, + "grad_norm": 0.8779614567756653, + "learning_rate": 7.076058720473631e-05, + "loss": 2.616, + "step": 21398 + }, + { + "epoch": 1.93871033498675, + "grad_norm": 0.8911440968513489, + "learning_rate": 7.075454600374555e-05, + "loss": 2.7208, + "step": 21399 + }, + { + "epoch": 1.9388009331611968, + "grad_norm": 0.9036619663238525, + "learning_rate": 7.07485048027548e-05, + "loss": 2.9055, + "step": 21400 + }, + { + "epoch": 1.9388915313356436, + "grad_norm": 0.8615651726722717, + "learning_rate": 7.074246360176403e-05, + "loss": 2.6103, + "step": 21401 + }, + { + "epoch": 1.9389821295100904, + "grad_norm": 0.8372489809989929, + "learning_rate": 7.073642240077328e-05, + "loss": 2.6156, + "step": 21402 + }, + { + "epoch": 1.9390727276845372, + "grad_norm": 0.8851627111434937, + "learning_rate": 7.073038119978252e-05, + "loss": 2.8088, + "step": 21403 + }, + { + "epoch": 1.939163325858984, + "grad_norm": 0.8053268790245056, + "learning_rate": 7.072433999879177e-05, + "loss": 2.0074, + "step": 21404 + }, + { + "epoch": 1.9392539240334308, + "grad_norm": 0.9710307121276855, + "learning_rate": 7.0718298797801e-05, + "loss": 2.6236, + "step": 21405 + }, + { + "epoch": 1.9393445222078776, + "grad_norm": 0.9035629630088806, + "learning_rate": 7.071225759681025e-05, + "loss": 2.6447, + "step": 21406 + }, + { + "epoch": 1.9394351203823244, + "grad_norm": 0.9382756948471069, + "learning_rate": 7.070621639581949e-05, + "loss": 2.7014, + "step": 21407 + }, + { + "epoch": 1.9395257185567711, + "grad_norm": 0.942621111869812, + "learning_rate": 7.070017519482873e-05, + "loss": 2.8253, + "step": 21408 + }, + { + "epoch": 1.9396163167312177, + "grad_norm": 0.8699255585670471, + "learning_rate": 7.069413399383797e-05, + "loss": 2.5695, + "step": 21409 + }, + { + "epoch": 1.9397069149056647, + "grad_norm": 0.902031660079956, + "learning_rate": 7.068809279284723e-05, + "loss": 2.6811, + "step": 21410 + }, + { + "epoch": 1.9397975130801113, + "grad_norm": 0.8775182366371155, + "learning_rate": 7.068205159185646e-05, + "loss": 2.5411, + "step": 21411 + }, + { + "epoch": 1.9398881112545583, + "grad_norm": 0.9340461492538452, + "learning_rate": 7.067601039086571e-05, + "loss": 2.7419, + "step": 21412 + }, + { + "epoch": 1.9399787094290049, + "grad_norm": 0.8339435458183289, + "learning_rate": 7.066996918987495e-05, + "loss": 2.416, + "step": 21413 + }, + { + "epoch": 1.9400693076034519, + "grad_norm": 0.8453670144081116, + "learning_rate": 7.066392798888419e-05, + "loss": 2.8875, + "step": 21414 + }, + { + "epoch": 1.9401599057778984, + "grad_norm": 0.9061565399169922, + "learning_rate": 7.065788678789344e-05, + "loss": 2.7833, + "step": 21415 + }, + { + "epoch": 1.9402505039523454, + "grad_norm": 0.8793354034423828, + "learning_rate": 7.065184558690267e-05, + "loss": 2.7685, + "step": 21416 + }, + { + "epoch": 1.940341102126792, + "grad_norm": 0.8498323559761047, + "learning_rate": 7.064580438591192e-05, + "loss": 2.6619, + "step": 21417 + }, + { + "epoch": 1.940431700301239, + "grad_norm": 0.9289116859436035, + "learning_rate": 7.063976318492117e-05, + "loss": 2.9413, + "step": 21418 + }, + { + "epoch": 1.9405222984756856, + "grad_norm": 0.9009135961532593, + "learning_rate": 7.063372198393042e-05, + "loss": 2.7385, + "step": 21419 + }, + { + "epoch": 1.9406128966501326, + "grad_norm": 0.7682763934135437, + "learning_rate": 7.062768078293965e-05, + "loss": 2.0683, + "step": 21420 + }, + { + "epoch": 1.9407034948245792, + "grad_norm": 0.9063001275062561, + "learning_rate": 7.06216395819489e-05, + "loss": 2.9112, + "step": 21421 + }, + { + "epoch": 1.9407940929990262, + "grad_norm": 0.9150256514549255, + "learning_rate": 7.061559838095813e-05, + "loss": 2.9861, + "step": 21422 + }, + { + "epoch": 1.9408846911734727, + "grad_norm": 0.854620635509491, + "learning_rate": 7.060955717996738e-05, + "loss": 2.5568, + "step": 21423 + }, + { + "epoch": 1.9409752893479197, + "grad_norm": 0.9944232106208801, + "learning_rate": 7.060351597897661e-05, + "loss": 2.5409, + "step": 21424 + }, + { + "epoch": 1.9410658875223663, + "grad_norm": 0.8595946431159973, + "learning_rate": 7.059747477798588e-05, + "loss": 2.7257, + "step": 21425 + }, + { + "epoch": 1.9411564856968133, + "grad_norm": 0.8603255748748779, + "learning_rate": 7.059143357699511e-05, + "loss": 2.6703, + "step": 21426 + }, + { + "epoch": 1.9412470838712599, + "grad_norm": 0.7599997520446777, + "learning_rate": 7.058539237600436e-05, + "loss": 1.8709, + "step": 21427 + }, + { + "epoch": 1.941337682045707, + "grad_norm": 0.8609932065010071, + "learning_rate": 7.057935117501359e-05, + "loss": 2.7505, + "step": 21428 + }, + { + "epoch": 1.9414282802201535, + "grad_norm": 0.9618818759918213, + "learning_rate": 7.057330997402284e-05, + "loss": 2.8924, + "step": 21429 + }, + { + "epoch": 1.9415188783946005, + "grad_norm": 1.1141657829284668, + "learning_rate": 7.056726877303209e-05, + "loss": 2.7981, + "step": 21430 + }, + { + "epoch": 1.941609476569047, + "grad_norm": 0.9019598364830017, + "learning_rate": 7.056122757204132e-05, + "loss": 2.6422, + "step": 21431 + }, + { + "epoch": 1.941700074743494, + "grad_norm": 0.9461126923561096, + "learning_rate": 7.055518637105057e-05, + "loss": 2.5725, + "step": 21432 + }, + { + "epoch": 1.9417906729179406, + "grad_norm": 0.9745409488677979, + "learning_rate": 7.054914517005982e-05, + "loss": 2.8413, + "step": 21433 + }, + { + "epoch": 1.9418812710923876, + "grad_norm": 0.817601203918457, + "learning_rate": 7.054310396906906e-05, + "loss": 2.0225, + "step": 21434 + }, + { + "epoch": 1.9419718692668342, + "grad_norm": 0.8689414262771606, + "learning_rate": 7.05370627680783e-05, + "loss": 2.4525, + "step": 21435 + }, + { + "epoch": 1.9420624674412812, + "grad_norm": 0.8570758700370789, + "learning_rate": 7.053102156708755e-05, + "loss": 2.4556, + "step": 21436 + }, + { + "epoch": 1.9421530656157278, + "grad_norm": 0.8577831983566284, + "learning_rate": 7.052498036609678e-05, + "loss": 2.4855, + "step": 21437 + }, + { + "epoch": 1.9422436637901748, + "grad_norm": 0.9173377752304077, + "learning_rate": 7.051893916510603e-05, + "loss": 2.6362, + "step": 21438 + }, + { + "epoch": 1.9423342619646213, + "grad_norm": 0.8762646913528442, + "learning_rate": 7.051289796411526e-05, + "loss": 2.577, + "step": 21439 + }, + { + "epoch": 1.9424248601390683, + "grad_norm": 0.8661736845970154, + "learning_rate": 7.050685676312452e-05, + "loss": 2.8486, + "step": 21440 + }, + { + "epoch": 1.942515458313515, + "grad_norm": 0.9316906929016113, + "learning_rate": 7.050081556213376e-05, + "loss": 2.9575, + "step": 21441 + }, + { + "epoch": 1.942606056487962, + "grad_norm": 0.9527552127838135, + "learning_rate": 7.0494774361143e-05, + "loss": 2.783, + "step": 21442 + }, + { + "epoch": 1.9426966546624085, + "grad_norm": 1.0056155920028687, + "learning_rate": 7.048873316015224e-05, + "loss": 2.5883, + "step": 21443 + }, + { + "epoch": 1.9427872528368555, + "grad_norm": 0.9313099384307861, + "learning_rate": 7.048269195916149e-05, + "loss": 2.79, + "step": 21444 + }, + { + "epoch": 1.942877851011302, + "grad_norm": 0.9384580254554749, + "learning_rate": 7.047665075817072e-05, + "loss": 2.7259, + "step": 21445 + }, + { + "epoch": 1.942968449185749, + "grad_norm": 0.9368085265159607, + "learning_rate": 7.047060955717997e-05, + "loss": 2.6657, + "step": 21446 + }, + { + "epoch": 1.9430590473601956, + "grad_norm": 0.8770172595977783, + "learning_rate": 7.046456835618921e-05, + "loss": 2.6645, + "step": 21447 + }, + { + "epoch": 1.9431496455346424, + "grad_norm": 0.7951927185058594, + "learning_rate": 7.045852715519846e-05, + "loss": 2.1319, + "step": 21448 + }, + { + "epoch": 1.9432402437090892, + "grad_norm": 0.8742026090621948, + "learning_rate": 7.04524859542077e-05, + "loss": 2.8522, + "step": 21449 + }, + { + "epoch": 1.943330841883536, + "grad_norm": 0.8799706697463989, + "learning_rate": 7.044644475321694e-05, + "loss": 2.6823, + "step": 21450 + }, + { + "epoch": 1.9434214400579828, + "grad_norm": 0.8244929313659668, + "learning_rate": 7.044040355222619e-05, + "loss": 2.2294, + "step": 21451 + }, + { + "epoch": 1.9435120382324296, + "grad_norm": 0.8909645676612854, + "learning_rate": 7.043436235123543e-05, + "loss": 2.5325, + "step": 21452 + }, + { + "epoch": 1.9436026364068764, + "grad_norm": 0.8921630382537842, + "learning_rate": 7.042832115024467e-05, + "loss": 2.6618, + "step": 21453 + }, + { + "epoch": 1.9436932345813231, + "grad_norm": 1.2067015171051025, + "learning_rate": 7.042227994925391e-05, + "loss": 2.907, + "step": 21454 + }, + { + "epoch": 1.94378383275577, + "grad_norm": 1.292823314666748, + "learning_rate": 7.041623874826317e-05, + "loss": 2.8974, + "step": 21455 + }, + { + "epoch": 1.9438744309302167, + "grad_norm": 0.9748074412345886, + "learning_rate": 7.04101975472724e-05, + "loss": 2.5951, + "step": 21456 + }, + { + "epoch": 1.9439650291046635, + "grad_norm": 0.9980021119117737, + "learning_rate": 7.040415634628165e-05, + "loss": 2.7485, + "step": 21457 + }, + { + "epoch": 1.9440556272791103, + "grad_norm": 0.8583952188491821, + "learning_rate": 7.039811514529088e-05, + "loss": 2.7904, + "step": 21458 + }, + { + "epoch": 1.944146225453557, + "grad_norm": 0.8810657858848572, + "learning_rate": 7.039207394430013e-05, + "loss": 2.7752, + "step": 21459 + }, + { + "epoch": 1.9442368236280039, + "grad_norm": 0.9478110074996948, + "learning_rate": 7.038603274330937e-05, + "loss": 2.3107, + "step": 21460 + }, + { + "epoch": 1.9443274218024507, + "grad_norm": 0.6714735627174377, + "learning_rate": 7.037999154231861e-05, + "loss": 1.1766, + "step": 21461 + }, + { + "epoch": 1.9444180199768974, + "grad_norm": 0.9016593098640442, + "learning_rate": 7.037395034132786e-05, + "loss": 2.6416, + "step": 21462 + }, + { + "epoch": 1.9445086181513442, + "grad_norm": 0.8934945464134216, + "learning_rate": 7.036790914033711e-05, + "loss": 2.7756, + "step": 21463 + }, + { + "epoch": 1.944599216325791, + "grad_norm": 1.030387282371521, + "learning_rate": 7.036186793934634e-05, + "loss": 2.6249, + "step": 21464 + }, + { + "epoch": 1.9446898145002378, + "grad_norm": 0.9137849807739258, + "learning_rate": 7.035582673835559e-05, + "loss": 2.6576, + "step": 21465 + }, + { + "epoch": 1.9447804126746846, + "grad_norm": 0.8969834446907043, + "learning_rate": 7.034978553736484e-05, + "loss": 2.72, + "step": 21466 + }, + { + "epoch": 1.9448710108491314, + "grad_norm": 0.911336362361908, + "learning_rate": 7.034374433637407e-05, + "loss": 2.7454, + "step": 21467 + }, + { + "epoch": 1.9449616090235782, + "grad_norm": 0.8954268097877502, + "learning_rate": 7.033770313538332e-05, + "loss": 2.4717, + "step": 21468 + }, + { + "epoch": 1.945052207198025, + "grad_norm": 0.8351906538009644, + "learning_rate": 7.033166193439255e-05, + "loss": 2.3664, + "step": 21469 + }, + { + "epoch": 1.9451428053724718, + "grad_norm": 0.925228476524353, + "learning_rate": 7.032562073340181e-05, + "loss": 2.6815, + "step": 21470 + }, + { + "epoch": 1.9452334035469185, + "grad_norm": 0.9492278099060059, + "learning_rate": 7.031957953241105e-05, + "loss": 2.5204, + "step": 21471 + }, + { + "epoch": 1.9453240017213653, + "grad_norm": 0.831676185131073, + "learning_rate": 7.03135383314203e-05, + "loss": 1.8157, + "step": 21472 + }, + { + "epoch": 1.9454145998958121, + "grad_norm": 0.9047337770462036, + "learning_rate": 7.030749713042953e-05, + "loss": 2.7095, + "step": 21473 + }, + { + "epoch": 1.945505198070259, + "grad_norm": 0.940253496170044, + "learning_rate": 7.030145592943878e-05, + "loss": 2.9983, + "step": 21474 + }, + { + "epoch": 1.9455957962447057, + "grad_norm": 0.9209985733032227, + "learning_rate": 7.029541472844801e-05, + "loss": 2.4521, + "step": 21475 + }, + { + "epoch": 1.9456863944191525, + "grad_norm": 0.911666750907898, + "learning_rate": 7.028937352745726e-05, + "loss": 2.6019, + "step": 21476 + }, + { + "epoch": 1.9457769925935993, + "grad_norm": 0.8580409288406372, + "learning_rate": 7.02833323264665e-05, + "loss": 2.1305, + "step": 21477 + }, + { + "epoch": 1.945867590768046, + "grad_norm": 0.7840922474861145, + "learning_rate": 7.027729112547576e-05, + "loss": 2.1903, + "step": 21478 + }, + { + "epoch": 1.9459581889424928, + "grad_norm": 0.8311728239059448, + "learning_rate": 7.027124992448499e-05, + "loss": 2.5284, + "step": 21479 + }, + { + "epoch": 1.9460487871169396, + "grad_norm": 0.8826866745948792, + "learning_rate": 7.026520872349424e-05, + "loss": 2.714, + "step": 21480 + }, + { + "epoch": 1.9461393852913864, + "grad_norm": 0.9220277667045593, + "learning_rate": 7.025916752250347e-05, + "loss": 2.653, + "step": 21481 + }, + { + "epoch": 1.9462299834658332, + "grad_norm": 0.8076789379119873, + "learning_rate": 7.025312632151272e-05, + "loss": 1.9072, + "step": 21482 + }, + { + "epoch": 1.94632058164028, + "grad_norm": 0.7301204800605774, + "learning_rate": 7.024708512052197e-05, + "loss": 2.0054, + "step": 21483 + }, + { + "epoch": 1.9464111798147268, + "grad_norm": 0.8856078386306763, + "learning_rate": 7.02410439195312e-05, + "loss": 2.6466, + "step": 21484 + }, + { + "epoch": 1.9465017779891736, + "grad_norm": 0.972332239151001, + "learning_rate": 7.023500271854046e-05, + "loss": 2.8285, + "step": 21485 + }, + { + "epoch": 1.9465923761636204, + "grad_norm": 0.9271247982978821, + "learning_rate": 7.02289615175497e-05, + "loss": 2.6426, + "step": 21486 + }, + { + "epoch": 1.9466829743380671, + "grad_norm": 0.9312431216239929, + "learning_rate": 7.022292031655894e-05, + "loss": 2.8779, + "step": 21487 + }, + { + "epoch": 1.946773572512514, + "grad_norm": 0.975635826587677, + "learning_rate": 7.021687911556818e-05, + "loss": 2.8249, + "step": 21488 + }, + { + "epoch": 1.9468641706869607, + "grad_norm": 0.766749382019043, + "learning_rate": 7.021083791457742e-05, + "loss": 2.1654, + "step": 21489 + }, + { + "epoch": 1.9469547688614073, + "grad_norm": 0.8988907933235168, + "learning_rate": 7.020479671358666e-05, + "loss": 2.9043, + "step": 21490 + }, + { + "epoch": 1.9470453670358543, + "grad_norm": 0.8483492136001587, + "learning_rate": 7.01987555125959e-05, + "loss": 2.6414, + "step": 21491 + }, + { + "epoch": 1.9471359652103009, + "grad_norm": 0.8583325147628784, + "learning_rate": 7.019271431160514e-05, + "loss": 2.7438, + "step": 21492 + }, + { + "epoch": 1.9472265633847479, + "grad_norm": 0.8920061588287354, + "learning_rate": 7.01866731106144e-05, + "loss": 2.6722, + "step": 21493 + }, + { + "epoch": 1.9473171615591944, + "grad_norm": 0.8778983950614929, + "learning_rate": 7.018063190962364e-05, + "loss": 2.6932, + "step": 21494 + }, + { + "epoch": 1.9474077597336414, + "grad_norm": 0.9095582365989685, + "learning_rate": 7.017459070863288e-05, + "loss": 2.5568, + "step": 21495 + }, + { + "epoch": 1.947498357908088, + "grad_norm": 0.7770661115646362, + "learning_rate": 7.016854950764212e-05, + "loss": 2.0302, + "step": 21496 + }, + { + "epoch": 1.947588956082535, + "grad_norm": 0.8294881582260132, + "learning_rate": 7.016250830665136e-05, + "loss": 2.6657, + "step": 21497 + }, + { + "epoch": 1.9476795542569816, + "grad_norm": 0.9243609309196472, + "learning_rate": 7.015646710566061e-05, + "loss": 2.9714, + "step": 21498 + }, + { + "epoch": 1.9477701524314286, + "grad_norm": 0.9001848101615906, + "learning_rate": 7.015042590466985e-05, + "loss": 2.489, + "step": 21499 + }, + { + "epoch": 1.9478607506058752, + "grad_norm": 0.9287293553352356, + "learning_rate": 7.01443847036791e-05, + "loss": 2.509, + "step": 21500 + }, + { + "epoch": 1.9479513487803222, + "grad_norm": 0.9296715259552002, + "learning_rate": 7.013834350268834e-05, + "loss": 2.7521, + "step": 21501 + }, + { + "epoch": 1.9480419469547687, + "grad_norm": 0.9454529881477356, + "learning_rate": 7.013230230169759e-05, + "loss": 2.5714, + "step": 21502 + }, + { + "epoch": 1.9481325451292157, + "grad_norm": 0.8685082793235779, + "learning_rate": 7.012626110070682e-05, + "loss": 2.6425, + "step": 21503 + }, + { + "epoch": 1.9482231433036623, + "grad_norm": 0.9134020805358887, + "learning_rate": 7.012021989971607e-05, + "loss": 2.6821, + "step": 21504 + }, + { + "epoch": 1.9483137414781093, + "grad_norm": 0.9336453080177307, + "learning_rate": 7.01141786987253e-05, + "loss": 2.5128, + "step": 21505 + }, + { + "epoch": 1.9484043396525559, + "grad_norm": 0.832181990146637, + "learning_rate": 7.010813749773455e-05, + "loss": 2.7974, + "step": 21506 + }, + { + "epoch": 1.948494937827003, + "grad_norm": 0.9656341671943665, + "learning_rate": 7.010209629674379e-05, + "loss": 2.7093, + "step": 21507 + }, + { + "epoch": 1.9485855360014495, + "grad_norm": 0.9221335053443909, + "learning_rate": 7.009605509575305e-05, + "loss": 2.7187, + "step": 21508 + }, + { + "epoch": 1.9486761341758965, + "grad_norm": 0.9432350397109985, + "learning_rate": 7.009001389476228e-05, + "loss": 2.8159, + "step": 21509 + }, + { + "epoch": 1.948766732350343, + "grad_norm": 0.9765591621398926, + "learning_rate": 7.008397269377153e-05, + "loss": 2.4202, + "step": 21510 + }, + { + "epoch": 1.94885733052479, + "grad_norm": 0.9730276465415955, + "learning_rate": 7.007793149278076e-05, + "loss": 2.692, + "step": 21511 + }, + { + "epoch": 1.9489479286992366, + "grad_norm": 0.791655421257019, + "learning_rate": 7.007189029179001e-05, + "loss": 2.0907, + "step": 21512 + }, + { + "epoch": 1.9490385268736836, + "grad_norm": 0.8517751097679138, + "learning_rate": 7.006584909079925e-05, + "loss": 2.1878, + "step": 21513 + }, + { + "epoch": 1.9491291250481302, + "grad_norm": 0.872287392616272, + "learning_rate": 7.005980788980849e-05, + "loss": 2.4547, + "step": 21514 + }, + { + "epoch": 1.9492197232225772, + "grad_norm": 0.8943072557449341, + "learning_rate": 7.005376668881774e-05, + "loss": 2.4488, + "step": 21515 + }, + { + "epoch": 1.9493103213970238, + "grad_norm": 0.8823524713516235, + "learning_rate": 7.004772548782699e-05, + "loss": 2.118, + "step": 21516 + }, + { + "epoch": 1.9494009195714708, + "grad_norm": 0.8805017471313477, + "learning_rate": 7.004168428683624e-05, + "loss": 2.5634, + "step": 21517 + }, + { + "epoch": 1.9494915177459173, + "grad_norm": 0.8896462321281433, + "learning_rate": 7.003564308584547e-05, + "loss": 2.9584, + "step": 21518 + }, + { + "epoch": 1.9495821159203643, + "grad_norm": 0.7510219216346741, + "learning_rate": 7.002960188485472e-05, + "loss": 2.1054, + "step": 21519 + }, + { + "epoch": 1.949672714094811, + "grad_norm": 0.9306509494781494, + "learning_rate": 7.002356068386395e-05, + "loss": 2.7727, + "step": 21520 + }, + { + "epoch": 1.949763312269258, + "grad_norm": 0.9056233763694763, + "learning_rate": 7.00175194828732e-05, + "loss": 2.6463, + "step": 21521 + }, + { + "epoch": 1.9498539104437045, + "grad_norm": 0.8456003665924072, + "learning_rate": 7.001147828188243e-05, + "loss": 2.4225, + "step": 21522 + }, + { + "epoch": 1.9499445086181515, + "grad_norm": 0.9861899018287659, + "learning_rate": 7.00054370808917e-05, + "loss": 2.6853, + "step": 21523 + }, + { + "epoch": 1.950035106792598, + "grad_norm": 0.9297730326652527, + "learning_rate": 6.999939587990093e-05, + "loss": 2.7359, + "step": 21524 + }, + { + "epoch": 1.950125704967045, + "grad_norm": 0.9472779631614685, + "learning_rate": 6.999335467891018e-05, + "loss": 2.3737, + "step": 21525 + }, + { + "epoch": 1.9502163031414916, + "grad_norm": 0.8976501226425171, + "learning_rate": 6.998731347791941e-05, + "loss": 2.8042, + "step": 21526 + }, + { + "epoch": 1.9503069013159386, + "grad_norm": 0.8902608752250671, + "learning_rate": 6.998127227692866e-05, + "loss": 2.5171, + "step": 21527 + }, + { + "epoch": 1.9503974994903852, + "grad_norm": 0.8093535900115967, + "learning_rate": 6.997523107593789e-05, + "loss": 2.1292, + "step": 21528 + }, + { + "epoch": 1.950488097664832, + "grad_norm": 0.7483078241348267, + "learning_rate": 6.996918987494714e-05, + "loss": 1.8944, + "step": 21529 + }, + { + "epoch": 1.9505786958392788, + "grad_norm": 0.812865674495697, + "learning_rate": 6.996314867395639e-05, + "loss": 1.9889, + "step": 21530 + }, + { + "epoch": 1.9506692940137256, + "grad_norm": 0.8046831488609314, + "learning_rate": 6.995710747296563e-05, + "loss": 2.0508, + "step": 21531 + }, + { + "epoch": 1.9507598921881724, + "grad_norm": 0.9413279294967651, + "learning_rate": 6.995106627197487e-05, + "loss": 2.7893, + "step": 21532 + }, + { + "epoch": 1.9508504903626191, + "grad_norm": 0.7959960699081421, + "learning_rate": 6.994502507098412e-05, + "loss": 1.94, + "step": 21533 + }, + { + "epoch": 1.950941088537066, + "grad_norm": 0.8149101734161377, + "learning_rate": 6.993898386999336e-05, + "loss": 2.1327, + "step": 21534 + }, + { + "epoch": 1.9510316867115127, + "grad_norm": 0.9210954308509827, + "learning_rate": 6.99329426690026e-05, + "loss": 2.6979, + "step": 21535 + }, + { + "epoch": 1.9511222848859595, + "grad_norm": 0.7455305457115173, + "learning_rate": 6.992690146801185e-05, + "loss": 2.0436, + "step": 21536 + }, + { + "epoch": 1.9512128830604063, + "grad_norm": 0.7887824177742004, + "learning_rate": 6.992086026702108e-05, + "loss": 2.2225, + "step": 21537 + }, + { + "epoch": 1.951303481234853, + "grad_norm": 0.8913573026657104, + "learning_rate": 6.991481906603034e-05, + "loss": 2.6472, + "step": 21538 + }, + { + "epoch": 1.9513940794092999, + "grad_norm": 1.0069823265075684, + "learning_rate": 6.990877786503957e-05, + "loss": 2.8207, + "step": 21539 + }, + { + "epoch": 1.9514846775837467, + "grad_norm": 0.8438824415206909, + "learning_rate": 6.990273666404882e-05, + "loss": 2.6912, + "step": 21540 + }, + { + "epoch": 1.9515752757581935, + "grad_norm": 0.89146488904953, + "learning_rate": 6.989669546305806e-05, + "loss": 2.4489, + "step": 21541 + }, + { + "epoch": 1.9516658739326402, + "grad_norm": 0.9110278487205505, + "learning_rate": 6.98906542620673e-05, + "loss": 2.5697, + "step": 21542 + }, + { + "epoch": 1.951756472107087, + "grad_norm": 0.8366368412971497, + "learning_rate": 6.988461306107654e-05, + "loss": 2.6654, + "step": 21543 + }, + { + "epoch": 1.9518470702815338, + "grad_norm": 0.8535529971122742, + "learning_rate": 6.987857186008579e-05, + "loss": 2.3681, + "step": 21544 + }, + { + "epoch": 1.9519376684559806, + "grad_norm": 0.9155868887901306, + "learning_rate": 6.987253065909503e-05, + "loss": 2.8148, + "step": 21545 + }, + { + "epoch": 1.9520282666304274, + "grad_norm": 0.8924000859260559, + "learning_rate": 6.986648945810428e-05, + "loss": 2.62, + "step": 21546 + }, + { + "epoch": 1.9521188648048742, + "grad_norm": 0.9350339770317078, + "learning_rate": 6.986044825711351e-05, + "loss": 2.7142, + "step": 21547 + }, + { + "epoch": 1.952209462979321, + "grad_norm": 0.8049705028533936, + "learning_rate": 6.985440705612276e-05, + "loss": 1.8763, + "step": 21548 + }, + { + "epoch": 1.9523000611537678, + "grad_norm": 0.912399411201477, + "learning_rate": 6.9848365855132e-05, + "loss": 2.6054, + "step": 21549 + }, + { + "epoch": 1.9523906593282145, + "grad_norm": 0.8690926432609558, + "learning_rate": 6.984232465414124e-05, + "loss": 2.4722, + "step": 21550 + }, + { + "epoch": 1.9524812575026613, + "grad_norm": 0.8892078399658203, + "learning_rate": 6.983628345315049e-05, + "loss": 2.6444, + "step": 21551 + }, + { + "epoch": 1.9525718556771081, + "grad_norm": 0.7322989106178284, + "learning_rate": 6.983024225215973e-05, + "loss": 2.0648, + "step": 21552 + }, + { + "epoch": 1.952662453851555, + "grad_norm": 0.8630737662315369, + "learning_rate": 6.982420105116899e-05, + "loss": 1.9434, + "step": 21553 + }, + { + "epoch": 1.9527530520260017, + "grad_norm": 0.8694072365760803, + "learning_rate": 6.981815985017822e-05, + "loss": 2.5941, + "step": 21554 + }, + { + "epoch": 1.9528436502004485, + "grad_norm": 0.9124820828437805, + "learning_rate": 6.981211864918747e-05, + "loss": 2.7063, + "step": 21555 + }, + { + "epoch": 1.9529342483748953, + "grad_norm": 0.8691083192825317, + "learning_rate": 6.98060774481967e-05, + "loss": 2.4044, + "step": 21556 + }, + { + "epoch": 1.953024846549342, + "grad_norm": 0.9030095934867859, + "learning_rate": 6.980003624720595e-05, + "loss": 2.7379, + "step": 21557 + }, + { + "epoch": 1.9531154447237888, + "grad_norm": 0.9406692981719971, + "learning_rate": 6.979399504621518e-05, + "loss": 2.2751, + "step": 21558 + }, + { + "epoch": 1.9532060428982356, + "grad_norm": 0.9555650949478149, + "learning_rate": 6.978795384522443e-05, + "loss": 2.7715, + "step": 21559 + }, + { + "epoch": 1.9532966410726824, + "grad_norm": 0.7806158065795898, + "learning_rate": 6.978191264423368e-05, + "loss": 1.9842, + "step": 21560 + }, + { + "epoch": 1.9533872392471292, + "grad_norm": 0.88689786195755, + "learning_rate": 6.977587144324293e-05, + "loss": 2.8211, + "step": 21561 + }, + { + "epoch": 1.953477837421576, + "grad_norm": 0.9363514184951782, + "learning_rate": 6.976983024225216e-05, + "loss": 2.5774, + "step": 21562 + }, + { + "epoch": 1.9535684355960228, + "grad_norm": 0.9054033160209656, + "learning_rate": 6.976378904126141e-05, + "loss": 2.784, + "step": 21563 + }, + { + "epoch": 1.9536590337704696, + "grad_norm": 0.8903701901435852, + "learning_rate": 6.975774784027064e-05, + "loss": 2.7584, + "step": 21564 + }, + { + "epoch": 1.9537496319449164, + "grad_norm": 0.8098778128623962, + "learning_rate": 6.975170663927989e-05, + "loss": 1.8385, + "step": 21565 + }, + { + "epoch": 1.9538402301193631, + "grad_norm": 0.8757308721542358, + "learning_rate": 6.974566543828914e-05, + "loss": 2.2434, + "step": 21566 + }, + { + "epoch": 1.95393082829381, + "grad_norm": 0.8729099035263062, + "learning_rate": 6.973962423729837e-05, + "loss": 2.5197, + "step": 21567 + }, + { + "epoch": 1.9540214264682567, + "grad_norm": 1.006359577178955, + "learning_rate": 6.973358303630762e-05, + "loss": 2.8281, + "step": 21568 + }, + { + "epoch": 1.9541120246427035, + "grad_norm": 0.8342524766921997, + "learning_rate": 6.972754183531687e-05, + "loss": 1.8572, + "step": 21569 + }, + { + "epoch": 1.9542026228171503, + "grad_norm": 0.8989118933677673, + "learning_rate": 6.972150063432611e-05, + "loss": 2.4552, + "step": 21570 + }, + { + "epoch": 1.9542932209915969, + "grad_norm": 0.9271064400672913, + "learning_rate": 6.971545943333535e-05, + "loss": 2.9175, + "step": 21571 + }, + { + "epoch": 1.9543838191660439, + "grad_norm": 0.977174699306488, + "learning_rate": 6.97094182323446e-05, + "loss": 1.8525, + "step": 21572 + }, + { + "epoch": 1.9544744173404904, + "grad_norm": 0.9392899870872498, + "learning_rate": 6.970337703135383e-05, + "loss": 2.7187, + "step": 21573 + }, + { + "epoch": 1.9545650155149374, + "grad_norm": 0.9346267580986023, + "learning_rate": 6.969733583036308e-05, + "loss": 2.6704, + "step": 21574 + }, + { + "epoch": 1.954655613689384, + "grad_norm": 0.781827986240387, + "learning_rate": 6.969129462937233e-05, + "loss": 2.432, + "step": 21575 + }, + { + "epoch": 1.954746211863831, + "grad_norm": 0.8456592559814453, + "learning_rate": 6.968525342838157e-05, + "loss": 2.6981, + "step": 21576 + }, + { + "epoch": 1.9548368100382776, + "grad_norm": 0.9086673855781555, + "learning_rate": 6.967921222739081e-05, + "loss": 2.644, + "step": 21577 + }, + { + "epoch": 1.9549274082127246, + "grad_norm": 0.8397541046142578, + "learning_rate": 6.967317102640005e-05, + "loss": 2.284, + "step": 21578 + }, + { + "epoch": 1.9550180063871712, + "grad_norm": 0.8953772783279419, + "learning_rate": 6.966712982540929e-05, + "loss": 2.6346, + "step": 21579 + }, + { + "epoch": 1.9551086045616182, + "grad_norm": 0.9457581639289856, + "learning_rate": 6.966108862441854e-05, + "loss": 2.5884, + "step": 21580 + }, + { + "epoch": 1.9551992027360647, + "grad_norm": 0.8834366798400879, + "learning_rate": 6.965504742342777e-05, + "loss": 2.6366, + "step": 21581 + }, + { + "epoch": 1.9552898009105117, + "grad_norm": 0.9172624945640564, + "learning_rate": 6.964900622243702e-05, + "loss": 2.7493, + "step": 21582 + }, + { + "epoch": 1.9553803990849583, + "grad_norm": 0.8991875648498535, + "learning_rate": 6.964296502144627e-05, + "loss": 2.6318, + "step": 21583 + }, + { + "epoch": 1.9554709972594053, + "grad_norm": 0.8645442724227905, + "learning_rate": 6.963692382045551e-05, + "loss": 2.7381, + "step": 21584 + }, + { + "epoch": 1.9555615954338519, + "grad_norm": 0.9860345721244812, + "learning_rate": 6.963088261946476e-05, + "loss": 3.0775, + "step": 21585 + }, + { + "epoch": 1.955652193608299, + "grad_norm": 0.9332860112190247, + "learning_rate": 6.9624841418474e-05, + "loss": 2.6788, + "step": 21586 + }, + { + "epoch": 1.9557427917827455, + "grad_norm": 0.9496585726737976, + "learning_rate": 6.961880021748324e-05, + "loss": 2.8511, + "step": 21587 + }, + { + "epoch": 1.9558333899571925, + "grad_norm": 0.8801353573799133, + "learning_rate": 6.961275901649248e-05, + "loss": 2.709, + "step": 21588 + }, + { + "epoch": 1.955923988131639, + "grad_norm": 0.9250879883766174, + "learning_rate": 6.960671781550172e-05, + "loss": 2.7262, + "step": 21589 + }, + { + "epoch": 1.956014586306086, + "grad_norm": 0.9942592978477478, + "learning_rate": 6.960067661451097e-05, + "loss": 2.8034, + "step": 21590 + }, + { + "epoch": 1.9561051844805326, + "grad_norm": 0.949677586555481, + "learning_rate": 6.959463541352022e-05, + "loss": 2.9251, + "step": 21591 + }, + { + "epoch": 1.9561957826549796, + "grad_norm": 0.8974136114120483, + "learning_rate": 6.958859421252945e-05, + "loss": 2.5871, + "step": 21592 + }, + { + "epoch": 1.9562863808294262, + "grad_norm": 0.9200702905654907, + "learning_rate": 6.95825530115387e-05, + "loss": 2.5686, + "step": 21593 + }, + { + "epoch": 1.9563769790038732, + "grad_norm": 0.7901109457015991, + "learning_rate": 6.957651181054794e-05, + "loss": 2.0473, + "step": 21594 + }, + { + "epoch": 1.9564675771783198, + "grad_norm": 0.7674616575241089, + "learning_rate": 6.957047060955718e-05, + "loss": 1.982, + "step": 21595 + }, + { + "epoch": 1.9565581753527668, + "grad_norm": 0.9363428354263306, + "learning_rate": 6.956442940856642e-05, + "loss": 2.7353, + "step": 21596 + }, + { + "epoch": 1.9566487735272133, + "grad_norm": 0.8757940530776978, + "learning_rate": 6.955838820757566e-05, + "loss": 1.92, + "step": 21597 + }, + { + "epoch": 1.9567393717016603, + "grad_norm": 0.9816930890083313, + "learning_rate": 6.955234700658491e-05, + "loss": 2.7715, + "step": 21598 + }, + { + "epoch": 1.956829969876107, + "grad_norm": 0.9107317328453064, + "learning_rate": 6.954630580559416e-05, + "loss": 2.4422, + "step": 21599 + }, + { + "epoch": 1.956920568050554, + "grad_norm": 0.9539721012115479, + "learning_rate": 6.95402646046034e-05, + "loss": 2.5804, + "step": 21600 + }, + { + "epoch": 1.9570111662250005, + "grad_norm": 0.9717631340026855, + "learning_rate": 6.953422340361264e-05, + "loss": 2.3373, + "step": 21601 + }, + { + "epoch": 1.9571017643994475, + "grad_norm": 0.8422892689704895, + "learning_rate": 6.952818220262189e-05, + "loss": 2.6356, + "step": 21602 + }, + { + "epoch": 1.957192362573894, + "grad_norm": 0.8664951920509338, + "learning_rate": 6.952214100163112e-05, + "loss": 2.8878, + "step": 21603 + }, + { + "epoch": 1.957282960748341, + "grad_norm": 0.9596849083900452, + "learning_rate": 6.951609980064037e-05, + "loss": 2.6799, + "step": 21604 + }, + { + "epoch": 1.9573735589227876, + "grad_norm": 0.9650335907936096, + "learning_rate": 6.951005859964962e-05, + "loss": 2.8632, + "step": 21605 + }, + { + "epoch": 1.9574641570972346, + "grad_norm": 0.8791460394859314, + "learning_rate": 6.950401739865887e-05, + "loss": 2.3871, + "step": 21606 + }, + { + "epoch": 1.9575547552716812, + "grad_norm": 0.9133236408233643, + "learning_rate": 6.94979761976681e-05, + "loss": 2.8861, + "step": 21607 + }, + { + "epoch": 1.9576453534461282, + "grad_norm": 0.9289703965187073, + "learning_rate": 6.949193499667735e-05, + "loss": 2.7841, + "step": 21608 + }, + { + "epoch": 1.9577359516205748, + "grad_norm": 0.9391872882843018, + "learning_rate": 6.948589379568658e-05, + "loss": 2.7182, + "step": 21609 + }, + { + "epoch": 1.9578265497950216, + "grad_norm": 0.9125271439552307, + "learning_rate": 6.947985259469583e-05, + "loss": 2.4601, + "step": 21610 + }, + { + "epoch": 1.9579171479694684, + "grad_norm": 0.8280571103096008, + "learning_rate": 6.947381139370506e-05, + "loss": 2.5661, + "step": 21611 + }, + { + "epoch": 1.9580077461439152, + "grad_norm": 0.9958089590072632, + "learning_rate": 6.946777019271431e-05, + "loss": 2.9202, + "step": 21612 + }, + { + "epoch": 1.958098344318362, + "grad_norm": 0.8680088520050049, + "learning_rate": 6.946172899172356e-05, + "loss": 2.6904, + "step": 21613 + }, + { + "epoch": 1.9581889424928087, + "grad_norm": 0.8710132241249084, + "learning_rate": 6.94556877907328e-05, + "loss": 2.6285, + "step": 21614 + }, + { + "epoch": 1.9582795406672555, + "grad_norm": 0.9304686784744263, + "learning_rate": 6.944964658974204e-05, + "loss": 2.4582, + "step": 21615 + }, + { + "epoch": 1.9583701388417023, + "grad_norm": 0.8730977773666382, + "learning_rate": 6.944360538875129e-05, + "loss": 2.6304, + "step": 21616 + }, + { + "epoch": 1.958460737016149, + "grad_norm": 0.9685207009315491, + "learning_rate": 6.943756418776054e-05, + "loss": 2.4936, + "step": 21617 + }, + { + "epoch": 1.9585513351905959, + "grad_norm": 0.8923685550689697, + "learning_rate": 6.943152298676977e-05, + "loss": 2.7813, + "step": 21618 + }, + { + "epoch": 1.9586419333650427, + "grad_norm": 0.8701659440994263, + "learning_rate": 6.942548178577902e-05, + "loss": 2.563, + "step": 21619 + }, + { + "epoch": 1.9587325315394895, + "grad_norm": 0.885464608669281, + "learning_rate": 6.941944058478826e-05, + "loss": 2.546, + "step": 21620 + }, + { + "epoch": 1.9588231297139362, + "grad_norm": 0.9599636793136597, + "learning_rate": 6.941339938379751e-05, + "loss": 3.0205, + "step": 21621 + }, + { + "epoch": 1.958913727888383, + "grad_norm": 0.8607471585273743, + "learning_rate": 6.940735818280675e-05, + "loss": 2.6524, + "step": 21622 + }, + { + "epoch": 1.9590043260628298, + "grad_norm": 0.885052502155304, + "learning_rate": 6.9401316981816e-05, + "loss": 2.4549, + "step": 21623 + }, + { + "epoch": 1.9590949242372766, + "grad_norm": 0.8911024332046509, + "learning_rate": 6.939527578082523e-05, + "loss": 2.498, + "step": 21624 + }, + { + "epoch": 1.9591855224117234, + "grad_norm": 0.8815401196479797, + "learning_rate": 6.938923457983448e-05, + "loss": 2.7315, + "step": 21625 + }, + { + "epoch": 1.9592761205861702, + "grad_norm": 0.8680258393287659, + "learning_rate": 6.938319337884371e-05, + "loss": 2.7969, + "step": 21626 + }, + { + "epoch": 1.959366718760617, + "grad_norm": 0.9299161434173584, + "learning_rate": 6.937715217785296e-05, + "loss": 2.4531, + "step": 21627 + }, + { + "epoch": 1.9594573169350638, + "grad_norm": 0.8721039891242981, + "learning_rate": 6.93711109768622e-05, + "loss": 2.5297, + "step": 21628 + }, + { + "epoch": 1.9595479151095105, + "grad_norm": 0.9618440270423889, + "learning_rate": 6.936506977587145e-05, + "loss": 2.8455, + "step": 21629 + }, + { + "epoch": 1.9596385132839573, + "grad_norm": 0.9344047904014587, + "learning_rate": 6.935902857488069e-05, + "loss": 2.7699, + "step": 21630 + }, + { + "epoch": 1.9597291114584041, + "grad_norm": 0.8457101583480835, + "learning_rate": 6.935298737388993e-05, + "loss": 2.739, + "step": 21631 + }, + { + "epoch": 1.959819709632851, + "grad_norm": 0.9118615984916687, + "learning_rate": 6.934694617289917e-05, + "loss": 2.654, + "step": 21632 + }, + { + "epoch": 1.9599103078072977, + "grad_norm": 1.0408484935760498, + "learning_rate": 6.934090497190842e-05, + "loss": 2.67, + "step": 21633 + }, + { + "epoch": 1.9600009059817445, + "grad_norm": 0.8942962884902954, + "learning_rate": 6.933486377091766e-05, + "loss": 2.6944, + "step": 21634 + }, + { + "epoch": 1.9600915041561913, + "grad_norm": 0.9034717082977295, + "learning_rate": 6.932882256992691e-05, + "loss": 2.6067, + "step": 21635 + }, + { + "epoch": 1.960182102330638, + "grad_norm": 0.9118777513504028, + "learning_rate": 6.932278136893614e-05, + "loss": 2.7686, + "step": 21636 + }, + { + "epoch": 1.9602727005050848, + "grad_norm": 1.0061542987823486, + "learning_rate": 6.931674016794539e-05, + "loss": 2.6902, + "step": 21637 + }, + { + "epoch": 1.9603632986795316, + "grad_norm": 0.9475615620613098, + "learning_rate": 6.931069896695464e-05, + "loss": 2.7154, + "step": 21638 + }, + { + "epoch": 1.9604538968539784, + "grad_norm": 0.7590129375457764, + "learning_rate": 6.930465776596387e-05, + "loss": 1.963, + "step": 21639 + }, + { + "epoch": 1.9605444950284252, + "grad_norm": 0.787135660648346, + "learning_rate": 6.929861656497312e-05, + "loss": 2.0508, + "step": 21640 + }, + { + "epoch": 1.960635093202872, + "grad_norm": 0.7057619690895081, + "learning_rate": 6.929257536398236e-05, + "loss": 1.8879, + "step": 21641 + }, + { + "epoch": 1.9607256913773188, + "grad_norm": 0.898439347743988, + "learning_rate": 6.92865341629916e-05, + "loss": 2.6739, + "step": 21642 + }, + { + "epoch": 1.9608162895517656, + "grad_norm": 0.9140918254852295, + "learning_rate": 6.928049296200085e-05, + "loss": 2.9183, + "step": 21643 + }, + { + "epoch": 1.9609068877262124, + "grad_norm": 0.8108875751495361, + "learning_rate": 6.92744517610101e-05, + "loss": 2.0827, + "step": 21644 + }, + { + "epoch": 1.9609974859006591, + "grad_norm": 0.7993526458740234, + "learning_rate": 6.926841056001933e-05, + "loss": 1.9724, + "step": 21645 + }, + { + "epoch": 1.961088084075106, + "grad_norm": 0.9609782099723816, + "learning_rate": 6.926236935902858e-05, + "loss": 2.4392, + "step": 21646 + }, + { + "epoch": 1.9611786822495527, + "grad_norm": 0.8673763871192932, + "learning_rate": 6.925632815803781e-05, + "loss": 2.3232, + "step": 21647 + }, + { + "epoch": 1.9612692804239995, + "grad_norm": 0.9289913177490234, + "learning_rate": 6.925028695704706e-05, + "loss": 2.8434, + "step": 21648 + }, + { + "epoch": 1.9613598785984463, + "grad_norm": 0.8630155324935913, + "learning_rate": 6.924424575605631e-05, + "loss": 2.6013, + "step": 21649 + }, + { + "epoch": 1.961450476772893, + "grad_norm": 0.8657164573669434, + "learning_rate": 6.923820455506556e-05, + "loss": 2.7343, + "step": 21650 + }, + { + "epoch": 1.9615410749473399, + "grad_norm": 0.8959797620773315, + "learning_rate": 6.923216335407479e-05, + "loss": 2.7644, + "step": 21651 + }, + { + "epoch": 1.9616316731217864, + "grad_norm": 0.9293039441108704, + "learning_rate": 6.922612215308404e-05, + "loss": 2.5003, + "step": 21652 + }, + { + "epoch": 1.9617222712962334, + "grad_norm": 0.9676545858383179, + "learning_rate": 6.922008095209329e-05, + "loss": 2.7137, + "step": 21653 + }, + { + "epoch": 1.96181286947068, + "grad_norm": 0.8751908540725708, + "learning_rate": 6.921403975110252e-05, + "loss": 2.5498, + "step": 21654 + }, + { + "epoch": 1.961903467645127, + "grad_norm": 0.9438727498054504, + "learning_rate": 6.920799855011177e-05, + "loss": 2.7703, + "step": 21655 + }, + { + "epoch": 1.9619940658195736, + "grad_norm": 0.9345304369926453, + "learning_rate": 6.9201957349121e-05, + "loss": 2.6476, + "step": 21656 + }, + { + "epoch": 1.9620846639940206, + "grad_norm": 0.8729058504104614, + "learning_rate": 6.919591614813025e-05, + "loss": 2.7052, + "step": 21657 + }, + { + "epoch": 1.9621752621684672, + "grad_norm": 0.8960467576980591, + "learning_rate": 6.91898749471395e-05, + "loss": 2.6501, + "step": 21658 + }, + { + "epoch": 1.9622658603429142, + "grad_norm": 0.9627088308334351, + "learning_rate": 6.918383374614875e-05, + "loss": 2.579, + "step": 21659 + }, + { + "epoch": 1.9623564585173607, + "grad_norm": 0.9526400566101074, + "learning_rate": 6.917779254515798e-05, + "loss": 2.7008, + "step": 21660 + }, + { + "epoch": 1.9624470566918077, + "grad_norm": 0.8557099103927612, + "learning_rate": 6.917175134416723e-05, + "loss": 2.1576, + "step": 21661 + }, + { + "epoch": 1.9625376548662543, + "grad_norm": 0.9086798429489136, + "learning_rate": 6.916571014317646e-05, + "loss": 2.7314, + "step": 21662 + }, + { + "epoch": 1.9626282530407013, + "grad_norm": 0.7754313945770264, + "learning_rate": 6.915966894218571e-05, + "loss": 2.0585, + "step": 21663 + }, + { + "epoch": 1.9627188512151479, + "grad_norm": 0.8976253867149353, + "learning_rate": 6.915362774119494e-05, + "loss": 2.3527, + "step": 21664 + }, + { + "epoch": 1.962809449389595, + "grad_norm": 0.8627721667289734, + "learning_rate": 6.91475865402042e-05, + "loss": 2.6326, + "step": 21665 + }, + { + "epoch": 1.9629000475640415, + "grad_norm": 0.844019889831543, + "learning_rate": 6.914154533921344e-05, + "loss": 2.8137, + "step": 21666 + }, + { + "epoch": 1.9629906457384885, + "grad_norm": 0.8924198746681213, + "learning_rate": 6.913550413822269e-05, + "loss": 2.5615, + "step": 21667 + }, + { + "epoch": 1.963081243912935, + "grad_norm": 0.7236418128013611, + "learning_rate": 6.912946293723192e-05, + "loss": 2.0631, + "step": 21668 + }, + { + "epoch": 1.963171842087382, + "grad_norm": 0.8616757988929749, + "learning_rate": 6.912342173624117e-05, + "loss": 2.5019, + "step": 21669 + }, + { + "epoch": 1.9632624402618286, + "grad_norm": 0.9474219679832458, + "learning_rate": 6.911738053525041e-05, + "loss": 2.8358, + "step": 21670 + }, + { + "epoch": 1.9633530384362756, + "grad_norm": 0.8255712389945984, + "learning_rate": 6.911133933425965e-05, + "loss": 2.3972, + "step": 21671 + }, + { + "epoch": 1.9634436366107222, + "grad_norm": 0.9040055871009827, + "learning_rate": 6.91052981332689e-05, + "loss": 2.8768, + "step": 21672 + }, + { + "epoch": 1.9635342347851692, + "grad_norm": 0.8901496529579163, + "learning_rate": 6.909925693227814e-05, + "loss": 2.7112, + "step": 21673 + }, + { + "epoch": 1.9636248329596158, + "grad_norm": 0.7782535552978516, + "learning_rate": 6.909321573128739e-05, + "loss": 2.0172, + "step": 21674 + }, + { + "epoch": 1.9637154311340628, + "grad_norm": 0.8836209774017334, + "learning_rate": 6.908717453029663e-05, + "loss": 2.729, + "step": 21675 + }, + { + "epoch": 1.9638060293085093, + "grad_norm": 0.9067685604095459, + "learning_rate": 6.908113332930587e-05, + "loss": 2.8378, + "step": 21676 + }, + { + "epoch": 1.9638966274829563, + "grad_norm": 0.9731132388114929, + "learning_rate": 6.907509212831511e-05, + "loss": 2.6803, + "step": 21677 + }, + { + "epoch": 1.963987225657403, + "grad_norm": 0.9841868281364441, + "learning_rate": 6.906905092732435e-05, + "loss": 2.7299, + "step": 21678 + }, + { + "epoch": 1.96407782383185, + "grad_norm": 0.8717772960662842, + "learning_rate": 6.906300972633359e-05, + "loss": 2.7219, + "step": 21679 + }, + { + "epoch": 1.9641684220062965, + "grad_norm": 0.9572272300720215, + "learning_rate": 6.905696852534285e-05, + "loss": 2.7183, + "step": 21680 + }, + { + "epoch": 1.9642590201807435, + "grad_norm": 0.8109077215194702, + "learning_rate": 6.905092732435208e-05, + "loss": 2.0256, + "step": 21681 + }, + { + "epoch": 1.96434961835519, + "grad_norm": 0.8412291407585144, + "learning_rate": 6.904488612336133e-05, + "loss": 2.5185, + "step": 21682 + }, + { + "epoch": 1.964440216529637, + "grad_norm": 0.888849139213562, + "learning_rate": 6.903884492237057e-05, + "loss": 2.6588, + "step": 21683 + }, + { + "epoch": 1.9645308147040836, + "grad_norm": 0.8352259993553162, + "learning_rate": 6.903280372137981e-05, + "loss": 2.5504, + "step": 21684 + }, + { + "epoch": 1.9646214128785306, + "grad_norm": 0.8540776371955872, + "learning_rate": 6.902676252038906e-05, + "loss": 1.9353, + "step": 21685 + }, + { + "epoch": 1.9647120110529772, + "grad_norm": 0.9350401759147644, + "learning_rate": 6.90207213193983e-05, + "loss": 2.6054, + "step": 21686 + }, + { + "epoch": 1.9648026092274242, + "grad_norm": 0.9530236721038818, + "learning_rate": 6.901468011840754e-05, + "loss": 2.5852, + "step": 21687 + }, + { + "epoch": 1.9648932074018708, + "grad_norm": 0.9614310264587402, + "learning_rate": 6.900863891741679e-05, + "loss": 2.918, + "step": 21688 + }, + { + "epoch": 1.9649838055763178, + "grad_norm": 0.8876335620880127, + "learning_rate": 6.900259771642604e-05, + "loss": 2.5903, + "step": 21689 + }, + { + "epoch": 1.9650744037507644, + "grad_norm": 0.9599358439445496, + "learning_rate": 6.899655651543527e-05, + "loss": 2.5589, + "step": 21690 + }, + { + "epoch": 1.9651650019252112, + "grad_norm": 0.9213436841964722, + "learning_rate": 6.899051531444452e-05, + "loss": 2.6947, + "step": 21691 + }, + { + "epoch": 1.965255600099658, + "grad_norm": 0.8475950956344604, + "learning_rate": 6.898447411345375e-05, + "loss": 2.5138, + "step": 21692 + }, + { + "epoch": 1.9653461982741047, + "grad_norm": 0.9315249919891357, + "learning_rate": 6.8978432912463e-05, + "loss": 2.8689, + "step": 21693 + }, + { + "epoch": 1.9654367964485515, + "grad_norm": 0.8281444907188416, + "learning_rate": 6.897239171147224e-05, + "loss": 2.1192, + "step": 21694 + }, + { + "epoch": 1.9655273946229983, + "grad_norm": 0.8765753507614136, + "learning_rate": 6.89663505104815e-05, + "loss": 2.8323, + "step": 21695 + }, + { + "epoch": 1.965617992797445, + "grad_norm": 0.9285534620285034, + "learning_rate": 6.896030930949073e-05, + "loss": 2.7434, + "step": 21696 + }, + { + "epoch": 1.9657085909718919, + "grad_norm": 0.8735160231590271, + "learning_rate": 6.895426810849998e-05, + "loss": 2.7333, + "step": 21697 + }, + { + "epoch": 1.9657991891463387, + "grad_norm": 0.8820976614952087, + "learning_rate": 6.894822690750921e-05, + "loss": 2.5139, + "step": 21698 + }, + { + "epoch": 1.9658897873207855, + "grad_norm": 0.950745701789856, + "learning_rate": 6.894218570651846e-05, + "loss": 2.8319, + "step": 21699 + }, + { + "epoch": 1.9659803854952322, + "grad_norm": 0.8705784678459167, + "learning_rate": 6.89361445055277e-05, + "loss": 2.7617, + "step": 21700 + }, + { + "epoch": 1.966070983669679, + "grad_norm": 0.936730682849884, + "learning_rate": 6.893010330453694e-05, + "loss": 2.5817, + "step": 21701 + }, + { + "epoch": 1.9661615818441258, + "grad_norm": 0.9248162508010864, + "learning_rate": 6.892406210354619e-05, + "loss": 2.6676, + "step": 21702 + }, + { + "epoch": 1.9662521800185726, + "grad_norm": 0.899093508720398, + "learning_rate": 6.891802090255544e-05, + "loss": 2.8251, + "step": 21703 + }, + { + "epoch": 1.9663427781930194, + "grad_norm": 0.8582836389541626, + "learning_rate": 6.891197970156468e-05, + "loss": 2.6323, + "step": 21704 + }, + { + "epoch": 1.9664333763674662, + "grad_norm": 0.9538682103157043, + "learning_rate": 6.890593850057392e-05, + "loss": 3.0535, + "step": 21705 + }, + { + "epoch": 1.966523974541913, + "grad_norm": 0.8978914618492126, + "learning_rate": 6.889989729958317e-05, + "loss": 2.7039, + "step": 21706 + }, + { + "epoch": 1.9666145727163598, + "grad_norm": 0.9101941585540771, + "learning_rate": 6.88938560985924e-05, + "loss": 2.8791, + "step": 21707 + }, + { + "epoch": 1.9667051708908065, + "grad_norm": 1.0165387392044067, + "learning_rate": 6.888781489760165e-05, + "loss": 2.7257, + "step": 21708 + }, + { + "epoch": 1.9667957690652533, + "grad_norm": 0.9476402997970581, + "learning_rate": 6.888177369661088e-05, + "loss": 2.6598, + "step": 21709 + }, + { + "epoch": 1.9668863672397001, + "grad_norm": 0.9929383397102356, + "learning_rate": 6.887573249562014e-05, + "loss": 2.5517, + "step": 21710 + }, + { + "epoch": 1.966976965414147, + "grad_norm": 0.7681307196617126, + "learning_rate": 6.886969129462938e-05, + "loss": 1.7814, + "step": 21711 + }, + { + "epoch": 1.9670675635885937, + "grad_norm": 0.9138014316558838, + "learning_rate": 6.886365009363862e-05, + "loss": 2.792, + "step": 21712 + }, + { + "epoch": 1.9671581617630405, + "grad_norm": 0.8505491018295288, + "learning_rate": 6.885760889264786e-05, + "loss": 2.7723, + "step": 21713 + }, + { + "epoch": 1.9672487599374873, + "grad_norm": 0.9713367223739624, + "learning_rate": 6.88515676916571e-05, + "loss": 2.3471, + "step": 21714 + }, + { + "epoch": 1.967339358111934, + "grad_norm": 0.9076355695724487, + "learning_rate": 6.884552649066634e-05, + "loss": 2.7043, + "step": 21715 + }, + { + "epoch": 1.9674299562863808, + "grad_norm": 0.828389048576355, + "learning_rate": 6.883948528967559e-05, + "loss": 2.2141, + "step": 21716 + }, + { + "epoch": 1.9675205544608276, + "grad_norm": 0.879748523235321, + "learning_rate": 6.883344408868484e-05, + "loss": 2.6284, + "step": 21717 + }, + { + "epoch": 1.9676111526352744, + "grad_norm": 0.9022278785705566, + "learning_rate": 6.882740288769408e-05, + "loss": 2.6465, + "step": 21718 + }, + { + "epoch": 1.9677017508097212, + "grad_norm": 0.900489866733551, + "learning_rate": 6.882136168670332e-05, + "loss": 2.9882, + "step": 21719 + }, + { + "epoch": 1.967792348984168, + "grad_norm": 0.8604227900505066, + "learning_rate": 6.881532048571256e-05, + "loss": 2.0761, + "step": 21720 + }, + { + "epoch": 1.9678829471586148, + "grad_norm": 0.7845297455787659, + "learning_rate": 6.880927928472181e-05, + "loss": 2.0405, + "step": 21721 + }, + { + "epoch": 1.9679735453330616, + "grad_norm": 0.7961942553520203, + "learning_rate": 6.880323808373105e-05, + "loss": 1.9212, + "step": 21722 + }, + { + "epoch": 1.9680641435075084, + "grad_norm": 0.9323644638061523, + "learning_rate": 6.87971968827403e-05, + "loss": 2.553, + "step": 21723 + }, + { + "epoch": 1.9681547416819551, + "grad_norm": 0.7554478645324707, + "learning_rate": 6.879115568174953e-05, + "loss": 2.2305, + "step": 21724 + }, + { + "epoch": 1.968245339856402, + "grad_norm": 0.9049403667449951, + "learning_rate": 6.878511448075879e-05, + "loss": 2.5354, + "step": 21725 + }, + { + "epoch": 1.9683359380308487, + "grad_norm": 0.8454763293266296, + "learning_rate": 6.877907327976802e-05, + "loss": 2.6545, + "step": 21726 + }, + { + "epoch": 1.9684265362052955, + "grad_norm": 0.9216330051422119, + "learning_rate": 6.877303207877727e-05, + "loss": 2.735, + "step": 21727 + }, + { + "epoch": 1.9685171343797423, + "grad_norm": 0.8597056865692139, + "learning_rate": 6.87669908777865e-05, + "loss": 2.7113, + "step": 21728 + }, + { + "epoch": 1.968607732554189, + "grad_norm": 0.9061563014984131, + "learning_rate": 6.876094967679575e-05, + "loss": 2.7285, + "step": 21729 + }, + { + "epoch": 1.9686983307286359, + "grad_norm": 0.905404269695282, + "learning_rate": 6.875490847580499e-05, + "loss": 2.6578, + "step": 21730 + }, + { + "epoch": 1.9687889289030827, + "grad_norm": 0.9554983973503113, + "learning_rate": 6.874886727481423e-05, + "loss": 2.7421, + "step": 21731 + }, + { + "epoch": 1.9688795270775294, + "grad_norm": 0.9853519201278687, + "learning_rate": 6.874282607382347e-05, + "loss": 2.8714, + "step": 21732 + }, + { + "epoch": 1.968970125251976, + "grad_norm": 0.952044665813446, + "learning_rate": 6.873678487283273e-05, + "loss": 2.5677, + "step": 21733 + }, + { + "epoch": 1.969060723426423, + "grad_norm": 0.9126653075218201, + "learning_rate": 6.873074367184196e-05, + "loss": 2.6343, + "step": 21734 + }, + { + "epoch": 1.9691513216008696, + "grad_norm": 0.6891200542449951, + "learning_rate": 6.872470247085121e-05, + "loss": 1.6004, + "step": 21735 + }, + { + "epoch": 1.9692419197753166, + "grad_norm": 0.8871278166770935, + "learning_rate": 6.871866126986044e-05, + "loss": 2.7564, + "step": 21736 + }, + { + "epoch": 1.9693325179497632, + "grad_norm": 0.8698626756668091, + "learning_rate": 6.871262006886969e-05, + "loss": 2.6648, + "step": 21737 + }, + { + "epoch": 1.9694231161242102, + "grad_norm": 0.8304987549781799, + "learning_rate": 6.870657886787894e-05, + "loss": 2.2032, + "step": 21738 + }, + { + "epoch": 1.9695137142986567, + "grad_norm": 1.1856894493103027, + "learning_rate": 6.870053766688817e-05, + "loss": 2.5842, + "step": 21739 + }, + { + "epoch": 1.9696043124731037, + "grad_norm": 0.9200223088264465, + "learning_rate": 6.869449646589744e-05, + "loss": 2.7459, + "step": 21740 + }, + { + "epoch": 1.9696949106475503, + "grad_norm": 0.9507647752761841, + "learning_rate": 6.868845526490667e-05, + "loss": 2.8623, + "step": 21741 + }, + { + "epoch": 1.9697855088219973, + "grad_norm": 0.8619601726531982, + "learning_rate": 6.868241406391592e-05, + "loss": 2.7268, + "step": 21742 + }, + { + "epoch": 1.9698761069964439, + "grad_norm": 0.909795880317688, + "learning_rate": 6.867637286292515e-05, + "loss": 2.6381, + "step": 21743 + }, + { + "epoch": 1.969966705170891, + "grad_norm": 0.8907306790351868, + "learning_rate": 6.86703316619344e-05, + "loss": 2.5898, + "step": 21744 + }, + { + "epoch": 1.9700573033453375, + "grad_norm": 0.8968166708946228, + "learning_rate": 6.866429046094363e-05, + "loss": 2.7451, + "step": 21745 + }, + { + "epoch": 1.9701479015197845, + "grad_norm": 0.7821090817451477, + "learning_rate": 6.865824925995288e-05, + "loss": 1.9978, + "step": 21746 + }, + { + "epoch": 1.970238499694231, + "grad_norm": 0.9085806608200073, + "learning_rate": 6.865220805896211e-05, + "loss": 2.8514, + "step": 21747 + }, + { + "epoch": 1.970329097868678, + "grad_norm": 0.7191044092178345, + "learning_rate": 6.864616685797138e-05, + "loss": 1.4109, + "step": 21748 + }, + { + "epoch": 1.9704196960431246, + "grad_norm": 0.9081488251686096, + "learning_rate": 6.864012565698061e-05, + "loss": 2.9652, + "step": 21749 + }, + { + "epoch": 1.9705102942175716, + "grad_norm": 0.9263824224472046, + "learning_rate": 6.863408445598986e-05, + "loss": 1.9748, + "step": 21750 + }, + { + "epoch": 1.9706008923920182, + "grad_norm": 0.8711081743240356, + "learning_rate": 6.862804325499909e-05, + "loss": 2.5901, + "step": 21751 + }, + { + "epoch": 1.9706914905664652, + "grad_norm": 0.8213682174682617, + "learning_rate": 6.862200205400834e-05, + "loss": 1.9996, + "step": 21752 + }, + { + "epoch": 1.9707820887409118, + "grad_norm": 0.9083258509635925, + "learning_rate": 6.861596085301759e-05, + "loss": 2.631, + "step": 21753 + }, + { + "epoch": 1.9708726869153588, + "grad_norm": 0.9232637882232666, + "learning_rate": 6.860991965202682e-05, + "loss": 2.5996, + "step": 21754 + }, + { + "epoch": 1.9709632850898053, + "grad_norm": 0.8394200205802917, + "learning_rate": 6.860387845103607e-05, + "loss": 2.6714, + "step": 21755 + }, + { + "epoch": 1.9710538832642523, + "grad_norm": 0.8732097744941711, + "learning_rate": 6.859783725004532e-05, + "loss": 2.9707, + "step": 21756 + }, + { + "epoch": 1.971144481438699, + "grad_norm": 0.753922164440155, + "learning_rate": 6.859179604905456e-05, + "loss": 1.9473, + "step": 21757 + }, + { + "epoch": 1.971235079613146, + "grad_norm": 0.8482640385627747, + "learning_rate": 6.85857548480638e-05, + "loss": 2.5148, + "step": 21758 + }, + { + "epoch": 1.9713256777875925, + "grad_norm": 0.8671343922615051, + "learning_rate": 6.857971364707304e-05, + "loss": 2.438, + "step": 21759 + }, + { + "epoch": 1.9714162759620395, + "grad_norm": 0.963421642780304, + "learning_rate": 6.857367244608228e-05, + "loss": 2.6959, + "step": 21760 + }, + { + "epoch": 1.971506874136486, + "grad_norm": 0.9838770031929016, + "learning_rate": 6.856763124509153e-05, + "loss": 2.6549, + "step": 21761 + }, + { + "epoch": 1.971597472310933, + "grad_norm": 0.8666298985481262, + "learning_rate": 6.856159004410076e-05, + "loss": 2.875, + "step": 21762 + }, + { + "epoch": 1.9716880704853796, + "grad_norm": 0.8196576833724976, + "learning_rate": 6.855554884311002e-05, + "loss": 2.5581, + "step": 21763 + }, + { + "epoch": 1.9717786686598266, + "grad_norm": 0.7476507425308228, + "learning_rate": 6.854950764211926e-05, + "loss": 1.9231, + "step": 21764 + }, + { + "epoch": 1.9718692668342732, + "grad_norm": 0.885082483291626, + "learning_rate": 6.85434664411285e-05, + "loss": 2.7783, + "step": 21765 + }, + { + "epoch": 1.9719598650087202, + "grad_norm": 0.7955264449119568, + "learning_rate": 6.853742524013774e-05, + "loss": 2.0304, + "step": 21766 + }, + { + "epoch": 1.9720504631831668, + "grad_norm": 1.0381184816360474, + "learning_rate": 6.853138403914699e-05, + "loss": 2.8396, + "step": 21767 + }, + { + "epoch": 1.9721410613576138, + "grad_norm": 0.8883727788925171, + "learning_rate": 6.852534283815622e-05, + "loss": 2.6213, + "step": 21768 + }, + { + "epoch": 1.9722316595320604, + "grad_norm": 0.9825789332389832, + "learning_rate": 6.851930163716547e-05, + "loss": 2.7923, + "step": 21769 + }, + { + "epoch": 1.9723222577065074, + "grad_norm": 0.8899344801902771, + "learning_rate": 6.851326043617471e-05, + "loss": 2.649, + "step": 21770 + }, + { + "epoch": 1.972412855880954, + "grad_norm": 0.9883922338485718, + "learning_rate": 6.850721923518396e-05, + "loss": 2.1894, + "step": 21771 + }, + { + "epoch": 1.9725034540554007, + "grad_norm": 0.9058656692504883, + "learning_rate": 6.850117803419321e-05, + "loss": 2.4293, + "step": 21772 + }, + { + "epoch": 1.9725940522298475, + "grad_norm": 0.9643731117248535, + "learning_rate": 6.849513683320244e-05, + "loss": 3.0985, + "step": 21773 + }, + { + "epoch": 1.9726846504042943, + "grad_norm": 0.8951696753501892, + "learning_rate": 6.848909563221169e-05, + "loss": 2.721, + "step": 21774 + }, + { + "epoch": 1.972775248578741, + "grad_norm": 0.928797721862793, + "learning_rate": 6.848305443122093e-05, + "loss": 2.7702, + "step": 21775 + }, + { + "epoch": 1.9728658467531879, + "grad_norm": 0.7308215498924255, + "learning_rate": 6.847701323023017e-05, + "loss": 1.9707, + "step": 21776 + }, + { + "epoch": 1.9729564449276347, + "grad_norm": 0.9017251133918762, + "learning_rate": 6.847097202923941e-05, + "loss": 2.6189, + "step": 21777 + }, + { + "epoch": 1.9730470431020815, + "grad_norm": 0.8578193187713623, + "learning_rate": 6.846493082824867e-05, + "loss": 2.629, + "step": 21778 + }, + { + "epoch": 1.9731376412765282, + "grad_norm": 0.8838887214660645, + "learning_rate": 6.84588896272579e-05, + "loss": 1.9781, + "step": 21779 + }, + { + "epoch": 1.973228239450975, + "grad_norm": 0.9461950659751892, + "learning_rate": 6.845284842626715e-05, + "loss": 3.1141, + "step": 21780 + }, + { + "epoch": 1.9733188376254218, + "grad_norm": 0.9418256878852844, + "learning_rate": 6.844680722527638e-05, + "loss": 2.1154, + "step": 21781 + }, + { + "epoch": 1.9734094357998686, + "grad_norm": 0.8924933075904846, + "learning_rate": 6.844076602428563e-05, + "loss": 2.6796, + "step": 21782 + }, + { + "epoch": 1.9735000339743154, + "grad_norm": 0.9532366991043091, + "learning_rate": 6.843472482329487e-05, + "loss": 2.8806, + "step": 21783 + }, + { + "epoch": 1.9735906321487622, + "grad_norm": 0.8798616528511047, + "learning_rate": 6.842868362230411e-05, + "loss": 2.6984, + "step": 21784 + }, + { + "epoch": 1.973681230323209, + "grad_norm": 0.9828965663909912, + "learning_rate": 6.842264242131336e-05, + "loss": 2.4592, + "step": 21785 + }, + { + "epoch": 1.9737718284976558, + "grad_norm": 0.9121670126914978, + "learning_rate": 6.841660122032261e-05, + "loss": 2.6485, + "step": 21786 + }, + { + "epoch": 1.9738624266721025, + "grad_norm": 0.8875933885574341, + "learning_rate": 6.841056001933184e-05, + "loss": 2.5351, + "step": 21787 + }, + { + "epoch": 1.9739530248465493, + "grad_norm": 0.8754264712333679, + "learning_rate": 6.840451881834109e-05, + "loss": 2.6129, + "step": 21788 + }, + { + "epoch": 1.9740436230209961, + "grad_norm": 0.8783143758773804, + "learning_rate": 6.839847761735034e-05, + "loss": 2.6533, + "step": 21789 + }, + { + "epoch": 1.974134221195443, + "grad_norm": 0.9370549321174622, + "learning_rate": 6.839243641635957e-05, + "loss": 2.5513, + "step": 21790 + }, + { + "epoch": 1.9742248193698897, + "grad_norm": 0.8687270879745483, + "learning_rate": 6.838639521536882e-05, + "loss": 2.751, + "step": 21791 + }, + { + "epoch": 1.9743154175443365, + "grad_norm": 0.8792144656181335, + "learning_rate": 6.838035401437805e-05, + "loss": 2.5312, + "step": 21792 + }, + { + "epoch": 1.9744060157187833, + "grad_norm": 0.8343284130096436, + "learning_rate": 6.837431281338731e-05, + "loss": 2.2515, + "step": 21793 + }, + { + "epoch": 1.97449661389323, + "grad_norm": 0.9358375072479248, + "learning_rate": 6.836827161239655e-05, + "loss": 2.6124, + "step": 21794 + }, + { + "epoch": 1.9745872120676768, + "grad_norm": 0.8666542172431946, + "learning_rate": 6.83622304114058e-05, + "loss": 2.57, + "step": 21795 + }, + { + "epoch": 1.9746778102421236, + "grad_norm": 0.8890942335128784, + "learning_rate": 6.835618921041503e-05, + "loss": 2.6537, + "step": 21796 + }, + { + "epoch": 1.9747684084165704, + "grad_norm": 0.9390025734901428, + "learning_rate": 6.835014800942428e-05, + "loss": 2.7578, + "step": 21797 + }, + { + "epoch": 1.9748590065910172, + "grad_norm": 0.9493145942687988, + "learning_rate": 6.834410680843351e-05, + "loss": 2.6138, + "step": 21798 + }, + { + "epoch": 1.974949604765464, + "grad_norm": 0.9607879519462585, + "learning_rate": 6.833806560744276e-05, + "loss": 2.5183, + "step": 21799 + }, + { + "epoch": 1.9750402029399108, + "grad_norm": 0.8883971571922302, + "learning_rate": 6.833202440645201e-05, + "loss": 2.9052, + "step": 21800 + }, + { + "epoch": 1.9751308011143576, + "grad_norm": 0.8462486267089844, + "learning_rate": 6.832598320546125e-05, + "loss": 2.827, + "step": 21801 + }, + { + "epoch": 1.9752213992888044, + "grad_norm": 0.9545494318008423, + "learning_rate": 6.831994200447049e-05, + "loss": 2.7639, + "step": 21802 + }, + { + "epoch": 1.9753119974632511, + "grad_norm": 0.901001513004303, + "learning_rate": 6.831390080347974e-05, + "loss": 2.9152, + "step": 21803 + }, + { + "epoch": 1.975402595637698, + "grad_norm": 0.9256020188331604, + "learning_rate": 6.830785960248898e-05, + "loss": 2.813, + "step": 21804 + }, + { + "epoch": 1.9754931938121447, + "grad_norm": 0.9112516641616821, + "learning_rate": 6.830181840149822e-05, + "loss": 2.6578, + "step": 21805 + }, + { + "epoch": 1.9755837919865915, + "grad_norm": 0.9521023035049438, + "learning_rate": 6.829577720050747e-05, + "loss": 2.8129, + "step": 21806 + }, + { + "epoch": 1.9756743901610383, + "grad_norm": 1.2016234397888184, + "learning_rate": 6.82897359995167e-05, + "loss": 2.7879, + "step": 21807 + }, + { + "epoch": 1.975764988335485, + "grad_norm": 0.9196460843086243, + "learning_rate": 6.828369479852596e-05, + "loss": 2.5533, + "step": 21808 + }, + { + "epoch": 1.9758555865099319, + "grad_norm": 0.9226126074790955, + "learning_rate": 6.82776535975352e-05, + "loss": 2.7409, + "step": 21809 + }, + { + "epoch": 1.9759461846843787, + "grad_norm": 0.8844379782676697, + "learning_rate": 6.827161239654444e-05, + "loss": 2.8843, + "step": 21810 + }, + { + "epoch": 1.9760367828588254, + "grad_norm": 0.771245002746582, + "learning_rate": 6.826557119555368e-05, + "loss": 2.1172, + "step": 21811 + }, + { + "epoch": 1.9761273810332722, + "grad_norm": 0.8868820071220398, + "learning_rate": 6.825952999456292e-05, + "loss": 2.5869, + "step": 21812 + }, + { + "epoch": 1.976217979207719, + "grad_norm": 0.9175477027893066, + "learning_rate": 6.825348879357216e-05, + "loss": 2.5633, + "step": 21813 + }, + { + "epoch": 1.9763085773821656, + "grad_norm": 0.8750244379043579, + "learning_rate": 6.82474475925814e-05, + "loss": 2.7505, + "step": 21814 + }, + { + "epoch": 1.9763991755566126, + "grad_norm": 0.9496874809265137, + "learning_rate": 6.824140639159065e-05, + "loss": 3.0028, + "step": 21815 + }, + { + "epoch": 1.9764897737310592, + "grad_norm": 0.8945631980895996, + "learning_rate": 6.82353651905999e-05, + "loss": 2.7399, + "step": 21816 + }, + { + "epoch": 1.9765803719055062, + "grad_norm": 0.8188447952270508, + "learning_rate": 6.822932398960913e-05, + "loss": 2.0027, + "step": 21817 + }, + { + "epoch": 1.9766709700799527, + "grad_norm": 0.9272164106369019, + "learning_rate": 6.822328278861838e-05, + "loss": 2.6905, + "step": 21818 + }, + { + "epoch": 1.9767615682543997, + "grad_norm": 0.8763179779052734, + "learning_rate": 6.821724158762762e-05, + "loss": 2.767, + "step": 21819 + }, + { + "epoch": 1.9768521664288463, + "grad_norm": 0.8853450417518616, + "learning_rate": 6.821120038663686e-05, + "loss": 2.7116, + "step": 21820 + }, + { + "epoch": 1.9769427646032933, + "grad_norm": 0.8407166004180908, + "learning_rate": 6.820515918564611e-05, + "loss": 2.5975, + "step": 21821 + }, + { + "epoch": 1.9770333627777399, + "grad_norm": 0.8876030445098877, + "learning_rate": 6.819911798465535e-05, + "loss": 2.7224, + "step": 21822 + }, + { + "epoch": 1.977123960952187, + "grad_norm": 0.8380339741706848, + "learning_rate": 6.81930767836646e-05, + "loss": 2.5914, + "step": 21823 + }, + { + "epoch": 1.9772145591266335, + "grad_norm": 0.9685965776443481, + "learning_rate": 6.818703558267384e-05, + "loss": 2.5919, + "step": 21824 + }, + { + "epoch": 1.9773051573010805, + "grad_norm": 0.9891699552536011, + "learning_rate": 6.818099438168309e-05, + "loss": 2.445, + "step": 21825 + }, + { + "epoch": 1.977395755475527, + "grad_norm": 0.8035993576049805, + "learning_rate": 6.817495318069232e-05, + "loss": 2.0126, + "step": 21826 + }, + { + "epoch": 1.977486353649974, + "grad_norm": 0.9304416179656982, + "learning_rate": 6.816891197970157e-05, + "loss": 2.7716, + "step": 21827 + }, + { + "epoch": 1.9775769518244206, + "grad_norm": 0.926908016204834, + "learning_rate": 6.81628707787108e-05, + "loss": 2.8757, + "step": 21828 + }, + { + "epoch": 1.9776675499988676, + "grad_norm": 0.9681658744812012, + "learning_rate": 6.815682957772005e-05, + "loss": 2.9428, + "step": 21829 + }, + { + "epoch": 1.9777581481733142, + "grad_norm": 0.8680261373519897, + "learning_rate": 6.81507883767293e-05, + "loss": 2.6027, + "step": 21830 + }, + { + "epoch": 1.9778487463477612, + "grad_norm": 0.8961219191551208, + "learning_rate": 6.814474717573855e-05, + "loss": 2.8114, + "step": 21831 + }, + { + "epoch": 1.9779393445222078, + "grad_norm": 0.7818641662597656, + "learning_rate": 6.813870597474778e-05, + "loss": 2.0798, + "step": 21832 + }, + { + "epoch": 1.9780299426966548, + "grad_norm": 0.7566732168197632, + "learning_rate": 6.813266477375703e-05, + "loss": 2.0834, + "step": 21833 + }, + { + "epoch": 1.9781205408711013, + "grad_norm": 0.9620642066001892, + "learning_rate": 6.812662357276626e-05, + "loss": 2.6715, + "step": 21834 + }, + { + "epoch": 1.9782111390455484, + "grad_norm": 0.8804020285606384, + "learning_rate": 6.812058237177551e-05, + "loss": 2.6156, + "step": 21835 + }, + { + "epoch": 1.978301737219995, + "grad_norm": 0.9906392097473145, + "learning_rate": 6.811454117078476e-05, + "loss": 2.7069, + "step": 21836 + }, + { + "epoch": 1.978392335394442, + "grad_norm": 0.7893737554550171, + "learning_rate": 6.810849996979399e-05, + "loss": 1.9965, + "step": 21837 + }, + { + "epoch": 1.9784829335688885, + "grad_norm": 0.8586105704307556, + "learning_rate": 6.810245876880324e-05, + "loss": 2.7712, + "step": 21838 + }, + { + "epoch": 1.9785735317433355, + "grad_norm": 0.925236701965332, + "learning_rate": 6.809641756781249e-05, + "loss": 2.6349, + "step": 21839 + }, + { + "epoch": 1.978664129917782, + "grad_norm": 0.9280155301094055, + "learning_rate": 6.809037636682173e-05, + "loss": 2.8367, + "step": 21840 + }, + { + "epoch": 1.978754728092229, + "grad_norm": 0.7867656350135803, + "learning_rate": 6.808433516583097e-05, + "loss": 2.0064, + "step": 21841 + }, + { + "epoch": 1.9788453262666756, + "grad_norm": 1.0075808763504028, + "learning_rate": 6.807829396484022e-05, + "loss": 2.5884, + "step": 21842 + }, + { + "epoch": 1.9789359244411227, + "grad_norm": 0.9526695609092712, + "learning_rate": 6.807225276384945e-05, + "loss": 2.7318, + "step": 21843 + }, + { + "epoch": 1.9790265226155692, + "grad_norm": 0.8983371257781982, + "learning_rate": 6.80662115628587e-05, + "loss": 2.4718, + "step": 21844 + }, + { + "epoch": 1.9791171207900162, + "grad_norm": 1.0415480136871338, + "learning_rate": 6.806017036186795e-05, + "loss": 2.7091, + "step": 21845 + }, + { + "epoch": 1.9792077189644628, + "grad_norm": 0.959517240524292, + "learning_rate": 6.80541291608772e-05, + "loss": 2.5434, + "step": 21846 + }, + { + "epoch": 1.9792983171389098, + "grad_norm": 0.8867398500442505, + "learning_rate": 6.804808795988643e-05, + "loss": 2.6243, + "step": 21847 + }, + { + "epoch": 1.9793889153133564, + "grad_norm": 0.9165384769439697, + "learning_rate": 6.804204675889568e-05, + "loss": 2.8598, + "step": 21848 + }, + { + "epoch": 1.9794795134878034, + "grad_norm": 0.888105034828186, + "learning_rate": 6.803600555790491e-05, + "loss": 2.8675, + "step": 21849 + }, + { + "epoch": 1.97957011166225, + "grad_norm": 0.8745602369308472, + "learning_rate": 6.802996435691416e-05, + "loss": 2.4392, + "step": 21850 + }, + { + "epoch": 1.979660709836697, + "grad_norm": 0.8682283759117126, + "learning_rate": 6.802392315592339e-05, + "loss": 2.7039, + "step": 21851 + }, + { + "epoch": 1.9797513080111435, + "grad_norm": 0.9411691427230835, + "learning_rate": 6.801788195493264e-05, + "loss": 2.7483, + "step": 21852 + }, + { + "epoch": 1.9798419061855903, + "grad_norm": 0.9254430532455444, + "learning_rate": 6.801184075394189e-05, + "loss": 2.4629, + "step": 21853 + }, + { + "epoch": 1.979932504360037, + "grad_norm": 0.8258152604103088, + "learning_rate": 6.800579955295113e-05, + "loss": 2.4357, + "step": 21854 + }, + { + "epoch": 1.9800231025344839, + "grad_norm": 0.8438834547996521, + "learning_rate": 6.799975835196037e-05, + "loss": 2.466, + "step": 21855 + }, + { + "epoch": 1.9801137007089307, + "grad_norm": 0.8818357586860657, + "learning_rate": 6.799371715096962e-05, + "loss": 2.6491, + "step": 21856 + }, + { + "epoch": 1.9802042988833775, + "grad_norm": 0.8495307564735413, + "learning_rate": 6.798767594997886e-05, + "loss": 2.5179, + "step": 21857 + }, + { + "epoch": 1.9802948970578242, + "grad_norm": 0.8393130302429199, + "learning_rate": 6.79816347489881e-05, + "loss": 1.9635, + "step": 21858 + }, + { + "epoch": 1.980385495232271, + "grad_norm": 0.9579808712005615, + "learning_rate": 6.797559354799734e-05, + "loss": 2.8125, + "step": 21859 + }, + { + "epoch": 1.9804760934067178, + "grad_norm": 0.8837434649467468, + "learning_rate": 6.796955234700659e-05, + "loss": 2.7287, + "step": 21860 + }, + { + "epoch": 1.9805666915811646, + "grad_norm": 0.8447771668434143, + "learning_rate": 6.796351114601584e-05, + "loss": 2.7089, + "step": 21861 + }, + { + "epoch": 1.9806572897556114, + "grad_norm": 0.9124864935874939, + "learning_rate": 6.795746994502507e-05, + "loss": 2.7044, + "step": 21862 + }, + { + "epoch": 1.9807478879300582, + "grad_norm": 0.8976901769638062, + "learning_rate": 6.795142874403432e-05, + "loss": 2.4856, + "step": 21863 + }, + { + "epoch": 1.980838486104505, + "grad_norm": 0.9197934865951538, + "learning_rate": 6.794538754304356e-05, + "loss": 2.5235, + "step": 21864 + }, + { + "epoch": 1.9809290842789518, + "grad_norm": 0.7771517038345337, + "learning_rate": 6.79393463420528e-05, + "loss": 2.0158, + "step": 21865 + }, + { + "epoch": 1.9810196824533985, + "grad_norm": 0.997912585735321, + "learning_rate": 6.793330514106204e-05, + "loss": 2.7602, + "step": 21866 + }, + { + "epoch": 1.9811102806278453, + "grad_norm": 0.815427839756012, + "learning_rate": 6.792726394007128e-05, + "loss": 2.6126, + "step": 21867 + }, + { + "epoch": 1.9812008788022921, + "grad_norm": 0.8780308961868286, + "learning_rate": 6.792122273908053e-05, + "loss": 2.7891, + "step": 21868 + }, + { + "epoch": 1.981291476976739, + "grad_norm": 0.7465691566467285, + "learning_rate": 6.791518153808978e-05, + "loss": 2.0893, + "step": 21869 + }, + { + "epoch": 1.9813820751511857, + "grad_norm": 0.763727068901062, + "learning_rate": 6.790914033709901e-05, + "loss": 2.1246, + "step": 21870 + }, + { + "epoch": 1.9814726733256325, + "grad_norm": 0.7538880109786987, + "learning_rate": 6.790309913610826e-05, + "loss": 2.0399, + "step": 21871 + }, + { + "epoch": 1.9815632715000793, + "grad_norm": 1.0161439180374146, + "learning_rate": 6.789705793511751e-05, + "loss": 2.7382, + "step": 21872 + }, + { + "epoch": 1.981653869674526, + "grad_norm": 0.9345690011978149, + "learning_rate": 6.789101673412674e-05, + "loss": 2.6767, + "step": 21873 + }, + { + "epoch": 1.9817444678489728, + "grad_norm": 0.8665233254432678, + "learning_rate": 6.788497553313599e-05, + "loss": 2.5317, + "step": 21874 + }, + { + "epoch": 1.9818350660234196, + "grad_norm": 0.9052120447158813, + "learning_rate": 6.787893433214524e-05, + "loss": 2.5474, + "step": 21875 + }, + { + "epoch": 1.9819256641978664, + "grad_norm": 0.8964516520500183, + "learning_rate": 6.787289313115449e-05, + "loss": 2.6673, + "step": 21876 + }, + { + "epoch": 1.9820162623723132, + "grad_norm": 0.9587815999984741, + "learning_rate": 6.786685193016372e-05, + "loss": 2.5333, + "step": 21877 + }, + { + "epoch": 1.98210686054676, + "grad_norm": 0.9931530952453613, + "learning_rate": 6.786081072917297e-05, + "loss": 2.8172, + "step": 21878 + }, + { + "epoch": 1.9821974587212068, + "grad_norm": 0.9204556345939636, + "learning_rate": 6.78547695281822e-05, + "loss": 2.8872, + "step": 21879 + }, + { + "epoch": 1.9822880568956536, + "grad_norm": 0.946543276309967, + "learning_rate": 6.784872832719145e-05, + "loss": 2.4337, + "step": 21880 + }, + { + "epoch": 1.9823786550701004, + "grad_norm": 0.7992997765541077, + "learning_rate": 6.784268712620068e-05, + "loss": 2.0735, + "step": 21881 + }, + { + "epoch": 1.9824692532445471, + "grad_norm": 0.9677898287773132, + "learning_rate": 6.783664592520993e-05, + "loss": 2.7515, + "step": 21882 + }, + { + "epoch": 1.982559851418994, + "grad_norm": 0.9288501143455505, + "learning_rate": 6.783060472421918e-05, + "loss": 2.7906, + "step": 21883 + }, + { + "epoch": 1.9826504495934407, + "grad_norm": 0.9285922050476074, + "learning_rate": 6.782456352322843e-05, + "loss": 2.8898, + "step": 21884 + }, + { + "epoch": 1.9827410477678875, + "grad_norm": 0.8904491662979126, + "learning_rate": 6.781852232223766e-05, + "loss": 2.6885, + "step": 21885 + }, + { + "epoch": 1.9828316459423343, + "grad_norm": 0.9700773358345032, + "learning_rate": 6.781248112124691e-05, + "loss": 2.6596, + "step": 21886 + }, + { + "epoch": 1.982922244116781, + "grad_norm": 0.9927495121955872, + "learning_rate": 6.780643992025614e-05, + "loss": 2.8254, + "step": 21887 + }, + { + "epoch": 1.9830128422912279, + "grad_norm": 0.9598467946052551, + "learning_rate": 6.780039871926539e-05, + "loss": 2.5697, + "step": 21888 + }, + { + "epoch": 1.9831034404656747, + "grad_norm": 0.8702412247657776, + "learning_rate": 6.779435751827464e-05, + "loss": 2.5326, + "step": 21889 + }, + { + "epoch": 1.9831940386401214, + "grad_norm": 0.8877487182617188, + "learning_rate": 6.778831631728388e-05, + "loss": 2.5522, + "step": 21890 + }, + { + "epoch": 1.9832846368145682, + "grad_norm": 0.9333006739616394, + "learning_rate": 6.778227511629313e-05, + "loss": 2.9084, + "step": 21891 + }, + { + "epoch": 1.983375234989015, + "grad_norm": 0.9198673367500305, + "learning_rate": 6.777623391530237e-05, + "loss": 2.8477, + "step": 21892 + }, + { + "epoch": 1.9834658331634618, + "grad_norm": 0.9503960013389587, + "learning_rate": 6.777019271431161e-05, + "loss": 2.7171, + "step": 21893 + }, + { + "epoch": 1.9835564313379086, + "grad_norm": 0.9383816719055176, + "learning_rate": 6.776415151332085e-05, + "loss": 2.5784, + "step": 21894 + }, + { + "epoch": 1.9836470295123552, + "grad_norm": 0.9344261884689331, + "learning_rate": 6.77581103123301e-05, + "loss": 2.7208, + "step": 21895 + }, + { + "epoch": 1.9837376276868022, + "grad_norm": 0.9007354974746704, + "learning_rate": 6.775206911133933e-05, + "loss": 2.6993, + "step": 21896 + }, + { + "epoch": 1.9838282258612487, + "grad_norm": 0.9571743607521057, + "learning_rate": 6.774602791034858e-05, + "loss": 2.6888, + "step": 21897 + }, + { + "epoch": 1.9839188240356957, + "grad_norm": 0.9138184785842896, + "learning_rate": 6.773998670935783e-05, + "loss": 2.4996, + "step": 21898 + }, + { + "epoch": 1.9840094222101423, + "grad_norm": 1.0281972885131836, + "learning_rate": 6.773394550836707e-05, + "loss": 2.8274, + "step": 21899 + }, + { + "epoch": 1.9841000203845893, + "grad_norm": 0.8434106707572937, + "learning_rate": 6.77279043073763e-05, + "loss": 2.7681, + "step": 21900 + }, + { + "epoch": 1.984190618559036, + "grad_norm": 0.9089282155036926, + "learning_rate": 6.772186310638555e-05, + "loss": 2.729, + "step": 21901 + }, + { + "epoch": 1.984281216733483, + "grad_norm": 0.964404821395874, + "learning_rate": 6.771582190539479e-05, + "loss": 2.7449, + "step": 21902 + }, + { + "epoch": 1.9843718149079295, + "grad_norm": 0.9215878844261169, + "learning_rate": 6.770978070440404e-05, + "loss": 2.7684, + "step": 21903 + }, + { + "epoch": 1.9844624130823765, + "grad_norm": 1.0093104839324951, + "learning_rate": 6.770373950341328e-05, + "loss": 2.6574, + "step": 21904 + }, + { + "epoch": 1.984553011256823, + "grad_norm": 0.912657618522644, + "learning_rate": 6.769769830242253e-05, + "loss": 2.793, + "step": 21905 + }, + { + "epoch": 1.98464360943127, + "grad_norm": 0.8525898456573486, + "learning_rate": 6.769165710143177e-05, + "loss": 2.5967, + "step": 21906 + }, + { + "epoch": 1.9847342076057166, + "grad_norm": 0.8495084047317505, + "learning_rate": 6.768561590044101e-05, + "loss": 2.5735, + "step": 21907 + }, + { + "epoch": 1.9848248057801636, + "grad_norm": 0.9029768109321594, + "learning_rate": 6.767957469945026e-05, + "loss": 2.9017, + "step": 21908 + }, + { + "epoch": 1.9849154039546102, + "grad_norm": 0.9184991121292114, + "learning_rate": 6.76735334984595e-05, + "loss": 2.5495, + "step": 21909 + }, + { + "epoch": 1.9850060021290572, + "grad_norm": 0.9875920414924622, + "learning_rate": 6.766749229746874e-05, + "loss": 2.9865, + "step": 21910 + }, + { + "epoch": 1.9850966003035038, + "grad_norm": 0.8735849857330322, + "learning_rate": 6.766145109647798e-05, + "loss": 2.6208, + "step": 21911 + }, + { + "epoch": 1.9851871984779508, + "grad_norm": 0.8632307648658752, + "learning_rate": 6.765540989548722e-05, + "loss": 2.4219, + "step": 21912 + }, + { + "epoch": 1.9852777966523973, + "grad_norm": 1.1150991916656494, + "learning_rate": 6.764936869449647e-05, + "loss": 2.8856, + "step": 21913 + }, + { + "epoch": 1.9853683948268444, + "grad_norm": 1.0164856910705566, + "learning_rate": 6.764332749350572e-05, + "loss": 2.6112, + "step": 21914 + }, + { + "epoch": 1.985458993001291, + "grad_norm": 0.8688826560974121, + "learning_rate": 6.763728629251495e-05, + "loss": 2.7836, + "step": 21915 + }, + { + "epoch": 1.985549591175738, + "grad_norm": 0.850614607334137, + "learning_rate": 6.76312450915242e-05, + "loss": 2.5637, + "step": 21916 + }, + { + "epoch": 1.9856401893501845, + "grad_norm": 0.9837454557418823, + "learning_rate": 6.762520389053343e-05, + "loss": 2.7548, + "step": 21917 + }, + { + "epoch": 1.9857307875246315, + "grad_norm": 0.9148802161216736, + "learning_rate": 6.761916268954268e-05, + "loss": 2.9864, + "step": 21918 + }, + { + "epoch": 1.985821385699078, + "grad_norm": 0.9088625907897949, + "learning_rate": 6.761312148855192e-05, + "loss": 2.6637, + "step": 21919 + }, + { + "epoch": 1.985911983873525, + "grad_norm": 0.8697312474250793, + "learning_rate": 6.760708028756118e-05, + "loss": 2.6559, + "step": 21920 + }, + { + "epoch": 1.9860025820479716, + "grad_norm": 0.8841017484664917, + "learning_rate": 6.760103908657041e-05, + "loss": 2.8607, + "step": 21921 + }, + { + "epoch": 1.9860931802224187, + "grad_norm": 0.8878340125083923, + "learning_rate": 6.759499788557966e-05, + "loss": 2.6233, + "step": 21922 + }, + { + "epoch": 1.9861837783968652, + "grad_norm": 0.8627448678016663, + "learning_rate": 6.75889566845889e-05, + "loss": 2.6089, + "step": 21923 + }, + { + "epoch": 1.9862743765713122, + "grad_norm": 0.8324993848800659, + "learning_rate": 6.758291548359814e-05, + "loss": 2.7327, + "step": 21924 + }, + { + "epoch": 1.9863649747457588, + "grad_norm": 0.8982061147689819, + "learning_rate": 6.757687428260739e-05, + "loss": 2.7712, + "step": 21925 + }, + { + "epoch": 1.9864555729202058, + "grad_norm": 0.9464892745018005, + "learning_rate": 6.757083308161662e-05, + "loss": 2.8488, + "step": 21926 + }, + { + "epoch": 1.9865461710946524, + "grad_norm": 0.934432864189148, + "learning_rate": 6.756479188062587e-05, + "loss": 3.0157, + "step": 21927 + }, + { + "epoch": 1.9866367692690994, + "grad_norm": 1.0115597248077393, + "learning_rate": 6.755875067963512e-05, + "loss": 2.6041, + "step": 21928 + }, + { + "epoch": 1.986727367443546, + "grad_norm": 0.880549967288971, + "learning_rate": 6.755270947864437e-05, + "loss": 2.5894, + "step": 21929 + }, + { + "epoch": 1.986817965617993, + "grad_norm": 0.8960908651351929, + "learning_rate": 6.75466682776536e-05, + "loss": 2.8871, + "step": 21930 + }, + { + "epoch": 1.9869085637924395, + "grad_norm": 0.8628455996513367, + "learning_rate": 6.754062707666285e-05, + "loss": 2.6052, + "step": 21931 + }, + { + "epoch": 1.9869991619668865, + "grad_norm": 1.0090653896331787, + "learning_rate": 6.753458587567208e-05, + "loss": 2.4051, + "step": 21932 + }, + { + "epoch": 1.987089760141333, + "grad_norm": 0.8440439105033875, + "learning_rate": 6.752854467468133e-05, + "loss": 2.4535, + "step": 21933 + }, + { + "epoch": 1.9871803583157799, + "grad_norm": 0.834990918636322, + "learning_rate": 6.752250347369056e-05, + "loss": 2.5072, + "step": 21934 + }, + { + "epoch": 1.9872709564902267, + "grad_norm": 0.9190937280654907, + "learning_rate": 6.751646227269982e-05, + "loss": 2.9033, + "step": 21935 + }, + { + "epoch": 1.9873615546646735, + "grad_norm": 0.9076366424560547, + "learning_rate": 6.751042107170906e-05, + "loss": 2.8976, + "step": 21936 + }, + { + "epoch": 1.9874521528391202, + "grad_norm": 0.9331775903701782, + "learning_rate": 6.75043798707183e-05, + "loss": 2.9122, + "step": 21937 + }, + { + "epoch": 1.987542751013567, + "grad_norm": 0.9108801484107971, + "learning_rate": 6.749833866972754e-05, + "loss": 2.8605, + "step": 21938 + }, + { + "epoch": 1.9876333491880138, + "grad_norm": 0.9185742735862732, + "learning_rate": 6.749229746873679e-05, + "loss": 2.9281, + "step": 21939 + }, + { + "epoch": 1.9877239473624606, + "grad_norm": 0.8734402060508728, + "learning_rate": 6.748625626774603e-05, + "loss": 2.4003, + "step": 21940 + }, + { + "epoch": 1.9878145455369074, + "grad_norm": 0.8919374346733093, + "learning_rate": 6.748021506675527e-05, + "loss": 2.4954, + "step": 21941 + }, + { + "epoch": 1.9879051437113542, + "grad_norm": 0.9100356101989746, + "learning_rate": 6.747417386576452e-05, + "loss": 2.6849, + "step": 21942 + }, + { + "epoch": 1.987995741885801, + "grad_norm": 0.9756798148155212, + "learning_rate": 6.746813266477376e-05, + "loss": 2.6966, + "step": 21943 + }, + { + "epoch": 1.9880863400602478, + "grad_norm": 0.9103202223777771, + "learning_rate": 6.746209146378301e-05, + "loss": 2.5872, + "step": 21944 + }, + { + "epoch": 1.9881769382346945, + "grad_norm": 0.9009626507759094, + "learning_rate": 6.745605026279225e-05, + "loss": 2.5044, + "step": 21945 + }, + { + "epoch": 1.9882675364091413, + "grad_norm": 0.9995089173316956, + "learning_rate": 6.74500090618015e-05, + "loss": 2.9521, + "step": 21946 + }, + { + "epoch": 1.9883581345835881, + "grad_norm": 0.8165571093559265, + "learning_rate": 6.744396786081073e-05, + "loss": 2.6848, + "step": 21947 + }, + { + "epoch": 1.988448732758035, + "grad_norm": 0.8474389314651489, + "learning_rate": 6.743792665981997e-05, + "loss": 2.9341, + "step": 21948 + }, + { + "epoch": 1.9885393309324817, + "grad_norm": 0.9340183734893799, + "learning_rate": 6.743188545882921e-05, + "loss": 2.5541, + "step": 21949 + }, + { + "epoch": 1.9886299291069285, + "grad_norm": 0.9446035027503967, + "learning_rate": 6.742584425783847e-05, + "loss": 2.5514, + "step": 21950 + }, + { + "epoch": 1.9887205272813753, + "grad_norm": 0.8796369433403015, + "learning_rate": 6.74198030568477e-05, + "loss": 2.0858, + "step": 21951 + }, + { + "epoch": 1.988811125455822, + "grad_norm": 0.8727629780769348, + "learning_rate": 6.741376185585695e-05, + "loss": 2.9269, + "step": 21952 + }, + { + "epoch": 1.9889017236302688, + "grad_norm": 0.8456050157546997, + "learning_rate": 6.740772065486619e-05, + "loss": 2.5514, + "step": 21953 + }, + { + "epoch": 1.9889923218047156, + "grad_norm": 0.9630146622657776, + "learning_rate": 6.740167945387543e-05, + "loss": 2.6966, + "step": 21954 + }, + { + "epoch": 1.9890829199791624, + "grad_norm": 0.9057168364524841, + "learning_rate": 6.739563825288467e-05, + "loss": 2.6737, + "step": 21955 + }, + { + "epoch": 1.9891735181536092, + "grad_norm": 0.9175916910171509, + "learning_rate": 6.738959705189392e-05, + "loss": 2.7161, + "step": 21956 + }, + { + "epoch": 1.989264116328056, + "grad_norm": 0.971930742263794, + "learning_rate": 6.738355585090316e-05, + "loss": 2.6668, + "step": 21957 + }, + { + "epoch": 1.9893547145025028, + "grad_norm": 0.8658604621887207, + "learning_rate": 6.737751464991241e-05, + "loss": 2.8367, + "step": 21958 + }, + { + "epoch": 1.9894453126769496, + "grad_norm": 0.8520286679267883, + "learning_rate": 6.737147344892166e-05, + "loss": 2.5789, + "step": 21959 + }, + { + "epoch": 1.9895359108513964, + "grad_norm": 0.8814653754234314, + "learning_rate": 6.736543224793089e-05, + "loss": 2.477, + "step": 21960 + }, + { + "epoch": 1.9896265090258431, + "grad_norm": 0.9093120098114014, + "learning_rate": 6.735939104694014e-05, + "loss": 2.7328, + "step": 21961 + }, + { + "epoch": 1.98971710720029, + "grad_norm": 0.8547976016998291, + "learning_rate": 6.735334984594937e-05, + "loss": 2.5089, + "step": 21962 + }, + { + "epoch": 1.9898077053747367, + "grad_norm": 1.0595180988311768, + "learning_rate": 6.734730864495862e-05, + "loss": 2.6962, + "step": 21963 + }, + { + "epoch": 1.9898983035491835, + "grad_norm": 0.9359062910079956, + "learning_rate": 6.734126744396786e-05, + "loss": 2.8445, + "step": 21964 + }, + { + "epoch": 1.9899889017236303, + "grad_norm": 0.9957440495491028, + "learning_rate": 6.733522624297712e-05, + "loss": 2.5527, + "step": 21965 + }, + { + "epoch": 1.990079499898077, + "grad_norm": 0.7952594757080078, + "learning_rate": 6.732918504198635e-05, + "loss": 2.0019, + "step": 21966 + }, + { + "epoch": 1.9901700980725239, + "grad_norm": 0.8868899941444397, + "learning_rate": 6.73231438409956e-05, + "loss": 2.8048, + "step": 21967 + }, + { + "epoch": 1.9902606962469707, + "grad_norm": 0.932883620262146, + "learning_rate": 6.731710264000483e-05, + "loss": 2.995, + "step": 21968 + }, + { + "epoch": 1.9903512944214174, + "grad_norm": 0.8102086186408997, + "learning_rate": 6.731106143901408e-05, + "loss": 1.9052, + "step": 21969 + }, + { + "epoch": 1.9904418925958642, + "grad_norm": 0.8037506937980652, + "learning_rate": 6.730502023802331e-05, + "loss": 2.1311, + "step": 21970 + }, + { + "epoch": 1.990532490770311, + "grad_norm": 0.8742132186889648, + "learning_rate": 6.729897903703256e-05, + "loss": 2.5455, + "step": 21971 + }, + { + "epoch": 1.9906230889447578, + "grad_norm": 0.9397000074386597, + "learning_rate": 6.729293783604181e-05, + "loss": 1.9464, + "step": 21972 + }, + { + "epoch": 1.9907136871192046, + "grad_norm": 0.8498817682266235, + "learning_rate": 6.728689663505106e-05, + "loss": 2.698, + "step": 21973 + }, + { + "epoch": 1.9908042852936514, + "grad_norm": 0.9454205632209778, + "learning_rate": 6.728085543406029e-05, + "loss": 2.821, + "step": 21974 + }, + { + "epoch": 1.9908948834680982, + "grad_norm": 0.9128995537757874, + "learning_rate": 6.727481423306954e-05, + "loss": 2.912, + "step": 21975 + }, + { + "epoch": 1.9909854816425447, + "grad_norm": 0.9161812663078308, + "learning_rate": 6.726877303207879e-05, + "loss": 2.7474, + "step": 21976 + }, + { + "epoch": 1.9910760798169918, + "grad_norm": 0.886479377746582, + "learning_rate": 6.726273183108802e-05, + "loss": 2.8674, + "step": 21977 + }, + { + "epoch": 1.9911666779914383, + "grad_norm": 1.0054142475128174, + "learning_rate": 6.725669063009727e-05, + "loss": 2.7268, + "step": 21978 + }, + { + "epoch": 1.9912572761658853, + "grad_norm": 0.7806164622306824, + "learning_rate": 6.72506494291065e-05, + "loss": 2.0807, + "step": 21979 + }, + { + "epoch": 1.991347874340332, + "grad_norm": 0.9604588747024536, + "learning_rate": 6.724460822811576e-05, + "loss": 2.9009, + "step": 21980 + }, + { + "epoch": 1.991438472514779, + "grad_norm": 0.8965021371841431, + "learning_rate": 6.7238567027125e-05, + "loss": 2.9067, + "step": 21981 + }, + { + "epoch": 1.9915290706892255, + "grad_norm": 0.8732870817184448, + "learning_rate": 6.723252582613424e-05, + "loss": 2.5434, + "step": 21982 + }, + { + "epoch": 1.9916196688636725, + "grad_norm": 0.857929527759552, + "learning_rate": 6.722648462514348e-05, + "loss": 2.5574, + "step": 21983 + }, + { + "epoch": 1.991710267038119, + "grad_norm": 0.8588625192642212, + "learning_rate": 6.722044342415273e-05, + "loss": 2.7927, + "step": 21984 + }, + { + "epoch": 1.991800865212566, + "grad_norm": 0.9122637510299683, + "learning_rate": 6.721440222316196e-05, + "loss": 2.8454, + "step": 21985 + }, + { + "epoch": 1.9918914633870126, + "grad_norm": 0.9329195618629456, + "learning_rate": 6.720836102217121e-05, + "loss": 2.7717, + "step": 21986 + }, + { + "epoch": 1.9919820615614596, + "grad_norm": 0.895886242389679, + "learning_rate": 6.720231982118046e-05, + "loss": 3.1623, + "step": 21987 + }, + { + "epoch": 1.9920726597359062, + "grad_norm": 0.9120877385139465, + "learning_rate": 6.71962786201897e-05, + "loss": 2.9836, + "step": 21988 + }, + { + "epoch": 1.9921632579103532, + "grad_norm": 0.7969406247138977, + "learning_rate": 6.719023741919894e-05, + "loss": 1.8708, + "step": 21989 + }, + { + "epoch": 1.9922538560847998, + "grad_norm": 0.8966934084892273, + "learning_rate": 6.718419621820818e-05, + "loss": 2.794, + "step": 21990 + }, + { + "epoch": 1.9923444542592468, + "grad_norm": 0.9936961531639099, + "learning_rate": 6.717815501721743e-05, + "loss": 2.7855, + "step": 21991 + }, + { + "epoch": 1.9924350524336933, + "grad_norm": 1.037261962890625, + "learning_rate": 6.717211381622667e-05, + "loss": 2.7432, + "step": 21992 + }, + { + "epoch": 1.9925256506081404, + "grad_norm": 0.8928008079528809, + "learning_rate": 6.716607261523591e-05, + "loss": 2.7736, + "step": 21993 + }, + { + "epoch": 1.992616248782587, + "grad_norm": 0.8513633608818054, + "learning_rate": 6.716003141424515e-05, + "loss": 2.5907, + "step": 21994 + }, + { + "epoch": 1.992706846957034, + "grad_norm": 0.9010640382766724, + "learning_rate": 6.715399021325441e-05, + "loss": 2.5402, + "step": 21995 + }, + { + "epoch": 1.9927974451314805, + "grad_norm": 1.012383222579956, + "learning_rate": 6.714794901226364e-05, + "loss": 2.5876, + "step": 21996 + }, + { + "epoch": 1.9928880433059275, + "grad_norm": 0.8749256730079651, + "learning_rate": 6.714190781127289e-05, + "loss": 2.6714, + "step": 21997 + }, + { + "epoch": 1.992978641480374, + "grad_norm": 0.8086074590682983, + "learning_rate": 6.713586661028212e-05, + "loss": 1.9288, + "step": 21998 + }, + { + "epoch": 1.993069239654821, + "grad_norm": 0.9043845534324646, + "learning_rate": 6.712982540929137e-05, + "loss": 2.7293, + "step": 21999 + }, + { + "epoch": 1.9931598378292676, + "grad_norm": 1.0049647092819214, + "learning_rate": 6.71237842083006e-05, + "loss": 2.6408, + "step": 22000 + }, + { + "epoch": 1.9932504360037147, + "grad_norm": 0.8988420367240906, + "learning_rate": 6.711774300730985e-05, + "loss": 2.4759, + "step": 22001 + }, + { + "epoch": 1.9933410341781612, + "grad_norm": 0.910529613494873, + "learning_rate": 6.71117018063191e-05, + "loss": 2.8392, + "step": 22002 + }, + { + "epoch": 1.9934316323526082, + "grad_norm": 0.8188683986663818, + "learning_rate": 6.710566060532835e-05, + "loss": 2.5253, + "step": 22003 + }, + { + "epoch": 1.9935222305270548, + "grad_norm": 0.9336397051811218, + "learning_rate": 6.709961940433758e-05, + "loss": 3.0285, + "step": 22004 + }, + { + "epoch": 1.9936128287015018, + "grad_norm": 0.9169546961784363, + "learning_rate": 6.709357820334683e-05, + "loss": 2.8794, + "step": 22005 + }, + { + "epoch": 1.9937034268759484, + "grad_norm": 0.8559876084327698, + "learning_rate": 6.708753700235607e-05, + "loss": 2.5622, + "step": 22006 + }, + { + "epoch": 1.9937940250503954, + "grad_norm": 0.9469301104545593, + "learning_rate": 6.708149580136531e-05, + "loss": 2.6458, + "step": 22007 + }, + { + "epoch": 1.993884623224842, + "grad_norm": 0.7882858514785767, + "learning_rate": 6.707545460037456e-05, + "loss": 2.0467, + "step": 22008 + }, + { + "epoch": 1.993975221399289, + "grad_norm": 0.9021356105804443, + "learning_rate": 6.70694133993838e-05, + "loss": 2.4731, + "step": 22009 + }, + { + "epoch": 1.9940658195737355, + "grad_norm": 0.952904224395752, + "learning_rate": 6.706337219839304e-05, + "loss": 2.8559, + "step": 22010 + }, + { + "epoch": 1.9941564177481825, + "grad_norm": 0.8970682621002197, + "learning_rate": 6.705733099740229e-05, + "loss": 2.7464, + "step": 22011 + }, + { + "epoch": 1.994247015922629, + "grad_norm": 0.9307073950767517, + "learning_rate": 6.705128979641154e-05, + "loss": 2.7337, + "step": 22012 + }, + { + "epoch": 1.994337614097076, + "grad_norm": 1.1247215270996094, + "learning_rate": 6.704524859542077e-05, + "loss": 2.7226, + "step": 22013 + }, + { + "epoch": 1.9944282122715227, + "grad_norm": 0.9505211710929871, + "learning_rate": 6.703920739443002e-05, + "loss": 2.6447, + "step": 22014 + }, + { + "epoch": 1.9945188104459695, + "grad_norm": 0.9302672743797302, + "learning_rate": 6.703316619343925e-05, + "loss": 2.7179, + "step": 22015 + }, + { + "epoch": 1.9946094086204162, + "grad_norm": 0.9604479074478149, + "learning_rate": 6.70271249924485e-05, + "loss": 2.7867, + "step": 22016 + }, + { + "epoch": 1.994700006794863, + "grad_norm": 0.8728188276290894, + "learning_rate": 6.702108379145775e-05, + "loss": 2.8628, + "step": 22017 + }, + { + "epoch": 1.9947906049693098, + "grad_norm": 0.7994276285171509, + "learning_rate": 6.7015042590467e-05, + "loss": 2.6317, + "step": 22018 + }, + { + "epoch": 1.9948812031437566, + "grad_norm": 0.9116067886352539, + "learning_rate": 6.700900138947623e-05, + "loss": 2.6773, + "step": 22019 + }, + { + "epoch": 1.9949718013182034, + "grad_norm": 1.0662349462509155, + "learning_rate": 6.700296018848548e-05, + "loss": 2.3877, + "step": 22020 + }, + { + "epoch": 1.9950623994926502, + "grad_norm": 0.9095531105995178, + "learning_rate": 6.699691898749471e-05, + "loss": 2.6526, + "step": 22021 + }, + { + "epoch": 1.995152997667097, + "grad_norm": 0.9744149446487427, + "learning_rate": 6.699087778650396e-05, + "loss": 2.8044, + "step": 22022 + }, + { + "epoch": 1.9952435958415438, + "grad_norm": 0.8659090995788574, + "learning_rate": 6.69848365855132e-05, + "loss": 2.4653, + "step": 22023 + }, + { + "epoch": 1.9953341940159905, + "grad_norm": 0.8780086040496826, + "learning_rate": 6.697879538452244e-05, + "loss": 2.6913, + "step": 22024 + }, + { + "epoch": 1.9954247921904373, + "grad_norm": 0.8761019110679626, + "learning_rate": 6.697275418353169e-05, + "loss": 2.5529, + "step": 22025 + }, + { + "epoch": 1.9955153903648841, + "grad_norm": 0.8622615337371826, + "learning_rate": 6.696671298254094e-05, + "loss": 2.0807, + "step": 22026 + }, + { + "epoch": 1.995605988539331, + "grad_norm": 1.0315403938293457, + "learning_rate": 6.696067178155018e-05, + "loss": 2.4252, + "step": 22027 + }, + { + "epoch": 1.9956965867137777, + "grad_norm": 0.9398416876792908, + "learning_rate": 6.695463058055942e-05, + "loss": 2.678, + "step": 22028 + }, + { + "epoch": 1.9957871848882245, + "grad_norm": 0.930877685546875, + "learning_rate": 6.694858937956867e-05, + "loss": 2.7131, + "step": 22029 + }, + { + "epoch": 1.9958777830626713, + "grad_norm": 0.9926424026489258, + "learning_rate": 6.69425481785779e-05, + "loss": 2.6655, + "step": 22030 + }, + { + "epoch": 1.995968381237118, + "grad_norm": 0.8757054209709167, + "learning_rate": 6.693650697758715e-05, + "loss": 2.7208, + "step": 22031 + }, + { + "epoch": 1.9960589794115648, + "grad_norm": 0.908991277217865, + "learning_rate": 6.69304657765964e-05, + "loss": 2.7166, + "step": 22032 + }, + { + "epoch": 1.9961495775860116, + "grad_norm": 0.8713201880455017, + "learning_rate": 6.692442457560564e-05, + "loss": 2.9055, + "step": 22033 + }, + { + "epoch": 1.9962401757604584, + "grad_norm": 0.9706884622573853, + "learning_rate": 6.691838337461488e-05, + "loss": 2.626, + "step": 22034 + }, + { + "epoch": 1.9963307739349052, + "grad_norm": 0.7578139305114746, + "learning_rate": 6.691234217362412e-05, + "loss": 1.9608, + "step": 22035 + }, + { + "epoch": 1.996421372109352, + "grad_norm": 0.8554753065109253, + "learning_rate": 6.690630097263336e-05, + "loss": 2.7132, + "step": 22036 + }, + { + "epoch": 1.9965119702837988, + "grad_norm": 0.7802469730377197, + "learning_rate": 6.69002597716426e-05, + "loss": 2.159, + "step": 22037 + }, + { + "epoch": 1.9966025684582456, + "grad_norm": 0.9214851260185242, + "learning_rate": 6.689421857065184e-05, + "loss": 2.715, + "step": 22038 + }, + { + "epoch": 1.9966931666326924, + "grad_norm": 0.802523672580719, + "learning_rate": 6.688817736966109e-05, + "loss": 1.8197, + "step": 22039 + }, + { + "epoch": 1.9967837648071391, + "grad_norm": 0.9136136770248413, + "learning_rate": 6.688213616867033e-05, + "loss": 2.6891, + "step": 22040 + }, + { + "epoch": 1.996874362981586, + "grad_norm": 0.8982875347137451, + "learning_rate": 6.687609496767958e-05, + "loss": 2.6421, + "step": 22041 + }, + { + "epoch": 1.9969649611560327, + "grad_norm": 0.880587100982666, + "learning_rate": 6.687005376668882e-05, + "loss": 2.7997, + "step": 22042 + }, + { + "epoch": 1.9970555593304795, + "grad_norm": 0.9064926505088806, + "learning_rate": 6.686401256569806e-05, + "loss": 2.5763, + "step": 22043 + }, + { + "epoch": 1.9971461575049263, + "grad_norm": 0.9789602160453796, + "learning_rate": 6.685797136470731e-05, + "loss": 2.6516, + "step": 22044 + }, + { + "epoch": 1.997236755679373, + "grad_norm": 0.8100987672805786, + "learning_rate": 6.685193016371655e-05, + "loss": 1.9725, + "step": 22045 + }, + { + "epoch": 1.9973273538538199, + "grad_norm": 0.8759751915931702, + "learning_rate": 6.684588896272579e-05, + "loss": 2.6957, + "step": 22046 + }, + { + "epoch": 1.9974179520282667, + "grad_norm": 0.9263187050819397, + "learning_rate": 6.683984776173504e-05, + "loss": 2.8912, + "step": 22047 + }, + { + "epoch": 1.9975085502027135, + "grad_norm": 0.785764753818512, + "learning_rate": 6.683380656074429e-05, + "loss": 2.1163, + "step": 22048 + }, + { + "epoch": 1.9975991483771602, + "grad_norm": 0.8093534708023071, + "learning_rate": 6.682776535975352e-05, + "loss": 2.6133, + "step": 22049 + }, + { + "epoch": 1.997689746551607, + "grad_norm": 0.7820321321487427, + "learning_rate": 6.682172415876277e-05, + "loss": 2.2061, + "step": 22050 + }, + { + "epoch": 1.9977803447260538, + "grad_norm": 0.9777339100837708, + "learning_rate": 6.6815682957772e-05, + "loss": 2.7406, + "step": 22051 + }, + { + "epoch": 1.9978709429005006, + "grad_norm": 0.776572585105896, + "learning_rate": 6.680964175678125e-05, + "loss": 2.1379, + "step": 22052 + }, + { + "epoch": 1.9979615410749474, + "grad_norm": 0.948615312576294, + "learning_rate": 6.680360055579049e-05, + "loss": 2.7715, + "step": 22053 + }, + { + "epoch": 1.9980521392493942, + "grad_norm": 0.9645307660102844, + "learning_rate": 6.679755935479973e-05, + "loss": 2.6764, + "step": 22054 + }, + { + "epoch": 1.998142737423841, + "grad_norm": 0.8875554203987122, + "learning_rate": 6.679151815380898e-05, + "loss": 2.6825, + "step": 22055 + }, + { + "epoch": 1.9982333355982878, + "grad_norm": 0.8336690664291382, + "learning_rate": 6.678547695281823e-05, + "loss": 2.5742, + "step": 22056 + }, + { + "epoch": 1.9983239337727343, + "grad_norm": 0.9038867950439453, + "learning_rate": 6.677943575182746e-05, + "loss": 2.6289, + "step": 22057 + }, + { + "epoch": 1.9984145319471813, + "grad_norm": 0.890823483467102, + "learning_rate": 6.677339455083671e-05, + "loss": 2.7844, + "step": 22058 + }, + { + "epoch": 1.998505130121628, + "grad_norm": 0.901271641254425, + "learning_rate": 6.676735334984596e-05, + "loss": 3.0126, + "step": 22059 + }, + { + "epoch": 1.998595728296075, + "grad_norm": 0.8092062473297119, + "learning_rate": 6.676131214885519e-05, + "loss": 2.5834, + "step": 22060 + }, + { + "epoch": 1.9986863264705215, + "grad_norm": 0.8938345313072205, + "learning_rate": 6.675527094786444e-05, + "loss": 2.7163, + "step": 22061 + }, + { + "epoch": 1.9987769246449685, + "grad_norm": 0.9894053339958191, + "learning_rate": 6.674922974687369e-05, + "loss": 2.7361, + "step": 22062 + }, + { + "epoch": 1.998867522819415, + "grad_norm": 0.8849132657051086, + "learning_rate": 6.674318854588293e-05, + "loss": 2.6687, + "step": 22063 + }, + { + "epoch": 1.998958120993862, + "grad_norm": 0.8324366807937622, + "learning_rate": 6.673714734489217e-05, + "loss": 2.0976, + "step": 22064 + }, + { + "epoch": 1.9990487191683086, + "grad_norm": 0.9075923562049866, + "learning_rate": 6.673110614390142e-05, + "loss": 2.3277, + "step": 22065 + }, + { + "epoch": 1.9991393173427556, + "grad_norm": 0.9044941663742065, + "learning_rate": 6.672506494291065e-05, + "loss": 2.933, + "step": 22066 + }, + { + "epoch": 1.9992299155172022, + "grad_norm": 0.9634657502174377, + "learning_rate": 6.67190237419199e-05, + "loss": 2.8271, + "step": 22067 + }, + { + "epoch": 1.9993205136916492, + "grad_norm": 0.9750401973724365, + "learning_rate": 6.671298254092913e-05, + "loss": 2.6499, + "step": 22068 + }, + { + "epoch": 1.9994111118660958, + "grad_norm": 0.9068164825439453, + "learning_rate": 6.670694133993838e-05, + "loss": 2.7867, + "step": 22069 + }, + { + "epoch": 1.9995017100405428, + "grad_norm": 0.8973096609115601, + "learning_rate": 6.670090013894763e-05, + "loss": 2.707, + "step": 22070 + }, + { + "epoch": 1.9995923082149893, + "grad_norm": 0.9456422328948975, + "learning_rate": 6.669485893795687e-05, + "loss": 2.7357, + "step": 22071 + }, + { + "epoch": 1.9996829063894364, + "grad_norm": 0.7364769577980042, + "learning_rate": 6.668881773696611e-05, + "loss": 2.1189, + "step": 22072 + }, + { + "epoch": 1.999773504563883, + "grad_norm": 1.0220099687576294, + "learning_rate": 6.668277653597536e-05, + "loss": 2.6426, + "step": 22073 + }, + { + "epoch": 1.99986410273833, + "grad_norm": 0.8913978338241577, + "learning_rate": 6.667673533498459e-05, + "loss": 2.7021, + "step": 22074 + }, + { + "epoch": 1.9999547009127765, + "grad_norm": 0.8936171531677246, + "learning_rate": 6.667069413399384e-05, + "loss": 2.5662, + "step": 22075 + }, + { + "epoch": 2.0000452990872235, + "grad_norm": 0.7807835936546326, + "learning_rate": 6.666465293300309e-05, + "loss": 1.988, + "step": 22076 + }, + { + "epoch": 2.00013589726167, + "grad_norm": 0.9288907647132874, + "learning_rate": 6.665861173201233e-05, + "loss": 2.6966, + "step": 22077 + }, + { + "epoch": 2.000226495436117, + "grad_norm": 0.8342959880828857, + "learning_rate": 6.665257053102158e-05, + "loss": 2.6425, + "step": 22078 + }, + { + "epoch": 2.0003170936105636, + "grad_norm": 0.9211458563804626, + "learning_rate": 6.664652933003081e-05, + "loss": 2.7122, + "step": 22079 + }, + { + "epoch": 2.0004076917850107, + "grad_norm": 0.8788048624992371, + "learning_rate": 6.664048812904006e-05, + "loss": 3.0007, + "step": 22080 + }, + { + "epoch": 2.000498289959457, + "grad_norm": 1.0346542596817017, + "learning_rate": 6.66344469280493e-05, + "loss": 2.6983, + "step": 22081 + }, + { + "epoch": 2.0005888881339042, + "grad_norm": 0.8923380374908447, + "learning_rate": 6.662840572705854e-05, + "loss": 2.6916, + "step": 22082 + }, + { + "epoch": 2.000679486308351, + "grad_norm": 0.8703694343566895, + "learning_rate": 6.662236452606778e-05, + "loss": 2.6736, + "step": 22083 + }, + { + "epoch": 2.000770084482798, + "grad_norm": 0.8558394908905029, + "learning_rate": 6.661632332507703e-05, + "loss": 2.4625, + "step": 22084 + }, + { + "epoch": 2.0008606826572444, + "grad_norm": 0.8873952627182007, + "learning_rate": 6.661028212408627e-05, + "loss": 2.5902, + "step": 22085 + }, + { + "epoch": 2.0009512808316914, + "grad_norm": 0.7519994974136353, + "learning_rate": 6.660424092309552e-05, + "loss": 2.0801, + "step": 22086 + }, + { + "epoch": 2.001041879006138, + "grad_norm": 0.8575258851051331, + "learning_rate": 6.659819972210476e-05, + "loss": 2.4456, + "step": 22087 + }, + { + "epoch": 2.001132477180585, + "grad_norm": 0.9336376786231995, + "learning_rate": 6.6592158521114e-05, + "loss": 2.6561, + "step": 22088 + }, + { + "epoch": 2.0012230753550315, + "grad_norm": 0.8756159543991089, + "learning_rate": 6.658611732012324e-05, + "loss": 2.6366, + "step": 22089 + }, + { + "epoch": 2.0013136735294785, + "grad_norm": 0.9274473786354065, + "learning_rate": 6.658007611913248e-05, + "loss": 2.4667, + "step": 22090 + }, + { + "epoch": 2.001404271703925, + "grad_norm": 0.9170359969139099, + "learning_rate": 6.657403491814173e-05, + "loss": 2.6087, + "step": 22091 + }, + { + "epoch": 2.001494869878372, + "grad_norm": 0.8461923003196716, + "learning_rate": 6.656799371715098e-05, + "loss": 2.7276, + "step": 22092 + }, + { + "epoch": 2.0015854680528187, + "grad_norm": 0.8855032920837402, + "learning_rate": 6.656195251616021e-05, + "loss": 2.639, + "step": 22093 + }, + { + "epoch": 2.0016760662272657, + "grad_norm": 0.856621265411377, + "learning_rate": 6.655591131516946e-05, + "loss": 2.6274, + "step": 22094 + }, + { + "epoch": 2.0017666644017122, + "grad_norm": 0.867649257183075, + "learning_rate": 6.654987011417871e-05, + "loss": 2.4543, + "step": 22095 + }, + { + "epoch": 2.0018572625761593, + "grad_norm": 0.9019612073898315, + "learning_rate": 6.654382891318794e-05, + "loss": 2.4444, + "step": 22096 + }, + { + "epoch": 2.001947860750606, + "grad_norm": 0.9007282257080078, + "learning_rate": 6.653778771219719e-05, + "loss": 2.5829, + "step": 22097 + }, + { + "epoch": 2.002038458925053, + "grad_norm": 0.945803165435791, + "learning_rate": 6.653174651120642e-05, + "loss": 2.8086, + "step": 22098 + }, + { + "epoch": 2.0021290570994994, + "grad_norm": 0.9241525530815125, + "learning_rate": 6.652570531021567e-05, + "loss": 2.757, + "step": 22099 + }, + { + "epoch": 2.0022196552739464, + "grad_norm": 0.9049884676933289, + "learning_rate": 6.651966410922492e-05, + "loss": 2.859, + "step": 22100 + }, + { + "epoch": 2.002310253448393, + "grad_norm": 0.9909634590148926, + "learning_rate": 6.651362290823417e-05, + "loss": 2.4467, + "step": 22101 + }, + { + "epoch": 2.00240085162284, + "grad_norm": 0.7731654047966003, + "learning_rate": 6.65075817072434e-05, + "loss": 1.898, + "step": 22102 + }, + { + "epoch": 2.0024914497972865, + "grad_norm": 0.9706332683563232, + "learning_rate": 6.650154050625265e-05, + "loss": 2.9546, + "step": 22103 + }, + { + "epoch": 2.0025820479717336, + "grad_norm": 0.7487534880638123, + "learning_rate": 6.649549930526188e-05, + "loss": 1.8182, + "step": 22104 + }, + { + "epoch": 2.00267264614618, + "grad_norm": 0.9215719103813171, + "learning_rate": 6.648945810427113e-05, + "loss": 2.6665, + "step": 22105 + }, + { + "epoch": 2.002763244320627, + "grad_norm": 1.0683985948562622, + "learning_rate": 6.648341690328036e-05, + "loss": 2.78, + "step": 22106 + }, + { + "epoch": 2.0028538424950737, + "grad_norm": 0.8349151015281677, + "learning_rate": 6.647737570228963e-05, + "loss": 2.1221, + "step": 22107 + }, + { + "epoch": 2.0029444406695207, + "grad_norm": 0.9402657151222229, + "learning_rate": 6.647133450129886e-05, + "loss": 2.4584, + "step": 22108 + }, + { + "epoch": 2.0030350388439673, + "grad_norm": 0.9352627396583557, + "learning_rate": 6.646529330030811e-05, + "loss": 2.4466, + "step": 22109 + }, + { + "epoch": 2.0031256370184143, + "grad_norm": 0.9026618599891663, + "learning_rate": 6.645925209931734e-05, + "loss": 2.6493, + "step": 22110 + }, + { + "epoch": 2.003216235192861, + "grad_norm": 0.969508171081543, + "learning_rate": 6.645321089832659e-05, + "loss": 2.7327, + "step": 22111 + }, + { + "epoch": 2.003306833367308, + "grad_norm": 0.9475044012069702, + "learning_rate": 6.644716969733584e-05, + "loss": 2.3709, + "step": 22112 + }, + { + "epoch": 2.0033974315417544, + "grad_norm": 1.0175234079360962, + "learning_rate": 6.644112849634507e-05, + "loss": 2.456, + "step": 22113 + }, + { + "epoch": 2.0034880297162014, + "grad_norm": 0.924346387386322, + "learning_rate": 6.643508729535432e-05, + "loss": 2.6906, + "step": 22114 + }, + { + "epoch": 2.003578627890648, + "grad_norm": 1.0553594827651978, + "learning_rate": 6.642904609436357e-05, + "loss": 2.5612, + "step": 22115 + }, + { + "epoch": 2.003669226065095, + "grad_norm": 0.8085430860519409, + "learning_rate": 6.642300489337281e-05, + "loss": 1.9029, + "step": 22116 + }, + { + "epoch": 2.0037598242395416, + "grad_norm": 0.8934222459793091, + "learning_rate": 6.641696369238205e-05, + "loss": 2.5233, + "step": 22117 + }, + { + "epoch": 2.003850422413988, + "grad_norm": 0.8734584450721741, + "learning_rate": 6.64109224913913e-05, + "loss": 2.7408, + "step": 22118 + }, + { + "epoch": 2.003941020588435, + "grad_norm": 0.9305898547172546, + "learning_rate": 6.640488129040053e-05, + "loss": 2.6485, + "step": 22119 + }, + { + "epoch": 2.0040316187628817, + "grad_norm": 0.9577285647392273, + "learning_rate": 6.639884008940978e-05, + "loss": 2.6622, + "step": 22120 + }, + { + "epoch": 2.0041222169373287, + "grad_norm": 0.920318603515625, + "learning_rate": 6.639279888841901e-05, + "loss": 2.4171, + "step": 22121 + }, + { + "epoch": 2.0042128151117753, + "grad_norm": 1.0217876434326172, + "learning_rate": 6.638675768742827e-05, + "loss": 2.2943, + "step": 22122 + }, + { + "epoch": 2.0043034132862223, + "grad_norm": 0.9341265559196472, + "learning_rate": 6.63807164864375e-05, + "loss": 2.4277, + "step": 22123 + }, + { + "epoch": 2.004394011460669, + "grad_norm": 0.9916378855705261, + "learning_rate": 6.637467528544675e-05, + "loss": 2.3802, + "step": 22124 + }, + { + "epoch": 2.004484609635116, + "grad_norm": 0.9684740304946899, + "learning_rate": 6.636863408445599e-05, + "loss": 2.6241, + "step": 22125 + }, + { + "epoch": 2.0045752078095624, + "grad_norm": 0.757253110408783, + "learning_rate": 6.636259288346524e-05, + "loss": 1.766, + "step": 22126 + }, + { + "epoch": 2.0046658059840095, + "grad_norm": 1.0191038846969604, + "learning_rate": 6.635655168247448e-05, + "loss": 2.7798, + "step": 22127 + }, + { + "epoch": 2.004756404158456, + "grad_norm": 1.018718957901001, + "learning_rate": 6.635051048148372e-05, + "loss": 2.6917, + "step": 22128 + }, + { + "epoch": 2.004847002332903, + "grad_norm": 1.0202114582061768, + "learning_rate": 6.634446928049296e-05, + "loss": 2.5326, + "step": 22129 + }, + { + "epoch": 2.0049376005073496, + "grad_norm": 0.9241003394126892, + "learning_rate": 6.633842807950221e-05, + "loss": 2.423, + "step": 22130 + }, + { + "epoch": 2.0050281986817966, + "grad_norm": 0.9957377910614014, + "learning_rate": 6.633238687851146e-05, + "loss": 2.5518, + "step": 22131 + }, + { + "epoch": 2.005118796856243, + "grad_norm": 0.9588457345962524, + "learning_rate": 6.63263456775207e-05, + "loss": 2.7598, + "step": 22132 + }, + { + "epoch": 2.00520939503069, + "grad_norm": 0.9140105247497559, + "learning_rate": 6.632030447652994e-05, + "loss": 2.5914, + "step": 22133 + }, + { + "epoch": 2.0052999932051367, + "grad_norm": 0.9766395092010498, + "learning_rate": 6.631426327553918e-05, + "loss": 2.6104, + "step": 22134 + }, + { + "epoch": 2.0053905913795838, + "grad_norm": 0.909700870513916, + "learning_rate": 6.630822207454842e-05, + "loss": 1.9343, + "step": 22135 + }, + { + "epoch": 2.0054811895540303, + "grad_norm": 0.9397399425506592, + "learning_rate": 6.630218087355766e-05, + "loss": 2.5379, + "step": 22136 + }, + { + "epoch": 2.0055717877284773, + "grad_norm": 1.0000641345977783, + "learning_rate": 6.629613967256692e-05, + "loss": 2.7217, + "step": 22137 + }, + { + "epoch": 2.005662385902924, + "grad_norm": 0.9637089967727661, + "learning_rate": 6.629009847157615e-05, + "loss": 2.6618, + "step": 22138 + }, + { + "epoch": 2.005752984077371, + "grad_norm": 0.8150345087051392, + "learning_rate": 6.62840572705854e-05, + "loss": 1.8898, + "step": 22139 + }, + { + "epoch": 2.0058435822518175, + "grad_norm": 1.0562208890914917, + "learning_rate": 6.627801606959463e-05, + "loss": 2.2337, + "step": 22140 + }, + { + "epoch": 2.0059341804262645, + "grad_norm": 0.9076548218727112, + "learning_rate": 6.627197486860388e-05, + "loss": 2.8629, + "step": 22141 + }, + { + "epoch": 2.006024778600711, + "grad_norm": 0.9217681288719177, + "learning_rate": 6.626593366761312e-05, + "loss": 2.577, + "step": 22142 + }, + { + "epoch": 2.006115376775158, + "grad_norm": 1.139878749847412, + "learning_rate": 6.625989246662236e-05, + "loss": 2.7738, + "step": 22143 + }, + { + "epoch": 2.0062059749496046, + "grad_norm": 0.8596683740615845, + "learning_rate": 6.625385126563161e-05, + "loss": 2.3608, + "step": 22144 + }, + { + "epoch": 2.0062965731240516, + "grad_norm": 0.9250267744064331, + "learning_rate": 6.624781006464086e-05, + "loss": 2.4953, + "step": 22145 + }, + { + "epoch": 2.006387171298498, + "grad_norm": 1.1562541723251343, + "learning_rate": 6.62417688636501e-05, + "loss": 2.5402, + "step": 22146 + }, + { + "epoch": 2.006477769472945, + "grad_norm": 0.9604735374450684, + "learning_rate": 6.623572766265934e-05, + "loss": 2.6912, + "step": 22147 + }, + { + "epoch": 2.0065683676473918, + "grad_norm": 1.0299596786499023, + "learning_rate": 6.622968646166859e-05, + "loss": 2.6638, + "step": 22148 + }, + { + "epoch": 2.006658965821839, + "grad_norm": 0.9203044176101685, + "learning_rate": 6.622364526067782e-05, + "loss": 2.4938, + "step": 22149 + }, + { + "epoch": 2.0067495639962853, + "grad_norm": 0.9499161243438721, + "learning_rate": 6.621760405968707e-05, + "loss": 2.4912, + "step": 22150 + }, + { + "epoch": 2.0068401621707324, + "grad_norm": 0.9272204637527466, + "learning_rate": 6.62115628586963e-05, + "loss": 2.4811, + "step": 22151 + }, + { + "epoch": 2.006930760345179, + "grad_norm": 0.9735282063484192, + "learning_rate": 6.620552165770556e-05, + "loss": 2.9849, + "step": 22152 + }, + { + "epoch": 2.007021358519626, + "grad_norm": 1.053053617477417, + "learning_rate": 6.61994804567148e-05, + "loss": 2.6311, + "step": 22153 + }, + { + "epoch": 2.0071119566940725, + "grad_norm": 0.8462556004524231, + "learning_rate": 6.619343925572405e-05, + "loss": 2.2206, + "step": 22154 + }, + { + "epoch": 2.0072025548685195, + "grad_norm": 0.9564963579177856, + "learning_rate": 6.618739805473328e-05, + "loss": 2.6931, + "step": 22155 + }, + { + "epoch": 2.007293153042966, + "grad_norm": 0.9784834980964661, + "learning_rate": 6.618135685374253e-05, + "loss": 3.0683, + "step": 22156 + }, + { + "epoch": 2.007383751217413, + "grad_norm": 0.9889714121818542, + "learning_rate": 6.617531565275176e-05, + "loss": 2.7467, + "step": 22157 + }, + { + "epoch": 2.0074743493918596, + "grad_norm": 0.9511418342590332, + "learning_rate": 6.616927445176101e-05, + "loss": 2.5582, + "step": 22158 + }, + { + "epoch": 2.0075649475663067, + "grad_norm": 1.0140000581741333, + "learning_rate": 6.616323325077026e-05, + "loss": 2.7114, + "step": 22159 + }, + { + "epoch": 2.0076555457407532, + "grad_norm": 0.9607928395271301, + "learning_rate": 6.61571920497795e-05, + "loss": 2.8172, + "step": 22160 + }, + { + "epoch": 2.0077461439152002, + "grad_norm": 0.9798826575279236, + "learning_rate": 6.615115084878874e-05, + "loss": 2.4753, + "step": 22161 + }, + { + "epoch": 2.007836742089647, + "grad_norm": 1.0101594924926758, + "learning_rate": 6.614510964779799e-05, + "loss": 2.3959, + "step": 22162 + }, + { + "epoch": 2.007927340264094, + "grad_norm": 0.9417849779129028, + "learning_rate": 6.613906844680723e-05, + "loss": 2.5224, + "step": 22163 + }, + { + "epoch": 2.0080179384385404, + "grad_norm": 1.0252774953842163, + "learning_rate": 6.613302724581647e-05, + "loss": 2.5283, + "step": 22164 + }, + { + "epoch": 2.0081085366129874, + "grad_norm": 0.9838129281997681, + "learning_rate": 6.612698604482572e-05, + "loss": 2.5425, + "step": 22165 + }, + { + "epoch": 2.008199134787434, + "grad_norm": 0.9162052273750305, + "learning_rate": 6.612094484383495e-05, + "loss": 2.3336, + "step": 22166 + }, + { + "epoch": 2.008289732961881, + "grad_norm": 0.9393516182899475, + "learning_rate": 6.611490364284421e-05, + "loss": 2.4063, + "step": 22167 + }, + { + "epoch": 2.0083803311363275, + "grad_norm": 1.0202980041503906, + "learning_rate": 6.610886244185345e-05, + "loss": 2.6607, + "step": 22168 + }, + { + "epoch": 2.0084709293107745, + "grad_norm": 1.0163938999176025, + "learning_rate": 6.610282124086269e-05, + "loss": 2.7709, + "step": 22169 + }, + { + "epoch": 2.008561527485221, + "grad_norm": 0.9611431360244751, + "learning_rate": 6.609678003987193e-05, + "loss": 2.9184, + "step": 22170 + }, + { + "epoch": 2.008652125659668, + "grad_norm": 0.9292290210723877, + "learning_rate": 6.609073883888117e-05, + "loss": 2.342, + "step": 22171 + }, + { + "epoch": 2.0087427238341147, + "grad_norm": 0.9662888050079346, + "learning_rate": 6.608469763789041e-05, + "loss": 2.3895, + "step": 22172 + }, + { + "epoch": 2.0088333220085617, + "grad_norm": 0.9196934103965759, + "learning_rate": 6.607865643689966e-05, + "loss": 2.9042, + "step": 22173 + }, + { + "epoch": 2.0089239201830082, + "grad_norm": 0.9488062262535095, + "learning_rate": 6.607261523590889e-05, + "loss": 2.5706, + "step": 22174 + }, + { + "epoch": 2.0090145183574553, + "grad_norm": 0.8723991513252258, + "learning_rate": 6.606657403491815e-05, + "loss": 2.5871, + "step": 22175 + }, + { + "epoch": 2.009105116531902, + "grad_norm": 0.9604541063308716, + "learning_rate": 6.606053283392739e-05, + "loss": 2.7618, + "step": 22176 + }, + { + "epoch": 2.009195714706349, + "grad_norm": 1.0583372116088867, + "learning_rate": 6.605449163293663e-05, + "loss": 2.6371, + "step": 22177 + }, + { + "epoch": 2.0092863128807954, + "grad_norm": 0.9733402132987976, + "learning_rate": 6.604845043194588e-05, + "loss": 2.6152, + "step": 22178 + }, + { + "epoch": 2.0093769110552424, + "grad_norm": 0.9338865876197815, + "learning_rate": 6.604240923095511e-05, + "loss": 2.3902, + "step": 22179 + }, + { + "epoch": 2.009467509229689, + "grad_norm": 0.9014615416526794, + "learning_rate": 6.603636802996436e-05, + "loss": 2.7187, + "step": 22180 + }, + { + "epoch": 2.009558107404136, + "grad_norm": 0.9694496393203735, + "learning_rate": 6.60303268289736e-05, + "loss": 2.5188, + "step": 22181 + }, + { + "epoch": 2.0096487055785826, + "grad_norm": 0.8712385892868042, + "learning_rate": 6.602428562798286e-05, + "loss": 1.7952, + "step": 22182 + }, + { + "epoch": 2.0097393037530296, + "grad_norm": 0.9296853542327881, + "learning_rate": 6.601824442699209e-05, + "loss": 2.8691, + "step": 22183 + }, + { + "epoch": 2.009829901927476, + "grad_norm": 0.9344446063041687, + "learning_rate": 6.601220322600134e-05, + "loss": 2.7749, + "step": 22184 + }, + { + "epoch": 2.009920500101923, + "grad_norm": 0.7947579026222229, + "learning_rate": 6.600616202501057e-05, + "loss": 1.9604, + "step": 22185 + }, + { + "epoch": 2.0100110982763697, + "grad_norm": 0.945681095123291, + "learning_rate": 6.600012082401982e-05, + "loss": 2.6124, + "step": 22186 + }, + { + "epoch": 2.0101016964508167, + "grad_norm": 0.9705020189285278, + "learning_rate": 6.599407962302905e-05, + "loss": 2.5743, + "step": 22187 + }, + { + "epoch": 2.0101922946252633, + "grad_norm": 0.9583263993263245, + "learning_rate": 6.59880384220383e-05, + "loss": 2.4173, + "step": 22188 + }, + { + "epoch": 2.0102828927997103, + "grad_norm": 0.9812033176422119, + "learning_rate": 6.598199722104754e-05, + "loss": 2.3876, + "step": 22189 + }, + { + "epoch": 2.010373490974157, + "grad_norm": 0.9758785963058472, + "learning_rate": 6.59759560200568e-05, + "loss": 2.8615, + "step": 22190 + }, + { + "epoch": 2.010464089148604, + "grad_norm": 0.9275907278060913, + "learning_rate": 6.596991481906603e-05, + "loss": 2.362, + "step": 22191 + }, + { + "epoch": 2.0105546873230504, + "grad_norm": 0.9373116493225098, + "learning_rate": 6.596387361807528e-05, + "loss": 2.5279, + "step": 22192 + }, + { + "epoch": 2.0106452854974974, + "grad_norm": 1.14357328414917, + "learning_rate": 6.595783241708451e-05, + "loss": 2.434, + "step": 22193 + }, + { + "epoch": 2.010735883671944, + "grad_norm": 0.8929854035377502, + "learning_rate": 6.595179121609376e-05, + "loss": 2.5851, + "step": 22194 + }, + { + "epoch": 2.010826481846391, + "grad_norm": 0.9810654520988464, + "learning_rate": 6.594575001510301e-05, + "loss": 2.4313, + "step": 22195 + }, + { + "epoch": 2.0109170800208376, + "grad_norm": 0.9031517505645752, + "learning_rate": 6.593970881411224e-05, + "loss": 2.3828, + "step": 22196 + }, + { + "epoch": 2.011007678195284, + "grad_norm": 0.9971385598182678, + "learning_rate": 6.593366761312149e-05, + "loss": 2.569, + "step": 22197 + }, + { + "epoch": 2.011098276369731, + "grad_norm": 1.059675693511963, + "learning_rate": 6.592762641213074e-05, + "loss": 2.626, + "step": 22198 + }, + { + "epoch": 2.0111888745441777, + "grad_norm": 1.106652855873108, + "learning_rate": 6.592158521113999e-05, + "loss": 2.2331, + "step": 22199 + }, + { + "epoch": 2.0112794727186247, + "grad_norm": 0.8411643505096436, + "learning_rate": 6.591554401014922e-05, + "loss": 2.043, + "step": 22200 + }, + { + "epoch": 2.0113700708930713, + "grad_norm": 1.0469655990600586, + "learning_rate": 6.590950280915847e-05, + "loss": 2.8897, + "step": 22201 + }, + { + "epoch": 2.0114606690675183, + "grad_norm": 0.8532590270042419, + "learning_rate": 6.59034616081677e-05, + "loss": 1.8863, + "step": 22202 + }, + { + "epoch": 2.011551267241965, + "grad_norm": 1.0218113660812378, + "learning_rate": 6.589742040717695e-05, + "loss": 3.0551, + "step": 22203 + }, + { + "epoch": 2.011641865416412, + "grad_norm": 0.9521249532699585, + "learning_rate": 6.589137920618618e-05, + "loss": 2.7329, + "step": 22204 + }, + { + "epoch": 2.0117324635908584, + "grad_norm": 0.9852840304374695, + "learning_rate": 6.588533800519544e-05, + "loss": 2.581, + "step": 22205 + }, + { + "epoch": 2.0118230617653055, + "grad_norm": 1.0647069215774536, + "learning_rate": 6.587929680420468e-05, + "loss": 2.5578, + "step": 22206 + }, + { + "epoch": 2.011913659939752, + "grad_norm": 0.9651138186454773, + "learning_rate": 6.587325560321393e-05, + "loss": 2.363, + "step": 22207 + }, + { + "epoch": 2.012004258114199, + "grad_norm": 0.8977387547492981, + "learning_rate": 6.586721440222316e-05, + "loss": 2.4588, + "step": 22208 + }, + { + "epoch": 2.0120948562886456, + "grad_norm": 0.9089027047157288, + "learning_rate": 6.586117320123241e-05, + "loss": 2.413, + "step": 22209 + }, + { + "epoch": 2.0121854544630926, + "grad_norm": 0.9179531931877136, + "learning_rate": 6.585513200024166e-05, + "loss": 2.7979, + "step": 22210 + }, + { + "epoch": 2.012276052637539, + "grad_norm": 0.9907575249671936, + "learning_rate": 6.584909079925089e-05, + "loss": 2.6272, + "step": 22211 + }, + { + "epoch": 2.012366650811986, + "grad_norm": 0.9575689435005188, + "learning_rate": 6.584304959826014e-05, + "loss": 2.4573, + "step": 22212 + }, + { + "epoch": 2.0124572489864327, + "grad_norm": 0.9608269333839417, + "learning_rate": 6.583700839726938e-05, + "loss": 2.5743, + "step": 22213 + }, + { + "epoch": 2.0125478471608798, + "grad_norm": 0.9041986465454102, + "learning_rate": 6.583096719627863e-05, + "loss": 2.2539, + "step": 22214 + }, + { + "epoch": 2.0126384453353263, + "grad_norm": 0.9666232466697693, + "learning_rate": 6.582492599528787e-05, + "loss": 2.4757, + "step": 22215 + }, + { + "epoch": 2.0127290435097733, + "grad_norm": 0.8889815807342529, + "learning_rate": 6.581888479429711e-05, + "loss": 2.4068, + "step": 22216 + }, + { + "epoch": 2.01281964168422, + "grad_norm": 1.0437606573104858, + "learning_rate": 6.581284359330635e-05, + "loss": 2.7828, + "step": 22217 + }, + { + "epoch": 2.012910239858667, + "grad_norm": 0.9416305422782898, + "learning_rate": 6.58068023923156e-05, + "loss": 2.6711, + "step": 22218 + }, + { + "epoch": 2.0130008380331135, + "grad_norm": 0.8332409858703613, + "learning_rate": 6.580076119132483e-05, + "loss": 1.9434, + "step": 22219 + }, + { + "epoch": 2.0130914362075605, + "grad_norm": 0.9412200450897217, + "learning_rate": 6.579471999033409e-05, + "loss": 2.5865, + "step": 22220 + }, + { + "epoch": 2.013182034382007, + "grad_norm": 0.9625687599182129, + "learning_rate": 6.578867878934332e-05, + "loss": 2.8795, + "step": 22221 + }, + { + "epoch": 2.013272632556454, + "grad_norm": 0.9716606140136719, + "learning_rate": 6.578263758835257e-05, + "loss": 2.5389, + "step": 22222 + }, + { + "epoch": 2.0133632307309006, + "grad_norm": 0.9486905932426453, + "learning_rate": 6.57765963873618e-05, + "loss": 2.5627, + "step": 22223 + }, + { + "epoch": 2.0134538289053476, + "grad_norm": 0.9727621078491211, + "learning_rate": 6.577055518637105e-05, + "loss": 2.7754, + "step": 22224 + }, + { + "epoch": 2.013544427079794, + "grad_norm": 1.0065709352493286, + "learning_rate": 6.576451398538029e-05, + "loss": 2.8087, + "step": 22225 + }, + { + "epoch": 2.013635025254241, + "grad_norm": 0.8947951197624207, + "learning_rate": 6.575847278438954e-05, + "loss": 2.3458, + "step": 22226 + }, + { + "epoch": 2.0137256234286878, + "grad_norm": 0.8998611569404602, + "learning_rate": 6.575243158339878e-05, + "loss": 2.654, + "step": 22227 + }, + { + "epoch": 2.013816221603135, + "grad_norm": 0.9568382501602173, + "learning_rate": 6.574639038240803e-05, + "loss": 2.2929, + "step": 22228 + }, + { + "epoch": 2.0139068197775813, + "grad_norm": 0.9600340723991394, + "learning_rate": 6.574034918141726e-05, + "loss": 2.3627, + "step": 22229 + }, + { + "epoch": 2.0139974179520284, + "grad_norm": 0.9298689365386963, + "learning_rate": 6.573430798042651e-05, + "loss": 2.4549, + "step": 22230 + }, + { + "epoch": 2.014088016126475, + "grad_norm": 0.9934860467910767, + "learning_rate": 6.572826677943576e-05, + "loss": 2.5733, + "step": 22231 + }, + { + "epoch": 2.014178614300922, + "grad_norm": 0.9105152487754822, + "learning_rate": 6.5722225578445e-05, + "loss": 2.3318, + "step": 22232 + }, + { + "epoch": 2.0142692124753685, + "grad_norm": 0.993360698223114, + "learning_rate": 6.571618437745424e-05, + "loss": 2.5715, + "step": 22233 + }, + { + "epoch": 2.0143598106498155, + "grad_norm": 1.029297947883606, + "learning_rate": 6.571014317646348e-05, + "loss": 2.6236, + "step": 22234 + }, + { + "epoch": 2.014450408824262, + "grad_norm": 1.0443065166473389, + "learning_rate": 6.570410197547274e-05, + "loss": 2.3469, + "step": 22235 + }, + { + "epoch": 2.014541006998709, + "grad_norm": 0.9960575103759766, + "learning_rate": 6.569806077448197e-05, + "loss": 2.384, + "step": 22236 + }, + { + "epoch": 2.0146316051731556, + "grad_norm": 0.9618468284606934, + "learning_rate": 6.569201957349122e-05, + "loss": 2.7525, + "step": 22237 + }, + { + "epoch": 2.0147222033476027, + "grad_norm": 0.9535780549049377, + "learning_rate": 6.568597837250045e-05, + "loss": 2.9615, + "step": 22238 + }, + { + "epoch": 2.0148128015220492, + "grad_norm": 0.9778190851211548, + "learning_rate": 6.56799371715097e-05, + "loss": 2.8066, + "step": 22239 + }, + { + "epoch": 2.0149033996964962, + "grad_norm": 0.960132360458374, + "learning_rate": 6.567389597051893e-05, + "loss": 2.7163, + "step": 22240 + }, + { + "epoch": 2.014993997870943, + "grad_norm": 0.9288378953933716, + "learning_rate": 6.566785476952818e-05, + "loss": 2.6118, + "step": 22241 + }, + { + "epoch": 2.01508459604539, + "grad_norm": 1.049227237701416, + "learning_rate": 6.566181356853743e-05, + "loss": 2.8418, + "step": 22242 + }, + { + "epoch": 2.0151751942198364, + "grad_norm": 0.9308046102523804, + "learning_rate": 6.565577236754668e-05, + "loss": 2.5062, + "step": 22243 + }, + { + "epoch": 2.0152657923942834, + "grad_norm": 0.9571216106414795, + "learning_rate": 6.564973116655591e-05, + "loss": 2.5899, + "step": 22244 + }, + { + "epoch": 2.01535639056873, + "grad_norm": 0.9131469130516052, + "learning_rate": 6.564368996556516e-05, + "loss": 2.3258, + "step": 22245 + }, + { + "epoch": 2.015446988743177, + "grad_norm": 0.9117424488067627, + "learning_rate": 6.56376487645744e-05, + "loss": 2.6279, + "step": 22246 + }, + { + "epoch": 2.0155375869176235, + "grad_norm": 0.9667395353317261, + "learning_rate": 6.563160756358364e-05, + "loss": 2.486, + "step": 22247 + }, + { + "epoch": 2.0156281850920705, + "grad_norm": 1.0099451541900635, + "learning_rate": 6.562556636259289e-05, + "loss": 2.7594, + "step": 22248 + }, + { + "epoch": 2.015718783266517, + "grad_norm": 0.9957290291786194, + "learning_rate": 6.561952516160212e-05, + "loss": 2.7, + "step": 22249 + }, + { + "epoch": 2.015809381440964, + "grad_norm": 0.9696356654167175, + "learning_rate": 6.561348396061138e-05, + "loss": 2.4895, + "step": 22250 + }, + { + "epoch": 2.0158999796154107, + "grad_norm": 0.9183465838432312, + "learning_rate": 6.560744275962062e-05, + "loss": 2.7435, + "step": 22251 + }, + { + "epoch": 2.0159905777898577, + "grad_norm": 1.045095682144165, + "learning_rate": 6.560140155862986e-05, + "loss": 2.532, + "step": 22252 + }, + { + "epoch": 2.0160811759643043, + "grad_norm": 0.9970134496688843, + "learning_rate": 6.55953603576391e-05, + "loss": 2.5788, + "step": 22253 + }, + { + "epoch": 2.0161717741387513, + "grad_norm": 0.9670876264572144, + "learning_rate": 6.558931915664835e-05, + "loss": 2.573, + "step": 22254 + }, + { + "epoch": 2.016262372313198, + "grad_norm": 0.9523451328277588, + "learning_rate": 6.558327795565758e-05, + "loss": 2.7542, + "step": 22255 + }, + { + "epoch": 2.016352970487645, + "grad_norm": 1.0806952714920044, + "learning_rate": 6.557723675466683e-05, + "loss": 2.5392, + "step": 22256 + }, + { + "epoch": 2.0164435686620914, + "grad_norm": 1.0517964363098145, + "learning_rate": 6.557119555367608e-05, + "loss": 2.6507, + "step": 22257 + }, + { + "epoch": 2.0165341668365384, + "grad_norm": 0.9558658003807068, + "learning_rate": 6.556515435268532e-05, + "loss": 2.4005, + "step": 22258 + }, + { + "epoch": 2.016624765010985, + "grad_norm": 0.9537099599838257, + "learning_rate": 6.555911315169456e-05, + "loss": 2.6138, + "step": 22259 + }, + { + "epoch": 2.016715363185432, + "grad_norm": 1.0902197360992432, + "learning_rate": 6.55530719507038e-05, + "loss": 2.3664, + "step": 22260 + }, + { + "epoch": 2.0168059613598786, + "grad_norm": 1.0258649587631226, + "learning_rate": 6.554703074971304e-05, + "loss": 2.6824, + "step": 22261 + }, + { + "epoch": 2.0168965595343256, + "grad_norm": 1.0727964639663696, + "learning_rate": 6.554098954872229e-05, + "loss": 2.4796, + "step": 22262 + }, + { + "epoch": 2.016987157708772, + "grad_norm": 0.9378777742385864, + "learning_rate": 6.553494834773153e-05, + "loss": 2.5655, + "step": 22263 + }, + { + "epoch": 2.017077755883219, + "grad_norm": 0.9634414911270142, + "learning_rate": 6.552890714674077e-05, + "loss": 2.6242, + "step": 22264 + }, + { + "epoch": 2.0171683540576657, + "grad_norm": 0.9284623265266418, + "learning_rate": 6.552286594575003e-05, + "loss": 2.654, + "step": 22265 + }, + { + "epoch": 2.0172589522321127, + "grad_norm": 0.8111773133277893, + "learning_rate": 6.551682474475926e-05, + "loss": 1.8233, + "step": 22266 + }, + { + "epoch": 2.0173495504065593, + "grad_norm": 0.9604699015617371, + "learning_rate": 6.551078354376851e-05, + "loss": 2.6539, + "step": 22267 + }, + { + "epoch": 2.0174401485810063, + "grad_norm": 0.942598819732666, + "learning_rate": 6.550474234277775e-05, + "loss": 2.5547, + "step": 22268 + }, + { + "epoch": 2.017530746755453, + "grad_norm": 0.8094252943992615, + "learning_rate": 6.549870114178699e-05, + "loss": 2.0351, + "step": 22269 + }, + { + "epoch": 2.0176213449299, + "grad_norm": 0.9665316343307495, + "learning_rate": 6.549265994079623e-05, + "loss": 2.3513, + "step": 22270 + }, + { + "epoch": 2.0177119431043464, + "grad_norm": 0.9698562026023865, + "learning_rate": 6.548661873980547e-05, + "loss": 2.4103, + "step": 22271 + }, + { + "epoch": 2.0178025412787934, + "grad_norm": 0.9543115496635437, + "learning_rate": 6.548057753881472e-05, + "loss": 2.4749, + "step": 22272 + }, + { + "epoch": 2.01789313945324, + "grad_norm": 0.8946053981781006, + "learning_rate": 6.547453633782397e-05, + "loss": 2.5804, + "step": 22273 + }, + { + "epoch": 2.017983737627687, + "grad_norm": 0.7618741393089294, + "learning_rate": 6.54684951368332e-05, + "loss": 1.8546, + "step": 22274 + }, + { + "epoch": 2.0180743358021336, + "grad_norm": 0.932341992855072, + "learning_rate": 6.546245393584245e-05, + "loss": 2.5256, + "step": 22275 + }, + { + "epoch": 2.0181649339765806, + "grad_norm": 0.9642780423164368, + "learning_rate": 6.545641273485169e-05, + "loss": 2.5678, + "step": 22276 + }, + { + "epoch": 2.018255532151027, + "grad_norm": 0.975363552570343, + "learning_rate": 6.545037153386093e-05, + "loss": 2.6711, + "step": 22277 + }, + { + "epoch": 2.018346130325474, + "grad_norm": 0.9305722117424011, + "learning_rate": 6.544433033287018e-05, + "loss": 2.3726, + "step": 22278 + }, + { + "epoch": 2.0184367284999207, + "grad_norm": 1.0041877031326294, + "learning_rate": 6.543828913187941e-05, + "loss": 2.6757, + "step": 22279 + }, + { + "epoch": 2.0185273266743673, + "grad_norm": 1.0495223999023438, + "learning_rate": 6.543224793088866e-05, + "loss": 2.9479, + "step": 22280 + }, + { + "epoch": 2.0186179248488143, + "grad_norm": 0.9474796056747437, + "learning_rate": 6.542620672989791e-05, + "loss": 2.1431, + "step": 22281 + }, + { + "epoch": 2.018708523023261, + "grad_norm": 1.0865739583969116, + "learning_rate": 6.542016552890716e-05, + "loss": 2.7143, + "step": 22282 + }, + { + "epoch": 2.018799121197708, + "grad_norm": 0.9447497725486755, + "learning_rate": 6.541412432791639e-05, + "loss": 2.478, + "step": 22283 + }, + { + "epoch": 2.0188897193721544, + "grad_norm": 1.0170639753341675, + "learning_rate": 6.540808312692564e-05, + "loss": 2.3324, + "step": 22284 + }, + { + "epoch": 2.0189803175466015, + "grad_norm": 1.0823960304260254, + "learning_rate": 6.540204192593487e-05, + "loss": 2.4878, + "step": 22285 + }, + { + "epoch": 2.019070915721048, + "grad_norm": 0.8327419757843018, + "learning_rate": 6.539600072494412e-05, + "loss": 2.0716, + "step": 22286 + }, + { + "epoch": 2.019161513895495, + "grad_norm": 0.9366143941879272, + "learning_rate": 6.538995952395337e-05, + "loss": 2.4865, + "step": 22287 + }, + { + "epoch": 2.0192521120699416, + "grad_norm": 0.9684823751449585, + "learning_rate": 6.538391832296262e-05, + "loss": 2.81, + "step": 22288 + }, + { + "epoch": 2.0193427102443886, + "grad_norm": 0.9902073740959167, + "learning_rate": 6.537787712197185e-05, + "loss": 2.6925, + "step": 22289 + }, + { + "epoch": 2.019433308418835, + "grad_norm": 0.9111079573631287, + "learning_rate": 6.53718359209811e-05, + "loss": 2.1516, + "step": 22290 + }, + { + "epoch": 2.019523906593282, + "grad_norm": 1.0031626224517822, + "learning_rate": 6.536579471999033e-05, + "loss": 2.9361, + "step": 22291 + }, + { + "epoch": 2.0196145047677287, + "grad_norm": 0.9610316753387451, + "learning_rate": 6.535975351899958e-05, + "loss": 2.3594, + "step": 22292 + }, + { + "epoch": 2.0197051029421758, + "grad_norm": 1.0189018249511719, + "learning_rate": 6.535371231800881e-05, + "loss": 2.787, + "step": 22293 + }, + { + "epoch": 2.0197957011166223, + "grad_norm": 0.9167855978012085, + "learning_rate": 6.534767111701806e-05, + "loss": 2.7095, + "step": 22294 + }, + { + "epoch": 2.0198862992910693, + "grad_norm": 0.924912691116333, + "learning_rate": 6.534162991602731e-05, + "loss": 2.7492, + "step": 22295 + }, + { + "epoch": 2.019976897465516, + "grad_norm": 0.8276658058166504, + "learning_rate": 6.533558871503656e-05, + "loss": 1.9952, + "step": 22296 + }, + { + "epoch": 2.020067495639963, + "grad_norm": 0.9799723625183105, + "learning_rate": 6.532954751404579e-05, + "loss": 2.4621, + "step": 22297 + }, + { + "epoch": 2.0201580938144095, + "grad_norm": 0.9298193454742432, + "learning_rate": 6.532350631305504e-05, + "loss": 2.7232, + "step": 22298 + }, + { + "epoch": 2.0202486919888565, + "grad_norm": 0.9605390429496765, + "learning_rate": 6.531746511206429e-05, + "loss": 2.6033, + "step": 22299 + }, + { + "epoch": 2.020339290163303, + "grad_norm": 0.8981531858444214, + "learning_rate": 6.531142391107352e-05, + "loss": 2.7503, + "step": 22300 + }, + { + "epoch": 2.02042988833775, + "grad_norm": 0.9988234043121338, + "learning_rate": 6.530538271008277e-05, + "loss": 2.5261, + "step": 22301 + }, + { + "epoch": 2.0205204865121966, + "grad_norm": 0.8833351135253906, + "learning_rate": 6.529934150909201e-05, + "loss": 2.4719, + "step": 22302 + }, + { + "epoch": 2.0206110846866436, + "grad_norm": 1.09488046169281, + "learning_rate": 6.529330030810126e-05, + "loss": 2.5484, + "step": 22303 + }, + { + "epoch": 2.02070168286109, + "grad_norm": 1.036611795425415, + "learning_rate": 6.52872591071105e-05, + "loss": 2.6207, + "step": 22304 + }, + { + "epoch": 2.020792281035537, + "grad_norm": 0.9771097302436829, + "learning_rate": 6.528121790611974e-05, + "loss": 2.4953, + "step": 22305 + }, + { + "epoch": 2.0208828792099838, + "grad_norm": 0.9394170045852661, + "learning_rate": 6.527517670512898e-05, + "loss": 2.4257, + "step": 22306 + }, + { + "epoch": 2.020973477384431, + "grad_norm": 0.855318546295166, + "learning_rate": 6.526913550413823e-05, + "loss": 1.8558, + "step": 22307 + }, + { + "epoch": 2.0210640755588773, + "grad_norm": 0.9798226356506348, + "learning_rate": 6.526309430314746e-05, + "loss": 2.5276, + "step": 22308 + }, + { + "epoch": 2.0211546737333244, + "grad_norm": 1.0254108905792236, + "learning_rate": 6.525705310215671e-05, + "loss": 2.5771, + "step": 22309 + }, + { + "epoch": 2.021245271907771, + "grad_norm": 0.9437389969825745, + "learning_rate": 6.525101190116595e-05, + "loss": 2.6986, + "step": 22310 + }, + { + "epoch": 2.021335870082218, + "grad_norm": 1.1062233448028564, + "learning_rate": 6.52449707001752e-05, + "loss": 2.3881, + "step": 22311 + }, + { + "epoch": 2.0214264682566645, + "grad_norm": 0.917816698551178, + "learning_rate": 6.523892949918444e-05, + "loss": 2.6116, + "step": 22312 + }, + { + "epoch": 2.0215170664311115, + "grad_norm": 0.9591683745384216, + "learning_rate": 6.523288829819368e-05, + "loss": 2.4997, + "step": 22313 + }, + { + "epoch": 2.021607664605558, + "grad_norm": 0.9631614089012146, + "learning_rate": 6.522684709720293e-05, + "loss": 2.7195, + "step": 22314 + }, + { + "epoch": 2.021698262780005, + "grad_norm": 0.9630274176597595, + "learning_rate": 6.522080589621217e-05, + "loss": 2.6586, + "step": 22315 + }, + { + "epoch": 2.0217888609544516, + "grad_norm": 1.045170545578003, + "learning_rate": 6.521476469522141e-05, + "loss": 2.5098, + "step": 22316 + }, + { + "epoch": 2.0218794591288987, + "grad_norm": 0.9993496537208557, + "learning_rate": 6.520872349423066e-05, + "loss": 2.5063, + "step": 22317 + }, + { + "epoch": 2.0219700573033452, + "grad_norm": 0.9371564984321594, + "learning_rate": 6.520268229323991e-05, + "loss": 2.4779, + "step": 22318 + }, + { + "epoch": 2.0220606554777922, + "grad_norm": 1.5308417081832886, + "learning_rate": 6.519664109224914e-05, + "loss": 2.3551, + "step": 22319 + }, + { + "epoch": 2.022151253652239, + "grad_norm": 0.9211228489875793, + "learning_rate": 6.519059989125839e-05, + "loss": 2.716, + "step": 22320 + }, + { + "epoch": 2.022241851826686, + "grad_norm": 0.9033268690109253, + "learning_rate": 6.518455869026762e-05, + "loss": 2.5366, + "step": 22321 + }, + { + "epoch": 2.0223324500011324, + "grad_norm": 0.9793587327003479, + "learning_rate": 6.517851748927687e-05, + "loss": 2.8852, + "step": 22322 + }, + { + "epoch": 2.0224230481755794, + "grad_norm": 0.9608896970748901, + "learning_rate": 6.51724762882861e-05, + "loss": 2.4552, + "step": 22323 + }, + { + "epoch": 2.022513646350026, + "grad_norm": 1.0153764486312866, + "learning_rate": 6.516643508729535e-05, + "loss": 2.7375, + "step": 22324 + }, + { + "epoch": 2.022604244524473, + "grad_norm": 0.9341175556182861, + "learning_rate": 6.51603938863046e-05, + "loss": 1.8432, + "step": 22325 + }, + { + "epoch": 2.0226948426989195, + "grad_norm": 0.9615916609764099, + "learning_rate": 6.515435268531385e-05, + "loss": 2.5902, + "step": 22326 + }, + { + "epoch": 2.0227854408733665, + "grad_norm": 1.0431016683578491, + "learning_rate": 6.514831148432308e-05, + "loss": 2.7841, + "step": 22327 + }, + { + "epoch": 2.022876039047813, + "grad_norm": 0.9987636208534241, + "learning_rate": 6.514227028333233e-05, + "loss": 2.6784, + "step": 22328 + }, + { + "epoch": 2.02296663722226, + "grad_norm": 0.978547990322113, + "learning_rate": 6.513622908234156e-05, + "loss": 2.7665, + "step": 22329 + }, + { + "epoch": 2.0230572353967067, + "grad_norm": 0.9217368364334106, + "learning_rate": 6.513018788135081e-05, + "loss": 2.54, + "step": 22330 + }, + { + "epoch": 2.0231478335711537, + "grad_norm": 0.9835273623466492, + "learning_rate": 6.512414668036006e-05, + "loss": 2.6693, + "step": 22331 + }, + { + "epoch": 2.0232384317456003, + "grad_norm": 0.9149322509765625, + "learning_rate": 6.511810547936931e-05, + "loss": 2.3738, + "step": 22332 + }, + { + "epoch": 2.0233290299200473, + "grad_norm": 0.9495061635971069, + "learning_rate": 6.511206427837855e-05, + "loss": 2.8407, + "step": 22333 + }, + { + "epoch": 2.023419628094494, + "grad_norm": 1.0002284049987793, + "learning_rate": 6.510602307738779e-05, + "loss": 2.7918, + "step": 22334 + }, + { + "epoch": 2.023510226268941, + "grad_norm": 0.9723373055458069, + "learning_rate": 6.509998187639704e-05, + "loss": 2.5829, + "step": 22335 + }, + { + "epoch": 2.0236008244433874, + "grad_norm": 0.9292380213737488, + "learning_rate": 6.509394067540627e-05, + "loss": 2.4958, + "step": 22336 + }, + { + "epoch": 2.0236914226178344, + "grad_norm": 0.9976257085800171, + "learning_rate": 6.508789947441552e-05, + "loss": 2.9148, + "step": 22337 + }, + { + "epoch": 2.023782020792281, + "grad_norm": 0.9209380745887756, + "learning_rate": 6.508185827342475e-05, + "loss": 2.7419, + "step": 22338 + }, + { + "epoch": 2.023872618966728, + "grad_norm": 0.9207227826118469, + "learning_rate": 6.5075817072434e-05, + "loss": 2.4427, + "step": 22339 + }, + { + "epoch": 2.0239632171411746, + "grad_norm": 0.947903037071228, + "learning_rate": 6.506977587144325e-05, + "loss": 2.5649, + "step": 22340 + }, + { + "epoch": 2.0240538153156216, + "grad_norm": 0.9258540272712708, + "learning_rate": 6.50637346704525e-05, + "loss": 2.6455, + "step": 22341 + }, + { + "epoch": 2.024144413490068, + "grad_norm": 0.8843840956687927, + "learning_rate": 6.505769346946173e-05, + "loss": 2.0031, + "step": 22342 + }, + { + "epoch": 2.024235011664515, + "grad_norm": 1.0019594430923462, + "learning_rate": 6.505165226847098e-05, + "loss": 2.7343, + "step": 22343 + }, + { + "epoch": 2.0243256098389617, + "grad_norm": 0.9284935593605042, + "learning_rate": 6.504561106748021e-05, + "loss": 2.7085, + "step": 22344 + }, + { + "epoch": 2.0244162080134087, + "grad_norm": 0.9750171303749084, + "learning_rate": 6.503956986648946e-05, + "loss": 2.4705, + "step": 22345 + }, + { + "epoch": 2.0245068061878553, + "grad_norm": 1.076870083808899, + "learning_rate": 6.50335286654987e-05, + "loss": 2.4734, + "step": 22346 + }, + { + "epoch": 2.0245974043623023, + "grad_norm": 0.9520135521888733, + "learning_rate": 6.502748746450795e-05, + "loss": 2.5137, + "step": 22347 + }, + { + "epoch": 2.024688002536749, + "grad_norm": 0.8339969515800476, + "learning_rate": 6.502144626351719e-05, + "loss": 1.8767, + "step": 22348 + }, + { + "epoch": 2.024778600711196, + "grad_norm": 1.0069819688796997, + "learning_rate": 6.501540506252644e-05, + "loss": 2.5429, + "step": 22349 + }, + { + "epoch": 2.0248691988856424, + "grad_norm": 0.990054726600647, + "learning_rate": 6.500936386153568e-05, + "loss": 2.4856, + "step": 22350 + }, + { + "epoch": 2.0249597970600894, + "grad_norm": 0.9315087795257568, + "learning_rate": 6.500332266054492e-05, + "loss": 2.3566, + "step": 22351 + }, + { + "epoch": 2.025050395234536, + "grad_norm": 0.9421808123588562, + "learning_rate": 6.499728145955416e-05, + "loss": 2.5956, + "step": 22352 + }, + { + "epoch": 2.025140993408983, + "grad_norm": 0.9545943140983582, + "learning_rate": 6.49912402585634e-05, + "loss": 2.6888, + "step": 22353 + }, + { + "epoch": 2.0252315915834296, + "grad_norm": 0.7986626625061035, + "learning_rate": 6.498519905757265e-05, + "loss": 1.8084, + "step": 22354 + }, + { + "epoch": 2.0253221897578766, + "grad_norm": 1.0127158164978027, + "learning_rate": 6.49791578565819e-05, + "loss": 2.8292, + "step": 22355 + }, + { + "epoch": 2.025412787932323, + "grad_norm": 1.0242902040481567, + "learning_rate": 6.497311665559114e-05, + "loss": 2.6821, + "step": 22356 + }, + { + "epoch": 2.02550338610677, + "grad_norm": 0.9490645527839661, + "learning_rate": 6.496707545460038e-05, + "loss": 2.6715, + "step": 22357 + }, + { + "epoch": 2.0255939842812167, + "grad_norm": 0.9473970532417297, + "learning_rate": 6.496103425360962e-05, + "loss": 2.6052, + "step": 22358 + }, + { + "epoch": 2.0256845824556633, + "grad_norm": 1.0262824296951294, + "learning_rate": 6.495499305261886e-05, + "loss": 2.5985, + "step": 22359 + }, + { + "epoch": 2.0257751806301103, + "grad_norm": 0.9824936389923096, + "learning_rate": 6.49489518516281e-05, + "loss": 2.4457, + "step": 22360 + }, + { + "epoch": 2.025865778804557, + "grad_norm": 0.9558623433113098, + "learning_rate": 6.494291065063734e-05, + "loss": 2.4516, + "step": 22361 + }, + { + "epoch": 2.025956376979004, + "grad_norm": 1.000853180885315, + "learning_rate": 6.49368694496466e-05, + "loss": 2.5697, + "step": 22362 + }, + { + "epoch": 2.0260469751534504, + "grad_norm": 0.9147507548332214, + "learning_rate": 6.493082824865583e-05, + "loss": 2.5052, + "step": 22363 + }, + { + "epoch": 2.0261375733278975, + "grad_norm": 0.9544876217842102, + "learning_rate": 6.492478704766508e-05, + "loss": 2.5086, + "step": 22364 + }, + { + "epoch": 2.026228171502344, + "grad_norm": 1.0258697271347046, + "learning_rate": 6.491874584667433e-05, + "loss": 2.6619, + "step": 22365 + }, + { + "epoch": 2.026318769676791, + "grad_norm": 0.986737847328186, + "learning_rate": 6.491270464568356e-05, + "loss": 2.6364, + "step": 22366 + }, + { + "epoch": 2.0264093678512376, + "grad_norm": 0.9588218927383423, + "learning_rate": 6.490666344469281e-05, + "loss": 2.5354, + "step": 22367 + }, + { + "epoch": 2.0264999660256846, + "grad_norm": 0.9215015769004822, + "learning_rate": 6.490062224370204e-05, + "loss": 2.3585, + "step": 22368 + }, + { + "epoch": 2.026590564200131, + "grad_norm": 0.977505087852478, + "learning_rate": 6.489458104271129e-05, + "loss": 2.515, + "step": 22369 + }, + { + "epoch": 2.026681162374578, + "grad_norm": 1.0725923776626587, + "learning_rate": 6.488853984172054e-05, + "loss": 2.731, + "step": 22370 + }, + { + "epoch": 2.0267717605490247, + "grad_norm": 1.1049742698669434, + "learning_rate": 6.488249864072979e-05, + "loss": 2.5856, + "step": 22371 + }, + { + "epoch": 2.0268623587234718, + "grad_norm": 0.9276291131973267, + "learning_rate": 6.487645743973902e-05, + "loss": 2.5546, + "step": 22372 + }, + { + "epoch": 2.0269529568979183, + "grad_norm": 0.9756126999855042, + "learning_rate": 6.487041623874827e-05, + "loss": 2.6948, + "step": 22373 + }, + { + "epoch": 2.0270435550723653, + "grad_norm": 0.9421334266662598, + "learning_rate": 6.48643750377575e-05, + "loss": 2.8603, + "step": 22374 + }, + { + "epoch": 2.027134153246812, + "grad_norm": 0.9348518252372742, + "learning_rate": 6.485833383676675e-05, + "loss": 2.396, + "step": 22375 + }, + { + "epoch": 2.027224751421259, + "grad_norm": 0.9594429731369019, + "learning_rate": 6.485229263577599e-05, + "loss": 2.7272, + "step": 22376 + }, + { + "epoch": 2.0273153495957055, + "grad_norm": 1.058952808380127, + "learning_rate": 6.484625143478525e-05, + "loss": 2.588, + "step": 22377 + }, + { + "epoch": 2.0274059477701525, + "grad_norm": 0.916416347026825, + "learning_rate": 6.484021023379448e-05, + "loss": 2.7895, + "step": 22378 + }, + { + "epoch": 2.027496545944599, + "grad_norm": 0.9638849496841431, + "learning_rate": 6.483416903280373e-05, + "loss": 2.4512, + "step": 22379 + }, + { + "epoch": 2.027587144119046, + "grad_norm": 1.0777662992477417, + "learning_rate": 6.482812783181296e-05, + "loss": 3.0476, + "step": 22380 + }, + { + "epoch": 2.0276777422934926, + "grad_norm": 1.070432186126709, + "learning_rate": 6.482208663082221e-05, + "loss": 2.6529, + "step": 22381 + }, + { + "epoch": 2.0277683404679396, + "grad_norm": 1.07658851146698, + "learning_rate": 6.481604542983146e-05, + "loss": 2.4203, + "step": 22382 + }, + { + "epoch": 2.027858938642386, + "grad_norm": 1.1283966302871704, + "learning_rate": 6.481000422884069e-05, + "loss": 2.605, + "step": 22383 + }, + { + "epoch": 2.027949536816833, + "grad_norm": 0.9646402597427368, + "learning_rate": 6.480396302784994e-05, + "loss": 2.7789, + "step": 22384 + }, + { + "epoch": 2.0280401349912798, + "grad_norm": 0.9880200028419495, + "learning_rate": 6.479792182685919e-05, + "loss": 2.5113, + "step": 22385 + }, + { + "epoch": 2.028130733165727, + "grad_norm": 0.9982405304908752, + "learning_rate": 6.479188062586843e-05, + "loss": 2.6235, + "step": 22386 + }, + { + "epoch": 2.0282213313401734, + "grad_norm": 0.924789309501648, + "learning_rate": 6.478583942487767e-05, + "loss": 2.64, + "step": 22387 + }, + { + "epoch": 2.0283119295146204, + "grad_norm": 0.7898188829421997, + "learning_rate": 6.477979822388692e-05, + "loss": 1.8258, + "step": 22388 + }, + { + "epoch": 2.028402527689067, + "grad_norm": 0.9484037756919861, + "learning_rate": 6.477375702289615e-05, + "loss": 2.6785, + "step": 22389 + }, + { + "epoch": 2.028493125863514, + "grad_norm": 1.0491816997528076, + "learning_rate": 6.47677158219054e-05, + "loss": 2.6538, + "step": 22390 + }, + { + "epoch": 2.0285837240379605, + "grad_norm": 1.0731689929962158, + "learning_rate": 6.476167462091463e-05, + "loss": 2.5136, + "step": 22391 + }, + { + "epoch": 2.0286743222124075, + "grad_norm": 0.965114951133728, + "learning_rate": 6.475563341992389e-05, + "loss": 2.5366, + "step": 22392 + }, + { + "epoch": 2.028764920386854, + "grad_norm": 1.0519577264785767, + "learning_rate": 6.474959221893313e-05, + "loss": 2.5399, + "step": 22393 + }, + { + "epoch": 2.028855518561301, + "grad_norm": 0.957182765007019, + "learning_rate": 6.474355101794237e-05, + "loss": 2.3882, + "step": 22394 + }, + { + "epoch": 2.0289461167357477, + "grad_norm": 0.9974482655525208, + "learning_rate": 6.473750981695161e-05, + "loss": 2.6717, + "step": 22395 + }, + { + "epoch": 2.0290367149101947, + "grad_norm": 0.971270740032196, + "learning_rate": 6.473146861596086e-05, + "loss": 2.6604, + "step": 22396 + }, + { + "epoch": 2.0291273130846412, + "grad_norm": 0.9576391577720642, + "learning_rate": 6.47254274149701e-05, + "loss": 2.4061, + "step": 22397 + }, + { + "epoch": 2.0292179112590882, + "grad_norm": 0.9777870178222656, + "learning_rate": 6.471938621397934e-05, + "loss": 3.002, + "step": 22398 + }, + { + "epoch": 2.029308509433535, + "grad_norm": 0.9824258685112, + "learning_rate": 6.471334501298859e-05, + "loss": 2.9809, + "step": 22399 + }, + { + "epoch": 2.029399107607982, + "grad_norm": 0.8818273544311523, + "learning_rate": 6.470730381199783e-05, + "loss": 2.5024, + "step": 22400 + }, + { + "epoch": 2.0294897057824284, + "grad_norm": 1.0929046869277954, + "learning_rate": 6.470126261100708e-05, + "loss": 2.5502, + "step": 22401 + }, + { + "epoch": 2.0295803039568754, + "grad_norm": 1.0127578973770142, + "learning_rate": 6.469522141001631e-05, + "loss": 2.8828, + "step": 22402 + }, + { + "epoch": 2.029670902131322, + "grad_norm": 0.9389491677284241, + "learning_rate": 6.468918020902556e-05, + "loss": 2.6836, + "step": 22403 + }, + { + "epoch": 2.029761500305769, + "grad_norm": 0.9705793857574463, + "learning_rate": 6.46831390080348e-05, + "loss": 2.5204, + "step": 22404 + }, + { + "epoch": 2.0298520984802155, + "grad_norm": 1.0130335092544556, + "learning_rate": 6.467709780704404e-05, + "loss": 2.5018, + "step": 22405 + }, + { + "epoch": 2.0299426966546625, + "grad_norm": 0.8420836925506592, + "learning_rate": 6.467105660605328e-05, + "loss": 1.8868, + "step": 22406 + }, + { + "epoch": 2.030033294829109, + "grad_norm": 0.9469150304794312, + "learning_rate": 6.466501540506254e-05, + "loss": 2.7903, + "step": 22407 + }, + { + "epoch": 2.030123893003556, + "grad_norm": 0.9695302844047546, + "learning_rate": 6.465897420407177e-05, + "loss": 2.5996, + "step": 22408 + }, + { + "epoch": 2.0302144911780027, + "grad_norm": 0.9320539236068726, + "learning_rate": 6.465293300308102e-05, + "loss": 2.4998, + "step": 22409 + }, + { + "epoch": 2.0303050893524497, + "grad_norm": 0.948647677898407, + "learning_rate": 6.464689180209025e-05, + "loss": 2.7548, + "step": 22410 + }, + { + "epoch": 2.0303956875268963, + "grad_norm": 1.050823450088501, + "learning_rate": 6.46408506010995e-05, + "loss": 2.8192, + "step": 22411 + }, + { + "epoch": 2.0304862857013433, + "grad_norm": 0.9655031561851501, + "learning_rate": 6.463480940010874e-05, + "loss": 2.6076, + "step": 22412 + }, + { + "epoch": 2.03057688387579, + "grad_norm": 0.9490017890930176, + "learning_rate": 6.462876819911798e-05, + "loss": 2.8577, + "step": 22413 + }, + { + "epoch": 2.030667482050237, + "grad_norm": 0.8085335493087769, + "learning_rate": 6.462272699812723e-05, + "loss": 1.9031, + "step": 22414 + }, + { + "epoch": 2.0307580802246834, + "grad_norm": 0.8863703608512878, + "learning_rate": 6.461668579713648e-05, + "loss": 2.4684, + "step": 22415 + }, + { + "epoch": 2.0308486783991304, + "grad_norm": 0.9866116046905518, + "learning_rate": 6.461064459614571e-05, + "loss": 2.5491, + "step": 22416 + }, + { + "epoch": 2.030939276573577, + "grad_norm": 0.9375900626182556, + "learning_rate": 6.460460339515496e-05, + "loss": 2.7366, + "step": 22417 + }, + { + "epoch": 2.031029874748024, + "grad_norm": 1.01241934299469, + "learning_rate": 6.459856219416421e-05, + "loss": 2.5583, + "step": 22418 + }, + { + "epoch": 2.0311204729224706, + "grad_norm": 1.0744432210922241, + "learning_rate": 6.459252099317344e-05, + "loss": 2.677, + "step": 22419 + }, + { + "epoch": 2.0312110710969176, + "grad_norm": 0.9301223754882812, + "learning_rate": 6.458647979218269e-05, + "loss": 2.3263, + "step": 22420 + }, + { + "epoch": 2.031301669271364, + "grad_norm": 0.9857115149497986, + "learning_rate": 6.458043859119192e-05, + "loss": 2.6295, + "step": 22421 + }, + { + "epoch": 2.031392267445811, + "grad_norm": 1.0535370111465454, + "learning_rate": 6.457439739020119e-05, + "loss": 2.6016, + "step": 22422 + }, + { + "epoch": 2.0314828656202577, + "grad_norm": 0.9615355730056763, + "learning_rate": 6.456835618921042e-05, + "loss": 2.721, + "step": 22423 + }, + { + "epoch": 2.0315734637947047, + "grad_norm": 0.9685148000717163, + "learning_rate": 6.456231498821967e-05, + "loss": 2.6044, + "step": 22424 + }, + { + "epoch": 2.0316640619691513, + "grad_norm": 0.9314402341842651, + "learning_rate": 6.45562737872289e-05, + "loss": 2.6072, + "step": 22425 + }, + { + "epoch": 2.0317546601435983, + "grad_norm": 0.9411925077438354, + "learning_rate": 6.455023258623815e-05, + "loss": 2.8254, + "step": 22426 + }, + { + "epoch": 2.031845258318045, + "grad_norm": 0.9710496664047241, + "learning_rate": 6.454419138524738e-05, + "loss": 2.6495, + "step": 22427 + }, + { + "epoch": 2.031935856492492, + "grad_norm": 1.1388403177261353, + "learning_rate": 6.453815018425663e-05, + "loss": 2.5947, + "step": 22428 + }, + { + "epoch": 2.0320264546669384, + "grad_norm": 1.0400506258010864, + "learning_rate": 6.453210898326586e-05, + "loss": 2.5456, + "step": 22429 + }, + { + "epoch": 2.0321170528413854, + "grad_norm": 1.2002090215682983, + "learning_rate": 6.452606778227513e-05, + "loss": 2.5543, + "step": 22430 + }, + { + "epoch": 2.032207651015832, + "grad_norm": 1.0173771381378174, + "learning_rate": 6.452002658128436e-05, + "loss": 2.6446, + "step": 22431 + }, + { + "epoch": 2.032298249190279, + "grad_norm": 0.9783263206481934, + "learning_rate": 6.451398538029361e-05, + "loss": 2.7269, + "step": 22432 + }, + { + "epoch": 2.0323888473647256, + "grad_norm": 0.9347544312477112, + "learning_rate": 6.450794417930285e-05, + "loss": 2.6799, + "step": 22433 + }, + { + "epoch": 2.0324794455391726, + "grad_norm": 0.8640034794807434, + "learning_rate": 6.450190297831209e-05, + "loss": 2.5789, + "step": 22434 + }, + { + "epoch": 2.032570043713619, + "grad_norm": 0.9802292585372925, + "learning_rate": 6.449586177732134e-05, + "loss": 2.6418, + "step": 22435 + }, + { + "epoch": 2.032660641888066, + "grad_norm": 0.9903784394264221, + "learning_rate": 6.448982057633057e-05, + "loss": 2.403, + "step": 22436 + }, + { + "epoch": 2.0327512400625127, + "grad_norm": 1.025582194328308, + "learning_rate": 6.448377937533983e-05, + "loss": 2.4982, + "step": 22437 + }, + { + "epoch": 2.0328418382369597, + "grad_norm": 1.027122974395752, + "learning_rate": 6.447773817434907e-05, + "loss": 2.6837, + "step": 22438 + }, + { + "epoch": 2.0329324364114063, + "grad_norm": 0.8538390398025513, + "learning_rate": 6.447169697335831e-05, + "loss": 1.8937, + "step": 22439 + }, + { + "epoch": 2.0330230345858533, + "grad_norm": 0.9834951162338257, + "learning_rate": 6.446565577236755e-05, + "loss": 2.3404, + "step": 22440 + }, + { + "epoch": 2.0331136327603, + "grad_norm": 1.0410627126693726, + "learning_rate": 6.44596145713768e-05, + "loss": 2.4474, + "step": 22441 + }, + { + "epoch": 2.0332042309347464, + "grad_norm": 0.9248774647712708, + "learning_rate": 6.445357337038603e-05, + "loss": 2.7273, + "step": 22442 + }, + { + "epoch": 2.0332948291091935, + "grad_norm": 1.0453121662139893, + "learning_rate": 6.444753216939528e-05, + "loss": 2.6722, + "step": 22443 + }, + { + "epoch": 2.03338542728364, + "grad_norm": 0.9504424929618835, + "learning_rate": 6.444149096840451e-05, + "loss": 1.8352, + "step": 22444 + }, + { + "epoch": 2.033476025458087, + "grad_norm": 1.0692120790481567, + "learning_rate": 6.443544976741377e-05, + "loss": 2.5143, + "step": 22445 + }, + { + "epoch": 2.0335666236325336, + "grad_norm": 1.0768705606460571, + "learning_rate": 6.4429408566423e-05, + "loss": 2.6401, + "step": 22446 + }, + { + "epoch": 2.0336572218069806, + "grad_norm": 1.043609857559204, + "learning_rate": 6.442336736543225e-05, + "loss": 2.8262, + "step": 22447 + }, + { + "epoch": 2.033747819981427, + "grad_norm": 0.9821882247924805, + "learning_rate": 6.441732616444149e-05, + "loss": 2.5056, + "step": 22448 + }, + { + "epoch": 2.033838418155874, + "grad_norm": 0.9721155166625977, + "learning_rate": 6.441128496345074e-05, + "loss": 2.6905, + "step": 22449 + }, + { + "epoch": 2.0339290163303207, + "grad_norm": 0.9965510368347168, + "learning_rate": 6.440524376245998e-05, + "loss": 2.3906, + "step": 22450 + }, + { + "epoch": 2.0340196145047678, + "grad_norm": 0.9926531910896301, + "learning_rate": 6.439920256146922e-05, + "loss": 2.8981, + "step": 22451 + }, + { + "epoch": 2.0341102126792143, + "grad_norm": 1.0179239511489868, + "learning_rate": 6.439316136047848e-05, + "loss": 2.5296, + "step": 22452 + }, + { + "epoch": 2.0342008108536613, + "grad_norm": 0.9366929531097412, + "learning_rate": 6.438712015948771e-05, + "loss": 1.9389, + "step": 22453 + }, + { + "epoch": 2.034291409028108, + "grad_norm": 0.9085698127746582, + "learning_rate": 6.438107895849696e-05, + "loss": 2.4413, + "step": 22454 + }, + { + "epoch": 2.034382007202555, + "grad_norm": 0.989940881729126, + "learning_rate": 6.43750377575062e-05, + "loss": 2.5669, + "step": 22455 + }, + { + "epoch": 2.0344726053770015, + "grad_norm": 0.9641996622085571, + "learning_rate": 6.436899655651544e-05, + "loss": 2.4767, + "step": 22456 + }, + { + "epoch": 2.0345632035514485, + "grad_norm": 0.9743224382400513, + "learning_rate": 6.436295535552468e-05, + "loss": 2.9115, + "step": 22457 + }, + { + "epoch": 2.034653801725895, + "grad_norm": 0.9236489534378052, + "learning_rate": 6.435691415453392e-05, + "loss": 2.4749, + "step": 22458 + }, + { + "epoch": 2.034744399900342, + "grad_norm": 0.9541135430335999, + "learning_rate": 6.435087295354316e-05, + "loss": 2.6546, + "step": 22459 + }, + { + "epoch": 2.0348349980747886, + "grad_norm": 0.794182300567627, + "learning_rate": 6.434483175255242e-05, + "loss": 1.9734, + "step": 22460 + }, + { + "epoch": 2.0349255962492356, + "grad_norm": 0.9354034066200256, + "learning_rate": 6.433879055156165e-05, + "loss": 2.645, + "step": 22461 + }, + { + "epoch": 2.035016194423682, + "grad_norm": 0.8161360025405884, + "learning_rate": 6.43327493505709e-05, + "loss": 1.8248, + "step": 22462 + }, + { + "epoch": 2.035106792598129, + "grad_norm": 1.0105313062667847, + "learning_rate": 6.432670814958013e-05, + "loss": 2.6164, + "step": 22463 + }, + { + "epoch": 2.0351973907725758, + "grad_norm": 0.9616381525993347, + "learning_rate": 6.432066694858938e-05, + "loss": 2.7862, + "step": 22464 + }, + { + "epoch": 2.035287988947023, + "grad_norm": 0.9557884931564331, + "learning_rate": 6.431462574759863e-05, + "loss": 2.3828, + "step": 22465 + }, + { + "epoch": 2.0353785871214694, + "grad_norm": 0.8960312008857727, + "learning_rate": 6.430858454660786e-05, + "loss": 1.9838, + "step": 22466 + }, + { + "epoch": 2.0354691852959164, + "grad_norm": 0.9868356585502625, + "learning_rate": 6.430254334561711e-05, + "loss": 2.6269, + "step": 22467 + }, + { + "epoch": 2.035559783470363, + "grad_norm": 0.9656646847724915, + "learning_rate": 6.429650214462636e-05, + "loss": 2.5636, + "step": 22468 + }, + { + "epoch": 2.03565038164481, + "grad_norm": 0.7885226011276245, + "learning_rate": 6.42904609436356e-05, + "loss": 1.901, + "step": 22469 + }, + { + "epoch": 2.0357409798192565, + "grad_norm": 1.0827261209487915, + "learning_rate": 6.428441974264484e-05, + "loss": 2.5164, + "step": 22470 + }, + { + "epoch": 2.0358315779937035, + "grad_norm": 1.1326322555541992, + "learning_rate": 6.427837854165409e-05, + "loss": 2.293, + "step": 22471 + }, + { + "epoch": 2.03592217616815, + "grad_norm": 0.9845771789550781, + "learning_rate": 6.427233734066332e-05, + "loss": 2.8153, + "step": 22472 + }, + { + "epoch": 2.036012774342597, + "grad_norm": 0.9922913908958435, + "learning_rate": 6.426629613967257e-05, + "loss": 2.4944, + "step": 22473 + }, + { + "epoch": 2.0361033725170437, + "grad_norm": 0.9551555514335632, + "learning_rate": 6.42602549386818e-05, + "loss": 2.6201, + "step": 22474 + }, + { + "epoch": 2.0361939706914907, + "grad_norm": 0.9868990182876587, + "learning_rate": 6.425421373769106e-05, + "loss": 2.628, + "step": 22475 + }, + { + "epoch": 2.0362845688659372, + "grad_norm": 0.9922584891319275, + "learning_rate": 6.42481725367003e-05, + "loss": 2.7142, + "step": 22476 + }, + { + "epoch": 2.0363751670403842, + "grad_norm": 1.1203941106796265, + "learning_rate": 6.424213133570955e-05, + "loss": 2.5041, + "step": 22477 + }, + { + "epoch": 2.036465765214831, + "grad_norm": 1.0870739221572876, + "learning_rate": 6.423609013471878e-05, + "loss": 2.6468, + "step": 22478 + }, + { + "epoch": 2.036556363389278, + "grad_norm": 0.9625353813171387, + "learning_rate": 6.423004893372803e-05, + "loss": 2.5212, + "step": 22479 + }, + { + "epoch": 2.0366469615637244, + "grad_norm": 0.9563135504722595, + "learning_rate": 6.422400773273726e-05, + "loss": 2.4515, + "step": 22480 + }, + { + "epoch": 2.0367375597381714, + "grad_norm": 0.9985554218292236, + "learning_rate": 6.421796653174651e-05, + "loss": 2.5226, + "step": 22481 + }, + { + "epoch": 2.036828157912618, + "grad_norm": 0.9952003955841064, + "learning_rate": 6.421192533075576e-05, + "loss": 2.433, + "step": 22482 + }, + { + "epoch": 2.036918756087065, + "grad_norm": 0.935188353061676, + "learning_rate": 6.4205884129765e-05, + "loss": 2.0109, + "step": 22483 + }, + { + "epoch": 2.0370093542615115, + "grad_norm": 1.0255359411239624, + "learning_rate": 6.419984292877424e-05, + "loss": 1.8935, + "step": 22484 + }, + { + "epoch": 2.0370999524359585, + "grad_norm": 0.9549593925476074, + "learning_rate": 6.419380172778349e-05, + "loss": 2.6329, + "step": 22485 + }, + { + "epoch": 2.037190550610405, + "grad_norm": 1.014007329940796, + "learning_rate": 6.418776052679273e-05, + "loss": 2.5814, + "step": 22486 + }, + { + "epoch": 2.037281148784852, + "grad_norm": 0.9385154843330383, + "learning_rate": 6.418171932580197e-05, + "loss": 2.7025, + "step": 22487 + }, + { + "epoch": 2.0373717469592987, + "grad_norm": 0.9699317216873169, + "learning_rate": 6.417567812481122e-05, + "loss": 2.2992, + "step": 22488 + }, + { + "epoch": 2.0374623451337457, + "grad_norm": 0.9149712920188904, + "learning_rate": 6.416963692382045e-05, + "loss": 2.4469, + "step": 22489 + }, + { + "epoch": 2.0375529433081923, + "grad_norm": 1.0284031629562378, + "learning_rate": 6.416359572282971e-05, + "loss": 2.7296, + "step": 22490 + }, + { + "epoch": 2.0376435414826393, + "grad_norm": 0.8286362886428833, + "learning_rate": 6.415755452183894e-05, + "loss": 2.0026, + "step": 22491 + }, + { + "epoch": 2.037734139657086, + "grad_norm": 1.157618522644043, + "learning_rate": 6.415151332084819e-05, + "loss": 2.4494, + "step": 22492 + }, + { + "epoch": 2.037824737831533, + "grad_norm": 0.941924512386322, + "learning_rate": 6.414547211985743e-05, + "loss": 2.6614, + "step": 22493 + }, + { + "epoch": 2.0379153360059794, + "grad_norm": 1.0556200742721558, + "learning_rate": 6.413943091886667e-05, + "loss": 2.4907, + "step": 22494 + }, + { + "epoch": 2.0380059341804264, + "grad_norm": 1.0414419174194336, + "learning_rate": 6.413338971787591e-05, + "loss": 2.71, + "step": 22495 + }, + { + "epoch": 2.038096532354873, + "grad_norm": 0.9800940155982971, + "learning_rate": 6.412734851688516e-05, + "loss": 2.9369, + "step": 22496 + }, + { + "epoch": 2.03818713052932, + "grad_norm": 0.9972456097602844, + "learning_rate": 6.41213073158944e-05, + "loss": 2.6483, + "step": 22497 + }, + { + "epoch": 2.0382777287037666, + "grad_norm": 0.987454891204834, + "learning_rate": 6.411526611490365e-05, + "loss": 2.789, + "step": 22498 + }, + { + "epoch": 2.0383683268782136, + "grad_norm": 0.9932933449745178, + "learning_rate": 6.410922491391288e-05, + "loss": 2.7982, + "step": 22499 + }, + { + "epoch": 2.03845892505266, + "grad_norm": 1.0998448133468628, + "learning_rate": 6.410318371292213e-05, + "loss": 2.5953, + "step": 22500 + }, + { + "epoch": 2.038549523227107, + "grad_norm": 1.0198296308517456, + "learning_rate": 6.409714251193138e-05, + "loss": 2.4263, + "step": 22501 + }, + { + "epoch": 2.0386401214015537, + "grad_norm": 0.9080761075019836, + "learning_rate": 6.409110131094061e-05, + "loss": 2.5872, + "step": 22502 + }, + { + "epoch": 2.0387307195760007, + "grad_norm": 1.0320014953613281, + "learning_rate": 6.408506010994986e-05, + "loss": 2.7798, + "step": 22503 + }, + { + "epoch": 2.0388213177504473, + "grad_norm": 1.0172734260559082, + "learning_rate": 6.40790189089591e-05, + "loss": 2.4336, + "step": 22504 + }, + { + "epoch": 2.0389119159248943, + "grad_norm": 0.9896705150604248, + "learning_rate": 6.407297770796836e-05, + "loss": 2.6164, + "step": 22505 + }, + { + "epoch": 2.039002514099341, + "grad_norm": 0.9053377509117126, + "learning_rate": 6.406693650697759e-05, + "loss": 2.7018, + "step": 22506 + }, + { + "epoch": 2.039093112273788, + "grad_norm": 1.028765082359314, + "learning_rate": 6.406089530598684e-05, + "loss": 2.5788, + "step": 22507 + }, + { + "epoch": 2.0391837104482344, + "grad_norm": 0.9166079759597778, + "learning_rate": 6.405485410499607e-05, + "loss": 2.6725, + "step": 22508 + }, + { + "epoch": 2.0392743086226814, + "grad_norm": 1.0024828910827637, + "learning_rate": 6.404881290400532e-05, + "loss": 2.8034, + "step": 22509 + }, + { + "epoch": 2.039364906797128, + "grad_norm": 0.8979551792144775, + "learning_rate": 6.404277170301455e-05, + "loss": 2.5963, + "step": 22510 + }, + { + "epoch": 2.039455504971575, + "grad_norm": 1.0110561847686768, + "learning_rate": 6.40367305020238e-05, + "loss": 2.7995, + "step": 22511 + }, + { + "epoch": 2.0395461031460216, + "grad_norm": 0.9766048789024353, + "learning_rate": 6.403068930103305e-05, + "loss": 2.7225, + "step": 22512 + }, + { + "epoch": 2.0396367013204686, + "grad_norm": 0.8458483219146729, + "learning_rate": 6.40246481000423e-05, + "loss": 1.9315, + "step": 22513 + }, + { + "epoch": 2.039727299494915, + "grad_norm": 1.066815972328186, + "learning_rate": 6.401860689905153e-05, + "loss": 2.9555, + "step": 22514 + }, + { + "epoch": 2.039817897669362, + "grad_norm": 0.9964020848274231, + "learning_rate": 6.401256569806078e-05, + "loss": 2.7083, + "step": 22515 + }, + { + "epoch": 2.0399084958438087, + "grad_norm": 1.0260035991668701, + "learning_rate": 6.400652449707001e-05, + "loss": 2.6486, + "step": 22516 + }, + { + "epoch": 2.0399990940182557, + "grad_norm": 0.9666662216186523, + "learning_rate": 6.400048329607926e-05, + "loss": 2.6204, + "step": 22517 + }, + { + "epoch": 2.0400896921927023, + "grad_norm": 0.9722194075584412, + "learning_rate": 6.399444209508851e-05, + "loss": 2.6506, + "step": 22518 + }, + { + "epoch": 2.0401802903671493, + "grad_norm": 0.9157611131668091, + "learning_rate": 6.398840089409774e-05, + "loss": 2.6236, + "step": 22519 + }, + { + "epoch": 2.040270888541596, + "grad_norm": 0.9655811786651611, + "learning_rate": 6.3982359693107e-05, + "loss": 2.6724, + "step": 22520 + }, + { + "epoch": 2.0403614867160424, + "grad_norm": 0.9250736832618713, + "learning_rate": 6.397631849211624e-05, + "loss": 2.6359, + "step": 22521 + }, + { + "epoch": 2.0404520848904895, + "grad_norm": 0.9459090828895569, + "learning_rate": 6.397027729112549e-05, + "loss": 2.7807, + "step": 22522 + }, + { + "epoch": 2.040542683064936, + "grad_norm": 0.9911989569664001, + "learning_rate": 6.396423609013472e-05, + "loss": 2.4619, + "step": 22523 + }, + { + "epoch": 2.040633281239383, + "grad_norm": 0.9566729664802551, + "learning_rate": 6.395819488914397e-05, + "loss": 2.6126, + "step": 22524 + }, + { + "epoch": 2.0407238794138296, + "grad_norm": 0.9251226782798767, + "learning_rate": 6.39521536881532e-05, + "loss": 2.3631, + "step": 22525 + }, + { + "epoch": 2.0408144775882766, + "grad_norm": 0.9417858719825745, + "learning_rate": 6.394611248716245e-05, + "loss": 2.5391, + "step": 22526 + }, + { + "epoch": 2.040905075762723, + "grad_norm": 0.8979924321174622, + "learning_rate": 6.39400712861717e-05, + "loss": 2.4574, + "step": 22527 + }, + { + "epoch": 2.04099567393717, + "grad_norm": 0.9925383925437927, + "learning_rate": 6.393403008518094e-05, + "loss": 2.7501, + "step": 22528 + }, + { + "epoch": 2.0410862721116168, + "grad_norm": 0.9494670629501343, + "learning_rate": 6.392798888419018e-05, + "loss": 2.8848, + "step": 22529 + }, + { + "epoch": 2.0411768702860638, + "grad_norm": 1.0008326768875122, + "learning_rate": 6.392194768319943e-05, + "loss": 2.5386, + "step": 22530 + }, + { + "epoch": 2.0412674684605103, + "grad_norm": 0.9937869906425476, + "learning_rate": 6.391590648220866e-05, + "loss": 2.6259, + "step": 22531 + }, + { + "epoch": 2.0413580666349573, + "grad_norm": 0.9896372556686401, + "learning_rate": 6.390986528121791e-05, + "loss": 2.7059, + "step": 22532 + }, + { + "epoch": 2.041448664809404, + "grad_norm": 1.0169293880462646, + "learning_rate": 6.390382408022715e-05, + "loss": 2.4897, + "step": 22533 + }, + { + "epoch": 2.041539262983851, + "grad_norm": 1.0422813892364502, + "learning_rate": 6.389778287923639e-05, + "loss": 2.816, + "step": 22534 + }, + { + "epoch": 2.0416298611582975, + "grad_norm": 0.9555917382240295, + "learning_rate": 6.389174167824564e-05, + "loss": 2.6082, + "step": 22535 + }, + { + "epoch": 2.0417204593327445, + "grad_norm": 0.9853233098983765, + "learning_rate": 6.388570047725488e-05, + "loss": 2.769, + "step": 22536 + }, + { + "epoch": 2.041811057507191, + "grad_norm": 1.1151952743530273, + "learning_rate": 6.387965927626413e-05, + "loss": 2.3941, + "step": 22537 + }, + { + "epoch": 2.041901655681638, + "grad_norm": 0.9171881675720215, + "learning_rate": 6.387361807527337e-05, + "loss": 2.7124, + "step": 22538 + }, + { + "epoch": 2.0419922538560846, + "grad_norm": 0.898955225944519, + "learning_rate": 6.386757687428261e-05, + "loss": 2.6052, + "step": 22539 + }, + { + "epoch": 2.0420828520305316, + "grad_norm": 1.0008785724639893, + "learning_rate": 6.386153567329185e-05, + "loss": 2.8197, + "step": 22540 + }, + { + "epoch": 2.042173450204978, + "grad_norm": 0.8985213041305542, + "learning_rate": 6.38554944723011e-05, + "loss": 2.4653, + "step": 22541 + }, + { + "epoch": 2.042264048379425, + "grad_norm": 1.0775257349014282, + "learning_rate": 6.384945327131034e-05, + "loss": 2.728, + "step": 22542 + }, + { + "epoch": 2.0423546465538718, + "grad_norm": 0.9409040212631226, + "learning_rate": 6.384341207031959e-05, + "loss": 2.6948, + "step": 22543 + }, + { + "epoch": 2.042445244728319, + "grad_norm": 0.9725443720817566, + "learning_rate": 6.383737086932882e-05, + "loss": 2.481, + "step": 22544 + }, + { + "epoch": 2.0425358429027654, + "grad_norm": 0.835784912109375, + "learning_rate": 6.383132966833807e-05, + "loss": 1.9209, + "step": 22545 + }, + { + "epoch": 2.0426264410772124, + "grad_norm": 1.0026524066925049, + "learning_rate": 6.38252884673473e-05, + "loss": 2.5746, + "step": 22546 + }, + { + "epoch": 2.042717039251659, + "grad_norm": 1.0858005285263062, + "learning_rate": 6.381924726635655e-05, + "loss": 2.5724, + "step": 22547 + }, + { + "epoch": 2.042807637426106, + "grad_norm": 0.8653454184532166, + "learning_rate": 6.381320606536579e-05, + "loss": 1.9282, + "step": 22548 + }, + { + "epoch": 2.0428982356005525, + "grad_norm": 1.0143052339553833, + "learning_rate": 6.380716486437503e-05, + "loss": 2.4241, + "step": 22549 + }, + { + "epoch": 2.0429888337749995, + "grad_norm": 0.9867615699768066, + "learning_rate": 6.380112366338428e-05, + "loss": 2.4176, + "step": 22550 + }, + { + "epoch": 2.043079431949446, + "grad_norm": 1.096803069114685, + "learning_rate": 6.379508246239353e-05, + "loss": 2.7444, + "step": 22551 + }, + { + "epoch": 2.043170030123893, + "grad_norm": 0.9377018213272095, + "learning_rate": 6.378904126140278e-05, + "loss": 2.2914, + "step": 22552 + }, + { + "epoch": 2.0432606282983397, + "grad_norm": 0.9667424559593201, + "learning_rate": 6.378300006041201e-05, + "loss": 2.6543, + "step": 22553 + }, + { + "epoch": 2.0433512264727867, + "grad_norm": 0.9807984828948975, + "learning_rate": 6.377695885942126e-05, + "loss": 2.6215, + "step": 22554 + }, + { + "epoch": 2.0434418246472332, + "grad_norm": 1.0064085721969604, + "learning_rate": 6.37709176584305e-05, + "loss": 2.6982, + "step": 22555 + }, + { + "epoch": 2.0435324228216802, + "grad_norm": 1.119480013847351, + "learning_rate": 6.376487645743974e-05, + "loss": 2.5054, + "step": 22556 + }, + { + "epoch": 2.043623020996127, + "grad_norm": 1.0275259017944336, + "learning_rate": 6.375883525644899e-05, + "loss": 2.548, + "step": 22557 + }, + { + "epoch": 2.043713619170574, + "grad_norm": 1.0286402702331543, + "learning_rate": 6.375279405545824e-05, + "loss": 2.8651, + "step": 22558 + }, + { + "epoch": 2.0438042173450204, + "grad_norm": 1.0948445796966553, + "learning_rate": 6.374675285446747e-05, + "loss": 2.4969, + "step": 22559 + }, + { + "epoch": 2.0438948155194674, + "grad_norm": 0.972754716873169, + "learning_rate": 6.374071165347672e-05, + "loss": 2.5283, + "step": 22560 + }, + { + "epoch": 2.043985413693914, + "grad_norm": 0.9944978952407837, + "learning_rate": 6.373467045248595e-05, + "loss": 2.7143, + "step": 22561 + }, + { + "epoch": 2.044076011868361, + "grad_norm": 0.9089500904083252, + "learning_rate": 6.37286292514952e-05, + "loss": 2.5623, + "step": 22562 + }, + { + "epoch": 2.0441666100428075, + "grad_norm": 0.8525901436805725, + "learning_rate": 6.372258805050443e-05, + "loss": 2.0109, + "step": 22563 + }, + { + "epoch": 2.0442572082172545, + "grad_norm": 0.981980562210083, + "learning_rate": 6.37165468495137e-05, + "loss": 2.7427, + "step": 22564 + }, + { + "epoch": 2.044347806391701, + "grad_norm": 1.1150647401809692, + "learning_rate": 6.371050564852293e-05, + "loss": 2.6366, + "step": 22565 + }, + { + "epoch": 2.044438404566148, + "grad_norm": 1.004163146018982, + "learning_rate": 6.370446444753218e-05, + "loss": 2.5551, + "step": 22566 + }, + { + "epoch": 2.0445290027405947, + "grad_norm": 0.967962920665741, + "learning_rate": 6.369842324654141e-05, + "loss": 2.563, + "step": 22567 + }, + { + "epoch": 2.0446196009150417, + "grad_norm": 1.1700012683868408, + "learning_rate": 6.369238204555066e-05, + "loss": 2.776, + "step": 22568 + }, + { + "epoch": 2.0447101990894883, + "grad_norm": 0.8356457948684692, + "learning_rate": 6.36863408445599e-05, + "loss": 2.0045, + "step": 22569 + }, + { + "epoch": 2.0448007972639353, + "grad_norm": 0.962115466594696, + "learning_rate": 6.368029964356914e-05, + "loss": 2.5173, + "step": 22570 + }, + { + "epoch": 2.044891395438382, + "grad_norm": 0.9720346331596375, + "learning_rate": 6.367425844257839e-05, + "loss": 2.7505, + "step": 22571 + }, + { + "epoch": 2.044981993612829, + "grad_norm": 0.9032013416290283, + "learning_rate": 6.366821724158763e-05, + "loss": 1.7515, + "step": 22572 + }, + { + "epoch": 2.0450725917872754, + "grad_norm": 0.8700711727142334, + "learning_rate": 6.366217604059688e-05, + "loss": 1.9822, + "step": 22573 + }, + { + "epoch": 2.0451631899617224, + "grad_norm": 1.020229458808899, + "learning_rate": 6.365613483960612e-05, + "loss": 2.6338, + "step": 22574 + }, + { + "epoch": 2.045253788136169, + "grad_norm": 0.9302432537078857, + "learning_rate": 6.365009363861536e-05, + "loss": 2.4709, + "step": 22575 + }, + { + "epoch": 2.045344386310616, + "grad_norm": 0.9653428792953491, + "learning_rate": 6.36440524376246e-05, + "loss": 2.5986, + "step": 22576 + }, + { + "epoch": 2.0454349844850626, + "grad_norm": 0.9333884716033936, + "learning_rate": 6.363801123663385e-05, + "loss": 2.3914, + "step": 22577 + }, + { + "epoch": 2.0455255826595096, + "grad_norm": 0.8647329211235046, + "learning_rate": 6.363197003564308e-05, + "loss": 1.9295, + "step": 22578 + }, + { + "epoch": 2.045616180833956, + "grad_norm": 0.9752195477485657, + "learning_rate": 6.362592883465234e-05, + "loss": 2.5549, + "step": 22579 + }, + { + "epoch": 2.045706779008403, + "grad_norm": 0.9546664357185364, + "learning_rate": 6.361988763366158e-05, + "loss": 2.6623, + "step": 22580 + }, + { + "epoch": 2.0457973771828497, + "grad_norm": 0.9361727833747864, + "learning_rate": 6.361384643267082e-05, + "loss": 2.6915, + "step": 22581 + }, + { + "epoch": 2.0458879753572967, + "grad_norm": 0.8940561413764954, + "learning_rate": 6.360780523168006e-05, + "loss": 2.027, + "step": 22582 + }, + { + "epoch": 2.0459785735317433, + "grad_norm": 0.8727886080741882, + "learning_rate": 6.36017640306893e-05, + "loss": 2.5674, + "step": 22583 + }, + { + "epoch": 2.0460691717061903, + "grad_norm": 0.9865095019340515, + "learning_rate": 6.359572282969855e-05, + "loss": 2.6154, + "step": 22584 + }, + { + "epoch": 2.046159769880637, + "grad_norm": 1.0297385454177856, + "learning_rate": 6.358968162870779e-05, + "loss": 2.6661, + "step": 22585 + }, + { + "epoch": 2.046250368055084, + "grad_norm": 0.9749696850776672, + "learning_rate": 6.358364042771703e-05, + "loss": 2.4112, + "step": 22586 + }, + { + "epoch": 2.0463409662295304, + "grad_norm": 0.8852656483650208, + "learning_rate": 6.357759922672628e-05, + "loss": 1.9268, + "step": 22587 + }, + { + "epoch": 2.0464315644039774, + "grad_norm": 1.0001457929611206, + "learning_rate": 6.357155802573553e-05, + "loss": 2.7126, + "step": 22588 + }, + { + "epoch": 2.046522162578424, + "grad_norm": 1.0333892107009888, + "learning_rate": 6.356551682474476e-05, + "loss": 2.7693, + "step": 22589 + }, + { + "epoch": 2.046612760752871, + "grad_norm": 0.9878695607185364, + "learning_rate": 6.355947562375401e-05, + "loss": 2.36, + "step": 22590 + }, + { + "epoch": 2.0467033589273176, + "grad_norm": 1.0168191194534302, + "learning_rate": 6.355343442276324e-05, + "loss": 2.6159, + "step": 22591 + }, + { + "epoch": 2.0467939571017646, + "grad_norm": 1.0098586082458496, + "learning_rate": 6.354739322177249e-05, + "loss": 2.5622, + "step": 22592 + }, + { + "epoch": 2.046884555276211, + "grad_norm": 0.9555143117904663, + "learning_rate": 6.354135202078173e-05, + "loss": 2.493, + "step": 22593 + }, + { + "epoch": 2.046975153450658, + "grad_norm": 1.0893062353134155, + "learning_rate": 6.353531081979099e-05, + "loss": 2.4443, + "step": 22594 + }, + { + "epoch": 2.0470657516251047, + "grad_norm": 0.8303702473640442, + "learning_rate": 6.352926961880022e-05, + "loss": 1.9195, + "step": 22595 + }, + { + "epoch": 2.0471563497995517, + "grad_norm": 0.9610450267791748, + "learning_rate": 6.352322841780947e-05, + "loss": 2.7019, + "step": 22596 + }, + { + "epoch": 2.0472469479739983, + "grad_norm": 0.9090034365653992, + "learning_rate": 6.35171872168187e-05, + "loss": 1.7948, + "step": 22597 + }, + { + "epoch": 2.0473375461484453, + "grad_norm": 1.1339194774627686, + "learning_rate": 6.351114601582795e-05, + "loss": 2.7097, + "step": 22598 + }, + { + "epoch": 2.047428144322892, + "grad_norm": 0.9823558330535889, + "learning_rate": 6.350510481483718e-05, + "loss": 2.627, + "step": 22599 + }, + { + "epoch": 2.047518742497339, + "grad_norm": 1.0052762031555176, + "learning_rate": 6.349906361384643e-05, + "loss": 2.7369, + "step": 22600 + }, + { + "epoch": 2.0476093406717855, + "grad_norm": 1.0166376829147339, + "learning_rate": 6.349302241285568e-05, + "loss": 2.6879, + "step": 22601 + }, + { + "epoch": 2.0476999388462325, + "grad_norm": 0.9907487630844116, + "learning_rate": 6.348698121186493e-05, + "loss": 2.6979, + "step": 22602 + }, + { + "epoch": 2.047790537020679, + "grad_norm": 1.004740834236145, + "learning_rate": 6.348094001087416e-05, + "loss": 2.5731, + "step": 22603 + }, + { + "epoch": 2.0478811351951256, + "grad_norm": 0.96180659532547, + "learning_rate": 6.347489880988341e-05, + "loss": 1.8845, + "step": 22604 + }, + { + "epoch": 2.0479717333695726, + "grad_norm": 0.9655829668045044, + "learning_rate": 6.346885760889266e-05, + "loss": 2.1873, + "step": 22605 + }, + { + "epoch": 2.048062331544019, + "grad_norm": 1.125704288482666, + "learning_rate": 6.346281640790189e-05, + "loss": 2.4982, + "step": 22606 + }, + { + "epoch": 2.048152929718466, + "grad_norm": 0.9911738038063049, + "learning_rate": 6.345677520691114e-05, + "loss": 2.6403, + "step": 22607 + }, + { + "epoch": 2.0482435278929128, + "grad_norm": 0.8777198791503906, + "learning_rate": 6.345073400592037e-05, + "loss": 2.1564, + "step": 22608 + }, + { + "epoch": 2.0483341260673598, + "grad_norm": 0.9390410780906677, + "learning_rate": 6.344469280492963e-05, + "loss": 2.5799, + "step": 22609 + }, + { + "epoch": 2.0484247242418063, + "grad_norm": 0.8325757384300232, + "learning_rate": 6.343865160393887e-05, + "loss": 1.9871, + "step": 22610 + }, + { + "epoch": 2.0485153224162533, + "grad_norm": 0.9501429796218872, + "learning_rate": 6.343261040294812e-05, + "loss": 2.4003, + "step": 22611 + }, + { + "epoch": 2.0486059205907, + "grad_norm": 0.9855993986129761, + "learning_rate": 6.342656920195735e-05, + "loss": 2.5518, + "step": 22612 + }, + { + "epoch": 2.048696518765147, + "grad_norm": 1.0880509614944458, + "learning_rate": 6.34205280009666e-05, + "loss": 2.4282, + "step": 22613 + }, + { + "epoch": 2.0487871169395935, + "grad_norm": 0.9679456949234009, + "learning_rate": 6.341448679997583e-05, + "loss": 2.3917, + "step": 22614 + }, + { + "epoch": 2.0488777151140405, + "grad_norm": 0.9715635776519775, + "learning_rate": 6.340844559898508e-05, + "loss": 2.6808, + "step": 22615 + }, + { + "epoch": 2.048968313288487, + "grad_norm": 0.9880200028419495, + "learning_rate": 6.340240439799431e-05, + "loss": 2.6918, + "step": 22616 + }, + { + "epoch": 2.049058911462934, + "grad_norm": 1.0899544954299927, + "learning_rate": 6.339636319700357e-05, + "loss": 2.6826, + "step": 22617 + }, + { + "epoch": 2.0491495096373806, + "grad_norm": 0.9829482436180115, + "learning_rate": 6.339032199601281e-05, + "loss": 2.6481, + "step": 22618 + }, + { + "epoch": 2.0492401078118276, + "grad_norm": 1.0388944149017334, + "learning_rate": 6.338428079502206e-05, + "loss": 2.5293, + "step": 22619 + }, + { + "epoch": 2.049330705986274, + "grad_norm": 0.997139573097229, + "learning_rate": 6.33782395940313e-05, + "loss": 2.7156, + "step": 22620 + }, + { + "epoch": 2.049421304160721, + "grad_norm": 0.9931432604789734, + "learning_rate": 6.337219839304054e-05, + "loss": 2.3286, + "step": 22621 + }, + { + "epoch": 2.049511902335168, + "grad_norm": 0.9963425993919373, + "learning_rate": 6.336615719204978e-05, + "loss": 2.5748, + "step": 22622 + }, + { + "epoch": 2.049602500509615, + "grad_norm": 1.075953483581543, + "learning_rate": 6.336011599105902e-05, + "loss": 2.8943, + "step": 22623 + }, + { + "epoch": 2.0496930986840614, + "grad_norm": 0.9572001695632935, + "learning_rate": 6.335407479006828e-05, + "loss": 2.6631, + "step": 22624 + }, + { + "epoch": 2.0497836968585084, + "grad_norm": 0.9855595231056213, + "learning_rate": 6.334803358907751e-05, + "loss": 2.6276, + "step": 22625 + }, + { + "epoch": 2.049874295032955, + "grad_norm": 1.836513876914978, + "learning_rate": 6.334199238808676e-05, + "loss": 2.3002, + "step": 22626 + }, + { + "epoch": 2.049964893207402, + "grad_norm": 0.9881384968757629, + "learning_rate": 6.3335951187096e-05, + "loss": 2.7472, + "step": 22627 + }, + { + "epoch": 2.0500554913818485, + "grad_norm": 0.8054053783416748, + "learning_rate": 6.332990998610524e-05, + "loss": 2.025, + "step": 22628 + }, + { + "epoch": 2.0501460895562955, + "grad_norm": 0.963868260383606, + "learning_rate": 6.332386878511448e-05, + "loss": 2.464, + "step": 22629 + }, + { + "epoch": 2.050236687730742, + "grad_norm": 0.9602528810501099, + "learning_rate": 6.331782758412373e-05, + "loss": 2.4325, + "step": 22630 + }, + { + "epoch": 2.050327285905189, + "grad_norm": 0.9793020486831665, + "learning_rate": 6.331178638313296e-05, + "loss": 2.5587, + "step": 22631 + }, + { + "epoch": 2.0504178840796357, + "grad_norm": 1.0409222841262817, + "learning_rate": 6.330574518214222e-05, + "loss": 2.9982, + "step": 22632 + }, + { + "epoch": 2.0505084822540827, + "grad_norm": 0.9600249528884888, + "learning_rate": 6.329970398115145e-05, + "loss": 2.6744, + "step": 22633 + }, + { + "epoch": 2.0505990804285292, + "grad_norm": 0.9182130098342896, + "learning_rate": 6.32936627801607e-05, + "loss": 2.5685, + "step": 22634 + }, + { + "epoch": 2.0506896786029762, + "grad_norm": 1.0220210552215576, + "learning_rate": 6.328762157916994e-05, + "loss": 2.81, + "step": 22635 + }, + { + "epoch": 2.050780276777423, + "grad_norm": 0.9477523565292358, + "learning_rate": 6.328158037817918e-05, + "loss": 2.434, + "step": 22636 + }, + { + "epoch": 2.05087087495187, + "grad_norm": 0.94012051820755, + "learning_rate": 6.327553917718843e-05, + "loss": 2.5868, + "step": 22637 + }, + { + "epoch": 2.0509614731263164, + "grad_norm": 1.035434365272522, + "learning_rate": 6.326949797619767e-05, + "loss": 2.4972, + "step": 22638 + }, + { + "epoch": 2.0510520713007634, + "grad_norm": 1.2661426067352295, + "learning_rate": 6.326345677520693e-05, + "loss": 2.347, + "step": 22639 + }, + { + "epoch": 2.05114266947521, + "grad_norm": 0.9823977947235107, + "learning_rate": 6.325741557421616e-05, + "loss": 2.9762, + "step": 22640 + }, + { + "epoch": 2.051233267649657, + "grad_norm": 0.9429644346237183, + "learning_rate": 6.325137437322541e-05, + "loss": 2.6277, + "step": 22641 + }, + { + "epoch": 2.0513238658241035, + "grad_norm": 0.9734635353088379, + "learning_rate": 6.324533317223464e-05, + "loss": 2.6538, + "step": 22642 + }, + { + "epoch": 2.0514144639985505, + "grad_norm": 0.9831640124320984, + "learning_rate": 6.323929197124389e-05, + "loss": 2.4476, + "step": 22643 + }, + { + "epoch": 2.051505062172997, + "grad_norm": 1.0345336198806763, + "learning_rate": 6.323325077025312e-05, + "loss": 2.2978, + "step": 22644 + }, + { + "epoch": 2.051595660347444, + "grad_norm": 0.917147696018219, + "learning_rate": 6.322720956926237e-05, + "loss": 2.3183, + "step": 22645 + }, + { + "epoch": 2.0516862585218907, + "grad_norm": 0.9558948278427124, + "learning_rate": 6.32211683682716e-05, + "loss": 2.6826, + "step": 22646 + }, + { + "epoch": 2.0517768566963377, + "grad_norm": 0.9874320030212402, + "learning_rate": 6.321512716728087e-05, + "loss": 2.8275, + "step": 22647 + }, + { + "epoch": 2.0518674548707843, + "grad_norm": 0.9053168296813965, + "learning_rate": 6.32090859662901e-05, + "loss": 2.2391, + "step": 22648 + }, + { + "epoch": 2.0519580530452313, + "grad_norm": 0.9887839555740356, + "learning_rate": 6.320304476529935e-05, + "loss": 1.8466, + "step": 22649 + }, + { + "epoch": 2.052048651219678, + "grad_norm": 0.9886549711227417, + "learning_rate": 6.319700356430858e-05, + "loss": 2.4908, + "step": 22650 + }, + { + "epoch": 2.052139249394125, + "grad_norm": 1.0102453231811523, + "learning_rate": 6.319096236331783e-05, + "loss": 2.6269, + "step": 22651 + }, + { + "epoch": 2.0522298475685714, + "grad_norm": 0.9010884165763855, + "learning_rate": 6.318492116232708e-05, + "loss": 2.436, + "step": 22652 + }, + { + "epoch": 2.0523204457430184, + "grad_norm": 0.9334977269172668, + "learning_rate": 6.317887996133631e-05, + "loss": 2.5284, + "step": 22653 + }, + { + "epoch": 2.052411043917465, + "grad_norm": 0.9336189031600952, + "learning_rate": 6.317283876034556e-05, + "loss": 2.5036, + "step": 22654 + }, + { + "epoch": 2.052501642091912, + "grad_norm": 0.9432747960090637, + "learning_rate": 6.31667975593548e-05, + "loss": 2.7686, + "step": 22655 + }, + { + "epoch": 2.0525922402663586, + "grad_norm": 0.960013210773468, + "learning_rate": 6.316075635836405e-05, + "loss": 2.4279, + "step": 22656 + }, + { + "epoch": 2.0526828384408056, + "grad_norm": 0.961010217666626, + "learning_rate": 6.315471515737329e-05, + "loss": 2.5081, + "step": 22657 + }, + { + "epoch": 2.052773436615252, + "grad_norm": 0.9532817602157593, + "learning_rate": 6.314867395638254e-05, + "loss": 2.5881, + "step": 22658 + }, + { + "epoch": 2.052864034789699, + "grad_norm": 0.8881338238716125, + "learning_rate": 6.314263275539177e-05, + "loss": 1.9827, + "step": 22659 + }, + { + "epoch": 2.0529546329641457, + "grad_norm": 0.954803466796875, + "learning_rate": 6.313659155440102e-05, + "loss": 2.5891, + "step": 22660 + }, + { + "epoch": 2.0530452311385927, + "grad_norm": 0.9443511366844177, + "learning_rate": 6.313055035341025e-05, + "loss": 2.4563, + "step": 22661 + }, + { + "epoch": 2.0531358293130393, + "grad_norm": 1.0233107805252075, + "learning_rate": 6.312450915241951e-05, + "loss": 2.7008, + "step": 22662 + }, + { + "epoch": 2.0532264274874863, + "grad_norm": 0.9671158194541931, + "learning_rate": 6.311846795142875e-05, + "loss": 2.5273, + "step": 22663 + }, + { + "epoch": 2.053317025661933, + "grad_norm": 1.0174362659454346, + "learning_rate": 6.3112426750438e-05, + "loss": 2.6514, + "step": 22664 + }, + { + "epoch": 2.05340762383638, + "grad_norm": 0.9394615292549133, + "learning_rate": 6.310638554944723e-05, + "loss": 2.8119, + "step": 22665 + }, + { + "epoch": 2.0534982220108264, + "grad_norm": 0.9143931269645691, + "learning_rate": 6.310034434845648e-05, + "loss": 2.5832, + "step": 22666 + }, + { + "epoch": 2.0535888201852734, + "grad_norm": 1.0260998010635376, + "learning_rate": 6.309430314746571e-05, + "loss": 2.5824, + "step": 22667 + }, + { + "epoch": 2.05367941835972, + "grad_norm": 0.9409666061401367, + "learning_rate": 6.308826194647496e-05, + "loss": 2.8835, + "step": 22668 + }, + { + "epoch": 2.053770016534167, + "grad_norm": 1.021858811378479, + "learning_rate": 6.30822207454842e-05, + "loss": 2.5524, + "step": 22669 + }, + { + "epoch": 2.0538606147086136, + "grad_norm": 0.9762405157089233, + "learning_rate": 6.307617954449345e-05, + "loss": 2.6594, + "step": 22670 + }, + { + "epoch": 2.0539512128830606, + "grad_norm": 0.8038343787193298, + "learning_rate": 6.307013834350269e-05, + "loss": 2.0035, + "step": 22671 + }, + { + "epoch": 2.054041811057507, + "grad_norm": 1.0349143743515015, + "learning_rate": 6.306409714251193e-05, + "loss": 2.6669, + "step": 22672 + }, + { + "epoch": 2.054132409231954, + "grad_norm": 0.9471554756164551, + "learning_rate": 6.305805594152118e-05, + "loss": 2.4786, + "step": 22673 + }, + { + "epoch": 2.0542230074064007, + "grad_norm": 0.9580487608909607, + "learning_rate": 6.305201474053042e-05, + "loss": 2.6972, + "step": 22674 + }, + { + "epoch": 2.0543136055808477, + "grad_norm": 0.9596225619316101, + "learning_rate": 6.304597353953966e-05, + "loss": 2.624, + "step": 22675 + }, + { + "epoch": 2.0544042037552943, + "grad_norm": 1.0457470417022705, + "learning_rate": 6.30399323385489e-05, + "loss": 2.4903, + "step": 22676 + }, + { + "epoch": 2.0544948019297413, + "grad_norm": 0.9956724047660828, + "learning_rate": 6.303389113755816e-05, + "loss": 2.6032, + "step": 22677 + }, + { + "epoch": 2.054585400104188, + "grad_norm": 1.0137567520141602, + "learning_rate": 6.30278499365674e-05, + "loss": 2.6598, + "step": 22678 + }, + { + "epoch": 2.054675998278635, + "grad_norm": 0.9740610122680664, + "learning_rate": 6.302180873557664e-05, + "loss": 1.9308, + "step": 22679 + }, + { + "epoch": 2.0547665964530815, + "grad_norm": 1.055100440979004, + "learning_rate": 6.301576753458587e-05, + "loss": 2.5852, + "step": 22680 + }, + { + "epoch": 2.0548571946275285, + "grad_norm": 0.9308798909187317, + "learning_rate": 6.300972633359512e-05, + "loss": 2.2501, + "step": 22681 + }, + { + "epoch": 2.054947792801975, + "grad_norm": 0.9070813059806824, + "learning_rate": 6.300368513260436e-05, + "loss": 2.4128, + "step": 22682 + }, + { + "epoch": 2.0550383909764216, + "grad_norm": 1.014105200767517, + "learning_rate": 6.29976439316136e-05, + "loss": 2.7188, + "step": 22683 + }, + { + "epoch": 2.0551289891508686, + "grad_norm": 0.9478300213813782, + "learning_rate": 6.299160273062285e-05, + "loss": 2.6373, + "step": 22684 + }, + { + "epoch": 2.055219587325315, + "grad_norm": 1.056430697441101, + "learning_rate": 6.29855615296321e-05, + "loss": 2.7618, + "step": 22685 + }, + { + "epoch": 2.055310185499762, + "grad_norm": 1.019615650177002, + "learning_rate": 6.297952032864133e-05, + "loss": 2.6049, + "step": 22686 + }, + { + "epoch": 2.0554007836742088, + "grad_norm": 1.0529277324676514, + "learning_rate": 6.297347912765058e-05, + "loss": 2.6917, + "step": 22687 + }, + { + "epoch": 2.0554913818486558, + "grad_norm": 0.9438138008117676, + "learning_rate": 6.296743792665983e-05, + "loss": 2.6698, + "step": 22688 + }, + { + "epoch": 2.0555819800231023, + "grad_norm": 1.0729974508285522, + "learning_rate": 6.296139672566906e-05, + "loss": 2.6818, + "step": 22689 + }, + { + "epoch": 2.0556725781975493, + "grad_norm": 0.9421913027763367, + "learning_rate": 6.295535552467831e-05, + "loss": 2.4969, + "step": 22690 + }, + { + "epoch": 2.055763176371996, + "grad_norm": 0.9688330292701721, + "learning_rate": 6.294931432368754e-05, + "loss": 2.5301, + "step": 22691 + }, + { + "epoch": 2.055853774546443, + "grad_norm": 1.0098309516906738, + "learning_rate": 6.29432731226968e-05, + "loss": 2.7755, + "step": 22692 + }, + { + "epoch": 2.0559443727208895, + "grad_norm": 0.9885329008102417, + "learning_rate": 6.293723192170604e-05, + "loss": 2.6128, + "step": 22693 + }, + { + "epoch": 2.0560349708953365, + "grad_norm": 0.9896336793899536, + "learning_rate": 6.293119072071529e-05, + "loss": 2.4369, + "step": 22694 + }, + { + "epoch": 2.056125569069783, + "grad_norm": 0.9639785885810852, + "learning_rate": 6.292514951972452e-05, + "loss": 2.5639, + "step": 22695 + }, + { + "epoch": 2.05621616724423, + "grad_norm": 1.051200270652771, + "learning_rate": 6.291910831873377e-05, + "loss": 2.6142, + "step": 22696 + }, + { + "epoch": 2.0563067654186766, + "grad_norm": 0.9751104712486267, + "learning_rate": 6.2913067117743e-05, + "loss": 2.2848, + "step": 22697 + }, + { + "epoch": 2.0563973635931236, + "grad_norm": 0.7953630089759827, + "learning_rate": 6.290702591675225e-05, + "loss": 1.9292, + "step": 22698 + }, + { + "epoch": 2.05648796176757, + "grad_norm": 0.9747294187545776, + "learning_rate": 6.29009847157615e-05, + "loss": 2.857, + "step": 22699 + }, + { + "epoch": 2.056578559942017, + "grad_norm": 0.9926913380622864, + "learning_rate": 6.289494351477075e-05, + "loss": 2.0838, + "step": 22700 + }, + { + "epoch": 2.056669158116464, + "grad_norm": 1.0195516347885132, + "learning_rate": 6.288890231377998e-05, + "loss": 2.5748, + "step": 22701 + }, + { + "epoch": 2.056759756290911, + "grad_norm": 1.0374304056167603, + "learning_rate": 6.288286111278923e-05, + "loss": 2.3971, + "step": 22702 + }, + { + "epoch": 2.0568503544653574, + "grad_norm": 0.9471361041069031, + "learning_rate": 6.287681991179846e-05, + "loss": 2.6876, + "step": 22703 + }, + { + "epoch": 2.0569409526398044, + "grad_norm": 0.955608069896698, + "learning_rate": 6.287077871080771e-05, + "loss": 3.0886, + "step": 22704 + }, + { + "epoch": 2.057031550814251, + "grad_norm": 1.0333534479141235, + "learning_rate": 6.286473750981696e-05, + "loss": 2.6162, + "step": 22705 + }, + { + "epoch": 2.057122148988698, + "grad_norm": 1.0494332313537598, + "learning_rate": 6.285869630882619e-05, + "loss": 2.4633, + "step": 22706 + }, + { + "epoch": 2.0572127471631445, + "grad_norm": 0.9707449078559875, + "learning_rate": 6.285265510783545e-05, + "loss": 2.4352, + "step": 22707 + }, + { + "epoch": 2.0573033453375915, + "grad_norm": 0.9160539507865906, + "learning_rate": 6.284661390684469e-05, + "loss": 2.4515, + "step": 22708 + }, + { + "epoch": 2.057393943512038, + "grad_norm": 0.9413464069366455, + "learning_rate": 6.284057270585393e-05, + "loss": 2.4654, + "step": 22709 + }, + { + "epoch": 2.057484541686485, + "grad_norm": 0.8285589218139648, + "learning_rate": 6.283453150486317e-05, + "loss": 1.8338, + "step": 22710 + }, + { + "epoch": 2.0575751398609317, + "grad_norm": 0.9137992262840271, + "learning_rate": 6.282849030387242e-05, + "loss": 2.5138, + "step": 22711 + }, + { + "epoch": 2.0576657380353787, + "grad_norm": 0.9717377424240112, + "learning_rate": 6.282244910288165e-05, + "loss": 2.5932, + "step": 22712 + }, + { + "epoch": 2.0577563362098252, + "grad_norm": 0.9827550649642944, + "learning_rate": 6.28164079018909e-05, + "loss": 2.6098, + "step": 22713 + }, + { + "epoch": 2.0578469343842722, + "grad_norm": 0.9734681248664856, + "learning_rate": 6.281036670090014e-05, + "loss": 2.5172, + "step": 22714 + }, + { + "epoch": 2.057937532558719, + "grad_norm": 1.0671194791793823, + "learning_rate": 6.280432549990939e-05, + "loss": 2.4602, + "step": 22715 + }, + { + "epoch": 2.058028130733166, + "grad_norm": 1.0050849914550781, + "learning_rate": 6.279828429891863e-05, + "loss": 2.8364, + "step": 22716 + }, + { + "epoch": 2.0581187289076124, + "grad_norm": 0.9786787629127502, + "learning_rate": 6.279224309792787e-05, + "loss": 1.7922, + "step": 22717 + }, + { + "epoch": 2.0582093270820594, + "grad_norm": 0.951095700263977, + "learning_rate": 6.278620189693711e-05, + "loss": 2.57, + "step": 22718 + }, + { + "epoch": 2.058299925256506, + "grad_norm": 0.9399031400680542, + "learning_rate": 6.278016069594636e-05, + "loss": 2.6667, + "step": 22719 + }, + { + "epoch": 2.058390523430953, + "grad_norm": 1.0398331880569458, + "learning_rate": 6.27741194949556e-05, + "loss": 2.6689, + "step": 22720 + }, + { + "epoch": 2.0584811216053995, + "grad_norm": 1.0733742713928223, + "learning_rate": 6.276807829396484e-05, + "loss": 2.6095, + "step": 22721 + }, + { + "epoch": 2.0585717197798465, + "grad_norm": 1.0563371181488037, + "learning_rate": 6.276203709297408e-05, + "loss": 2.8838, + "step": 22722 + }, + { + "epoch": 2.058662317954293, + "grad_norm": 0.9914066195487976, + "learning_rate": 6.275599589198333e-05, + "loss": 2.5578, + "step": 22723 + }, + { + "epoch": 2.05875291612874, + "grad_norm": 0.8386269807815552, + "learning_rate": 6.274995469099258e-05, + "loss": 1.9642, + "step": 22724 + }, + { + "epoch": 2.0588435143031867, + "grad_norm": 0.971492350101471, + "learning_rate": 6.274391349000181e-05, + "loss": 2.5286, + "step": 22725 + }, + { + "epoch": 2.0589341124776337, + "grad_norm": 0.9503830671310425, + "learning_rate": 6.273787228901106e-05, + "loss": 2.5676, + "step": 22726 + }, + { + "epoch": 2.0590247106520803, + "grad_norm": 0.9762376546859741, + "learning_rate": 6.27318310880203e-05, + "loss": 2.4212, + "step": 22727 + }, + { + "epoch": 2.0591153088265273, + "grad_norm": 0.93340003490448, + "learning_rate": 6.272578988702954e-05, + "loss": 2.6747, + "step": 22728 + }, + { + "epoch": 2.059205907000974, + "grad_norm": 1.0182825326919556, + "learning_rate": 6.271974868603879e-05, + "loss": 2.7573, + "step": 22729 + }, + { + "epoch": 2.059296505175421, + "grad_norm": 0.8933236002922058, + "learning_rate": 6.271370748504804e-05, + "loss": 2.5648, + "step": 22730 + }, + { + "epoch": 2.0593871033498674, + "grad_norm": 0.8196445107460022, + "learning_rate": 6.270766628405727e-05, + "loss": 1.951, + "step": 22731 + }, + { + "epoch": 2.0594777015243144, + "grad_norm": 0.9944872260093689, + "learning_rate": 6.270162508306652e-05, + "loss": 2.5819, + "step": 22732 + }, + { + "epoch": 2.059568299698761, + "grad_norm": 0.8879053592681885, + "learning_rate": 6.269558388207575e-05, + "loss": 2.5193, + "step": 22733 + }, + { + "epoch": 2.059658897873208, + "grad_norm": 1.0175139904022217, + "learning_rate": 6.2689542681085e-05, + "loss": 2.5998, + "step": 22734 + }, + { + "epoch": 2.0597494960476546, + "grad_norm": 0.926899254322052, + "learning_rate": 6.268350148009424e-05, + "loss": 2.526, + "step": 22735 + }, + { + "epoch": 2.0598400942221016, + "grad_norm": 1.0147734880447388, + "learning_rate": 6.267746027910348e-05, + "loss": 2.6531, + "step": 22736 + }, + { + "epoch": 2.059930692396548, + "grad_norm": 1.060317039489746, + "learning_rate": 6.267141907811273e-05, + "loss": 2.8522, + "step": 22737 + }, + { + "epoch": 2.060021290570995, + "grad_norm": 1.014245629310608, + "learning_rate": 6.266537787712198e-05, + "loss": 2.8395, + "step": 22738 + }, + { + "epoch": 2.0601118887454417, + "grad_norm": 0.9655405879020691, + "learning_rate": 6.265933667613123e-05, + "loss": 2.6057, + "step": 22739 + }, + { + "epoch": 2.0602024869198887, + "grad_norm": 1.0507017374038696, + "learning_rate": 6.265329547514046e-05, + "loss": 2.5089, + "step": 22740 + }, + { + "epoch": 2.0602930850943353, + "grad_norm": 0.9572716951370239, + "learning_rate": 6.264725427414971e-05, + "loss": 2.6424, + "step": 22741 + }, + { + "epoch": 2.0603836832687823, + "grad_norm": 0.8151836395263672, + "learning_rate": 6.264121307315894e-05, + "loss": 1.8596, + "step": 22742 + }, + { + "epoch": 2.060474281443229, + "grad_norm": 0.9317359328269958, + "learning_rate": 6.263517187216819e-05, + "loss": 2.4482, + "step": 22743 + }, + { + "epoch": 2.060564879617676, + "grad_norm": 1.130515694618225, + "learning_rate": 6.262913067117744e-05, + "loss": 2.672, + "step": 22744 + }, + { + "epoch": 2.0606554777921224, + "grad_norm": 1.0304402112960815, + "learning_rate": 6.262308947018668e-05, + "loss": 2.7694, + "step": 22745 + }, + { + "epoch": 2.0607460759665694, + "grad_norm": 1.0096830129623413, + "learning_rate": 6.261704826919592e-05, + "loss": 2.781, + "step": 22746 + }, + { + "epoch": 2.060836674141016, + "grad_norm": 1.0101165771484375, + "learning_rate": 6.261100706820517e-05, + "loss": 2.6653, + "step": 22747 + }, + { + "epoch": 2.060927272315463, + "grad_norm": 1.0755016803741455, + "learning_rate": 6.26049658672144e-05, + "loss": 2.689, + "step": 22748 + }, + { + "epoch": 2.0610178704899096, + "grad_norm": 1.0250499248504639, + "learning_rate": 6.259892466622365e-05, + "loss": 2.605, + "step": 22749 + }, + { + "epoch": 2.0611084686643566, + "grad_norm": 1.0329203605651855, + "learning_rate": 6.259288346523288e-05, + "loss": 2.7152, + "step": 22750 + }, + { + "epoch": 2.061199066838803, + "grad_norm": 1.0128132104873657, + "learning_rate": 6.258684226424213e-05, + "loss": 2.5444, + "step": 22751 + }, + { + "epoch": 2.06128966501325, + "grad_norm": 0.9724762439727783, + "learning_rate": 6.258080106325138e-05, + "loss": 2.5237, + "step": 22752 + }, + { + "epoch": 2.0613802631876967, + "grad_norm": 0.9114803671836853, + "learning_rate": 6.257475986226062e-05, + "loss": 2.2069, + "step": 22753 + }, + { + "epoch": 2.0614708613621437, + "grad_norm": 1.0353636741638184, + "learning_rate": 6.256871866126986e-05, + "loss": 2.5472, + "step": 22754 + }, + { + "epoch": 2.0615614595365903, + "grad_norm": 0.9811989068984985, + "learning_rate": 6.25626774602791e-05, + "loss": 2.7447, + "step": 22755 + }, + { + "epoch": 2.0616520577110373, + "grad_norm": 1.0199462175369263, + "learning_rate": 6.255663625928835e-05, + "loss": 2.7975, + "step": 22756 + }, + { + "epoch": 2.061742655885484, + "grad_norm": 0.9583297967910767, + "learning_rate": 6.255059505829759e-05, + "loss": 2.3875, + "step": 22757 + }, + { + "epoch": 2.061833254059931, + "grad_norm": 0.9841790199279785, + "learning_rate": 6.254455385730684e-05, + "loss": 2.5892, + "step": 22758 + }, + { + "epoch": 2.0619238522343775, + "grad_norm": 1.0163425207138062, + "learning_rate": 6.253851265631608e-05, + "loss": 2.9093, + "step": 22759 + }, + { + "epoch": 2.0620144504088245, + "grad_norm": 0.9330992698669434, + "learning_rate": 6.253247145532533e-05, + "loss": 2.3398, + "step": 22760 + }, + { + "epoch": 2.062105048583271, + "grad_norm": 0.9545167684555054, + "learning_rate": 6.252643025433457e-05, + "loss": 2.6258, + "step": 22761 + }, + { + "epoch": 2.062195646757718, + "grad_norm": 0.9319035410881042, + "learning_rate": 6.252038905334381e-05, + "loss": 2.5311, + "step": 22762 + }, + { + "epoch": 2.0622862449321646, + "grad_norm": 0.9745025634765625, + "learning_rate": 6.251434785235305e-05, + "loss": 2.4232, + "step": 22763 + }, + { + "epoch": 2.0623768431066116, + "grad_norm": 0.9143370985984802, + "learning_rate": 6.25083066513623e-05, + "loss": 2.5629, + "step": 22764 + }, + { + "epoch": 2.062467441281058, + "grad_norm": 0.8597267866134644, + "learning_rate": 6.250226545037153e-05, + "loss": 2.2162, + "step": 22765 + }, + { + "epoch": 2.0625580394555048, + "grad_norm": 0.9756398797035217, + "learning_rate": 6.249622424938078e-05, + "loss": 2.7009, + "step": 22766 + }, + { + "epoch": 2.0626486376299518, + "grad_norm": 0.9818626642227173, + "learning_rate": 6.249018304839002e-05, + "loss": 2.7759, + "step": 22767 + }, + { + "epoch": 2.0627392358043983, + "grad_norm": 0.9700974822044373, + "learning_rate": 6.248414184739927e-05, + "loss": 2.7452, + "step": 22768 + }, + { + "epoch": 2.0628298339788453, + "grad_norm": 1.0135101079940796, + "learning_rate": 6.24781006464085e-05, + "loss": 2.6817, + "step": 22769 + }, + { + "epoch": 2.062920432153292, + "grad_norm": 1.0220869779586792, + "learning_rate": 6.247205944541775e-05, + "loss": 2.5022, + "step": 22770 + }, + { + "epoch": 2.063011030327739, + "grad_norm": 0.9623569250106812, + "learning_rate": 6.2466018244427e-05, + "loss": 2.4274, + "step": 22771 + }, + { + "epoch": 2.0631016285021855, + "grad_norm": 1.0938044786453247, + "learning_rate": 6.245997704343623e-05, + "loss": 2.5491, + "step": 22772 + }, + { + "epoch": 2.0631922266766325, + "grad_norm": 1.0646549463272095, + "learning_rate": 6.245393584244548e-05, + "loss": 2.7803, + "step": 22773 + }, + { + "epoch": 2.063282824851079, + "grad_norm": 0.9735512137413025, + "learning_rate": 6.244789464145473e-05, + "loss": 2.6491, + "step": 22774 + }, + { + "epoch": 2.063373423025526, + "grad_norm": 0.8500323295593262, + "learning_rate": 6.244185344046398e-05, + "loss": 1.9514, + "step": 22775 + }, + { + "epoch": 2.0634640211999726, + "grad_norm": 0.9528985619544983, + "learning_rate": 6.243581223947321e-05, + "loss": 2.4389, + "step": 22776 + }, + { + "epoch": 2.0635546193744196, + "grad_norm": 0.9780193567276001, + "learning_rate": 6.242977103848246e-05, + "loss": 2.5765, + "step": 22777 + }, + { + "epoch": 2.063645217548866, + "grad_norm": 0.9421409368515015, + "learning_rate": 6.242372983749169e-05, + "loss": 2.577, + "step": 22778 + }, + { + "epoch": 2.063735815723313, + "grad_norm": 0.9820749163627625, + "learning_rate": 6.241768863650094e-05, + "loss": 2.8066, + "step": 22779 + }, + { + "epoch": 2.06382641389776, + "grad_norm": 1.036921739578247, + "learning_rate": 6.241164743551017e-05, + "loss": 2.7788, + "step": 22780 + }, + { + "epoch": 2.063917012072207, + "grad_norm": 1.036601185798645, + "learning_rate": 6.240560623451942e-05, + "loss": 2.416, + "step": 22781 + }, + { + "epoch": 2.0640076102466534, + "grad_norm": 0.9249228239059448, + "learning_rate": 6.239956503352867e-05, + "loss": 2.5398, + "step": 22782 + }, + { + "epoch": 2.0640982084211004, + "grad_norm": 1.0134215354919434, + "learning_rate": 6.239352383253792e-05, + "loss": 2.6537, + "step": 22783 + }, + { + "epoch": 2.064188806595547, + "grad_norm": 0.9725034236907959, + "learning_rate": 6.238748263154715e-05, + "loss": 2.8406, + "step": 22784 + }, + { + "epoch": 2.064279404769994, + "grad_norm": 0.9713392853736877, + "learning_rate": 6.23814414305564e-05, + "loss": 2.5545, + "step": 22785 + }, + { + "epoch": 2.0643700029444405, + "grad_norm": 0.976131796836853, + "learning_rate": 6.237540022956563e-05, + "loss": 2.6332, + "step": 22786 + }, + { + "epoch": 2.0644606011188875, + "grad_norm": 1.020333170890808, + "learning_rate": 6.236935902857488e-05, + "loss": 2.5534, + "step": 22787 + }, + { + "epoch": 2.064551199293334, + "grad_norm": 1.006338119506836, + "learning_rate": 6.236331782758413e-05, + "loss": 2.4935, + "step": 22788 + }, + { + "epoch": 2.064641797467781, + "grad_norm": 1.1139411926269531, + "learning_rate": 6.235727662659338e-05, + "loss": 2.321, + "step": 22789 + }, + { + "epoch": 2.0647323956422277, + "grad_norm": 0.9357942342758179, + "learning_rate": 6.235123542560261e-05, + "loss": 2.8745, + "step": 22790 + }, + { + "epoch": 2.0648229938166747, + "grad_norm": 0.9637852907180786, + "learning_rate": 6.234519422461186e-05, + "loss": 2.7041, + "step": 22791 + }, + { + "epoch": 2.0649135919911212, + "grad_norm": 0.9618591666221619, + "learning_rate": 6.23391530236211e-05, + "loss": 2.4568, + "step": 22792 + }, + { + "epoch": 2.0650041901655682, + "grad_norm": 0.9688306450843811, + "learning_rate": 6.233311182263034e-05, + "loss": 2.6119, + "step": 22793 + }, + { + "epoch": 2.065094788340015, + "grad_norm": 1.0042604207992554, + "learning_rate": 6.232707062163959e-05, + "loss": 2.667, + "step": 22794 + }, + { + "epoch": 2.065185386514462, + "grad_norm": 1.0074632167816162, + "learning_rate": 6.232102942064882e-05, + "loss": 2.303, + "step": 22795 + }, + { + "epoch": 2.0652759846889084, + "grad_norm": 0.972603976726532, + "learning_rate": 6.231498821965807e-05, + "loss": 2.6516, + "step": 22796 + }, + { + "epoch": 2.0653665828633554, + "grad_norm": 1.0368618965148926, + "learning_rate": 6.230894701866732e-05, + "loss": 2.4991, + "step": 22797 + }, + { + "epoch": 2.065457181037802, + "grad_norm": 0.9952623248100281, + "learning_rate": 6.230290581767656e-05, + "loss": 2.4742, + "step": 22798 + }, + { + "epoch": 2.065547779212249, + "grad_norm": 1.0523546934127808, + "learning_rate": 6.22968646166858e-05, + "loss": 2.66, + "step": 22799 + }, + { + "epoch": 2.0656383773866955, + "grad_norm": 0.9062508344650269, + "learning_rate": 6.229082341569505e-05, + "loss": 2.4426, + "step": 22800 + }, + { + "epoch": 2.0657289755611425, + "grad_norm": 0.921950101852417, + "learning_rate": 6.228478221470428e-05, + "loss": 2.4259, + "step": 22801 + }, + { + "epoch": 2.065819573735589, + "grad_norm": 0.958024799823761, + "learning_rate": 6.227874101371353e-05, + "loss": 2.8146, + "step": 22802 + }, + { + "epoch": 2.065910171910036, + "grad_norm": 1.0379704236984253, + "learning_rate": 6.227269981272276e-05, + "loss": 2.4382, + "step": 22803 + }, + { + "epoch": 2.0660007700844827, + "grad_norm": 0.9871276021003723, + "learning_rate": 6.226665861173202e-05, + "loss": 2.5604, + "step": 22804 + }, + { + "epoch": 2.0660913682589297, + "grad_norm": 0.934616208076477, + "learning_rate": 6.226061741074126e-05, + "loss": 2.7483, + "step": 22805 + }, + { + "epoch": 2.0661819664333763, + "grad_norm": 0.9607836008071899, + "learning_rate": 6.22545762097505e-05, + "loss": 2.4312, + "step": 22806 + }, + { + "epoch": 2.0662725646078233, + "grad_norm": 1.1472275257110596, + "learning_rate": 6.224853500875975e-05, + "loss": 2.3743, + "step": 22807 + }, + { + "epoch": 2.06636316278227, + "grad_norm": 0.9213216304779053, + "learning_rate": 6.224249380776899e-05, + "loss": 2.6557, + "step": 22808 + }, + { + "epoch": 2.066453760956717, + "grad_norm": 1.0008232593536377, + "learning_rate": 6.223645260677823e-05, + "loss": 2.9068, + "step": 22809 + }, + { + "epoch": 2.0665443591311634, + "grad_norm": 1.1332217454910278, + "learning_rate": 6.223041140578747e-05, + "loss": 2.4731, + "step": 22810 + }, + { + "epoch": 2.0666349573056104, + "grad_norm": 0.8812506794929504, + "learning_rate": 6.222437020479671e-05, + "loss": 1.9917, + "step": 22811 + }, + { + "epoch": 2.066725555480057, + "grad_norm": 0.9071558713912964, + "learning_rate": 6.221832900380596e-05, + "loss": 2.4461, + "step": 22812 + }, + { + "epoch": 2.066816153654504, + "grad_norm": 0.9616997241973877, + "learning_rate": 6.221228780281521e-05, + "loss": 2.6391, + "step": 22813 + }, + { + "epoch": 2.0669067518289506, + "grad_norm": 0.9794843792915344, + "learning_rate": 6.220624660182444e-05, + "loss": 2.4607, + "step": 22814 + }, + { + "epoch": 2.0669973500033976, + "grad_norm": 0.9430341124534607, + "learning_rate": 6.220020540083369e-05, + "loss": 2.8839, + "step": 22815 + }, + { + "epoch": 2.067087948177844, + "grad_norm": 0.8556358814239502, + "learning_rate": 6.219416419984293e-05, + "loss": 2.0022, + "step": 22816 + }, + { + "epoch": 2.067178546352291, + "grad_norm": 1.0346797704696655, + "learning_rate": 6.218812299885217e-05, + "loss": 2.9533, + "step": 22817 + }, + { + "epoch": 2.0672691445267377, + "grad_norm": 1.1965314149856567, + "learning_rate": 6.218208179786141e-05, + "loss": 2.5332, + "step": 22818 + }, + { + "epoch": 2.0673597427011847, + "grad_norm": 1.0210169553756714, + "learning_rate": 6.217604059687067e-05, + "loss": 2.4984, + "step": 22819 + }, + { + "epoch": 2.0674503408756313, + "grad_norm": 0.9844578504562378, + "learning_rate": 6.21699993958799e-05, + "loss": 2.4337, + "step": 22820 + }, + { + "epoch": 2.0675409390500783, + "grad_norm": 1.0381715297698975, + "learning_rate": 6.216395819488915e-05, + "loss": 2.3477, + "step": 22821 + }, + { + "epoch": 2.067631537224525, + "grad_norm": 0.8775728940963745, + "learning_rate": 6.215791699389838e-05, + "loss": 1.9934, + "step": 22822 + }, + { + "epoch": 2.067722135398972, + "grad_norm": 1.0029269456863403, + "learning_rate": 6.215187579290763e-05, + "loss": 2.5197, + "step": 22823 + }, + { + "epoch": 2.0678127335734184, + "grad_norm": 1.0387359857559204, + "learning_rate": 6.214583459191688e-05, + "loss": 3.0358, + "step": 22824 + }, + { + "epoch": 2.0679033317478654, + "grad_norm": 1.0445350408554077, + "learning_rate": 6.213979339092611e-05, + "loss": 2.6413, + "step": 22825 + }, + { + "epoch": 2.067993929922312, + "grad_norm": 0.975917637348175, + "learning_rate": 6.213375218993536e-05, + "loss": 2.5926, + "step": 22826 + }, + { + "epoch": 2.068084528096759, + "grad_norm": 1.0467982292175293, + "learning_rate": 6.212771098894461e-05, + "loss": 2.6442, + "step": 22827 + }, + { + "epoch": 2.0681751262712056, + "grad_norm": 0.842475950717926, + "learning_rate": 6.212166978795386e-05, + "loss": 1.8316, + "step": 22828 + }, + { + "epoch": 2.0682657244456526, + "grad_norm": 1.0303337574005127, + "learning_rate": 6.211562858696309e-05, + "loss": 2.7797, + "step": 22829 + }, + { + "epoch": 2.068356322620099, + "grad_norm": 0.9512374401092529, + "learning_rate": 6.210958738597234e-05, + "loss": 2.441, + "step": 22830 + }, + { + "epoch": 2.068446920794546, + "grad_norm": 1.0180364847183228, + "learning_rate": 6.210354618498157e-05, + "loss": 2.7949, + "step": 22831 + }, + { + "epoch": 2.0685375189689927, + "grad_norm": 0.8452637195587158, + "learning_rate": 6.209750498399082e-05, + "loss": 2.0136, + "step": 22832 + }, + { + "epoch": 2.0686281171434397, + "grad_norm": 1.0267465114593506, + "learning_rate": 6.209146378300005e-05, + "loss": 2.5964, + "step": 22833 + }, + { + "epoch": 2.0687187153178863, + "grad_norm": 1.04438054561615, + "learning_rate": 6.208542258200932e-05, + "loss": 2.8737, + "step": 22834 + }, + { + "epoch": 2.0688093134923333, + "grad_norm": 1.0039821863174438, + "learning_rate": 6.207938138101855e-05, + "loss": 2.7782, + "step": 22835 + }, + { + "epoch": 2.06889991166678, + "grad_norm": 0.9914148449897766, + "learning_rate": 6.20733401800278e-05, + "loss": 2.5427, + "step": 22836 + }, + { + "epoch": 2.068990509841227, + "grad_norm": 0.9810732007026672, + "learning_rate": 6.206729897903703e-05, + "loss": 2.6945, + "step": 22837 + }, + { + "epoch": 2.0690811080156735, + "grad_norm": 0.9655851721763611, + "learning_rate": 6.206125777804628e-05, + "loss": 2.5152, + "step": 22838 + }, + { + "epoch": 2.0691717061901205, + "grad_norm": 1.051182508468628, + "learning_rate": 6.205521657705553e-05, + "loss": 2.6805, + "step": 22839 + }, + { + "epoch": 2.069262304364567, + "grad_norm": 1.0394550561904907, + "learning_rate": 6.204917537606476e-05, + "loss": 2.7728, + "step": 22840 + }, + { + "epoch": 2.069352902539014, + "grad_norm": 1.07975435256958, + "learning_rate": 6.204313417507401e-05, + "loss": 2.4477, + "step": 22841 + }, + { + "epoch": 2.0694435007134606, + "grad_norm": 0.9524809718132019, + "learning_rate": 6.203709297408326e-05, + "loss": 2.6597, + "step": 22842 + }, + { + "epoch": 2.0695340988879076, + "grad_norm": 1.017693042755127, + "learning_rate": 6.20310517730925e-05, + "loss": 2.6389, + "step": 22843 + }, + { + "epoch": 2.069624697062354, + "grad_norm": 0.9670323133468628, + "learning_rate": 6.202501057210174e-05, + "loss": 2.7197, + "step": 22844 + }, + { + "epoch": 2.0697152952368008, + "grad_norm": 1.0129401683807373, + "learning_rate": 6.201896937111098e-05, + "loss": 2.8232, + "step": 22845 + }, + { + "epoch": 2.0698058934112478, + "grad_norm": 0.9486474990844727, + "learning_rate": 6.201292817012022e-05, + "loss": 2.5261, + "step": 22846 + }, + { + "epoch": 2.0698964915856948, + "grad_norm": 0.9713467359542847, + "learning_rate": 6.200688696912947e-05, + "loss": 2.7204, + "step": 22847 + }, + { + "epoch": 2.0699870897601413, + "grad_norm": 0.9184922575950623, + "learning_rate": 6.20008457681387e-05, + "loss": 2.4938, + "step": 22848 + }, + { + "epoch": 2.070077687934588, + "grad_norm": 1.0122603178024292, + "learning_rate": 6.199480456714796e-05, + "loss": 2.5276, + "step": 22849 + }, + { + "epoch": 2.070168286109035, + "grad_norm": 0.9866361618041992, + "learning_rate": 6.19887633661572e-05, + "loss": 2.6123, + "step": 22850 + }, + { + "epoch": 2.0702588842834815, + "grad_norm": 1.0383434295654297, + "learning_rate": 6.198272216516644e-05, + "loss": 2.5675, + "step": 22851 + }, + { + "epoch": 2.0703494824579285, + "grad_norm": 1.0281089544296265, + "learning_rate": 6.197668096417568e-05, + "loss": 2.7914, + "step": 22852 + }, + { + "epoch": 2.070440080632375, + "grad_norm": 0.9734753370285034, + "learning_rate": 6.197063976318492e-05, + "loss": 2.5723, + "step": 22853 + }, + { + "epoch": 2.070530678806822, + "grad_norm": 1.054943323135376, + "learning_rate": 6.196459856219416e-05, + "loss": 2.689, + "step": 22854 + }, + { + "epoch": 2.0706212769812686, + "grad_norm": 0.9179619550704956, + "learning_rate": 6.19585573612034e-05, + "loss": 1.9556, + "step": 22855 + }, + { + "epoch": 2.0707118751557156, + "grad_norm": 0.9660824537277222, + "learning_rate": 6.195251616021265e-05, + "loss": 2.4959, + "step": 22856 + }, + { + "epoch": 2.070802473330162, + "grad_norm": 0.9633544087409973, + "learning_rate": 6.19464749592219e-05, + "loss": 2.6693, + "step": 22857 + }, + { + "epoch": 2.070893071504609, + "grad_norm": 0.9936349391937256, + "learning_rate": 6.194043375823114e-05, + "loss": 2.5653, + "step": 22858 + }, + { + "epoch": 2.070983669679056, + "grad_norm": 0.8023314476013184, + "learning_rate": 6.193439255724038e-05, + "loss": 1.9901, + "step": 22859 + }, + { + "epoch": 2.071074267853503, + "grad_norm": 0.9323933720588684, + "learning_rate": 6.192835135624963e-05, + "loss": 2.3631, + "step": 22860 + }, + { + "epoch": 2.0711648660279494, + "grad_norm": 0.8551279902458191, + "learning_rate": 6.192231015525886e-05, + "loss": 2.1804, + "step": 22861 + }, + { + "epoch": 2.0712554642023964, + "grad_norm": 0.8958501219749451, + "learning_rate": 6.191626895426811e-05, + "loss": 1.9334, + "step": 22862 + }, + { + "epoch": 2.071346062376843, + "grad_norm": 1.247597336769104, + "learning_rate": 6.191022775327735e-05, + "loss": 2.3823, + "step": 22863 + }, + { + "epoch": 2.07143666055129, + "grad_norm": 1.0189483165740967, + "learning_rate": 6.190418655228661e-05, + "loss": 2.4651, + "step": 22864 + }, + { + "epoch": 2.0715272587257365, + "grad_norm": 1.0027194023132324, + "learning_rate": 6.189814535129584e-05, + "loss": 2.4966, + "step": 22865 + }, + { + "epoch": 2.0716178569001835, + "grad_norm": 0.8731092810630798, + "learning_rate": 6.189210415030509e-05, + "loss": 2.223, + "step": 22866 + }, + { + "epoch": 2.07170845507463, + "grad_norm": 1.0603424310684204, + "learning_rate": 6.188606294931432e-05, + "loss": 2.5864, + "step": 22867 + }, + { + "epoch": 2.071799053249077, + "grad_norm": 0.9353687167167664, + "learning_rate": 6.188002174832357e-05, + "loss": 2.3416, + "step": 22868 + }, + { + "epoch": 2.0718896514235237, + "grad_norm": 0.9634443521499634, + "learning_rate": 6.18739805473328e-05, + "loss": 2.4649, + "step": 22869 + }, + { + "epoch": 2.0719802495979707, + "grad_norm": 0.9554609060287476, + "learning_rate": 6.186793934634205e-05, + "loss": 2.7004, + "step": 22870 + }, + { + "epoch": 2.0720708477724172, + "grad_norm": 1.0038434267044067, + "learning_rate": 6.18618981453513e-05, + "loss": 2.66, + "step": 22871 + }, + { + "epoch": 2.0721614459468642, + "grad_norm": 0.9712157845497131, + "learning_rate": 6.185585694436055e-05, + "loss": 2.9399, + "step": 22872 + }, + { + "epoch": 2.072252044121311, + "grad_norm": 1.0453108549118042, + "learning_rate": 6.184981574336978e-05, + "loss": 2.8054, + "step": 22873 + }, + { + "epoch": 2.072342642295758, + "grad_norm": 0.995804488658905, + "learning_rate": 6.184377454237903e-05, + "loss": 2.5533, + "step": 22874 + }, + { + "epoch": 2.0724332404702044, + "grad_norm": 1.0370720624923706, + "learning_rate": 6.183773334138828e-05, + "loss": 2.8135, + "step": 22875 + }, + { + "epoch": 2.0725238386446514, + "grad_norm": 0.9475657343864441, + "learning_rate": 6.183169214039751e-05, + "loss": 2.4146, + "step": 22876 + }, + { + "epoch": 2.072614436819098, + "grad_norm": 0.9504573941230774, + "learning_rate": 6.182565093940676e-05, + "loss": 2.7299, + "step": 22877 + }, + { + "epoch": 2.072705034993545, + "grad_norm": 1.0480488538742065, + "learning_rate": 6.181960973841599e-05, + "loss": 2.4555, + "step": 22878 + }, + { + "epoch": 2.0727956331679915, + "grad_norm": 0.9317992329597473, + "learning_rate": 6.181356853742525e-05, + "loss": 2.6074, + "step": 22879 + }, + { + "epoch": 2.0728862313424385, + "grad_norm": 0.9345345497131348, + "learning_rate": 6.180752733643449e-05, + "loss": 2.351, + "step": 22880 + }, + { + "epoch": 2.072976829516885, + "grad_norm": 0.8881949186325073, + "learning_rate": 6.180148613544374e-05, + "loss": 2.0252, + "step": 22881 + }, + { + "epoch": 2.073067427691332, + "grad_norm": 0.9983474612236023, + "learning_rate": 6.179544493445297e-05, + "loss": 2.4545, + "step": 22882 + }, + { + "epoch": 2.0731580258657787, + "grad_norm": 0.9602903723716736, + "learning_rate": 6.178940373346222e-05, + "loss": 2.8416, + "step": 22883 + }, + { + "epoch": 2.0732486240402257, + "grad_norm": 1.0802042484283447, + "learning_rate": 6.178336253247145e-05, + "loss": 2.7069, + "step": 22884 + }, + { + "epoch": 2.0733392222146723, + "grad_norm": 0.9274361729621887, + "learning_rate": 6.17773213314807e-05, + "loss": 1.9158, + "step": 22885 + }, + { + "epoch": 2.0734298203891193, + "grad_norm": 0.9520278573036194, + "learning_rate": 6.177128013048993e-05, + "loss": 2.622, + "step": 22886 + }, + { + "epoch": 2.073520418563566, + "grad_norm": 0.9414560198783875, + "learning_rate": 6.17652389294992e-05, + "loss": 2.8005, + "step": 22887 + }, + { + "epoch": 2.073611016738013, + "grad_norm": 1.0530242919921875, + "learning_rate": 6.175919772850843e-05, + "loss": 2.6203, + "step": 22888 + }, + { + "epoch": 2.0737016149124594, + "grad_norm": 1.1328846216201782, + "learning_rate": 6.175315652751768e-05, + "loss": 2.5181, + "step": 22889 + }, + { + "epoch": 2.0737922130869064, + "grad_norm": 1.0348809957504272, + "learning_rate": 6.174711532652691e-05, + "loss": 2.853, + "step": 22890 + }, + { + "epoch": 2.073882811261353, + "grad_norm": 1.0527007579803467, + "learning_rate": 6.174107412553616e-05, + "loss": 2.5219, + "step": 22891 + }, + { + "epoch": 2.0739734094358, + "grad_norm": 1.0037925243377686, + "learning_rate": 6.17350329245454e-05, + "loss": 2.5284, + "step": 22892 + }, + { + "epoch": 2.0740640076102466, + "grad_norm": 0.9843792915344238, + "learning_rate": 6.172899172355464e-05, + "loss": 2.7482, + "step": 22893 + }, + { + "epoch": 2.0741546057846936, + "grad_norm": 0.9854533672332764, + "learning_rate": 6.17229505225639e-05, + "loss": 2.4778, + "step": 22894 + }, + { + "epoch": 2.07424520395914, + "grad_norm": 1.028953194618225, + "learning_rate": 6.171690932157313e-05, + "loss": 2.6466, + "step": 22895 + }, + { + "epoch": 2.074335802133587, + "grad_norm": 0.9854997992515564, + "learning_rate": 6.171086812058238e-05, + "loss": 2.6493, + "step": 22896 + }, + { + "epoch": 2.0744264003080337, + "grad_norm": 0.9441813826560974, + "learning_rate": 6.170482691959162e-05, + "loss": 2.5709, + "step": 22897 + }, + { + "epoch": 2.0745169984824807, + "grad_norm": 1.0141721963882446, + "learning_rate": 6.169878571860086e-05, + "loss": 2.6699, + "step": 22898 + }, + { + "epoch": 2.0746075966569273, + "grad_norm": 0.966541051864624, + "learning_rate": 6.16927445176101e-05, + "loss": 2.5247, + "step": 22899 + }, + { + "epoch": 2.0746981948313743, + "grad_norm": 1.0427652597427368, + "learning_rate": 6.168670331661935e-05, + "loss": 2.5222, + "step": 22900 + }, + { + "epoch": 2.074788793005821, + "grad_norm": 1.0109398365020752, + "learning_rate": 6.168066211562858e-05, + "loss": 2.6357, + "step": 22901 + }, + { + "epoch": 2.074879391180268, + "grad_norm": 1.0518617630004883, + "learning_rate": 6.167462091463784e-05, + "loss": 2.4146, + "step": 22902 + }, + { + "epoch": 2.0749699893547144, + "grad_norm": 0.9401484727859497, + "learning_rate": 6.166857971364707e-05, + "loss": 2.4635, + "step": 22903 + }, + { + "epoch": 2.0750605875291614, + "grad_norm": 0.878447413444519, + "learning_rate": 6.166253851265632e-05, + "loss": 2.5294, + "step": 22904 + }, + { + "epoch": 2.075151185703608, + "grad_norm": 1.011034369468689, + "learning_rate": 6.165649731166556e-05, + "loss": 2.6285, + "step": 22905 + }, + { + "epoch": 2.075241783878055, + "grad_norm": 0.924384593963623, + "learning_rate": 6.16504561106748e-05, + "loss": 2.4179, + "step": 22906 + }, + { + "epoch": 2.0753323820525016, + "grad_norm": 0.9856852889060974, + "learning_rate": 6.164441490968405e-05, + "loss": 2.6241, + "step": 22907 + }, + { + "epoch": 2.0754229802269486, + "grad_norm": 0.9975219368934631, + "learning_rate": 6.163837370869329e-05, + "loss": 2.3829, + "step": 22908 + }, + { + "epoch": 2.075513578401395, + "grad_norm": 1.0560072660446167, + "learning_rate": 6.163233250770253e-05, + "loss": 2.8882, + "step": 22909 + }, + { + "epoch": 2.075604176575842, + "grad_norm": 0.9582732319831848, + "learning_rate": 6.162629130671178e-05, + "loss": 2.6008, + "step": 22910 + }, + { + "epoch": 2.0756947747502887, + "grad_norm": 0.973103940486908, + "learning_rate": 6.162025010572103e-05, + "loss": 2.8968, + "step": 22911 + }, + { + "epoch": 2.0757853729247357, + "grad_norm": 1.0009552240371704, + "learning_rate": 6.161420890473026e-05, + "loss": 2.6519, + "step": 22912 + }, + { + "epoch": 2.0758759710991823, + "grad_norm": 1.0155833959579468, + "learning_rate": 6.160816770373951e-05, + "loss": 2.6445, + "step": 22913 + }, + { + "epoch": 2.0759665692736293, + "grad_norm": 0.9221788048744202, + "learning_rate": 6.160212650274874e-05, + "loss": 2.353, + "step": 22914 + }, + { + "epoch": 2.076057167448076, + "grad_norm": 0.9912570118904114, + "learning_rate": 6.159608530175799e-05, + "loss": 2.8741, + "step": 22915 + }, + { + "epoch": 2.076147765622523, + "grad_norm": 0.9681047201156616, + "learning_rate": 6.159004410076723e-05, + "loss": 2.6634, + "step": 22916 + }, + { + "epoch": 2.0762383637969695, + "grad_norm": 0.9552757143974304, + "learning_rate": 6.158400289977649e-05, + "loss": 2.0615, + "step": 22917 + }, + { + "epoch": 2.0763289619714165, + "grad_norm": 1.0112870931625366, + "learning_rate": 6.157796169878572e-05, + "loss": 2.6035, + "step": 22918 + }, + { + "epoch": 2.076419560145863, + "grad_norm": 0.9672481417655945, + "learning_rate": 6.157192049779497e-05, + "loss": 2.6147, + "step": 22919 + }, + { + "epoch": 2.07651015832031, + "grad_norm": 1.0379700660705566, + "learning_rate": 6.15658792968042e-05, + "loss": 2.6191, + "step": 22920 + }, + { + "epoch": 2.0766007564947566, + "grad_norm": 0.9563535451889038, + "learning_rate": 6.155983809581345e-05, + "loss": 2.7354, + "step": 22921 + }, + { + "epoch": 2.0766913546692036, + "grad_norm": 1.0056614875793457, + "learning_rate": 6.155379689482268e-05, + "loss": 2.7209, + "step": 22922 + }, + { + "epoch": 2.07678195284365, + "grad_norm": 1.000431776046753, + "learning_rate": 6.154775569383193e-05, + "loss": 2.6476, + "step": 22923 + }, + { + "epoch": 2.0768725510180968, + "grad_norm": 1.0600109100341797, + "learning_rate": 6.154171449284118e-05, + "loss": 2.5822, + "step": 22924 + }, + { + "epoch": 2.0769631491925438, + "grad_norm": 0.9808324575424194, + "learning_rate": 6.153567329185043e-05, + "loss": 2.8998, + "step": 22925 + }, + { + "epoch": 2.0770537473669908, + "grad_norm": 0.9498834609985352, + "learning_rate": 6.152963209085967e-05, + "loss": 2.5566, + "step": 22926 + }, + { + "epoch": 2.0771443455414373, + "grad_norm": 0.9504066705703735, + "learning_rate": 6.152359088986891e-05, + "loss": 2.4393, + "step": 22927 + }, + { + "epoch": 2.077234943715884, + "grad_norm": 0.8673797249794006, + "learning_rate": 6.151754968887816e-05, + "loss": 1.8941, + "step": 22928 + }, + { + "epoch": 2.077325541890331, + "grad_norm": 0.9362795352935791, + "learning_rate": 6.151150848788739e-05, + "loss": 2.5096, + "step": 22929 + }, + { + "epoch": 2.0774161400647775, + "grad_norm": 0.9938153624534607, + "learning_rate": 6.150546728689664e-05, + "loss": 1.8278, + "step": 22930 + }, + { + "epoch": 2.0775067382392245, + "grad_norm": 1.02004873752594, + "learning_rate": 6.149942608590587e-05, + "loss": 2.6888, + "step": 22931 + }, + { + "epoch": 2.077597336413671, + "grad_norm": 0.7925088405609131, + "learning_rate": 6.149338488491513e-05, + "loss": 1.3179, + "step": 22932 + }, + { + "epoch": 2.077687934588118, + "grad_norm": 1.0235720872879028, + "learning_rate": 6.148734368392437e-05, + "loss": 2.9905, + "step": 22933 + }, + { + "epoch": 2.0777785327625646, + "grad_norm": 1.1714164018630981, + "learning_rate": 6.148130248293361e-05, + "loss": 2.6334, + "step": 22934 + }, + { + "epoch": 2.0778691309370116, + "grad_norm": 0.9659576416015625, + "learning_rate": 6.147526128194285e-05, + "loss": 2.7285, + "step": 22935 + }, + { + "epoch": 2.077959729111458, + "grad_norm": 1.0171908140182495, + "learning_rate": 6.14692200809521e-05, + "loss": 2.7247, + "step": 22936 + }, + { + "epoch": 2.078050327285905, + "grad_norm": 1.0439441204071045, + "learning_rate": 6.146317887996133e-05, + "loss": 2.6768, + "step": 22937 + }, + { + "epoch": 2.078140925460352, + "grad_norm": 0.9075984358787537, + "learning_rate": 6.145713767897058e-05, + "loss": 2.5433, + "step": 22938 + }, + { + "epoch": 2.078231523634799, + "grad_norm": 0.9393426775932312, + "learning_rate": 6.145109647797983e-05, + "loss": 2.5648, + "step": 22939 + }, + { + "epoch": 2.0783221218092454, + "grad_norm": 0.9762976765632629, + "learning_rate": 6.144505527698907e-05, + "loss": 2.7145, + "step": 22940 + }, + { + "epoch": 2.0784127199836924, + "grad_norm": 0.9612032771110535, + "learning_rate": 6.143901407599831e-05, + "loss": 2.4139, + "step": 22941 + }, + { + "epoch": 2.078503318158139, + "grad_norm": 0.9634888768196106, + "learning_rate": 6.143297287500755e-05, + "loss": 1.8191, + "step": 22942 + }, + { + "epoch": 2.078593916332586, + "grad_norm": 1.0765644311904907, + "learning_rate": 6.14269316740168e-05, + "loss": 2.534, + "step": 22943 + }, + { + "epoch": 2.0786845145070325, + "grad_norm": 1.0496007204055786, + "learning_rate": 6.142089047302604e-05, + "loss": 2.4562, + "step": 22944 + }, + { + "epoch": 2.0787751126814795, + "grad_norm": 0.9787802696228027, + "learning_rate": 6.141484927203528e-05, + "loss": 2.5742, + "step": 22945 + }, + { + "epoch": 2.078865710855926, + "grad_norm": 0.92730712890625, + "learning_rate": 6.140880807104452e-05, + "loss": 2.6337, + "step": 22946 + }, + { + "epoch": 2.078956309030373, + "grad_norm": 1.0336265563964844, + "learning_rate": 6.140276687005378e-05, + "loss": 2.6147, + "step": 22947 + }, + { + "epoch": 2.0790469072048197, + "grad_norm": 0.9394655823707581, + "learning_rate": 6.139672566906301e-05, + "loss": 2.6769, + "step": 22948 + }, + { + "epoch": 2.0791375053792667, + "grad_norm": 1.0547178983688354, + "learning_rate": 6.139068446807226e-05, + "loss": 2.5623, + "step": 22949 + }, + { + "epoch": 2.0792281035537132, + "grad_norm": 0.9655873775482178, + "learning_rate": 6.13846432670815e-05, + "loss": 2.6671, + "step": 22950 + }, + { + "epoch": 2.0793187017281602, + "grad_norm": 0.910201907157898, + "learning_rate": 6.137860206609074e-05, + "loss": 2.5827, + "step": 22951 + }, + { + "epoch": 2.079409299902607, + "grad_norm": 1.075461983680725, + "learning_rate": 6.137256086509998e-05, + "loss": 2.4275, + "step": 22952 + }, + { + "epoch": 2.079499898077054, + "grad_norm": 0.7679852843284607, + "learning_rate": 6.136651966410922e-05, + "loss": 1.8402, + "step": 22953 + }, + { + "epoch": 2.0795904962515004, + "grad_norm": 0.978100061416626, + "learning_rate": 6.136047846311847e-05, + "loss": 2.1482, + "step": 22954 + }, + { + "epoch": 2.0796810944259474, + "grad_norm": 0.9235230088233948, + "learning_rate": 6.135443726212772e-05, + "loss": 2.7669, + "step": 22955 + }, + { + "epoch": 2.079771692600394, + "grad_norm": 0.8506850004196167, + "learning_rate": 6.134839606113695e-05, + "loss": 2.0103, + "step": 22956 + }, + { + "epoch": 2.079862290774841, + "grad_norm": 0.851283848285675, + "learning_rate": 6.13423548601462e-05, + "loss": 1.8635, + "step": 22957 + }, + { + "epoch": 2.0799528889492875, + "grad_norm": 1.0122089385986328, + "learning_rate": 6.133631365915545e-05, + "loss": 2.3657, + "step": 22958 + }, + { + "epoch": 2.0800434871237345, + "grad_norm": 0.9307143092155457, + "learning_rate": 6.133027245816468e-05, + "loss": 2.0866, + "step": 22959 + }, + { + "epoch": 2.080134085298181, + "grad_norm": 0.974539577960968, + "learning_rate": 6.132423125717393e-05, + "loss": 2.5076, + "step": 22960 + }, + { + "epoch": 2.080224683472628, + "grad_norm": 1.0199573040008545, + "learning_rate": 6.131819005618316e-05, + "loss": 2.5501, + "step": 22961 + }, + { + "epoch": 2.0803152816470747, + "grad_norm": 0.960294783115387, + "learning_rate": 6.131214885519243e-05, + "loss": 2.5863, + "step": 22962 + }, + { + "epoch": 2.0804058798215217, + "grad_norm": 0.9379286170005798, + "learning_rate": 6.130610765420166e-05, + "loss": 2.6374, + "step": 22963 + }, + { + "epoch": 2.0804964779959683, + "grad_norm": 0.9662681818008423, + "learning_rate": 6.130006645321091e-05, + "loss": 2.6678, + "step": 22964 + }, + { + "epoch": 2.0805870761704153, + "grad_norm": 0.9775801301002502, + "learning_rate": 6.129402525222014e-05, + "loss": 2.386, + "step": 22965 + }, + { + "epoch": 2.080677674344862, + "grad_norm": 1.0633844137191772, + "learning_rate": 6.128798405122939e-05, + "loss": 2.4782, + "step": 22966 + }, + { + "epoch": 2.080768272519309, + "grad_norm": 0.9844827651977539, + "learning_rate": 6.128194285023862e-05, + "loss": 2.5064, + "step": 22967 + }, + { + "epoch": 2.0808588706937554, + "grad_norm": 1.0528842210769653, + "learning_rate": 6.127590164924787e-05, + "loss": 2.4927, + "step": 22968 + }, + { + "epoch": 2.0809494688682024, + "grad_norm": 0.9329504370689392, + "learning_rate": 6.126986044825712e-05, + "loss": 1.9923, + "step": 22969 + }, + { + "epoch": 2.081040067042649, + "grad_norm": 1.074687123298645, + "learning_rate": 6.126381924726637e-05, + "loss": 2.6961, + "step": 22970 + }, + { + "epoch": 2.081130665217096, + "grad_norm": 0.9491421580314636, + "learning_rate": 6.12577780462756e-05, + "loss": 2.6096, + "step": 22971 + }, + { + "epoch": 2.0812212633915426, + "grad_norm": 1.0169603824615479, + "learning_rate": 6.125173684528485e-05, + "loss": 2.406, + "step": 22972 + }, + { + "epoch": 2.0813118615659896, + "grad_norm": 1.0986006259918213, + "learning_rate": 6.124569564429408e-05, + "loss": 2.5012, + "step": 22973 + }, + { + "epoch": 2.081402459740436, + "grad_norm": 0.9675447940826416, + "learning_rate": 6.123965444330333e-05, + "loss": 2.8389, + "step": 22974 + }, + { + "epoch": 2.081493057914883, + "grad_norm": 1.0371205806732178, + "learning_rate": 6.123361324231258e-05, + "loss": 2.6589, + "step": 22975 + }, + { + "epoch": 2.0815836560893297, + "grad_norm": 0.9448294043540955, + "learning_rate": 6.122757204132181e-05, + "loss": 2.6424, + "step": 22976 + }, + { + "epoch": 2.0816742542637767, + "grad_norm": 1.0167380571365356, + "learning_rate": 6.122153084033106e-05, + "loss": 2.5185, + "step": 22977 + }, + { + "epoch": 2.0817648524382233, + "grad_norm": 0.890972912311554, + "learning_rate": 6.12154896393403e-05, + "loss": 2.4445, + "step": 22978 + }, + { + "epoch": 2.0818554506126703, + "grad_norm": 0.9656214714050293, + "learning_rate": 6.120944843834955e-05, + "loss": 2.586, + "step": 22979 + }, + { + "epoch": 2.081946048787117, + "grad_norm": 0.9620306491851807, + "learning_rate": 6.120340723735879e-05, + "loss": 2.6193, + "step": 22980 + }, + { + "epoch": 2.082036646961564, + "grad_norm": 1.1076076030731201, + "learning_rate": 6.119736603636804e-05, + "loss": 2.8152, + "step": 22981 + }, + { + "epoch": 2.0821272451360104, + "grad_norm": 0.8333880305290222, + "learning_rate": 6.119132483537727e-05, + "loss": 1.8767, + "step": 22982 + }, + { + "epoch": 2.0822178433104575, + "grad_norm": 0.9672382473945618, + "learning_rate": 6.118528363438652e-05, + "loss": 2.7733, + "step": 22983 + }, + { + "epoch": 2.082308441484904, + "grad_norm": 0.9877290725708008, + "learning_rate": 6.117924243339576e-05, + "loss": 2.6596, + "step": 22984 + }, + { + "epoch": 2.082399039659351, + "grad_norm": 0.9983770847320557, + "learning_rate": 6.117320123240501e-05, + "loss": 2.6317, + "step": 22985 + }, + { + "epoch": 2.0824896378337976, + "grad_norm": 0.9843353629112244, + "learning_rate": 6.116716003141425e-05, + "loss": 2.7616, + "step": 22986 + }, + { + "epoch": 2.0825802360082446, + "grad_norm": 0.9720288515090942, + "learning_rate": 6.11611188304235e-05, + "loss": 2.4776, + "step": 22987 + }, + { + "epoch": 2.082670834182691, + "grad_norm": 1.0118181705474854, + "learning_rate": 6.115507762943273e-05, + "loss": 2.5392, + "step": 22988 + }, + { + "epoch": 2.082761432357138, + "grad_norm": 0.8746861219406128, + "learning_rate": 6.114903642844198e-05, + "loss": 1.8335, + "step": 22989 + }, + { + "epoch": 2.0828520305315847, + "grad_norm": 0.9313881993293762, + "learning_rate": 6.114299522745121e-05, + "loss": 2.6451, + "step": 22990 + }, + { + "epoch": 2.0829426287060318, + "grad_norm": 1.10030198097229, + "learning_rate": 6.113695402646046e-05, + "loss": 2.3117, + "step": 22991 + }, + { + "epoch": 2.0830332268804783, + "grad_norm": 0.912164568901062, + "learning_rate": 6.11309128254697e-05, + "loss": 1.9579, + "step": 22992 + }, + { + "epoch": 2.0831238250549253, + "grad_norm": 0.9809117913246155, + "learning_rate": 6.112487162447895e-05, + "loss": 2.524, + "step": 22993 + }, + { + "epoch": 2.083214423229372, + "grad_norm": 0.9281859993934631, + "learning_rate": 6.11188304234882e-05, + "loss": 2.8266, + "step": 22994 + }, + { + "epoch": 2.083305021403819, + "grad_norm": 1.0257859230041504, + "learning_rate": 6.111278922249743e-05, + "loss": 2.4678, + "step": 22995 + }, + { + "epoch": 2.0833956195782655, + "grad_norm": 0.9972684383392334, + "learning_rate": 6.110674802150668e-05, + "loss": 2.5104, + "step": 22996 + }, + { + "epoch": 2.0834862177527125, + "grad_norm": 0.9890825748443604, + "learning_rate": 6.110070682051592e-05, + "loss": 2.6475, + "step": 22997 + }, + { + "epoch": 2.083576815927159, + "grad_norm": 0.8594781756401062, + "learning_rate": 6.109466561952516e-05, + "loss": 1.8843, + "step": 22998 + }, + { + "epoch": 2.083667414101606, + "grad_norm": 0.990338146686554, + "learning_rate": 6.108862441853441e-05, + "loss": 2.6836, + "step": 22999 + }, + { + "epoch": 2.0837580122760526, + "grad_norm": 0.9578921794891357, + "learning_rate": 6.108258321754366e-05, + "loss": 2.2118, + "step": 23000 + }, + { + "epoch": 2.0838486104504996, + "grad_norm": 0.921147346496582, + "learning_rate": 6.107654201655289e-05, + "loss": 2.6615, + "step": 23001 + }, + { + "epoch": 2.083939208624946, + "grad_norm": 0.9906714558601379, + "learning_rate": 6.107050081556214e-05, + "loss": 2.543, + "step": 23002 + }, + { + "epoch": 2.084029806799393, + "grad_norm": 0.9654662609100342, + "learning_rate": 6.106445961457137e-05, + "loss": 2.5846, + "step": 23003 + }, + { + "epoch": 2.0841204049738398, + "grad_norm": 0.9402145743370056, + "learning_rate": 6.105841841358062e-05, + "loss": 2.5447, + "step": 23004 + }, + { + "epoch": 2.084211003148287, + "grad_norm": 0.9681181311607361, + "learning_rate": 6.105237721258986e-05, + "loss": 2.3317, + "step": 23005 + }, + { + "epoch": 2.0843016013227333, + "grad_norm": 0.8755001425743103, + "learning_rate": 6.10463360115991e-05, + "loss": 2.0092, + "step": 23006 + }, + { + "epoch": 2.08439219949718, + "grad_norm": 1.0273438692092896, + "learning_rate": 6.104029481060835e-05, + "loss": 2.5467, + "step": 23007 + }, + { + "epoch": 2.084482797671627, + "grad_norm": 0.9545395970344543, + "learning_rate": 6.10342536096176e-05, + "loss": 2.484, + "step": 23008 + }, + { + "epoch": 2.084573395846074, + "grad_norm": 1.022344708442688, + "learning_rate": 6.102821240862684e-05, + "loss": 2.6066, + "step": 23009 + }, + { + "epoch": 2.0846639940205205, + "grad_norm": 0.7936359643936157, + "learning_rate": 6.102217120763608e-05, + "loss": 1.9662, + "step": 23010 + }, + { + "epoch": 2.084754592194967, + "grad_norm": 0.7629531025886536, + "learning_rate": 6.101613000664532e-05, + "loss": 1.7443, + "step": 23011 + }, + { + "epoch": 2.084845190369414, + "grad_norm": 0.9663554430007935, + "learning_rate": 6.101008880565456e-05, + "loss": 2.499, + "step": 23012 + }, + { + "epoch": 2.0849357885438606, + "grad_norm": 1.063083291053772, + "learning_rate": 6.10040476046638e-05, + "loss": 2.6201, + "step": 23013 + }, + { + "epoch": 2.0850263867183076, + "grad_norm": 1.0188267230987549, + "learning_rate": 6.099800640367306e-05, + "loss": 2.3974, + "step": 23014 + }, + { + "epoch": 2.085116984892754, + "grad_norm": 0.9806586503982544, + "learning_rate": 6.09919652026823e-05, + "loss": 2.8391, + "step": 23015 + }, + { + "epoch": 2.085207583067201, + "grad_norm": 1.0927220582962036, + "learning_rate": 6.098592400169154e-05, + "loss": 2.5396, + "step": 23016 + }, + { + "epoch": 2.085298181241648, + "grad_norm": 0.9526028037071228, + "learning_rate": 6.097988280070078e-05, + "loss": 2.8134, + "step": 23017 + }, + { + "epoch": 2.085388779416095, + "grad_norm": 1.036973476409912, + "learning_rate": 6.097384159971003e-05, + "loss": 2.6691, + "step": 23018 + }, + { + "epoch": 2.0854793775905414, + "grad_norm": 1.0709450244903564, + "learning_rate": 6.096780039871927e-05, + "loss": 2.9034, + "step": 23019 + }, + { + "epoch": 2.0855699757649884, + "grad_norm": 0.9346066117286682, + "learning_rate": 6.096175919772851e-05, + "loss": 2.5822, + "step": 23020 + }, + { + "epoch": 2.085660573939435, + "grad_norm": 0.9282328486442566, + "learning_rate": 6.095571799673775e-05, + "loss": 2.6219, + "step": 23021 + }, + { + "epoch": 2.085751172113882, + "grad_norm": 1.01582932472229, + "learning_rate": 6.0949676795747004e-05, + "loss": 2.5166, + "step": 23022 + }, + { + "epoch": 2.0858417702883285, + "grad_norm": 0.9515308141708374, + "learning_rate": 6.0943635594756245e-05, + "loss": 2.4775, + "step": 23023 + }, + { + "epoch": 2.0859323684627755, + "grad_norm": 1.0773999691009521, + "learning_rate": 6.0937594393765486e-05, + "loss": 2.5418, + "step": 23024 + }, + { + "epoch": 2.086022966637222, + "grad_norm": 0.9353301525115967, + "learning_rate": 6.093155319277473e-05, + "loss": 2.199, + "step": 23025 + }, + { + "epoch": 2.086113564811669, + "grad_norm": 1.0076994895935059, + "learning_rate": 6.092551199178397e-05, + "loss": 2.7439, + "step": 23026 + }, + { + "epoch": 2.0862041629861157, + "grad_norm": 1.0465266704559326, + "learning_rate": 6.091947079079321e-05, + "loss": 2.5599, + "step": 23027 + }, + { + "epoch": 2.0862947611605627, + "grad_norm": 0.9851424694061279, + "learning_rate": 6.091342958980245e-05, + "loss": 2.7482, + "step": 23028 + }, + { + "epoch": 2.0863853593350092, + "grad_norm": 0.9944718480110168, + "learning_rate": 6.0907388388811704e-05, + "loss": 2.7074, + "step": 23029 + }, + { + "epoch": 2.0864759575094562, + "grad_norm": 0.9267707467079163, + "learning_rate": 6.0901347187820944e-05, + "loss": 2.6241, + "step": 23030 + }, + { + "epoch": 2.086566555683903, + "grad_norm": 1.0045915842056274, + "learning_rate": 6.0895305986830185e-05, + "loss": 2.3923, + "step": 23031 + }, + { + "epoch": 2.08665715385835, + "grad_norm": 0.9655470848083496, + "learning_rate": 6.0889264785839426e-05, + "loss": 2.5612, + "step": 23032 + }, + { + "epoch": 2.0867477520327964, + "grad_norm": 0.9656202793121338, + "learning_rate": 6.088322358484867e-05, + "loss": 2.3923, + "step": 23033 + }, + { + "epoch": 2.0868383502072434, + "grad_norm": 0.9159496426582336, + "learning_rate": 6.0877182383857915e-05, + "loss": 2.496, + "step": 23034 + }, + { + "epoch": 2.08692894838169, + "grad_norm": 1.0499385595321655, + "learning_rate": 6.0871141182867155e-05, + "loss": 2.846, + "step": 23035 + }, + { + "epoch": 2.087019546556137, + "grad_norm": 1.25364089012146, + "learning_rate": 6.0865099981876396e-05, + "loss": 3.2078, + "step": 23036 + }, + { + "epoch": 2.0871101447305835, + "grad_norm": 0.9287804365158081, + "learning_rate": 6.0859058780885644e-05, + "loss": 2.5539, + "step": 23037 + }, + { + "epoch": 2.0872007429050305, + "grad_norm": 0.983586311340332, + "learning_rate": 6.085301757989489e-05, + "loss": 2.7674, + "step": 23038 + }, + { + "epoch": 2.087291341079477, + "grad_norm": 0.9804332256317139, + "learning_rate": 6.084697637890413e-05, + "loss": 2.7977, + "step": 23039 + }, + { + "epoch": 2.087381939253924, + "grad_norm": 1.020670771598816, + "learning_rate": 6.084093517791337e-05, + "loss": 2.6522, + "step": 23040 + }, + { + "epoch": 2.0874725374283707, + "grad_norm": 0.9995073080062866, + "learning_rate": 6.0834893976922614e-05, + "loss": 2.4672, + "step": 23041 + }, + { + "epoch": 2.0875631356028177, + "grad_norm": 0.9908020496368408, + "learning_rate": 6.0828852775931855e-05, + "loss": 2.6639, + "step": 23042 + }, + { + "epoch": 2.0876537337772643, + "grad_norm": 0.9705966114997864, + "learning_rate": 6.0822811574941096e-05, + "loss": 2.3814, + "step": 23043 + }, + { + "epoch": 2.0877443319517113, + "grad_norm": 0.9798986315727234, + "learning_rate": 6.081677037395035e-05, + "loss": 2.7967, + "step": 23044 + }, + { + "epoch": 2.087834930126158, + "grad_norm": 0.841187596321106, + "learning_rate": 6.081072917295959e-05, + "loss": 1.9708, + "step": 23045 + }, + { + "epoch": 2.087925528300605, + "grad_norm": 0.9770022034645081, + "learning_rate": 6.080468797196883e-05, + "loss": 2.3367, + "step": 23046 + }, + { + "epoch": 2.0880161264750514, + "grad_norm": 1.0801280736923218, + "learning_rate": 6.079864677097807e-05, + "loss": 2.3845, + "step": 23047 + }, + { + "epoch": 2.0881067246494984, + "grad_norm": 0.8780736923217773, + "learning_rate": 6.079260556998731e-05, + "loss": 1.8987, + "step": 23048 + }, + { + "epoch": 2.088197322823945, + "grad_norm": 1.0512498617172241, + "learning_rate": 6.0786564368996554e-05, + "loss": 2.6201, + "step": 23049 + }, + { + "epoch": 2.088287920998392, + "grad_norm": 1.0192821025848389, + "learning_rate": 6.07805231680058e-05, + "loss": 2.9244, + "step": 23050 + }, + { + "epoch": 2.0883785191728386, + "grad_norm": 1.0060572624206543, + "learning_rate": 6.077448196701504e-05, + "loss": 2.3584, + "step": 23051 + }, + { + "epoch": 2.0884691173472856, + "grad_norm": 0.8908597230911255, + "learning_rate": 6.076844076602429e-05, + "loss": 2.3774, + "step": 23052 + }, + { + "epoch": 2.088559715521732, + "grad_norm": 1.0366522073745728, + "learning_rate": 6.076239956503353e-05, + "loss": 3.1292, + "step": 23053 + }, + { + "epoch": 2.088650313696179, + "grad_norm": 0.9485582709312439, + "learning_rate": 6.075635836404278e-05, + "loss": 2.6705, + "step": 23054 + }, + { + "epoch": 2.0887409118706257, + "grad_norm": 0.991101086139679, + "learning_rate": 6.075031716305202e-05, + "loss": 2.6343, + "step": 23055 + }, + { + "epoch": 2.0888315100450727, + "grad_norm": 1.0218197107315063, + "learning_rate": 6.074427596206126e-05, + "loss": 2.5865, + "step": 23056 + }, + { + "epoch": 2.0889221082195193, + "grad_norm": 1.0221726894378662, + "learning_rate": 6.07382347610705e-05, + "loss": 2.6592, + "step": 23057 + }, + { + "epoch": 2.0890127063939663, + "grad_norm": 1.0056538581848145, + "learning_rate": 6.073219356007974e-05, + "loss": 2.7432, + "step": 23058 + }, + { + "epoch": 2.089103304568413, + "grad_norm": 0.9679771661758423, + "learning_rate": 6.0726152359088996e-05, + "loss": 2.7869, + "step": 23059 + }, + { + "epoch": 2.08919390274286, + "grad_norm": 1.0111163854599, + "learning_rate": 6.072011115809824e-05, + "loss": 2.7244, + "step": 23060 + }, + { + "epoch": 2.0892845009173064, + "grad_norm": 0.8159533739089966, + "learning_rate": 6.071406995710748e-05, + "loss": 1.8171, + "step": 23061 + }, + { + "epoch": 2.0893750990917535, + "grad_norm": 0.9786754846572876, + "learning_rate": 6.070802875611672e-05, + "loss": 2.6116, + "step": 23062 + }, + { + "epoch": 2.0894656972662, + "grad_norm": 1.1308372020721436, + "learning_rate": 6.070198755512596e-05, + "loss": 2.4373, + "step": 23063 + }, + { + "epoch": 2.089556295440647, + "grad_norm": 0.88519686460495, + "learning_rate": 6.06959463541352e-05, + "loss": 2.0379, + "step": 23064 + }, + { + "epoch": 2.0896468936150936, + "grad_norm": 0.942015528678894, + "learning_rate": 6.068990515314444e-05, + "loss": 2.478, + "step": 23065 + }, + { + "epoch": 2.0897374917895406, + "grad_norm": 0.9787017107009888, + "learning_rate": 6.068386395215369e-05, + "loss": 2.8297, + "step": 23066 + }, + { + "epoch": 2.089828089963987, + "grad_norm": 1.0381484031677246, + "learning_rate": 6.0677822751162937e-05, + "loss": 2.453, + "step": 23067 + }, + { + "epoch": 2.089918688138434, + "grad_norm": 0.9413816332817078, + "learning_rate": 6.067178155017218e-05, + "loss": 2.5384, + "step": 23068 + }, + { + "epoch": 2.0900092863128807, + "grad_norm": 1.0615020990371704, + "learning_rate": 6.066574034918142e-05, + "loss": 2.7346, + "step": 23069 + }, + { + "epoch": 2.0900998844873278, + "grad_norm": 0.9440195560455322, + "learning_rate": 6.0659699148190666e-05, + "loss": 2.5247, + "step": 23070 + }, + { + "epoch": 2.0901904826617743, + "grad_norm": 0.7919712066650391, + "learning_rate": 6.0653657947199907e-05, + "loss": 1.8463, + "step": 23071 + }, + { + "epoch": 2.0902810808362213, + "grad_norm": 0.9319653511047363, + "learning_rate": 6.064761674620915e-05, + "loss": 2.6959, + "step": 23072 + }, + { + "epoch": 2.090371679010668, + "grad_norm": 0.9885264039039612, + "learning_rate": 6.064157554521839e-05, + "loss": 2.5382, + "step": 23073 + }, + { + "epoch": 2.090462277185115, + "grad_norm": 0.969397246837616, + "learning_rate": 6.063553434422764e-05, + "loss": 2.7287, + "step": 23074 + }, + { + "epoch": 2.0905528753595615, + "grad_norm": 1.0783857107162476, + "learning_rate": 6.0629493143236883e-05, + "loss": 2.3056, + "step": 23075 + }, + { + "epoch": 2.0906434735340085, + "grad_norm": 1.024861454963684, + "learning_rate": 6.0623451942246124e-05, + "loss": 2.7163, + "step": 23076 + }, + { + "epoch": 2.090734071708455, + "grad_norm": 1.0472444295883179, + "learning_rate": 6.0617410741255365e-05, + "loss": 2.5827, + "step": 23077 + }, + { + "epoch": 2.090824669882902, + "grad_norm": 0.9936532974243164, + "learning_rate": 6.0611369540264606e-05, + "loss": 2.6796, + "step": 23078 + }, + { + "epoch": 2.0909152680573486, + "grad_norm": 0.9924523234367371, + "learning_rate": 6.060532833927385e-05, + "loss": 2.5998, + "step": 23079 + }, + { + "epoch": 2.0910058662317956, + "grad_norm": 0.9559819102287292, + "learning_rate": 6.059928713828309e-05, + "loss": 2.4845, + "step": 23080 + }, + { + "epoch": 2.091096464406242, + "grad_norm": 0.994963526725769, + "learning_rate": 6.059324593729233e-05, + "loss": 2.6109, + "step": 23081 + }, + { + "epoch": 2.091187062580689, + "grad_norm": 0.9959737658500671, + "learning_rate": 6.058720473630158e-05, + "loss": 2.6944, + "step": 23082 + }, + { + "epoch": 2.0912776607551358, + "grad_norm": 0.8527446389198303, + "learning_rate": 6.0581163535310824e-05, + "loss": 1.8735, + "step": 23083 + }, + { + "epoch": 2.091368258929583, + "grad_norm": 0.8564149141311646, + "learning_rate": 6.0575122334320064e-05, + "loss": 2.0261, + "step": 23084 + }, + { + "epoch": 2.0914588571040293, + "grad_norm": 0.955910861492157, + "learning_rate": 6.0569081133329305e-05, + "loss": 2.5658, + "step": 23085 + }, + { + "epoch": 2.091549455278476, + "grad_norm": 1.0230004787445068, + "learning_rate": 6.056303993233855e-05, + "loss": 2.6568, + "step": 23086 + }, + { + "epoch": 2.091640053452923, + "grad_norm": 1.0270613431930542, + "learning_rate": 6.0556998731347794e-05, + "loss": 2.6834, + "step": 23087 + }, + { + "epoch": 2.09173065162737, + "grad_norm": 0.9981997013092041, + "learning_rate": 6.0550957530357035e-05, + "loss": 2.5257, + "step": 23088 + }, + { + "epoch": 2.0918212498018165, + "grad_norm": 0.9777645468711853, + "learning_rate": 6.054491632936629e-05, + "loss": 2.6663, + "step": 23089 + }, + { + "epoch": 2.091911847976263, + "grad_norm": 0.9671030640602112, + "learning_rate": 6.053887512837553e-05, + "loss": 2.5865, + "step": 23090 + }, + { + "epoch": 2.09200244615071, + "grad_norm": 0.9844585061073303, + "learning_rate": 6.053283392738477e-05, + "loss": 2.509, + "step": 23091 + }, + { + "epoch": 2.0920930443251566, + "grad_norm": 0.96159428358078, + "learning_rate": 6.052679272639401e-05, + "loss": 2.7113, + "step": 23092 + }, + { + "epoch": 2.0921836424996036, + "grad_norm": 1.1405848264694214, + "learning_rate": 6.052075152540325e-05, + "loss": 2.5087, + "step": 23093 + }, + { + "epoch": 2.09227424067405, + "grad_norm": 0.8719161152839661, + "learning_rate": 6.051471032441249e-05, + "loss": 1.83, + "step": 23094 + }, + { + "epoch": 2.092364838848497, + "grad_norm": 1.1104525327682495, + "learning_rate": 6.0508669123421734e-05, + "loss": 2.7395, + "step": 23095 + }, + { + "epoch": 2.092455437022944, + "grad_norm": 0.9447026252746582, + "learning_rate": 6.0502627922430975e-05, + "loss": 2.4696, + "step": 23096 + }, + { + "epoch": 2.092546035197391, + "grad_norm": 0.9802218079566956, + "learning_rate": 6.049658672144023e-05, + "loss": 2.4518, + "step": 23097 + }, + { + "epoch": 2.0926366333718374, + "grad_norm": 1.0116395950317383, + "learning_rate": 6.049054552044947e-05, + "loss": 2.4059, + "step": 23098 + }, + { + "epoch": 2.0927272315462844, + "grad_norm": 0.9966495037078857, + "learning_rate": 6.048450431945871e-05, + "loss": 2.6209, + "step": 23099 + }, + { + "epoch": 2.092817829720731, + "grad_norm": 1.034073829650879, + "learning_rate": 6.047846311846795e-05, + "loss": 2.7539, + "step": 23100 + }, + { + "epoch": 2.092908427895178, + "grad_norm": 1.1579681634902954, + "learning_rate": 6.047242191747719e-05, + "loss": 2.6912, + "step": 23101 + }, + { + "epoch": 2.0929990260696245, + "grad_norm": 0.9169504046440125, + "learning_rate": 6.046638071648644e-05, + "loss": 2.3395, + "step": 23102 + }, + { + "epoch": 2.0930896242440715, + "grad_norm": 0.9556692242622375, + "learning_rate": 6.046033951549568e-05, + "loss": 2.4064, + "step": 23103 + }, + { + "epoch": 2.093180222418518, + "grad_norm": 1.0148262977600098, + "learning_rate": 6.045429831450493e-05, + "loss": 2.6732, + "step": 23104 + }, + { + "epoch": 2.093270820592965, + "grad_norm": 0.9681768417358398, + "learning_rate": 6.0448257113514176e-05, + "loss": 2.4497, + "step": 23105 + }, + { + "epoch": 2.0933614187674117, + "grad_norm": 0.9825873970985413, + "learning_rate": 6.044221591252342e-05, + "loss": 2.6475, + "step": 23106 + }, + { + "epoch": 2.0934520169418587, + "grad_norm": 1.0043065547943115, + "learning_rate": 6.043617471153266e-05, + "loss": 2.5311, + "step": 23107 + }, + { + "epoch": 2.0935426151163052, + "grad_norm": 0.9450591802597046, + "learning_rate": 6.04301335105419e-05, + "loss": 2.5999, + "step": 23108 + }, + { + "epoch": 2.0936332132907522, + "grad_norm": 0.9739270806312561, + "learning_rate": 6.042409230955114e-05, + "loss": 2.808, + "step": 23109 + }, + { + "epoch": 2.093723811465199, + "grad_norm": 0.9117646217346191, + "learning_rate": 6.041805110856038e-05, + "loss": 2.6766, + "step": 23110 + }, + { + "epoch": 2.093814409639646, + "grad_norm": 0.9424731731414795, + "learning_rate": 6.041200990756962e-05, + "loss": 2.5158, + "step": 23111 + }, + { + "epoch": 2.0939050078140924, + "grad_norm": 1.093166470527649, + "learning_rate": 6.0405968706578875e-05, + "loss": 2.6983, + "step": 23112 + }, + { + "epoch": 2.0939956059885394, + "grad_norm": 1.0112465620040894, + "learning_rate": 6.0399927505588116e-05, + "loss": 2.7625, + "step": 23113 + }, + { + "epoch": 2.094086204162986, + "grad_norm": 0.9718071818351746, + "learning_rate": 6.039388630459736e-05, + "loss": 2.6516, + "step": 23114 + }, + { + "epoch": 2.094176802337433, + "grad_norm": 0.8896054625511169, + "learning_rate": 6.03878451036066e-05, + "loss": 1.6855, + "step": 23115 + }, + { + "epoch": 2.0942674005118795, + "grad_norm": 0.9289267063140869, + "learning_rate": 6.038180390261584e-05, + "loss": 2.646, + "step": 23116 + }, + { + "epoch": 2.0943579986863265, + "grad_norm": 0.9329887628555298, + "learning_rate": 6.037576270162508e-05, + "loss": 2.6164, + "step": 23117 + }, + { + "epoch": 2.094448596860773, + "grad_norm": 0.9641862511634827, + "learning_rate": 6.036972150063433e-05, + "loss": 2.5464, + "step": 23118 + }, + { + "epoch": 2.09453919503522, + "grad_norm": 0.8718426823616028, + "learning_rate": 6.0363680299643575e-05, + "loss": 1.9383, + "step": 23119 + }, + { + "epoch": 2.0946297932096667, + "grad_norm": 0.9811793565750122, + "learning_rate": 6.0357639098652816e-05, + "loss": 2.8118, + "step": 23120 + }, + { + "epoch": 2.0947203913841137, + "grad_norm": 1.0241872072219849, + "learning_rate": 6.035159789766206e-05, + "loss": 2.7394, + "step": 23121 + }, + { + "epoch": 2.0948109895585603, + "grad_norm": 0.9697124361991882, + "learning_rate": 6.0345556696671304e-05, + "loss": 2.3106, + "step": 23122 + }, + { + "epoch": 2.0949015877330073, + "grad_norm": 1.13285493850708, + "learning_rate": 6.0339515495680545e-05, + "loss": 2.8244, + "step": 23123 + }, + { + "epoch": 2.094992185907454, + "grad_norm": 0.9141849875450134, + "learning_rate": 6.0333474294689786e-05, + "loss": 2.0477, + "step": 23124 + }, + { + "epoch": 2.095082784081901, + "grad_norm": 1.0671288967132568, + "learning_rate": 6.0327433093699027e-05, + "loss": 2.5679, + "step": 23125 + }, + { + "epoch": 2.0951733822563474, + "grad_norm": 0.9734991192817688, + "learning_rate": 6.032139189270828e-05, + "loss": 2.7444, + "step": 23126 + }, + { + "epoch": 2.0952639804307944, + "grad_norm": 1.0200908184051514, + "learning_rate": 6.031535069171752e-05, + "loss": 2.7538, + "step": 23127 + }, + { + "epoch": 2.095354578605241, + "grad_norm": 0.958867073059082, + "learning_rate": 6.030930949072676e-05, + "loss": 2.5537, + "step": 23128 + }, + { + "epoch": 2.095445176779688, + "grad_norm": 0.838844358921051, + "learning_rate": 6.0303268289736003e-05, + "loss": 1.8678, + "step": 23129 + }, + { + "epoch": 2.0955357749541346, + "grad_norm": 0.955177903175354, + "learning_rate": 6.0297227088745244e-05, + "loss": 2.6104, + "step": 23130 + }, + { + "epoch": 2.0956263731285816, + "grad_norm": 1.0393733978271484, + "learning_rate": 6.0291185887754485e-05, + "loss": 2.6291, + "step": 23131 + }, + { + "epoch": 2.095716971303028, + "grad_norm": 0.9319693446159363, + "learning_rate": 6.0285144686763726e-05, + "loss": 2.7135, + "step": 23132 + }, + { + "epoch": 2.095807569477475, + "grad_norm": 0.9032102823257446, + "learning_rate": 6.027910348577297e-05, + "loss": 1.9666, + "step": 23133 + }, + { + "epoch": 2.0958981676519217, + "grad_norm": 0.9755586981773376, + "learning_rate": 6.027306228478222e-05, + "loss": 2.5298, + "step": 23134 + }, + { + "epoch": 2.0959887658263687, + "grad_norm": 1.0635714530944824, + "learning_rate": 6.026702108379146e-05, + "loss": 2.377, + "step": 23135 + }, + { + "epoch": 2.0960793640008153, + "grad_norm": 1.0048084259033203, + "learning_rate": 6.02609798828007e-05, + "loss": 2.6091, + "step": 23136 + }, + { + "epoch": 2.0961699621752623, + "grad_norm": 1.0989229679107666, + "learning_rate": 6.025493868180995e-05, + "loss": 2.8089, + "step": 23137 + }, + { + "epoch": 2.096260560349709, + "grad_norm": 1.0849558115005493, + "learning_rate": 6.024889748081919e-05, + "loss": 2.5275, + "step": 23138 + }, + { + "epoch": 2.096351158524156, + "grad_norm": 1.0369716882705688, + "learning_rate": 6.024285627982843e-05, + "loss": 2.6153, + "step": 23139 + }, + { + "epoch": 2.0964417566986024, + "grad_norm": 0.9754659533500671, + "learning_rate": 6.023681507883767e-05, + "loss": 2.5037, + "step": 23140 + }, + { + "epoch": 2.0965323548730495, + "grad_norm": 1.007834792137146, + "learning_rate": 6.023077387784693e-05, + "loss": 2.4039, + "step": 23141 + }, + { + "epoch": 2.096622953047496, + "grad_norm": 0.9948417544364929, + "learning_rate": 6.022473267685617e-05, + "loss": 2.5745, + "step": 23142 + }, + { + "epoch": 2.096713551221943, + "grad_norm": 0.8996816873550415, + "learning_rate": 6.021869147586541e-05, + "loss": 1.9086, + "step": 23143 + }, + { + "epoch": 2.0968041493963896, + "grad_norm": 0.8638957142829895, + "learning_rate": 6.021265027487465e-05, + "loss": 1.8521, + "step": 23144 + }, + { + "epoch": 2.0968947475708366, + "grad_norm": 0.973198652267456, + "learning_rate": 6.020660907388389e-05, + "loss": 2.5198, + "step": 23145 + }, + { + "epoch": 2.096985345745283, + "grad_norm": 0.9927272200584412, + "learning_rate": 6.020056787289313e-05, + "loss": 2.5289, + "step": 23146 + }, + { + "epoch": 2.09707594391973, + "grad_norm": 1.0765769481658936, + "learning_rate": 6.019452667190237e-05, + "loss": 2.6318, + "step": 23147 + }, + { + "epoch": 2.0971665420941767, + "grad_norm": 1.0989487171173096, + "learning_rate": 6.018848547091161e-05, + "loss": 2.7788, + "step": 23148 + }, + { + "epoch": 2.0972571402686238, + "grad_norm": 1.1447495222091675, + "learning_rate": 6.018244426992087e-05, + "loss": 2.6095, + "step": 23149 + }, + { + "epoch": 2.0973477384430703, + "grad_norm": 1.0209163427352905, + "learning_rate": 6.017640306893011e-05, + "loss": 2.4824, + "step": 23150 + }, + { + "epoch": 2.0974383366175173, + "grad_norm": 0.9409422874450684, + "learning_rate": 6.017036186793935e-05, + "loss": 2.4106, + "step": 23151 + }, + { + "epoch": 2.097528934791964, + "grad_norm": 1.0120606422424316, + "learning_rate": 6.016432066694859e-05, + "loss": 2.6597, + "step": 23152 + }, + { + "epoch": 2.097619532966411, + "grad_norm": 0.9540300369262695, + "learning_rate": 6.015827946595783e-05, + "loss": 2.8083, + "step": 23153 + }, + { + "epoch": 2.0977101311408575, + "grad_norm": 0.846560537815094, + "learning_rate": 6.015223826496708e-05, + "loss": 1.8424, + "step": 23154 + }, + { + "epoch": 2.0978007293153045, + "grad_norm": 0.9171627163887024, + "learning_rate": 6.014619706397632e-05, + "loss": 2.7352, + "step": 23155 + }, + { + "epoch": 2.097891327489751, + "grad_norm": 1.0205849409103394, + "learning_rate": 6.014015586298557e-05, + "loss": 2.515, + "step": 23156 + }, + { + "epoch": 2.097981925664198, + "grad_norm": 0.9550230503082275, + "learning_rate": 6.0134114661994814e-05, + "loss": 2.6718, + "step": 23157 + }, + { + "epoch": 2.0980725238386446, + "grad_norm": 0.9951868653297424, + "learning_rate": 6.0128073461004055e-05, + "loss": 2.5263, + "step": 23158 + }, + { + "epoch": 2.0981631220130916, + "grad_norm": 1.0633485317230225, + "learning_rate": 6.0122032260013296e-05, + "loss": 2.9953, + "step": 23159 + }, + { + "epoch": 2.098253720187538, + "grad_norm": 1.060019612312317, + "learning_rate": 6.011599105902254e-05, + "loss": 2.7126, + "step": 23160 + }, + { + "epoch": 2.098344318361985, + "grad_norm": 1.1834567785263062, + "learning_rate": 6.010994985803178e-05, + "loss": 2.2397, + "step": 23161 + }, + { + "epoch": 2.0984349165364318, + "grad_norm": 0.9992170333862305, + "learning_rate": 6.010390865704102e-05, + "loss": 2.9004, + "step": 23162 + }, + { + "epoch": 2.098525514710879, + "grad_norm": 0.9873785972595215, + "learning_rate": 6.009786745605026e-05, + "loss": 2.3472, + "step": 23163 + }, + { + "epoch": 2.0986161128853253, + "grad_norm": 0.9837403297424316, + "learning_rate": 6.0091826255059514e-05, + "loss": 2.5052, + "step": 23164 + }, + { + "epoch": 2.0987067110597724, + "grad_norm": 1.1211011409759521, + "learning_rate": 6.0085785054068755e-05, + "loss": 2.5252, + "step": 23165 + }, + { + "epoch": 2.098797309234219, + "grad_norm": 1.0656397342681885, + "learning_rate": 6.0079743853077995e-05, + "loss": 2.5259, + "step": 23166 + }, + { + "epoch": 2.098887907408666, + "grad_norm": 0.987249493598938, + "learning_rate": 6.0073702652087236e-05, + "loss": 2.8687, + "step": 23167 + }, + { + "epoch": 2.0989785055831125, + "grad_norm": 0.9909968972206116, + "learning_rate": 6.006766145109648e-05, + "loss": 2.5623, + "step": 23168 + }, + { + "epoch": 2.099069103757559, + "grad_norm": 0.9708342552185059, + "learning_rate": 6.006162025010572e-05, + "loss": 2.4428, + "step": 23169 + }, + { + "epoch": 2.099159701932006, + "grad_norm": 0.832631528377533, + "learning_rate": 6.0055579049114966e-05, + "loss": 2.0268, + "step": 23170 + }, + { + "epoch": 2.099250300106453, + "grad_norm": 1.0103410482406616, + "learning_rate": 6.004953784812421e-05, + "loss": 2.6335, + "step": 23171 + }, + { + "epoch": 2.0993408982808996, + "grad_norm": 1.044547200202942, + "learning_rate": 6.0043496647133454e-05, + "loss": 2.7447, + "step": 23172 + }, + { + "epoch": 2.099431496455346, + "grad_norm": 0.9673134088516235, + "learning_rate": 6.00374554461427e-05, + "loss": 2.6533, + "step": 23173 + }, + { + "epoch": 2.0995220946297932, + "grad_norm": 0.9948270916938782, + "learning_rate": 6.003141424515194e-05, + "loss": 2.6163, + "step": 23174 + }, + { + "epoch": 2.09961269280424, + "grad_norm": 0.9826149344444275, + "learning_rate": 6.002537304416118e-05, + "loss": 2.3653, + "step": 23175 + }, + { + "epoch": 2.099703290978687, + "grad_norm": 0.9367687702178955, + "learning_rate": 6.0019331843170424e-05, + "loss": 2.3179, + "step": 23176 + }, + { + "epoch": 2.0997938891531334, + "grad_norm": 0.9567402005195618, + "learning_rate": 6.0013290642179665e-05, + "loss": 2.5742, + "step": 23177 + }, + { + "epoch": 2.0998844873275804, + "grad_norm": 1.0159447193145752, + "learning_rate": 6.0007249441188906e-05, + "loss": 2.5584, + "step": 23178 + }, + { + "epoch": 2.099975085502027, + "grad_norm": 0.9547716975212097, + "learning_rate": 6.000120824019816e-05, + "loss": 2.5516, + "step": 23179 + }, + { + "epoch": 2.100065683676474, + "grad_norm": 0.9342955350875854, + "learning_rate": 5.99951670392074e-05, + "loss": 2.0428, + "step": 23180 + }, + { + "epoch": 2.1001562818509205, + "grad_norm": 1.0284425020217896, + "learning_rate": 5.998912583821664e-05, + "loss": 2.6369, + "step": 23181 + }, + { + "epoch": 2.1002468800253675, + "grad_norm": 0.9949783086776733, + "learning_rate": 5.998308463722588e-05, + "loss": 2.5394, + "step": 23182 + }, + { + "epoch": 2.100337478199814, + "grad_norm": 1.0129421949386597, + "learning_rate": 5.9977043436235123e-05, + "loss": 2.6508, + "step": 23183 + }, + { + "epoch": 2.100428076374261, + "grad_norm": 0.938418984413147, + "learning_rate": 5.9971002235244364e-05, + "loss": 2.5713, + "step": 23184 + }, + { + "epoch": 2.1005186745487077, + "grad_norm": 0.9634199142456055, + "learning_rate": 5.9964961034253605e-05, + "loss": 2.4377, + "step": 23185 + }, + { + "epoch": 2.1006092727231547, + "grad_norm": 1.018388032913208, + "learning_rate": 5.995891983326286e-05, + "loss": 2.659, + "step": 23186 + }, + { + "epoch": 2.1006998708976012, + "grad_norm": 0.9720519781112671, + "learning_rate": 5.99528786322721e-05, + "loss": 2.6672, + "step": 23187 + }, + { + "epoch": 2.1007904690720482, + "grad_norm": 0.9152733087539673, + "learning_rate": 5.994683743128134e-05, + "loss": 2.2555, + "step": 23188 + }, + { + "epoch": 2.100881067246495, + "grad_norm": 1.0354269742965698, + "learning_rate": 5.994079623029059e-05, + "loss": 2.7774, + "step": 23189 + }, + { + "epoch": 2.100971665420942, + "grad_norm": 1.0004541873931885, + "learning_rate": 5.993475502929983e-05, + "loss": 2.6072, + "step": 23190 + }, + { + "epoch": 2.1010622635953884, + "grad_norm": 0.9970341324806213, + "learning_rate": 5.992871382830907e-05, + "loss": 2.7157, + "step": 23191 + }, + { + "epoch": 2.1011528617698354, + "grad_norm": 0.9143871068954468, + "learning_rate": 5.992267262731831e-05, + "loss": 2.2922, + "step": 23192 + }, + { + "epoch": 2.101243459944282, + "grad_norm": 0.9107315540313721, + "learning_rate": 5.991663142632755e-05, + "loss": 1.872, + "step": 23193 + }, + { + "epoch": 2.101334058118729, + "grad_norm": 1.0969719886779785, + "learning_rate": 5.9910590225336806e-05, + "loss": 2.6329, + "step": 23194 + }, + { + "epoch": 2.1014246562931755, + "grad_norm": 0.8839970231056213, + "learning_rate": 5.990454902434605e-05, + "loss": 2.0326, + "step": 23195 + }, + { + "epoch": 2.1015152544676226, + "grad_norm": 1.0054503679275513, + "learning_rate": 5.989850782335529e-05, + "loss": 2.3349, + "step": 23196 + }, + { + "epoch": 2.101605852642069, + "grad_norm": 0.9571183919906616, + "learning_rate": 5.989246662236453e-05, + "loss": 2.4426, + "step": 23197 + }, + { + "epoch": 2.101696450816516, + "grad_norm": 1.0055063962936401, + "learning_rate": 5.988642542137377e-05, + "loss": 2.6315, + "step": 23198 + }, + { + "epoch": 2.1017870489909627, + "grad_norm": 1.1135671138763428, + "learning_rate": 5.988038422038301e-05, + "loss": 2.7539, + "step": 23199 + }, + { + "epoch": 2.1018776471654097, + "grad_norm": 1.0366703271865845, + "learning_rate": 5.987434301939225e-05, + "loss": 2.7543, + "step": 23200 + }, + { + "epoch": 2.1019682453398563, + "grad_norm": 1.0139058828353882, + "learning_rate": 5.9868301818401506e-05, + "loss": 2.845, + "step": 23201 + }, + { + "epoch": 2.1020588435143033, + "grad_norm": 0.9850923418998718, + "learning_rate": 5.986226061741075e-05, + "loss": 2.4605, + "step": 23202 + }, + { + "epoch": 2.10214944168875, + "grad_norm": 0.9574189782142639, + "learning_rate": 5.985621941641999e-05, + "loss": 2.3725, + "step": 23203 + }, + { + "epoch": 2.102240039863197, + "grad_norm": 0.9645761847496033, + "learning_rate": 5.985017821542923e-05, + "loss": 2.7156, + "step": 23204 + }, + { + "epoch": 2.1023306380376434, + "grad_norm": 0.9523817896842957, + "learning_rate": 5.9844137014438476e-05, + "loss": 2.6274, + "step": 23205 + }, + { + "epoch": 2.1024212362120904, + "grad_norm": 1.0487984418869019, + "learning_rate": 5.983809581344772e-05, + "loss": 2.8376, + "step": 23206 + }, + { + "epoch": 2.102511834386537, + "grad_norm": 0.939629077911377, + "learning_rate": 5.983205461245696e-05, + "loss": 2.3958, + "step": 23207 + }, + { + "epoch": 2.102602432560984, + "grad_norm": 0.9552595615386963, + "learning_rate": 5.98260134114662e-05, + "loss": 2.512, + "step": 23208 + }, + { + "epoch": 2.1026930307354306, + "grad_norm": 0.9428699016571045, + "learning_rate": 5.981997221047545e-05, + "loss": 2.7648, + "step": 23209 + }, + { + "epoch": 2.1027836289098776, + "grad_norm": 1.0463624000549316, + "learning_rate": 5.9813931009484694e-05, + "loss": 2.4127, + "step": 23210 + }, + { + "epoch": 2.102874227084324, + "grad_norm": 1.0240739583969116, + "learning_rate": 5.9807889808493934e-05, + "loss": 2.5427, + "step": 23211 + }, + { + "epoch": 2.102964825258771, + "grad_norm": 0.9910959601402283, + "learning_rate": 5.9801848607503175e-05, + "loss": 2.7912, + "step": 23212 + }, + { + "epoch": 2.1030554234332177, + "grad_norm": 0.949773907661438, + "learning_rate": 5.9795807406512416e-05, + "loss": 2.5134, + "step": 23213 + }, + { + "epoch": 2.1031460216076647, + "grad_norm": 0.9598626494407654, + "learning_rate": 5.978976620552166e-05, + "loss": 2.4789, + "step": 23214 + }, + { + "epoch": 2.1032366197821113, + "grad_norm": 0.9768168330192566, + "learning_rate": 5.97837250045309e-05, + "loss": 2.5544, + "step": 23215 + }, + { + "epoch": 2.1033272179565583, + "grad_norm": 1.0216697454452515, + "learning_rate": 5.977768380354015e-05, + "loss": 2.5613, + "step": 23216 + }, + { + "epoch": 2.103417816131005, + "grad_norm": 1.0635179281234741, + "learning_rate": 5.977164260254939e-05, + "loss": 2.5464, + "step": 23217 + }, + { + "epoch": 2.103508414305452, + "grad_norm": 0.9675613641738892, + "learning_rate": 5.9765601401558634e-05, + "loss": 2.5495, + "step": 23218 + }, + { + "epoch": 2.1035990124798984, + "grad_norm": 1.0968232154846191, + "learning_rate": 5.9759560200567875e-05, + "loss": 2.6355, + "step": 23219 + }, + { + "epoch": 2.1036896106543455, + "grad_norm": 0.9609947204589844, + "learning_rate": 5.9753518999577115e-05, + "loss": 2.761, + "step": 23220 + }, + { + "epoch": 2.103780208828792, + "grad_norm": 1.0991407632827759, + "learning_rate": 5.974747779858636e-05, + "loss": 2.5763, + "step": 23221 + }, + { + "epoch": 2.103870807003239, + "grad_norm": 0.9773561954498291, + "learning_rate": 5.9741436597595604e-05, + "loss": 2.5891, + "step": 23222 + }, + { + "epoch": 2.1039614051776856, + "grad_norm": 1.0279181003570557, + "learning_rate": 5.9735395396604845e-05, + "loss": 1.9067, + "step": 23223 + }, + { + "epoch": 2.1040520033521326, + "grad_norm": 0.9302407503128052, + "learning_rate": 5.972935419561409e-05, + "loss": 2.5201, + "step": 23224 + }, + { + "epoch": 2.104142601526579, + "grad_norm": 0.9931994080543518, + "learning_rate": 5.972331299462334e-05, + "loss": 2.5208, + "step": 23225 + }, + { + "epoch": 2.104233199701026, + "grad_norm": 1.0929652452468872, + "learning_rate": 5.971727179363258e-05, + "loss": 2.4775, + "step": 23226 + }, + { + "epoch": 2.1043237978754727, + "grad_norm": 1.1316537857055664, + "learning_rate": 5.971123059264182e-05, + "loss": 2.7177, + "step": 23227 + }, + { + "epoch": 2.1044143960499198, + "grad_norm": 0.9843370318412781, + "learning_rate": 5.970518939165106e-05, + "loss": 2.5737, + "step": 23228 + }, + { + "epoch": 2.1045049942243663, + "grad_norm": 0.9320904612541199, + "learning_rate": 5.96991481906603e-05, + "loss": 2.5601, + "step": 23229 + }, + { + "epoch": 2.1045955923988133, + "grad_norm": 0.9404157400131226, + "learning_rate": 5.9693106989669544e-05, + "loss": 2.5122, + "step": 23230 + }, + { + "epoch": 2.10468619057326, + "grad_norm": 0.8591941595077515, + "learning_rate": 5.96870657886788e-05, + "loss": 2.2569, + "step": 23231 + }, + { + "epoch": 2.104776788747707, + "grad_norm": 0.9659132957458496, + "learning_rate": 5.968102458768804e-05, + "loss": 2.4356, + "step": 23232 + }, + { + "epoch": 2.1048673869221535, + "grad_norm": 0.9660621881484985, + "learning_rate": 5.967498338669728e-05, + "loss": 2.4249, + "step": 23233 + }, + { + "epoch": 2.1049579850966005, + "grad_norm": 0.9404299259185791, + "learning_rate": 5.966894218570652e-05, + "loss": 2.6151, + "step": 23234 + }, + { + "epoch": 2.105048583271047, + "grad_norm": 1.0557770729064941, + "learning_rate": 5.966290098471576e-05, + "loss": 2.7741, + "step": 23235 + }, + { + "epoch": 2.105139181445494, + "grad_norm": 1.153956413269043, + "learning_rate": 5.9656859783725e-05, + "loss": 2.3911, + "step": 23236 + }, + { + "epoch": 2.1052297796199406, + "grad_norm": 0.9884134531021118, + "learning_rate": 5.965081858273425e-05, + "loss": 2.5454, + "step": 23237 + }, + { + "epoch": 2.1053203777943876, + "grad_norm": 0.8809394836425781, + "learning_rate": 5.964477738174349e-05, + "loss": 1.9672, + "step": 23238 + }, + { + "epoch": 2.105410975968834, + "grad_norm": 1.0292421579360962, + "learning_rate": 5.963873618075274e-05, + "loss": 2.4894, + "step": 23239 + }, + { + "epoch": 2.105501574143281, + "grad_norm": 0.950468897819519, + "learning_rate": 5.963269497976198e-05, + "loss": 2.515, + "step": 23240 + }, + { + "epoch": 2.1055921723177278, + "grad_norm": 1.0371204614639282, + "learning_rate": 5.962665377877123e-05, + "loss": 2.4532, + "step": 23241 + }, + { + "epoch": 2.105682770492175, + "grad_norm": 0.8555238842964172, + "learning_rate": 5.962061257778047e-05, + "loss": 1.9187, + "step": 23242 + }, + { + "epoch": 2.1057733686666213, + "grad_norm": 0.9969245791435242, + "learning_rate": 5.961457137678971e-05, + "loss": 1.8277, + "step": 23243 + }, + { + "epoch": 2.1058639668410684, + "grad_norm": 1.000860571861267, + "learning_rate": 5.960853017579895e-05, + "loss": 2.8082, + "step": 23244 + }, + { + "epoch": 2.105954565015515, + "grad_norm": 0.94207763671875, + "learning_rate": 5.960248897480819e-05, + "loss": 1.9112, + "step": 23245 + }, + { + "epoch": 2.106045163189962, + "grad_norm": 0.9298088550567627, + "learning_rate": 5.9596447773817445e-05, + "loss": 2.5492, + "step": 23246 + }, + { + "epoch": 2.1061357613644085, + "grad_norm": 0.959496796131134, + "learning_rate": 5.9590406572826686e-05, + "loss": 2.707, + "step": 23247 + }, + { + "epoch": 2.106226359538855, + "grad_norm": 1.0424938201904297, + "learning_rate": 5.9584365371835926e-05, + "loss": 2.5804, + "step": 23248 + }, + { + "epoch": 2.106316957713302, + "grad_norm": 0.8361676335334778, + "learning_rate": 5.957832417084517e-05, + "loss": 2.0694, + "step": 23249 + }, + { + "epoch": 2.106407555887749, + "grad_norm": 0.9786226153373718, + "learning_rate": 5.957228296985441e-05, + "loss": 2.6207, + "step": 23250 + }, + { + "epoch": 2.1064981540621956, + "grad_norm": 0.8278214931488037, + "learning_rate": 5.956624176886365e-05, + "loss": 1.9227, + "step": 23251 + }, + { + "epoch": 2.106588752236642, + "grad_norm": 0.9437081217765808, + "learning_rate": 5.956020056787289e-05, + "loss": 2.4913, + "step": 23252 + }, + { + "epoch": 2.1066793504110892, + "grad_norm": 0.9740346074104309, + "learning_rate": 5.955415936688214e-05, + "loss": 2.5693, + "step": 23253 + }, + { + "epoch": 2.106769948585536, + "grad_norm": 0.8207419514656067, + "learning_rate": 5.9548118165891385e-05, + "loss": 1.8414, + "step": 23254 + }, + { + "epoch": 2.106860546759983, + "grad_norm": 0.9786468744277954, + "learning_rate": 5.9542076964900626e-05, + "loss": 2.6989, + "step": 23255 + }, + { + "epoch": 2.1069511449344294, + "grad_norm": 0.97153639793396, + "learning_rate": 5.953603576390987e-05, + "loss": 2.5966, + "step": 23256 + }, + { + "epoch": 2.1070417431088764, + "grad_norm": 1.004115104675293, + "learning_rate": 5.9529994562919114e-05, + "loss": 2.6709, + "step": 23257 + }, + { + "epoch": 2.107132341283323, + "grad_norm": 1.0084532499313354, + "learning_rate": 5.9523953361928355e-05, + "loss": 2.7515, + "step": 23258 + }, + { + "epoch": 2.10722293945777, + "grad_norm": 1.0542038679122925, + "learning_rate": 5.9517912160937596e-05, + "loss": 2.6854, + "step": 23259 + }, + { + "epoch": 2.1073135376322165, + "grad_norm": 0.7958642244338989, + "learning_rate": 5.951187095994684e-05, + "loss": 1.9249, + "step": 23260 + }, + { + "epoch": 2.1074041358066635, + "grad_norm": 0.8483197093009949, + "learning_rate": 5.950582975895609e-05, + "loss": 2.0072, + "step": 23261 + }, + { + "epoch": 2.10749473398111, + "grad_norm": 1.0063145160675049, + "learning_rate": 5.949978855796533e-05, + "loss": 2.7052, + "step": 23262 + }, + { + "epoch": 2.107585332155557, + "grad_norm": 0.9361486434936523, + "learning_rate": 5.949374735697457e-05, + "loss": 2.4885, + "step": 23263 + }, + { + "epoch": 2.1076759303300037, + "grad_norm": 1.0786957740783691, + "learning_rate": 5.9487706155983814e-05, + "loss": 2.6903, + "step": 23264 + }, + { + "epoch": 2.1077665285044507, + "grad_norm": 0.8450761437416077, + "learning_rate": 5.9481664954993054e-05, + "loss": 1.8747, + "step": 23265 + }, + { + "epoch": 2.1078571266788972, + "grad_norm": 0.9660691022872925, + "learning_rate": 5.9475623754002295e-05, + "loss": 2.6361, + "step": 23266 + }, + { + "epoch": 2.1079477248533443, + "grad_norm": 1.0045948028564453, + "learning_rate": 5.9469582553011536e-05, + "loss": 2.7118, + "step": 23267 + }, + { + "epoch": 2.108038323027791, + "grad_norm": 0.9580534100532532, + "learning_rate": 5.946354135202078e-05, + "loss": 2.7855, + "step": 23268 + }, + { + "epoch": 2.108128921202238, + "grad_norm": 0.9663134813308716, + "learning_rate": 5.945750015103003e-05, + "loss": 2.4547, + "step": 23269 + }, + { + "epoch": 2.1082195193766844, + "grad_norm": 1.0635675191879272, + "learning_rate": 5.945145895003927e-05, + "loss": 2.609, + "step": 23270 + }, + { + "epoch": 2.1083101175511314, + "grad_norm": 1.0335896015167236, + "learning_rate": 5.944541774904851e-05, + "loss": 2.2816, + "step": 23271 + }, + { + "epoch": 2.108400715725578, + "grad_norm": 1.140017032623291, + "learning_rate": 5.9439376548057754e-05, + "loss": 2.7329, + "step": 23272 + }, + { + "epoch": 2.108491313900025, + "grad_norm": 0.9729452133178711, + "learning_rate": 5.9433335347067e-05, + "loss": 2.532, + "step": 23273 + }, + { + "epoch": 2.1085819120744715, + "grad_norm": 0.8399760127067566, + "learning_rate": 5.942729414607624e-05, + "loss": 2.0693, + "step": 23274 + }, + { + "epoch": 2.1086725102489186, + "grad_norm": 1.0041142702102661, + "learning_rate": 5.942125294508548e-05, + "loss": 2.6674, + "step": 23275 + }, + { + "epoch": 2.108763108423365, + "grad_norm": 1.0797063112258911, + "learning_rate": 5.941521174409474e-05, + "loss": 2.4297, + "step": 23276 + }, + { + "epoch": 2.108853706597812, + "grad_norm": 0.9324294328689575, + "learning_rate": 5.940917054310398e-05, + "loss": 2.5102, + "step": 23277 + }, + { + "epoch": 2.1089443047722587, + "grad_norm": 1.0218983888626099, + "learning_rate": 5.940312934211322e-05, + "loss": 2.411, + "step": 23278 + }, + { + "epoch": 2.1090349029467057, + "grad_norm": 1.0346559286117554, + "learning_rate": 5.939708814112246e-05, + "loss": 2.5244, + "step": 23279 + }, + { + "epoch": 2.1091255011211523, + "grad_norm": 0.9617395997047424, + "learning_rate": 5.93910469401317e-05, + "loss": 2.5812, + "step": 23280 + }, + { + "epoch": 2.1092160992955993, + "grad_norm": 0.9754108786582947, + "learning_rate": 5.938500573914094e-05, + "loss": 2.582, + "step": 23281 + }, + { + "epoch": 2.109306697470046, + "grad_norm": 1.0485998392105103, + "learning_rate": 5.937896453815018e-05, + "loss": 2.6182, + "step": 23282 + }, + { + "epoch": 2.109397295644493, + "grad_norm": 0.999091625213623, + "learning_rate": 5.937292333715942e-05, + "loss": 2.6409, + "step": 23283 + }, + { + "epoch": 2.1094878938189394, + "grad_norm": 0.9650511145591736, + "learning_rate": 5.936688213616868e-05, + "loss": 2.4581, + "step": 23284 + }, + { + "epoch": 2.1095784919933864, + "grad_norm": 0.9435597658157349, + "learning_rate": 5.936084093517792e-05, + "loss": 2.4685, + "step": 23285 + }, + { + "epoch": 2.109669090167833, + "grad_norm": 0.9154069423675537, + "learning_rate": 5.935479973418716e-05, + "loss": 2.5858, + "step": 23286 + }, + { + "epoch": 2.10975968834228, + "grad_norm": 0.9716078042984009, + "learning_rate": 5.93487585331964e-05, + "loss": 2.7386, + "step": 23287 + }, + { + "epoch": 2.1098502865167266, + "grad_norm": 0.9348539710044861, + "learning_rate": 5.934271733220564e-05, + "loss": 2.5951, + "step": 23288 + }, + { + "epoch": 2.1099408846911736, + "grad_norm": 0.9675948619842529, + "learning_rate": 5.933667613121489e-05, + "loss": 2.4617, + "step": 23289 + }, + { + "epoch": 2.11003148286562, + "grad_norm": 0.9528394937515259, + "learning_rate": 5.933063493022413e-05, + "loss": 1.7328, + "step": 23290 + }, + { + "epoch": 2.110122081040067, + "grad_norm": 1.0825419425964355, + "learning_rate": 5.932459372923338e-05, + "loss": 2.5657, + "step": 23291 + }, + { + "epoch": 2.1102126792145137, + "grad_norm": 0.8873586654663086, + "learning_rate": 5.9318552528242625e-05, + "loss": 1.8492, + "step": 23292 + }, + { + "epoch": 2.1103032773889607, + "grad_norm": 1.0075312852859497, + "learning_rate": 5.9312511327251865e-05, + "loss": 2.3835, + "step": 23293 + }, + { + "epoch": 2.1103938755634073, + "grad_norm": 0.9784000515937805, + "learning_rate": 5.9306470126261106e-05, + "loss": 2.4738, + "step": 23294 + }, + { + "epoch": 2.1104844737378543, + "grad_norm": 0.9948106408119202, + "learning_rate": 5.930042892527035e-05, + "loss": 2.5953, + "step": 23295 + }, + { + "epoch": 2.110575071912301, + "grad_norm": 0.9232503771781921, + "learning_rate": 5.929438772427959e-05, + "loss": 2.7219, + "step": 23296 + }, + { + "epoch": 2.110665670086748, + "grad_norm": 0.9824028015136719, + "learning_rate": 5.928834652328883e-05, + "loss": 2.6472, + "step": 23297 + }, + { + "epoch": 2.1107562682611944, + "grad_norm": 1.0537421703338623, + "learning_rate": 5.928230532229807e-05, + "loss": 2.6758, + "step": 23298 + }, + { + "epoch": 2.1108468664356415, + "grad_norm": 1.002942681312561, + "learning_rate": 5.9276264121307324e-05, + "loss": 2.6129, + "step": 23299 + }, + { + "epoch": 2.110937464610088, + "grad_norm": 0.9800559878349304, + "learning_rate": 5.9270222920316565e-05, + "loss": 2.7111, + "step": 23300 + }, + { + "epoch": 2.111028062784535, + "grad_norm": 0.9976126551628113, + "learning_rate": 5.9264181719325806e-05, + "loss": 2.5186, + "step": 23301 + }, + { + "epoch": 2.1111186609589816, + "grad_norm": 1.040391206741333, + "learning_rate": 5.9258140518335046e-05, + "loss": 2.6901, + "step": 23302 + }, + { + "epoch": 2.1112092591334286, + "grad_norm": 1.112959861755371, + "learning_rate": 5.925209931734429e-05, + "loss": 2.6782, + "step": 23303 + }, + { + "epoch": 2.111299857307875, + "grad_norm": 1.0078203678131104, + "learning_rate": 5.924605811635353e-05, + "loss": 2.507, + "step": 23304 + }, + { + "epoch": 2.111390455482322, + "grad_norm": 1.0925819873809814, + "learning_rate": 5.9240016915362776e-05, + "loss": 2.8036, + "step": 23305 + }, + { + "epoch": 2.1114810536567687, + "grad_norm": 1.005466341972351, + "learning_rate": 5.923397571437202e-05, + "loss": 2.6021, + "step": 23306 + }, + { + "epoch": 2.1115716518312158, + "grad_norm": 0.9895222187042236, + "learning_rate": 5.9227934513381264e-05, + "loss": 2.1306, + "step": 23307 + }, + { + "epoch": 2.1116622500056623, + "grad_norm": 1.0058361291885376, + "learning_rate": 5.922189331239051e-05, + "loss": 2.5905, + "step": 23308 + }, + { + "epoch": 2.1117528481801093, + "grad_norm": 1.1497372388839722, + "learning_rate": 5.921585211139975e-05, + "loss": 2.5452, + "step": 23309 + }, + { + "epoch": 2.111843446354556, + "grad_norm": 0.6217920780181885, + "learning_rate": 5.9209810910408993e-05, + "loss": 1.0567, + "step": 23310 + }, + { + "epoch": 2.111934044529003, + "grad_norm": 1.1924279928207397, + "learning_rate": 5.9203769709418234e-05, + "loss": 2.5539, + "step": 23311 + }, + { + "epoch": 2.1120246427034495, + "grad_norm": 1.0660427808761597, + "learning_rate": 5.9197728508427475e-05, + "loss": 2.4365, + "step": 23312 + }, + { + "epoch": 2.1121152408778965, + "grad_norm": 1.1068274974822998, + "learning_rate": 5.9191687307436716e-05, + "loss": 2.5071, + "step": 23313 + }, + { + "epoch": 2.112205839052343, + "grad_norm": 1.0080034732818604, + "learning_rate": 5.918564610644597e-05, + "loss": 2.872, + "step": 23314 + }, + { + "epoch": 2.11229643722679, + "grad_norm": 0.6875690817832947, + "learning_rate": 5.917960490545521e-05, + "loss": 1.5178, + "step": 23315 + }, + { + "epoch": 2.1123870354012366, + "grad_norm": 0.9665954113006592, + "learning_rate": 5.917356370446445e-05, + "loss": 2.5454, + "step": 23316 + }, + { + "epoch": 2.1124776335756836, + "grad_norm": 0.94209223985672, + "learning_rate": 5.916752250347369e-05, + "loss": 2.6456, + "step": 23317 + }, + { + "epoch": 2.11256823175013, + "grad_norm": 0.8590113520622253, + "learning_rate": 5.9161481302482934e-05, + "loss": 1.9565, + "step": 23318 + }, + { + "epoch": 2.112658829924577, + "grad_norm": 0.8369855880737305, + "learning_rate": 5.9155440101492174e-05, + "loss": 1.8984, + "step": 23319 + }, + { + "epoch": 2.1127494280990238, + "grad_norm": 0.979617714881897, + "learning_rate": 5.9149398900501415e-05, + "loss": 2.5815, + "step": 23320 + }, + { + "epoch": 2.112840026273471, + "grad_norm": 1.0003842115402222, + "learning_rate": 5.914335769951067e-05, + "loss": 2.505, + "step": 23321 + }, + { + "epoch": 2.1129306244479173, + "grad_norm": 0.9289253354072571, + "learning_rate": 5.913731649851991e-05, + "loss": 2.793, + "step": 23322 + }, + { + "epoch": 2.1130212226223644, + "grad_norm": 0.9554456472396851, + "learning_rate": 5.913127529752915e-05, + "loss": 2.3052, + "step": 23323 + }, + { + "epoch": 2.113111820796811, + "grad_norm": 1.0225197076797485, + "learning_rate": 5.91252340965384e-05, + "loss": 2.5408, + "step": 23324 + }, + { + "epoch": 2.113202418971258, + "grad_norm": 0.8894565105438232, + "learning_rate": 5.911919289554764e-05, + "loss": 2.1828, + "step": 23325 + }, + { + "epoch": 2.1132930171457045, + "grad_norm": 0.9124491810798645, + "learning_rate": 5.911315169455688e-05, + "loss": 2.5261, + "step": 23326 + }, + { + "epoch": 2.1133836153201515, + "grad_norm": 1.0014058351516724, + "learning_rate": 5.910711049356612e-05, + "loss": 2.563, + "step": 23327 + }, + { + "epoch": 2.113474213494598, + "grad_norm": 0.9235665798187256, + "learning_rate": 5.910106929257536e-05, + "loss": 2.3472, + "step": 23328 + }, + { + "epoch": 2.113564811669045, + "grad_norm": 0.954531729221344, + "learning_rate": 5.9095028091584617e-05, + "loss": 2.6551, + "step": 23329 + }, + { + "epoch": 2.1136554098434917, + "grad_norm": 0.8908863067626953, + "learning_rate": 5.908898689059386e-05, + "loss": 1.9253, + "step": 23330 + }, + { + "epoch": 2.113746008017938, + "grad_norm": 1.1582149267196655, + "learning_rate": 5.90829456896031e-05, + "loss": 2.5838, + "step": 23331 + }, + { + "epoch": 2.1138366061923852, + "grad_norm": 0.9682547450065613, + "learning_rate": 5.907690448861234e-05, + "loss": 2.6574, + "step": 23332 + }, + { + "epoch": 2.1139272043668322, + "grad_norm": 0.9689342975616455, + "learning_rate": 5.907086328762158e-05, + "loss": 2.8032, + "step": 23333 + }, + { + "epoch": 2.114017802541279, + "grad_norm": 0.8076866865158081, + "learning_rate": 5.906482208663082e-05, + "loss": 1.9077, + "step": 23334 + }, + { + "epoch": 2.1141084007157254, + "grad_norm": 0.9276089668273926, + "learning_rate": 5.905878088564006e-05, + "loss": 2.6012, + "step": 23335 + }, + { + "epoch": 2.1141989988901724, + "grad_norm": 0.97358238697052, + "learning_rate": 5.9052739684649316e-05, + "loss": 2.5943, + "step": 23336 + }, + { + "epoch": 2.114289597064619, + "grad_norm": 0.9231265187263489, + "learning_rate": 5.904669848365856e-05, + "loss": 2.5464, + "step": 23337 + }, + { + "epoch": 2.114380195239066, + "grad_norm": 0.9164793491363525, + "learning_rate": 5.90406572826678e-05, + "loss": 2.7231, + "step": 23338 + }, + { + "epoch": 2.1144707934135125, + "grad_norm": 0.8843784332275391, + "learning_rate": 5.903461608167704e-05, + "loss": 1.8609, + "step": 23339 + }, + { + "epoch": 2.1145613915879595, + "grad_norm": 0.894115149974823, + "learning_rate": 5.902857488068628e-05, + "loss": 2.0178, + "step": 23340 + }, + { + "epoch": 2.114651989762406, + "grad_norm": 0.9631538987159729, + "learning_rate": 5.902253367969553e-05, + "loss": 2.5816, + "step": 23341 + }, + { + "epoch": 2.114742587936853, + "grad_norm": 0.9890022873878479, + "learning_rate": 5.901649247870477e-05, + "loss": 2.5275, + "step": 23342 + }, + { + "epoch": 2.1148331861112997, + "grad_norm": 0.9541836977005005, + "learning_rate": 5.901045127771401e-05, + "loss": 1.7936, + "step": 23343 + }, + { + "epoch": 2.1149237842857467, + "grad_norm": 1.0787650346755981, + "learning_rate": 5.900441007672326e-05, + "loss": 2.728, + "step": 23344 + }, + { + "epoch": 2.1150143824601932, + "grad_norm": 0.8447771668434143, + "learning_rate": 5.8998368875732504e-05, + "loss": 1.8685, + "step": 23345 + }, + { + "epoch": 2.1151049806346403, + "grad_norm": 0.9304687976837158, + "learning_rate": 5.8992327674741745e-05, + "loss": 2.3931, + "step": 23346 + }, + { + "epoch": 2.115195578809087, + "grad_norm": 0.9831095933914185, + "learning_rate": 5.8986286473750985e-05, + "loss": 2.6398, + "step": 23347 + }, + { + "epoch": 2.115286176983534, + "grad_norm": 1.0072665214538574, + "learning_rate": 5.8980245272760226e-05, + "loss": 2.4211, + "step": 23348 + }, + { + "epoch": 2.1153767751579804, + "grad_norm": 0.9740390181541443, + "learning_rate": 5.897420407176947e-05, + "loss": 2.5588, + "step": 23349 + }, + { + "epoch": 2.1154673733324274, + "grad_norm": 0.9373380541801453, + "learning_rate": 5.896816287077871e-05, + "loss": 2.5722, + "step": 23350 + }, + { + "epoch": 2.115557971506874, + "grad_norm": 0.9709389209747314, + "learning_rate": 5.896212166978796e-05, + "loss": 2.4999, + "step": 23351 + }, + { + "epoch": 2.115648569681321, + "grad_norm": 1.1458879709243774, + "learning_rate": 5.89560804687972e-05, + "loss": 2.677, + "step": 23352 + }, + { + "epoch": 2.1157391678557675, + "grad_norm": 1.0001428127288818, + "learning_rate": 5.8950039267806444e-05, + "loss": 2.5862, + "step": 23353 + }, + { + "epoch": 2.1158297660302146, + "grad_norm": 1.0897027254104614, + "learning_rate": 5.8943998066815685e-05, + "loss": 2.9946, + "step": 23354 + }, + { + "epoch": 2.115920364204661, + "grad_norm": 0.8821679949760437, + "learning_rate": 5.8937956865824926e-05, + "loss": 1.8554, + "step": 23355 + }, + { + "epoch": 2.116010962379108, + "grad_norm": 1.0418787002563477, + "learning_rate": 5.8931915664834166e-05, + "loss": 2.5909, + "step": 23356 + }, + { + "epoch": 2.1161015605535547, + "grad_norm": 1.0312118530273438, + "learning_rate": 5.8925874463843414e-05, + "loss": 2.4614, + "step": 23357 + }, + { + "epoch": 2.1161921587280017, + "grad_norm": 0.9671609997749329, + "learning_rate": 5.8919833262852655e-05, + "loss": 2.7411, + "step": 23358 + }, + { + "epoch": 2.1162827569024483, + "grad_norm": 1.0272940397262573, + "learning_rate": 5.89137920618619e-05, + "loss": 2.6661, + "step": 23359 + }, + { + "epoch": 2.1163733550768953, + "grad_norm": 0.9848920702934265, + "learning_rate": 5.890775086087115e-05, + "loss": 2.6949, + "step": 23360 + }, + { + "epoch": 2.116463953251342, + "grad_norm": 0.9666497707366943, + "learning_rate": 5.890170965988039e-05, + "loss": 2.7679, + "step": 23361 + }, + { + "epoch": 2.116554551425789, + "grad_norm": 0.9616955518722534, + "learning_rate": 5.889566845888963e-05, + "loss": 2.7169, + "step": 23362 + }, + { + "epoch": 2.1166451496002354, + "grad_norm": 0.9869728684425354, + "learning_rate": 5.888962725789887e-05, + "loss": 2.6885, + "step": 23363 + }, + { + "epoch": 2.1167357477746824, + "grad_norm": 0.9585897326469421, + "learning_rate": 5.8883586056908113e-05, + "loss": 2.7778, + "step": 23364 + }, + { + "epoch": 2.116826345949129, + "grad_norm": 1.0153573751449585, + "learning_rate": 5.8877544855917354e-05, + "loss": 2.6327, + "step": 23365 + }, + { + "epoch": 2.116916944123576, + "grad_norm": 1.0882656574249268, + "learning_rate": 5.887150365492661e-05, + "loss": 2.2296, + "step": 23366 + }, + { + "epoch": 2.1170075422980226, + "grad_norm": 0.9712031483650208, + "learning_rate": 5.886546245393585e-05, + "loss": 2.481, + "step": 23367 + }, + { + "epoch": 2.1170981404724696, + "grad_norm": 0.8789578676223755, + "learning_rate": 5.885942125294509e-05, + "loss": 2.0339, + "step": 23368 + }, + { + "epoch": 2.117188738646916, + "grad_norm": 1.0407871007919312, + "learning_rate": 5.885338005195433e-05, + "loss": 2.729, + "step": 23369 + }, + { + "epoch": 2.117279336821363, + "grad_norm": 1.0147316455841064, + "learning_rate": 5.884733885096357e-05, + "loss": 2.8526, + "step": 23370 + }, + { + "epoch": 2.1173699349958097, + "grad_norm": 1.0375497341156006, + "learning_rate": 5.884129764997281e-05, + "loss": 2.5507, + "step": 23371 + }, + { + "epoch": 2.1174605331702567, + "grad_norm": 1.008650779724121, + "learning_rate": 5.8835256448982054e-05, + "loss": 2.587, + "step": 23372 + }, + { + "epoch": 2.1175511313447033, + "grad_norm": 0.9817922711372375, + "learning_rate": 5.88292152479913e-05, + "loss": 2.4818, + "step": 23373 + }, + { + "epoch": 2.1176417295191503, + "grad_norm": 1.0153578519821167, + "learning_rate": 5.882317404700055e-05, + "loss": 2.9093, + "step": 23374 + }, + { + "epoch": 2.117732327693597, + "grad_norm": 0.9911856651306152, + "learning_rate": 5.881713284600979e-05, + "loss": 2.4342, + "step": 23375 + }, + { + "epoch": 2.117822925868044, + "grad_norm": 1.0264101028442383, + "learning_rate": 5.881109164501904e-05, + "loss": 2.3392, + "step": 23376 + }, + { + "epoch": 2.1179135240424904, + "grad_norm": 0.965678870677948, + "learning_rate": 5.880505044402828e-05, + "loss": 2.3686, + "step": 23377 + }, + { + "epoch": 2.1180041222169375, + "grad_norm": 0.9854018688201904, + "learning_rate": 5.879900924303752e-05, + "loss": 2.6493, + "step": 23378 + }, + { + "epoch": 2.118094720391384, + "grad_norm": 1.165784239768982, + "learning_rate": 5.879296804204676e-05, + "loss": 2.5866, + "step": 23379 + }, + { + "epoch": 2.118185318565831, + "grad_norm": 1.058669924736023, + "learning_rate": 5.8786926841056e-05, + "loss": 2.353, + "step": 23380 + }, + { + "epoch": 2.1182759167402776, + "grad_norm": 0.8807028532028198, + "learning_rate": 5.8780885640065255e-05, + "loss": 2.3416, + "step": 23381 + }, + { + "epoch": 2.1183665149147246, + "grad_norm": 1.010705828666687, + "learning_rate": 5.8774844439074496e-05, + "loss": 2.8216, + "step": 23382 + }, + { + "epoch": 2.118457113089171, + "grad_norm": 0.987068772315979, + "learning_rate": 5.8768803238083737e-05, + "loss": 2.6098, + "step": 23383 + }, + { + "epoch": 2.118547711263618, + "grad_norm": 1.1400099992752075, + "learning_rate": 5.876276203709298e-05, + "loss": 2.4826, + "step": 23384 + }, + { + "epoch": 2.1186383094380647, + "grad_norm": 1.1430259943008423, + "learning_rate": 5.875672083610222e-05, + "loss": 2.6507, + "step": 23385 + }, + { + "epoch": 2.1187289076125118, + "grad_norm": 0.9879099726676941, + "learning_rate": 5.875067963511146e-05, + "loss": 2.6032, + "step": 23386 + }, + { + "epoch": 2.1188195057869583, + "grad_norm": 1.0072741508483887, + "learning_rate": 5.87446384341207e-05, + "loss": 2.6363, + "step": 23387 + }, + { + "epoch": 2.1189101039614053, + "grad_norm": 0.9773271083831787, + "learning_rate": 5.873859723312994e-05, + "loss": 2.4396, + "step": 23388 + }, + { + "epoch": 2.119000702135852, + "grad_norm": 1.0189279317855835, + "learning_rate": 5.8732556032139195e-05, + "loss": 2.5585, + "step": 23389 + }, + { + "epoch": 2.119091300310299, + "grad_norm": 0.987837016582489, + "learning_rate": 5.8726514831148436e-05, + "loss": 2.5512, + "step": 23390 + }, + { + "epoch": 2.1191818984847455, + "grad_norm": 1.001693844795227, + "learning_rate": 5.872047363015768e-05, + "loss": 2.5096, + "step": 23391 + }, + { + "epoch": 2.1192724966591925, + "grad_norm": 1.0341709852218628, + "learning_rate": 5.8714432429166924e-05, + "loss": 2.6258, + "step": 23392 + }, + { + "epoch": 2.119363094833639, + "grad_norm": 0.991272509098053, + "learning_rate": 5.8708391228176165e-05, + "loss": 2.7179, + "step": 23393 + }, + { + "epoch": 2.119453693008086, + "grad_norm": 0.8962863087654114, + "learning_rate": 5.8702350027185406e-05, + "loss": 1.9393, + "step": 23394 + }, + { + "epoch": 2.1195442911825326, + "grad_norm": 1.1711851358413696, + "learning_rate": 5.869630882619465e-05, + "loss": 2.7326, + "step": 23395 + }, + { + "epoch": 2.1196348893569796, + "grad_norm": 0.9964451789855957, + "learning_rate": 5.86902676252039e-05, + "loss": 2.0344, + "step": 23396 + }, + { + "epoch": 2.119725487531426, + "grad_norm": 1.011993408203125, + "learning_rate": 5.868422642421314e-05, + "loss": 2.4645, + "step": 23397 + }, + { + "epoch": 2.119816085705873, + "grad_norm": 0.932979166507721, + "learning_rate": 5.867818522322238e-05, + "loss": 2.4425, + "step": 23398 + }, + { + "epoch": 2.1199066838803198, + "grad_norm": 1.112338662147522, + "learning_rate": 5.8672144022231624e-05, + "loss": 2.6, + "step": 23399 + }, + { + "epoch": 2.119997282054767, + "grad_norm": 1.0306531190872192, + "learning_rate": 5.8666102821240865e-05, + "loss": 2.6106, + "step": 23400 + }, + { + "epoch": 2.1200878802292134, + "grad_norm": 0.9891301989555359, + "learning_rate": 5.8660061620250105e-05, + "loss": 2.5947, + "step": 23401 + }, + { + "epoch": 2.1201784784036604, + "grad_norm": 1.00649094581604, + "learning_rate": 5.8654020419259346e-05, + "loss": 2.6169, + "step": 23402 + }, + { + "epoch": 2.120269076578107, + "grad_norm": 1.1135612726211548, + "learning_rate": 5.864797921826859e-05, + "loss": 2.6522, + "step": 23403 + }, + { + "epoch": 2.120359674752554, + "grad_norm": 1.0899499654769897, + "learning_rate": 5.864193801727784e-05, + "loss": 2.7247, + "step": 23404 + }, + { + "epoch": 2.1204502729270005, + "grad_norm": 0.9722973108291626, + "learning_rate": 5.863589681628708e-05, + "loss": 2.4794, + "step": 23405 + }, + { + "epoch": 2.1205408711014475, + "grad_norm": 1.0588834285736084, + "learning_rate": 5.862985561529632e-05, + "loss": 2.8775, + "step": 23406 + }, + { + "epoch": 2.120631469275894, + "grad_norm": 1.00043523311615, + "learning_rate": 5.8623814414305564e-05, + "loss": 2.6537, + "step": 23407 + }, + { + "epoch": 2.120722067450341, + "grad_norm": 0.9941033720970154, + "learning_rate": 5.861777321331481e-05, + "loss": 2.6439, + "step": 23408 + }, + { + "epoch": 2.1208126656247877, + "grad_norm": 0.8464045524597168, + "learning_rate": 5.861173201232405e-05, + "loss": 1.7873, + "step": 23409 + }, + { + "epoch": 2.120903263799234, + "grad_norm": 0.883573591709137, + "learning_rate": 5.860569081133329e-05, + "loss": 2.4593, + "step": 23410 + }, + { + "epoch": 2.1209938619736812, + "grad_norm": 0.8597694039344788, + "learning_rate": 5.859964961034254e-05, + "loss": 1.9984, + "step": 23411 + }, + { + "epoch": 2.1210844601481282, + "grad_norm": 0.9453285336494446, + "learning_rate": 5.859360840935179e-05, + "loss": 2.3348, + "step": 23412 + }, + { + "epoch": 2.121175058322575, + "grad_norm": 0.8454073667526245, + "learning_rate": 5.858756720836103e-05, + "loss": 2.0053, + "step": 23413 + }, + { + "epoch": 2.1212656564970214, + "grad_norm": 0.9896373152732849, + "learning_rate": 5.858152600737027e-05, + "loss": 2.7515, + "step": 23414 + }, + { + "epoch": 2.1213562546714684, + "grad_norm": 0.9730123281478882, + "learning_rate": 5.857548480637951e-05, + "loss": 2.5134, + "step": 23415 + }, + { + "epoch": 2.121446852845915, + "grad_norm": 1.0102273225784302, + "learning_rate": 5.856944360538875e-05, + "loss": 2.9132, + "step": 23416 + }, + { + "epoch": 2.121537451020362, + "grad_norm": 0.9630420804023743, + "learning_rate": 5.856340240439799e-05, + "loss": 2.7156, + "step": 23417 + }, + { + "epoch": 2.1216280491948085, + "grad_norm": 1.1829441785812378, + "learning_rate": 5.855736120340723e-05, + "loss": 2.5193, + "step": 23418 + }, + { + "epoch": 2.1217186473692555, + "grad_norm": 1.0045298337936401, + "learning_rate": 5.855132000241649e-05, + "loss": 2.8882, + "step": 23419 + }, + { + "epoch": 2.121809245543702, + "grad_norm": 0.9786355495452881, + "learning_rate": 5.854527880142573e-05, + "loss": 2.5824, + "step": 23420 + }, + { + "epoch": 2.121899843718149, + "grad_norm": 1.0192804336547852, + "learning_rate": 5.853923760043497e-05, + "loss": 2.7618, + "step": 23421 + }, + { + "epoch": 2.1219904418925957, + "grad_norm": 0.9840250015258789, + "learning_rate": 5.853319639944421e-05, + "loss": 2.5341, + "step": 23422 + }, + { + "epoch": 2.1220810400670427, + "grad_norm": 0.9413425326347351, + "learning_rate": 5.852715519845345e-05, + "loss": 2.3414, + "step": 23423 + }, + { + "epoch": 2.1221716382414892, + "grad_norm": 0.9433721899986267, + "learning_rate": 5.85211139974627e-05, + "loss": 2.5893, + "step": 23424 + }, + { + "epoch": 2.1222622364159363, + "grad_norm": 1.0370534658432007, + "learning_rate": 5.851507279647194e-05, + "loss": 2.6092, + "step": 23425 + }, + { + "epoch": 2.122352834590383, + "grad_norm": 1.0078407526016235, + "learning_rate": 5.850903159548119e-05, + "loss": 2.5302, + "step": 23426 + }, + { + "epoch": 2.12244343276483, + "grad_norm": 0.7461893558502197, + "learning_rate": 5.850299039449043e-05, + "loss": 1.394, + "step": 23427 + }, + { + "epoch": 2.1225340309392764, + "grad_norm": 1.0811830759048462, + "learning_rate": 5.8496949193499676e-05, + "loss": 2.3757, + "step": 23428 + }, + { + "epoch": 2.1226246291137234, + "grad_norm": 0.9949172735214233, + "learning_rate": 5.8490907992508916e-05, + "loss": 2.5419, + "step": 23429 + }, + { + "epoch": 2.12271522728817, + "grad_norm": 0.9849228858947754, + "learning_rate": 5.848486679151816e-05, + "loss": 2.8285, + "step": 23430 + }, + { + "epoch": 2.122805825462617, + "grad_norm": 0.9469999670982361, + "learning_rate": 5.84788255905274e-05, + "loss": 2.411, + "step": 23431 + }, + { + "epoch": 2.1228964236370635, + "grad_norm": 0.9629706740379333, + "learning_rate": 5.847278438953664e-05, + "loss": 2.7962, + "step": 23432 + }, + { + "epoch": 2.1229870218115106, + "grad_norm": 0.9912241697311401, + "learning_rate": 5.846674318854588e-05, + "loss": 2.5122, + "step": 23433 + }, + { + "epoch": 2.123077619985957, + "grad_norm": 1.0649473667144775, + "learning_rate": 5.8460701987555134e-05, + "loss": 2.7866, + "step": 23434 + }, + { + "epoch": 2.123168218160404, + "grad_norm": 1.0530195236206055, + "learning_rate": 5.8454660786564375e-05, + "loss": 2.5405, + "step": 23435 + }, + { + "epoch": 2.1232588163348507, + "grad_norm": 1.0194417238235474, + "learning_rate": 5.8448619585573616e-05, + "loss": 2.5377, + "step": 23436 + }, + { + "epoch": 2.1233494145092977, + "grad_norm": 0.9576936960220337, + "learning_rate": 5.8442578384582857e-05, + "loss": 2.5569, + "step": 23437 + }, + { + "epoch": 2.1234400126837443, + "grad_norm": 1.1256746053695679, + "learning_rate": 5.84365371835921e-05, + "loss": 2.2481, + "step": 23438 + }, + { + "epoch": 2.1235306108581913, + "grad_norm": 1.0045744180679321, + "learning_rate": 5.843049598260134e-05, + "loss": 2.5902, + "step": 23439 + }, + { + "epoch": 2.123621209032638, + "grad_norm": 0.9422037601470947, + "learning_rate": 5.8424454781610586e-05, + "loss": 2.4223, + "step": 23440 + }, + { + "epoch": 2.123711807207085, + "grad_norm": 1.0672417879104614, + "learning_rate": 5.8418413580619833e-05, + "loss": 2.4373, + "step": 23441 + }, + { + "epoch": 2.1238024053815314, + "grad_norm": 1.0239936113357544, + "learning_rate": 5.8412372379629074e-05, + "loss": 2.8186, + "step": 23442 + }, + { + "epoch": 2.1238930035559784, + "grad_norm": 1.0293930768966675, + "learning_rate": 5.8406331178638315e-05, + "loss": 2.7296, + "step": 23443 + }, + { + "epoch": 2.123983601730425, + "grad_norm": 1.0199276208877563, + "learning_rate": 5.840028997764756e-05, + "loss": 2.5924, + "step": 23444 + }, + { + "epoch": 2.124074199904872, + "grad_norm": 0.9813863039016724, + "learning_rate": 5.8394248776656804e-05, + "loss": 2.6984, + "step": 23445 + }, + { + "epoch": 2.1241647980793186, + "grad_norm": 0.9679998755455017, + "learning_rate": 5.8388207575666044e-05, + "loss": 2.4836, + "step": 23446 + }, + { + "epoch": 2.1242553962537656, + "grad_norm": 1.0208195447921753, + "learning_rate": 5.8382166374675285e-05, + "loss": 2.6249, + "step": 23447 + }, + { + "epoch": 2.124345994428212, + "grad_norm": 0.9752762913703918, + "learning_rate": 5.8376125173684526e-05, + "loss": 2.6744, + "step": 23448 + }, + { + "epoch": 2.124436592602659, + "grad_norm": 0.830668032169342, + "learning_rate": 5.837008397269378e-05, + "loss": 1.8837, + "step": 23449 + }, + { + "epoch": 2.1245271907771057, + "grad_norm": 0.8995434045791626, + "learning_rate": 5.836404277170302e-05, + "loss": 2.7308, + "step": 23450 + }, + { + "epoch": 2.1246177889515527, + "grad_norm": 0.9469010233879089, + "learning_rate": 5.835800157071226e-05, + "loss": 2.4104, + "step": 23451 + }, + { + "epoch": 2.1247083871259993, + "grad_norm": 0.8952498435974121, + "learning_rate": 5.83519603697215e-05, + "loss": 2.0711, + "step": 23452 + }, + { + "epoch": 2.1247989853004463, + "grad_norm": 1.0813188552856445, + "learning_rate": 5.8345919168730744e-05, + "loss": 2.7344, + "step": 23453 + }, + { + "epoch": 2.124889583474893, + "grad_norm": 0.9686183929443359, + "learning_rate": 5.8339877967739985e-05, + "loss": 2.6824, + "step": 23454 + }, + { + "epoch": 2.12498018164934, + "grad_norm": 1.0258678197860718, + "learning_rate": 5.8333836766749225e-05, + "loss": 2.8534, + "step": 23455 + }, + { + "epoch": 2.1250707798237864, + "grad_norm": 1.049203872680664, + "learning_rate": 5.832779556575848e-05, + "loss": 2.688, + "step": 23456 + }, + { + "epoch": 2.1251613779982335, + "grad_norm": 0.9207696914672852, + "learning_rate": 5.832175436476772e-05, + "loss": 2.7456, + "step": 23457 + }, + { + "epoch": 2.12525197617268, + "grad_norm": 0.9633134007453918, + "learning_rate": 5.831571316377696e-05, + "loss": 2.4546, + "step": 23458 + }, + { + "epoch": 2.125342574347127, + "grad_norm": 1.0205110311508179, + "learning_rate": 5.83096719627862e-05, + "loss": 2.7511, + "step": 23459 + }, + { + "epoch": 2.1254331725215736, + "grad_norm": 0.9865165948867798, + "learning_rate": 5.830363076179545e-05, + "loss": 2.331, + "step": 23460 + }, + { + "epoch": 2.1255237706960206, + "grad_norm": 0.9975360035896301, + "learning_rate": 5.829758956080469e-05, + "loss": 2.3386, + "step": 23461 + }, + { + "epoch": 2.125614368870467, + "grad_norm": 1.052178978919983, + "learning_rate": 5.829154835981393e-05, + "loss": 2.4508, + "step": 23462 + }, + { + "epoch": 2.125704967044914, + "grad_norm": 1.0034384727478027, + "learning_rate": 5.828550715882317e-05, + "loss": 2.5579, + "step": 23463 + }, + { + "epoch": 2.1257955652193607, + "grad_norm": 1.0356212854385376, + "learning_rate": 5.827946595783243e-05, + "loss": 2.3187, + "step": 23464 + }, + { + "epoch": 2.1258861633938078, + "grad_norm": 1.034590721130371, + "learning_rate": 5.827342475684167e-05, + "loss": 2.5611, + "step": 23465 + }, + { + "epoch": 2.1259767615682543, + "grad_norm": 0.9897977113723755, + "learning_rate": 5.826738355585091e-05, + "loss": 2.7154, + "step": 23466 + }, + { + "epoch": 2.1260673597427013, + "grad_norm": 1.0245128870010376, + "learning_rate": 5.826134235486015e-05, + "loss": 2.5726, + "step": 23467 + }, + { + "epoch": 2.126157957917148, + "grad_norm": 0.9797902703285217, + "learning_rate": 5.825530115386939e-05, + "loss": 2.7149, + "step": 23468 + }, + { + "epoch": 2.126248556091595, + "grad_norm": 1.0006195306777954, + "learning_rate": 5.824925995287863e-05, + "loss": 2.6585, + "step": 23469 + }, + { + "epoch": 2.1263391542660415, + "grad_norm": 0.9213246703147888, + "learning_rate": 5.824321875188787e-05, + "loss": 2.4752, + "step": 23470 + }, + { + "epoch": 2.1264297524404885, + "grad_norm": 0.9493767619132996, + "learning_rate": 5.8237177550897126e-05, + "loss": 2.5046, + "step": 23471 + }, + { + "epoch": 2.126520350614935, + "grad_norm": 0.9984090328216553, + "learning_rate": 5.823113634990637e-05, + "loss": 2.6138, + "step": 23472 + }, + { + "epoch": 2.126610948789382, + "grad_norm": 1.104619026184082, + "learning_rate": 5.822509514891561e-05, + "loss": 2.4096, + "step": 23473 + }, + { + "epoch": 2.1267015469638286, + "grad_norm": 0.8416060209274292, + "learning_rate": 5.821905394792485e-05, + "loss": 1.8245, + "step": 23474 + }, + { + "epoch": 2.1267921451382756, + "grad_norm": 0.8544435501098633, + "learning_rate": 5.821301274693409e-05, + "loss": 1.9587, + "step": 23475 + }, + { + "epoch": 2.126882743312722, + "grad_norm": 0.9767008423805237, + "learning_rate": 5.820697154594334e-05, + "loss": 2.7309, + "step": 23476 + }, + { + "epoch": 2.126973341487169, + "grad_norm": 1.0038623809814453, + "learning_rate": 5.820093034495258e-05, + "loss": 2.8955, + "step": 23477 + }, + { + "epoch": 2.1270639396616158, + "grad_norm": 0.9760084748268127, + "learning_rate": 5.819488914396182e-05, + "loss": 2.2963, + "step": 23478 + }, + { + "epoch": 2.127154537836063, + "grad_norm": 0.8875098824501038, + "learning_rate": 5.818884794297107e-05, + "loss": 2.5066, + "step": 23479 + }, + { + "epoch": 2.1272451360105094, + "grad_norm": 0.9613945484161377, + "learning_rate": 5.8182806741980314e-05, + "loss": 2.3713, + "step": 23480 + }, + { + "epoch": 2.1273357341849564, + "grad_norm": 0.9330441951751709, + "learning_rate": 5.8176765540989555e-05, + "loss": 2.8255, + "step": 23481 + }, + { + "epoch": 2.127426332359403, + "grad_norm": 0.8413888812065125, + "learning_rate": 5.8170724339998796e-05, + "loss": 2.0269, + "step": 23482 + }, + { + "epoch": 2.12751693053385, + "grad_norm": 0.938715934753418, + "learning_rate": 5.8164683139008036e-05, + "loss": 2.6702, + "step": 23483 + }, + { + "epoch": 2.1276075287082965, + "grad_norm": 0.9977558851242065, + "learning_rate": 5.815864193801728e-05, + "loss": 2.8556, + "step": 23484 + }, + { + "epoch": 2.1276981268827435, + "grad_norm": 0.9670853614807129, + "learning_rate": 5.815260073702652e-05, + "loss": 1.72, + "step": 23485 + }, + { + "epoch": 2.12778872505719, + "grad_norm": 0.9853187799453735, + "learning_rate": 5.814655953603577e-05, + "loss": 2.6603, + "step": 23486 + }, + { + "epoch": 2.127879323231637, + "grad_norm": 1.06461763381958, + "learning_rate": 5.814051833504501e-05, + "loss": 2.6961, + "step": 23487 + }, + { + "epoch": 2.1279699214060837, + "grad_norm": 1.0650893449783325, + "learning_rate": 5.8134477134054254e-05, + "loss": 2.4123, + "step": 23488 + }, + { + "epoch": 2.12806051958053, + "grad_norm": 0.9651289582252502, + "learning_rate": 5.8128435933063495e-05, + "loss": 2.8525, + "step": 23489 + }, + { + "epoch": 2.1281511177549772, + "grad_norm": 0.9801304340362549, + "learning_rate": 5.8122394732072736e-05, + "loss": 2.8285, + "step": 23490 + }, + { + "epoch": 2.1282417159294242, + "grad_norm": 1.0266770124435425, + "learning_rate": 5.8116353531081977e-05, + "loss": 2.8069, + "step": 23491 + }, + { + "epoch": 2.128332314103871, + "grad_norm": 0.9434162974357605, + "learning_rate": 5.8110312330091224e-05, + "loss": 2.9029, + "step": 23492 + }, + { + "epoch": 2.1284229122783174, + "grad_norm": 0.758673906326294, + "learning_rate": 5.8104271129100465e-05, + "loss": 1.3156, + "step": 23493 + }, + { + "epoch": 2.1285135104527644, + "grad_norm": 1.0800938606262207, + "learning_rate": 5.809822992810971e-05, + "loss": 2.5918, + "step": 23494 + }, + { + "epoch": 2.1286041086272114, + "grad_norm": 0.9855385422706604, + "learning_rate": 5.809218872711896e-05, + "loss": 2.4612, + "step": 23495 + }, + { + "epoch": 2.128694706801658, + "grad_norm": 0.9633838534355164, + "learning_rate": 5.80861475261282e-05, + "loss": 2.3208, + "step": 23496 + }, + { + "epoch": 2.1287853049761045, + "grad_norm": 0.9030390381813049, + "learning_rate": 5.808010632513744e-05, + "loss": 1.916, + "step": 23497 + }, + { + "epoch": 2.1288759031505515, + "grad_norm": 1.1422030925750732, + "learning_rate": 5.807406512414668e-05, + "loss": 2.5234, + "step": 23498 + }, + { + "epoch": 2.128966501324998, + "grad_norm": 0.9456514716148376, + "learning_rate": 5.8068023923155924e-05, + "loss": 2.6327, + "step": 23499 + }, + { + "epoch": 2.129057099499445, + "grad_norm": 1.143214225769043, + "learning_rate": 5.8061982722165164e-05, + "loss": 2.3324, + "step": 23500 + }, + { + "epoch": 2.1291476976738917, + "grad_norm": 1.0282175540924072, + "learning_rate": 5.805594152117442e-05, + "loss": 2.7025, + "step": 23501 + }, + { + "epoch": 2.1292382958483387, + "grad_norm": 0.9407945871353149, + "learning_rate": 5.804990032018366e-05, + "loss": 2.6558, + "step": 23502 + }, + { + "epoch": 2.1293288940227852, + "grad_norm": 1.046287178993225, + "learning_rate": 5.80438591191929e-05, + "loss": 2.4329, + "step": 23503 + }, + { + "epoch": 2.1294194921972323, + "grad_norm": 0.9098972678184509, + "learning_rate": 5.803781791820214e-05, + "loss": 2.7165, + "step": 23504 + }, + { + "epoch": 2.129510090371679, + "grad_norm": 0.8476859927177429, + "learning_rate": 5.803177671721138e-05, + "loss": 2.0329, + "step": 23505 + }, + { + "epoch": 2.129600688546126, + "grad_norm": 1.0510175228118896, + "learning_rate": 5.802573551622062e-05, + "loss": 2.589, + "step": 23506 + }, + { + "epoch": 2.1296912867205724, + "grad_norm": 0.9921206831932068, + "learning_rate": 5.8019694315229864e-05, + "loss": 2.2278, + "step": 23507 + }, + { + "epoch": 2.1297818848950194, + "grad_norm": 1.0570483207702637, + "learning_rate": 5.801365311423911e-05, + "loss": 2.8479, + "step": 23508 + }, + { + "epoch": 2.129872483069466, + "grad_norm": 0.866459846496582, + "learning_rate": 5.800761191324836e-05, + "loss": 2.1489, + "step": 23509 + }, + { + "epoch": 2.129963081243913, + "grad_norm": 0.9419288635253906, + "learning_rate": 5.80015707122576e-05, + "loss": 2.5087, + "step": 23510 + }, + { + "epoch": 2.1300536794183595, + "grad_norm": 0.9635589718818665, + "learning_rate": 5.799552951126684e-05, + "loss": 2.7167, + "step": 23511 + }, + { + "epoch": 2.1301442775928066, + "grad_norm": 1.1439542770385742, + "learning_rate": 5.798948831027609e-05, + "loss": 2.5786, + "step": 23512 + }, + { + "epoch": 2.130234875767253, + "grad_norm": 0.9480549097061157, + "learning_rate": 5.798344710928533e-05, + "loss": 2.3998, + "step": 23513 + }, + { + "epoch": 2.1303254739417, + "grad_norm": 0.96586674451828, + "learning_rate": 5.797740590829457e-05, + "loss": 2.781, + "step": 23514 + }, + { + "epoch": 2.1304160721161467, + "grad_norm": 0.9508797526359558, + "learning_rate": 5.797136470730381e-05, + "loss": 2.821, + "step": 23515 + }, + { + "epoch": 2.1305066702905937, + "grad_norm": 0.9983242154121399, + "learning_rate": 5.7965323506313065e-05, + "loss": 2.5407, + "step": 23516 + }, + { + "epoch": 2.1305972684650403, + "grad_norm": 1.0047845840454102, + "learning_rate": 5.7959282305322306e-05, + "loss": 2.519, + "step": 23517 + }, + { + "epoch": 2.1306878666394873, + "grad_norm": 1.0839086771011353, + "learning_rate": 5.795324110433155e-05, + "loss": 2.5531, + "step": 23518 + }, + { + "epoch": 2.130778464813934, + "grad_norm": 0.9792038798332214, + "learning_rate": 5.794719990334079e-05, + "loss": 2.4194, + "step": 23519 + }, + { + "epoch": 2.130869062988381, + "grad_norm": 1.0176740884780884, + "learning_rate": 5.794115870235003e-05, + "loss": 2.4178, + "step": 23520 + }, + { + "epoch": 2.1309596611628274, + "grad_norm": 1.0025699138641357, + "learning_rate": 5.793511750135927e-05, + "loss": 2.5952, + "step": 23521 + }, + { + "epoch": 2.1310502593372744, + "grad_norm": 1.0321890115737915, + "learning_rate": 5.792907630036851e-05, + "loss": 2.441, + "step": 23522 + }, + { + "epoch": 2.131140857511721, + "grad_norm": 0.9986928701400757, + "learning_rate": 5.792303509937775e-05, + "loss": 2.6152, + "step": 23523 + }, + { + "epoch": 2.131231455686168, + "grad_norm": 0.9061464071273804, + "learning_rate": 5.7916993898387005e-05, + "loss": 1.9891, + "step": 23524 + }, + { + "epoch": 2.1313220538606146, + "grad_norm": 1.005599021911621, + "learning_rate": 5.7910952697396246e-05, + "loss": 2.8212, + "step": 23525 + }, + { + "epoch": 2.1314126520350616, + "grad_norm": 0.9725895524024963, + "learning_rate": 5.790491149640549e-05, + "loss": 2.5009, + "step": 23526 + }, + { + "epoch": 2.131503250209508, + "grad_norm": 1.0222904682159424, + "learning_rate": 5.789887029541473e-05, + "loss": 2.6146, + "step": 23527 + }, + { + "epoch": 2.131593848383955, + "grad_norm": 0.8851461410522461, + "learning_rate": 5.7892829094423975e-05, + "loss": 1.9357, + "step": 23528 + }, + { + "epoch": 2.1316844465584017, + "grad_norm": 0.9934079647064209, + "learning_rate": 5.7886787893433216e-05, + "loss": 2.5858, + "step": 23529 + }, + { + "epoch": 2.1317750447328487, + "grad_norm": 1.0414592027664185, + "learning_rate": 5.788074669244246e-05, + "loss": 2.6561, + "step": 23530 + }, + { + "epoch": 2.1318656429072953, + "grad_norm": 0.9420262575149536, + "learning_rate": 5.787470549145171e-05, + "loss": 2.5873, + "step": 23531 + }, + { + "epoch": 2.1319562410817423, + "grad_norm": 0.9740713238716125, + "learning_rate": 5.786866429046095e-05, + "loss": 2.5024, + "step": 23532 + }, + { + "epoch": 2.132046839256189, + "grad_norm": 1.0102739334106445, + "learning_rate": 5.786262308947019e-05, + "loss": 2.3629, + "step": 23533 + }, + { + "epoch": 2.132137437430636, + "grad_norm": 1.0314984321594238, + "learning_rate": 5.7856581888479434e-05, + "loss": 2.49, + "step": 23534 + }, + { + "epoch": 2.1322280356050825, + "grad_norm": 1.035811424255371, + "learning_rate": 5.7850540687488675e-05, + "loss": 2.7306, + "step": 23535 + }, + { + "epoch": 2.1323186337795295, + "grad_norm": 1.0266518592834473, + "learning_rate": 5.7844499486497916e-05, + "loss": 2.697, + "step": 23536 + }, + { + "epoch": 2.132409231953976, + "grad_norm": 1.0401579141616821, + "learning_rate": 5.7838458285507156e-05, + "loss": 2.5195, + "step": 23537 + }, + { + "epoch": 2.132499830128423, + "grad_norm": 1.006014347076416, + "learning_rate": 5.78324170845164e-05, + "loss": 2.4486, + "step": 23538 + }, + { + "epoch": 2.1325904283028696, + "grad_norm": 0.9772194623947144, + "learning_rate": 5.782637588352565e-05, + "loss": 2.6298, + "step": 23539 + }, + { + "epoch": 2.1326810264773166, + "grad_norm": 1.0299558639526367, + "learning_rate": 5.782033468253489e-05, + "loss": 2.4549, + "step": 23540 + }, + { + "epoch": 2.132771624651763, + "grad_norm": 1.0397961139678955, + "learning_rate": 5.781429348154413e-05, + "loss": 2.6995, + "step": 23541 + }, + { + "epoch": 2.13286222282621, + "grad_norm": 0.9904382228851318, + "learning_rate": 5.7808252280553374e-05, + "loss": 2.6549, + "step": 23542 + }, + { + "epoch": 2.1329528210006568, + "grad_norm": 1.058803915977478, + "learning_rate": 5.7802211079562615e-05, + "loss": 2.6321, + "step": 23543 + }, + { + "epoch": 2.1330434191751038, + "grad_norm": 0.9696104526519775, + "learning_rate": 5.779616987857186e-05, + "loss": 2.4704, + "step": 23544 + }, + { + "epoch": 2.1331340173495503, + "grad_norm": 1.025243878364563, + "learning_rate": 5.77901286775811e-05, + "loss": 2.7442, + "step": 23545 + }, + { + "epoch": 2.1332246155239973, + "grad_norm": 0.9680338501930237, + "learning_rate": 5.778408747659035e-05, + "loss": 2.0462, + "step": 23546 + }, + { + "epoch": 2.133315213698444, + "grad_norm": 1.0562388896942139, + "learning_rate": 5.77780462755996e-05, + "loss": 2.6607, + "step": 23547 + }, + { + "epoch": 2.133405811872891, + "grad_norm": 1.0355658531188965, + "learning_rate": 5.777200507460884e-05, + "loss": 2.539, + "step": 23548 + }, + { + "epoch": 2.1334964100473375, + "grad_norm": 0.9680585265159607, + "learning_rate": 5.776596387361808e-05, + "loss": 2.5153, + "step": 23549 + }, + { + "epoch": 2.1335870082217845, + "grad_norm": 1.016360878944397, + "learning_rate": 5.775992267262732e-05, + "loss": 2.6591, + "step": 23550 + }, + { + "epoch": 2.133677606396231, + "grad_norm": 1.0142687559127808, + "learning_rate": 5.775388147163656e-05, + "loss": 2.9305, + "step": 23551 + }, + { + "epoch": 2.133768204570678, + "grad_norm": 1.0372601747512817, + "learning_rate": 5.77478402706458e-05, + "loss": 2.6247, + "step": 23552 + }, + { + "epoch": 2.1338588027451246, + "grad_norm": 1.0393428802490234, + "learning_rate": 5.7741799069655044e-05, + "loss": 2.6471, + "step": 23553 + }, + { + "epoch": 2.1339494009195716, + "grad_norm": 0.9921740293502808, + "learning_rate": 5.77357578686643e-05, + "loss": 2.7031, + "step": 23554 + }, + { + "epoch": 2.134039999094018, + "grad_norm": 0.9768416285514832, + "learning_rate": 5.772971666767354e-05, + "loss": 2.7479, + "step": 23555 + }, + { + "epoch": 2.134130597268465, + "grad_norm": 0.9952223896980286, + "learning_rate": 5.772367546668278e-05, + "loss": 2.6569, + "step": 23556 + }, + { + "epoch": 2.134221195442912, + "grad_norm": 0.9905052781105042, + "learning_rate": 5.771763426569202e-05, + "loss": 2.6552, + "step": 23557 + }, + { + "epoch": 2.134311793617359, + "grad_norm": 0.9803889393806458, + "learning_rate": 5.771159306470126e-05, + "loss": 2.4924, + "step": 23558 + }, + { + "epoch": 2.1344023917918054, + "grad_norm": 1.001951813697815, + "learning_rate": 5.77055518637105e-05, + "loss": 2.8834, + "step": 23559 + }, + { + "epoch": 2.1344929899662524, + "grad_norm": 0.9392244219779968, + "learning_rate": 5.769951066271975e-05, + "loss": 2.6274, + "step": 23560 + }, + { + "epoch": 2.134583588140699, + "grad_norm": 0.9976236820220947, + "learning_rate": 5.7693469461729e-05, + "loss": 2.6399, + "step": 23561 + }, + { + "epoch": 2.134674186315146, + "grad_norm": 0.9956968426704407, + "learning_rate": 5.768742826073824e-05, + "loss": 2.6469, + "step": 23562 + }, + { + "epoch": 2.1347647844895925, + "grad_norm": 1.0624055862426758, + "learning_rate": 5.7681387059747486e-05, + "loss": 2.7201, + "step": 23563 + }, + { + "epoch": 2.1348553826640395, + "grad_norm": 1.0970691442489624, + "learning_rate": 5.7675345858756727e-05, + "loss": 2.5583, + "step": 23564 + }, + { + "epoch": 2.134945980838486, + "grad_norm": 0.9455724358558655, + "learning_rate": 5.766930465776597e-05, + "loss": 2.5878, + "step": 23565 + }, + { + "epoch": 2.135036579012933, + "grad_norm": 0.9678584337234497, + "learning_rate": 5.766326345677521e-05, + "loss": 2.5548, + "step": 23566 + }, + { + "epoch": 2.1351271771873797, + "grad_norm": 1.0115158557891846, + "learning_rate": 5.765722225578445e-05, + "loss": 2.3308, + "step": 23567 + }, + { + "epoch": 2.1352177753618267, + "grad_norm": 1.0173953771591187, + "learning_rate": 5.765118105479369e-05, + "loss": 2.6439, + "step": 23568 + }, + { + "epoch": 2.1353083735362732, + "grad_norm": 0.9370059370994568, + "learning_rate": 5.7645139853802944e-05, + "loss": 2.4738, + "step": 23569 + }, + { + "epoch": 2.1353989717107202, + "grad_norm": 1.0414565801620483, + "learning_rate": 5.7639098652812185e-05, + "loss": 2.8317, + "step": 23570 + }, + { + "epoch": 2.135489569885167, + "grad_norm": 0.8654468655586243, + "learning_rate": 5.7633057451821426e-05, + "loss": 2.0094, + "step": 23571 + }, + { + "epoch": 2.1355801680596134, + "grad_norm": 1.039306640625, + "learning_rate": 5.762701625083067e-05, + "loss": 2.3154, + "step": 23572 + }, + { + "epoch": 2.1356707662340604, + "grad_norm": 0.9722877144813538, + "learning_rate": 5.762097504983991e-05, + "loss": 2.2028, + "step": 23573 + }, + { + "epoch": 2.1357613644085074, + "grad_norm": 0.9158585071563721, + "learning_rate": 5.761493384884915e-05, + "loss": 2.2089, + "step": 23574 + }, + { + "epoch": 2.135851962582954, + "grad_norm": 1.0060409307479858, + "learning_rate": 5.760889264785839e-05, + "loss": 2.5115, + "step": 23575 + }, + { + "epoch": 2.1359425607574005, + "grad_norm": 0.955980658531189, + "learning_rate": 5.7602851446867644e-05, + "loss": 2.7265, + "step": 23576 + }, + { + "epoch": 2.1360331589318475, + "grad_norm": 0.998731255531311, + "learning_rate": 5.7596810245876884e-05, + "loss": 2.6282, + "step": 23577 + }, + { + "epoch": 2.1361237571062945, + "grad_norm": 1.1222736835479736, + "learning_rate": 5.7590769044886125e-05, + "loss": 2.5216, + "step": 23578 + }, + { + "epoch": 2.136214355280741, + "grad_norm": 1.0349446535110474, + "learning_rate": 5.758472784389537e-05, + "loss": 2.6233, + "step": 23579 + }, + { + "epoch": 2.1363049534551877, + "grad_norm": 1.0206722021102905, + "learning_rate": 5.7578686642904614e-05, + "loss": 2.6566, + "step": 23580 + }, + { + "epoch": 2.1363955516296347, + "grad_norm": 0.8994844555854797, + "learning_rate": 5.7572645441913855e-05, + "loss": 2.612, + "step": 23581 + }, + { + "epoch": 2.1364861498040812, + "grad_norm": 0.9626255631446838, + "learning_rate": 5.7566604240923095e-05, + "loss": 2.4191, + "step": 23582 + }, + { + "epoch": 2.1365767479785283, + "grad_norm": 1.044543981552124, + "learning_rate": 5.7560563039932336e-05, + "loss": 2.5125, + "step": 23583 + }, + { + "epoch": 2.136667346152975, + "grad_norm": 0.9651875495910645, + "learning_rate": 5.755452183894159e-05, + "loss": 2.6165, + "step": 23584 + }, + { + "epoch": 2.136757944327422, + "grad_norm": 0.8406783938407898, + "learning_rate": 5.754848063795083e-05, + "loss": 1.9655, + "step": 23585 + }, + { + "epoch": 2.1368485425018684, + "grad_norm": 1.043570876121521, + "learning_rate": 5.754243943696007e-05, + "loss": 2.4955, + "step": 23586 + }, + { + "epoch": 2.1369391406763154, + "grad_norm": 1.016523838043213, + "learning_rate": 5.753639823596931e-05, + "loss": 2.722, + "step": 23587 + }, + { + "epoch": 2.137029738850762, + "grad_norm": 0.9102354049682617, + "learning_rate": 5.7530357034978554e-05, + "loss": 2.5106, + "step": 23588 + }, + { + "epoch": 2.137120337025209, + "grad_norm": 0.9482591152191162, + "learning_rate": 5.7524315833987795e-05, + "loss": 2.5259, + "step": 23589 + }, + { + "epoch": 2.1372109351996555, + "grad_norm": 1.0179740190505981, + "learning_rate": 5.7518274632997036e-05, + "loss": 2.7353, + "step": 23590 + }, + { + "epoch": 2.1373015333741026, + "grad_norm": 1.0180096626281738, + "learning_rate": 5.751223343200629e-05, + "loss": 2.5372, + "step": 23591 + }, + { + "epoch": 2.137392131548549, + "grad_norm": 0.9855421185493469, + "learning_rate": 5.750619223101553e-05, + "loss": 2.5194, + "step": 23592 + }, + { + "epoch": 2.137482729722996, + "grad_norm": 1.0389971733093262, + "learning_rate": 5.750015103002477e-05, + "loss": 2.5368, + "step": 23593 + }, + { + "epoch": 2.1375733278974427, + "grad_norm": 0.9989765286445618, + "learning_rate": 5.749410982903401e-05, + "loss": 2.6467, + "step": 23594 + }, + { + "epoch": 2.1376639260718897, + "grad_norm": 1.0212682485580444, + "learning_rate": 5.748806862804326e-05, + "loss": 2.7957, + "step": 23595 + }, + { + "epoch": 2.1377545242463363, + "grad_norm": 0.9051060080528259, + "learning_rate": 5.74820274270525e-05, + "loss": 2.4667, + "step": 23596 + }, + { + "epoch": 2.1378451224207833, + "grad_norm": 0.9935570359230042, + "learning_rate": 5.747598622606174e-05, + "loss": 2.5725, + "step": 23597 + }, + { + "epoch": 2.13793572059523, + "grad_norm": 0.9392523169517517, + "learning_rate": 5.746994502507098e-05, + "loss": 2.5057, + "step": 23598 + }, + { + "epoch": 2.138026318769677, + "grad_norm": 0.993367075920105, + "learning_rate": 5.746390382408024e-05, + "loss": 2.5352, + "step": 23599 + }, + { + "epoch": 2.1381169169441234, + "grad_norm": 0.8966637849807739, + "learning_rate": 5.745786262308948e-05, + "loss": 2.0103, + "step": 23600 + }, + { + "epoch": 2.1382075151185704, + "grad_norm": 1.0331345796585083, + "learning_rate": 5.745182142209872e-05, + "loss": 2.8402, + "step": 23601 + }, + { + "epoch": 2.138298113293017, + "grad_norm": 0.9743503332138062, + "learning_rate": 5.744578022110796e-05, + "loss": 2.707, + "step": 23602 + }, + { + "epoch": 2.138388711467464, + "grad_norm": 1.0951296091079712, + "learning_rate": 5.74397390201172e-05, + "loss": 2.6877, + "step": 23603 + }, + { + "epoch": 2.1384793096419106, + "grad_norm": 1.0393379926681519, + "learning_rate": 5.743369781912644e-05, + "loss": 2.9607, + "step": 23604 + }, + { + "epoch": 2.1385699078163576, + "grad_norm": 0.9310993552207947, + "learning_rate": 5.742765661813568e-05, + "loss": 2.3902, + "step": 23605 + }, + { + "epoch": 2.138660505990804, + "grad_norm": 0.8141733407974243, + "learning_rate": 5.7421615417144936e-05, + "loss": 1.9037, + "step": 23606 + }, + { + "epoch": 2.138751104165251, + "grad_norm": 1.0358686447143555, + "learning_rate": 5.741557421615418e-05, + "loss": 2.7049, + "step": 23607 + }, + { + "epoch": 2.1388417023396977, + "grad_norm": 0.9604774713516235, + "learning_rate": 5.740953301516342e-05, + "loss": 2.5176, + "step": 23608 + }, + { + "epoch": 2.1389323005141447, + "grad_norm": 0.9066134691238403, + "learning_rate": 5.740349181417266e-05, + "loss": 2.5042, + "step": 23609 + }, + { + "epoch": 2.1390228986885913, + "grad_norm": 1.0314478874206543, + "learning_rate": 5.73974506131819e-05, + "loss": 2.7519, + "step": 23610 + }, + { + "epoch": 2.1391134968630383, + "grad_norm": 0.975639820098877, + "learning_rate": 5.739140941219115e-05, + "loss": 2.564, + "step": 23611 + }, + { + "epoch": 2.139204095037485, + "grad_norm": 1.0866321325302124, + "learning_rate": 5.738536821120039e-05, + "loss": 2.6175, + "step": 23612 + }, + { + "epoch": 2.139294693211932, + "grad_norm": 0.8138881921768188, + "learning_rate": 5.737932701020963e-05, + "loss": 1.7645, + "step": 23613 + }, + { + "epoch": 2.1393852913863785, + "grad_norm": 1.013074517250061, + "learning_rate": 5.7373285809218876e-05, + "loss": 2.6896, + "step": 23614 + }, + { + "epoch": 2.1394758895608255, + "grad_norm": 1.0668972730636597, + "learning_rate": 5.7367244608228124e-05, + "loss": 2.7535, + "step": 23615 + }, + { + "epoch": 2.139566487735272, + "grad_norm": 0.9841393828392029, + "learning_rate": 5.7361203407237365e-05, + "loss": 2.5234, + "step": 23616 + }, + { + "epoch": 2.139657085909719, + "grad_norm": 0.9603144526481628, + "learning_rate": 5.7355162206246606e-05, + "loss": 2.724, + "step": 23617 + }, + { + "epoch": 2.1397476840841656, + "grad_norm": 1.0261348485946655, + "learning_rate": 5.7349121005255847e-05, + "loss": 2.6994, + "step": 23618 + }, + { + "epoch": 2.1398382822586126, + "grad_norm": 1.0487632751464844, + "learning_rate": 5.734307980426509e-05, + "loss": 2.6959, + "step": 23619 + }, + { + "epoch": 2.139928880433059, + "grad_norm": 0.9947928786277771, + "learning_rate": 5.733703860327433e-05, + "loss": 2.5517, + "step": 23620 + }, + { + "epoch": 2.140019478607506, + "grad_norm": 0.9547750353813171, + "learning_rate": 5.733099740228358e-05, + "loss": 2.6137, + "step": 23621 + }, + { + "epoch": 2.1401100767819528, + "grad_norm": 0.9754071235656738, + "learning_rate": 5.732495620129282e-05, + "loss": 2.7911, + "step": 23622 + }, + { + "epoch": 2.1402006749563998, + "grad_norm": 0.9960466623306274, + "learning_rate": 5.7318915000302064e-05, + "loss": 2.7337, + "step": 23623 + }, + { + "epoch": 2.1402912731308463, + "grad_norm": 1.027715802192688, + "learning_rate": 5.7312873799311305e-05, + "loss": 2.1264, + "step": 23624 + }, + { + "epoch": 2.1403818713052933, + "grad_norm": 0.8208296895027161, + "learning_rate": 5.7306832598320546e-05, + "loss": 2.0264, + "step": 23625 + }, + { + "epoch": 2.14047246947974, + "grad_norm": 0.9634917378425598, + "learning_rate": 5.730079139732979e-05, + "loss": 2.5699, + "step": 23626 + }, + { + "epoch": 2.140563067654187, + "grad_norm": 0.9659220576286316, + "learning_rate": 5.7294750196339034e-05, + "loss": 2.582, + "step": 23627 + }, + { + "epoch": 2.1406536658286335, + "grad_norm": 0.9974837899208069, + "learning_rate": 5.7288708995348275e-05, + "loss": 2.782, + "step": 23628 + }, + { + "epoch": 2.1407442640030805, + "grad_norm": 0.9334555268287659, + "learning_rate": 5.728266779435752e-05, + "loss": 2.7876, + "step": 23629 + }, + { + "epoch": 2.140834862177527, + "grad_norm": 0.838541567325592, + "learning_rate": 5.7276626593366764e-05, + "loss": 2.0518, + "step": 23630 + }, + { + "epoch": 2.140925460351974, + "grad_norm": 0.9043042659759521, + "learning_rate": 5.727058539237601e-05, + "loss": 2.504, + "step": 23631 + }, + { + "epoch": 2.1410160585264206, + "grad_norm": 1.0604952573776245, + "learning_rate": 5.726454419138525e-05, + "loss": 2.7066, + "step": 23632 + }, + { + "epoch": 2.1411066567008676, + "grad_norm": 0.9401586651802063, + "learning_rate": 5.725850299039449e-05, + "loss": 2.5582, + "step": 23633 + }, + { + "epoch": 2.141197254875314, + "grad_norm": 1.0885978937149048, + "learning_rate": 5.7252461789403734e-05, + "loss": 2.4112, + "step": 23634 + }, + { + "epoch": 2.141287853049761, + "grad_norm": 0.9779587984085083, + "learning_rate": 5.7246420588412974e-05, + "loss": 2.5291, + "step": 23635 + }, + { + "epoch": 2.141378451224208, + "grad_norm": 0.9479103088378906, + "learning_rate": 5.724037938742223e-05, + "loss": 2.6914, + "step": 23636 + }, + { + "epoch": 2.141469049398655, + "grad_norm": 1.124435305595398, + "learning_rate": 5.723433818643147e-05, + "loss": 1.8333, + "step": 23637 + }, + { + "epoch": 2.1415596475731014, + "grad_norm": 0.9713464975357056, + "learning_rate": 5.722829698544071e-05, + "loss": 2.6517, + "step": 23638 + }, + { + "epoch": 2.1416502457475484, + "grad_norm": 0.9998056292533875, + "learning_rate": 5.722225578444995e-05, + "loss": 2.7157, + "step": 23639 + }, + { + "epoch": 2.141740843921995, + "grad_norm": 1.0299545526504517, + "learning_rate": 5.721621458345919e-05, + "loss": 2.8582, + "step": 23640 + }, + { + "epoch": 2.141831442096442, + "grad_norm": 0.9841257333755493, + "learning_rate": 5.721017338246843e-05, + "loss": 2.5714, + "step": 23641 + }, + { + "epoch": 2.1419220402708885, + "grad_norm": 1.0240240097045898, + "learning_rate": 5.7204132181477674e-05, + "loss": 2.6468, + "step": 23642 + }, + { + "epoch": 2.1420126384453355, + "grad_norm": 1.0118074417114258, + "learning_rate": 5.7198090980486915e-05, + "loss": 2.8069, + "step": 23643 + }, + { + "epoch": 2.142103236619782, + "grad_norm": 0.8257700204849243, + "learning_rate": 5.719204977949617e-05, + "loss": 1.9831, + "step": 23644 + }, + { + "epoch": 2.142193834794229, + "grad_norm": 0.9996439814567566, + "learning_rate": 5.718600857850541e-05, + "loss": 2.8298, + "step": 23645 + }, + { + "epoch": 2.1422844329686757, + "grad_norm": 1.1483361721038818, + "learning_rate": 5.717996737751465e-05, + "loss": 2.3351, + "step": 23646 + }, + { + "epoch": 2.1423750311431227, + "grad_norm": 0.9781690835952759, + "learning_rate": 5.71739261765239e-05, + "loss": 2.5061, + "step": 23647 + }, + { + "epoch": 2.1424656293175692, + "grad_norm": 0.924403727054596, + "learning_rate": 5.716788497553314e-05, + "loss": 2.7179, + "step": 23648 + }, + { + "epoch": 2.1425562274920162, + "grad_norm": 0.9331015348434448, + "learning_rate": 5.716184377454238e-05, + "loss": 2.4591, + "step": 23649 + }, + { + "epoch": 2.142646825666463, + "grad_norm": 1.0166926383972168, + "learning_rate": 5.715580257355162e-05, + "loss": 2.7796, + "step": 23650 + }, + { + "epoch": 2.1427374238409094, + "grad_norm": 0.9819971919059753, + "learning_rate": 5.7149761372560875e-05, + "loss": 2.5834, + "step": 23651 + }, + { + "epoch": 2.1428280220153564, + "grad_norm": 1.011093020439148, + "learning_rate": 5.7143720171570116e-05, + "loss": 2.53, + "step": 23652 + }, + { + "epoch": 2.1429186201898034, + "grad_norm": 0.8724699020385742, + "learning_rate": 5.713767897057936e-05, + "loss": 1.9515, + "step": 23653 + }, + { + "epoch": 2.14300921836425, + "grad_norm": 1.004512071609497, + "learning_rate": 5.71316377695886e-05, + "loss": 2.4132, + "step": 23654 + }, + { + "epoch": 2.1430998165386965, + "grad_norm": 0.9684231281280518, + "learning_rate": 5.712559656859784e-05, + "loss": 2.6168, + "step": 23655 + }, + { + "epoch": 2.1431904147131435, + "grad_norm": 1.0016814470291138, + "learning_rate": 5.711955536760708e-05, + "loss": 2.5082, + "step": 23656 + }, + { + "epoch": 2.1432810128875905, + "grad_norm": 0.9324401617050171, + "learning_rate": 5.711351416661632e-05, + "loss": 2.5236, + "step": 23657 + }, + { + "epoch": 2.143371611062037, + "grad_norm": 1.0294382572174072, + "learning_rate": 5.710747296562556e-05, + "loss": 3.0174, + "step": 23658 + }, + { + "epoch": 2.1434622092364837, + "grad_norm": 0.9698994159698486, + "learning_rate": 5.7101431764634815e-05, + "loss": 2.5156, + "step": 23659 + }, + { + "epoch": 2.1435528074109307, + "grad_norm": 0.9986891746520996, + "learning_rate": 5.7095390563644056e-05, + "loss": 2.7292, + "step": 23660 + }, + { + "epoch": 2.1436434055853772, + "grad_norm": 0.9652001857757568, + "learning_rate": 5.70893493626533e-05, + "loss": 2.874, + "step": 23661 + }, + { + "epoch": 2.1437340037598243, + "grad_norm": 1.005010724067688, + "learning_rate": 5.708330816166254e-05, + "loss": 2.5294, + "step": 23662 + }, + { + "epoch": 2.143824601934271, + "grad_norm": 0.9960439801216125, + "learning_rate": 5.7077266960671785e-05, + "loss": 2.7994, + "step": 23663 + }, + { + "epoch": 2.143915200108718, + "grad_norm": 0.9606835842132568, + "learning_rate": 5.7071225759681026e-05, + "loss": 2.6492, + "step": 23664 + }, + { + "epoch": 2.1440057982831644, + "grad_norm": 0.7028384804725647, + "learning_rate": 5.706518455869027e-05, + "loss": 1.3017, + "step": 23665 + }, + { + "epoch": 2.1440963964576114, + "grad_norm": 1.0545105934143066, + "learning_rate": 5.705914335769952e-05, + "loss": 2.7785, + "step": 23666 + }, + { + "epoch": 2.144186994632058, + "grad_norm": 1.0899142026901245, + "learning_rate": 5.705310215670876e-05, + "loss": 2.5556, + "step": 23667 + }, + { + "epoch": 2.144277592806505, + "grad_norm": 0.9478514194488525, + "learning_rate": 5.7047060955718e-05, + "loss": 2.3264, + "step": 23668 + }, + { + "epoch": 2.1443681909809515, + "grad_norm": 0.9658775329589844, + "learning_rate": 5.7041019754727244e-05, + "loss": 2.5385, + "step": 23669 + }, + { + "epoch": 2.1444587891553986, + "grad_norm": 1.0015215873718262, + "learning_rate": 5.7034978553736485e-05, + "loss": 2.432, + "step": 23670 + }, + { + "epoch": 2.144549387329845, + "grad_norm": 1.0747586488723755, + "learning_rate": 5.7028937352745726e-05, + "loss": 2.5648, + "step": 23671 + }, + { + "epoch": 2.144639985504292, + "grad_norm": 0.9673598408699036, + "learning_rate": 5.7022896151754967e-05, + "loss": 2.5796, + "step": 23672 + }, + { + "epoch": 2.1447305836787387, + "grad_norm": 0.8758460283279419, + "learning_rate": 5.701685495076421e-05, + "loss": 1.9648, + "step": 23673 + }, + { + "epoch": 2.1448211818531857, + "grad_norm": 1.0098804235458374, + "learning_rate": 5.701081374977346e-05, + "loss": 2.4154, + "step": 23674 + }, + { + "epoch": 2.1449117800276323, + "grad_norm": 1.015049934387207, + "learning_rate": 5.70047725487827e-05, + "loss": 2.9099, + "step": 23675 + }, + { + "epoch": 2.1450023782020793, + "grad_norm": 1.0546107292175293, + "learning_rate": 5.699873134779194e-05, + "loss": 2.6234, + "step": 23676 + }, + { + "epoch": 2.145092976376526, + "grad_norm": 0.913318932056427, + "learning_rate": 5.6992690146801184e-05, + "loss": 1.9091, + "step": 23677 + }, + { + "epoch": 2.145183574550973, + "grad_norm": 0.9967489242553711, + "learning_rate": 5.6986648945810425e-05, + "loss": 2.4477, + "step": 23678 + }, + { + "epoch": 2.1452741727254194, + "grad_norm": 0.9847778677940369, + "learning_rate": 5.698060774481967e-05, + "loss": 2.6588, + "step": 23679 + }, + { + "epoch": 2.1453647708998664, + "grad_norm": 1.0525912046432495, + "learning_rate": 5.6974566543828913e-05, + "loss": 2.6864, + "step": 23680 + }, + { + "epoch": 2.145455369074313, + "grad_norm": 0.9549933671951294, + "learning_rate": 5.696852534283816e-05, + "loss": 2.6877, + "step": 23681 + }, + { + "epoch": 2.14554596724876, + "grad_norm": 0.9723814725875854, + "learning_rate": 5.696248414184741e-05, + "loss": 2.6461, + "step": 23682 + }, + { + "epoch": 2.1456365654232066, + "grad_norm": 0.9279060363769531, + "learning_rate": 5.695644294085665e-05, + "loss": 1.8484, + "step": 23683 + }, + { + "epoch": 2.1457271635976536, + "grad_norm": 0.9910145998001099, + "learning_rate": 5.695040173986589e-05, + "loss": 2.7948, + "step": 23684 + }, + { + "epoch": 2.1458177617721, + "grad_norm": 1.1797786951065063, + "learning_rate": 5.694436053887513e-05, + "loss": 2.6813, + "step": 23685 + }, + { + "epoch": 2.145908359946547, + "grad_norm": 0.9688350558280945, + "learning_rate": 5.693831933788437e-05, + "loss": 2.7082, + "step": 23686 + }, + { + "epoch": 2.1459989581209937, + "grad_norm": 1.0266507863998413, + "learning_rate": 5.693227813689361e-05, + "loss": 2.8107, + "step": 23687 + }, + { + "epoch": 2.1460895562954407, + "grad_norm": 0.9947353005409241, + "learning_rate": 5.6926236935902854e-05, + "loss": 2.6613, + "step": 23688 + }, + { + "epoch": 2.1461801544698873, + "grad_norm": 0.9426398277282715, + "learning_rate": 5.692019573491211e-05, + "loss": 2.6764, + "step": 23689 + }, + { + "epoch": 2.1462707526443343, + "grad_norm": 0.9941008687019348, + "learning_rate": 5.691415453392135e-05, + "loss": 2.5231, + "step": 23690 + }, + { + "epoch": 2.146361350818781, + "grad_norm": 1.0795646905899048, + "learning_rate": 5.690811333293059e-05, + "loss": 2.8254, + "step": 23691 + }, + { + "epoch": 2.146451948993228, + "grad_norm": 0.9814958572387695, + "learning_rate": 5.690207213193983e-05, + "loss": 2.7031, + "step": 23692 + }, + { + "epoch": 2.1465425471676745, + "grad_norm": 1.0600507259368896, + "learning_rate": 5.689603093094907e-05, + "loss": 2.8333, + "step": 23693 + }, + { + "epoch": 2.1466331453421215, + "grad_norm": 0.8443867564201355, + "learning_rate": 5.688998972995831e-05, + "loss": 1.8959, + "step": 23694 + }, + { + "epoch": 2.146723743516568, + "grad_norm": 1.2237588167190552, + "learning_rate": 5.688394852896756e-05, + "loss": 2.4962, + "step": 23695 + }, + { + "epoch": 2.146814341691015, + "grad_norm": 0.8414674401283264, + "learning_rate": 5.687790732797681e-05, + "loss": 1.9953, + "step": 23696 + }, + { + "epoch": 2.1469049398654616, + "grad_norm": 0.9504470229148865, + "learning_rate": 5.687186612698605e-05, + "loss": 2.5509, + "step": 23697 + }, + { + "epoch": 2.1469955380399086, + "grad_norm": 0.849073052406311, + "learning_rate": 5.686582492599529e-05, + "loss": 1.9145, + "step": 23698 + }, + { + "epoch": 2.147086136214355, + "grad_norm": 1.0277725458145142, + "learning_rate": 5.685978372500454e-05, + "loss": 2.8559, + "step": 23699 + }, + { + "epoch": 2.147176734388802, + "grad_norm": 0.9803405404090881, + "learning_rate": 5.685374252401378e-05, + "loss": 2.7219, + "step": 23700 + }, + { + "epoch": 2.1472673325632488, + "grad_norm": 1.0371167659759521, + "learning_rate": 5.684770132302302e-05, + "loss": 2.5674, + "step": 23701 + }, + { + "epoch": 2.1473579307376958, + "grad_norm": 0.9688727259635925, + "learning_rate": 5.684166012203226e-05, + "loss": 2.6149, + "step": 23702 + }, + { + "epoch": 2.1474485289121423, + "grad_norm": 0.9667537212371826, + "learning_rate": 5.6835618921041514e-05, + "loss": 2.6134, + "step": 23703 + }, + { + "epoch": 2.1475391270865893, + "grad_norm": 1.028936505317688, + "learning_rate": 5.6829577720050754e-05, + "loss": 2.4249, + "step": 23704 + }, + { + "epoch": 2.147629725261036, + "grad_norm": 0.9974848031997681, + "learning_rate": 5.6823536519059995e-05, + "loss": 2.6058, + "step": 23705 + }, + { + "epoch": 2.147720323435483, + "grad_norm": 0.9630032181739807, + "learning_rate": 5.6817495318069236e-05, + "loss": 2.4614, + "step": 23706 + }, + { + "epoch": 2.1478109216099295, + "grad_norm": 1.0277650356292725, + "learning_rate": 5.681145411707848e-05, + "loss": 2.3924, + "step": 23707 + }, + { + "epoch": 2.1479015197843765, + "grad_norm": 1.158290982246399, + "learning_rate": 5.680541291608772e-05, + "loss": 2.5274, + "step": 23708 + }, + { + "epoch": 2.147992117958823, + "grad_norm": 0.8502354621887207, + "learning_rate": 5.679937171509696e-05, + "loss": 1.9391, + "step": 23709 + }, + { + "epoch": 2.14808271613327, + "grad_norm": 0.8352006077766418, + "learning_rate": 5.67933305141062e-05, + "loss": 2.0181, + "step": 23710 + }, + { + "epoch": 2.1481733143077166, + "grad_norm": 1.1029739379882812, + "learning_rate": 5.6787289313115454e-05, + "loss": 2.7778, + "step": 23711 + }, + { + "epoch": 2.1482639124821636, + "grad_norm": 0.9701548218727112, + "learning_rate": 5.6781248112124695e-05, + "loss": 2.3413, + "step": 23712 + }, + { + "epoch": 2.14835451065661, + "grad_norm": 0.9618914127349854, + "learning_rate": 5.6775206911133935e-05, + "loss": 2.4964, + "step": 23713 + }, + { + "epoch": 2.148445108831057, + "grad_norm": 0.9989287257194519, + "learning_rate": 5.6769165710143176e-05, + "loss": 2.736, + "step": 23714 + }, + { + "epoch": 2.148535707005504, + "grad_norm": 0.8176589608192444, + "learning_rate": 5.6763124509152424e-05, + "loss": 1.8737, + "step": 23715 + }, + { + "epoch": 2.148626305179951, + "grad_norm": 0.988356351852417, + "learning_rate": 5.6757083308161665e-05, + "loss": 2.8282, + "step": 23716 + }, + { + "epoch": 2.1487169033543974, + "grad_norm": 0.9345535039901733, + "learning_rate": 5.6751042107170905e-05, + "loss": 2.2314, + "step": 23717 + }, + { + "epoch": 2.1488075015288444, + "grad_norm": 1.005476951599121, + "learning_rate": 5.674500090618016e-05, + "loss": 2.72, + "step": 23718 + }, + { + "epoch": 2.148898099703291, + "grad_norm": 1.0501058101654053, + "learning_rate": 5.67389597051894e-05, + "loss": 2.5885, + "step": 23719 + }, + { + "epoch": 2.148988697877738, + "grad_norm": 0.9988093376159668, + "learning_rate": 5.673291850419864e-05, + "loss": 3.0658, + "step": 23720 + }, + { + "epoch": 2.1490792960521845, + "grad_norm": 1.2342811822891235, + "learning_rate": 5.672687730320788e-05, + "loss": 2.395, + "step": 23721 + }, + { + "epoch": 2.1491698942266315, + "grad_norm": 0.9594888687133789, + "learning_rate": 5.672083610221712e-05, + "loss": 2.7788, + "step": 23722 + }, + { + "epoch": 2.149260492401078, + "grad_norm": 1.0848952531814575, + "learning_rate": 5.6714794901226364e-05, + "loss": 2.4962, + "step": 23723 + }, + { + "epoch": 2.149351090575525, + "grad_norm": 1.0594875812530518, + "learning_rate": 5.6708753700235605e-05, + "loss": 2.5416, + "step": 23724 + }, + { + "epoch": 2.1494416887499717, + "grad_norm": 1.0208557844161987, + "learning_rate": 5.6702712499244846e-05, + "loss": 2.6849, + "step": 23725 + }, + { + "epoch": 2.1495322869244187, + "grad_norm": 1.0385390520095825, + "learning_rate": 5.66966712982541e-05, + "loss": 2.5369, + "step": 23726 + }, + { + "epoch": 2.1496228850988652, + "grad_norm": 0.9819542765617371, + "learning_rate": 5.669063009726334e-05, + "loss": 2.4528, + "step": 23727 + }, + { + "epoch": 2.1497134832733122, + "grad_norm": 0.9595012068748474, + "learning_rate": 5.668458889627258e-05, + "loss": 2.5744, + "step": 23728 + }, + { + "epoch": 2.149804081447759, + "grad_norm": 0.8898355960845947, + "learning_rate": 5.667854769528182e-05, + "loss": 2.0278, + "step": 23729 + }, + { + "epoch": 2.149894679622206, + "grad_norm": 0.9797978401184082, + "learning_rate": 5.667250649429106e-05, + "loss": 2.6278, + "step": 23730 + }, + { + "epoch": 2.1499852777966524, + "grad_norm": 0.9980000257492065, + "learning_rate": 5.666646529330031e-05, + "loss": 2.6787, + "step": 23731 + }, + { + "epoch": 2.1500758759710994, + "grad_norm": 1.0542491674423218, + "learning_rate": 5.666042409230955e-05, + "loss": 2.8882, + "step": 23732 + }, + { + "epoch": 2.150166474145546, + "grad_norm": 0.8370090126991272, + "learning_rate": 5.66543828913188e-05, + "loss": 1.831, + "step": 23733 + }, + { + "epoch": 2.1502570723199925, + "grad_norm": 0.9063785672187805, + "learning_rate": 5.664834169032805e-05, + "loss": 2.1174, + "step": 23734 + }, + { + "epoch": 2.1503476704944395, + "grad_norm": 0.8593177199363708, + "learning_rate": 5.664230048933729e-05, + "loss": 2.0484, + "step": 23735 + }, + { + "epoch": 2.1504382686688865, + "grad_norm": 0.9577082395553589, + "learning_rate": 5.663625928834653e-05, + "loss": 2.8246, + "step": 23736 + }, + { + "epoch": 2.150528866843333, + "grad_norm": 0.8576990365982056, + "learning_rate": 5.663021808735577e-05, + "loss": 1.9944, + "step": 23737 + }, + { + "epoch": 2.1506194650177797, + "grad_norm": 0.9081783890724182, + "learning_rate": 5.662417688636501e-05, + "loss": 2.3593, + "step": 23738 + }, + { + "epoch": 2.1507100631922267, + "grad_norm": 1.0533103942871094, + "learning_rate": 5.661813568537425e-05, + "loss": 2.656, + "step": 23739 + }, + { + "epoch": 2.1508006613666737, + "grad_norm": 0.9233902096748352, + "learning_rate": 5.661209448438349e-05, + "loss": 2.448, + "step": 23740 + }, + { + "epoch": 2.1508912595411203, + "grad_norm": 0.9204397797584534, + "learning_rate": 5.6606053283392746e-05, + "loss": 1.8445, + "step": 23741 + }, + { + "epoch": 2.150981857715567, + "grad_norm": 0.9746995568275452, + "learning_rate": 5.660001208240199e-05, + "loss": 2.3649, + "step": 23742 + }, + { + "epoch": 2.151072455890014, + "grad_norm": 0.9665855765342712, + "learning_rate": 5.659397088141123e-05, + "loss": 2.6657, + "step": 23743 + }, + { + "epoch": 2.1511630540644604, + "grad_norm": 1.111581802368164, + "learning_rate": 5.658792968042047e-05, + "loss": 2.5945, + "step": 23744 + }, + { + "epoch": 2.1512536522389074, + "grad_norm": 0.9903662204742432, + "learning_rate": 5.658188847942971e-05, + "loss": 2.6364, + "step": 23745 + }, + { + "epoch": 2.151344250413354, + "grad_norm": 1.0008448362350464, + "learning_rate": 5.657584727843895e-05, + "loss": 2.5822, + "step": 23746 + }, + { + "epoch": 2.151434848587801, + "grad_norm": 1.0712554454803467, + "learning_rate": 5.65698060774482e-05, + "loss": 2.7867, + "step": 23747 + }, + { + "epoch": 2.1515254467622476, + "grad_norm": 1.0218884944915771, + "learning_rate": 5.6563764876457446e-05, + "loss": 2.6212, + "step": 23748 + }, + { + "epoch": 2.1516160449366946, + "grad_norm": 0.980730414390564, + "learning_rate": 5.6557723675466687e-05, + "loss": 2.6321, + "step": 23749 + }, + { + "epoch": 2.151706643111141, + "grad_norm": 1.0696897506713867, + "learning_rate": 5.6551682474475934e-05, + "loss": 2.5103, + "step": 23750 + }, + { + "epoch": 2.151797241285588, + "grad_norm": 0.9641746282577515, + "learning_rate": 5.6545641273485175e-05, + "loss": 2.5911, + "step": 23751 + }, + { + "epoch": 2.1518878394600347, + "grad_norm": 0.9936058521270752, + "learning_rate": 5.6539600072494416e-05, + "loss": 2.7834, + "step": 23752 + }, + { + "epoch": 2.1519784376344817, + "grad_norm": 1.0887612104415894, + "learning_rate": 5.653355887150366e-05, + "loss": 2.4365, + "step": 23753 + }, + { + "epoch": 2.1520690358089283, + "grad_norm": 1.017488718032837, + "learning_rate": 5.65275176705129e-05, + "loss": 2.9665, + "step": 23754 + }, + { + "epoch": 2.1521596339833753, + "grad_norm": 1.0093278884887695, + "learning_rate": 5.652147646952214e-05, + "loss": 2.7717, + "step": 23755 + }, + { + "epoch": 2.152250232157822, + "grad_norm": 0.9441165328025818, + "learning_rate": 5.651543526853139e-05, + "loss": 2.6252, + "step": 23756 + }, + { + "epoch": 2.152340830332269, + "grad_norm": 0.9689805507659912, + "learning_rate": 5.6509394067540634e-05, + "loss": 2.7017, + "step": 23757 + }, + { + "epoch": 2.1524314285067154, + "grad_norm": 1.1021000146865845, + "learning_rate": 5.6503352866549874e-05, + "loss": 2.487, + "step": 23758 + }, + { + "epoch": 2.1525220266811624, + "grad_norm": 1.0221571922302246, + "learning_rate": 5.6497311665559115e-05, + "loss": 2.449, + "step": 23759 + }, + { + "epoch": 2.152612624855609, + "grad_norm": 1.0411120653152466, + "learning_rate": 5.6491270464568356e-05, + "loss": 2.7975, + "step": 23760 + }, + { + "epoch": 2.152703223030056, + "grad_norm": 0.9936509132385254, + "learning_rate": 5.64852292635776e-05, + "loss": 2.4629, + "step": 23761 + }, + { + "epoch": 2.1527938212045026, + "grad_norm": 1.0660265684127808, + "learning_rate": 5.647918806258684e-05, + "loss": 2.8766, + "step": 23762 + }, + { + "epoch": 2.1528844193789496, + "grad_norm": 1.0418223142623901, + "learning_rate": 5.647314686159609e-05, + "loss": 2.4555, + "step": 23763 + }, + { + "epoch": 2.152975017553396, + "grad_norm": 1.1143306493759155, + "learning_rate": 5.646710566060533e-05, + "loss": 2.6298, + "step": 23764 + }, + { + "epoch": 2.153065615727843, + "grad_norm": 0.9469137787818909, + "learning_rate": 5.6461064459614574e-05, + "loss": 2.6207, + "step": 23765 + }, + { + "epoch": 2.1531562139022897, + "grad_norm": 0.9568922519683838, + "learning_rate": 5.645502325862382e-05, + "loss": 2.7279, + "step": 23766 + }, + { + "epoch": 2.1532468120767367, + "grad_norm": 1.0830413103103638, + "learning_rate": 5.644898205763306e-05, + "loss": 2.5943, + "step": 23767 + }, + { + "epoch": 2.1533374102511833, + "grad_norm": 0.9565128684043884, + "learning_rate": 5.64429408566423e-05, + "loss": 2.5695, + "step": 23768 + }, + { + "epoch": 2.1534280084256303, + "grad_norm": 1.0545573234558105, + "learning_rate": 5.6436899655651544e-05, + "loss": 2.5219, + "step": 23769 + }, + { + "epoch": 2.153518606600077, + "grad_norm": 0.9152471423149109, + "learning_rate": 5.6430858454660785e-05, + "loss": 2.0525, + "step": 23770 + }, + { + "epoch": 2.153609204774524, + "grad_norm": 0.8794392347335815, + "learning_rate": 5.642481725367004e-05, + "loss": 2.3103, + "step": 23771 + }, + { + "epoch": 2.1536998029489705, + "grad_norm": 0.865415096282959, + "learning_rate": 5.641877605267928e-05, + "loss": 1.9407, + "step": 23772 + }, + { + "epoch": 2.1537904011234175, + "grad_norm": 1.0133941173553467, + "learning_rate": 5.641273485168852e-05, + "loss": 2.566, + "step": 23773 + }, + { + "epoch": 2.153880999297864, + "grad_norm": 0.9350850582122803, + "learning_rate": 5.640669365069776e-05, + "loss": 2.6734, + "step": 23774 + }, + { + "epoch": 2.153971597472311, + "grad_norm": 0.9942685961723328, + "learning_rate": 5.6400652449707e-05, + "loss": 2.6427, + "step": 23775 + }, + { + "epoch": 2.1540621956467576, + "grad_norm": 1.0333590507507324, + "learning_rate": 5.639461124871624e-05, + "loss": 2.7917, + "step": 23776 + }, + { + "epoch": 2.1541527938212046, + "grad_norm": 0.9735284447669983, + "learning_rate": 5.6388570047725484e-05, + "loss": 2.5002, + "step": 23777 + }, + { + "epoch": 2.154243391995651, + "grad_norm": 0.9451054334640503, + "learning_rate": 5.638252884673474e-05, + "loss": 2.3481, + "step": 23778 + }, + { + "epoch": 2.154333990170098, + "grad_norm": 0.9543513655662537, + "learning_rate": 5.637648764574398e-05, + "loss": 2.4805, + "step": 23779 + }, + { + "epoch": 2.1544245883445448, + "grad_norm": 1.0164849758148193, + "learning_rate": 5.637044644475322e-05, + "loss": 2.579, + "step": 23780 + }, + { + "epoch": 2.1545151865189918, + "grad_norm": 0.8106377720832825, + "learning_rate": 5.636440524376246e-05, + "loss": 2.1028, + "step": 23781 + }, + { + "epoch": 2.1546057846934383, + "grad_norm": 1.011560082435608, + "learning_rate": 5.635836404277171e-05, + "loss": 2.7769, + "step": 23782 + }, + { + "epoch": 2.1546963828678853, + "grad_norm": 0.9731292128562927, + "learning_rate": 5.635232284178095e-05, + "loss": 2.3523, + "step": 23783 + }, + { + "epoch": 2.154786981042332, + "grad_norm": 0.9972783923149109, + "learning_rate": 5.634628164079019e-05, + "loss": 2.6617, + "step": 23784 + }, + { + "epoch": 2.154877579216779, + "grad_norm": 0.9761731028556824, + "learning_rate": 5.634024043979943e-05, + "loss": 2.8407, + "step": 23785 + }, + { + "epoch": 2.1549681773912255, + "grad_norm": 0.9804157018661499, + "learning_rate": 5.6334199238808685e-05, + "loss": 2.7137, + "step": 23786 + }, + { + "epoch": 2.1550587755656725, + "grad_norm": 1.0759388208389282, + "learning_rate": 5.6328158037817926e-05, + "loss": 2.6453, + "step": 23787 + }, + { + "epoch": 2.155149373740119, + "grad_norm": 0.9692199230194092, + "learning_rate": 5.632211683682717e-05, + "loss": 2.5615, + "step": 23788 + }, + { + "epoch": 2.155239971914566, + "grad_norm": 1.1032214164733887, + "learning_rate": 5.631607563583641e-05, + "loss": 2.5846, + "step": 23789 + }, + { + "epoch": 2.1553305700890126, + "grad_norm": 0.8389748334884644, + "learning_rate": 5.631003443484565e-05, + "loss": 1.8625, + "step": 23790 + }, + { + "epoch": 2.1554211682634596, + "grad_norm": 0.8786113858222961, + "learning_rate": 5.630399323385489e-05, + "loss": 1.8024, + "step": 23791 + }, + { + "epoch": 2.155511766437906, + "grad_norm": 0.8957756161689758, + "learning_rate": 5.629795203286413e-05, + "loss": 2.2803, + "step": 23792 + }, + { + "epoch": 2.155602364612353, + "grad_norm": 0.9503508806228638, + "learning_rate": 5.6291910831873385e-05, + "loss": 2.5742, + "step": 23793 + }, + { + "epoch": 2.1556929627868, + "grad_norm": 0.9883853197097778, + "learning_rate": 5.6285869630882626e-05, + "loss": 2.2941, + "step": 23794 + }, + { + "epoch": 2.155783560961247, + "grad_norm": 1.0087319612503052, + "learning_rate": 5.6279828429891866e-05, + "loss": 2.5966, + "step": 23795 + }, + { + "epoch": 2.1558741591356934, + "grad_norm": 1.0815627574920654, + "learning_rate": 5.627378722890111e-05, + "loss": 2.6202, + "step": 23796 + }, + { + "epoch": 2.1559647573101404, + "grad_norm": 0.9980720281600952, + "learning_rate": 5.626774602791035e-05, + "loss": 2.6602, + "step": 23797 + }, + { + "epoch": 2.156055355484587, + "grad_norm": 0.9810261726379395, + "learning_rate": 5.6261704826919596e-05, + "loss": 2.7033, + "step": 23798 + }, + { + "epoch": 2.156145953659034, + "grad_norm": 1.0790470838546753, + "learning_rate": 5.6255663625928836e-05, + "loss": 2.285, + "step": 23799 + }, + { + "epoch": 2.1562365518334805, + "grad_norm": 1.1418776512145996, + "learning_rate": 5.624962242493808e-05, + "loss": 2.4724, + "step": 23800 + }, + { + "epoch": 2.1563271500079275, + "grad_norm": 0.9442089200019836, + "learning_rate": 5.6243581223947325e-05, + "loss": 2.7719, + "step": 23801 + }, + { + "epoch": 2.156417748182374, + "grad_norm": 0.9233430027961731, + "learning_rate": 5.623754002295657e-05, + "loss": 2.5961, + "step": 23802 + }, + { + "epoch": 2.156508346356821, + "grad_norm": 1.0482851266860962, + "learning_rate": 5.623149882196581e-05, + "loss": 2.584, + "step": 23803 + }, + { + "epoch": 2.1565989445312677, + "grad_norm": 0.9336808919906616, + "learning_rate": 5.6225457620975054e-05, + "loss": 2.469, + "step": 23804 + }, + { + "epoch": 2.1566895427057147, + "grad_norm": 1.024688959121704, + "learning_rate": 5.6219416419984295e-05, + "loss": 2.7075, + "step": 23805 + }, + { + "epoch": 2.1567801408801612, + "grad_norm": 0.981174886226654, + "learning_rate": 5.6213375218993536e-05, + "loss": 2.4117, + "step": 23806 + }, + { + "epoch": 2.1568707390546082, + "grad_norm": 1.037561058998108, + "learning_rate": 5.620733401800278e-05, + "loss": 2.5628, + "step": 23807 + }, + { + "epoch": 2.156961337229055, + "grad_norm": 1.0097324848175049, + "learning_rate": 5.620129281701203e-05, + "loss": 2.5379, + "step": 23808 + }, + { + "epoch": 2.157051935403502, + "grad_norm": 1.0434622764587402, + "learning_rate": 5.619525161602127e-05, + "loss": 2.7129, + "step": 23809 + }, + { + "epoch": 2.1571425335779484, + "grad_norm": 1.0153197050094604, + "learning_rate": 5.618921041503051e-05, + "loss": 2.5628, + "step": 23810 + }, + { + "epoch": 2.1572331317523954, + "grad_norm": 0.9930467009544373, + "learning_rate": 5.6183169214039754e-05, + "loss": 2.7161, + "step": 23811 + }, + { + "epoch": 2.157323729926842, + "grad_norm": 0.9788038730621338, + "learning_rate": 5.6177128013048994e-05, + "loss": 2.7299, + "step": 23812 + }, + { + "epoch": 2.1574143281012885, + "grad_norm": 0.8163625001907349, + "learning_rate": 5.6171086812058235e-05, + "loss": 1.9971, + "step": 23813 + }, + { + "epoch": 2.1575049262757355, + "grad_norm": 1.0005522966384888, + "learning_rate": 5.616504561106748e-05, + "loss": 3.0464, + "step": 23814 + }, + { + "epoch": 2.1575955244501825, + "grad_norm": 0.9709886908531189, + "learning_rate": 5.6159004410076724e-05, + "loss": 2.4445, + "step": 23815 + }, + { + "epoch": 2.157686122624629, + "grad_norm": 1.025915503501892, + "learning_rate": 5.615296320908597e-05, + "loss": 2.549, + "step": 23816 + }, + { + "epoch": 2.1577767207990757, + "grad_norm": 1.1283420324325562, + "learning_rate": 5.614692200809521e-05, + "loss": 2.7458, + "step": 23817 + }, + { + "epoch": 2.1578673189735227, + "grad_norm": 0.7981805205345154, + "learning_rate": 5.614088080710446e-05, + "loss": 1.3777, + "step": 23818 + }, + { + "epoch": 2.1579579171479697, + "grad_norm": 1.1343886852264404, + "learning_rate": 5.61348396061137e-05, + "loss": 2.8293, + "step": 23819 + }, + { + "epoch": 2.1580485153224163, + "grad_norm": 0.9897127747535706, + "learning_rate": 5.612879840512294e-05, + "loss": 2.8447, + "step": 23820 + }, + { + "epoch": 2.158139113496863, + "grad_norm": 0.9680557250976562, + "learning_rate": 5.612275720413218e-05, + "loss": 2.5226, + "step": 23821 + }, + { + "epoch": 2.15822971167131, + "grad_norm": 1.055474877357483, + "learning_rate": 5.611671600314142e-05, + "loss": 2.4956, + "step": 23822 + }, + { + "epoch": 2.1583203098457564, + "grad_norm": 0.8598965406417847, + "learning_rate": 5.611067480215068e-05, + "loss": 2.0745, + "step": 23823 + }, + { + "epoch": 2.1584109080202034, + "grad_norm": 1.0925835371017456, + "learning_rate": 5.610463360115992e-05, + "loss": 2.608, + "step": 23824 + }, + { + "epoch": 2.15850150619465, + "grad_norm": 1.035742998123169, + "learning_rate": 5.609859240016916e-05, + "loss": 2.6824, + "step": 23825 + }, + { + "epoch": 2.158592104369097, + "grad_norm": 1.0110286474227905, + "learning_rate": 5.60925511991784e-05, + "loss": 2.4936, + "step": 23826 + }, + { + "epoch": 2.1586827025435436, + "grad_norm": 0.9543599486351013, + "learning_rate": 5.608650999818764e-05, + "loss": 2.6224, + "step": 23827 + }, + { + "epoch": 2.1587733007179906, + "grad_norm": 0.9318636655807495, + "learning_rate": 5.608046879719688e-05, + "loss": 2.1807, + "step": 23828 + }, + { + "epoch": 2.158863898892437, + "grad_norm": 0.9399539828300476, + "learning_rate": 5.607442759620612e-05, + "loss": 2.506, + "step": 23829 + }, + { + "epoch": 2.158954497066884, + "grad_norm": 0.9996212720870972, + "learning_rate": 5.606838639521536e-05, + "loss": 2.5205, + "step": 23830 + }, + { + "epoch": 2.1590450952413307, + "grad_norm": 1.0209394693374634, + "learning_rate": 5.606234519422462e-05, + "loss": 2.7001, + "step": 23831 + }, + { + "epoch": 2.1591356934157777, + "grad_norm": 0.9515381455421448, + "learning_rate": 5.605630399323386e-05, + "loss": 2.597, + "step": 23832 + }, + { + "epoch": 2.1592262915902243, + "grad_norm": 0.892020583152771, + "learning_rate": 5.60502627922431e-05, + "loss": 2.104, + "step": 23833 + }, + { + "epoch": 2.1593168897646713, + "grad_norm": 1.0070947408676147, + "learning_rate": 5.604422159125235e-05, + "loss": 2.6057, + "step": 23834 + }, + { + "epoch": 2.159407487939118, + "grad_norm": 0.9100715517997742, + "learning_rate": 5.603818039026159e-05, + "loss": 2.4606, + "step": 23835 + }, + { + "epoch": 2.159498086113565, + "grad_norm": 0.9692074060440063, + "learning_rate": 5.603213918927083e-05, + "loss": 2.7554, + "step": 23836 + }, + { + "epoch": 2.1595886842880114, + "grad_norm": 1.0918657779693604, + "learning_rate": 5.602609798828007e-05, + "loss": 2.5137, + "step": 23837 + }, + { + "epoch": 2.1596792824624584, + "grad_norm": 1.1368244886398315, + "learning_rate": 5.6020056787289324e-05, + "loss": 2.9271, + "step": 23838 + }, + { + "epoch": 2.159769880636905, + "grad_norm": 0.9888536334037781, + "learning_rate": 5.6014015586298565e-05, + "loss": 2.7021, + "step": 23839 + }, + { + "epoch": 2.159860478811352, + "grad_norm": 1.0587929487228394, + "learning_rate": 5.6007974385307805e-05, + "loss": 2.4646, + "step": 23840 + }, + { + "epoch": 2.1599510769857986, + "grad_norm": 1.0221143960952759, + "learning_rate": 5.6001933184317046e-05, + "loss": 2.5861, + "step": 23841 + }, + { + "epoch": 2.1600416751602456, + "grad_norm": 0.9415448904037476, + "learning_rate": 5.599589198332629e-05, + "loss": 2.6253, + "step": 23842 + }, + { + "epoch": 2.160132273334692, + "grad_norm": 1.0245163440704346, + "learning_rate": 5.598985078233553e-05, + "loss": 2.4974, + "step": 23843 + }, + { + "epoch": 2.160222871509139, + "grad_norm": 0.963447630405426, + "learning_rate": 5.598380958134477e-05, + "loss": 2.6544, + "step": 23844 + }, + { + "epoch": 2.1603134696835857, + "grad_norm": 1.003409743309021, + "learning_rate": 5.597776838035401e-05, + "loss": 2.6797, + "step": 23845 + }, + { + "epoch": 2.1604040678580327, + "grad_norm": 0.9909545183181763, + "learning_rate": 5.5971727179363264e-05, + "loss": 2.3751, + "step": 23846 + }, + { + "epoch": 2.1604946660324793, + "grad_norm": 0.9088668823242188, + "learning_rate": 5.5965685978372505e-05, + "loss": 2.5048, + "step": 23847 + }, + { + "epoch": 2.1605852642069263, + "grad_norm": 0.9986473917961121, + "learning_rate": 5.5959644777381746e-05, + "loss": 2.8053, + "step": 23848 + }, + { + "epoch": 2.160675862381373, + "grad_norm": 1.001908779144287, + "learning_rate": 5.5953603576390986e-05, + "loss": 2.5587, + "step": 23849 + }, + { + "epoch": 2.16076646055582, + "grad_norm": 0.9869672656059265, + "learning_rate": 5.5947562375400234e-05, + "loss": 2.4691, + "step": 23850 + }, + { + "epoch": 2.1608570587302665, + "grad_norm": 1.2085515260696411, + "learning_rate": 5.5941521174409475e-05, + "loss": 2.6232, + "step": 23851 + }, + { + "epoch": 2.1609476569047135, + "grad_norm": 0.883597731590271, + "learning_rate": 5.5935479973418716e-05, + "loss": 2.1179, + "step": 23852 + }, + { + "epoch": 2.16103825507916, + "grad_norm": 1.0531041622161865, + "learning_rate": 5.592943877242797e-05, + "loss": 2.6091, + "step": 23853 + }, + { + "epoch": 2.161128853253607, + "grad_norm": 0.9349947571754456, + "learning_rate": 5.592339757143721e-05, + "loss": 2.4156, + "step": 23854 + }, + { + "epoch": 2.1612194514280536, + "grad_norm": 1.0221893787384033, + "learning_rate": 5.591735637044645e-05, + "loss": 2.54, + "step": 23855 + }, + { + "epoch": 2.1613100496025006, + "grad_norm": 0.9865648746490479, + "learning_rate": 5.591131516945569e-05, + "loss": 2.6367, + "step": 23856 + }, + { + "epoch": 2.161400647776947, + "grad_norm": 1.0268397331237793, + "learning_rate": 5.590527396846493e-05, + "loss": 2.5227, + "step": 23857 + }, + { + "epoch": 2.161491245951394, + "grad_norm": 0.9360123872756958, + "learning_rate": 5.5899232767474174e-05, + "loss": 2.476, + "step": 23858 + }, + { + "epoch": 2.1615818441258408, + "grad_norm": 0.966883659362793, + "learning_rate": 5.5893191566483415e-05, + "loss": 2.6425, + "step": 23859 + }, + { + "epoch": 2.1616724423002878, + "grad_norm": 0.951860785484314, + "learning_rate": 5.5887150365492656e-05, + "loss": 2.4771, + "step": 23860 + }, + { + "epoch": 2.1617630404747343, + "grad_norm": 0.9513357877731323, + "learning_rate": 5.588110916450191e-05, + "loss": 2.7854, + "step": 23861 + }, + { + "epoch": 2.1618536386491813, + "grad_norm": 0.9792149066925049, + "learning_rate": 5.587506796351115e-05, + "loss": 2.5305, + "step": 23862 + }, + { + "epoch": 2.161944236823628, + "grad_norm": 1.1659313440322876, + "learning_rate": 5.586902676252039e-05, + "loss": 2.5601, + "step": 23863 + }, + { + "epoch": 2.162034834998075, + "grad_norm": 1.0081499814987183, + "learning_rate": 5.586298556152963e-05, + "loss": 2.6019, + "step": 23864 + }, + { + "epoch": 2.1621254331725215, + "grad_norm": 0.993863046169281, + "learning_rate": 5.5856944360538874e-05, + "loss": 2.7698, + "step": 23865 + }, + { + "epoch": 2.1622160313469685, + "grad_norm": 0.989479124546051, + "learning_rate": 5.585090315954812e-05, + "loss": 2.7531, + "step": 23866 + }, + { + "epoch": 2.162306629521415, + "grad_norm": 0.9660749435424805, + "learning_rate": 5.584486195855736e-05, + "loss": 2.5924, + "step": 23867 + }, + { + "epoch": 2.162397227695862, + "grad_norm": 0.9706699252128601, + "learning_rate": 5.583882075756661e-05, + "loss": 2.4689, + "step": 23868 + }, + { + "epoch": 2.1624878258703086, + "grad_norm": 0.9796139597892761, + "learning_rate": 5.583277955657586e-05, + "loss": 2.6174, + "step": 23869 + }, + { + "epoch": 2.1625784240447556, + "grad_norm": 1.112655520439148, + "learning_rate": 5.58267383555851e-05, + "loss": 2.5562, + "step": 23870 + }, + { + "epoch": 2.162669022219202, + "grad_norm": 0.8756229877471924, + "learning_rate": 5.582069715459434e-05, + "loss": 1.9543, + "step": 23871 + }, + { + "epoch": 2.162759620393649, + "grad_norm": 1.0217221975326538, + "learning_rate": 5.581465595360358e-05, + "loss": 2.5779, + "step": 23872 + }, + { + "epoch": 2.162850218568096, + "grad_norm": 1.0425668954849243, + "learning_rate": 5.580861475261282e-05, + "loss": 2.6521, + "step": 23873 + }, + { + "epoch": 2.162940816742543, + "grad_norm": 1.0273786783218384, + "learning_rate": 5.580257355162206e-05, + "loss": 2.6172, + "step": 23874 + }, + { + "epoch": 2.1630314149169894, + "grad_norm": 0.9681636095046997, + "learning_rate": 5.57965323506313e-05, + "loss": 2.5579, + "step": 23875 + }, + { + "epoch": 2.1631220130914364, + "grad_norm": 0.9603824615478516, + "learning_rate": 5.5790491149640557e-05, + "loss": 2.6087, + "step": 23876 + }, + { + "epoch": 2.163212611265883, + "grad_norm": 0.9807518720626831, + "learning_rate": 5.57844499486498e-05, + "loss": 2.6643, + "step": 23877 + }, + { + "epoch": 2.16330320944033, + "grad_norm": 1.065290093421936, + "learning_rate": 5.577840874765904e-05, + "loss": 2.9626, + "step": 23878 + }, + { + "epoch": 2.1633938076147765, + "grad_norm": 1.0424087047576904, + "learning_rate": 5.577236754666828e-05, + "loss": 2.7049, + "step": 23879 + }, + { + "epoch": 2.1634844057892235, + "grad_norm": 1.068080186843872, + "learning_rate": 5.576632634567752e-05, + "loss": 2.3169, + "step": 23880 + }, + { + "epoch": 2.16357500396367, + "grad_norm": 1.0100584030151367, + "learning_rate": 5.576028514468676e-05, + "loss": 2.4451, + "step": 23881 + }, + { + "epoch": 2.163665602138117, + "grad_norm": 0.8483843803405762, + "learning_rate": 5.575424394369601e-05, + "loss": 1.9388, + "step": 23882 + }, + { + "epoch": 2.1637562003125637, + "grad_norm": 0.9313346147537231, + "learning_rate": 5.5748202742705256e-05, + "loss": 2.5694, + "step": 23883 + }, + { + "epoch": 2.1638467984870107, + "grad_norm": 1.041965365409851, + "learning_rate": 5.57421615417145e-05, + "loss": 2.8211, + "step": 23884 + }, + { + "epoch": 2.1639373966614572, + "grad_norm": 1.0956124067306519, + "learning_rate": 5.573612034072374e-05, + "loss": 2.5072, + "step": 23885 + }, + { + "epoch": 2.1640279948359042, + "grad_norm": 0.9648289084434509, + "learning_rate": 5.5730079139732985e-05, + "loss": 2.4665, + "step": 23886 + }, + { + "epoch": 2.164118593010351, + "grad_norm": 0.9209550619125366, + "learning_rate": 5.5724037938742226e-05, + "loss": 2.4677, + "step": 23887 + }, + { + "epoch": 2.164209191184798, + "grad_norm": 0.9338389039039612, + "learning_rate": 5.571799673775147e-05, + "loss": 2.6707, + "step": 23888 + }, + { + "epoch": 2.1642997893592444, + "grad_norm": 0.9324139952659607, + "learning_rate": 5.571195553676071e-05, + "loss": 2.4536, + "step": 23889 + }, + { + "epoch": 2.1643903875336914, + "grad_norm": 0.904090404510498, + "learning_rate": 5.570591433576995e-05, + "loss": 2.4971, + "step": 23890 + }, + { + "epoch": 2.164480985708138, + "grad_norm": 0.8664665222167969, + "learning_rate": 5.56998731347792e-05, + "loss": 2.0014, + "step": 23891 + }, + { + "epoch": 2.164571583882585, + "grad_norm": 0.9769234657287598, + "learning_rate": 5.5693831933788444e-05, + "loss": 2.4928, + "step": 23892 + }, + { + "epoch": 2.1646621820570315, + "grad_norm": 0.9696568250656128, + "learning_rate": 5.5687790732797684e-05, + "loss": 2.4635, + "step": 23893 + }, + { + "epoch": 2.1647527802314785, + "grad_norm": 0.9963234066963196, + "learning_rate": 5.5681749531806925e-05, + "loss": 2.5488, + "step": 23894 + }, + { + "epoch": 2.164843378405925, + "grad_norm": 1.000345230102539, + "learning_rate": 5.5675708330816166e-05, + "loss": 2.5463, + "step": 23895 + }, + { + "epoch": 2.1649339765803717, + "grad_norm": 1.0835556983947754, + "learning_rate": 5.566966712982541e-05, + "loss": 2.6735, + "step": 23896 + }, + { + "epoch": 2.1650245747548187, + "grad_norm": 1.0501508712768555, + "learning_rate": 5.566362592883465e-05, + "loss": 2.4493, + "step": 23897 + }, + { + "epoch": 2.1651151729292657, + "grad_norm": 0.9833187460899353, + "learning_rate": 5.56575847278439e-05, + "loss": 2.4998, + "step": 23898 + }, + { + "epoch": 2.1652057711037123, + "grad_norm": 1.1159605979919434, + "learning_rate": 5.565154352685314e-05, + "loss": 2.5642, + "step": 23899 + }, + { + "epoch": 2.165296369278159, + "grad_norm": 0.9545150995254517, + "learning_rate": 5.5645502325862384e-05, + "loss": 2.638, + "step": 23900 + }, + { + "epoch": 2.165386967452606, + "grad_norm": 1.0512281656265259, + "learning_rate": 5.5639461124871625e-05, + "loss": 2.6392, + "step": 23901 + }, + { + "epoch": 2.165477565627053, + "grad_norm": 0.9787390828132629, + "learning_rate": 5.563341992388087e-05, + "loss": 2.6092, + "step": 23902 + }, + { + "epoch": 2.1655681638014994, + "grad_norm": 1.0276867151260376, + "learning_rate": 5.562737872289011e-05, + "loss": 2.7597, + "step": 23903 + }, + { + "epoch": 2.165658761975946, + "grad_norm": 1.0231621265411377, + "learning_rate": 5.5621337521899354e-05, + "loss": 2.752, + "step": 23904 + }, + { + "epoch": 2.165749360150393, + "grad_norm": 0.9814401865005493, + "learning_rate": 5.5615296320908595e-05, + "loss": 2.5606, + "step": 23905 + }, + { + "epoch": 2.1658399583248396, + "grad_norm": 0.901274561882019, + "learning_rate": 5.560925511991785e-05, + "loss": 2.064, + "step": 23906 + }, + { + "epoch": 2.1659305564992866, + "grad_norm": 0.928112268447876, + "learning_rate": 5.560321391892709e-05, + "loss": 2.8422, + "step": 23907 + }, + { + "epoch": 2.166021154673733, + "grad_norm": 0.9936025738716125, + "learning_rate": 5.559717271793633e-05, + "loss": 2.523, + "step": 23908 + }, + { + "epoch": 2.16611175284818, + "grad_norm": 1.094122052192688, + "learning_rate": 5.559113151694557e-05, + "loss": 2.5749, + "step": 23909 + }, + { + "epoch": 2.1662023510226267, + "grad_norm": 0.9982135891914368, + "learning_rate": 5.558509031595481e-05, + "loss": 2.6535, + "step": 23910 + }, + { + "epoch": 2.1662929491970737, + "grad_norm": 1.0914021730422974, + "learning_rate": 5.557904911496405e-05, + "loss": 2.5901, + "step": 23911 + }, + { + "epoch": 2.1663835473715203, + "grad_norm": 0.9111135005950928, + "learning_rate": 5.5573007913973294e-05, + "loss": 2.4624, + "step": 23912 + }, + { + "epoch": 2.1664741455459673, + "grad_norm": 0.9509663581848145, + "learning_rate": 5.556696671298255e-05, + "loss": 2.5096, + "step": 23913 + }, + { + "epoch": 2.166564743720414, + "grad_norm": 0.9685181975364685, + "learning_rate": 5.556092551199179e-05, + "loss": 2.4813, + "step": 23914 + }, + { + "epoch": 2.166655341894861, + "grad_norm": 0.9759155511856079, + "learning_rate": 5.555488431100103e-05, + "loss": 2.6748, + "step": 23915 + }, + { + "epoch": 2.1667459400693074, + "grad_norm": 1.0027999877929688, + "learning_rate": 5.554884311001027e-05, + "loss": 2.8646, + "step": 23916 + }, + { + "epoch": 2.1668365382437544, + "grad_norm": 1.0495367050170898, + "learning_rate": 5.554280190901951e-05, + "loss": 2.6807, + "step": 23917 + }, + { + "epoch": 2.166927136418201, + "grad_norm": 0.9927710890769958, + "learning_rate": 5.553676070802876e-05, + "loss": 2.6264, + "step": 23918 + }, + { + "epoch": 2.167017734592648, + "grad_norm": 1.002787470817566, + "learning_rate": 5.5530719507038e-05, + "loss": 2.5338, + "step": 23919 + }, + { + "epoch": 2.1671083327670946, + "grad_norm": 0.9173005223274231, + "learning_rate": 5.552467830604724e-05, + "loss": 2.0785, + "step": 23920 + }, + { + "epoch": 2.1671989309415416, + "grad_norm": 0.946219265460968, + "learning_rate": 5.5518637105056495e-05, + "loss": 2.7153, + "step": 23921 + }, + { + "epoch": 2.167289529115988, + "grad_norm": 1.0196559429168701, + "learning_rate": 5.5512595904065736e-05, + "loss": 2.4799, + "step": 23922 + }, + { + "epoch": 2.167380127290435, + "grad_norm": 1.0884746313095093, + "learning_rate": 5.550655470307498e-05, + "loss": 2.5738, + "step": 23923 + }, + { + "epoch": 2.1674707254648817, + "grad_norm": 1.0364158153533936, + "learning_rate": 5.550051350208422e-05, + "loss": 2.4226, + "step": 23924 + }, + { + "epoch": 2.1675613236393287, + "grad_norm": 1.166157603263855, + "learning_rate": 5.549447230109346e-05, + "loss": 2.5386, + "step": 23925 + }, + { + "epoch": 2.1676519218137753, + "grad_norm": 0.9345084428787231, + "learning_rate": 5.54884311001027e-05, + "loss": 2.6499, + "step": 23926 + }, + { + "epoch": 2.1677425199882223, + "grad_norm": 0.9663710594177246, + "learning_rate": 5.548238989911194e-05, + "loss": 2.6659, + "step": 23927 + }, + { + "epoch": 2.167833118162669, + "grad_norm": 0.9654200673103333, + "learning_rate": 5.5476348698121195e-05, + "loss": 2.6237, + "step": 23928 + }, + { + "epoch": 2.167923716337116, + "grad_norm": 0.9974876046180725, + "learning_rate": 5.5470307497130436e-05, + "loss": 2.4821, + "step": 23929 + }, + { + "epoch": 2.1680143145115625, + "grad_norm": 0.9133386015892029, + "learning_rate": 5.5464266296139676e-05, + "loss": 2.6284, + "step": 23930 + }, + { + "epoch": 2.1681049126860095, + "grad_norm": 0.8331572413444519, + "learning_rate": 5.545822509514892e-05, + "loss": 1.949, + "step": 23931 + }, + { + "epoch": 2.168195510860456, + "grad_norm": 0.9481726288795471, + "learning_rate": 5.545218389415816e-05, + "loss": 2.608, + "step": 23932 + }, + { + "epoch": 2.168286109034903, + "grad_norm": 1.077109456062317, + "learning_rate": 5.54461426931674e-05, + "loss": 2.7273, + "step": 23933 + }, + { + "epoch": 2.1683767072093496, + "grad_norm": 1.0111783742904663, + "learning_rate": 5.5440101492176647e-05, + "loss": 2.359, + "step": 23934 + }, + { + "epoch": 2.1684673053837966, + "grad_norm": 0.9681062698364258, + "learning_rate": 5.543406029118589e-05, + "loss": 2.5255, + "step": 23935 + }, + { + "epoch": 2.168557903558243, + "grad_norm": 1.09569251537323, + "learning_rate": 5.5428019090195135e-05, + "loss": 2.5589, + "step": 23936 + }, + { + "epoch": 2.16864850173269, + "grad_norm": 0.9642459154129028, + "learning_rate": 5.542197788920438e-05, + "loss": 2.5884, + "step": 23937 + }, + { + "epoch": 2.1687390999071368, + "grad_norm": 0.8628683090209961, + "learning_rate": 5.5415936688213623e-05, + "loss": 1.7409, + "step": 23938 + }, + { + "epoch": 2.1688296980815838, + "grad_norm": 0.8652188181877136, + "learning_rate": 5.5409895487222864e-05, + "loss": 1.9988, + "step": 23939 + }, + { + "epoch": 2.1689202962560303, + "grad_norm": 1.0116533041000366, + "learning_rate": 5.5403854286232105e-05, + "loss": 2.7192, + "step": 23940 + }, + { + "epoch": 2.1690108944304773, + "grad_norm": 1.035569190979004, + "learning_rate": 5.5397813085241346e-05, + "loss": 2.6783, + "step": 23941 + }, + { + "epoch": 2.169101492604924, + "grad_norm": 1.070185899734497, + "learning_rate": 5.539177188425059e-05, + "loss": 2.4468, + "step": 23942 + }, + { + "epoch": 2.169192090779371, + "grad_norm": 1.0031362771987915, + "learning_rate": 5.538573068325984e-05, + "loss": 2.5619, + "step": 23943 + }, + { + "epoch": 2.1692826889538175, + "grad_norm": 1.1242451667785645, + "learning_rate": 5.537968948226908e-05, + "loss": 2.4279, + "step": 23944 + }, + { + "epoch": 2.1693732871282645, + "grad_norm": 1.0264265537261963, + "learning_rate": 5.537364828127832e-05, + "loss": 2.6959, + "step": 23945 + }, + { + "epoch": 2.169463885302711, + "grad_norm": 0.9907986521720886, + "learning_rate": 5.5367607080287564e-05, + "loss": 2.4686, + "step": 23946 + }, + { + "epoch": 2.169554483477158, + "grad_norm": 1.0122734308242798, + "learning_rate": 5.5361565879296804e-05, + "loss": 2.6675, + "step": 23947 + }, + { + "epoch": 2.1696450816516046, + "grad_norm": 1.1275393962860107, + "learning_rate": 5.5355524678306045e-05, + "loss": 2.4957, + "step": 23948 + }, + { + "epoch": 2.1697356798260516, + "grad_norm": 0.9965260028839111, + "learning_rate": 5.5349483477315286e-05, + "loss": 2.5773, + "step": 23949 + }, + { + "epoch": 2.169826278000498, + "grad_norm": 0.9433050155639648, + "learning_rate": 5.5343442276324534e-05, + "loss": 2.5154, + "step": 23950 + }, + { + "epoch": 2.169916876174945, + "grad_norm": 0.9671869277954102, + "learning_rate": 5.533740107533378e-05, + "loss": 2.3053, + "step": 23951 + }, + { + "epoch": 2.170007474349392, + "grad_norm": 0.9179713726043701, + "learning_rate": 5.533135987434302e-05, + "loss": 2.2332, + "step": 23952 + }, + { + "epoch": 2.170098072523839, + "grad_norm": 1.0741788148880005, + "learning_rate": 5.532531867335227e-05, + "loss": 2.694, + "step": 23953 + }, + { + "epoch": 2.1701886706982854, + "grad_norm": 0.971733570098877, + "learning_rate": 5.531927747236151e-05, + "loss": 2.4769, + "step": 23954 + }, + { + "epoch": 2.1702792688727324, + "grad_norm": 0.9809569120407104, + "learning_rate": 5.531323627137075e-05, + "loss": 2.6909, + "step": 23955 + }, + { + "epoch": 2.170369867047179, + "grad_norm": 1.0816601514816284, + "learning_rate": 5.530719507037999e-05, + "loss": 2.6015, + "step": 23956 + }, + { + "epoch": 2.170460465221626, + "grad_norm": 1.1195439100265503, + "learning_rate": 5.530115386938923e-05, + "loss": 2.5018, + "step": 23957 + }, + { + "epoch": 2.1705510633960725, + "grad_norm": 0.9974433779716492, + "learning_rate": 5.529511266839849e-05, + "loss": 2.5616, + "step": 23958 + }, + { + "epoch": 2.1706416615705195, + "grad_norm": 0.9512884020805359, + "learning_rate": 5.528907146740773e-05, + "loss": 2.2666, + "step": 23959 + }, + { + "epoch": 2.170732259744966, + "grad_norm": 0.999351978302002, + "learning_rate": 5.528303026641697e-05, + "loss": 2.5323, + "step": 23960 + }, + { + "epoch": 2.170822857919413, + "grad_norm": 0.97685706615448, + "learning_rate": 5.527698906542621e-05, + "loss": 2.6742, + "step": 23961 + }, + { + "epoch": 2.1709134560938597, + "grad_norm": 1.0851060152053833, + "learning_rate": 5.527094786443545e-05, + "loss": 2.3871, + "step": 23962 + }, + { + "epoch": 2.1710040542683067, + "grad_norm": 1.0135934352874756, + "learning_rate": 5.526490666344469e-05, + "loss": 2.5464, + "step": 23963 + }, + { + "epoch": 2.1710946524427532, + "grad_norm": 0.9173440933227539, + "learning_rate": 5.525886546245393e-05, + "loss": 2.4771, + "step": 23964 + }, + { + "epoch": 2.1711852506172002, + "grad_norm": 0.8698248863220215, + "learning_rate": 5.525282426146317e-05, + "loss": 2.0454, + "step": 23965 + }, + { + "epoch": 2.171275848791647, + "grad_norm": 0.9104448556900024, + "learning_rate": 5.524678306047243e-05, + "loss": 2.549, + "step": 23966 + }, + { + "epoch": 2.171366446966094, + "grad_norm": 1.0458016395568848, + "learning_rate": 5.524074185948167e-05, + "loss": 2.6614, + "step": 23967 + }, + { + "epoch": 2.1714570451405404, + "grad_norm": 1.1315617561340332, + "learning_rate": 5.523470065849091e-05, + "loss": 2.873, + "step": 23968 + }, + { + "epoch": 2.1715476433149874, + "grad_norm": 0.9908566474914551, + "learning_rate": 5.522865945750016e-05, + "loss": 2.6749, + "step": 23969 + }, + { + "epoch": 2.171638241489434, + "grad_norm": 1.0574748516082764, + "learning_rate": 5.52226182565094e-05, + "loss": 2.4828, + "step": 23970 + }, + { + "epoch": 2.171728839663881, + "grad_norm": 1.0366960763931274, + "learning_rate": 5.521657705551864e-05, + "loss": 2.6062, + "step": 23971 + }, + { + "epoch": 2.1718194378383275, + "grad_norm": 1.053339958190918, + "learning_rate": 5.521053585452788e-05, + "loss": 2.8083, + "step": 23972 + }, + { + "epoch": 2.1719100360127745, + "grad_norm": 1.1138721704483032, + "learning_rate": 5.5204494653537134e-05, + "loss": 2.7069, + "step": 23973 + }, + { + "epoch": 2.172000634187221, + "grad_norm": 0.9735769033432007, + "learning_rate": 5.5198453452546375e-05, + "loss": 2.6712, + "step": 23974 + }, + { + "epoch": 2.1720912323616677, + "grad_norm": 0.9006081223487854, + "learning_rate": 5.5192412251555615e-05, + "loss": 2.5138, + "step": 23975 + }, + { + "epoch": 2.1721818305361147, + "grad_norm": 0.8708144426345825, + "learning_rate": 5.5186371050564856e-05, + "loss": 1.8774, + "step": 23976 + }, + { + "epoch": 2.1722724287105617, + "grad_norm": 0.9759683609008789, + "learning_rate": 5.51803298495741e-05, + "loss": 2.4458, + "step": 23977 + }, + { + "epoch": 2.1723630268850083, + "grad_norm": 1.0142453908920288, + "learning_rate": 5.517428864858334e-05, + "loss": 2.5768, + "step": 23978 + }, + { + "epoch": 2.172453625059455, + "grad_norm": 0.9886578917503357, + "learning_rate": 5.516824744759258e-05, + "loss": 2.6135, + "step": 23979 + }, + { + "epoch": 2.172544223233902, + "grad_norm": 1.177799940109253, + "learning_rate": 5.516220624660182e-05, + "loss": 2.4963, + "step": 23980 + }, + { + "epoch": 2.172634821408349, + "grad_norm": 0.9668105840682983, + "learning_rate": 5.5156165045611074e-05, + "loss": 2.8821, + "step": 23981 + }, + { + "epoch": 2.1727254195827954, + "grad_norm": 1.0284266471862793, + "learning_rate": 5.5150123844620315e-05, + "loss": 2.8725, + "step": 23982 + }, + { + "epoch": 2.172816017757242, + "grad_norm": 0.9515056014060974, + "learning_rate": 5.5144082643629556e-05, + "loss": 2.65, + "step": 23983 + }, + { + "epoch": 2.172906615931689, + "grad_norm": 1.0452038049697876, + "learning_rate": 5.5138041442638796e-05, + "loss": 2.8416, + "step": 23984 + }, + { + "epoch": 2.1729972141061356, + "grad_norm": 0.9808871150016785, + "learning_rate": 5.5132000241648044e-05, + "loss": 2.69, + "step": 23985 + }, + { + "epoch": 2.1730878122805826, + "grad_norm": 1.0943164825439453, + "learning_rate": 5.5125959040657285e-05, + "loss": 2.4429, + "step": 23986 + }, + { + "epoch": 2.173178410455029, + "grad_norm": 1.0229524374008179, + "learning_rate": 5.5119917839666526e-05, + "loss": 2.6787, + "step": 23987 + }, + { + "epoch": 2.173269008629476, + "grad_norm": 1.0465707778930664, + "learning_rate": 5.511387663867577e-05, + "loss": 2.9259, + "step": 23988 + }, + { + "epoch": 2.1733596068039227, + "grad_norm": 0.9941469430923462, + "learning_rate": 5.510783543768502e-05, + "loss": 2.7386, + "step": 23989 + }, + { + "epoch": 2.1734502049783697, + "grad_norm": 1.041374683380127, + "learning_rate": 5.510179423669426e-05, + "loss": 2.5986, + "step": 23990 + }, + { + "epoch": 2.1735408031528163, + "grad_norm": 1.1251332759857178, + "learning_rate": 5.50957530357035e-05, + "loss": 2.7636, + "step": 23991 + }, + { + "epoch": 2.1736314013272633, + "grad_norm": 0.9684184193611145, + "learning_rate": 5.5089711834712743e-05, + "loss": 2.6307, + "step": 23992 + }, + { + "epoch": 2.17372199950171, + "grad_norm": 1.0000905990600586, + "learning_rate": 5.5083670633721984e-05, + "loss": 2.6987, + "step": 23993 + }, + { + "epoch": 2.173812597676157, + "grad_norm": 0.9608534574508667, + "learning_rate": 5.5077629432731225e-05, + "loss": 2.6338, + "step": 23994 + }, + { + "epoch": 2.1739031958506034, + "grad_norm": 0.955539345741272, + "learning_rate": 5.5071588231740466e-05, + "loss": 2.6133, + "step": 23995 + }, + { + "epoch": 2.1739937940250504, + "grad_norm": 0.9963970184326172, + "learning_rate": 5.506554703074972e-05, + "loss": 2.79, + "step": 23996 + }, + { + "epoch": 2.174084392199497, + "grad_norm": 1.0515408515930176, + "learning_rate": 5.505950582975896e-05, + "loss": 2.3912, + "step": 23997 + }, + { + "epoch": 2.174174990373944, + "grad_norm": 1.044087290763855, + "learning_rate": 5.50534646287682e-05, + "loss": 2.8109, + "step": 23998 + }, + { + "epoch": 2.1742655885483906, + "grad_norm": 1.0709269046783447, + "learning_rate": 5.504742342777744e-05, + "loss": 2.2198, + "step": 23999 + }, + { + "epoch": 2.1743561867228376, + "grad_norm": 0.8450218439102173, + "learning_rate": 5.5041382226786684e-05, + "loss": 1.7849, + "step": 24000 + }, + { + "epoch": 2.174446784897284, + "grad_norm": 1.0208452939987183, + "learning_rate": 5.503534102579593e-05, + "loss": 2.8986, + "step": 24001 + }, + { + "epoch": 2.174537383071731, + "grad_norm": 1.0150351524353027, + "learning_rate": 5.502929982480517e-05, + "loss": 2.5007, + "step": 24002 + }, + { + "epoch": 2.1746279812461777, + "grad_norm": 0.9592316746711731, + "learning_rate": 5.502325862381442e-05, + "loss": 2.5104, + "step": 24003 + }, + { + "epoch": 2.1747185794206247, + "grad_norm": 0.9714202284812927, + "learning_rate": 5.501721742282366e-05, + "loss": 2.4972, + "step": 24004 + }, + { + "epoch": 2.1748091775950713, + "grad_norm": 0.9940517544746399, + "learning_rate": 5.501117622183291e-05, + "loss": 2.5586, + "step": 24005 + }, + { + "epoch": 2.1748997757695183, + "grad_norm": 0.9287230372428894, + "learning_rate": 5.500513502084215e-05, + "loss": 2.4294, + "step": 24006 + }, + { + "epoch": 2.174990373943965, + "grad_norm": 1.106284499168396, + "learning_rate": 5.499909381985139e-05, + "loss": 2.6661, + "step": 24007 + }, + { + "epoch": 2.175080972118412, + "grad_norm": 0.7784183025360107, + "learning_rate": 5.499305261886063e-05, + "loss": 2.0496, + "step": 24008 + }, + { + "epoch": 2.1751715702928585, + "grad_norm": 0.9510731101036072, + "learning_rate": 5.498701141786987e-05, + "loss": 2.6094, + "step": 24009 + }, + { + "epoch": 2.1752621684673055, + "grad_norm": 0.8803509473800659, + "learning_rate": 5.498097021687911e-05, + "loss": 1.7555, + "step": 24010 + }, + { + "epoch": 2.175352766641752, + "grad_norm": 1.000609040260315, + "learning_rate": 5.497492901588837e-05, + "loss": 2.594, + "step": 24011 + }, + { + "epoch": 2.175443364816199, + "grad_norm": 1.0727049112319946, + "learning_rate": 5.496888781489761e-05, + "loss": 2.9752, + "step": 24012 + }, + { + "epoch": 2.1755339629906456, + "grad_norm": 0.950831413269043, + "learning_rate": 5.496284661390685e-05, + "loss": 1.7582, + "step": 24013 + }, + { + "epoch": 2.1756245611650926, + "grad_norm": 1.0917208194732666, + "learning_rate": 5.495680541291609e-05, + "loss": 2.5485, + "step": 24014 + }, + { + "epoch": 2.175715159339539, + "grad_norm": 0.9722142219543457, + "learning_rate": 5.495076421192533e-05, + "loss": 2.6914, + "step": 24015 + }, + { + "epoch": 2.175805757513986, + "grad_norm": 1.0675054788589478, + "learning_rate": 5.494472301093457e-05, + "loss": 3.0703, + "step": 24016 + }, + { + "epoch": 2.1758963556884328, + "grad_norm": 1.023889422416687, + "learning_rate": 5.493868180994381e-05, + "loss": 2.6603, + "step": 24017 + }, + { + "epoch": 2.1759869538628798, + "grad_norm": 0.9725723266601562, + "learning_rate": 5.4932640608953066e-05, + "loss": 2.5173, + "step": 24018 + }, + { + "epoch": 2.1760775520373263, + "grad_norm": 1.0233052968978882, + "learning_rate": 5.492659940796231e-05, + "loss": 2.7885, + "step": 24019 + }, + { + "epoch": 2.1761681502117733, + "grad_norm": 0.9602108597755432, + "learning_rate": 5.492055820697155e-05, + "loss": 2.5474, + "step": 24020 + }, + { + "epoch": 2.17625874838622, + "grad_norm": 0.9987680315971375, + "learning_rate": 5.4914517005980795e-05, + "loss": 2.5714, + "step": 24021 + }, + { + "epoch": 2.176349346560667, + "grad_norm": 0.961430549621582, + "learning_rate": 5.4908475804990036e-05, + "loss": 2.4849, + "step": 24022 + }, + { + "epoch": 2.1764399447351135, + "grad_norm": 0.8871133923530579, + "learning_rate": 5.490243460399928e-05, + "loss": 1.982, + "step": 24023 + }, + { + "epoch": 2.1765305429095605, + "grad_norm": 1.1262109279632568, + "learning_rate": 5.489639340300852e-05, + "loss": 2.3917, + "step": 24024 + }, + { + "epoch": 2.176621141084007, + "grad_norm": 0.9561803936958313, + "learning_rate": 5.489035220201776e-05, + "loss": 2.6903, + "step": 24025 + }, + { + "epoch": 2.176711739258454, + "grad_norm": 0.9903106689453125, + "learning_rate": 5.488431100102701e-05, + "loss": 2.6464, + "step": 24026 + }, + { + "epoch": 2.1768023374329006, + "grad_norm": 0.9784175753593445, + "learning_rate": 5.4878269800036254e-05, + "loss": 2.62, + "step": 24027 + }, + { + "epoch": 2.1768929356073476, + "grad_norm": 0.9747034907341003, + "learning_rate": 5.4872228599045495e-05, + "loss": 2.9204, + "step": 24028 + }, + { + "epoch": 2.176983533781794, + "grad_norm": 1.0461257696151733, + "learning_rate": 5.4866187398054735e-05, + "loss": 2.6296, + "step": 24029 + }, + { + "epoch": 2.177074131956241, + "grad_norm": 0.9544271230697632, + "learning_rate": 5.4860146197063976e-05, + "loss": 2.6618, + "step": 24030 + }, + { + "epoch": 2.177164730130688, + "grad_norm": 0.9192463755607605, + "learning_rate": 5.485410499607322e-05, + "loss": 2.4597, + "step": 24031 + }, + { + "epoch": 2.177255328305135, + "grad_norm": 0.9371446371078491, + "learning_rate": 5.484806379508246e-05, + "loss": 2.5802, + "step": 24032 + }, + { + "epoch": 2.1773459264795814, + "grad_norm": 0.9835460782051086, + "learning_rate": 5.484202259409171e-05, + "loss": 2.7122, + "step": 24033 + }, + { + "epoch": 2.1774365246540284, + "grad_norm": 1.0034379959106445, + "learning_rate": 5.483598139310095e-05, + "loss": 2.6148, + "step": 24034 + }, + { + "epoch": 2.177527122828475, + "grad_norm": 0.9693751335144043, + "learning_rate": 5.4829940192110194e-05, + "loss": 2.433, + "step": 24035 + }, + { + "epoch": 2.177617721002922, + "grad_norm": 1.005293369293213, + "learning_rate": 5.4823898991119435e-05, + "loss": 2.6527, + "step": 24036 + }, + { + "epoch": 2.1777083191773685, + "grad_norm": 1.0079295635223389, + "learning_rate": 5.481785779012868e-05, + "loss": 2.6407, + "step": 24037 + }, + { + "epoch": 2.1777989173518155, + "grad_norm": 0.972102165222168, + "learning_rate": 5.481181658913792e-05, + "loss": 2.6323, + "step": 24038 + }, + { + "epoch": 2.177889515526262, + "grad_norm": 0.9678933024406433, + "learning_rate": 5.4805775388147164e-05, + "loss": 2.6204, + "step": 24039 + }, + { + "epoch": 2.177980113700709, + "grad_norm": 1.047093391418457, + "learning_rate": 5.4799734187156405e-05, + "loss": 2.7648, + "step": 24040 + }, + { + "epoch": 2.1780707118751557, + "grad_norm": 0.9682454466819763, + "learning_rate": 5.479369298616566e-05, + "loss": 2.792, + "step": 24041 + }, + { + "epoch": 2.1781613100496027, + "grad_norm": 1.0139577388763428, + "learning_rate": 5.47876517851749e-05, + "loss": 2.6454, + "step": 24042 + }, + { + "epoch": 2.1782519082240492, + "grad_norm": 1.0087751150131226, + "learning_rate": 5.478161058418414e-05, + "loss": 2.6537, + "step": 24043 + }, + { + "epoch": 2.1783425063984962, + "grad_norm": 0.8172050714492798, + "learning_rate": 5.477556938319338e-05, + "loss": 1.9211, + "step": 24044 + }, + { + "epoch": 2.178433104572943, + "grad_norm": 0.8538711667060852, + "learning_rate": 5.476952818220262e-05, + "loss": 2.0171, + "step": 24045 + }, + { + "epoch": 2.17852370274739, + "grad_norm": 0.9771665334701538, + "learning_rate": 5.4763486981211863e-05, + "loss": 2.5748, + "step": 24046 + }, + { + "epoch": 2.1786143009218364, + "grad_norm": 1.054185152053833, + "learning_rate": 5.4757445780221104e-05, + "loss": 2.668, + "step": 24047 + }, + { + "epoch": 2.1787048990962834, + "grad_norm": 0.9436936974525452, + "learning_rate": 5.475140457923036e-05, + "loss": 2.6856, + "step": 24048 + }, + { + "epoch": 2.17879549727073, + "grad_norm": 1.025984764099121, + "learning_rate": 5.47453633782396e-05, + "loss": 2.4313, + "step": 24049 + }, + { + "epoch": 2.178886095445177, + "grad_norm": 0.9538614749908447, + "learning_rate": 5.473932217724884e-05, + "loss": 2.5695, + "step": 24050 + }, + { + "epoch": 2.1789766936196235, + "grad_norm": 0.8465344309806824, + "learning_rate": 5.473328097625808e-05, + "loss": 1.8563, + "step": 24051 + }, + { + "epoch": 2.1790672917940705, + "grad_norm": 1.138013482093811, + "learning_rate": 5.472723977526732e-05, + "loss": 2.6843, + "step": 24052 + }, + { + "epoch": 2.179157889968517, + "grad_norm": 0.9523173570632935, + "learning_rate": 5.472119857427657e-05, + "loss": 2.7614, + "step": 24053 + }, + { + "epoch": 2.179248488142964, + "grad_norm": 0.9961135983467102, + "learning_rate": 5.471515737328581e-05, + "loss": 2.5887, + "step": 24054 + }, + { + "epoch": 2.1793390863174107, + "grad_norm": 1.0189608335494995, + "learning_rate": 5.470911617229505e-05, + "loss": 2.3135, + "step": 24055 + }, + { + "epoch": 2.1794296844918577, + "grad_norm": 0.945957601070404, + "learning_rate": 5.4703074971304306e-05, + "loss": 2.7348, + "step": 24056 + }, + { + "epoch": 2.1795202826663043, + "grad_norm": 0.9792180061340332, + "learning_rate": 5.4697033770313546e-05, + "loss": 2.3651, + "step": 24057 + }, + { + "epoch": 2.179610880840751, + "grad_norm": 0.9090420007705688, + "learning_rate": 5.469099256932279e-05, + "loss": 2.0098, + "step": 24058 + }, + { + "epoch": 2.179701479015198, + "grad_norm": 1.0772595405578613, + "learning_rate": 5.468495136833203e-05, + "loss": 2.6471, + "step": 24059 + }, + { + "epoch": 2.179792077189645, + "grad_norm": 0.9321334362030029, + "learning_rate": 5.467891016734127e-05, + "loss": 2.617, + "step": 24060 + }, + { + "epoch": 2.1798826753640914, + "grad_norm": 0.9851348996162415, + "learning_rate": 5.467286896635051e-05, + "loss": 2.6147, + "step": 24061 + }, + { + "epoch": 2.179973273538538, + "grad_norm": 0.9516922831535339, + "learning_rate": 5.466682776535975e-05, + "loss": 2.8353, + "step": 24062 + }, + { + "epoch": 2.180063871712985, + "grad_norm": 0.9518684148788452, + "learning_rate": 5.4660786564369005e-05, + "loss": 2.7268, + "step": 24063 + }, + { + "epoch": 2.180154469887432, + "grad_norm": 1.0591411590576172, + "learning_rate": 5.4654745363378246e-05, + "loss": 2.7364, + "step": 24064 + }, + { + "epoch": 2.1802450680618786, + "grad_norm": 0.9678483009338379, + "learning_rate": 5.464870416238749e-05, + "loss": 2.6471, + "step": 24065 + }, + { + "epoch": 2.180335666236325, + "grad_norm": 1.0088282823562622, + "learning_rate": 5.464266296139673e-05, + "loss": 2.5704, + "step": 24066 + }, + { + "epoch": 2.180426264410772, + "grad_norm": 0.9865283966064453, + "learning_rate": 5.463662176040597e-05, + "loss": 2.6531, + "step": 24067 + }, + { + "epoch": 2.1805168625852187, + "grad_norm": 0.9372443556785583, + "learning_rate": 5.463058055941521e-05, + "loss": 2.65, + "step": 24068 + }, + { + "epoch": 2.1806074607596657, + "grad_norm": 1.1657694578170776, + "learning_rate": 5.462453935842446e-05, + "loss": 2.4256, + "step": 24069 + }, + { + "epoch": 2.1806980589341123, + "grad_norm": 1.0417201519012451, + "learning_rate": 5.46184981574337e-05, + "loss": 2.638, + "step": 24070 + }, + { + "epoch": 2.1807886571085593, + "grad_norm": 0.9780079126358032, + "learning_rate": 5.4612456956442945e-05, + "loss": 2.6348, + "step": 24071 + }, + { + "epoch": 2.180879255283006, + "grad_norm": 1.0139501094818115, + "learning_rate": 5.4606415755452186e-05, + "loss": 2.7462, + "step": 24072 + }, + { + "epoch": 2.180969853457453, + "grad_norm": 0.836466372013092, + "learning_rate": 5.4600374554461434e-05, + "loss": 1.7739, + "step": 24073 + }, + { + "epoch": 2.1810604516318994, + "grad_norm": 1.0037633180618286, + "learning_rate": 5.4594333353470674e-05, + "loss": 2.8197, + "step": 24074 + }, + { + "epoch": 2.1811510498063464, + "grad_norm": 1.006277084350586, + "learning_rate": 5.4588292152479915e-05, + "loss": 2.8719, + "step": 24075 + }, + { + "epoch": 2.181241647980793, + "grad_norm": 0.9793887734413147, + "learning_rate": 5.4582250951489156e-05, + "loss": 2.4438, + "step": 24076 + }, + { + "epoch": 2.18133224615524, + "grad_norm": 0.8560184240341187, + "learning_rate": 5.45762097504984e-05, + "loss": 2.0591, + "step": 24077 + }, + { + "epoch": 2.1814228443296866, + "grad_norm": 0.9790959358215332, + "learning_rate": 5.457016854950765e-05, + "loss": 2.6088, + "step": 24078 + }, + { + "epoch": 2.1815134425041336, + "grad_norm": 0.8655616044998169, + "learning_rate": 5.456412734851689e-05, + "loss": 1.9992, + "step": 24079 + }, + { + "epoch": 2.18160404067858, + "grad_norm": 0.9408383369445801, + "learning_rate": 5.455808614752613e-05, + "loss": 2.5494, + "step": 24080 + }, + { + "epoch": 2.181694638853027, + "grad_norm": 1.029467225074768, + "learning_rate": 5.4552044946535374e-05, + "loss": 2.7546, + "step": 24081 + }, + { + "epoch": 2.1817852370274737, + "grad_norm": 0.9504972696304321, + "learning_rate": 5.4546003745544615e-05, + "loss": 2.8605, + "step": 24082 + }, + { + "epoch": 2.1818758352019207, + "grad_norm": 1.0493310689926147, + "learning_rate": 5.4539962544553855e-05, + "loss": 2.6173, + "step": 24083 + }, + { + "epoch": 2.1819664333763673, + "grad_norm": 0.9784508347511292, + "learning_rate": 5.4533921343563096e-05, + "loss": 2.5441, + "step": 24084 + }, + { + "epoch": 2.1820570315508143, + "grad_norm": 0.9917535781860352, + "learning_rate": 5.4527880142572344e-05, + "loss": 2.8946, + "step": 24085 + }, + { + "epoch": 2.182147629725261, + "grad_norm": 1.0145437717437744, + "learning_rate": 5.452183894158159e-05, + "loss": 2.4814, + "step": 24086 + }, + { + "epoch": 2.182238227899708, + "grad_norm": 1.0449808835983276, + "learning_rate": 5.451579774059083e-05, + "loss": 2.4963, + "step": 24087 + }, + { + "epoch": 2.1823288260741545, + "grad_norm": 1.0290250778198242, + "learning_rate": 5.450975653960007e-05, + "loss": 2.7114, + "step": 24088 + }, + { + "epoch": 2.1824194242486015, + "grad_norm": 0.980208694934845, + "learning_rate": 5.450371533860932e-05, + "loss": 2.6517, + "step": 24089 + }, + { + "epoch": 2.182510022423048, + "grad_norm": 0.9729131460189819, + "learning_rate": 5.449767413761856e-05, + "loss": 2.8314, + "step": 24090 + }, + { + "epoch": 2.182600620597495, + "grad_norm": 0.9595668315887451, + "learning_rate": 5.44916329366278e-05, + "loss": 2.4459, + "step": 24091 + }, + { + "epoch": 2.1826912187719416, + "grad_norm": 0.9936398267745972, + "learning_rate": 5.448559173563704e-05, + "loss": 2.3605, + "step": 24092 + }, + { + "epoch": 2.1827818169463886, + "grad_norm": 1.1150144338607788, + "learning_rate": 5.44795505346463e-05, + "loss": 2.5824, + "step": 24093 + }, + { + "epoch": 2.182872415120835, + "grad_norm": 1.045683741569519, + "learning_rate": 5.447350933365554e-05, + "loss": 2.3781, + "step": 24094 + }, + { + "epoch": 2.182963013295282, + "grad_norm": 1.103066086769104, + "learning_rate": 5.446746813266478e-05, + "loss": 2.747, + "step": 24095 + }, + { + "epoch": 2.1830536114697288, + "grad_norm": 0.9467779397964478, + "learning_rate": 5.446142693167402e-05, + "loss": 2.4366, + "step": 24096 + }, + { + "epoch": 2.1831442096441758, + "grad_norm": 1.0418787002563477, + "learning_rate": 5.445538573068326e-05, + "loss": 2.558, + "step": 24097 + }, + { + "epoch": 2.1832348078186223, + "grad_norm": 0.9605383276939392, + "learning_rate": 5.44493445296925e-05, + "loss": 2.5839, + "step": 24098 + }, + { + "epoch": 2.1833254059930693, + "grad_norm": 1.0427753925323486, + "learning_rate": 5.444330332870174e-05, + "loss": 2.6197, + "step": 24099 + }, + { + "epoch": 2.183416004167516, + "grad_norm": 1.040168285369873, + "learning_rate": 5.4437262127710983e-05, + "loss": 2.4779, + "step": 24100 + }, + { + "epoch": 2.183506602341963, + "grad_norm": 0.8676812052726746, + "learning_rate": 5.443122092672024e-05, + "loss": 1.8801, + "step": 24101 + }, + { + "epoch": 2.1835972005164095, + "grad_norm": 1.0787756443023682, + "learning_rate": 5.442517972572948e-05, + "loss": 2.7369, + "step": 24102 + }, + { + "epoch": 2.1836877986908565, + "grad_norm": 0.9809891581535339, + "learning_rate": 5.441913852473872e-05, + "loss": 1.8012, + "step": 24103 + }, + { + "epoch": 2.183778396865303, + "grad_norm": 0.9141408205032349, + "learning_rate": 5.441309732374796e-05, + "loss": 2.4481, + "step": 24104 + }, + { + "epoch": 2.18386899503975, + "grad_norm": 0.9422247409820557, + "learning_rate": 5.440705612275721e-05, + "loss": 2.6041, + "step": 24105 + }, + { + "epoch": 2.1839595932141966, + "grad_norm": 1.0468202829360962, + "learning_rate": 5.440101492176645e-05, + "loss": 2.8016, + "step": 24106 + }, + { + "epoch": 2.1840501913886436, + "grad_norm": 1.0522451400756836, + "learning_rate": 5.439497372077569e-05, + "loss": 2.714, + "step": 24107 + }, + { + "epoch": 2.18414078956309, + "grad_norm": 1.158981442451477, + "learning_rate": 5.4388932519784944e-05, + "loss": 2.4662, + "step": 24108 + }, + { + "epoch": 2.1842313877375372, + "grad_norm": 1.118939995765686, + "learning_rate": 5.4382891318794185e-05, + "loss": 2.4368, + "step": 24109 + }, + { + "epoch": 2.184321985911984, + "grad_norm": 0.9836773872375488, + "learning_rate": 5.4376850117803426e-05, + "loss": 2.5712, + "step": 24110 + }, + { + "epoch": 2.184412584086431, + "grad_norm": 0.8553103804588318, + "learning_rate": 5.4370808916812666e-05, + "loss": 2.1434, + "step": 24111 + }, + { + "epoch": 2.1845031822608774, + "grad_norm": 1.0566072463989258, + "learning_rate": 5.436476771582191e-05, + "loss": 2.4184, + "step": 24112 + }, + { + "epoch": 2.1845937804353244, + "grad_norm": 0.9732450842857361, + "learning_rate": 5.435872651483115e-05, + "loss": 2.5135, + "step": 24113 + }, + { + "epoch": 2.184684378609771, + "grad_norm": 0.9610328078269958, + "learning_rate": 5.435268531384039e-05, + "loss": 2.6114, + "step": 24114 + }, + { + "epoch": 2.184774976784218, + "grad_norm": 0.9851844906806946, + "learning_rate": 5.434664411284963e-05, + "loss": 2.6645, + "step": 24115 + }, + { + "epoch": 2.1848655749586645, + "grad_norm": 0.9916605353355408, + "learning_rate": 5.4340602911858884e-05, + "loss": 2.6249, + "step": 24116 + }, + { + "epoch": 2.1849561731331115, + "grad_norm": 1.0591872930526733, + "learning_rate": 5.4334561710868125e-05, + "loss": 2.3847, + "step": 24117 + }, + { + "epoch": 2.185046771307558, + "grad_norm": 1.082241415977478, + "learning_rate": 5.4328520509877366e-05, + "loss": 2.6779, + "step": 24118 + }, + { + "epoch": 2.185137369482005, + "grad_norm": 0.931211531162262, + "learning_rate": 5.432247930888661e-05, + "loss": 2.7077, + "step": 24119 + }, + { + "epoch": 2.1852279676564517, + "grad_norm": 0.8569030165672302, + "learning_rate": 5.431643810789585e-05, + "loss": 2.0213, + "step": 24120 + }, + { + "epoch": 2.1853185658308987, + "grad_norm": 0.9881839752197266, + "learning_rate": 5.4310396906905095e-05, + "loss": 2.593, + "step": 24121 + }, + { + "epoch": 2.1854091640053452, + "grad_norm": 0.957100510597229, + "learning_rate": 5.4304355705914336e-05, + "loss": 2.8719, + "step": 24122 + }, + { + "epoch": 2.1854997621797922, + "grad_norm": 1.06679368019104, + "learning_rate": 5.4298314504923583e-05, + "loss": 2.759, + "step": 24123 + }, + { + "epoch": 2.185590360354239, + "grad_norm": 0.9442565441131592, + "learning_rate": 5.429227330393283e-05, + "loss": 2.0501, + "step": 24124 + }, + { + "epoch": 2.185680958528686, + "grad_norm": 0.9948819875717163, + "learning_rate": 5.428623210294207e-05, + "loss": 2.392, + "step": 24125 + }, + { + "epoch": 2.1857715567031324, + "grad_norm": 1.012609839439392, + "learning_rate": 5.428019090195131e-05, + "loss": 2.5109, + "step": 24126 + }, + { + "epoch": 2.1858621548775794, + "grad_norm": 0.942608654499054, + "learning_rate": 5.4274149700960554e-05, + "loss": 2.389, + "step": 24127 + }, + { + "epoch": 2.185952753052026, + "grad_norm": 0.9394868612289429, + "learning_rate": 5.4268108499969794e-05, + "loss": 2.393, + "step": 24128 + }, + { + "epoch": 2.186043351226473, + "grad_norm": 0.9191456437110901, + "learning_rate": 5.4262067298979035e-05, + "loss": 2.7213, + "step": 24129 + }, + { + "epoch": 2.1861339494009195, + "grad_norm": 0.9447809457778931, + "learning_rate": 5.4256026097988276e-05, + "loss": 1.964, + "step": 24130 + }, + { + "epoch": 2.1862245475753666, + "grad_norm": 0.9105162024497986, + "learning_rate": 5.424998489699753e-05, + "loss": 2.5276, + "step": 24131 + }, + { + "epoch": 2.186315145749813, + "grad_norm": 1.0209132432937622, + "learning_rate": 5.424394369600677e-05, + "loss": 2.7481, + "step": 24132 + }, + { + "epoch": 2.18640574392426, + "grad_norm": 0.9177475571632385, + "learning_rate": 5.423790249501601e-05, + "loss": 1.9715, + "step": 24133 + }, + { + "epoch": 2.1864963420987067, + "grad_norm": 1.0639303922653198, + "learning_rate": 5.423186129402525e-05, + "loss": 2.3664, + "step": 24134 + }, + { + "epoch": 2.1865869402731537, + "grad_norm": 0.9566017985343933, + "learning_rate": 5.4225820093034494e-05, + "loss": 2.488, + "step": 24135 + }, + { + "epoch": 2.1866775384476003, + "grad_norm": 1.0996956825256348, + "learning_rate": 5.4219778892043735e-05, + "loss": 2.6389, + "step": 24136 + }, + { + "epoch": 2.186768136622047, + "grad_norm": 1.030204176902771, + "learning_rate": 5.421373769105298e-05, + "loss": 2.5066, + "step": 24137 + }, + { + "epoch": 2.186858734796494, + "grad_norm": 1.1268328428268433, + "learning_rate": 5.420769649006223e-05, + "loss": 2.7545, + "step": 24138 + }, + { + "epoch": 2.186949332970941, + "grad_norm": 1.0741831064224243, + "learning_rate": 5.420165528907147e-05, + "loss": 2.6094, + "step": 24139 + }, + { + "epoch": 2.1870399311453874, + "grad_norm": 0.9615746140480042, + "learning_rate": 5.419561408808072e-05, + "loss": 2.4367, + "step": 24140 + }, + { + "epoch": 2.187130529319834, + "grad_norm": 0.9750231504440308, + "learning_rate": 5.418957288708996e-05, + "loss": 2.7743, + "step": 24141 + }, + { + "epoch": 2.187221127494281, + "grad_norm": 1.0215033292770386, + "learning_rate": 5.41835316860992e-05, + "loss": 2.5546, + "step": 24142 + }, + { + "epoch": 2.187311725668728, + "grad_norm": 0.9951531887054443, + "learning_rate": 5.417749048510844e-05, + "loss": 2.4432, + "step": 24143 + }, + { + "epoch": 2.1874023238431746, + "grad_norm": 1.003763198852539, + "learning_rate": 5.417144928411768e-05, + "loss": 2.8671, + "step": 24144 + }, + { + "epoch": 2.187492922017621, + "grad_norm": 0.9464038014411926, + "learning_rate": 5.416540808312692e-05, + "loss": 2.4289, + "step": 24145 + }, + { + "epoch": 2.187583520192068, + "grad_norm": 0.8171577453613281, + "learning_rate": 5.415936688213618e-05, + "loss": 1.73, + "step": 24146 + }, + { + "epoch": 2.1876741183665147, + "grad_norm": 1.0810062885284424, + "learning_rate": 5.415332568114542e-05, + "loss": 2.5761, + "step": 24147 + }, + { + "epoch": 2.1877647165409617, + "grad_norm": 1.4132099151611328, + "learning_rate": 5.414728448015466e-05, + "loss": 2.5104, + "step": 24148 + }, + { + "epoch": 2.1878553147154083, + "grad_norm": 0.885877788066864, + "learning_rate": 5.41412432791639e-05, + "loss": 2.0458, + "step": 24149 + }, + { + "epoch": 2.1879459128898553, + "grad_norm": 1.018398404121399, + "learning_rate": 5.413520207817314e-05, + "loss": 2.8271, + "step": 24150 + }, + { + "epoch": 2.188036511064302, + "grad_norm": 1.007159948348999, + "learning_rate": 5.412916087718238e-05, + "loss": 2.75, + "step": 24151 + }, + { + "epoch": 2.188127109238749, + "grad_norm": 0.9757589101791382, + "learning_rate": 5.412311967619162e-05, + "loss": 2.6147, + "step": 24152 + }, + { + "epoch": 2.1882177074131954, + "grad_norm": 0.985755205154419, + "learning_rate": 5.4117078475200876e-05, + "loss": 2.7402, + "step": 24153 + }, + { + "epoch": 2.1883083055876424, + "grad_norm": 0.9271398782730103, + "learning_rate": 5.411103727421012e-05, + "loss": 2.5255, + "step": 24154 + }, + { + "epoch": 2.188398903762089, + "grad_norm": 0.9781710505485535, + "learning_rate": 5.410499607321936e-05, + "loss": 2.4159, + "step": 24155 + }, + { + "epoch": 2.188489501936536, + "grad_norm": 0.9316450357437134, + "learning_rate": 5.4098954872228605e-05, + "loss": 2.3319, + "step": 24156 + }, + { + "epoch": 2.1885801001109826, + "grad_norm": 1.0367531776428223, + "learning_rate": 5.4092913671237846e-05, + "loss": 2.4988, + "step": 24157 + }, + { + "epoch": 2.1886706982854296, + "grad_norm": 0.980626106262207, + "learning_rate": 5.408687247024709e-05, + "loss": 2.6692, + "step": 24158 + }, + { + "epoch": 2.188761296459876, + "grad_norm": 1.0089343786239624, + "learning_rate": 5.408083126925633e-05, + "loss": 2.6761, + "step": 24159 + }, + { + "epoch": 2.188851894634323, + "grad_norm": 1.1003212928771973, + "learning_rate": 5.407479006826557e-05, + "loss": 2.4069, + "step": 24160 + }, + { + "epoch": 2.1889424928087697, + "grad_norm": 0.9874923825263977, + "learning_rate": 5.406874886727482e-05, + "loss": 2.5428, + "step": 24161 + }, + { + "epoch": 2.1890330909832167, + "grad_norm": 1.0191211700439453, + "learning_rate": 5.4062707666284064e-05, + "loss": 2.9605, + "step": 24162 + }, + { + "epoch": 2.1891236891576633, + "grad_norm": 1.0446505546569824, + "learning_rate": 5.4056666465293305e-05, + "loss": 2.6732, + "step": 24163 + }, + { + "epoch": 2.1892142873321103, + "grad_norm": 1.0327461957931519, + "learning_rate": 5.4050625264302546e-05, + "loss": 2.6014, + "step": 24164 + }, + { + "epoch": 2.189304885506557, + "grad_norm": 1.0296953916549683, + "learning_rate": 5.4044584063311786e-05, + "loss": 2.7716, + "step": 24165 + }, + { + "epoch": 2.189395483681004, + "grad_norm": 1.0000197887420654, + "learning_rate": 5.403854286232103e-05, + "loss": 2.3647, + "step": 24166 + }, + { + "epoch": 2.1894860818554505, + "grad_norm": 0.9463416337966919, + "learning_rate": 5.403250166133027e-05, + "loss": 2.6254, + "step": 24167 + }, + { + "epoch": 2.1895766800298975, + "grad_norm": 0.9966443777084351, + "learning_rate": 5.402646046033952e-05, + "loss": 2.7631, + "step": 24168 + }, + { + "epoch": 2.189667278204344, + "grad_norm": 0.9993438720703125, + "learning_rate": 5.402041925934876e-05, + "loss": 2.7085, + "step": 24169 + }, + { + "epoch": 2.189757876378791, + "grad_norm": 1.109045147895813, + "learning_rate": 5.4014378058358004e-05, + "loss": 2.4898, + "step": 24170 + }, + { + "epoch": 2.1898484745532376, + "grad_norm": 0.9484251737594604, + "learning_rate": 5.4008336857367245e-05, + "loss": 2.6828, + "step": 24171 + }, + { + "epoch": 2.1899390727276846, + "grad_norm": 1.0571337938308716, + "learning_rate": 5.400229565637649e-05, + "loss": 2.638, + "step": 24172 + }, + { + "epoch": 2.190029670902131, + "grad_norm": 0.9622308015823364, + "learning_rate": 5.3996254455385733e-05, + "loss": 2.6832, + "step": 24173 + }, + { + "epoch": 2.190120269076578, + "grad_norm": 0.7879108786582947, + "learning_rate": 5.3990213254394974e-05, + "loss": 1.909, + "step": 24174 + }, + { + "epoch": 2.1902108672510248, + "grad_norm": 1.0998631715774536, + "learning_rate": 5.3984172053404215e-05, + "loss": 2.607, + "step": 24175 + }, + { + "epoch": 2.1903014654254718, + "grad_norm": 0.8628975749015808, + "learning_rate": 5.397813085241347e-05, + "loss": 1.9641, + "step": 24176 + }, + { + "epoch": 2.1903920635999183, + "grad_norm": 0.9386603236198425, + "learning_rate": 5.397208965142271e-05, + "loss": 2.5471, + "step": 24177 + }, + { + "epoch": 2.1904826617743653, + "grad_norm": 1.04398775100708, + "learning_rate": 5.396604845043195e-05, + "loss": 2.655, + "step": 24178 + }, + { + "epoch": 2.190573259948812, + "grad_norm": 1.0599690675735474, + "learning_rate": 5.396000724944119e-05, + "loss": 2.5289, + "step": 24179 + }, + { + "epoch": 2.190663858123259, + "grad_norm": 0.9935640692710876, + "learning_rate": 5.395396604845043e-05, + "loss": 2.5851, + "step": 24180 + }, + { + "epoch": 2.1907544562977055, + "grad_norm": 1.079746961593628, + "learning_rate": 5.3947924847459674e-05, + "loss": 2.7583, + "step": 24181 + }, + { + "epoch": 2.1908450544721525, + "grad_norm": 1.0127747058868408, + "learning_rate": 5.3941883646468914e-05, + "loss": 2.5248, + "step": 24182 + }, + { + "epoch": 2.190935652646599, + "grad_norm": 0.954674482345581, + "learning_rate": 5.393584244547817e-05, + "loss": 2.6614, + "step": 24183 + }, + { + "epoch": 2.191026250821046, + "grad_norm": 1.0900495052337646, + "learning_rate": 5.392980124448741e-05, + "loss": 2.5369, + "step": 24184 + }, + { + "epoch": 2.1911168489954926, + "grad_norm": 0.8215780258178711, + "learning_rate": 5.392376004349665e-05, + "loss": 1.7131, + "step": 24185 + }, + { + "epoch": 2.1912074471699396, + "grad_norm": 0.9705542325973511, + "learning_rate": 5.391771884250589e-05, + "loss": 2.5441, + "step": 24186 + }, + { + "epoch": 2.191298045344386, + "grad_norm": 1.0005937814712524, + "learning_rate": 5.391167764151513e-05, + "loss": 2.6158, + "step": 24187 + }, + { + "epoch": 2.1913886435188332, + "grad_norm": 1.0900015830993652, + "learning_rate": 5.390563644052438e-05, + "loss": 2.5708, + "step": 24188 + }, + { + "epoch": 2.19147924169328, + "grad_norm": 0.970356285572052, + "learning_rate": 5.389959523953362e-05, + "loss": 2.5153, + "step": 24189 + }, + { + "epoch": 2.191569839867727, + "grad_norm": 1.0001085996627808, + "learning_rate": 5.389355403854286e-05, + "loss": 2.5419, + "step": 24190 + }, + { + "epoch": 2.1916604380421734, + "grad_norm": 1.0272661447525024, + "learning_rate": 5.388751283755211e-05, + "loss": 2.6356, + "step": 24191 + }, + { + "epoch": 2.1917510362166204, + "grad_norm": 0.9380815029144287, + "learning_rate": 5.3881471636561357e-05, + "loss": 2.6387, + "step": 24192 + }, + { + "epoch": 2.191841634391067, + "grad_norm": 0.959356427192688, + "learning_rate": 5.38754304355706e-05, + "loss": 2.7987, + "step": 24193 + }, + { + "epoch": 2.191932232565514, + "grad_norm": 1.1260186433792114, + "learning_rate": 5.386938923457984e-05, + "loss": 2.3706, + "step": 24194 + }, + { + "epoch": 2.1920228307399605, + "grad_norm": 0.9897041916847229, + "learning_rate": 5.386334803358908e-05, + "loss": 2.7097, + "step": 24195 + }, + { + "epoch": 2.1921134289144075, + "grad_norm": 0.9185648560523987, + "learning_rate": 5.385730683259832e-05, + "loss": 2.4949, + "step": 24196 + }, + { + "epoch": 2.192204027088854, + "grad_norm": 0.831383466720581, + "learning_rate": 5.385126563160756e-05, + "loss": 1.9687, + "step": 24197 + }, + { + "epoch": 2.192294625263301, + "grad_norm": 1.0892921686172485, + "learning_rate": 5.3845224430616815e-05, + "loss": 2.3931, + "step": 24198 + }, + { + "epoch": 2.1923852234377477, + "grad_norm": 0.9487707018852234, + "learning_rate": 5.3839183229626056e-05, + "loss": 2.4188, + "step": 24199 + }, + { + "epoch": 2.1924758216121947, + "grad_norm": 1.0233299732208252, + "learning_rate": 5.38331420286353e-05, + "loss": 2.7219, + "step": 24200 + }, + { + "epoch": 2.1925664197866412, + "grad_norm": 0.9976934194564819, + "learning_rate": 5.382710082764454e-05, + "loss": 2.7546, + "step": 24201 + }, + { + "epoch": 2.1926570179610883, + "grad_norm": 1.0717341899871826, + "learning_rate": 5.382105962665378e-05, + "loss": 2.421, + "step": 24202 + }, + { + "epoch": 2.192747616135535, + "grad_norm": 1.1087568998336792, + "learning_rate": 5.381501842566302e-05, + "loss": 2.7713, + "step": 24203 + }, + { + "epoch": 2.192838214309982, + "grad_norm": 1.008979082107544, + "learning_rate": 5.380897722467226e-05, + "loss": 2.5805, + "step": 24204 + }, + { + "epoch": 2.1929288124844284, + "grad_norm": 0.8484503030776978, + "learning_rate": 5.380293602368151e-05, + "loss": 1.8067, + "step": 24205 + }, + { + "epoch": 2.1930194106588754, + "grad_norm": 0.9687211513519287, + "learning_rate": 5.3796894822690755e-05, + "loss": 2.6485, + "step": 24206 + }, + { + "epoch": 2.193110008833322, + "grad_norm": 1.023596167564392, + "learning_rate": 5.3790853621699996e-05, + "loss": 2.6559, + "step": 24207 + }, + { + "epoch": 2.193200607007769, + "grad_norm": 1.0378674268722534, + "learning_rate": 5.3784812420709244e-05, + "loss": 2.585, + "step": 24208 + }, + { + "epoch": 2.1932912051822155, + "grad_norm": 0.9884794354438782, + "learning_rate": 5.3778771219718485e-05, + "loss": 2.4646, + "step": 24209 + }, + { + "epoch": 2.1933818033566626, + "grad_norm": 1.0348474979400635, + "learning_rate": 5.3772730018727725e-05, + "loss": 2.6488, + "step": 24210 + }, + { + "epoch": 2.193472401531109, + "grad_norm": 0.9658702611923218, + "learning_rate": 5.3766688817736966e-05, + "loss": 2.4745, + "step": 24211 + }, + { + "epoch": 2.193562999705556, + "grad_norm": 0.888719379901886, + "learning_rate": 5.376064761674621e-05, + "loss": 2.0935, + "step": 24212 + }, + { + "epoch": 2.1936535978800027, + "grad_norm": 0.9783333539962769, + "learning_rate": 5.375460641575546e-05, + "loss": 2.8508, + "step": 24213 + }, + { + "epoch": 2.1937441960544497, + "grad_norm": 1.0666424036026, + "learning_rate": 5.37485652147647e-05, + "loss": 2.7374, + "step": 24214 + }, + { + "epoch": 2.1938347942288963, + "grad_norm": 0.9777833819389343, + "learning_rate": 5.374252401377394e-05, + "loss": 2.3382, + "step": 24215 + }, + { + "epoch": 2.1939253924033433, + "grad_norm": 0.9631014466285706, + "learning_rate": 5.3736482812783184e-05, + "loss": 2.707, + "step": 24216 + }, + { + "epoch": 2.19401599057779, + "grad_norm": 1.0038695335388184, + "learning_rate": 5.3730441611792425e-05, + "loss": 2.4283, + "step": 24217 + }, + { + "epoch": 2.194106588752237, + "grad_norm": 0.9540562033653259, + "learning_rate": 5.3724400410801666e-05, + "loss": 2.5404, + "step": 24218 + }, + { + "epoch": 2.1941971869266834, + "grad_norm": 0.9694822430610657, + "learning_rate": 5.3718359209810906e-05, + "loss": 2.8075, + "step": 24219 + }, + { + "epoch": 2.19428778510113, + "grad_norm": 0.9783774018287659, + "learning_rate": 5.371231800882015e-05, + "loss": 2.7455, + "step": 24220 + }, + { + "epoch": 2.194378383275577, + "grad_norm": 0.9837494492530823, + "learning_rate": 5.37062768078294e-05, + "loss": 2.7655, + "step": 24221 + }, + { + "epoch": 2.194468981450024, + "grad_norm": 1.161607027053833, + "learning_rate": 5.370023560683864e-05, + "loss": 2.7485, + "step": 24222 + }, + { + "epoch": 2.1945595796244706, + "grad_norm": 1.0275497436523438, + "learning_rate": 5.369419440584788e-05, + "loss": 2.4923, + "step": 24223 + }, + { + "epoch": 2.194650177798917, + "grad_norm": 0.9806829690933228, + "learning_rate": 5.368815320485713e-05, + "loss": 2.704, + "step": 24224 + }, + { + "epoch": 2.194740775973364, + "grad_norm": 0.9925400614738464, + "learning_rate": 5.368211200386637e-05, + "loss": 2.4535, + "step": 24225 + }, + { + "epoch": 2.194831374147811, + "grad_norm": 1.0379369258880615, + "learning_rate": 5.367607080287561e-05, + "loss": 2.8116, + "step": 24226 + }, + { + "epoch": 2.1949219723222577, + "grad_norm": 0.8052270412445068, + "learning_rate": 5.367002960188485e-05, + "loss": 1.7862, + "step": 24227 + }, + { + "epoch": 2.1950125704967043, + "grad_norm": 1.0052000284194946, + "learning_rate": 5.366398840089411e-05, + "loss": 2.5431, + "step": 24228 + }, + { + "epoch": 2.1951031686711513, + "grad_norm": 0.9545775651931763, + "learning_rate": 5.365794719990335e-05, + "loss": 2.3678, + "step": 24229 + }, + { + "epoch": 2.195193766845598, + "grad_norm": 1.0448713302612305, + "learning_rate": 5.365190599891259e-05, + "loss": 2.5909, + "step": 24230 + }, + { + "epoch": 2.195284365020045, + "grad_norm": 0.9667290449142456, + "learning_rate": 5.364586479792183e-05, + "loss": 2.4502, + "step": 24231 + }, + { + "epoch": 2.1953749631944914, + "grad_norm": 1.0276226997375488, + "learning_rate": 5.363982359693107e-05, + "loss": 2.5382, + "step": 24232 + }, + { + "epoch": 2.1954655613689384, + "grad_norm": 1.0144672393798828, + "learning_rate": 5.363378239594031e-05, + "loss": 2.4111, + "step": 24233 + }, + { + "epoch": 2.195556159543385, + "grad_norm": 0.9965855479240417, + "learning_rate": 5.362774119494955e-05, + "loss": 2.5705, + "step": 24234 + }, + { + "epoch": 2.195646757717832, + "grad_norm": 1.043418049812317, + "learning_rate": 5.3621699993958794e-05, + "loss": 2.662, + "step": 24235 + }, + { + "epoch": 2.1957373558922786, + "grad_norm": 0.9526211023330688, + "learning_rate": 5.361565879296805e-05, + "loss": 2.2123, + "step": 24236 + }, + { + "epoch": 2.1958279540667256, + "grad_norm": 0.9604742527008057, + "learning_rate": 5.360961759197729e-05, + "loss": 2.2676, + "step": 24237 + }, + { + "epoch": 2.195918552241172, + "grad_norm": 0.8534219264984131, + "learning_rate": 5.360357639098653e-05, + "loss": 2.0723, + "step": 24238 + }, + { + "epoch": 2.196009150415619, + "grad_norm": 0.9805982112884521, + "learning_rate": 5.359753518999577e-05, + "loss": 2.5061, + "step": 24239 + }, + { + "epoch": 2.1960997485900657, + "grad_norm": 1.1673177480697632, + "learning_rate": 5.359149398900502e-05, + "loss": 2.4339, + "step": 24240 + }, + { + "epoch": 2.1961903467645127, + "grad_norm": 0.9828906655311584, + "learning_rate": 5.358545278801426e-05, + "loss": 2.6831, + "step": 24241 + }, + { + "epoch": 2.1962809449389593, + "grad_norm": 1.1215946674346924, + "learning_rate": 5.35794115870235e-05, + "loss": 2.5272, + "step": 24242 + }, + { + "epoch": 2.1963715431134063, + "grad_norm": 0.8815866708755493, + "learning_rate": 5.3573370386032754e-05, + "loss": 2.1286, + "step": 24243 + }, + { + "epoch": 2.196462141287853, + "grad_norm": 0.9929757118225098, + "learning_rate": 5.3567329185041995e-05, + "loss": 2.4954, + "step": 24244 + }, + { + "epoch": 2.1965527394623, + "grad_norm": 0.9844332933425903, + "learning_rate": 5.3561287984051236e-05, + "loss": 2.5656, + "step": 24245 + }, + { + "epoch": 2.1966433376367465, + "grad_norm": 1.0952467918395996, + "learning_rate": 5.3555246783060477e-05, + "loss": 2.7177, + "step": 24246 + }, + { + "epoch": 2.1967339358111935, + "grad_norm": 0.9534103274345398, + "learning_rate": 5.354920558206972e-05, + "loss": 2.6355, + "step": 24247 + }, + { + "epoch": 2.19682453398564, + "grad_norm": 1.033407211303711, + "learning_rate": 5.354316438107896e-05, + "loss": 2.8721, + "step": 24248 + }, + { + "epoch": 2.196915132160087, + "grad_norm": 0.9606199264526367, + "learning_rate": 5.35371231800882e-05, + "loss": 2.6145, + "step": 24249 + }, + { + "epoch": 2.1970057303345336, + "grad_norm": 1.0373961925506592, + "learning_rate": 5.353108197909744e-05, + "loss": 2.5414, + "step": 24250 + }, + { + "epoch": 2.1970963285089806, + "grad_norm": 0.9895697236061096, + "learning_rate": 5.3525040778106694e-05, + "loss": 2.5991, + "step": 24251 + }, + { + "epoch": 2.197186926683427, + "grad_norm": 1.0120770931243896, + "learning_rate": 5.3518999577115935e-05, + "loss": 2.7122, + "step": 24252 + }, + { + "epoch": 2.197277524857874, + "grad_norm": 0.9951782822608948, + "learning_rate": 5.3512958376125176e-05, + "loss": 2.8965, + "step": 24253 + }, + { + "epoch": 2.1973681230323208, + "grad_norm": 1.0079383850097656, + "learning_rate": 5.350691717513442e-05, + "loss": 2.2536, + "step": 24254 + }, + { + "epoch": 2.1974587212067678, + "grad_norm": 1.065063238143921, + "learning_rate": 5.350087597414366e-05, + "loss": 2.8086, + "step": 24255 + }, + { + "epoch": 2.1975493193812143, + "grad_norm": 1.0347553491592407, + "learning_rate": 5.3494834773152905e-05, + "loss": 2.4475, + "step": 24256 + }, + { + "epoch": 2.1976399175556613, + "grad_norm": 1.083061695098877, + "learning_rate": 5.3488793572162146e-05, + "loss": 2.6161, + "step": 24257 + }, + { + "epoch": 2.197730515730108, + "grad_norm": 0.8433712124824524, + "learning_rate": 5.3482752371171394e-05, + "loss": 1.9738, + "step": 24258 + }, + { + "epoch": 2.197821113904555, + "grad_norm": 0.9894688129425049, + "learning_rate": 5.3476711170180634e-05, + "loss": 2.5202, + "step": 24259 + }, + { + "epoch": 2.1979117120790015, + "grad_norm": 1.0600578784942627, + "learning_rate": 5.347066996918988e-05, + "loss": 2.6334, + "step": 24260 + }, + { + "epoch": 2.1980023102534485, + "grad_norm": 0.9993512630462646, + "learning_rate": 5.346462876819912e-05, + "loss": 2.8967, + "step": 24261 + }, + { + "epoch": 2.198092908427895, + "grad_norm": 1.0632953643798828, + "learning_rate": 5.3458587567208364e-05, + "loss": 2.5614, + "step": 24262 + }, + { + "epoch": 2.198183506602342, + "grad_norm": 1.0649155378341675, + "learning_rate": 5.3452546366217605e-05, + "loss": 2.7177, + "step": 24263 + }, + { + "epoch": 2.1982741047767886, + "grad_norm": 1.0142189264297485, + "learning_rate": 5.3446505165226845e-05, + "loss": 2.5593, + "step": 24264 + }, + { + "epoch": 2.1983647029512356, + "grad_norm": 0.9811755418777466, + "learning_rate": 5.34404639642361e-05, + "loss": 2.6899, + "step": 24265 + }, + { + "epoch": 2.198455301125682, + "grad_norm": 0.835540235042572, + "learning_rate": 5.343442276324534e-05, + "loss": 1.8952, + "step": 24266 + }, + { + "epoch": 2.1985458993001292, + "grad_norm": 1.1465755701065063, + "learning_rate": 5.342838156225458e-05, + "loss": 2.6368, + "step": 24267 + }, + { + "epoch": 2.198636497474576, + "grad_norm": 0.872334897518158, + "learning_rate": 5.342234036126382e-05, + "loss": 1.8455, + "step": 24268 + }, + { + "epoch": 2.198727095649023, + "grad_norm": 0.9990634322166443, + "learning_rate": 5.341629916027306e-05, + "loss": 2.6821, + "step": 24269 + }, + { + "epoch": 2.1988176938234694, + "grad_norm": 1.0115070343017578, + "learning_rate": 5.3410257959282304e-05, + "loss": 2.6372, + "step": 24270 + }, + { + "epoch": 2.1989082919979164, + "grad_norm": 1.0852549076080322, + "learning_rate": 5.3404216758291545e-05, + "loss": 2.3345, + "step": 24271 + }, + { + "epoch": 2.198998890172363, + "grad_norm": 1.032594084739685, + "learning_rate": 5.339817555730079e-05, + "loss": 2.5324, + "step": 24272 + }, + { + "epoch": 2.19908948834681, + "grad_norm": 0.8440233469009399, + "learning_rate": 5.339213435631004e-05, + "loss": 1.8686, + "step": 24273 + }, + { + "epoch": 2.1991800865212565, + "grad_norm": 0.9544344544410706, + "learning_rate": 5.338609315531928e-05, + "loss": 2.7214, + "step": 24274 + }, + { + "epoch": 2.1992706846957035, + "grad_norm": 1.137199878692627, + "learning_rate": 5.338005195432852e-05, + "loss": 2.4629, + "step": 24275 + }, + { + "epoch": 2.19936128287015, + "grad_norm": 1.00774085521698, + "learning_rate": 5.337401075333777e-05, + "loss": 2.7675, + "step": 24276 + }, + { + "epoch": 2.199451881044597, + "grad_norm": 0.9600167274475098, + "learning_rate": 5.336796955234701e-05, + "loss": 2.6067, + "step": 24277 + }, + { + "epoch": 2.1995424792190437, + "grad_norm": 0.9582087397575378, + "learning_rate": 5.336192835135625e-05, + "loss": 2.419, + "step": 24278 + }, + { + "epoch": 2.1996330773934907, + "grad_norm": 0.9883240461349487, + "learning_rate": 5.335588715036549e-05, + "loss": 2.4337, + "step": 24279 + }, + { + "epoch": 2.1997236755679372, + "grad_norm": 0.9908019304275513, + "learning_rate": 5.3349845949374746e-05, + "loss": 2.7294, + "step": 24280 + }, + { + "epoch": 2.1998142737423843, + "grad_norm": 0.9505941867828369, + "learning_rate": 5.334380474838399e-05, + "loss": 2.5629, + "step": 24281 + }, + { + "epoch": 2.199904871916831, + "grad_norm": 1.0071693658828735, + "learning_rate": 5.333776354739323e-05, + "loss": 2.1348, + "step": 24282 + }, + { + "epoch": 2.199995470091278, + "grad_norm": 0.9296957850456238, + "learning_rate": 5.333172234640247e-05, + "loss": 2.9492, + "step": 24283 + }, + { + "epoch": 2.2000860682657244, + "grad_norm": 0.8199889659881592, + "learning_rate": 5.332568114541171e-05, + "loss": 2.0274, + "step": 24284 + }, + { + "epoch": 2.2001766664401714, + "grad_norm": 0.956015944480896, + "learning_rate": 5.331963994442095e-05, + "loss": 2.6125, + "step": 24285 + }, + { + "epoch": 2.200267264614618, + "grad_norm": 1.2032049894332886, + "learning_rate": 5.331359874343019e-05, + "loss": 2.3036, + "step": 24286 + }, + { + "epoch": 2.200357862789065, + "grad_norm": 0.9747666716575623, + "learning_rate": 5.330755754243943e-05, + "loss": 2.4757, + "step": 24287 + }, + { + "epoch": 2.2004484609635115, + "grad_norm": 1.0168979167938232, + "learning_rate": 5.3301516341448686e-05, + "loss": 2.588, + "step": 24288 + }, + { + "epoch": 2.2005390591379586, + "grad_norm": 0.974463164806366, + "learning_rate": 5.329547514045793e-05, + "loss": 2.4021, + "step": 24289 + }, + { + "epoch": 2.200629657312405, + "grad_norm": 0.9791430234909058, + "learning_rate": 5.328943393946717e-05, + "loss": 2.6108, + "step": 24290 + }, + { + "epoch": 2.200720255486852, + "grad_norm": 0.9588944315910339, + "learning_rate": 5.328339273847641e-05, + "loss": 2.873, + "step": 24291 + }, + { + "epoch": 2.2008108536612987, + "grad_norm": 0.9817695617675781, + "learning_rate": 5.3277351537485656e-05, + "loss": 2.7658, + "step": 24292 + }, + { + "epoch": 2.2009014518357457, + "grad_norm": 0.9227092266082764, + "learning_rate": 5.32713103364949e-05, + "loss": 2.3502, + "step": 24293 + }, + { + "epoch": 2.2009920500101923, + "grad_norm": 1.0501344203948975, + "learning_rate": 5.326526913550414e-05, + "loss": 2.6141, + "step": 24294 + }, + { + "epoch": 2.2010826481846393, + "grad_norm": 0.9629533886909485, + "learning_rate": 5.325922793451339e-05, + "loss": 2.7086, + "step": 24295 + }, + { + "epoch": 2.201173246359086, + "grad_norm": 1.077872633934021, + "learning_rate": 5.325318673352263e-05, + "loss": 2.863, + "step": 24296 + }, + { + "epoch": 2.201263844533533, + "grad_norm": 0.9566937685012817, + "learning_rate": 5.3247145532531874e-05, + "loss": 2.4455, + "step": 24297 + }, + { + "epoch": 2.2013544427079794, + "grad_norm": 1.0351898670196533, + "learning_rate": 5.3241104331541115e-05, + "loss": 1.9743, + "step": 24298 + }, + { + "epoch": 2.201445040882426, + "grad_norm": 0.861556887626648, + "learning_rate": 5.3235063130550356e-05, + "loss": 1.8524, + "step": 24299 + }, + { + "epoch": 2.201535639056873, + "grad_norm": 1.0163475275039673, + "learning_rate": 5.3229021929559597e-05, + "loss": 2.707, + "step": 24300 + }, + { + "epoch": 2.20162623723132, + "grad_norm": 0.9714913964271545, + "learning_rate": 5.322298072856884e-05, + "loss": 2.6381, + "step": 24301 + }, + { + "epoch": 2.2017168354057666, + "grad_norm": 0.9072740077972412, + "learning_rate": 5.321693952757808e-05, + "loss": 2.6177, + "step": 24302 + }, + { + "epoch": 2.201807433580213, + "grad_norm": 0.9926596879959106, + "learning_rate": 5.321089832658733e-05, + "loss": 2.344, + "step": 24303 + }, + { + "epoch": 2.20189803175466, + "grad_norm": 1.0039806365966797, + "learning_rate": 5.3204857125596573e-05, + "loss": 2.7181, + "step": 24304 + }, + { + "epoch": 2.201988629929107, + "grad_norm": 0.9502020478248596, + "learning_rate": 5.3198815924605814e-05, + "loss": 2.4816, + "step": 24305 + }, + { + "epoch": 2.2020792281035537, + "grad_norm": 0.7969931960105896, + "learning_rate": 5.3192774723615055e-05, + "loss": 1.8867, + "step": 24306 + }, + { + "epoch": 2.2021698262780003, + "grad_norm": 1.1089656352996826, + "learning_rate": 5.3186733522624296e-05, + "loss": 2.3297, + "step": 24307 + }, + { + "epoch": 2.2022604244524473, + "grad_norm": 1.0928438901901245, + "learning_rate": 5.3180692321633544e-05, + "loss": 2.9112, + "step": 24308 + }, + { + "epoch": 2.202351022626894, + "grad_norm": 1.0089250802993774, + "learning_rate": 5.3174651120642784e-05, + "loss": 3.1354, + "step": 24309 + }, + { + "epoch": 2.202441620801341, + "grad_norm": 0.8071439862251282, + "learning_rate": 5.316860991965203e-05, + "loss": 1.7912, + "step": 24310 + }, + { + "epoch": 2.2025322189757874, + "grad_norm": 1.037025809288025, + "learning_rate": 5.316256871866128e-05, + "loss": 2.7268, + "step": 24311 + }, + { + "epoch": 2.2026228171502344, + "grad_norm": 1.087235450744629, + "learning_rate": 5.315652751767052e-05, + "loss": 2.9507, + "step": 24312 + }, + { + "epoch": 2.202713415324681, + "grad_norm": 0.9681382775306702, + "learning_rate": 5.315048631667976e-05, + "loss": 2.5777, + "step": 24313 + }, + { + "epoch": 2.202804013499128, + "grad_norm": 1.0024417638778687, + "learning_rate": 5.3144445115689e-05, + "loss": 2.816, + "step": 24314 + }, + { + "epoch": 2.2028946116735746, + "grad_norm": 0.9603287577629089, + "learning_rate": 5.313840391469824e-05, + "loss": 2.6086, + "step": 24315 + }, + { + "epoch": 2.2029852098480216, + "grad_norm": 1.0798006057739258, + "learning_rate": 5.3132362713707484e-05, + "loss": 2.6474, + "step": 24316 + }, + { + "epoch": 2.203075808022468, + "grad_norm": 0.9623766541481018, + "learning_rate": 5.3126321512716725e-05, + "loss": 2.5894, + "step": 24317 + }, + { + "epoch": 2.203166406196915, + "grad_norm": 1.0221093893051147, + "learning_rate": 5.312028031172598e-05, + "loss": 2.7444, + "step": 24318 + }, + { + "epoch": 2.2032570043713617, + "grad_norm": 1.0629571676254272, + "learning_rate": 5.311423911073522e-05, + "loss": 2.7341, + "step": 24319 + }, + { + "epoch": 2.2033476025458087, + "grad_norm": 1.0012166500091553, + "learning_rate": 5.310819790974446e-05, + "loss": 2.4291, + "step": 24320 + }, + { + "epoch": 2.2034382007202553, + "grad_norm": 1.0939624309539795, + "learning_rate": 5.31021567087537e-05, + "loss": 2.5871, + "step": 24321 + }, + { + "epoch": 2.2035287988947023, + "grad_norm": 1.0101488828659058, + "learning_rate": 5.309611550776294e-05, + "loss": 2.659, + "step": 24322 + }, + { + "epoch": 2.203619397069149, + "grad_norm": 0.9910045266151428, + "learning_rate": 5.309007430677218e-05, + "loss": 2.6533, + "step": 24323 + }, + { + "epoch": 2.203709995243596, + "grad_norm": 1.0754501819610596, + "learning_rate": 5.308403310578143e-05, + "loss": 2.4965, + "step": 24324 + }, + { + "epoch": 2.2038005934180425, + "grad_norm": 0.8421215415000916, + "learning_rate": 5.307799190479068e-05, + "loss": 2.1508, + "step": 24325 + }, + { + "epoch": 2.2038911915924895, + "grad_norm": 1.0249525308609009, + "learning_rate": 5.307195070379992e-05, + "loss": 2.517, + "step": 24326 + }, + { + "epoch": 2.203981789766936, + "grad_norm": 1.0935100317001343, + "learning_rate": 5.306590950280917e-05, + "loss": 2.7219, + "step": 24327 + }, + { + "epoch": 2.204072387941383, + "grad_norm": 0.8349199891090393, + "learning_rate": 5.305986830181841e-05, + "loss": 2.0121, + "step": 24328 + }, + { + "epoch": 2.2041629861158296, + "grad_norm": 1.089910626411438, + "learning_rate": 5.305382710082765e-05, + "loss": 2.2222, + "step": 24329 + }, + { + "epoch": 2.2042535842902766, + "grad_norm": 0.9619470238685608, + "learning_rate": 5.304778589983689e-05, + "loss": 2.5702, + "step": 24330 + }, + { + "epoch": 2.204344182464723, + "grad_norm": 1.028868556022644, + "learning_rate": 5.304174469884613e-05, + "loss": 2.6963, + "step": 24331 + }, + { + "epoch": 2.20443478063917, + "grad_norm": 0.9910551309585571, + "learning_rate": 5.303570349785537e-05, + "loss": 2.7349, + "step": 24332 + }, + { + "epoch": 2.2045253788136168, + "grad_norm": 0.9440744519233704, + "learning_rate": 5.3029662296864625e-05, + "loss": 2.5849, + "step": 24333 + }, + { + "epoch": 2.2046159769880638, + "grad_norm": 1.0258433818817139, + "learning_rate": 5.3023621095873866e-05, + "loss": 2.4521, + "step": 24334 + }, + { + "epoch": 2.2047065751625103, + "grad_norm": 0.9502691030502319, + "learning_rate": 5.301757989488311e-05, + "loss": 2.3694, + "step": 24335 + }, + { + "epoch": 2.2047971733369573, + "grad_norm": 0.880666196346283, + "learning_rate": 5.301153869389235e-05, + "loss": 1.9133, + "step": 24336 + }, + { + "epoch": 2.204887771511404, + "grad_norm": 0.9972802996635437, + "learning_rate": 5.300549749290159e-05, + "loss": 2.7119, + "step": 24337 + }, + { + "epoch": 2.204978369685851, + "grad_norm": 1.020744800567627, + "learning_rate": 5.299945629191083e-05, + "loss": 2.7356, + "step": 24338 + }, + { + "epoch": 2.2050689678602975, + "grad_norm": 1.0071715116500854, + "learning_rate": 5.299341509092007e-05, + "loss": 2.5082, + "step": 24339 + }, + { + "epoch": 2.2051595660347445, + "grad_norm": 1.0238723754882812, + "learning_rate": 5.2987373889929325e-05, + "loss": 2.6485, + "step": 24340 + }, + { + "epoch": 2.205250164209191, + "grad_norm": 0.9556558132171631, + "learning_rate": 5.2981332688938565e-05, + "loss": 2.5414, + "step": 24341 + }, + { + "epoch": 2.205340762383638, + "grad_norm": 1.084288239479065, + "learning_rate": 5.2975291487947806e-05, + "loss": 2.537, + "step": 24342 + }, + { + "epoch": 2.2054313605580846, + "grad_norm": 1.102156400680542, + "learning_rate": 5.2969250286957054e-05, + "loss": 2.5244, + "step": 24343 + }, + { + "epoch": 2.2055219587325317, + "grad_norm": 0.978866457939148, + "learning_rate": 5.2963209085966295e-05, + "loss": 2.7555, + "step": 24344 + }, + { + "epoch": 2.205612556906978, + "grad_norm": 0.8936893939971924, + "learning_rate": 5.2957167884975536e-05, + "loss": 2.0032, + "step": 24345 + }, + { + "epoch": 2.2057031550814252, + "grad_norm": 0.9717504978179932, + "learning_rate": 5.2951126683984776e-05, + "loss": 2.3656, + "step": 24346 + }, + { + "epoch": 2.205793753255872, + "grad_norm": 0.933247447013855, + "learning_rate": 5.294508548299402e-05, + "loss": 2.3671, + "step": 24347 + }, + { + "epoch": 2.205884351430319, + "grad_norm": 0.9617614150047302, + "learning_rate": 5.293904428200327e-05, + "loss": 2.5565, + "step": 24348 + }, + { + "epoch": 2.2059749496047654, + "grad_norm": 0.99611496925354, + "learning_rate": 5.293300308101251e-05, + "loss": 2.5942, + "step": 24349 + }, + { + "epoch": 2.2060655477792124, + "grad_norm": 1.0721548795700073, + "learning_rate": 5.292696188002175e-05, + "loss": 2.4641, + "step": 24350 + }, + { + "epoch": 2.206156145953659, + "grad_norm": 1.1585406064987183, + "learning_rate": 5.2920920679030994e-05, + "loss": 2.545, + "step": 24351 + }, + { + "epoch": 2.206246744128106, + "grad_norm": 0.9769628047943115, + "learning_rate": 5.2914879478040235e-05, + "loss": 2.541, + "step": 24352 + }, + { + "epoch": 2.2063373423025525, + "grad_norm": 0.9617420434951782, + "learning_rate": 5.2908838277049476e-05, + "loss": 2.7115, + "step": 24353 + }, + { + "epoch": 2.2064279404769995, + "grad_norm": 0.8613858222961426, + "learning_rate": 5.2902797076058717e-05, + "loss": 1.9077, + "step": 24354 + }, + { + "epoch": 2.206518538651446, + "grad_norm": 1.1057809591293335, + "learning_rate": 5.289675587506797e-05, + "loss": 2.7181, + "step": 24355 + }, + { + "epoch": 2.206609136825893, + "grad_norm": 0.9638598561286926, + "learning_rate": 5.289071467407721e-05, + "loss": 2.6752, + "step": 24356 + }, + { + "epoch": 2.2066997350003397, + "grad_norm": 1.0672239065170288, + "learning_rate": 5.288467347308645e-05, + "loss": 1.9954, + "step": 24357 + }, + { + "epoch": 2.2067903331747867, + "grad_norm": 0.9553927183151245, + "learning_rate": 5.2878632272095693e-05, + "loss": 2.3716, + "step": 24358 + }, + { + "epoch": 2.2068809313492332, + "grad_norm": 1.0312241315841675, + "learning_rate": 5.287259107110494e-05, + "loss": 2.6154, + "step": 24359 + }, + { + "epoch": 2.2069715295236803, + "grad_norm": 1.032446265220642, + "learning_rate": 5.286654987011418e-05, + "loss": 2.6706, + "step": 24360 + }, + { + "epoch": 2.207062127698127, + "grad_norm": 0.8486997485160828, + "learning_rate": 5.286050866912342e-05, + "loss": 1.8221, + "step": 24361 + }, + { + "epoch": 2.207152725872574, + "grad_norm": 0.9972816109657288, + "learning_rate": 5.2854467468132664e-05, + "loss": 2.7643, + "step": 24362 + }, + { + "epoch": 2.2072433240470204, + "grad_norm": 1.017704963684082, + "learning_rate": 5.284842626714192e-05, + "loss": 2.7581, + "step": 24363 + }, + { + "epoch": 2.2073339222214674, + "grad_norm": 0.9649245738983154, + "learning_rate": 5.284238506615116e-05, + "loss": 2.5338, + "step": 24364 + }, + { + "epoch": 2.207424520395914, + "grad_norm": 1.004721999168396, + "learning_rate": 5.28363438651604e-05, + "loss": 2.4883, + "step": 24365 + }, + { + "epoch": 2.207515118570361, + "grad_norm": 0.9909772872924805, + "learning_rate": 5.283030266416964e-05, + "loss": 2.6954, + "step": 24366 + }, + { + "epoch": 2.2076057167448075, + "grad_norm": 0.9836070537567139, + "learning_rate": 5.282426146317888e-05, + "loss": 2.7627, + "step": 24367 + }, + { + "epoch": 2.2076963149192546, + "grad_norm": 1.000473141670227, + "learning_rate": 5.281822026218812e-05, + "loss": 2.6426, + "step": 24368 + }, + { + "epoch": 2.207786913093701, + "grad_norm": 0.8782273530960083, + "learning_rate": 5.281217906119736e-05, + "loss": 2.1373, + "step": 24369 + }, + { + "epoch": 2.207877511268148, + "grad_norm": 0.8667532801628113, + "learning_rate": 5.280613786020662e-05, + "loss": 2.1618, + "step": 24370 + }, + { + "epoch": 2.2079681094425947, + "grad_norm": 0.9906131625175476, + "learning_rate": 5.280009665921586e-05, + "loss": 2.9223, + "step": 24371 + }, + { + "epoch": 2.2080587076170417, + "grad_norm": 0.9772157073020935, + "learning_rate": 5.27940554582251e-05, + "loss": 1.8739, + "step": 24372 + }, + { + "epoch": 2.2081493057914883, + "grad_norm": 1.0193796157836914, + "learning_rate": 5.278801425723434e-05, + "loss": 2.726, + "step": 24373 + }, + { + "epoch": 2.2082399039659353, + "grad_norm": 0.9701005816459656, + "learning_rate": 5.278197305624358e-05, + "loss": 2.5106, + "step": 24374 + }, + { + "epoch": 2.208330502140382, + "grad_norm": 0.9931989908218384, + "learning_rate": 5.277593185525282e-05, + "loss": 2.4979, + "step": 24375 + }, + { + "epoch": 2.208421100314829, + "grad_norm": 0.9931032657623291, + "learning_rate": 5.276989065426207e-05, + "loss": 2.7879, + "step": 24376 + }, + { + "epoch": 2.2085116984892754, + "grad_norm": 1.166155219078064, + "learning_rate": 5.276384945327131e-05, + "loss": 2.1943, + "step": 24377 + }, + { + "epoch": 2.2086022966637224, + "grad_norm": 0.9974825978279114, + "learning_rate": 5.275780825228056e-05, + "loss": 2.8583, + "step": 24378 + }, + { + "epoch": 2.208692894838169, + "grad_norm": 1.0351824760437012, + "learning_rate": 5.2751767051289805e-05, + "loss": 2.8365, + "step": 24379 + }, + { + "epoch": 2.208783493012616, + "grad_norm": 1.0045675039291382, + "learning_rate": 5.2745725850299046e-05, + "loss": 2.4387, + "step": 24380 + }, + { + "epoch": 2.2088740911870626, + "grad_norm": 1.0199178457260132, + "learning_rate": 5.273968464930829e-05, + "loss": 2.7151, + "step": 24381 + }, + { + "epoch": 2.208964689361509, + "grad_norm": 1.0098156929016113, + "learning_rate": 5.273364344831753e-05, + "loss": 2.477, + "step": 24382 + }, + { + "epoch": 2.209055287535956, + "grad_norm": 1.0221847295761108, + "learning_rate": 5.272760224732677e-05, + "loss": 2.7547, + "step": 24383 + }, + { + "epoch": 2.209145885710403, + "grad_norm": 0.9737768173217773, + "learning_rate": 5.272156104633601e-05, + "loss": 2.5282, + "step": 24384 + }, + { + "epoch": 2.2092364838848497, + "grad_norm": 0.934751033782959, + "learning_rate": 5.2715519845345264e-05, + "loss": 1.8848, + "step": 24385 + }, + { + "epoch": 2.2093270820592963, + "grad_norm": 0.9852161407470703, + "learning_rate": 5.2709478644354504e-05, + "loss": 2.5128, + "step": 24386 + }, + { + "epoch": 2.2094176802337433, + "grad_norm": 0.994278073310852, + "learning_rate": 5.2703437443363745e-05, + "loss": 2.6029, + "step": 24387 + }, + { + "epoch": 2.2095082784081903, + "grad_norm": 0.9759238362312317, + "learning_rate": 5.2697396242372986e-05, + "loss": 2.7329, + "step": 24388 + }, + { + "epoch": 2.209598876582637, + "grad_norm": 1.0026969909667969, + "learning_rate": 5.269135504138223e-05, + "loss": 2.5629, + "step": 24389 + }, + { + "epoch": 2.2096894747570834, + "grad_norm": 0.945396900177002, + "learning_rate": 5.268531384039147e-05, + "loss": 2.1917, + "step": 24390 + }, + { + "epoch": 2.2097800729315304, + "grad_norm": 0.9907640218734741, + "learning_rate": 5.267927263940071e-05, + "loss": 2.4563, + "step": 24391 + }, + { + "epoch": 2.209870671105977, + "grad_norm": 1.000076413154602, + "learning_rate": 5.2673231438409956e-05, + "loss": 2.7396, + "step": 24392 + }, + { + "epoch": 2.209961269280424, + "grad_norm": 0.966791570186615, + "learning_rate": 5.2667190237419204e-05, + "loss": 2.5203, + "step": 24393 + }, + { + "epoch": 2.2100518674548706, + "grad_norm": 1.1226370334625244, + "learning_rate": 5.2661149036428445e-05, + "loss": 2.3868, + "step": 24394 + }, + { + "epoch": 2.2101424656293176, + "grad_norm": 1.069503903388977, + "learning_rate": 5.265510783543769e-05, + "loss": 2.6706, + "step": 24395 + }, + { + "epoch": 2.210233063803764, + "grad_norm": 0.9630197882652283, + "learning_rate": 5.264906663444693e-05, + "loss": 2.7327, + "step": 24396 + }, + { + "epoch": 2.210323661978211, + "grad_norm": 0.973849892616272, + "learning_rate": 5.2643025433456174e-05, + "loss": 2.7361, + "step": 24397 + }, + { + "epoch": 2.2104142601526577, + "grad_norm": 1.0595775842666626, + "learning_rate": 5.2636984232465415e-05, + "loss": 2.6651, + "step": 24398 + }, + { + "epoch": 2.2105048583271047, + "grad_norm": 0.952263593673706, + "learning_rate": 5.2630943031474656e-05, + "loss": 2.7864, + "step": 24399 + }, + { + "epoch": 2.2105954565015513, + "grad_norm": 0.9660975337028503, + "learning_rate": 5.262490183048391e-05, + "loss": 2.7128, + "step": 24400 + }, + { + "epoch": 2.2106860546759983, + "grad_norm": 0.9702099561691284, + "learning_rate": 5.261886062949315e-05, + "loss": 2.6947, + "step": 24401 + }, + { + "epoch": 2.210776652850445, + "grad_norm": 1.0713943243026733, + "learning_rate": 5.261281942850239e-05, + "loss": 2.8167, + "step": 24402 + }, + { + "epoch": 2.210867251024892, + "grad_norm": 1.0379263162612915, + "learning_rate": 5.260677822751163e-05, + "loss": 2.6324, + "step": 24403 + }, + { + "epoch": 2.2109578491993385, + "grad_norm": 1.0076218843460083, + "learning_rate": 5.260073702652087e-05, + "loss": 2.565, + "step": 24404 + }, + { + "epoch": 2.2110484473737855, + "grad_norm": 1.0028455257415771, + "learning_rate": 5.2594695825530114e-05, + "loss": 2.8154, + "step": 24405 + }, + { + "epoch": 2.211139045548232, + "grad_norm": 1.0636472702026367, + "learning_rate": 5.2588654624539355e-05, + "loss": 2.3965, + "step": 24406 + }, + { + "epoch": 2.211229643722679, + "grad_norm": 1.030558466911316, + "learning_rate": 5.2582613423548596e-05, + "loss": 2.6447, + "step": 24407 + }, + { + "epoch": 2.2113202418971256, + "grad_norm": 0.995564877986908, + "learning_rate": 5.257657222255785e-05, + "loss": 2.731, + "step": 24408 + }, + { + "epoch": 2.2114108400715726, + "grad_norm": 0.9906464219093323, + "learning_rate": 5.257053102156709e-05, + "loss": 2.7119, + "step": 24409 + }, + { + "epoch": 2.211501438246019, + "grad_norm": 1.0395385026931763, + "learning_rate": 5.256448982057633e-05, + "loss": 2.6512, + "step": 24410 + }, + { + "epoch": 2.211592036420466, + "grad_norm": 1.0992138385772705, + "learning_rate": 5.255844861958558e-05, + "loss": 2.6265, + "step": 24411 + }, + { + "epoch": 2.2116826345949128, + "grad_norm": 1.0254510641098022, + "learning_rate": 5.255240741859482e-05, + "loss": 2.5324, + "step": 24412 + }, + { + "epoch": 2.2117732327693598, + "grad_norm": 0.9606077671051025, + "learning_rate": 5.254636621760406e-05, + "loss": 2.5451, + "step": 24413 + }, + { + "epoch": 2.2118638309438063, + "grad_norm": 1.0434409379959106, + "learning_rate": 5.25403250166133e-05, + "loss": 2.806, + "step": 24414 + }, + { + "epoch": 2.2119544291182534, + "grad_norm": 0.827897310256958, + "learning_rate": 5.2534283815622556e-05, + "loss": 1.9575, + "step": 24415 + }, + { + "epoch": 2.2120450272927, + "grad_norm": 1.0137192010879517, + "learning_rate": 5.25282426146318e-05, + "loss": 2.576, + "step": 24416 + }, + { + "epoch": 2.212135625467147, + "grad_norm": 0.9622340798377991, + "learning_rate": 5.252220141364104e-05, + "loss": 2.6546, + "step": 24417 + }, + { + "epoch": 2.2122262236415935, + "grad_norm": 1.1044590473175049, + "learning_rate": 5.251616021265028e-05, + "loss": 2.3679, + "step": 24418 + }, + { + "epoch": 2.2123168218160405, + "grad_norm": 1.0730617046356201, + "learning_rate": 5.251011901165952e-05, + "loss": 2.9324, + "step": 24419 + }, + { + "epoch": 2.212407419990487, + "grad_norm": 0.9506444334983826, + "learning_rate": 5.250407781066876e-05, + "loss": 2.4056, + "step": 24420 + }, + { + "epoch": 2.212498018164934, + "grad_norm": 1.0835901498794556, + "learning_rate": 5.2498036609678e-05, + "loss": 2.8373, + "step": 24421 + }, + { + "epoch": 2.2125886163393806, + "grad_norm": 1.1419799327850342, + "learning_rate": 5.249199540868724e-05, + "loss": 2.3892, + "step": 24422 + }, + { + "epoch": 2.2126792145138277, + "grad_norm": 0.9273302555084229, + "learning_rate": 5.2485954207696496e-05, + "loss": 2.4212, + "step": 24423 + }, + { + "epoch": 2.212769812688274, + "grad_norm": 1.0094568729400635, + "learning_rate": 5.247991300670574e-05, + "loss": 2.7862, + "step": 24424 + }, + { + "epoch": 2.2128604108627212, + "grad_norm": 1.1746320724487305, + "learning_rate": 5.247387180571498e-05, + "loss": 2.5782, + "step": 24425 + }, + { + "epoch": 2.212951009037168, + "grad_norm": 0.8357889652252197, + "learning_rate": 5.246783060472422e-05, + "loss": 1.8861, + "step": 24426 + }, + { + "epoch": 2.213041607211615, + "grad_norm": 0.8687006235122681, + "learning_rate": 5.2461789403733467e-05, + "loss": 2.1835, + "step": 24427 + }, + { + "epoch": 2.2131322053860614, + "grad_norm": 0.9738554954528809, + "learning_rate": 5.245574820274271e-05, + "loss": 2.8163, + "step": 24428 + }, + { + "epoch": 2.2132228035605084, + "grad_norm": 1.0959208011627197, + "learning_rate": 5.244970700175195e-05, + "loss": 2.5658, + "step": 24429 + }, + { + "epoch": 2.213313401734955, + "grad_norm": 1.0291858911514282, + "learning_rate": 5.24436658007612e-05, + "loss": 2.633, + "step": 24430 + }, + { + "epoch": 2.213403999909402, + "grad_norm": 0.8629577159881592, + "learning_rate": 5.243762459977044e-05, + "loss": 1.8868, + "step": 24431 + }, + { + "epoch": 2.2134945980838485, + "grad_norm": 0.9576844573020935, + "learning_rate": 5.2431583398779684e-05, + "loss": 2.6207, + "step": 24432 + }, + { + "epoch": 2.2135851962582955, + "grad_norm": 0.9788950085639954, + "learning_rate": 5.2425542197788925e-05, + "loss": 2.3701, + "step": 24433 + }, + { + "epoch": 2.213675794432742, + "grad_norm": 1.063433051109314, + "learning_rate": 5.2419500996798166e-05, + "loss": 2.3648, + "step": 24434 + }, + { + "epoch": 2.213766392607189, + "grad_norm": 0.9817669987678528, + "learning_rate": 5.241345979580741e-05, + "loss": 2.474, + "step": 24435 + }, + { + "epoch": 2.2138569907816357, + "grad_norm": 0.950819730758667, + "learning_rate": 5.240741859481665e-05, + "loss": 2.5349, + "step": 24436 + }, + { + "epoch": 2.2139475889560827, + "grad_norm": 1.0731732845306396, + "learning_rate": 5.240137739382589e-05, + "loss": 2.5781, + "step": 24437 + }, + { + "epoch": 2.2140381871305292, + "grad_norm": 0.8180130124092102, + "learning_rate": 5.239533619283514e-05, + "loss": 1.9192, + "step": 24438 + }, + { + "epoch": 2.2141287853049763, + "grad_norm": 0.9686796069145203, + "learning_rate": 5.2389294991844384e-05, + "loss": 2.6855, + "step": 24439 + }, + { + "epoch": 2.214219383479423, + "grad_norm": 1.0384249687194824, + "learning_rate": 5.2383253790853624e-05, + "loss": 2.7665, + "step": 24440 + }, + { + "epoch": 2.21430998165387, + "grad_norm": 1.055159330368042, + "learning_rate": 5.2377212589862865e-05, + "loss": 2.5106, + "step": 24441 + }, + { + "epoch": 2.2144005798283164, + "grad_norm": 1.1526442766189575, + "learning_rate": 5.2371171388872106e-05, + "loss": 2.5891, + "step": 24442 + }, + { + "epoch": 2.2144911780027634, + "grad_norm": 1.0782804489135742, + "learning_rate": 5.2365130187881354e-05, + "loss": 2.528, + "step": 24443 + }, + { + "epoch": 2.21458177617721, + "grad_norm": 0.961060106754303, + "learning_rate": 5.2359088986890595e-05, + "loss": 2.6368, + "step": 24444 + }, + { + "epoch": 2.214672374351657, + "grad_norm": 1.0428152084350586, + "learning_rate": 5.235304778589984e-05, + "loss": 2.5213, + "step": 24445 + }, + { + "epoch": 2.2147629725261035, + "grad_norm": 1.0255911350250244, + "learning_rate": 5.234700658490908e-05, + "loss": 2.3505, + "step": 24446 + }, + { + "epoch": 2.2148535707005506, + "grad_norm": 1.048166275024414, + "learning_rate": 5.234096538391833e-05, + "loss": 2.6952, + "step": 24447 + }, + { + "epoch": 2.214944168874997, + "grad_norm": 0.9379188418388367, + "learning_rate": 5.233492418292757e-05, + "loss": 2.4411, + "step": 24448 + }, + { + "epoch": 2.215034767049444, + "grad_norm": 1.068990707397461, + "learning_rate": 5.232888298193681e-05, + "loss": 2.747, + "step": 24449 + }, + { + "epoch": 2.2151253652238907, + "grad_norm": 0.8545829653739929, + "learning_rate": 5.232284178094605e-05, + "loss": 2.0076, + "step": 24450 + }, + { + "epoch": 2.2152159633983377, + "grad_norm": 1.0565859079360962, + "learning_rate": 5.2316800579955294e-05, + "loss": 2.765, + "step": 24451 + }, + { + "epoch": 2.2153065615727843, + "grad_norm": 0.9693272709846497, + "learning_rate": 5.2310759378964535e-05, + "loss": 2.6412, + "step": 24452 + }, + { + "epoch": 2.2153971597472313, + "grad_norm": 1.032715082168579, + "learning_rate": 5.230471817797379e-05, + "loss": 2.4788, + "step": 24453 + }, + { + "epoch": 2.215487757921678, + "grad_norm": 0.9867928624153137, + "learning_rate": 5.229867697698303e-05, + "loss": 2.7686, + "step": 24454 + }, + { + "epoch": 2.215578356096125, + "grad_norm": 1.0106340646743774, + "learning_rate": 5.229263577599227e-05, + "loss": 2.4808, + "step": 24455 + }, + { + "epoch": 2.2156689542705714, + "grad_norm": 0.9613744020462036, + "learning_rate": 5.228659457500151e-05, + "loss": 2.4583, + "step": 24456 + }, + { + "epoch": 2.2157595524450184, + "grad_norm": 1.0053083896636963, + "learning_rate": 5.228055337401075e-05, + "loss": 2.5165, + "step": 24457 + }, + { + "epoch": 2.215850150619465, + "grad_norm": 0.9999325275421143, + "learning_rate": 5.227451217301999e-05, + "loss": 2.5115, + "step": 24458 + }, + { + "epoch": 2.215940748793912, + "grad_norm": 1.067893147468567, + "learning_rate": 5.226847097202924e-05, + "loss": 2.9665, + "step": 24459 + }, + { + "epoch": 2.2160313469683586, + "grad_norm": 1.0298384428024292, + "learning_rate": 5.226242977103849e-05, + "loss": 2.3385, + "step": 24460 + }, + { + "epoch": 2.216121945142805, + "grad_norm": 1.13263738155365, + "learning_rate": 5.225638857004773e-05, + "loss": 2.5089, + "step": 24461 + }, + { + "epoch": 2.216212543317252, + "grad_norm": 0.9272982478141785, + "learning_rate": 5.225034736905697e-05, + "loss": 2.5637, + "step": 24462 + }, + { + "epoch": 2.216303141491699, + "grad_norm": 1.05058753490448, + "learning_rate": 5.224430616806622e-05, + "loss": 2.9197, + "step": 24463 + }, + { + "epoch": 2.2163937396661457, + "grad_norm": 0.9852254390716553, + "learning_rate": 5.223826496707546e-05, + "loss": 2.5493, + "step": 24464 + }, + { + "epoch": 2.2164843378405923, + "grad_norm": 0.9773514866828918, + "learning_rate": 5.22322237660847e-05, + "loss": 2.683, + "step": 24465 + }, + { + "epoch": 2.2165749360150393, + "grad_norm": 0.815179169178009, + "learning_rate": 5.222618256509394e-05, + "loss": 2.0431, + "step": 24466 + }, + { + "epoch": 2.2166655341894863, + "grad_norm": 0.9569122791290283, + "learning_rate": 5.222014136410318e-05, + "loss": 2.3516, + "step": 24467 + }, + { + "epoch": 2.216756132363933, + "grad_norm": 1.011162519454956, + "learning_rate": 5.2214100163112435e-05, + "loss": 2.8462, + "step": 24468 + }, + { + "epoch": 2.2168467305383794, + "grad_norm": 1.2398674488067627, + "learning_rate": 5.2208058962121676e-05, + "loss": 2.3351, + "step": 24469 + }, + { + "epoch": 2.2169373287128264, + "grad_norm": 1.0908901691436768, + "learning_rate": 5.220201776113092e-05, + "loss": 2.8179, + "step": 24470 + }, + { + "epoch": 2.217027926887273, + "grad_norm": 1.0148615837097168, + "learning_rate": 5.219597656014016e-05, + "loss": 2.796, + "step": 24471 + }, + { + "epoch": 2.21711852506172, + "grad_norm": 1.007083535194397, + "learning_rate": 5.21899353591494e-05, + "loss": 2.7458, + "step": 24472 + }, + { + "epoch": 2.2172091232361666, + "grad_norm": 0.9805845618247986, + "learning_rate": 5.218389415815864e-05, + "loss": 2.5219, + "step": 24473 + }, + { + "epoch": 2.2172997214106136, + "grad_norm": 0.9732187986373901, + "learning_rate": 5.217785295716788e-05, + "loss": 2.6918, + "step": 24474 + }, + { + "epoch": 2.21739031958506, + "grad_norm": 0.9672894477844238, + "learning_rate": 5.2171811756177135e-05, + "loss": 2.6665, + "step": 24475 + }, + { + "epoch": 2.217480917759507, + "grad_norm": 0.9603413343429565, + "learning_rate": 5.2165770555186376e-05, + "loss": 2.6638, + "step": 24476 + }, + { + "epoch": 2.2175715159339537, + "grad_norm": 0.9605821371078491, + "learning_rate": 5.2159729354195616e-05, + "loss": 2.5134, + "step": 24477 + }, + { + "epoch": 2.2176621141084008, + "grad_norm": 0.8905580043792725, + "learning_rate": 5.215368815320486e-05, + "loss": 1.9489, + "step": 24478 + }, + { + "epoch": 2.2177527122828473, + "grad_norm": 0.9925828576087952, + "learning_rate": 5.2147646952214105e-05, + "loss": 2.6159, + "step": 24479 + }, + { + "epoch": 2.2178433104572943, + "grad_norm": 0.9671757817268372, + "learning_rate": 5.2141605751223346e-05, + "loss": 2.7742, + "step": 24480 + }, + { + "epoch": 2.217933908631741, + "grad_norm": 1.0768163204193115, + "learning_rate": 5.2135564550232587e-05, + "loss": 2.5743, + "step": 24481 + }, + { + "epoch": 2.218024506806188, + "grad_norm": 0.992374837398529, + "learning_rate": 5.212952334924183e-05, + "loss": 2.7619, + "step": 24482 + }, + { + "epoch": 2.2181151049806345, + "grad_norm": 0.9952061772346497, + "learning_rate": 5.212348214825108e-05, + "loss": 2.6953, + "step": 24483 + }, + { + "epoch": 2.2182057031550815, + "grad_norm": 0.9604133367538452, + "learning_rate": 5.211744094726032e-05, + "loss": 2.5486, + "step": 24484 + }, + { + "epoch": 2.218296301329528, + "grad_norm": 0.9916930198669434, + "learning_rate": 5.211139974626956e-05, + "loss": 2.5523, + "step": 24485 + }, + { + "epoch": 2.218386899503975, + "grad_norm": 1.1137040853500366, + "learning_rate": 5.2105358545278804e-05, + "loss": 2.3669, + "step": 24486 + }, + { + "epoch": 2.2184774976784216, + "grad_norm": 0.9897922277450562, + "learning_rate": 5.2099317344288045e-05, + "loss": 2.4981, + "step": 24487 + }, + { + "epoch": 2.2185680958528686, + "grad_norm": 0.9518617391586304, + "learning_rate": 5.2093276143297286e-05, + "loss": 2.5189, + "step": 24488 + }, + { + "epoch": 2.218658694027315, + "grad_norm": 0.9281025528907776, + "learning_rate": 5.208723494230653e-05, + "loss": 2.4771, + "step": 24489 + }, + { + "epoch": 2.218749292201762, + "grad_norm": 1.173449158668518, + "learning_rate": 5.208119374131578e-05, + "loss": 2.5211, + "step": 24490 + }, + { + "epoch": 2.2188398903762088, + "grad_norm": 0.8507887125015259, + "learning_rate": 5.207515254032502e-05, + "loss": 1.8416, + "step": 24491 + }, + { + "epoch": 2.2189304885506558, + "grad_norm": 1.0079954862594604, + "learning_rate": 5.206911133933426e-05, + "loss": 2.5129, + "step": 24492 + }, + { + "epoch": 2.2190210867251023, + "grad_norm": 0.9753649830818176, + "learning_rate": 5.2063070138343504e-05, + "loss": 2.5652, + "step": 24493 + }, + { + "epoch": 2.2191116848995494, + "grad_norm": 0.9328112602233887, + "learning_rate": 5.2057028937352744e-05, + "loss": 2.5731, + "step": 24494 + }, + { + "epoch": 2.219202283073996, + "grad_norm": 0.8476932644844055, + "learning_rate": 5.205098773636199e-05, + "loss": 1.9017, + "step": 24495 + }, + { + "epoch": 2.219292881248443, + "grad_norm": 0.9843248128890991, + "learning_rate": 5.204494653537123e-05, + "loss": 2.2657, + "step": 24496 + }, + { + "epoch": 2.2193834794228895, + "grad_norm": 0.8856931328773499, + "learning_rate": 5.2038905334380474e-05, + "loss": 2.1242, + "step": 24497 + }, + { + "epoch": 2.2194740775973365, + "grad_norm": 0.9699370861053467, + "learning_rate": 5.203286413338973e-05, + "loss": 2.7189, + "step": 24498 + }, + { + "epoch": 2.219564675771783, + "grad_norm": 0.9520725011825562, + "learning_rate": 5.202682293239897e-05, + "loss": 2.416, + "step": 24499 + }, + { + "epoch": 2.21965527394623, + "grad_norm": 1.0550123453140259, + "learning_rate": 5.202078173140821e-05, + "loss": 2.575, + "step": 24500 + }, + { + "epoch": 2.2197458721206766, + "grad_norm": 0.9819817543029785, + "learning_rate": 5.201474053041745e-05, + "loss": 2.5631, + "step": 24501 + }, + { + "epoch": 2.2198364702951237, + "grad_norm": 0.9787179827690125, + "learning_rate": 5.200869932942669e-05, + "loss": 2.817, + "step": 24502 + }, + { + "epoch": 2.21992706846957, + "grad_norm": 1.0079458951950073, + "learning_rate": 5.200265812843593e-05, + "loss": 2.487, + "step": 24503 + }, + { + "epoch": 2.2200176666440172, + "grad_norm": 0.9567796587944031, + "learning_rate": 5.199661692744517e-05, + "loss": 2.4985, + "step": 24504 + }, + { + "epoch": 2.220108264818464, + "grad_norm": 1.01318359375, + "learning_rate": 5.199057572645443e-05, + "loss": 2.8248, + "step": 24505 + }, + { + "epoch": 2.220198862992911, + "grad_norm": 1.0461491346359253, + "learning_rate": 5.198453452546367e-05, + "loss": 2.8622, + "step": 24506 + }, + { + "epoch": 2.2202894611673574, + "grad_norm": 1.0293077230453491, + "learning_rate": 5.197849332447291e-05, + "loss": 2.5517, + "step": 24507 + }, + { + "epoch": 2.2203800593418044, + "grad_norm": 1.1273865699768066, + "learning_rate": 5.197245212348215e-05, + "loss": 2.5203, + "step": 24508 + }, + { + "epoch": 2.220470657516251, + "grad_norm": 0.9681062698364258, + "learning_rate": 5.196641092249139e-05, + "loss": 2.6577, + "step": 24509 + }, + { + "epoch": 2.220561255690698, + "grad_norm": 1.0608100891113281, + "learning_rate": 5.196036972150063e-05, + "loss": 2.685, + "step": 24510 + }, + { + "epoch": 2.2206518538651445, + "grad_norm": 1.0160363912582397, + "learning_rate": 5.195432852050988e-05, + "loss": 2.5425, + "step": 24511 + }, + { + "epoch": 2.2207424520395915, + "grad_norm": 0.9801622629165649, + "learning_rate": 5.194828731951912e-05, + "loss": 2.6673, + "step": 24512 + }, + { + "epoch": 2.220833050214038, + "grad_norm": 0.881573498249054, + "learning_rate": 5.194224611852837e-05, + "loss": 2.0121, + "step": 24513 + }, + { + "epoch": 2.220923648388485, + "grad_norm": 1.0488510131835938, + "learning_rate": 5.1936204917537615e-05, + "loss": 2.7104, + "step": 24514 + }, + { + "epoch": 2.2210142465629317, + "grad_norm": 1.0483787059783936, + "learning_rate": 5.1930163716546856e-05, + "loss": 2.9155, + "step": 24515 + }, + { + "epoch": 2.2211048447373787, + "grad_norm": 1.2514193058013916, + "learning_rate": 5.19241225155561e-05, + "loss": 2.3483, + "step": 24516 + }, + { + "epoch": 2.2211954429118252, + "grad_norm": 0.9965010285377502, + "learning_rate": 5.191808131456534e-05, + "loss": 2.5156, + "step": 24517 + }, + { + "epoch": 2.2212860410862723, + "grad_norm": 0.9767670035362244, + "learning_rate": 5.191204011357458e-05, + "loss": 2.4164, + "step": 24518 + }, + { + "epoch": 2.221376639260719, + "grad_norm": 1.0328481197357178, + "learning_rate": 5.190599891258382e-05, + "loss": 2.4566, + "step": 24519 + }, + { + "epoch": 2.221467237435166, + "grad_norm": 0.9687452912330627, + "learning_rate": 5.1899957711593074e-05, + "loss": 2.6589, + "step": 24520 + }, + { + "epoch": 2.2215578356096124, + "grad_norm": 1.005750060081482, + "learning_rate": 5.1893916510602315e-05, + "loss": 2.5517, + "step": 24521 + }, + { + "epoch": 2.2216484337840594, + "grad_norm": 1.0777480602264404, + "learning_rate": 5.1887875309611555e-05, + "loss": 2.4616, + "step": 24522 + }, + { + "epoch": 2.221739031958506, + "grad_norm": 0.9557012915611267, + "learning_rate": 5.1881834108620796e-05, + "loss": 2.8615, + "step": 24523 + }, + { + "epoch": 2.221829630132953, + "grad_norm": 1.0169438123703003, + "learning_rate": 5.187579290763004e-05, + "loss": 2.5511, + "step": 24524 + }, + { + "epoch": 2.2219202283073995, + "grad_norm": 0.9915849566459656, + "learning_rate": 5.186975170663928e-05, + "loss": 2.5211, + "step": 24525 + }, + { + "epoch": 2.2220108264818466, + "grad_norm": 1.010949969291687, + "learning_rate": 5.186371050564852e-05, + "loss": 2.5765, + "step": 24526 + }, + { + "epoch": 2.222101424656293, + "grad_norm": 0.9799745082855225, + "learning_rate": 5.1857669304657766e-05, + "loss": 2.5174, + "step": 24527 + }, + { + "epoch": 2.22219202283074, + "grad_norm": 0.9405409097671509, + "learning_rate": 5.1851628103667014e-05, + "loss": 2.7308, + "step": 24528 + }, + { + "epoch": 2.2222826210051867, + "grad_norm": 1.0735349655151367, + "learning_rate": 5.1845586902676255e-05, + "loss": 2.6417, + "step": 24529 + }, + { + "epoch": 2.2223732191796337, + "grad_norm": 1.0108176469802856, + "learning_rate": 5.18395457016855e-05, + "loss": 2.7156, + "step": 24530 + }, + { + "epoch": 2.2224638173540803, + "grad_norm": 0.8732377886772156, + "learning_rate": 5.183350450069474e-05, + "loss": 1.9205, + "step": 24531 + }, + { + "epoch": 2.2225544155285273, + "grad_norm": 1.0257540941238403, + "learning_rate": 5.1827463299703984e-05, + "loss": 2.7633, + "step": 24532 + }, + { + "epoch": 2.222645013702974, + "grad_norm": 0.8492392301559448, + "learning_rate": 5.1821422098713225e-05, + "loss": 1.7826, + "step": 24533 + }, + { + "epoch": 2.222735611877421, + "grad_norm": 0.8524252772331238, + "learning_rate": 5.1815380897722466e-05, + "loss": 2.0179, + "step": 24534 + }, + { + "epoch": 2.2228262100518674, + "grad_norm": 0.9540413022041321, + "learning_rate": 5.180933969673172e-05, + "loss": 2.4288, + "step": 24535 + }, + { + "epoch": 2.2229168082263144, + "grad_norm": 0.9385272860527039, + "learning_rate": 5.180329849574096e-05, + "loss": 1.9984, + "step": 24536 + }, + { + "epoch": 2.223007406400761, + "grad_norm": 1.023345708847046, + "learning_rate": 5.17972572947502e-05, + "loss": 2.6591, + "step": 24537 + }, + { + "epoch": 2.223098004575208, + "grad_norm": 0.9485226273536682, + "learning_rate": 5.179121609375944e-05, + "loss": 2.6044, + "step": 24538 + }, + { + "epoch": 2.2231886027496546, + "grad_norm": 1.1550582647323608, + "learning_rate": 5.178517489276868e-05, + "loss": 2.4912, + "step": 24539 + }, + { + "epoch": 2.2232792009241016, + "grad_norm": 1.1096835136413574, + "learning_rate": 5.1779133691777924e-05, + "loss": 2.5898, + "step": 24540 + }, + { + "epoch": 2.223369799098548, + "grad_norm": 1.0036866664886475, + "learning_rate": 5.1773092490787165e-05, + "loss": 2.4779, + "step": 24541 + }, + { + "epoch": 2.223460397272995, + "grad_norm": 1.0103216171264648, + "learning_rate": 5.1767051289796406e-05, + "loss": 2.6275, + "step": 24542 + }, + { + "epoch": 2.2235509954474417, + "grad_norm": 0.9494397640228271, + "learning_rate": 5.176101008880566e-05, + "loss": 2.4643, + "step": 24543 + }, + { + "epoch": 2.2236415936218883, + "grad_norm": 0.8902720808982849, + "learning_rate": 5.17549688878149e-05, + "loss": 1.996, + "step": 24544 + }, + { + "epoch": 2.2237321917963353, + "grad_norm": 0.8432437777519226, + "learning_rate": 5.174892768682414e-05, + "loss": 1.7825, + "step": 24545 + }, + { + "epoch": 2.2238227899707823, + "grad_norm": 0.8991324305534363, + "learning_rate": 5.174288648583339e-05, + "loss": 2.7781, + "step": 24546 + }, + { + "epoch": 2.223913388145229, + "grad_norm": 0.9930669069290161, + "learning_rate": 5.173684528484263e-05, + "loss": 2.2102, + "step": 24547 + }, + { + "epoch": 2.2240039863196754, + "grad_norm": 0.8467350602149963, + "learning_rate": 5.173080408385187e-05, + "loss": 1.87, + "step": 24548 + }, + { + "epoch": 2.2240945844941225, + "grad_norm": 0.9928826093673706, + "learning_rate": 5.172476288286111e-05, + "loss": 2.5639, + "step": 24549 + }, + { + "epoch": 2.2241851826685695, + "grad_norm": 1.07970130443573, + "learning_rate": 5.1718721681870366e-05, + "loss": 2.5754, + "step": 24550 + }, + { + "epoch": 2.224275780843016, + "grad_norm": 1.0512045621871948, + "learning_rate": 5.171268048087961e-05, + "loss": 2.5051, + "step": 24551 + }, + { + "epoch": 2.2243663790174626, + "grad_norm": 0.9789888262748718, + "learning_rate": 5.170663927988885e-05, + "loss": 2.6722, + "step": 24552 + }, + { + "epoch": 2.2244569771919096, + "grad_norm": 1.0391985177993774, + "learning_rate": 5.170059807889809e-05, + "loss": 2.5932, + "step": 24553 + }, + { + "epoch": 2.224547575366356, + "grad_norm": 1.0020686388015747, + "learning_rate": 5.169455687790733e-05, + "loss": 2.6071, + "step": 24554 + }, + { + "epoch": 2.224638173540803, + "grad_norm": 0.9975208640098572, + "learning_rate": 5.168851567691657e-05, + "loss": 2.7358, + "step": 24555 + }, + { + "epoch": 2.2247287717152497, + "grad_norm": 1.0006811618804932, + "learning_rate": 5.168247447592581e-05, + "loss": 2.7935, + "step": 24556 + }, + { + "epoch": 2.2248193698896968, + "grad_norm": 1.011270523071289, + "learning_rate": 5.167643327493505e-05, + "loss": 2.6721, + "step": 24557 + }, + { + "epoch": 2.2249099680641433, + "grad_norm": 0.99925297498703, + "learning_rate": 5.1670392073944307e-05, + "loss": 2.7745, + "step": 24558 + }, + { + "epoch": 2.2250005662385903, + "grad_norm": 0.9953814744949341, + "learning_rate": 5.166435087295355e-05, + "loss": 2.5316, + "step": 24559 + }, + { + "epoch": 2.225091164413037, + "grad_norm": 0.9449275732040405, + "learning_rate": 5.165830967196279e-05, + "loss": 2.6354, + "step": 24560 + }, + { + "epoch": 2.225181762587484, + "grad_norm": 0.7177294492721558, + "learning_rate": 5.165226847097203e-05, + "loss": 1.3847, + "step": 24561 + }, + { + "epoch": 2.2252723607619305, + "grad_norm": 0.9583407044410706, + "learning_rate": 5.164622726998127e-05, + "loss": 2.6136, + "step": 24562 + }, + { + "epoch": 2.2253629589363775, + "grad_norm": 1.0638846158981323, + "learning_rate": 5.164018606899052e-05, + "loss": 2.4933, + "step": 24563 + }, + { + "epoch": 2.225453557110824, + "grad_norm": 1.0185024738311768, + "learning_rate": 5.163414486799976e-05, + "loss": 2.5986, + "step": 24564 + }, + { + "epoch": 2.225544155285271, + "grad_norm": 0.9770681262016296, + "learning_rate": 5.1628103667009006e-05, + "loss": 2.4939, + "step": 24565 + }, + { + "epoch": 2.2256347534597176, + "grad_norm": 1.0602564811706543, + "learning_rate": 5.1622062466018254e-05, + "loss": 2.6199, + "step": 24566 + }, + { + "epoch": 2.2257253516341646, + "grad_norm": 0.9005023241043091, + "learning_rate": 5.1616021265027494e-05, + "loss": 1.9021, + "step": 24567 + }, + { + "epoch": 2.225815949808611, + "grad_norm": 1.0036462545394897, + "learning_rate": 5.1609980064036735e-05, + "loss": 2.5742, + "step": 24568 + }, + { + "epoch": 2.225906547983058, + "grad_norm": 1.0108280181884766, + "learning_rate": 5.1603938863045976e-05, + "loss": 2.3152, + "step": 24569 + }, + { + "epoch": 2.2259971461575048, + "grad_norm": 0.7871686816215515, + "learning_rate": 5.159789766205522e-05, + "loss": 1.3096, + "step": 24570 + }, + { + "epoch": 2.226087744331952, + "grad_norm": 1.0075199604034424, + "learning_rate": 5.159185646106446e-05, + "loss": 2.5838, + "step": 24571 + }, + { + "epoch": 2.2261783425063983, + "grad_norm": 1.0836957693099976, + "learning_rate": 5.15858152600737e-05, + "loss": 2.5831, + "step": 24572 + }, + { + "epoch": 2.2262689406808454, + "grad_norm": 1.0072990655899048, + "learning_rate": 5.157977405908295e-05, + "loss": 2.6477, + "step": 24573 + }, + { + "epoch": 2.226359538855292, + "grad_norm": 1.0663723945617676, + "learning_rate": 5.1573732858092194e-05, + "loss": 2.8031, + "step": 24574 + }, + { + "epoch": 2.226450137029739, + "grad_norm": 0.9700540900230408, + "learning_rate": 5.1567691657101435e-05, + "loss": 1.8848, + "step": 24575 + }, + { + "epoch": 2.2265407352041855, + "grad_norm": 1.0095349550247192, + "learning_rate": 5.1561650456110675e-05, + "loss": 2.6378, + "step": 24576 + }, + { + "epoch": 2.2266313333786325, + "grad_norm": 1.008327603340149, + "learning_rate": 5.1555609255119916e-05, + "loss": 2.588, + "step": 24577 + }, + { + "epoch": 2.226721931553079, + "grad_norm": 1.062530755996704, + "learning_rate": 5.154956805412916e-05, + "loss": 2.4829, + "step": 24578 + }, + { + "epoch": 2.226812529727526, + "grad_norm": 0.8730295300483704, + "learning_rate": 5.1543526853138405e-05, + "loss": 1.8937, + "step": 24579 + }, + { + "epoch": 2.2269031279019726, + "grad_norm": 0.8623102903366089, + "learning_rate": 5.153748565214765e-05, + "loss": 2.1473, + "step": 24580 + }, + { + "epoch": 2.2269937260764197, + "grad_norm": 0.9463425278663635, + "learning_rate": 5.153144445115689e-05, + "loss": 2.1751, + "step": 24581 + }, + { + "epoch": 2.227084324250866, + "grad_norm": 0.9782412648200989, + "learning_rate": 5.152540325016614e-05, + "loss": 2.4847, + "step": 24582 + }, + { + "epoch": 2.2271749224253132, + "grad_norm": 1.0849086046218872, + "learning_rate": 5.151936204917538e-05, + "loss": 2.7061, + "step": 24583 + }, + { + "epoch": 2.22726552059976, + "grad_norm": 1.1400998830795288, + "learning_rate": 5.151332084818462e-05, + "loss": 2.4344, + "step": 24584 + }, + { + "epoch": 2.227356118774207, + "grad_norm": 0.9450408816337585, + "learning_rate": 5.150727964719386e-05, + "loss": 2.8788, + "step": 24585 + }, + { + "epoch": 2.2274467169486534, + "grad_norm": 1.0099085569381714, + "learning_rate": 5.1501238446203104e-05, + "loss": 2.5753, + "step": 24586 + }, + { + "epoch": 2.2275373151231004, + "grad_norm": 0.9832696318626404, + "learning_rate": 5.1495197245212345e-05, + "loss": 2.8341, + "step": 24587 + }, + { + "epoch": 2.227627913297547, + "grad_norm": 0.9603887796401978, + "learning_rate": 5.14891560442216e-05, + "loss": 2.5816, + "step": 24588 + }, + { + "epoch": 2.227718511471994, + "grad_norm": 0.9772837162017822, + "learning_rate": 5.148311484323084e-05, + "loss": 2.6132, + "step": 24589 + }, + { + "epoch": 2.2278091096464405, + "grad_norm": 0.9631600379943848, + "learning_rate": 5.147707364224008e-05, + "loss": 2.7416, + "step": 24590 + }, + { + "epoch": 2.2278997078208875, + "grad_norm": 1.0127743482589722, + "learning_rate": 5.147103244124932e-05, + "loss": 2.5363, + "step": 24591 + }, + { + "epoch": 2.227990305995334, + "grad_norm": 0.937326192855835, + "learning_rate": 5.146499124025856e-05, + "loss": 2.7047, + "step": 24592 + }, + { + "epoch": 2.228080904169781, + "grad_norm": 0.9295424222946167, + "learning_rate": 5.14589500392678e-05, + "loss": 2.4152, + "step": 24593 + }, + { + "epoch": 2.2281715023442277, + "grad_norm": 0.9455865621566772, + "learning_rate": 5.1452908838277044e-05, + "loss": 2.3553, + "step": 24594 + }, + { + "epoch": 2.2282621005186747, + "grad_norm": 0.9769918322563171, + "learning_rate": 5.14468676372863e-05, + "loss": 2.585, + "step": 24595 + }, + { + "epoch": 2.2283526986931212, + "grad_norm": 1.0562658309936523, + "learning_rate": 5.144082643629554e-05, + "loss": 2.9149, + "step": 24596 + }, + { + "epoch": 2.2284432968675683, + "grad_norm": 0.9223426580429077, + "learning_rate": 5.143478523530478e-05, + "loss": 2.3962, + "step": 24597 + }, + { + "epoch": 2.228533895042015, + "grad_norm": 0.8663753271102905, + "learning_rate": 5.142874403431403e-05, + "loss": 1.7897, + "step": 24598 + }, + { + "epoch": 2.228624493216462, + "grad_norm": 1.0473670959472656, + "learning_rate": 5.142270283332327e-05, + "loss": 2.8119, + "step": 24599 + }, + { + "epoch": 2.2287150913909084, + "grad_norm": 1.0282238721847534, + "learning_rate": 5.141666163233251e-05, + "loss": 2.484, + "step": 24600 + }, + { + "epoch": 2.2288056895653554, + "grad_norm": 0.9989812970161438, + "learning_rate": 5.141062043134175e-05, + "loss": 2.6896, + "step": 24601 + }, + { + "epoch": 2.228896287739802, + "grad_norm": 1.0971497297286987, + "learning_rate": 5.140457923035099e-05, + "loss": 2.5442, + "step": 24602 + }, + { + "epoch": 2.228986885914249, + "grad_norm": 0.8288206458091736, + "learning_rate": 5.1398538029360246e-05, + "loss": 1.9073, + "step": 24603 + }, + { + "epoch": 2.2290774840886955, + "grad_norm": 1.0094236135482788, + "learning_rate": 5.1392496828369486e-05, + "loss": 2.7324, + "step": 24604 + }, + { + "epoch": 2.2291680822631426, + "grad_norm": 1.0005676746368408, + "learning_rate": 5.138645562737873e-05, + "loss": 2.6502, + "step": 24605 + }, + { + "epoch": 2.229258680437589, + "grad_norm": 0.9377448558807373, + "learning_rate": 5.138041442638797e-05, + "loss": 1.9655, + "step": 24606 + }, + { + "epoch": 2.229349278612036, + "grad_norm": 0.9211044907569885, + "learning_rate": 5.137437322539721e-05, + "loss": 2.5262, + "step": 24607 + }, + { + "epoch": 2.2294398767864827, + "grad_norm": 0.9566271305084229, + "learning_rate": 5.136833202440645e-05, + "loss": 2.6551, + "step": 24608 + }, + { + "epoch": 2.2295304749609297, + "grad_norm": 1.0022255182266235, + "learning_rate": 5.136229082341569e-05, + "loss": 2.6627, + "step": 24609 + }, + { + "epoch": 2.2296210731353763, + "grad_norm": 1.0585150718688965, + "learning_rate": 5.1356249622424945e-05, + "loss": 2.5848, + "step": 24610 + }, + { + "epoch": 2.2297116713098233, + "grad_norm": 0.989048182964325, + "learning_rate": 5.1350208421434186e-05, + "loss": 2.7838, + "step": 24611 + }, + { + "epoch": 2.22980226948427, + "grad_norm": 0.9990922212600708, + "learning_rate": 5.1344167220443427e-05, + "loss": 2.5664, + "step": 24612 + }, + { + "epoch": 2.229892867658717, + "grad_norm": 0.8851587176322937, + "learning_rate": 5.133812601945267e-05, + "loss": 1.7572, + "step": 24613 + }, + { + "epoch": 2.2299834658331634, + "grad_norm": 1.022773265838623, + "learning_rate": 5.1332084818461915e-05, + "loss": 2.4024, + "step": 24614 + }, + { + "epoch": 2.2300740640076104, + "grad_norm": 0.9422008395195007, + "learning_rate": 5.1326043617471156e-05, + "loss": 2.1191, + "step": 24615 + }, + { + "epoch": 2.230164662182057, + "grad_norm": 1.0113359689712524, + "learning_rate": 5.13200024164804e-05, + "loss": 2.708, + "step": 24616 + }, + { + "epoch": 2.230255260356504, + "grad_norm": 1.014255404472351, + "learning_rate": 5.131396121548964e-05, + "loss": 2.6241, + "step": 24617 + }, + { + "epoch": 2.2303458585309506, + "grad_norm": 1.126194953918457, + "learning_rate": 5.130792001449889e-05, + "loss": 2.6922, + "step": 24618 + }, + { + "epoch": 2.2304364567053976, + "grad_norm": 0.9968371987342834, + "learning_rate": 5.130187881350813e-05, + "loss": 2.5753, + "step": 24619 + }, + { + "epoch": 2.230527054879844, + "grad_norm": 0.8341964483261108, + "learning_rate": 5.1295837612517374e-05, + "loss": 2.1466, + "step": 24620 + }, + { + "epoch": 2.230617653054291, + "grad_norm": 1.0122880935668945, + "learning_rate": 5.1289796411526614e-05, + "loss": 2.5774, + "step": 24621 + }, + { + "epoch": 2.2307082512287377, + "grad_norm": 0.9730978608131409, + "learning_rate": 5.1283755210535855e-05, + "loss": 2.5665, + "step": 24622 + }, + { + "epoch": 2.2307988494031843, + "grad_norm": 0.9477663636207581, + "learning_rate": 5.1277714009545096e-05, + "loss": 2.4293, + "step": 24623 + }, + { + "epoch": 2.2308894475776313, + "grad_norm": 1.0159034729003906, + "learning_rate": 5.127167280855434e-05, + "loss": 2.321, + "step": 24624 + }, + { + "epoch": 2.2309800457520783, + "grad_norm": 0.9630293846130371, + "learning_rate": 5.126563160756359e-05, + "loss": 2.7644, + "step": 24625 + }, + { + "epoch": 2.231070643926525, + "grad_norm": 1.1367769241333008, + "learning_rate": 5.125959040657283e-05, + "loss": 2.6117, + "step": 24626 + }, + { + "epoch": 2.2311612421009714, + "grad_norm": 0.9562566876411438, + "learning_rate": 5.125354920558207e-05, + "loss": 2.8719, + "step": 24627 + }, + { + "epoch": 2.2312518402754185, + "grad_norm": 1.1331732273101807, + "learning_rate": 5.1247508004591314e-05, + "loss": 2.4455, + "step": 24628 + }, + { + "epoch": 2.2313424384498655, + "grad_norm": 1.001499056816101, + "learning_rate": 5.1241466803600555e-05, + "loss": 2.5469, + "step": 24629 + }, + { + "epoch": 2.231433036624312, + "grad_norm": 0.9682405591011047, + "learning_rate": 5.12354256026098e-05, + "loss": 2.675, + "step": 24630 + }, + { + "epoch": 2.2315236347987586, + "grad_norm": 0.973930299282074, + "learning_rate": 5.122938440161904e-05, + "loss": 2.5942, + "step": 24631 + }, + { + "epoch": 2.2316142329732056, + "grad_norm": 0.950171947479248, + "learning_rate": 5.1223343200628284e-05, + "loss": 2.2772, + "step": 24632 + }, + { + "epoch": 2.231704831147652, + "grad_norm": 1.1286077499389648, + "learning_rate": 5.121730199963753e-05, + "loss": 2.4498, + "step": 24633 + }, + { + "epoch": 2.231795429322099, + "grad_norm": 0.9848610758781433, + "learning_rate": 5.121126079864678e-05, + "loss": 2.4211, + "step": 24634 + }, + { + "epoch": 2.2318860274965457, + "grad_norm": 0.9339320063591003, + "learning_rate": 5.120521959765602e-05, + "loss": 2.4763, + "step": 24635 + }, + { + "epoch": 2.2319766256709928, + "grad_norm": 0.9681841135025024, + "learning_rate": 5.119917839666526e-05, + "loss": 2.5872, + "step": 24636 + }, + { + "epoch": 2.2320672238454393, + "grad_norm": 1.1078914403915405, + "learning_rate": 5.11931371956745e-05, + "loss": 2.6757, + "step": 24637 + }, + { + "epoch": 2.2321578220198863, + "grad_norm": 0.6668241024017334, + "learning_rate": 5.118709599468374e-05, + "loss": 1.357, + "step": 24638 + }, + { + "epoch": 2.232248420194333, + "grad_norm": 1.1366868019104004, + "learning_rate": 5.118105479369298e-05, + "loss": 2.7221, + "step": 24639 + }, + { + "epoch": 2.23233901836878, + "grad_norm": 1.0170934200286865, + "learning_rate": 5.117501359270224e-05, + "loss": 2.7099, + "step": 24640 + }, + { + "epoch": 2.2324296165432265, + "grad_norm": 1.0063235759735107, + "learning_rate": 5.116897239171148e-05, + "loss": 2.5892, + "step": 24641 + }, + { + "epoch": 2.2325202147176735, + "grad_norm": 0.9679201245307922, + "learning_rate": 5.116293119072072e-05, + "loss": 2.5144, + "step": 24642 + }, + { + "epoch": 2.23261081289212, + "grad_norm": 1.051190733909607, + "learning_rate": 5.115688998972996e-05, + "loss": 2.8978, + "step": 24643 + }, + { + "epoch": 2.232701411066567, + "grad_norm": 0.9320235848426819, + "learning_rate": 5.11508487887392e-05, + "loss": 2.6614, + "step": 24644 + }, + { + "epoch": 2.2327920092410136, + "grad_norm": 0.9820689558982849, + "learning_rate": 5.114480758774844e-05, + "loss": 2.8264, + "step": 24645 + }, + { + "epoch": 2.2328826074154606, + "grad_norm": 0.9272956848144531, + "learning_rate": 5.113876638675769e-05, + "loss": 2.4628, + "step": 24646 + }, + { + "epoch": 2.232973205589907, + "grad_norm": 0.9594470262527466, + "learning_rate": 5.113272518576693e-05, + "loss": 2.548, + "step": 24647 + }, + { + "epoch": 2.233063803764354, + "grad_norm": 0.9921393990516663, + "learning_rate": 5.112668398477618e-05, + "loss": 2.0006, + "step": 24648 + }, + { + "epoch": 2.2331544019388008, + "grad_norm": 0.9970415234565735, + "learning_rate": 5.112064278378542e-05, + "loss": 2.7061, + "step": 24649 + }, + { + "epoch": 2.233245000113248, + "grad_norm": 1.0132083892822266, + "learning_rate": 5.1114601582794666e-05, + "loss": 2.804, + "step": 24650 + }, + { + "epoch": 2.2333355982876943, + "grad_norm": 1.073668360710144, + "learning_rate": 5.110856038180391e-05, + "loss": 2.8139, + "step": 24651 + }, + { + "epoch": 2.2334261964621414, + "grad_norm": 0.9621559977531433, + "learning_rate": 5.110251918081315e-05, + "loss": 2.4783, + "step": 24652 + }, + { + "epoch": 2.233516794636588, + "grad_norm": 1.0243706703186035, + "learning_rate": 5.109647797982239e-05, + "loss": 2.3551, + "step": 24653 + }, + { + "epoch": 2.233607392811035, + "grad_norm": 0.9634197950363159, + "learning_rate": 5.109043677883163e-05, + "loss": 2.5352, + "step": 24654 + }, + { + "epoch": 2.2336979909854815, + "grad_norm": 1.007347583770752, + "learning_rate": 5.1084395577840884e-05, + "loss": 2.4797, + "step": 24655 + }, + { + "epoch": 2.2337885891599285, + "grad_norm": 0.9970808625221252, + "learning_rate": 5.1078354376850125e-05, + "loss": 2.6693, + "step": 24656 + }, + { + "epoch": 2.233879187334375, + "grad_norm": 1.0095436573028564, + "learning_rate": 5.1072313175859366e-05, + "loss": 2.9952, + "step": 24657 + }, + { + "epoch": 2.233969785508822, + "grad_norm": 1.1546272039413452, + "learning_rate": 5.1066271974868606e-05, + "loss": 2.8993, + "step": 24658 + }, + { + "epoch": 2.2340603836832686, + "grad_norm": 1.0342726707458496, + "learning_rate": 5.106023077387785e-05, + "loss": 2.6998, + "step": 24659 + }, + { + "epoch": 2.2341509818577157, + "grad_norm": 0.9562206268310547, + "learning_rate": 5.105418957288709e-05, + "loss": 2.4616, + "step": 24660 + }, + { + "epoch": 2.2342415800321622, + "grad_norm": 1.0549083948135376, + "learning_rate": 5.104814837189633e-05, + "loss": 2.4783, + "step": 24661 + }, + { + "epoch": 2.2343321782066092, + "grad_norm": 0.9517850279808044, + "learning_rate": 5.1042107170905576e-05, + "loss": 2.6112, + "step": 24662 + }, + { + "epoch": 2.234422776381056, + "grad_norm": 0.9771384000778198, + "learning_rate": 5.1036065969914824e-05, + "loss": 2.3765, + "step": 24663 + }, + { + "epoch": 2.234513374555503, + "grad_norm": 1.0044258832931519, + "learning_rate": 5.1030024768924065e-05, + "loss": 2.3245, + "step": 24664 + }, + { + "epoch": 2.2346039727299494, + "grad_norm": 1.103562831878662, + "learning_rate": 5.1023983567933306e-05, + "loss": 1.9241, + "step": 24665 + }, + { + "epoch": 2.2346945709043964, + "grad_norm": 1.038609266281128, + "learning_rate": 5.101794236694255e-05, + "loss": 2.577, + "step": 24666 + }, + { + "epoch": 2.234785169078843, + "grad_norm": 0.9736760854721069, + "learning_rate": 5.1011901165951794e-05, + "loss": 2.6992, + "step": 24667 + }, + { + "epoch": 2.23487576725329, + "grad_norm": 0.9552660584449768, + "learning_rate": 5.1005859964961035e-05, + "loss": 2.6202, + "step": 24668 + }, + { + "epoch": 2.2349663654277365, + "grad_norm": 1.019892930984497, + "learning_rate": 5.0999818763970276e-05, + "loss": 2.6152, + "step": 24669 + }, + { + "epoch": 2.2350569636021835, + "grad_norm": 0.991312563419342, + "learning_rate": 5.099377756297953e-05, + "loss": 2.5728, + "step": 24670 + }, + { + "epoch": 2.23514756177663, + "grad_norm": 0.9594220519065857, + "learning_rate": 5.098773636198877e-05, + "loss": 2.6245, + "step": 24671 + }, + { + "epoch": 2.235238159951077, + "grad_norm": 0.9781492352485657, + "learning_rate": 5.098169516099801e-05, + "loss": 2.4667, + "step": 24672 + }, + { + "epoch": 2.2353287581255237, + "grad_norm": 0.9839023947715759, + "learning_rate": 5.097565396000725e-05, + "loss": 2.5153, + "step": 24673 + }, + { + "epoch": 2.2354193562999707, + "grad_norm": 1.1226601600646973, + "learning_rate": 5.0969612759016494e-05, + "loss": 2.7562, + "step": 24674 + }, + { + "epoch": 2.2355099544744172, + "grad_norm": 0.9693971276283264, + "learning_rate": 5.0963571558025734e-05, + "loss": 2.5077, + "step": 24675 + }, + { + "epoch": 2.2356005526488643, + "grad_norm": 1.0559755563735962, + "learning_rate": 5.0957530357034975e-05, + "loss": 2.6631, + "step": 24676 + }, + { + "epoch": 2.235691150823311, + "grad_norm": 1.0193102359771729, + "learning_rate": 5.0951489156044216e-05, + "loss": 2.6438, + "step": 24677 + }, + { + "epoch": 2.235781748997758, + "grad_norm": 0.9893166422843933, + "learning_rate": 5.094544795505347e-05, + "loss": 2.6978, + "step": 24678 + }, + { + "epoch": 2.2358723471722044, + "grad_norm": 0.8475984930992126, + "learning_rate": 5.093940675406271e-05, + "loss": 1.9731, + "step": 24679 + }, + { + "epoch": 2.2359629453466514, + "grad_norm": 1.0866470336914062, + "learning_rate": 5.093336555307195e-05, + "loss": 3.004, + "step": 24680 + }, + { + "epoch": 2.236053543521098, + "grad_norm": 1.0597991943359375, + "learning_rate": 5.092732435208119e-05, + "loss": 2.5858, + "step": 24681 + }, + { + "epoch": 2.236144141695545, + "grad_norm": 0.9696293473243713, + "learning_rate": 5.092128315109044e-05, + "loss": 2.5386, + "step": 24682 + }, + { + "epoch": 2.2362347398699916, + "grad_norm": 0.9822048544883728, + "learning_rate": 5.091524195009968e-05, + "loss": 2.4425, + "step": 24683 + }, + { + "epoch": 2.2363253380444386, + "grad_norm": 0.862488865852356, + "learning_rate": 5.090920074910892e-05, + "loss": 1.8797, + "step": 24684 + }, + { + "epoch": 2.236415936218885, + "grad_norm": 0.9566226601600647, + "learning_rate": 5.0903159548118177e-05, + "loss": 2.5885, + "step": 24685 + }, + { + "epoch": 2.236506534393332, + "grad_norm": 1.0060927867889404, + "learning_rate": 5.089711834712742e-05, + "loss": 2.6062, + "step": 24686 + }, + { + "epoch": 2.2365971325677787, + "grad_norm": 0.8953485488891602, + "learning_rate": 5.089107714613666e-05, + "loss": 1.8337, + "step": 24687 + }, + { + "epoch": 2.2366877307422257, + "grad_norm": 0.9591816663742065, + "learning_rate": 5.08850359451459e-05, + "loss": 2.7533, + "step": 24688 + }, + { + "epoch": 2.2367783289166723, + "grad_norm": 1.044917106628418, + "learning_rate": 5.087899474415514e-05, + "loss": 2.7592, + "step": 24689 + }, + { + "epoch": 2.2368689270911193, + "grad_norm": 0.9003686904907227, + "learning_rate": 5.087295354316438e-05, + "loss": 2.0178, + "step": 24690 + }, + { + "epoch": 2.236959525265566, + "grad_norm": 0.9997341632843018, + "learning_rate": 5.086691234217362e-05, + "loss": 2.6145, + "step": 24691 + }, + { + "epoch": 2.237050123440013, + "grad_norm": 1.0679851770401, + "learning_rate": 5.086087114118286e-05, + "loss": 2.7566, + "step": 24692 + }, + { + "epoch": 2.2371407216144594, + "grad_norm": 0.946188747882843, + "learning_rate": 5.085482994019212e-05, + "loss": 2.45, + "step": 24693 + }, + { + "epoch": 2.2372313197889064, + "grad_norm": 1.1778496503829956, + "learning_rate": 5.084878873920136e-05, + "loss": 2.7435, + "step": 24694 + }, + { + "epoch": 2.237321917963353, + "grad_norm": 1.0573570728302002, + "learning_rate": 5.08427475382106e-05, + "loss": 2.6842, + "step": 24695 + }, + { + "epoch": 2.2374125161378, + "grad_norm": 0.9920772910118103, + "learning_rate": 5.083670633721984e-05, + "loss": 2.7968, + "step": 24696 + }, + { + "epoch": 2.2375031143122466, + "grad_norm": 1.0491856336593628, + "learning_rate": 5.083066513622908e-05, + "loss": 2.7114, + "step": 24697 + }, + { + "epoch": 2.2375937124866936, + "grad_norm": 0.979546844959259, + "learning_rate": 5.082462393523833e-05, + "loss": 2.9706, + "step": 24698 + }, + { + "epoch": 2.23768431066114, + "grad_norm": 1.0268290042877197, + "learning_rate": 5.081858273424757e-05, + "loss": 2.6325, + "step": 24699 + }, + { + "epoch": 2.237774908835587, + "grad_norm": 0.9344151616096497, + "learning_rate": 5.0812541533256816e-05, + "loss": 1.9503, + "step": 24700 + }, + { + "epoch": 2.2378655070100337, + "grad_norm": 0.9590056538581848, + "learning_rate": 5.0806500332266064e-05, + "loss": 2.6344, + "step": 24701 + }, + { + "epoch": 2.2379561051844807, + "grad_norm": 0.9299556016921997, + "learning_rate": 5.0800459131275304e-05, + "loss": 2.5094, + "step": 24702 + }, + { + "epoch": 2.2380467033589273, + "grad_norm": 1.1206607818603516, + "learning_rate": 5.0794417930284545e-05, + "loss": 2.3783, + "step": 24703 + }, + { + "epoch": 2.2381373015333743, + "grad_norm": 0.9467629790306091, + "learning_rate": 5.0788376729293786e-05, + "loss": 2.5676, + "step": 24704 + }, + { + "epoch": 2.238227899707821, + "grad_norm": 0.9828413128852844, + "learning_rate": 5.078233552830303e-05, + "loss": 2.9214, + "step": 24705 + }, + { + "epoch": 2.2383184978822674, + "grad_norm": 0.9854329824447632, + "learning_rate": 5.077629432731227e-05, + "loss": 2.5996, + "step": 24706 + }, + { + "epoch": 2.2384090960567145, + "grad_norm": 1.0689679384231567, + "learning_rate": 5.077025312632151e-05, + "loss": 2.6255, + "step": 24707 + }, + { + "epoch": 2.2384996942311615, + "grad_norm": 0.9802139401435852, + "learning_rate": 5.076421192533076e-05, + "loss": 2.9986, + "step": 24708 + }, + { + "epoch": 2.238590292405608, + "grad_norm": 0.9911450147628784, + "learning_rate": 5.0758170724340004e-05, + "loss": 2.8856, + "step": 24709 + }, + { + "epoch": 2.2386808905800546, + "grad_norm": 0.9342080950737, + "learning_rate": 5.0752129523349245e-05, + "loss": 2.6621, + "step": 24710 + }, + { + "epoch": 2.2387714887545016, + "grad_norm": 0.9648517370223999, + "learning_rate": 5.0746088322358486e-05, + "loss": 2.707, + "step": 24711 + }, + { + "epoch": 2.2388620869289486, + "grad_norm": 0.9531192183494568, + "learning_rate": 5.0740047121367726e-05, + "loss": 2.5426, + "step": 24712 + }, + { + "epoch": 2.238952685103395, + "grad_norm": 1.0197014808654785, + "learning_rate": 5.073400592037697e-05, + "loss": 2.6348, + "step": 24713 + }, + { + "epoch": 2.2390432832778417, + "grad_norm": 1.0942051410675049, + "learning_rate": 5.0727964719386215e-05, + "loss": 2.5269, + "step": 24714 + }, + { + "epoch": 2.2391338814522888, + "grad_norm": 1.0092825889587402, + "learning_rate": 5.072192351839546e-05, + "loss": 2.5283, + "step": 24715 + }, + { + "epoch": 2.2392244796267353, + "grad_norm": 1.0336220264434814, + "learning_rate": 5.07158823174047e-05, + "loss": 2.7797, + "step": 24716 + }, + { + "epoch": 2.2393150778011823, + "grad_norm": 0.9378945231437683, + "learning_rate": 5.070984111641395e-05, + "loss": 2.36, + "step": 24717 + }, + { + "epoch": 2.239405675975629, + "grad_norm": 1.0375568866729736, + "learning_rate": 5.070379991542319e-05, + "loss": 2.6526, + "step": 24718 + }, + { + "epoch": 2.239496274150076, + "grad_norm": 0.9420696496963501, + "learning_rate": 5.069775871443243e-05, + "loss": 2.5941, + "step": 24719 + }, + { + "epoch": 2.2395868723245225, + "grad_norm": 0.9632164835929871, + "learning_rate": 5.069171751344167e-05, + "loss": 2.6003, + "step": 24720 + }, + { + "epoch": 2.2396774704989695, + "grad_norm": 0.9784833788871765, + "learning_rate": 5.0685676312450914e-05, + "loss": 2.7034, + "step": 24721 + }, + { + "epoch": 2.239768068673416, + "grad_norm": 1.048004388809204, + "learning_rate": 5.0679635111460155e-05, + "loss": 2.75, + "step": 24722 + }, + { + "epoch": 2.239858666847863, + "grad_norm": 1.0862276554107666, + "learning_rate": 5.067359391046941e-05, + "loss": 2.8011, + "step": 24723 + }, + { + "epoch": 2.2399492650223096, + "grad_norm": 0.961161732673645, + "learning_rate": 5.066755270947865e-05, + "loss": 2.6326, + "step": 24724 + }, + { + "epoch": 2.2400398631967566, + "grad_norm": 1.099135160446167, + "learning_rate": 5.066151150848789e-05, + "loss": 2.6415, + "step": 24725 + }, + { + "epoch": 2.240130461371203, + "grad_norm": 1.0905144214630127, + "learning_rate": 5.065547030749713e-05, + "loss": 2.652, + "step": 24726 + }, + { + "epoch": 2.24022105954565, + "grad_norm": 1.0023664236068726, + "learning_rate": 5.064942910650637e-05, + "loss": 2.5407, + "step": 24727 + }, + { + "epoch": 2.2403116577200968, + "grad_norm": 1.0106003284454346, + "learning_rate": 5.0643387905515613e-05, + "loss": 2.5667, + "step": 24728 + }, + { + "epoch": 2.240402255894544, + "grad_norm": 1.0235915184020996, + "learning_rate": 5.0637346704524854e-05, + "loss": 2.5396, + "step": 24729 + }, + { + "epoch": 2.2404928540689903, + "grad_norm": 0.9644848704338074, + "learning_rate": 5.063130550353411e-05, + "loss": 2.8269, + "step": 24730 + }, + { + "epoch": 2.2405834522434374, + "grad_norm": 0.9731734395027161, + "learning_rate": 5.062526430254335e-05, + "loss": 2.5785, + "step": 24731 + }, + { + "epoch": 2.240674050417884, + "grad_norm": 0.9593808054924011, + "learning_rate": 5.061922310155259e-05, + "loss": 2.5449, + "step": 24732 + }, + { + "epoch": 2.240764648592331, + "grad_norm": 1.0626999139785767, + "learning_rate": 5.061318190056184e-05, + "loss": 2.6616, + "step": 24733 + }, + { + "epoch": 2.2408552467667775, + "grad_norm": 0.9679470062255859, + "learning_rate": 5.060714069957108e-05, + "loss": 2.716, + "step": 24734 + }, + { + "epoch": 2.2409458449412245, + "grad_norm": 1.0606852769851685, + "learning_rate": 5.060109949858032e-05, + "loss": 2.683, + "step": 24735 + }, + { + "epoch": 2.241036443115671, + "grad_norm": 0.962578296661377, + "learning_rate": 5.059505829758956e-05, + "loss": 2.7516, + "step": 24736 + }, + { + "epoch": 2.241127041290118, + "grad_norm": 1.1294453144073486, + "learning_rate": 5.05890170965988e-05, + "loss": 2.5431, + "step": 24737 + }, + { + "epoch": 2.2412176394645646, + "grad_norm": 1.0132529735565186, + "learning_rate": 5.0582975895608056e-05, + "loss": 2.7038, + "step": 24738 + }, + { + "epoch": 2.2413082376390117, + "grad_norm": 1.0250526666641235, + "learning_rate": 5.0576934694617297e-05, + "loss": 2.6099, + "step": 24739 + }, + { + "epoch": 2.2413988358134582, + "grad_norm": 0.9894005060195923, + "learning_rate": 5.057089349362654e-05, + "loss": 2.6089, + "step": 24740 + }, + { + "epoch": 2.2414894339879052, + "grad_norm": 1.0078332424163818, + "learning_rate": 5.056485229263578e-05, + "loss": 2.5633, + "step": 24741 + }, + { + "epoch": 2.241580032162352, + "grad_norm": 0.9907876253128052, + "learning_rate": 5.055881109164502e-05, + "loss": 2.7103, + "step": 24742 + }, + { + "epoch": 2.241670630336799, + "grad_norm": 0.9581535458564758, + "learning_rate": 5.055276989065426e-05, + "loss": 2.4987, + "step": 24743 + }, + { + "epoch": 2.2417612285112454, + "grad_norm": 0.9271909594535828, + "learning_rate": 5.05467286896635e-05, + "loss": 2.44, + "step": 24744 + }, + { + "epoch": 2.2418518266856924, + "grad_norm": 0.8240542411804199, + "learning_rate": 5.0540687488672755e-05, + "loss": 2.0475, + "step": 24745 + }, + { + "epoch": 2.241942424860139, + "grad_norm": 1.0923285484313965, + "learning_rate": 5.0534646287681996e-05, + "loss": 2.5953, + "step": 24746 + }, + { + "epoch": 2.242033023034586, + "grad_norm": 0.9838476777076721, + "learning_rate": 5.052860508669124e-05, + "loss": 2.4301, + "step": 24747 + }, + { + "epoch": 2.2421236212090325, + "grad_norm": 1.0020064115524292, + "learning_rate": 5.052256388570048e-05, + "loss": 2.6533, + "step": 24748 + }, + { + "epoch": 2.2422142193834795, + "grad_norm": 1.029296875, + "learning_rate": 5.051652268470972e-05, + "loss": 2.5277, + "step": 24749 + }, + { + "epoch": 2.242304817557926, + "grad_norm": 1.0249818563461304, + "learning_rate": 5.0510481483718966e-05, + "loss": 2.5021, + "step": 24750 + }, + { + "epoch": 2.242395415732373, + "grad_norm": 0.9514363408088684, + "learning_rate": 5.050444028272821e-05, + "loss": 2.3991, + "step": 24751 + }, + { + "epoch": 2.2424860139068197, + "grad_norm": 0.9823819994926453, + "learning_rate": 5.049839908173745e-05, + "loss": 2.5749, + "step": 24752 + }, + { + "epoch": 2.2425766120812667, + "grad_norm": 0.974040687084198, + "learning_rate": 5.04923578807467e-05, + "loss": 2.532, + "step": 24753 + }, + { + "epoch": 2.2426672102557133, + "grad_norm": 0.9499194025993347, + "learning_rate": 5.048631667975594e-05, + "loss": 2.5929, + "step": 24754 + }, + { + "epoch": 2.2427578084301603, + "grad_norm": 0.9587731957435608, + "learning_rate": 5.0480275478765184e-05, + "loss": 2.6361, + "step": 24755 + }, + { + "epoch": 2.242848406604607, + "grad_norm": 0.8820648789405823, + "learning_rate": 5.0474234277774424e-05, + "loss": 1.8163, + "step": 24756 + }, + { + "epoch": 2.242939004779054, + "grad_norm": 0.9705950021743774, + "learning_rate": 5.0468193076783665e-05, + "loss": 2.7141, + "step": 24757 + }, + { + "epoch": 2.2430296029535004, + "grad_norm": 0.8301183581352234, + "learning_rate": 5.0462151875792906e-05, + "loss": 1.9323, + "step": 24758 + }, + { + "epoch": 2.2431202011279474, + "grad_norm": 0.9585245251655579, + "learning_rate": 5.045611067480215e-05, + "loss": 2.5669, + "step": 24759 + }, + { + "epoch": 2.243210799302394, + "grad_norm": 0.94454425573349, + "learning_rate": 5.04500694738114e-05, + "loss": 2.3587, + "step": 24760 + }, + { + "epoch": 2.243301397476841, + "grad_norm": 1.0049580335617065, + "learning_rate": 5.044402827282064e-05, + "loss": 2.6352, + "step": 24761 + }, + { + "epoch": 2.2433919956512876, + "grad_norm": 0.8877586722373962, + "learning_rate": 5.043798707182988e-05, + "loss": 2.1812, + "step": 24762 + }, + { + "epoch": 2.2434825938257346, + "grad_norm": 0.9553864598274231, + "learning_rate": 5.0431945870839124e-05, + "loss": 1.9347, + "step": 24763 + }, + { + "epoch": 2.243573192000181, + "grad_norm": 0.9479913711547852, + "learning_rate": 5.0425904669848365e-05, + "loss": 2.4943, + "step": 24764 + }, + { + "epoch": 2.243663790174628, + "grad_norm": 1.104378581047058, + "learning_rate": 5.0419863468857606e-05, + "loss": 2.6101, + "step": 24765 + }, + { + "epoch": 2.2437543883490747, + "grad_norm": 0.9042251110076904, + "learning_rate": 5.041382226786685e-05, + "loss": 2.6539, + "step": 24766 + }, + { + "epoch": 2.2438449865235217, + "grad_norm": 0.8436914086341858, + "learning_rate": 5.0407781066876094e-05, + "loss": 1.964, + "step": 24767 + }, + { + "epoch": 2.2439355846979683, + "grad_norm": 0.9295493364334106, + "learning_rate": 5.040173986588534e-05, + "loss": 2.6416, + "step": 24768 + }, + { + "epoch": 2.2440261828724153, + "grad_norm": 1.0726344585418701, + "learning_rate": 5.039569866489459e-05, + "loss": 2.4683, + "step": 24769 + }, + { + "epoch": 2.244116781046862, + "grad_norm": 1.1738942861557007, + "learning_rate": 5.038965746390383e-05, + "loss": 2.5107, + "step": 24770 + }, + { + "epoch": 2.244207379221309, + "grad_norm": 1.0878804922103882, + "learning_rate": 5.038361626291307e-05, + "loss": 2.5087, + "step": 24771 + }, + { + "epoch": 2.2442979773957554, + "grad_norm": 0.9594277739524841, + "learning_rate": 5.037757506192231e-05, + "loss": 2.6323, + "step": 24772 + }, + { + "epoch": 2.2443885755702024, + "grad_norm": 0.9100306630134583, + "learning_rate": 5.037153386093155e-05, + "loss": 1.9768, + "step": 24773 + }, + { + "epoch": 2.244479173744649, + "grad_norm": 0.878053605556488, + "learning_rate": 5.036549265994079e-05, + "loss": 1.879, + "step": 24774 + }, + { + "epoch": 2.244569771919096, + "grad_norm": 0.8730630278587341, + "learning_rate": 5.035945145895005e-05, + "loss": 1.8924, + "step": 24775 + }, + { + "epoch": 2.2446603700935426, + "grad_norm": 0.9584820866584778, + "learning_rate": 5.035341025795929e-05, + "loss": 2.6093, + "step": 24776 + }, + { + "epoch": 2.2447509682679896, + "grad_norm": 0.9718323945999146, + "learning_rate": 5.034736905696853e-05, + "loss": 2.6515, + "step": 24777 + }, + { + "epoch": 2.244841566442436, + "grad_norm": 1.0047194957733154, + "learning_rate": 5.034132785597777e-05, + "loss": 2.5509, + "step": 24778 + }, + { + "epoch": 2.244932164616883, + "grad_norm": 1.0062021017074585, + "learning_rate": 5.033528665498701e-05, + "loss": 2.601, + "step": 24779 + }, + { + "epoch": 2.2450227627913297, + "grad_norm": 0.9948665499687195, + "learning_rate": 5.032924545399625e-05, + "loss": 2.3797, + "step": 24780 + }, + { + "epoch": 2.2451133609657767, + "grad_norm": 0.9745914936065674, + "learning_rate": 5.032320425300549e-05, + "loss": 2.4323, + "step": 24781 + }, + { + "epoch": 2.2452039591402233, + "grad_norm": 0.9879051446914673, + "learning_rate": 5.031716305201474e-05, + "loss": 2.6776, + "step": 24782 + }, + { + "epoch": 2.2452945573146703, + "grad_norm": 1.016187071800232, + "learning_rate": 5.031112185102399e-05, + "loss": 2.5668, + "step": 24783 + }, + { + "epoch": 2.245385155489117, + "grad_norm": 0.9885455965995789, + "learning_rate": 5.030508065003323e-05, + "loss": 2.6158, + "step": 24784 + }, + { + "epoch": 2.2454757536635634, + "grad_norm": 0.986340343952179, + "learning_rate": 5.0299039449042476e-05, + "loss": 2.7536, + "step": 24785 + }, + { + "epoch": 2.2455663518380105, + "grad_norm": 1.099155306816101, + "learning_rate": 5.029299824805172e-05, + "loss": 2.5701, + "step": 24786 + }, + { + "epoch": 2.2456569500124575, + "grad_norm": 0.9829861521720886, + "learning_rate": 5.028695704706096e-05, + "loss": 2.4973, + "step": 24787 + }, + { + "epoch": 2.245747548186904, + "grad_norm": 1.0191826820373535, + "learning_rate": 5.02809158460702e-05, + "loss": 2.8406, + "step": 24788 + }, + { + "epoch": 2.2458381463613506, + "grad_norm": 0.9828197956085205, + "learning_rate": 5.027487464507944e-05, + "loss": 2.5268, + "step": 24789 + }, + { + "epoch": 2.2459287445357976, + "grad_norm": 1.0244979858398438, + "learning_rate": 5.0268833444088694e-05, + "loss": 2.6055, + "step": 24790 + }, + { + "epoch": 2.2460193427102446, + "grad_norm": 0.8534203171730042, + "learning_rate": 5.0262792243097935e-05, + "loss": 1.7415, + "step": 24791 + }, + { + "epoch": 2.246109940884691, + "grad_norm": 1.0744869709014893, + "learning_rate": 5.0256751042107176e-05, + "loss": 2.5849, + "step": 24792 + }, + { + "epoch": 2.2462005390591377, + "grad_norm": 0.978415310382843, + "learning_rate": 5.0250709841116416e-05, + "loss": 2.6007, + "step": 24793 + }, + { + "epoch": 2.2462911372335848, + "grad_norm": 0.990895688533783, + "learning_rate": 5.024466864012566e-05, + "loss": 2.7679, + "step": 24794 + }, + { + "epoch": 2.2463817354080313, + "grad_norm": 1.0069667100906372, + "learning_rate": 5.02386274391349e-05, + "loss": 2.5783, + "step": 24795 + }, + { + "epoch": 2.2464723335824783, + "grad_norm": 1.1142871379852295, + "learning_rate": 5.023258623814414e-05, + "loss": 2.5726, + "step": 24796 + }, + { + "epoch": 2.246562931756925, + "grad_norm": 0.9945904016494751, + "learning_rate": 5.022654503715338e-05, + "loss": 2.546, + "step": 24797 + }, + { + "epoch": 2.246653529931372, + "grad_norm": 1.0289833545684814, + "learning_rate": 5.0220503836162634e-05, + "loss": 2.5437, + "step": 24798 + }, + { + "epoch": 2.2467441281058185, + "grad_norm": 1.0079370737075806, + "learning_rate": 5.0214462635171875e-05, + "loss": 2.3039, + "step": 24799 + }, + { + "epoch": 2.2468347262802655, + "grad_norm": 0.9604512453079224, + "learning_rate": 5.0208421434181116e-05, + "loss": 2.8751, + "step": 24800 + }, + { + "epoch": 2.246925324454712, + "grad_norm": 1.1700340509414673, + "learning_rate": 5.0202380233190363e-05, + "loss": 2.7878, + "step": 24801 + }, + { + "epoch": 2.247015922629159, + "grad_norm": 1.037068247795105, + "learning_rate": 5.0196339032199604e-05, + "loss": 2.7644, + "step": 24802 + }, + { + "epoch": 2.2471065208036056, + "grad_norm": 0.8544146418571472, + "learning_rate": 5.0190297831208845e-05, + "loss": 2.0443, + "step": 24803 + }, + { + "epoch": 2.2471971189780526, + "grad_norm": 1.012000322341919, + "learning_rate": 5.0184256630218086e-05, + "loss": 2.7792, + "step": 24804 + }, + { + "epoch": 2.247287717152499, + "grad_norm": 1.0595002174377441, + "learning_rate": 5.017821542922734e-05, + "loss": 2.5749, + "step": 24805 + }, + { + "epoch": 2.247378315326946, + "grad_norm": 0.970798909664154, + "learning_rate": 5.017217422823658e-05, + "loss": 2.6129, + "step": 24806 + }, + { + "epoch": 2.2474689135013928, + "grad_norm": 1.0096614360809326, + "learning_rate": 5.016613302724582e-05, + "loss": 2.3845, + "step": 24807 + }, + { + "epoch": 2.24755951167584, + "grad_norm": 1.0252060890197754, + "learning_rate": 5.016009182625506e-05, + "loss": 2.6177, + "step": 24808 + }, + { + "epoch": 2.2476501098502863, + "grad_norm": 0.8472386598587036, + "learning_rate": 5.0154050625264304e-05, + "loss": 1.7827, + "step": 24809 + }, + { + "epoch": 2.2477407080247334, + "grad_norm": 1.0299391746520996, + "learning_rate": 5.0148009424273544e-05, + "loss": 2.4346, + "step": 24810 + }, + { + "epoch": 2.24783130619918, + "grad_norm": 1.087256908416748, + "learning_rate": 5.0141968223282785e-05, + "loss": 2.5487, + "step": 24811 + }, + { + "epoch": 2.247921904373627, + "grad_norm": 0.9505580067634583, + "learning_rate": 5.0135927022292026e-05, + "loss": 2.56, + "step": 24812 + }, + { + "epoch": 2.2480125025480735, + "grad_norm": 0.9971149563789368, + "learning_rate": 5.012988582130128e-05, + "loss": 2.6031, + "step": 24813 + }, + { + "epoch": 2.2481031007225205, + "grad_norm": 0.9237697124481201, + "learning_rate": 5.012384462031052e-05, + "loss": 2.5096, + "step": 24814 + }, + { + "epoch": 2.248193698896967, + "grad_norm": 1.011405110359192, + "learning_rate": 5.011780341931976e-05, + "loss": 2.6535, + "step": 24815 + }, + { + "epoch": 2.248284297071414, + "grad_norm": 1.1132527589797974, + "learning_rate": 5.0111762218329e-05, + "loss": 2.4904, + "step": 24816 + }, + { + "epoch": 2.2483748952458606, + "grad_norm": 0.9419025778770447, + "learning_rate": 5.010572101733825e-05, + "loss": 2.5829, + "step": 24817 + }, + { + "epoch": 2.2484654934203077, + "grad_norm": 0.9917271137237549, + "learning_rate": 5.009967981634749e-05, + "loss": 2.5848, + "step": 24818 + }, + { + "epoch": 2.2485560915947542, + "grad_norm": 1.015679955482483, + "learning_rate": 5.009363861535673e-05, + "loss": 2.4844, + "step": 24819 + }, + { + "epoch": 2.2486466897692012, + "grad_norm": 1.0697906017303467, + "learning_rate": 5.008759741436598e-05, + "loss": 2.5415, + "step": 24820 + }, + { + "epoch": 2.248737287943648, + "grad_norm": 1.019416332244873, + "learning_rate": 5.008155621337523e-05, + "loss": 2.5017, + "step": 24821 + }, + { + "epoch": 2.248827886118095, + "grad_norm": 1.1268612146377563, + "learning_rate": 5.007551501238447e-05, + "loss": 2.3792, + "step": 24822 + }, + { + "epoch": 2.2489184842925414, + "grad_norm": 1.0183300971984863, + "learning_rate": 5.006947381139371e-05, + "loss": 2.4202, + "step": 24823 + }, + { + "epoch": 2.2490090824669884, + "grad_norm": 0.9927284717559814, + "learning_rate": 5.006343261040295e-05, + "loss": 2.489, + "step": 24824 + }, + { + "epoch": 2.249099680641435, + "grad_norm": 1.0540847778320312, + "learning_rate": 5.005739140941219e-05, + "loss": 2.7597, + "step": 24825 + }, + { + "epoch": 2.249190278815882, + "grad_norm": 1.006609320640564, + "learning_rate": 5.005135020842143e-05, + "loss": 2.7443, + "step": 24826 + }, + { + "epoch": 2.2492808769903285, + "grad_norm": 1.0831180810928345, + "learning_rate": 5.004530900743067e-05, + "loss": 2.5438, + "step": 24827 + }, + { + "epoch": 2.2493714751647755, + "grad_norm": 1.0015053749084473, + "learning_rate": 5.003926780643993e-05, + "loss": 2.596, + "step": 24828 + }, + { + "epoch": 2.249462073339222, + "grad_norm": 1.1080050468444824, + "learning_rate": 5.003322660544917e-05, + "loss": 2.7447, + "step": 24829 + }, + { + "epoch": 2.249552671513669, + "grad_norm": 1.037483811378479, + "learning_rate": 5.002718540445841e-05, + "loss": 2.8508, + "step": 24830 + }, + { + "epoch": 2.2496432696881157, + "grad_norm": 0.9553998112678528, + "learning_rate": 5.002114420346765e-05, + "loss": 2.3665, + "step": 24831 + }, + { + "epoch": 2.2497338678625627, + "grad_norm": 1.0202945470809937, + "learning_rate": 5.001510300247689e-05, + "loss": 2.6779, + "step": 24832 + }, + { + "epoch": 2.2498244660370093, + "grad_norm": 0.8893454074859619, + "learning_rate": 5.000906180148614e-05, + "loss": 2.2789, + "step": 24833 + }, + { + "epoch": 2.2499150642114563, + "grad_norm": 1.028976559638977, + "learning_rate": 5.000302060049538e-05, + "loss": 2.5401, + "step": 24834 + }, + { + "epoch": 2.250005662385903, + "grad_norm": 1.0011934041976929, + "learning_rate": 4.9996979399504626e-05, + "loss": 2.7277, + "step": 24835 + }, + { + "epoch": 2.25009626056035, + "grad_norm": 1.0182759761810303, + "learning_rate": 4.999093819851387e-05, + "loss": 2.269, + "step": 24836 + }, + { + "epoch": 2.2501868587347964, + "grad_norm": 1.0412623882293701, + "learning_rate": 4.9984896997523115e-05, + "loss": 3.1269, + "step": 24837 + }, + { + "epoch": 2.2502774569092434, + "grad_norm": 1.0767921209335327, + "learning_rate": 4.9978855796532355e-05, + "loss": 2.417, + "step": 24838 + }, + { + "epoch": 2.25036805508369, + "grad_norm": 1.0080901384353638, + "learning_rate": 4.9972814595541596e-05, + "loss": 2.4771, + "step": 24839 + }, + { + "epoch": 2.250458653258137, + "grad_norm": 1.0907548666000366, + "learning_rate": 4.996677339455084e-05, + "loss": 1.8862, + "step": 24840 + }, + { + "epoch": 2.2505492514325836, + "grad_norm": 1.0199816226959229, + "learning_rate": 4.9960732193560085e-05, + "loss": 2.9324, + "step": 24841 + }, + { + "epoch": 2.2506398496070306, + "grad_norm": 1.0322962999343872, + "learning_rate": 4.9954690992569326e-05, + "loss": 2.8255, + "step": 24842 + }, + { + "epoch": 2.250730447781477, + "grad_norm": 0.8462657332420349, + "learning_rate": 4.9948649791578566e-05, + "loss": 1.8525, + "step": 24843 + }, + { + "epoch": 2.250821045955924, + "grad_norm": 1.0623778104782104, + "learning_rate": 4.9942608590587814e-05, + "loss": 2.6376, + "step": 24844 + }, + { + "epoch": 2.2509116441303707, + "grad_norm": 0.9590023159980774, + "learning_rate": 4.9936567389597055e-05, + "loss": 2.7706, + "step": 24845 + }, + { + "epoch": 2.2510022423048177, + "grad_norm": 1.138452172279358, + "learning_rate": 4.9930526188606296e-05, + "loss": 2.6106, + "step": 24846 + }, + { + "epoch": 2.2510928404792643, + "grad_norm": 0.9897506237030029, + "learning_rate": 4.9924484987615536e-05, + "loss": 2.7887, + "step": 24847 + }, + { + "epoch": 2.2511834386537113, + "grad_norm": 1.173161268234253, + "learning_rate": 4.9918443786624784e-05, + "loss": 2.4979, + "step": 24848 + }, + { + "epoch": 2.251274036828158, + "grad_norm": 1.127234935760498, + "learning_rate": 4.9912402585634025e-05, + "loss": 2.6658, + "step": 24849 + }, + { + "epoch": 2.251364635002605, + "grad_norm": 0.9935876727104187, + "learning_rate": 4.9906361384643266e-05, + "loss": 2.5674, + "step": 24850 + }, + { + "epoch": 2.2514552331770514, + "grad_norm": 1.0260556936264038, + "learning_rate": 4.990032018365251e-05, + "loss": 2.6947, + "step": 24851 + }, + { + "epoch": 2.2515458313514984, + "grad_norm": 1.0381335020065308, + "learning_rate": 4.9894278982661754e-05, + "loss": 2.7009, + "step": 24852 + }, + { + "epoch": 2.251636429525945, + "grad_norm": 1.0802897214889526, + "learning_rate": 4.9888237781671e-05, + "loss": 2.5289, + "step": 24853 + }, + { + "epoch": 2.251727027700392, + "grad_norm": 0.9856101870536804, + "learning_rate": 4.988219658068024e-05, + "loss": 2.7515, + "step": 24854 + }, + { + "epoch": 2.2518176258748386, + "grad_norm": 0.9885389804840088, + "learning_rate": 4.9876155379689483e-05, + "loss": 2.5179, + "step": 24855 + }, + { + "epoch": 2.2519082240492856, + "grad_norm": 0.8535187840461731, + "learning_rate": 4.987011417869873e-05, + "loss": 2.1551, + "step": 24856 + }, + { + "epoch": 2.251998822223732, + "grad_norm": 0.9507065415382385, + "learning_rate": 4.986407297770797e-05, + "loss": 2.4485, + "step": 24857 + }, + { + "epoch": 2.252089420398179, + "grad_norm": 0.994730532169342, + "learning_rate": 4.985803177671721e-05, + "loss": 2.3188, + "step": 24858 + }, + { + "epoch": 2.2521800185726257, + "grad_norm": 1.0513179302215576, + "learning_rate": 4.985199057572646e-05, + "loss": 2.585, + "step": 24859 + }, + { + "epoch": 2.2522706167470727, + "grad_norm": 0.925925612449646, + "learning_rate": 4.98459493747357e-05, + "loss": 2.565, + "step": 24860 + }, + { + "epoch": 2.2523612149215193, + "grad_norm": 1.1440967321395874, + "learning_rate": 4.983990817374494e-05, + "loss": 2.4493, + "step": 24861 + }, + { + "epoch": 2.2524518130959663, + "grad_norm": 0.9586418867111206, + "learning_rate": 4.983386697275418e-05, + "loss": 2.682, + "step": 24862 + }, + { + "epoch": 2.252542411270413, + "grad_norm": 0.9426853060722351, + "learning_rate": 4.982782577176343e-05, + "loss": 2.719, + "step": 24863 + }, + { + "epoch": 2.2526330094448594, + "grad_norm": 1.0769182443618774, + "learning_rate": 4.982178457077267e-05, + "loss": 2.7243, + "step": 24864 + }, + { + "epoch": 2.2527236076193065, + "grad_norm": 1.019359827041626, + "learning_rate": 4.981574336978191e-05, + "loss": 2.5026, + "step": 24865 + }, + { + "epoch": 2.2528142057937535, + "grad_norm": 0.9287539124488831, + "learning_rate": 4.980970216879115e-05, + "loss": 2.6748, + "step": 24866 + }, + { + "epoch": 2.2529048039682, + "grad_norm": 0.9805046916007996, + "learning_rate": 4.98036609678004e-05, + "loss": 2.591, + "step": 24867 + }, + { + "epoch": 2.2529954021426466, + "grad_norm": 1.0043450593948364, + "learning_rate": 4.979761976680964e-05, + "loss": 2.5248, + "step": 24868 + }, + { + "epoch": 2.2530860003170936, + "grad_norm": 0.9411090612411499, + "learning_rate": 4.979157856581889e-05, + "loss": 2.7225, + "step": 24869 + }, + { + "epoch": 2.2531765984915406, + "grad_norm": 1.0681214332580566, + "learning_rate": 4.978553736482813e-05, + "loss": 2.6755, + "step": 24870 + }, + { + "epoch": 2.253267196665987, + "grad_norm": 0.9523795247077942, + "learning_rate": 4.977949616383738e-05, + "loss": 2.5146, + "step": 24871 + }, + { + "epoch": 2.2533577948404337, + "grad_norm": 1.0113890171051025, + "learning_rate": 4.977345496284662e-05, + "loss": 2.6578, + "step": 24872 + }, + { + "epoch": 2.2534483930148808, + "grad_norm": 0.9660605192184448, + "learning_rate": 4.976741376185586e-05, + "loss": 2.7094, + "step": 24873 + }, + { + "epoch": 2.2535389911893278, + "grad_norm": 0.9781202077865601, + "learning_rate": 4.976137256086511e-05, + "loss": 2.7617, + "step": 24874 + }, + { + "epoch": 2.2536295893637743, + "grad_norm": 1.0385668277740479, + "learning_rate": 4.975533135987435e-05, + "loss": 2.934, + "step": 24875 + }, + { + "epoch": 2.253720187538221, + "grad_norm": 0.9257063865661621, + "learning_rate": 4.974929015888359e-05, + "loss": 2.3828, + "step": 24876 + }, + { + "epoch": 2.253810785712668, + "grad_norm": 0.9995795488357544, + "learning_rate": 4.974324895789283e-05, + "loss": 2.493, + "step": 24877 + }, + { + "epoch": 2.253901383887115, + "grad_norm": 1.0293948650360107, + "learning_rate": 4.973720775690208e-05, + "loss": 2.7155, + "step": 24878 + }, + { + "epoch": 2.2539919820615615, + "grad_norm": 1.0142829418182373, + "learning_rate": 4.973116655591132e-05, + "loss": 2.8245, + "step": 24879 + }, + { + "epoch": 2.254082580236008, + "grad_norm": 1.0123143196105957, + "learning_rate": 4.972512535492056e-05, + "loss": 2.4459, + "step": 24880 + }, + { + "epoch": 2.254173178410455, + "grad_norm": 1.0102880001068115, + "learning_rate": 4.97190841539298e-05, + "loss": 2.5405, + "step": 24881 + }, + { + "epoch": 2.2542637765849016, + "grad_norm": 1.0895315408706665, + "learning_rate": 4.971304295293905e-05, + "loss": 2.5793, + "step": 24882 + }, + { + "epoch": 2.2543543747593486, + "grad_norm": 1.007033109664917, + "learning_rate": 4.970700175194829e-05, + "loss": 2.3443, + "step": 24883 + }, + { + "epoch": 2.254444972933795, + "grad_norm": 1.135168433189392, + "learning_rate": 4.970096055095753e-05, + "loss": 2.5814, + "step": 24884 + }, + { + "epoch": 2.254535571108242, + "grad_norm": 1.1269092559814453, + "learning_rate": 4.9694919349966776e-05, + "loss": 2.4563, + "step": 24885 + }, + { + "epoch": 2.2546261692826888, + "grad_norm": 0.9378727078437805, + "learning_rate": 4.968887814897602e-05, + "loss": 2.4073, + "step": 24886 + }, + { + "epoch": 2.254716767457136, + "grad_norm": 0.8693758845329285, + "learning_rate": 4.9682836947985265e-05, + "loss": 1.9444, + "step": 24887 + }, + { + "epoch": 2.2548073656315823, + "grad_norm": 1.110587239265442, + "learning_rate": 4.9676795746994505e-05, + "loss": 2.8279, + "step": 24888 + }, + { + "epoch": 2.2548979638060294, + "grad_norm": 0.9355151653289795, + "learning_rate": 4.967075454600375e-05, + "loss": 2.0931, + "step": 24889 + }, + { + "epoch": 2.254988561980476, + "grad_norm": 0.9927013516426086, + "learning_rate": 4.9664713345012994e-05, + "loss": 2.5557, + "step": 24890 + }, + { + "epoch": 2.255079160154923, + "grad_norm": 1.0509042739868164, + "learning_rate": 4.9658672144022235e-05, + "loss": 2.7064, + "step": 24891 + }, + { + "epoch": 2.2551697583293695, + "grad_norm": 0.9636088013648987, + "learning_rate": 4.9652630943031475e-05, + "loss": 2.5552, + "step": 24892 + }, + { + "epoch": 2.2552603565038165, + "grad_norm": 0.9715813994407654, + "learning_rate": 4.964658974204072e-05, + "loss": 2.4828, + "step": 24893 + }, + { + "epoch": 2.255350954678263, + "grad_norm": 0.9966575503349304, + "learning_rate": 4.9640548541049964e-05, + "loss": 2.3821, + "step": 24894 + }, + { + "epoch": 2.25544155285271, + "grad_norm": 0.957507312297821, + "learning_rate": 4.9634507340059205e-05, + "loss": 2.5871, + "step": 24895 + }, + { + "epoch": 2.2555321510271567, + "grad_norm": 1.0818148851394653, + "learning_rate": 4.9628466139068446e-05, + "loss": 2.674, + "step": 24896 + }, + { + "epoch": 2.2556227492016037, + "grad_norm": 1.0122092962265015, + "learning_rate": 4.962242493807769e-05, + "loss": 2.8442, + "step": 24897 + }, + { + "epoch": 2.2557133473760502, + "grad_norm": 1.0196595191955566, + "learning_rate": 4.9616383737086934e-05, + "loss": 2.6562, + "step": 24898 + }, + { + "epoch": 2.2558039455504972, + "grad_norm": 1.0298397541046143, + "learning_rate": 4.9610342536096175e-05, + "loss": 2.6655, + "step": 24899 + }, + { + "epoch": 2.255894543724944, + "grad_norm": 0.9770535826683044, + "learning_rate": 4.9604301335105416e-05, + "loss": 2.5512, + "step": 24900 + }, + { + "epoch": 2.255985141899391, + "grad_norm": 0.9460216760635376, + "learning_rate": 4.959826013411466e-05, + "loss": 2.7293, + "step": 24901 + }, + { + "epoch": 2.2560757400738374, + "grad_norm": 0.9436200857162476, + "learning_rate": 4.9592218933123904e-05, + "loss": 2.5788, + "step": 24902 + }, + { + "epoch": 2.2561663382482844, + "grad_norm": 1.033313274383545, + "learning_rate": 4.958617773213315e-05, + "loss": 2.3965, + "step": 24903 + }, + { + "epoch": 2.256256936422731, + "grad_norm": 1.0073697566986084, + "learning_rate": 4.95801365311424e-05, + "loss": 2.5636, + "step": 24904 + }, + { + "epoch": 2.256347534597178, + "grad_norm": 1.0686179399490356, + "learning_rate": 4.957409533015164e-05, + "loss": 2.4987, + "step": 24905 + }, + { + "epoch": 2.2564381327716245, + "grad_norm": 1.0164499282836914, + "learning_rate": 4.956805412916088e-05, + "loss": 2.76, + "step": 24906 + }, + { + "epoch": 2.2565287309460715, + "grad_norm": 1.0571763515472412, + "learning_rate": 4.956201292817012e-05, + "loss": 2.4969, + "step": 24907 + }, + { + "epoch": 2.256619329120518, + "grad_norm": 0.9875427484512329, + "learning_rate": 4.955597172717937e-05, + "loss": 2.6714, + "step": 24908 + }, + { + "epoch": 2.256709927294965, + "grad_norm": 0.8490709662437439, + "learning_rate": 4.954993052618861e-05, + "loss": 1.8884, + "step": 24909 + }, + { + "epoch": 2.2568005254694117, + "grad_norm": 0.9791225790977478, + "learning_rate": 4.954388932519785e-05, + "loss": 2.6319, + "step": 24910 + }, + { + "epoch": 2.2568911236438587, + "grad_norm": 1.0722395181655884, + "learning_rate": 4.953784812420709e-05, + "loss": 2.7134, + "step": 24911 + }, + { + "epoch": 2.2569817218183053, + "grad_norm": 1.0034302473068237, + "learning_rate": 4.953180692321634e-05, + "loss": 2.5478, + "step": 24912 + }, + { + "epoch": 2.2570723199927523, + "grad_norm": 1.0317764282226562, + "learning_rate": 4.952576572222558e-05, + "loss": 2.89, + "step": 24913 + }, + { + "epoch": 2.257162918167199, + "grad_norm": 0.8821635246276855, + "learning_rate": 4.951972452123482e-05, + "loss": 1.9616, + "step": 24914 + }, + { + "epoch": 2.257253516341646, + "grad_norm": 0.9394722580909729, + "learning_rate": 4.951368332024406e-05, + "loss": 2.5298, + "step": 24915 + }, + { + "epoch": 2.2573441145160924, + "grad_norm": 1.0745424032211304, + "learning_rate": 4.950764211925331e-05, + "loss": 2.5807, + "step": 24916 + }, + { + "epoch": 2.2574347126905394, + "grad_norm": 1.0009328126907349, + "learning_rate": 4.950160091826255e-05, + "loss": 2.4028, + "step": 24917 + }, + { + "epoch": 2.257525310864986, + "grad_norm": 1.0768331289291382, + "learning_rate": 4.949555971727179e-05, + "loss": 2.4005, + "step": 24918 + }, + { + "epoch": 2.257615909039433, + "grad_norm": 1.0722687244415283, + "learning_rate": 4.948951851628104e-05, + "loss": 2.6841, + "step": 24919 + }, + { + "epoch": 2.2577065072138796, + "grad_norm": 1.0835223197937012, + "learning_rate": 4.9483477315290286e-05, + "loss": 2.8011, + "step": 24920 + }, + { + "epoch": 2.2577971053883266, + "grad_norm": 0.9859968423843384, + "learning_rate": 4.947743611429953e-05, + "loss": 2.6487, + "step": 24921 + }, + { + "epoch": 2.257887703562773, + "grad_norm": 0.8732780814170837, + "learning_rate": 4.947139491330877e-05, + "loss": 1.9225, + "step": 24922 + }, + { + "epoch": 2.25797830173722, + "grad_norm": 1.0319424867630005, + "learning_rate": 4.9465353712318016e-05, + "loss": 2.6393, + "step": 24923 + }, + { + "epoch": 2.2580688999116667, + "grad_norm": 1.0620331764221191, + "learning_rate": 4.9459312511327257e-05, + "loss": 2.6457, + "step": 24924 + }, + { + "epoch": 2.2581594980861137, + "grad_norm": 0.9489583373069763, + "learning_rate": 4.94532713103365e-05, + "loss": 2.4811, + "step": 24925 + }, + { + "epoch": 2.2582500962605603, + "grad_norm": 1.0487778186798096, + "learning_rate": 4.944723010934574e-05, + "loss": 2.4745, + "step": 24926 + }, + { + "epoch": 2.2583406944350073, + "grad_norm": 1.0314431190490723, + "learning_rate": 4.9441188908354986e-05, + "loss": 2.5958, + "step": 24927 + }, + { + "epoch": 2.258431292609454, + "grad_norm": 0.9548390507698059, + "learning_rate": 4.943514770736423e-05, + "loss": 2.5424, + "step": 24928 + }, + { + "epoch": 2.258521890783901, + "grad_norm": 1.2657654285430908, + "learning_rate": 4.942910650637347e-05, + "loss": 2.4149, + "step": 24929 + }, + { + "epoch": 2.2586124889583474, + "grad_norm": 0.9949572682380676, + "learning_rate": 4.942306530538271e-05, + "loss": 2.7091, + "step": 24930 + }, + { + "epoch": 2.2587030871327944, + "grad_norm": 0.9175436496734619, + "learning_rate": 4.9417024104391956e-05, + "loss": 2.1271, + "step": 24931 + }, + { + "epoch": 2.258793685307241, + "grad_norm": 1.0436838865280151, + "learning_rate": 4.94109829034012e-05, + "loss": 3.1063, + "step": 24932 + }, + { + "epoch": 2.258884283481688, + "grad_norm": 0.8684737086296082, + "learning_rate": 4.940494170241044e-05, + "loss": 2.3828, + "step": 24933 + }, + { + "epoch": 2.2589748816561346, + "grad_norm": 0.9907168745994568, + "learning_rate": 4.9398900501419685e-05, + "loss": 2.5672, + "step": 24934 + }, + { + "epoch": 2.2590654798305816, + "grad_norm": 0.9226785898208618, + "learning_rate": 4.9392859300428926e-05, + "loss": 2.5056, + "step": 24935 + }, + { + "epoch": 2.259156078005028, + "grad_norm": 1.0055439472198486, + "learning_rate": 4.938681809943817e-05, + "loss": 2.6166, + "step": 24936 + }, + { + "epoch": 2.259246676179475, + "grad_norm": 0.9822160005569458, + "learning_rate": 4.9380776898447414e-05, + "loss": 2.808, + "step": 24937 + }, + { + "epoch": 2.2593372743539217, + "grad_norm": 1.1060084104537964, + "learning_rate": 4.937473569745666e-05, + "loss": 2.4537, + "step": 24938 + }, + { + "epoch": 2.2594278725283687, + "grad_norm": 1.1424639225006104, + "learning_rate": 4.93686944964659e-05, + "loss": 2.544, + "step": 24939 + }, + { + "epoch": 2.2595184707028153, + "grad_norm": 0.9661195278167725, + "learning_rate": 4.9362653295475144e-05, + "loss": 2.5137, + "step": 24940 + }, + { + "epoch": 2.2596090688772623, + "grad_norm": 0.9529351592063904, + "learning_rate": 4.9356612094484385e-05, + "loss": 2.6465, + "step": 24941 + }, + { + "epoch": 2.259699667051709, + "grad_norm": 1.0613515377044678, + "learning_rate": 4.935057089349363e-05, + "loss": 2.8995, + "step": 24942 + }, + { + "epoch": 2.2597902652261554, + "grad_norm": 0.9947753548622131, + "learning_rate": 4.934452969250287e-05, + "loss": 2.6601, + "step": 24943 + }, + { + "epoch": 2.2598808634006025, + "grad_norm": 1.0297737121582031, + "learning_rate": 4.9338488491512114e-05, + "loss": 2.7807, + "step": 24944 + }, + { + "epoch": 2.2599714615750495, + "grad_norm": 0.98881995677948, + "learning_rate": 4.9332447290521355e-05, + "loss": 2.4763, + "step": 24945 + }, + { + "epoch": 2.260062059749496, + "grad_norm": 0.9876521229743958, + "learning_rate": 4.93264060895306e-05, + "loss": 2.8536, + "step": 24946 + }, + { + "epoch": 2.2601526579239426, + "grad_norm": 0.9630858898162842, + "learning_rate": 4.932036488853984e-05, + "loss": 2.5093, + "step": 24947 + }, + { + "epoch": 2.2602432560983896, + "grad_norm": 1.0678987503051758, + "learning_rate": 4.9314323687549084e-05, + "loss": 2.5165, + "step": 24948 + }, + { + "epoch": 2.2603338542728366, + "grad_norm": 1.018458604812622, + "learning_rate": 4.930828248655833e-05, + "loss": 2.4239, + "step": 24949 + }, + { + "epoch": 2.260424452447283, + "grad_norm": 1.011821985244751, + "learning_rate": 4.930224128556757e-05, + "loss": 2.4801, + "step": 24950 + }, + { + "epoch": 2.2605150506217297, + "grad_norm": 1.0136713981628418, + "learning_rate": 4.929620008457681e-05, + "loss": 2.8332, + "step": 24951 + }, + { + "epoch": 2.2606056487961768, + "grad_norm": 0.9352036714553833, + "learning_rate": 4.9290158883586054e-05, + "loss": 2.5831, + "step": 24952 + }, + { + "epoch": 2.2606962469706238, + "grad_norm": 0.8982617259025574, + "learning_rate": 4.92841176825953e-05, + "loss": 2.6545, + "step": 24953 + }, + { + "epoch": 2.2607868451450703, + "grad_norm": 1.0097507238388062, + "learning_rate": 4.927807648160455e-05, + "loss": 2.5035, + "step": 24954 + }, + { + "epoch": 2.260877443319517, + "grad_norm": 0.9419211745262146, + "learning_rate": 4.927203528061379e-05, + "loss": 1.9833, + "step": 24955 + }, + { + "epoch": 2.260968041493964, + "grad_norm": 1.0096056461334229, + "learning_rate": 4.926599407962303e-05, + "loss": 2.5439, + "step": 24956 + }, + { + "epoch": 2.261058639668411, + "grad_norm": 0.9668935537338257, + "learning_rate": 4.925995287863228e-05, + "loss": 2.5261, + "step": 24957 + }, + { + "epoch": 2.2611492378428575, + "grad_norm": 1.1170220375061035, + "learning_rate": 4.925391167764152e-05, + "loss": 3.0166, + "step": 24958 + }, + { + "epoch": 2.261239836017304, + "grad_norm": 0.9720010161399841, + "learning_rate": 4.924787047665076e-05, + "loss": 2.6994, + "step": 24959 + }, + { + "epoch": 2.261330434191751, + "grad_norm": 1.005497932434082, + "learning_rate": 4.924182927566e-05, + "loss": 2.6955, + "step": 24960 + }, + { + "epoch": 2.261421032366198, + "grad_norm": 0.9852657914161682, + "learning_rate": 4.923578807466925e-05, + "loss": 2.7071, + "step": 24961 + }, + { + "epoch": 2.2615116305406446, + "grad_norm": 1.0127036571502686, + "learning_rate": 4.922974687367849e-05, + "loss": 2.5076, + "step": 24962 + }, + { + "epoch": 2.261602228715091, + "grad_norm": 1.0579469203948975, + "learning_rate": 4.922370567268773e-05, + "loss": 2.6783, + "step": 24963 + }, + { + "epoch": 2.261692826889538, + "grad_norm": 1.0569791793823242, + "learning_rate": 4.921766447169698e-05, + "loss": 2.3076, + "step": 24964 + }, + { + "epoch": 2.2617834250639848, + "grad_norm": 0.9975371956825256, + "learning_rate": 4.921162327070622e-05, + "loss": 2.7094, + "step": 24965 + }, + { + "epoch": 2.261874023238432, + "grad_norm": 0.9324051737785339, + "learning_rate": 4.920558206971546e-05, + "loss": 2.6432, + "step": 24966 + }, + { + "epoch": 2.2619646214128784, + "grad_norm": 1.4985949993133545, + "learning_rate": 4.91995408687247e-05, + "loss": 2.5506, + "step": 24967 + }, + { + "epoch": 2.2620552195873254, + "grad_norm": 0.9563661217689514, + "learning_rate": 4.919349966773395e-05, + "loss": 2.6716, + "step": 24968 + }, + { + "epoch": 2.262145817761772, + "grad_norm": 0.9141566157341003, + "learning_rate": 4.918745846674319e-05, + "loss": 2.5677, + "step": 24969 + }, + { + "epoch": 2.262236415936219, + "grad_norm": 0.8537058234214783, + "learning_rate": 4.9181417265752436e-05, + "loss": 1.8702, + "step": 24970 + }, + { + "epoch": 2.2623270141106655, + "grad_norm": 0.9256728291511536, + "learning_rate": 4.917537606476168e-05, + "loss": 2.9168, + "step": 24971 + }, + { + "epoch": 2.2624176122851125, + "grad_norm": 1.0310219526290894, + "learning_rate": 4.9169334863770925e-05, + "loss": 2.5106, + "step": 24972 + }, + { + "epoch": 2.262508210459559, + "grad_norm": 0.9145447611808777, + "learning_rate": 4.9163293662780166e-05, + "loss": 2.2725, + "step": 24973 + }, + { + "epoch": 2.262598808634006, + "grad_norm": 1.0329176187515259, + "learning_rate": 4.9157252461789406e-05, + "loss": 2.6589, + "step": 24974 + }, + { + "epoch": 2.2626894068084527, + "grad_norm": 1.0235460996627808, + "learning_rate": 4.915121126079865e-05, + "loss": 2.7379, + "step": 24975 + }, + { + "epoch": 2.2627800049828997, + "grad_norm": 1.0354163646697998, + "learning_rate": 4.9145170059807895e-05, + "loss": 3.0001, + "step": 24976 + }, + { + "epoch": 2.2628706031573462, + "grad_norm": 0.991800844669342, + "learning_rate": 4.9139128858817136e-05, + "loss": 2.6644, + "step": 24977 + }, + { + "epoch": 2.2629612013317932, + "grad_norm": 0.9984095692634583, + "learning_rate": 4.9133087657826377e-05, + "loss": 2.5421, + "step": 24978 + }, + { + "epoch": 2.26305179950624, + "grad_norm": 1.02716064453125, + "learning_rate": 4.9127046456835624e-05, + "loss": 2.5056, + "step": 24979 + }, + { + "epoch": 2.263142397680687, + "grad_norm": 1.005905270576477, + "learning_rate": 4.9121005255844865e-05, + "loss": 2.9765, + "step": 24980 + }, + { + "epoch": 2.2632329958551334, + "grad_norm": 0.9955169558525085, + "learning_rate": 4.9114964054854106e-05, + "loss": 2.7256, + "step": 24981 + }, + { + "epoch": 2.2633235940295804, + "grad_norm": 1.0321857929229736, + "learning_rate": 4.910892285386335e-05, + "loss": 2.4062, + "step": 24982 + }, + { + "epoch": 2.263414192204027, + "grad_norm": 0.9037665724754333, + "learning_rate": 4.9102881652872594e-05, + "loss": 2.0942, + "step": 24983 + }, + { + "epoch": 2.263504790378474, + "grad_norm": 1.0465688705444336, + "learning_rate": 4.9096840451881835e-05, + "loss": 2.508, + "step": 24984 + }, + { + "epoch": 2.2635953885529205, + "grad_norm": 0.9840750694274902, + "learning_rate": 4.9090799250891076e-05, + "loss": 2.5907, + "step": 24985 + }, + { + "epoch": 2.2636859867273675, + "grad_norm": 0.9509883522987366, + "learning_rate": 4.9084758049900323e-05, + "loss": 2.5391, + "step": 24986 + }, + { + "epoch": 2.263776584901814, + "grad_norm": 1.090024471282959, + "learning_rate": 4.9078716848909564e-05, + "loss": 2.7114, + "step": 24987 + }, + { + "epoch": 2.263867183076261, + "grad_norm": 0.9757659435272217, + "learning_rate": 4.907267564791881e-05, + "loss": 2.5616, + "step": 24988 + }, + { + "epoch": 2.2639577812507077, + "grad_norm": 1.0026711225509644, + "learning_rate": 4.906663444692805e-05, + "loss": 2.3514, + "step": 24989 + }, + { + "epoch": 2.2640483794251547, + "grad_norm": 1.0037872791290283, + "learning_rate": 4.90605932459373e-05, + "loss": 2.5443, + "step": 24990 + }, + { + "epoch": 2.2641389775996013, + "grad_norm": 1.0687873363494873, + "learning_rate": 4.905455204494654e-05, + "loss": 2.702, + "step": 24991 + }, + { + "epoch": 2.2642295757740483, + "grad_norm": 1.011358380317688, + "learning_rate": 4.904851084395578e-05, + "loss": 2.713, + "step": 24992 + }, + { + "epoch": 2.264320173948495, + "grad_norm": 0.9485487937927246, + "learning_rate": 4.904246964296502e-05, + "loss": 2.784, + "step": 24993 + }, + { + "epoch": 2.264410772122942, + "grad_norm": 1.0227335691452026, + "learning_rate": 4.903642844197427e-05, + "loss": 2.6683, + "step": 24994 + }, + { + "epoch": 2.2645013702973884, + "grad_norm": 1.0039204359054565, + "learning_rate": 4.903038724098351e-05, + "loss": 2.6582, + "step": 24995 + }, + { + "epoch": 2.2645919684718354, + "grad_norm": 0.9862052798271179, + "learning_rate": 4.902434603999275e-05, + "loss": 2.6652, + "step": 24996 + }, + { + "epoch": 2.264682566646282, + "grad_norm": 0.9929113984107971, + "learning_rate": 4.901830483900199e-05, + "loss": 2.1431, + "step": 24997 + }, + { + "epoch": 2.264773164820729, + "grad_norm": 0.9807501435279846, + "learning_rate": 4.901226363801124e-05, + "loss": 2.4648, + "step": 24998 + }, + { + "epoch": 2.2648637629951756, + "grad_norm": 0.9915395379066467, + "learning_rate": 4.900622243702048e-05, + "loss": 2.357, + "step": 24999 + }, + { + "epoch": 2.2649543611696226, + "grad_norm": 1.055577039718628, + "learning_rate": 4.900018123602972e-05, + "loss": 2.4453, + "step": 25000 + }, + { + "epoch": 2.265044959344069, + "grad_norm": 1.0222132205963135, + "learning_rate": 4.899414003503896e-05, + "loss": 1.7101, + "step": 25001 + }, + { + "epoch": 2.265135557518516, + "grad_norm": 1.07004976272583, + "learning_rate": 4.898809883404821e-05, + "loss": 2.4177, + "step": 25002 + }, + { + "epoch": 2.2652261556929627, + "grad_norm": 1.013123869895935, + "learning_rate": 4.898205763305745e-05, + "loss": 2.6042, + "step": 25003 + }, + { + "epoch": 2.2653167538674097, + "grad_norm": 0.7822917103767395, + "learning_rate": 4.89760164320667e-05, + "loss": 1.8486, + "step": 25004 + }, + { + "epoch": 2.2654073520418563, + "grad_norm": 1.1429591178894043, + "learning_rate": 4.896997523107594e-05, + "loss": 2.5271, + "step": 25005 + }, + { + "epoch": 2.2654979502163033, + "grad_norm": 0.9242222309112549, + "learning_rate": 4.896393403008519e-05, + "loss": 2.663, + "step": 25006 + }, + { + "epoch": 2.26558854839075, + "grad_norm": 1.1282851696014404, + "learning_rate": 4.895789282909443e-05, + "loss": 2.7596, + "step": 25007 + }, + { + "epoch": 2.265679146565197, + "grad_norm": 1.0768883228302002, + "learning_rate": 4.895185162810367e-05, + "loss": 2.8706, + "step": 25008 + }, + { + "epoch": 2.2657697447396434, + "grad_norm": 1.1603972911834717, + "learning_rate": 4.894581042711292e-05, + "loss": 2.5459, + "step": 25009 + }, + { + "epoch": 2.2658603429140904, + "grad_norm": 1.0201367139816284, + "learning_rate": 4.893976922612216e-05, + "loss": 2.5957, + "step": 25010 + }, + { + "epoch": 2.265950941088537, + "grad_norm": 1.151977300643921, + "learning_rate": 4.89337280251314e-05, + "loss": 2.538, + "step": 25011 + }, + { + "epoch": 2.266041539262984, + "grad_norm": 0.988254725933075, + "learning_rate": 4.892768682414064e-05, + "loss": 2.5066, + "step": 25012 + }, + { + "epoch": 2.2661321374374306, + "grad_norm": 1.0678743124008179, + "learning_rate": 4.892164562314989e-05, + "loss": 2.4899, + "step": 25013 + }, + { + "epoch": 2.2662227356118776, + "grad_norm": 1.025113582611084, + "learning_rate": 4.891560442215913e-05, + "loss": 2.4768, + "step": 25014 + }, + { + "epoch": 2.266313333786324, + "grad_norm": 0.9574859738349915, + "learning_rate": 4.890956322116837e-05, + "loss": 2.5229, + "step": 25015 + }, + { + "epoch": 2.266403931960771, + "grad_norm": 1.0786865949630737, + "learning_rate": 4.890352202017761e-05, + "loss": 2.5895, + "step": 25016 + }, + { + "epoch": 2.2664945301352177, + "grad_norm": 1.068906545639038, + "learning_rate": 4.889748081918686e-05, + "loss": 2.5091, + "step": 25017 + }, + { + "epoch": 2.2665851283096647, + "grad_norm": 0.820847749710083, + "learning_rate": 4.88914396181961e-05, + "loss": 1.923, + "step": 25018 + }, + { + "epoch": 2.2666757264841113, + "grad_norm": 1.0440616607666016, + "learning_rate": 4.888539841720534e-05, + "loss": 2.6043, + "step": 25019 + }, + { + "epoch": 2.2667663246585583, + "grad_norm": 0.9110643267631531, + "learning_rate": 4.8879357216214586e-05, + "loss": 1.7921, + "step": 25020 + }, + { + "epoch": 2.266856922833005, + "grad_norm": 1.0803745985031128, + "learning_rate": 4.887331601522383e-05, + "loss": 2.826, + "step": 25021 + }, + { + "epoch": 2.266947521007452, + "grad_norm": 0.878888726234436, + "learning_rate": 4.8867274814233075e-05, + "loss": 2.1594, + "step": 25022 + }, + { + "epoch": 2.2670381191818985, + "grad_norm": 1.0341744422912598, + "learning_rate": 4.8861233613242315e-05, + "loss": 2.6313, + "step": 25023 + }, + { + "epoch": 2.2671287173563455, + "grad_norm": 0.9909249544143677, + "learning_rate": 4.885519241225156e-05, + "loss": 2.459, + "step": 25024 + }, + { + "epoch": 2.267219315530792, + "grad_norm": 1.0431820154190063, + "learning_rate": 4.8849151211260804e-05, + "loss": 2.7409, + "step": 25025 + }, + { + "epoch": 2.2673099137052386, + "grad_norm": 1.0291742086410522, + "learning_rate": 4.8843110010270045e-05, + "loss": 2.7065, + "step": 25026 + }, + { + "epoch": 2.2674005118796856, + "grad_norm": 0.9987310767173767, + "learning_rate": 4.8837068809279286e-05, + "loss": 2.5343, + "step": 25027 + }, + { + "epoch": 2.2674911100541326, + "grad_norm": 1.0351619720458984, + "learning_rate": 4.883102760828853e-05, + "loss": 2.4418, + "step": 25028 + }, + { + "epoch": 2.267581708228579, + "grad_norm": 0.9112306237220764, + "learning_rate": 4.8824986407297774e-05, + "loss": 2.4798, + "step": 25029 + }, + { + "epoch": 2.2676723064030258, + "grad_norm": 1.006360650062561, + "learning_rate": 4.8818945206307015e-05, + "loss": 2.6897, + "step": 25030 + }, + { + "epoch": 2.2677629045774728, + "grad_norm": 0.9537374973297119, + "learning_rate": 4.8812904005316256e-05, + "loss": 2.4684, + "step": 25031 + }, + { + "epoch": 2.2678535027519198, + "grad_norm": 0.8819912075996399, + "learning_rate": 4.88068628043255e-05, + "loss": 1.832, + "step": 25032 + }, + { + "epoch": 2.2679441009263663, + "grad_norm": 0.9761972427368164, + "learning_rate": 4.8800821603334744e-05, + "loss": 2.6938, + "step": 25033 + }, + { + "epoch": 2.268034699100813, + "grad_norm": 1.0514823198318481, + "learning_rate": 4.8794780402343985e-05, + "loss": 2.6868, + "step": 25034 + }, + { + "epoch": 2.26812529727526, + "grad_norm": 0.9945614337921143, + "learning_rate": 4.878873920135323e-05, + "loss": 2.5584, + "step": 25035 + }, + { + "epoch": 2.268215895449707, + "grad_norm": 1.0163583755493164, + "learning_rate": 4.878269800036247e-05, + "loss": 2.5381, + "step": 25036 + }, + { + "epoch": 2.2683064936241535, + "grad_norm": 1.0553970336914062, + "learning_rate": 4.8776656799371714e-05, + "loss": 2.7946, + "step": 25037 + }, + { + "epoch": 2.2683970917986, + "grad_norm": 1.1158374547958374, + "learning_rate": 4.877061559838096e-05, + "loss": 2.4655, + "step": 25038 + }, + { + "epoch": 2.268487689973047, + "grad_norm": 1.0318783521652222, + "learning_rate": 4.87645743973902e-05, + "loss": 2.5881, + "step": 25039 + }, + { + "epoch": 2.268578288147494, + "grad_norm": 0.9707843661308289, + "learning_rate": 4.875853319639945e-05, + "loss": 2.6102, + "step": 25040 + }, + { + "epoch": 2.2686688863219406, + "grad_norm": 0.9701982140541077, + "learning_rate": 4.875249199540869e-05, + "loss": 2.5457, + "step": 25041 + }, + { + "epoch": 2.268759484496387, + "grad_norm": 1.1653016805648804, + "learning_rate": 4.874645079441793e-05, + "loss": 2.8011, + "step": 25042 + }, + { + "epoch": 2.268850082670834, + "grad_norm": 1.0003994703292847, + "learning_rate": 4.874040959342718e-05, + "loss": 2.5717, + "step": 25043 + }, + { + "epoch": 2.2689406808452808, + "grad_norm": 1.0215855836868286, + "learning_rate": 4.873436839243642e-05, + "loss": 2.7372, + "step": 25044 + }, + { + "epoch": 2.269031279019728, + "grad_norm": 0.9693863391876221, + "learning_rate": 4.872832719144566e-05, + "loss": 2.7058, + "step": 25045 + }, + { + "epoch": 2.2691218771941744, + "grad_norm": 0.9121440649032593, + "learning_rate": 4.87222859904549e-05, + "loss": 2.0123, + "step": 25046 + }, + { + "epoch": 2.2692124753686214, + "grad_norm": 1.006697654724121, + "learning_rate": 4.871624478946415e-05, + "loss": 2.5934, + "step": 25047 + }, + { + "epoch": 2.269303073543068, + "grad_norm": 1.0182369947433472, + "learning_rate": 4.871020358847339e-05, + "loss": 2.698, + "step": 25048 + }, + { + "epoch": 2.269393671717515, + "grad_norm": 0.989245593547821, + "learning_rate": 4.870416238748263e-05, + "loss": 2.5566, + "step": 25049 + }, + { + "epoch": 2.2694842698919615, + "grad_norm": 1.0017447471618652, + "learning_rate": 4.869812118649188e-05, + "loss": 2.9419, + "step": 25050 + }, + { + "epoch": 2.2695748680664085, + "grad_norm": 0.976463258266449, + "learning_rate": 4.869207998550112e-05, + "loss": 2.7658, + "step": 25051 + }, + { + "epoch": 2.269665466240855, + "grad_norm": 1.0307985544204712, + "learning_rate": 4.868603878451036e-05, + "loss": 2.5443, + "step": 25052 + }, + { + "epoch": 2.269756064415302, + "grad_norm": 1.0408811569213867, + "learning_rate": 4.86799975835196e-05, + "loss": 2.4447, + "step": 25053 + }, + { + "epoch": 2.2698466625897487, + "grad_norm": 1.0438178777694702, + "learning_rate": 4.867395638252885e-05, + "loss": 2.5302, + "step": 25054 + }, + { + "epoch": 2.2699372607641957, + "grad_norm": 1.047670841217041, + "learning_rate": 4.866791518153809e-05, + "loss": 2.7046, + "step": 25055 + }, + { + "epoch": 2.2700278589386422, + "grad_norm": 0.9164600968360901, + "learning_rate": 4.866187398054734e-05, + "loss": 2.3732, + "step": 25056 + }, + { + "epoch": 2.2701184571130892, + "grad_norm": 1.0777541399002075, + "learning_rate": 4.865583277955658e-05, + "loss": 2.368, + "step": 25057 + }, + { + "epoch": 2.270209055287536, + "grad_norm": 0.9977756142616272, + "learning_rate": 4.8649791578565826e-05, + "loss": 2.639, + "step": 25058 + }, + { + "epoch": 2.270299653461983, + "grad_norm": 0.8828877210617065, + "learning_rate": 4.864375037757507e-05, + "loss": 1.9086, + "step": 25059 + }, + { + "epoch": 2.2703902516364294, + "grad_norm": 1.01322340965271, + "learning_rate": 4.863770917658431e-05, + "loss": 2.7246, + "step": 25060 + }, + { + "epoch": 2.2704808498108764, + "grad_norm": 0.9610379338264465, + "learning_rate": 4.863166797559355e-05, + "loss": 2.75, + "step": 25061 + }, + { + "epoch": 2.270571447985323, + "grad_norm": 0.9615707397460938, + "learning_rate": 4.8625626774602796e-05, + "loss": 2.6577, + "step": 25062 + }, + { + "epoch": 2.27066204615977, + "grad_norm": 0.9538711905479431, + "learning_rate": 4.861958557361204e-05, + "loss": 2.3773, + "step": 25063 + }, + { + "epoch": 2.2707526443342165, + "grad_norm": 1.0605159997940063, + "learning_rate": 4.861354437262128e-05, + "loss": 2.7226, + "step": 25064 + }, + { + "epoch": 2.2708432425086635, + "grad_norm": 1.0720804929733276, + "learning_rate": 4.8607503171630525e-05, + "loss": 2.6108, + "step": 25065 + }, + { + "epoch": 2.27093384068311, + "grad_norm": 0.9869199395179749, + "learning_rate": 4.8601461970639766e-05, + "loss": 2.698, + "step": 25066 + }, + { + "epoch": 2.271024438857557, + "grad_norm": 0.9519549012184143, + "learning_rate": 4.859542076964901e-05, + "loss": 2.5678, + "step": 25067 + }, + { + "epoch": 2.2711150370320037, + "grad_norm": 0.9829413890838623, + "learning_rate": 4.858937956865825e-05, + "loss": 1.9457, + "step": 25068 + }, + { + "epoch": 2.2712056352064507, + "grad_norm": 1.1249902248382568, + "learning_rate": 4.8583338367667495e-05, + "loss": 2.5412, + "step": 25069 + }, + { + "epoch": 2.2712962333808973, + "grad_norm": 1.142970085144043, + "learning_rate": 4.8577297166676736e-05, + "loss": 2.3799, + "step": 25070 + }, + { + "epoch": 2.2713868315553443, + "grad_norm": 1.0222240686416626, + "learning_rate": 4.857125596568598e-05, + "loss": 2.7152, + "step": 25071 + }, + { + "epoch": 2.271477429729791, + "grad_norm": 0.9786458015441895, + "learning_rate": 4.8565214764695225e-05, + "loss": 2.6133, + "step": 25072 + }, + { + "epoch": 2.271568027904238, + "grad_norm": 0.9811135530471802, + "learning_rate": 4.8559173563704465e-05, + "loss": 2.5542, + "step": 25073 + }, + { + "epoch": 2.2716586260786844, + "grad_norm": 0.82236248254776, + "learning_rate": 4.855313236271371e-05, + "loss": 1.9115, + "step": 25074 + }, + { + "epoch": 2.2717492242531314, + "grad_norm": 0.9673418402671814, + "learning_rate": 4.8547091161722954e-05, + "loss": 2.5513, + "step": 25075 + }, + { + "epoch": 2.271839822427578, + "grad_norm": 1.038370966911316, + "learning_rate": 4.8541049960732195e-05, + "loss": 2.6871, + "step": 25076 + }, + { + "epoch": 2.271930420602025, + "grad_norm": 1.2020847797393799, + "learning_rate": 4.853500875974144e-05, + "loss": 2.6431, + "step": 25077 + }, + { + "epoch": 2.2720210187764716, + "grad_norm": 1.0200198888778687, + "learning_rate": 4.852896755875068e-05, + "loss": 2.5556, + "step": 25078 + }, + { + "epoch": 2.2721116169509186, + "grad_norm": 0.840978741645813, + "learning_rate": 4.8522926357759924e-05, + "loss": 2.0377, + "step": 25079 + }, + { + "epoch": 2.272202215125365, + "grad_norm": 0.9057895541191101, + "learning_rate": 4.851688515676917e-05, + "loss": 2.1344, + "step": 25080 + }, + { + "epoch": 2.272292813299812, + "grad_norm": 0.9274826645851135, + "learning_rate": 4.851084395577841e-05, + "loss": 2.6308, + "step": 25081 + }, + { + "epoch": 2.2723834114742587, + "grad_norm": 0.9750332832336426, + "learning_rate": 4.850480275478765e-05, + "loss": 2.8519, + "step": 25082 + }, + { + "epoch": 2.2724740096487057, + "grad_norm": 1.0481271743774414, + "learning_rate": 4.8498761553796894e-05, + "loss": 2.4394, + "step": 25083 + }, + { + "epoch": 2.2725646078231523, + "grad_norm": 1.0033634901046753, + "learning_rate": 4.849272035280614e-05, + "loss": 2.4589, + "step": 25084 + }, + { + "epoch": 2.2726552059975993, + "grad_norm": 1.0299885272979736, + "learning_rate": 4.848667915181538e-05, + "loss": 2.814, + "step": 25085 + }, + { + "epoch": 2.272745804172046, + "grad_norm": 1.0141890048980713, + "learning_rate": 4.848063795082462e-05, + "loss": 2.4955, + "step": 25086 + }, + { + "epoch": 2.272836402346493, + "grad_norm": 0.8619734644889832, + "learning_rate": 4.8474596749833864e-05, + "loss": 2.035, + "step": 25087 + }, + { + "epoch": 2.2729270005209394, + "grad_norm": 1.0123099088668823, + "learning_rate": 4.846855554884311e-05, + "loss": 2.7145, + "step": 25088 + }, + { + "epoch": 2.2730175986953864, + "grad_norm": 0.9950447082519531, + "learning_rate": 4.846251434785235e-05, + "loss": 2.6928, + "step": 25089 + }, + { + "epoch": 2.273108196869833, + "grad_norm": 0.9086070656776428, + "learning_rate": 4.84564731468616e-05, + "loss": 2.5979, + "step": 25090 + }, + { + "epoch": 2.27319879504428, + "grad_norm": 1.053900122642517, + "learning_rate": 4.845043194587084e-05, + "loss": 2.386, + "step": 25091 + }, + { + "epoch": 2.2732893932187266, + "grad_norm": 1.0742383003234863, + "learning_rate": 4.844439074488009e-05, + "loss": 2.7545, + "step": 25092 + }, + { + "epoch": 2.2733799913931736, + "grad_norm": 0.8989091515541077, + "learning_rate": 4.843834954388933e-05, + "loss": 2.0636, + "step": 25093 + }, + { + "epoch": 2.27347058956762, + "grad_norm": 1.0118757486343384, + "learning_rate": 4.843230834289857e-05, + "loss": 2.6158, + "step": 25094 + }, + { + "epoch": 2.273561187742067, + "grad_norm": 1.0016833543777466, + "learning_rate": 4.842626714190782e-05, + "loss": 2.5965, + "step": 25095 + }, + { + "epoch": 2.2736517859165137, + "grad_norm": 1.0263333320617676, + "learning_rate": 4.842022594091706e-05, + "loss": 2.5709, + "step": 25096 + }, + { + "epoch": 2.2737423840909607, + "grad_norm": 1.0761425495147705, + "learning_rate": 4.84141847399263e-05, + "loss": 2.6736, + "step": 25097 + }, + { + "epoch": 2.2738329822654073, + "grad_norm": 0.994236171245575, + "learning_rate": 4.840814353893554e-05, + "loss": 2.4465, + "step": 25098 + }, + { + "epoch": 2.2739235804398543, + "grad_norm": 1.0220003128051758, + "learning_rate": 4.840210233794479e-05, + "loss": 2.7849, + "step": 25099 + }, + { + "epoch": 2.274014178614301, + "grad_norm": 0.983488142490387, + "learning_rate": 4.839606113695403e-05, + "loss": 2.5476, + "step": 25100 + }, + { + "epoch": 2.274104776788748, + "grad_norm": 0.9544490575790405, + "learning_rate": 4.839001993596327e-05, + "loss": 2.6321, + "step": 25101 + }, + { + "epoch": 2.2741953749631945, + "grad_norm": 1.005083680152893, + "learning_rate": 4.838397873497251e-05, + "loss": 2.5358, + "step": 25102 + }, + { + "epoch": 2.2742859731376415, + "grad_norm": 1.026218295097351, + "learning_rate": 4.837793753398176e-05, + "loss": 2.7747, + "step": 25103 + }, + { + "epoch": 2.274376571312088, + "grad_norm": 0.878507673740387, + "learning_rate": 4.8371896332991e-05, + "loss": 2.0849, + "step": 25104 + }, + { + "epoch": 2.2744671694865346, + "grad_norm": 1.0122209787368774, + "learning_rate": 4.836585513200024e-05, + "loss": 2.5974, + "step": 25105 + }, + { + "epoch": 2.2745577676609816, + "grad_norm": 0.9805324673652649, + "learning_rate": 4.835981393100949e-05, + "loss": 2.6398, + "step": 25106 + }, + { + "epoch": 2.2746483658354286, + "grad_norm": 0.9723056554794312, + "learning_rate": 4.8353772730018735e-05, + "loss": 2.6265, + "step": 25107 + }, + { + "epoch": 2.274738964009875, + "grad_norm": 1.0205552577972412, + "learning_rate": 4.8347731529027976e-05, + "loss": 2.6587, + "step": 25108 + }, + { + "epoch": 2.2748295621843218, + "grad_norm": 1.016754150390625, + "learning_rate": 4.8341690328037217e-05, + "loss": 2.8783, + "step": 25109 + }, + { + "epoch": 2.2749201603587688, + "grad_norm": 1.0388864278793335, + "learning_rate": 4.8335649127046464e-05, + "loss": 2.5453, + "step": 25110 + }, + { + "epoch": 2.2750107585332158, + "grad_norm": 1.1489590406417847, + "learning_rate": 4.8329607926055705e-05, + "loss": 2.5943, + "step": 25111 + }, + { + "epoch": 2.2751013567076623, + "grad_norm": 1.0051865577697754, + "learning_rate": 4.8323566725064946e-05, + "loss": 2.7347, + "step": 25112 + }, + { + "epoch": 2.275191954882109, + "grad_norm": 1.0491410493850708, + "learning_rate": 4.831752552407419e-05, + "loss": 2.6221, + "step": 25113 + }, + { + "epoch": 2.275282553056556, + "grad_norm": 0.9943227767944336, + "learning_rate": 4.8311484323083434e-05, + "loss": 2.567, + "step": 25114 + }, + { + "epoch": 2.275373151231003, + "grad_norm": 0.9371699094772339, + "learning_rate": 4.8305443122092675e-05, + "loss": 2.4696, + "step": 25115 + }, + { + "epoch": 2.2754637494054495, + "grad_norm": 1.089072823524475, + "learning_rate": 4.8299401921101916e-05, + "loss": 2.5327, + "step": 25116 + }, + { + "epoch": 2.275554347579896, + "grad_norm": 1.0655272006988525, + "learning_rate": 4.829336072011116e-05, + "loss": 2.5635, + "step": 25117 + }, + { + "epoch": 2.275644945754343, + "grad_norm": 1.018441915512085, + "learning_rate": 4.8287319519120404e-05, + "loss": 2.7608, + "step": 25118 + }, + { + "epoch": 2.27573554392879, + "grad_norm": 1.0281347036361694, + "learning_rate": 4.8281278318129645e-05, + "loss": 2.6799, + "step": 25119 + }, + { + "epoch": 2.2758261421032366, + "grad_norm": 1.0091934204101562, + "learning_rate": 4.8275237117138886e-05, + "loss": 2.725, + "step": 25120 + }, + { + "epoch": 2.275916740277683, + "grad_norm": 1.000279426574707, + "learning_rate": 4.826919591614813e-05, + "loss": 2.5851, + "step": 25121 + }, + { + "epoch": 2.27600733845213, + "grad_norm": 1.0968419313430786, + "learning_rate": 4.8263154715157374e-05, + "loss": 2.4662, + "step": 25122 + }, + { + "epoch": 2.2760979366265772, + "grad_norm": 1.046345829963684, + "learning_rate": 4.8257113514166615e-05, + "loss": 2.5966, + "step": 25123 + }, + { + "epoch": 2.276188534801024, + "grad_norm": 0.9830841422080994, + "learning_rate": 4.825107231317586e-05, + "loss": 2.9279, + "step": 25124 + }, + { + "epoch": 2.2762791329754704, + "grad_norm": 1.004665732383728, + "learning_rate": 4.824503111218511e-05, + "loss": 2.401, + "step": 25125 + }, + { + "epoch": 2.2763697311499174, + "grad_norm": 0.7669817805290222, + "learning_rate": 4.823898991119435e-05, + "loss": 1.7033, + "step": 25126 + }, + { + "epoch": 2.276460329324364, + "grad_norm": 1.1422686576843262, + "learning_rate": 4.823294871020359e-05, + "loss": 2.8394, + "step": 25127 + }, + { + "epoch": 2.276550927498811, + "grad_norm": 0.6820893883705139, + "learning_rate": 4.822690750921283e-05, + "loss": 1.1496, + "step": 25128 + }, + { + "epoch": 2.2766415256732575, + "grad_norm": 0.9956661462783813, + "learning_rate": 4.822086630822208e-05, + "loss": 2.5965, + "step": 25129 + }, + { + "epoch": 2.2767321238477045, + "grad_norm": 0.9896150827407837, + "learning_rate": 4.821482510723132e-05, + "loss": 2.3512, + "step": 25130 + }, + { + "epoch": 2.276822722022151, + "grad_norm": 0.6990948915481567, + "learning_rate": 4.820878390624056e-05, + "loss": 1.1547, + "step": 25131 + }, + { + "epoch": 2.276913320196598, + "grad_norm": 0.9475070238113403, + "learning_rate": 4.82027427052498e-05, + "loss": 2.7212, + "step": 25132 + }, + { + "epoch": 2.2770039183710447, + "grad_norm": 0.9283338785171509, + "learning_rate": 4.819670150425905e-05, + "loss": 2.5162, + "step": 25133 + }, + { + "epoch": 2.2770945165454917, + "grad_norm": 1.0135241746902466, + "learning_rate": 4.819066030326829e-05, + "loss": 2.5015, + "step": 25134 + }, + { + "epoch": 2.2771851147199382, + "grad_norm": 0.9270250797271729, + "learning_rate": 4.818461910227753e-05, + "loss": 2.5046, + "step": 25135 + }, + { + "epoch": 2.2772757128943852, + "grad_norm": 1.0371979475021362, + "learning_rate": 4.817857790128677e-05, + "loss": 2.6038, + "step": 25136 + }, + { + "epoch": 2.277366311068832, + "grad_norm": 0.8824813365936279, + "learning_rate": 4.817253670029602e-05, + "loss": 2.0886, + "step": 25137 + }, + { + "epoch": 2.277456909243279, + "grad_norm": 0.9734851121902466, + "learning_rate": 4.816649549930526e-05, + "loss": 2.7953, + "step": 25138 + }, + { + "epoch": 2.2775475074177254, + "grad_norm": 1.1419752836227417, + "learning_rate": 4.81604542983145e-05, + "loss": 2.5808, + "step": 25139 + }, + { + "epoch": 2.2776381055921724, + "grad_norm": 0.9838299751281738, + "learning_rate": 4.815441309732375e-05, + "loss": 2.7808, + "step": 25140 + }, + { + "epoch": 2.277728703766619, + "grad_norm": 0.9999135136604309, + "learning_rate": 4.8148371896333e-05, + "loss": 2.4596, + "step": 25141 + }, + { + "epoch": 2.277819301941066, + "grad_norm": 1.058966040611267, + "learning_rate": 4.814233069534224e-05, + "loss": 2.78, + "step": 25142 + }, + { + "epoch": 2.2779099001155125, + "grad_norm": 0.9607626795768738, + "learning_rate": 4.813628949435148e-05, + "loss": 2.8255, + "step": 25143 + }, + { + "epoch": 2.2780004982899595, + "grad_norm": 1.069654941558838, + "learning_rate": 4.813024829336073e-05, + "loss": 2.7814, + "step": 25144 + }, + { + "epoch": 2.278091096464406, + "grad_norm": 1.0254486799240112, + "learning_rate": 4.812420709236997e-05, + "loss": 2.5367, + "step": 25145 + }, + { + "epoch": 2.278181694638853, + "grad_norm": 0.9357786774635315, + "learning_rate": 4.811816589137921e-05, + "loss": 2.6143, + "step": 25146 + }, + { + "epoch": 2.2782722928132997, + "grad_norm": 0.9793307185173035, + "learning_rate": 4.811212469038845e-05, + "loss": 2.6508, + "step": 25147 + }, + { + "epoch": 2.2783628909877467, + "grad_norm": 1.0583395957946777, + "learning_rate": 4.81060834893977e-05, + "loss": 2.5955, + "step": 25148 + }, + { + "epoch": 2.2784534891621933, + "grad_norm": 1.0863844156265259, + "learning_rate": 4.810004228840694e-05, + "loss": 2.8488, + "step": 25149 + }, + { + "epoch": 2.2785440873366403, + "grad_norm": 0.9735062122344971, + "learning_rate": 4.809400108741618e-05, + "loss": 2.411, + "step": 25150 + }, + { + "epoch": 2.278634685511087, + "grad_norm": 0.9967738389968872, + "learning_rate": 4.808795988642542e-05, + "loss": 2.7052, + "step": 25151 + }, + { + "epoch": 2.278725283685534, + "grad_norm": 1.0078001022338867, + "learning_rate": 4.808191868543467e-05, + "loss": 2.4896, + "step": 25152 + }, + { + "epoch": 2.2788158818599804, + "grad_norm": 0.8877198696136475, + "learning_rate": 4.807587748444391e-05, + "loss": 1.7813, + "step": 25153 + }, + { + "epoch": 2.2789064800344274, + "grad_norm": 1.129629373550415, + "learning_rate": 4.806983628345315e-05, + "loss": 2.6938, + "step": 25154 + }, + { + "epoch": 2.278997078208874, + "grad_norm": 0.8069630861282349, + "learning_rate": 4.8063795082462396e-05, + "loss": 1.8941, + "step": 25155 + }, + { + "epoch": 2.279087676383321, + "grad_norm": 1.043434500694275, + "learning_rate": 4.805775388147164e-05, + "loss": 2.5465, + "step": 25156 + }, + { + "epoch": 2.2791782745577676, + "grad_norm": 0.9879887700080872, + "learning_rate": 4.8051712680480885e-05, + "loss": 2.5458, + "step": 25157 + }, + { + "epoch": 2.2792688727322146, + "grad_norm": 0.9632695317268372, + "learning_rate": 4.8045671479490126e-05, + "loss": 2.5635, + "step": 25158 + }, + { + "epoch": 2.279359470906661, + "grad_norm": 0.9451159834861755, + "learning_rate": 4.803963027849937e-05, + "loss": 2.405, + "step": 25159 + }, + { + "epoch": 2.279450069081108, + "grad_norm": 1.0534451007843018, + "learning_rate": 4.8033589077508614e-05, + "loss": 2.6089, + "step": 25160 + }, + { + "epoch": 2.2795406672555547, + "grad_norm": 1.0631208419799805, + "learning_rate": 4.8027547876517855e-05, + "loss": 2.5416, + "step": 25161 + }, + { + "epoch": 2.2796312654300017, + "grad_norm": 0.9720706343650818, + "learning_rate": 4.8021506675527096e-05, + "loss": 2.5093, + "step": 25162 + }, + { + "epoch": 2.2797218636044483, + "grad_norm": 1.0794074535369873, + "learning_rate": 4.801546547453634e-05, + "loss": 2.6519, + "step": 25163 + }, + { + "epoch": 2.2798124617788953, + "grad_norm": 1.0770808458328247, + "learning_rate": 4.8009424273545584e-05, + "loss": 2.2748, + "step": 25164 + }, + { + "epoch": 2.279903059953342, + "grad_norm": 1.062547206878662, + "learning_rate": 4.8003383072554825e-05, + "loss": 2.7491, + "step": 25165 + }, + { + "epoch": 2.279993658127789, + "grad_norm": 1.0400701761245728, + "learning_rate": 4.7997341871564066e-05, + "loss": 2.593, + "step": 25166 + }, + { + "epoch": 2.2800842563022354, + "grad_norm": 1.0319639444351196, + "learning_rate": 4.7991300670573313e-05, + "loss": 2.7898, + "step": 25167 + }, + { + "epoch": 2.2801748544766824, + "grad_norm": 1.0306510925292969, + "learning_rate": 4.7985259469582554e-05, + "loss": 2.4239, + "step": 25168 + }, + { + "epoch": 2.280265452651129, + "grad_norm": 0.9926136136054993, + "learning_rate": 4.7979218268591795e-05, + "loss": 2.354, + "step": 25169 + }, + { + "epoch": 2.280356050825576, + "grad_norm": 0.9377224445343018, + "learning_rate": 4.797317706760104e-05, + "loss": 2.5921, + "step": 25170 + }, + { + "epoch": 2.2804466490000226, + "grad_norm": 0.9874250888824463, + "learning_rate": 4.7967135866610284e-05, + "loss": 2.576, + "step": 25171 + }, + { + "epoch": 2.2805372471744696, + "grad_norm": 1.009200930595398, + "learning_rate": 4.7961094665619524e-05, + "loss": 2.6666, + "step": 25172 + }, + { + "epoch": 2.280627845348916, + "grad_norm": 1.0564014911651611, + "learning_rate": 4.795505346462877e-05, + "loss": 2.4532, + "step": 25173 + }, + { + "epoch": 2.280718443523363, + "grad_norm": 1.023401141166687, + "learning_rate": 4.794901226363801e-05, + "loss": 2.6101, + "step": 25174 + }, + { + "epoch": 2.2808090416978097, + "grad_norm": 1.005778431892395, + "learning_rate": 4.794297106264726e-05, + "loss": 2.5377, + "step": 25175 + }, + { + "epoch": 2.2808996398722567, + "grad_norm": 0.984633207321167, + "learning_rate": 4.79369298616565e-05, + "loss": 2.4719, + "step": 25176 + }, + { + "epoch": 2.2809902380467033, + "grad_norm": 1.0203721523284912, + "learning_rate": 4.793088866066574e-05, + "loss": 2.4237, + "step": 25177 + }, + { + "epoch": 2.2810808362211503, + "grad_norm": 1.0003708600997925, + "learning_rate": 4.792484745967499e-05, + "loss": 2.4847, + "step": 25178 + }, + { + "epoch": 2.281171434395597, + "grad_norm": 0.9806445837020874, + "learning_rate": 4.791880625868423e-05, + "loss": 2.7243, + "step": 25179 + }, + { + "epoch": 2.281262032570044, + "grad_norm": 0.9776265025138855, + "learning_rate": 4.791276505769347e-05, + "loss": 2.6586, + "step": 25180 + }, + { + "epoch": 2.2813526307444905, + "grad_norm": 1.0337926149368286, + "learning_rate": 4.790672385670271e-05, + "loss": 2.7343, + "step": 25181 + }, + { + "epoch": 2.2814432289189375, + "grad_norm": 1.0161429643630981, + "learning_rate": 4.790068265571196e-05, + "loss": 2.5804, + "step": 25182 + }, + { + "epoch": 2.281533827093384, + "grad_norm": 0.998906672000885, + "learning_rate": 4.78946414547212e-05, + "loss": 2.8187, + "step": 25183 + }, + { + "epoch": 2.281624425267831, + "grad_norm": 1.0364129543304443, + "learning_rate": 4.788860025373044e-05, + "loss": 2.6923, + "step": 25184 + }, + { + "epoch": 2.2817150234422776, + "grad_norm": 0.9982836246490479, + "learning_rate": 4.788255905273969e-05, + "loss": 2.7196, + "step": 25185 + }, + { + "epoch": 2.2818056216167246, + "grad_norm": 0.9696604013442993, + "learning_rate": 4.787651785174893e-05, + "loss": 2.694, + "step": 25186 + }, + { + "epoch": 2.281896219791171, + "grad_norm": 1.0974278450012207, + "learning_rate": 4.787047665075817e-05, + "loss": 2.4055, + "step": 25187 + }, + { + "epoch": 2.2819868179656178, + "grad_norm": 0.9898187518119812, + "learning_rate": 4.786443544976741e-05, + "loss": 2.6254, + "step": 25188 + }, + { + "epoch": 2.2820774161400648, + "grad_norm": 1.0487046241760254, + "learning_rate": 4.785839424877666e-05, + "loss": 2.4667, + "step": 25189 + }, + { + "epoch": 2.2821680143145118, + "grad_norm": 1.0271247625350952, + "learning_rate": 4.78523530477859e-05, + "loss": 2.8306, + "step": 25190 + }, + { + "epoch": 2.2822586124889583, + "grad_norm": 1.0038381814956665, + "learning_rate": 4.784631184679515e-05, + "loss": 2.0865, + "step": 25191 + }, + { + "epoch": 2.282349210663405, + "grad_norm": 1.1051123142242432, + "learning_rate": 4.784027064580439e-05, + "loss": 2.571, + "step": 25192 + }, + { + "epoch": 2.282439808837852, + "grad_norm": 1.080772876739502, + "learning_rate": 4.7834229444813636e-05, + "loss": 2.9134, + "step": 25193 + }, + { + "epoch": 2.282530407012299, + "grad_norm": 0.9730762243270874, + "learning_rate": 4.782818824382288e-05, + "loss": 2.6996, + "step": 25194 + }, + { + "epoch": 2.2826210051867455, + "grad_norm": 1.0955778360366821, + "learning_rate": 4.782214704283212e-05, + "loss": 2.4861, + "step": 25195 + }, + { + "epoch": 2.282711603361192, + "grad_norm": 1.0805442333221436, + "learning_rate": 4.781610584184136e-05, + "loss": 2.4198, + "step": 25196 + }, + { + "epoch": 2.282802201535639, + "grad_norm": 0.9525511264801025, + "learning_rate": 4.7810064640850606e-05, + "loss": 2.4344, + "step": 25197 + }, + { + "epoch": 2.282892799710086, + "grad_norm": 0.9790255427360535, + "learning_rate": 4.780402343985985e-05, + "loss": 2.3792, + "step": 25198 + }, + { + "epoch": 2.2829833978845326, + "grad_norm": 0.9565383791923523, + "learning_rate": 4.779798223886909e-05, + "loss": 2.6778, + "step": 25199 + }, + { + "epoch": 2.283073996058979, + "grad_norm": 0.9862082600593567, + "learning_rate": 4.7791941037878335e-05, + "loss": 2.6443, + "step": 25200 + }, + { + "epoch": 2.283164594233426, + "grad_norm": 0.8394107222557068, + "learning_rate": 4.7785899836887576e-05, + "loss": 1.9553, + "step": 25201 + }, + { + "epoch": 2.2832551924078732, + "grad_norm": 0.8126859664916992, + "learning_rate": 4.777985863589682e-05, + "loss": 1.8891, + "step": 25202 + }, + { + "epoch": 2.28334579058232, + "grad_norm": 1.0089631080627441, + "learning_rate": 4.777381743490606e-05, + "loss": 2.6626, + "step": 25203 + }, + { + "epoch": 2.2834363887567664, + "grad_norm": 0.9915516972541809, + "learning_rate": 4.7767776233915305e-05, + "loss": 2.5819, + "step": 25204 + }, + { + "epoch": 2.2835269869312134, + "grad_norm": 1.0679937601089478, + "learning_rate": 4.7761735032924546e-05, + "loss": 2.497, + "step": 25205 + }, + { + "epoch": 2.28361758510566, + "grad_norm": 0.9787963628768921, + "learning_rate": 4.775569383193379e-05, + "loss": 2.6433, + "step": 25206 + }, + { + "epoch": 2.283708183280107, + "grad_norm": 0.9507842063903809, + "learning_rate": 4.7749652630943035e-05, + "loss": 1.9554, + "step": 25207 + }, + { + "epoch": 2.2837987814545535, + "grad_norm": 1.0829380750656128, + "learning_rate": 4.7743611429952276e-05, + "loss": 2.5822, + "step": 25208 + }, + { + "epoch": 2.2838893796290005, + "grad_norm": 0.9519252181053162, + "learning_rate": 4.773757022896152e-05, + "loss": 1.9221, + "step": 25209 + }, + { + "epoch": 2.283979977803447, + "grad_norm": 1.0421181917190552, + "learning_rate": 4.7731529027970764e-05, + "loss": 2.5046, + "step": 25210 + }, + { + "epoch": 2.284070575977894, + "grad_norm": 0.9748874306678772, + "learning_rate": 4.7725487826980005e-05, + "loss": 1.7122, + "step": 25211 + }, + { + "epoch": 2.2841611741523407, + "grad_norm": 0.9870306253433228, + "learning_rate": 4.771944662598925e-05, + "loss": 2.7045, + "step": 25212 + }, + { + "epoch": 2.2842517723267877, + "grad_norm": 1.2747669219970703, + "learning_rate": 4.771340542499849e-05, + "loss": 2.5264, + "step": 25213 + }, + { + "epoch": 2.2843423705012342, + "grad_norm": 1.111602544784546, + "learning_rate": 4.7707364224007734e-05, + "loss": 2.7356, + "step": 25214 + }, + { + "epoch": 2.2844329686756812, + "grad_norm": 0.9779674410820007, + "learning_rate": 4.770132302301698e-05, + "loss": 2.3422, + "step": 25215 + }, + { + "epoch": 2.284523566850128, + "grad_norm": 1.016869068145752, + "learning_rate": 4.769528182202622e-05, + "loss": 2.8121, + "step": 25216 + }, + { + "epoch": 2.284614165024575, + "grad_norm": 1.1103014945983887, + "learning_rate": 4.768924062103546e-05, + "loss": 2.9881, + "step": 25217 + }, + { + "epoch": 2.2847047631990214, + "grad_norm": 1.0760737657546997, + "learning_rate": 4.7683199420044704e-05, + "loss": 2.3882, + "step": 25218 + }, + { + "epoch": 2.2847953613734684, + "grad_norm": 1.0048203468322754, + "learning_rate": 4.767715821905395e-05, + "loss": 2.8002, + "step": 25219 + }, + { + "epoch": 2.284885959547915, + "grad_norm": 0.8712446689605713, + "learning_rate": 4.767111701806319e-05, + "loss": 1.9582, + "step": 25220 + }, + { + "epoch": 2.284976557722362, + "grad_norm": 1.1018825769424438, + "learning_rate": 4.7665075817072433e-05, + "loss": 2.8611, + "step": 25221 + }, + { + "epoch": 2.2850671558968085, + "grad_norm": 0.9661800265312195, + "learning_rate": 4.7659034616081674e-05, + "loss": 2.5757, + "step": 25222 + }, + { + "epoch": 2.2851577540712555, + "grad_norm": 0.9768305420875549, + "learning_rate": 4.765299341509092e-05, + "loss": 2.4163, + "step": 25223 + }, + { + "epoch": 2.285248352245702, + "grad_norm": 1.0372382402420044, + "learning_rate": 4.764695221410016e-05, + "loss": 2.7545, + "step": 25224 + }, + { + "epoch": 2.285338950420149, + "grad_norm": 1.0740845203399658, + "learning_rate": 4.764091101310941e-05, + "loss": 2.7011, + "step": 25225 + }, + { + "epoch": 2.2854295485945957, + "grad_norm": 0.9900211691856384, + "learning_rate": 4.763486981211865e-05, + "loss": 2.7199, + "step": 25226 + }, + { + "epoch": 2.2855201467690427, + "grad_norm": 0.9633088707923889, + "learning_rate": 4.76288286111279e-05, + "loss": 2.6166, + "step": 25227 + }, + { + "epoch": 2.2856107449434893, + "grad_norm": 1.0133728981018066, + "learning_rate": 4.762278741013714e-05, + "loss": 2.6729, + "step": 25228 + }, + { + "epoch": 2.2857013431179363, + "grad_norm": 1.0656293630599976, + "learning_rate": 4.761674620914638e-05, + "loss": 2.796, + "step": 25229 + }, + { + "epoch": 2.285791941292383, + "grad_norm": 1.0017110109329224, + "learning_rate": 4.761070500815563e-05, + "loss": 2.6425, + "step": 25230 + }, + { + "epoch": 2.28588253946683, + "grad_norm": 1.0221272706985474, + "learning_rate": 4.760466380716487e-05, + "loss": 2.5711, + "step": 25231 + }, + { + "epoch": 2.2859731376412764, + "grad_norm": 0.9490227103233337, + "learning_rate": 4.759862260617411e-05, + "loss": 2.4933, + "step": 25232 + }, + { + "epoch": 2.2860637358157234, + "grad_norm": 1.0686213970184326, + "learning_rate": 4.759258140518335e-05, + "loss": 2.5503, + "step": 25233 + }, + { + "epoch": 2.28615433399017, + "grad_norm": 1.059739351272583, + "learning_rate": 4.75865402041926e-05, + "loss": 2.2841, + "step": 25234 + }, + { + "epoch": 2.286244932164617, + "grad_norm": 1.0462541580200195, + "learning_rate": 4.758049900320184e-05, + "loss": 2.5088, + "step": 25235 + }, + { + "epoch": 2.2863355303390636, + "grad_norm": 1.016310453414917, + "learning_rate": 4.757445780221108e-05, + "loss": 2.6006, + "step": 25236 + }, + { + "epoch": 2.2864261285135106, + "grad_norm": 0.9954138994216919, + "learning_rate": 4.756841660122032e-05, + "loss": 2.6569, + "step": 25237 + }, + { + "epoch": 2.286516726687957, + "grad_norm": 1.0252468585968018, + "learning_rate": 4.756237540022957e-05, + "loss": 2.743, + "step": 25238 + }, + { + "epoch": 2.286607324862404, + "grad_norm": 0.9087827205657959, + "learning_rate": 4.755633419923881e-05, + "loss": 2.3211, + "step": 25239 + }, + { + "epoch": 2.2866979230368507, + "grad_norm": 0.9400947093963623, + "learning_rate": 4.755029299824805e-05, + "loss": 2.674, + "step": 25240 + }, + { + "epoch": 2.2867885212112977, + "grad_norm": 0.9933532476425171, + "learning_rate": 4.75442517972573e-05, + "loss": 2.6706, + "step": 25241 + }, + { + "epoch": 2.2868791193857443, + "grad_norm": 1.0373207330703735, + "learning_rate": 4.753821059626654e-05, + "loss": 2.6214, + "step": 25242 + }, + { + "epoch": 2.2869697175601913, + "grad_norm": 1.0550187826156616, + "learning_rate": 4.7532169395275786e-05, + "loss": 2.6464, + "step": 25243 + }, + { + "epoch": 2.287060315734638, + "grad_norm": 0.9176452159881592, + "learning_rate": 4.752612819428503e-05, + "loss": 2.1188, + "step": 25244 + }, + { + "epoch": 2.287150913909085, + "grad_norm": 1.0258735418319702, + "learning_rate": 4.7520086993294274e-05, + "loss": 2.9243, + "step": 25245 + }, + { + "epoch": 2.2872415120835314, + "grad_norm": 0.9648392796516418, + "learning_rate": 4.7514045792303515e-05, + "loss": 2.595, + "step": 25246 + }, + { + "epoch": 2.2873321102579784, + "grad_norm": 1.069796085357666, + "learning_rate": 4.7508004591312756e-05, + "loss": 2.809, + "step": 25247 + }, + { + "epoch": 2.287422708432425, + "grad_norm": 1.0637754201889038, + "learning_rate": 4.7501963390322e-05, + "loss": 2.5019, + "step": 25248 + }, + { + "epoch": 2.287513306606872, + "grad_norm": 1.007617712020874, + "learning_rate": 4.7495922189331244e-05, + "loss": 2.7681, + "step": 25249 + }, + { + "epoch": 2.2876039047813186, + "grad_norm": 1.017450213432312, + "learning_rate": 4.7489880988340485e-05, + "loss": 2.6771, + "step": 25250 + }, + { + "epoch": 2.2876945029557656, + "grad_norm": 1.0390907526016235, + "learning_rate": 4.7483839787349726e-05, + "loss": 2.3244, + "step": 25251 + }, + { + "epoch": 2.287785101130212, + "grad_norm": 0.9793998003005981, + "learning_rate": 4.747779858635897e-05, + "loss": 2.9308, + "step": 25252 + }, + { + "epoch": 2.287875699304659, + "grad_norm": 1.1358824968338013, + "learning_rate": 4.7471757385368215e-05, + "loss": 2.7055, + "step": 25253 + }, + { + "epoch": 2.2879662974791057, + "grad_norm": 1.057220697402954, + "learning_rate": 4.7465716184377455e-05, + "loss": 2.4949, + "step": 25254 + }, + { + "epoch": 2.2880568956535527, + "grad_norm": 0.9811006188392639, + "learning_rate": 4.7459674983386696e-05, + "loss": 2.5079, + "step": 25255 + }, + { + "epoch": 2.2881474938279993, + "grad_norm": 1.075647234916687, + "learning_rate": 4.745363378239594e-05, + "loss": 2.3304, + "step": 25256 + }, + { + "epoch": 2.2882380920024463, + "grad_norm": 0.9961302280426025, + "learning_rate": 4.7447592581405185e-05, + "loss": 2.9076, + "step": 25257 + }, + { + "epoch": 2.288328690176893, + "grad_norm": 1.0126413106918335, + "learning_rate": 4.7441551380414425e-05, + "loss": 2.6395, + "step": 25258 + }, + { + "epoch": 2.28841928835134, + "grad_norm": 1.0350301265716553, + "learning_rate": 4.743551017942367e-05, + "loss": 2.5375, + "step": 25259 + }, + { + "epoch": 2.2885098865257865, + "grad_norm": 1.1222296953201294, + "learning_rate": 4.7429468978432914e-05, + "loss": 2.6836, + "step": 25260 + }, + { + "epoch": 2.2886004847002335, + "grad_norm": 1.007303237915039, + "learning_rate": 4.742342777744216e-05, + "loss": 2.5476, + "step": 25261 + }, + { + "epoch": 2.28869108287468, + "grad_norm": 0.8693959712982178, + "learning_rate": 4.74173865764514e-05, + "loss": 1.8614, + "step": 25262 + }, + { + "epoch": 2.288781681049127, + "grad_norm": 1.0332375764846802, + "learning_rate": 4.741134537546064e-05, + "loss": 2.7472, + "step": 25263 + }, + { + "epoch": 2.2888722792235736, + "grad_norm": 0.9997008442878723, + "learning_rate": 4.740530417446989e-05, + "loss": 2.7073, + "step": 25264 + }, + { + "epoch": 2.2889628773980206, + "grad_norm": 0.9753320813179016, + "learning_rate": 4.739926297347913e-05, + "loss": 2.6399, + "step": 25265 + }, + { + "epoch": 2.289053475572467, + "grad_norm": 1.0189944505691528, + "learning_rate": 4.739322177248837e-05, + "loss": 2.7547, + "step": 25266 + }, + { + "epoch": 2.2891440737469138, + "grad_norm": 0.9725114703178406, + "learning_rate": 4.738718057149761e-05, + "loss": 2.4446, + "step": 25267 + }, + { + "epoch": 2.2892346719213608, + "grad_norm": 0.9853807687759399, + "learning_rate": 4.738113937050686e-05, + "loss": 2.762, + "step": 25268 + }, + { + "epoch": 2.2893252700958078, + "grad_norm": 1.0505952835083008, + "learning_rate": 4.73750981695161e-05, + "loss": 2.799, + "step": 25269 + }, + { + "epoch": 2.2894158682702543, + "grad_norm": 0.9442711472511292, + "learning_rate": 4.736905696852534e-05, + "loss": 2.6271, + "step": 25270 + }, + { + "epoch": 2.289506466444701, + "grad_norm": 1.020462989807129, + "learning_rate": 4.736301576753459e-05, + "loss": 2.6025, + "step": 25271 + }, + { + "epoch": 2.289597064619148, + "grad_norm": 1.054688811302185, + "learning_rate": 4.735697456654383e-05, + "loss": 2.6447, + "step": 25272 + }, + { + "epoch": 2.289687662793595, + "grad_norm": 1.038958191871643, + "learning_rate": 4.735093336555307e-05, + "loss": 2.5777, + "step": 25273 + }, + { + "epoch": 2.2897782609680415, + "grad_norm": 0.8616878390312195, + "learning_rate": 4.734489216456231e-05, + "loss": 1.9513, + "step": 25274 + }, + { + "epoch": 2.289868859142488, + "grad_norm": 1.0014702081680298, + "learning_rate": 4.733885096357156e-05, + "loss": 2.6612, + "step": 25275 + }, + { + "epoch": 2.289959457316935, + "grad_norm": 0.980079710483551, + "learning_rate": 4.73328097625808e-05, + "loss": 2.4399, + "step": 25276 + }, + { + "epoch": 2.290050055491382, + "grad_norm": 1.0286314487457275, + "learning_rate": 4.732676856159005e-05, + "loss": 2.7462, + "step": 25277 + }, + { + "epoch": 2.2901406536658286, + "grad_norm": 0.6848557591438293, + "learning_rate": 4.732072736059929e-05, + "loss": 1.3271, + "step": 25278 + }, + { + "epoch": 2.290231251840275, + "grad_norm": 0.9400747418403625, + "learning_rate": 4.731468615960854e-05, + "loss": 2.5754, + "step": 25279 + }, + { + "epoch": 2.290321850014722, + "grad_norm": 0.9852080345153809, + "learning_rate": 4.730864495861778e-05, + "loss": 2.4531, + "step": 25280 + }, + { + "epoch": 2.2904124481891692, + "grad_norm": 1.006346583366394, + "learning_rate": 4.730260375762702e-05, + "loss": 2.645, + "step": 25281 + }, + { + "epoch": 2.290503046363616, + "grad_norm": 1.0288316011428833, + "learning_rate": 4.729656255663626e-05, + "loss": 2.6131, + "step": 25282 + }, + { + "epoch": 2.2905936445380624, + "grad_norm": 1.000516414642334, + "learning_rate": 4.729052135564551e-05, + "loss": 2.6867, + "step": 25283 + }, + { + "epoch": 2.2906842427125094, + "grad_norm": 0.925163745880127, + "learning_rate": 4.728448015465475e-05, + "loss": 1.9256, + "step": 25284 + }, + { + "epoch": 2.2907748408869564, + "grad_norm": 0.9989494681358337, + "learning_rate": 4.727843895366399e-05, + "loss": 2.8556, + "step": 25285 + }, + { + "epoch": 2.290865439061403, + "grad_norm": 0.9937574863433838, + "learning_rate": 4.7272397752673236e-05, + "loss": 2.5746, + "step": 25286 + }, + { + "epoch": 2.2909560372358495, + "grad_norm": 0.9521122574806213, + "learning_rate": 4.726635655168248e-05, + "loss": 2.3637, + "step": 25287 + }, + { + "epoch": 2.2910466354102965, + "grad_norm": 0.984262228012085, + "learning_rate": 4.726031535069172e-05, + "loss": 2.5434, + "step": 25288 + }, + { + "epoch": 2.291137233584743, + "grad_norm": 1.1066380739212036, + "learning_rate": 4.725427414970096e-05, + "loss": 1.9154, + "step": 25289 + }, + { + "epoch": 2.29122783175919, + "grad_norm": 1.0135942697525024, + "learning_rate": 4.7248232948710207e-05, + "loss": 2.7779, + "step": 25290 + }, + { + "epoch": 2.2913184299336367, + "grad_norm": 0.9178845882415771, + "learning_rate": 4.724219174771945e-05, + "loss": 1.9844, + "step": 25291 + }, + { + "epoch": 2.2914090281080837, + "grad_norm": 1.0141881704330444, + "learning_rate": 4.723615054672869e-05, + "loss": 2.4474, + "step": 25292 + }, + { + "epoch": 2.2914996262825302, + "grad_norm": 0.9969872236251831, + "learning_rate": 4.7230109345737936e-05, + "loss": 2.5794, + "step": 25293 + }, + { + "epoch": 2.2915902244569772, + "grad_norm": 1.0478360652923584, + "learning_rate": 4.722406814474718e-05, + "loss": 2.8134, + "step": 25294 + }, + { + "epoch": 2.291680822631424, + "grad_norm": 0.974180281162262, + "learning_rate": 4.7218026943756424e-05, + "loss": 2.7516, + "step": 25295 + }, + { + "epoch": 2.291771420805871, + "grad_norm": 0.962873101234436, + "learning_rate": 4.7211985742765665e-05, + "loss": 2.7527, + "step": 25296 + }, + { + "epoch": 2.2918620189803174, + "grad_norm": 0.9024257659912109, + "learning_rate": 4.7205944541774906e-05, + "loss": 2.0425, + "step": 25297 + }, + { + "epoch": 2.2919526171547644, + "grad_norm": 0.8745099902153015, + "learning_rate": 4.7199903340784153e-05, + "loss": 2.016, + "step": 25298 + }, + { + "epoch": 2.292043215329211, + "grad_norm": 1.0601247549057007, + "learning_rate": 4.7193862139793394e-05, + "loss": 2.7155, + "step": 25299 + }, + { + "epoch": 2.292133813503658, + "grad_norm": 0.9464588165283203, + "learning_rate": 4.7187820938802635e-05, + "loss": 2.779, + "step": 25300 + }, + { + "epoch": 2.2922244116781045, + "grad_norm": 1.1166844367980957, + "learning_rate": 4.718177973781188e-05, + "loss": 2.3113, + "step": 25301 + }, + { + "epoch": 2.2923150098525515, + "grad_norm": 1.1004793643951416, + "learning_rate": 4.7175738536821124e-05, + "loss": 2.4856, + "step": 25302 + }, + { + "epoch": 2.292405608026998, + "grad_norm": 0.9657024145126343, + "learning_rate": 4.7169697335830364e-05, + "loss": 2.7453, + "step": 25303 + }, + { + "epoch": 2.292496206201445, + "grad_norm": 0.9961796402931213, + "learning_rate": 4.7163656134839605e-05, + "loss": 2.5644, + "step": 25304 + }, + { + "epoch": 2.2925868043758917, + "grad_norm": 1.0278924703598022, + "learning_rate": 4.715761493384885e-05, + "loss": 2.7772, + "step": 25305 + }, + { + "epoch": 2.2926774025503387, + "grad_norm": 0.976323664188385, + "learning_rate": 4.7151573732858094e-05, + "loss": 2.7844, + "step": 25306 + }, + { + "epoch": 2.2927680007247853, + "grad_norm": 1.0010766983032227, + "learning_rate": 4.7145532531867334e-05, + "loss": 2.7198, + "step": 25307 + }, + { + "epoch": 2.2928585988992323, + "grad_norm": 0.961388885974884, + "learning_rate": 4.7139491330876575e-05, + "loss": 2.5689, + "step": 25308 + }, + { + "epoch": 2.292949197073679, + "grad_norm": 0.9648423790931702, + "learning_rate": 4.713345012988582e-05, + "loss": 2.6794, + "step": 25309 + }, + { + "epoch": 2.293039795248126, + "grad_norm": 1.0982261896133423, + "learning_rate": 4.7127408928895064e-05, + "loss": 2.4412, + "step": 25310 + }, + { + "epoch": 2.2931303934225724, + "grad_norm": 1.004824161529541, + "learning_rate": 4.712136772790431e-05, + "loss": 2.4479, + "step": 25311 + }, + { + "epoch": 2.2932209915970194, + "grad_norm": 1.044847011566162, + "learning_rate": 4.711532652691355e-05, + "loss": 2.6742, + "step": 25312 + }, + { + "epoch": 2.293311589771466, + "grad_norm": 1.028379201889038, + "learning_rate": 4.71092853259228e-05, + "loss": 2.5675, + "step": 25313 + }, + { + "epoch": 2.293402187945913, + "grad_norm": 1.053221344947815, + "learning_rate": 4.710324412493204e-05, + "loss": 2.4203, + "step": 25314 + }, + { + "epoch": 2.2934927861203596, + "grad_norm": 0.9263385534286499, + "learning_rate": 4.709720292394128e-05, + "loss": 2.5587, + "step": 25315 + }, + { + "epoch": 2.2935833842948066, + "grad_norm": 1.0489743947982788, + "learning_rate": 4.709116172295053e-05, + "loss": 2.3143, + "step": 25316 + }, + { + "epoch": 2.293673982469253, + "grad_norm": 1.0624631643295288, + "learning_rate": 4.708512052195977e-05, + "loss": 2.6666, + "step": 25317 + }, + { + "epoch": 2.2937645806437, + "grad_norm": 0.9938755631446838, + "learning_rate": 4.707907932096901e-05, + "loss": 2.6549, + "step": 25318 + }, + { + "epoch": 2.2938551788181467, + "grad_norm": 1.0229750871658325, + "learning_rate": 4.707303811997825e-05, + "loss": 2.7969, + "step": 25319 + }, + { + "epoch": 2.2939457769925937, + "grad_norm": 1.0103641748428345, + "learning_rate": 4.70669969189875e-05, + "loss": 2.8921, + "step": 25320 + }, + { + "epoch": 2.2940363751670403, + "grad_norm": 1.0184147357940674, + "learning_rate": 4.706095571799674e-05, + "loss": 2.532, + "step": 25321 + }, + { + "epoch": 2.2941269733414873, + "grad_norm": 1.1124602556228638, + "learning_rate": 4.705491451700598e-05, + "loss": 2.3317, + "step": 25322 + }, + { + "epoch": 2.294217571515934, + "grad_norm": 0.8915885090827942, + "learning_rate": 4.704887331601522e-05, + "loss": 2.0291, + "step": 25323 + }, + { + "epoch": 2.294308169690381, + "grad_norm": 0.975458025932312, + "learning_rate": 4.704283211502447e-05, + "loss": 2.5818, + "step": 25324 + }, + { + "epoch": 2.2943987678648274, + "grad_norm": 1.0335689783096313, + "learning_rate": 4.703679091403371e-05, + "loss": 2.7802, + "step": 25325 + }, + { + "epoch": 2.2944893660392744, + "grad_norm": 0.8879290223121643, + "learning_rate": 4.703074971304295e-05, + "loss": 1.7807, + "step": 25326 + }, + { + "epoch": 2.294579964213721, + "grad_norm": 0.8628506064414978, + "learning_rate": 4.70247085120522e-05, + "loss": 1.857, + "step": 25327 + }, + { + "epoch": 2.294670562388168, + "grad_norm": 1.0535597801208496, + "learning_rate": 4.7018667311061446e-05, + "loss": 2.8729, + "step": 25328 + }, + { + "epoch": 2.2947611605626146, + "grad_norm": 1.0429185628890991, + "learning_rate": 4.701262611007069e-05, + "loss": 2.6235, + "step": 25329 + }, + { + "epoch": 2.2948517587370616, + "grad_norm": 1.011780023574829, + "learning_rate": 4.700658490907993e-05, + "loss": 2.6787, + "step": 25330 + }, + { + "epoch": 2.294942356911508, + "grad_norm": 1.1191020011901855, + "learning_rate": 4.7000543708089175e-05, + "loss": 2.4879, + "step": 25331 + }, + { + "epoch": 2.295032955085955, + "grad_norm": 1.0180907249450684, + "learning_rate": 4.6994502507098416e-05, + "loss": 2.792, + "step": 25332 + }, + { + "epoch": 2.2951235532604017, + "grad_norm": 0.958519697189331, + "learning_rate": 4.698846130610766e-05, + "loss": 2.5698, + "step": 25333 + }, + { + "epoch": 2.2952141514348487, + "grad_norm": 1.0821864604949951, + "learning_rate": 4.69824201051169e-05, + "loss": 2.5637, + "step": 25334 + }, + { + "epoch": 2.2953047496092953, + "grad_norm": 1.0405105352401733, + "learning_rate": 4.6976378904126145e-05, + "loss": 2.8436, + "step": 25335 + }, + { + "epoch": 2.2953953477837423, + "grad_norm": 0.9660587906837463, + "learning_rate": 4.6970337703135386e-05, + "loss": 2.5461, + "step": 25336 + }, + { + "epoch": 2.295485945958189, + "grad_norm": 1.1169626712799072, + "learning_rate": 4.696429650214463e-05, + "loss": 2.4582, + "step": 25337 + }, + { + "epoch": 2.295576544132636, + "grad_norm": 1.0219918489456177, + "learning_rate": 4.695825530115387e-05, + "loss": 2.6052, + "step": 25338 + }, + { + "epoch": 2.2956671423070825, + "grad_norm": 0.9456284642219543, + "learning_rate": 4.6952214100163116e-05, + "loss": 2.6351, + "step": 25339 + }, + { + "epoch": 2.2957577404815295, + "grad_norm": 1.0948309898376465, + "learning_rate": 4.6946172899172356e-05, + "loss": 2.5543, + "step": 25340 + }, + { + "epoch": 2.295848338655976, + "grad_norm": 1.0291589498519897, + "learning_rate": 4.69401316981816e-05, + "loss": 2.7873, + "step": 25341 + }, + { + "epoch": 2.295938936830423, + "grad_norm": 1.0048977136611938, + "learning_rate": 4.693409049719084e-05, + "loss": 2.5369, + "step": 25342 + }, + { + "epoch": 2.2960295350048696, + "grad_norm": 1.0403058528900146, + "learning_rate": 4.6928049296200086e-05, + "loss": 2.4296, + "step": 25343 + }, + { + "epoch": 2.2961201331793166, + "grad_norm": 0.9991331696510315, + "learning_rate": 4.692200809520933e-05, + "loss": 2.5525, + "step": 25344 + }, + { + "epoch": 2.296210731353763, + "grad_norm": 0.849392831325531, + "learning_rate": 4.6915966894218574e-05, + "loss": 1.9715, + "step": 25345 + }, + { + "epoch": 2.29630132952821, + "grad_norm": 1.0499378442764282, + "learning_rate": 4.690992569322782e-05, + "loss": 2.7795, + "step": 25346 + }, + { + "epoch": 2.2963919277026568, + "grad_norm": 1.0111980438232422, + "learning_rate": 4.690388449223706e-05, + "loss": 2.7366, + "step": 25347 + }, + { + "epoch": 2.2964825258771038, + "grad_norm": 0.9745079874992371, + "learning_rate": 4.68978432912463e-05, + "loss": 2.6136, + "step": 25348 + }, + { + "epoch": 2.2965731240515503, + "grad_norm": 0.9827948212623596, + "learning_rate": 4.6891802090255544e-05, + "loss": 2.7975, + "step": 25349 + }, + { + "epoch": 2.296663722225997, + "grad_norm": 1.2262300252914429, + "learning_rate": 4.688576088926479e-05, + "loss": 2.4355, + "step": 25350 + }, + { + "epoch": 2.296754320400444, + "grad_norm": 1.052558422088623, + "learning_rate": 4.687971968827403e-05, + "loss": 2.5166, + "step": 25351 + }, + { + "epoch": 2.296844918574891, + "grad_norm": 1.015288233757019, + "learning_rate": 4.6873678487283273e-05, + "loss": 2.7957, + "step": 25352 + }, + { + "epoch": 2.2969355167493375, + "grad_norm": 0.9206778407096863, + "learning_rate": 4.6867637286292514e-05, + "loss": 1.7589, + "step": 25353 + }, + { + "epoch": 2.297026114923784, + "grad_norm": 1.0208323001861572, + "learning_rate": 4.686159608530176e-05, + "loss": 2.4744, + "step": 25354 + }, + { + "epoch": 2.297116713098231, + "grad_norm": 0.9516927003860474, + "learning_rate": 4.6855554884311e-05, + "loss": 2.5412, + "step": 25355 + }, + { + "epoch": 2.297207311272678, + "grad_norm": 1.0189803838729858, + "learning_rate": 4.6849513683320244e-05, + "loss": 2.5798, + "step": 25356 + }, + { + "epoch": 2.2972979094471246, + "grad_norm": 0.9849945902824402, + "learning_rate": 4.6843472482329484e-05, + "loss": 2.7202, + "step": 25357 + }, + { + "epoch": 2.297388507621571, + "grad_norm": 1.003309726715088, + "learning_rate": 4.683743128133873e-05, + "loss": 2.4395, + "step": 25358 + }, + { + "epoch": 2.297479105796018, + "grad_norm": 1.017517328262329, + "learning_rate": 4.683139008034797e-05, + "loss": 2.4642, + "step": 25359 + }, + { + "epoch": 2.2975697039704652, + "grad_norm": 0.880733847618103, + "learning_rate": 4.6825348879357214e-05, + "loss": 1.9686, + "step": 25360 + }, + { + "epoch": 2.297660302144912, + "grad_norm": 0.9524582028388977, + "learning_rate": 4.681930767836646e-05, + "loss": 2.5543, + "step": 25361 + }, + { + "epoch": 2.2977509003193584, + "grad_norm": 1.0116196870803833, + "learning_rate": 4.681326647737571e-05, + "loss": 2.6988, + "step": 25362 + }, + { + "epoch": 2.2978414984938054, + "grad_norm": 1.0373258590698242, + "learning_rate": 4.680722527638495e-05, + "loss": 2.5937, + "step": 25363 + }, + { + "epoch": 2.2979320966682524, + "grad_norm": 1.0101250410079956, + "learning_rate": 4.680118407539419e-05, + "loss": 2.8053, + "step": 25364 + }, + { + "epoch": 2.298022694842699, + "grad_norm": 0.9770486950874329, + "learning_rate": 4.679514287440344e-05, + "loss": 2.3634, + "step": 25365 + }, + { + "epoch": 2.2981132930171455, + "grad_norm": 1.137028694152832, + "learning_rate": 4.678910167341268e-05, + "loss": 2.6348, + "step": 25366 + }, + { + "epoch": 2.2982038911915925, + "grad_norm": 1.030227541923523, + "learning_rate": 4.678306047242192e-05, + "loss": 2.6159, + "step": 25367 + }, + { + "epoch": 2.298294489366039, + "grad_norm": 1.0875247716903687, + "learning_rate": 4.677701927143116e-05, + "loss": 2.3358, + "step": 25368 + }, + { + "epoch": 2.298385087540486, + "grad_norm": 0.9904561638832092, + "learning_rate": 4.677097807044041e-05, + "loss": 2.5091, + "step": 25369 + }, + { + "epoch": 2.2984756857149327, + "grad_norm": 0.9684230089187622, + "learning_rate": 4.676493686944965e-05, + "loss": 2.5582, + "step": 25370 + }, + { + "epoch": 2.2985662838893797, + "grad_norm": 1.0021651983261108, + "learning_rate": 4.675889566845889e-05, + "loss": 2.6654, + "step": 25371 + }, + { + "epoch": 2.2986568820638262, + "grad_norm": 1.1155283451080322, + "learning_rate": 4.675285446746813e-05, + "loss": 2.5532, + "step": 25372 + }, + { + "epoch": 2.2987474802382732, + "grad_norm": 0.9696234464645386, + "learning_rate": 4.674681326647738e-05, + "loss": 2.4985, + "step": 25373 + }, + { + "epoch": 2.29883807841272, + "grad_norm": 0.9807900786399841, + "learning_rate": 4.674077206548662e-05, + "loss": 2.3974, + "step": 25374 + }, + { + "epoch": 2.298928676587167, + "grad_norm": 0.9946906566619873, + "learning_rate": 4.673473086449586e-05, + "loss": 2.539, + "step": 25375 + }, + { + "epoch": 2.2990192747616134, + "grad_norm": 0.9584668874740601, + "learning_rate": 4.672868966350511e-05, + "loss": 2.4935, + "step": 25376 + }, + { + "epoch": 2.2991098729360604, + "grad_norm": 1.0955162048339844, + "learning_rate": 4.672264846251435e-05, + "loss": 2.4847, + "step": 25377 + }, + { + "epoch": 2.299200471110507, + "grad_norm": 1.1530482769012451, + "learning_rate": 4.6716607261523596e-05, + "loss": 2.9319, + "step": 25378 + }, + { + "epoch": 2.299291069284954, + "grad_norm": 1.00719153881073, + "learning_rate": 4.671056606053284e-05, + "loss": 2.8409, + "step": 25379 + }, + { + "epoch": 2.2993816674594005, + "grad_norm": 1.0154930353164673, + "learning_rate": 4.6704524859542084e-05, + "loss": 2.5267, + "step": 25380 + }, + { + "epoch": 2.2994722656338475, + "grad_norm": 1.0589512586593628, + "learning_rate": 4.6698483658551325e-05, + "loss": 2.7141, + "step": 25381 + }, + { + "epoch": 2.299562863808294, + "grad_norm": 0.9296720027923584, + "learning_rate": 4.6692442457560566e-05, + "loss": 2.7903, + "step": 25382 + }, + { + "epoch": 2.299653461982741, + "grad_norm": 0.8994095921516418, + "learning_rate": 4.668640125656981e-05, + "loss": 1.9783, + "step": 25383 + }, + { + "epoch": 2.2997440601571877, + "grad_norm": 0.9916989803314209, + "learning_rate": 4.6680360055579055e-05, + "loss": 2.5818, + "step": 25384 + }, + { + "epoch": 2.2998346583316347, + "grad_norm": 0.8279259204864502, + "learning_rate": 4.6674318854588295e-05, + "loss": 1.9531, + "step": 25385 + }, + { + "epoch": 2.2999252565060813, + "grad_norm": 1.0841777324676514, + "learning_rate": 4.6668277653597536e-05, + "loss": 2.6172, + "step": 25386 + }, + { + "epoch": 2.3000158546805283, + "grad_norm": 1.0497380495071411, + "learning_rate": 4.666223645260678e-05, + "loss": 2.8385, + "step": 25387 + }, + { + "epoch": 2.300106452854975, + "grad_norm": 1.0898971557617188, + "learning_rate": 4.6656195251616025e-05, + "loss": 2.5009, + "step": 25388 + }, + { + "epoch": 2.300197051029422, + "grad_norm": 1.0411947965621948, + "learning_rate": 4.6650154050625265e-05, + "loss": 2.4263, + "step": 25389 + }, + { + "epoch": 2.3002876492038684, + "grad_norm": 1.034700870513916, + "learning_rate": 4.6644112849634506e-05, + "loss": 2.5187, + "step": 25390 + }, + { + "epoch": 2.3003782473783154, + "grad_norm": 0.9891774654388428, + "learning_rate": 4.6638071648643754e-05, + "loss": 2.9066, + "step": 25391 + }, + { + "epoch": 2.300468845552762, + "grad_norm": 1.0530842542648315, + "learning_rate": 4.6632030447652995e-05, + "loss": 2.6768, + "step": 25392 + }, + { + "epoch": 2.300559443727209, + "grad_norm": 1.0359277725219727, + "learning_rate": 4.6625989246662236e-05, + "loss": 2.6295, + "step": 25393 + }, + { + "epoch": 2.3006500419016556, + "grad_norm": 1.1196001768112183, + "learning_rate": 4.661994804567148e-05, + "loss": 2.533, + "step": 25394 + }, + { + "epoch": 2.3007406400761026, + "grad_norm": 1.054915189743042, + "learning_rate": 4.6613906844680724e-05, + "loss": 2.6175, + "step": 25395 + }, + { + "epoch": 2.300831238250549, + "grad_norm": 1.0322890281677246, + "learning_rate": 4.660786564368997e-05, + "loss": 2.6538, + "step": 25396 + }, + { + "epoch": 2.300921836424996, + "grad_norm": 1.0905481576919556, + "learning_rate": 4.660182444269921e-05, + "loss": 2.5321, + "step": 25397 + }, + { + "epoch": 2.3010124345994427, + "grad_norm": 1.082579493522644, + "learning_rate": 4.659578324170845e-05, + "loss": 2.6208, + "step": 25398 + }, + { + "epoch": 2.3011030327738897, + "grad_norm": 1.021873950958252, + "learning_rate": 4.65897420407177e-05, + "loss": 2.6181, + "step": 25399 + }, + { + "epoch": 2.3011936309483363, + "grad_norm": 1.0772895812988281, + "learning_rate": 4.658370083972694e-05, + "loss": 2.6654, + "step": 25400 + }, + { + "epoch": 2.3012842291227833, + "grad_norm": 1.052848219871521, + "learning_rate": 4.657765963873618e-05, + "loss": 2.4124, + "step": 25401 + }, + { + "epoch": 2.30137482729723, + "grad_norm": 1.03011953830719, + "learning_rate": 4.657161843774542e-05, + "loss": 2.505, + "step": 25402 + }, + { + "epoch": 2.301465425471677, + "grad_norm": 1.1280027627944946, + "learning_rate": 4.656557723675467e-05, + "loss": 2.7613, + "step": 25403 + }, + { + "epoch": 2.3015560236461234, + "grad_norm": 0.9797302484512329, + "learning_rate": 4.655953603576391e-05, + "loss": 2.7255, + "step": 25404 + }, + { + "epoch": 2.3016466218205704, + "grad_norm": 1.0576173067092896, + "learning_rate": 4.655349483477315e-05, + "loss": 2.6785, + "step": 25405 + }, + { + "epoch": 2.301737219995017, + "grad_norm": 1.0160417556762695, + "learning_rate": 4.65474536337824e-05, + "loss": 2.683, + "step": 25406 + }, + { + "epoch": 2.301827818169464, + "grad_norm": 1.0038954019546509, + "learning_rate": 4.654141243279164e-05, + "loss": 2.6278, + "step": 25407 + }, + { + "epoch": 2.3019184163439106, + "grad_norm": 0.918408989906311, + "learning_rate": 4.653537123180088e-05, + "loss": 1.9691, + "step": 25408 + }, + { + "epoch": 2.3020090145183576, + "grad_norm": 0.9690179228782654, + "learning_rate": 4.652933003081012e-05, + "loss": 2.7494, + "step": 25409 + }, + { + "epoch": 2.302099612692804, + "grad_norm": 1.0512590408325195, + "learning_rate": 4.652328882981937e-05, + "loss": 2.9518, + "step": 25410 + }, + { + "epoch": 2.302190210867251, + "grad_norm": 1.0038628578186035, + "learning_rate": 4.651724762882861e-05, + "loss": 2.7748, + "step": 25411 + }, + { + "epoch": 2.3022808090416977, + "grad_norm": 1.063427448272705, + "learning_rate": 4.651120642783786e-05, + "loss": 2.467, + "step": 25412 + }, + { + "epoch": 2.3023714072161447, + "grad_norm": 0.9848437905311584, + "learning_rate": 4.65051652268471e-05, + "loss": 2.5697, + "step": 25413 + }, + { + "epoch": 2.3024620053905913, + "grad_norm": 1.0148794651031494, + "learning_rate": 4.649912402585635e-05, + "loss": 2.6426, + "step": 25414 + }, + { + "epoch": 2.3025526035650383, + "grad_norm": 1.0053801536560059, + "learning_rate": 4.649308282486559e-05, + "loss": 2.6779, + "step": 25415 + }, + { + "epoch": 2.302643201739485, + "grad_norm": 0.9766883850097656, + "learning_rate": 4.648704162387483e-05, + "loss": 2.6367, + "step": 25416 + }, + { + "epoch": 2.302733799913932, + "grad_norm": 0.9950931072235107, + "learning_rate": 4.648100042288407e-05, + "loss": 2.5789, + "step": 25417 + }, + { + "epoch": 2.3028243980883785, + "grad_norm": 1.0295952558517456, + "learning_rate": 4.647495922189332e-05, + "loss": 2.5354, + "step": 25418 + }, + { + "epoch": 2.3029149962628255, + "grad_norm": 0.9505146145820618, + "learning_rate": 4.646891802090256e-05, + "loss": 2.3961, + "step": 25419 + }, + { + "epoch": 2.303005594437272, + "grad_norm": 1.040769338607788, + "learning_rate": 4.64628768199118e-05, + "loss": 2.8066, + "step": 25420 + }, + { + "epoch": 2.303096192611719, + "grad_norm": 0.9678570628166199, + "learning_rate": 4.6456835618921047e-05, + "loss": 2.5743, + "step": 25421 + }, + { + "epoch": 2.3031867907861656, + "grad_norm": 1.0552064180374146, + "learning_rate": 4.645079441793029e-05, + "loss": 2.4735, + "step": 25422 + }, + { + "epoch": 2.3032773889606126, + "grad_norm": 1.0058119297027588, + "learning_rate": 4.644475321693953e-05, + "loss": 2.7499, + "step": 25423 + }, + { + "epoch": 2.303367987135059, + "grad_norm": 1.0273946523666382, + "learning_rate": 4.643871201594877e-05, + "loss": 2.7728, + "step": 25424 + }, + { + "epoch": 2.303458585309506, + "grad_norm": 1.109407663345337, + "learning_rate": 4.643267081495802e-05, + "loss": 2.3983, + "step": 25425 + }, + { + "epoch": 2.3035491834839528, + "grad_norm": 0.9409080147743225, + "learning_rate": 4.642662961396726e-05, + "loss": 2.554, + "step": 25426 + }, + { + "epoch": 2.3036397816583998, + "grad_norm": 0.9657700061798096, + "learning_rate": 4.64205884129765e-05, + "loss": 2.5439, + "step": 25427 + }, + { + "epoch": 2.3037303798328463, + "grad_norm": 1.0247740745544434, + "learning_rate": 4.6414547211985746e-05, + "loss": 2.6579, + "step": 25428 + }, + { + "epoch": 2.303820978007293, + "grad_norm": 0.9944895505905151, + "learning_rate": 4.640850601099499e-05, + "loss": 2.768, + "step": 25429 + }, + { + "epoch": 2.30391157618174, + "grad_norm": 1.0041543245315552, + "learning_rate": 4.6402464810004234e-05, + "loss": 2.5106, + "step": 25430 + }, + { + "epoch": 2.304002174356187, + "grad_norm": 1.0238856077194214, + "learning_rate": 4.6396423609013475e-05, + "loss": 2.7693, + "step": 25431 + }, + { + "epoch": 2.3040927725306335, + "grad_norm": 1.1184378862380981, + "learning_rate": 4.6390382408022716e-05, + "loss": 2.6986, + "step": 25432 + }, + { + "epoch": 2.30418337070508, + "grad_norm": 0.9422255158424377, + "learning_rate": 4.6384341207031964e-05, + "loss": 2.5859, + "step": 25433 + }, + { + "epoch": 2.304273968879527, + "grad_norm": 0.8943067193031311, + "learning_rate": 4.6378300006041204e-05, + "loss": 1.927, + "step": 25434 + }, + { + "epoch": 2.304364567053974, + "grad_norm": 0.9579656720161438, + "learning_rate": 4.6372258805050445e-05, + "loss": 2.468, + "step": 25435 + }, + { + "epoch": 2.3044551652284206, + "grad_norm": 0.9704015254974365, + "learning_rate": 4.636621760405969e-05, + "loss": 2.7803, + "step": 25436 + }, + { + "epoch": 2.304545763402867, + "grad_norm": 1.1157561540603638, + "learning_rate": 4.6360176403068934e-05, + "loss": 2.5307, + "step": 25437 + }, + { + "epoch": 2.304636361577314, + "grad_norm": 0.9885797500610352, + "learning_rate": 4.6354135202078175e-05, + "loss": 2.4054, + "step": 25438 + }, + { + "epoch": 2.3047269597517612, + "grad_norm": 0.9677466750144958, + "learning_rate": 4.6348094001087415e-05, + "loss": 2.4665, + "step": 25439 + }, + { + "epoch": 2.304817557926208, + "grad_norm": 1.085123062133789, + "learning_rate": 4.634205280009666e-05, + "loss": 2.564, + "step": 25440 + }, + { + "epoch": 2.3049081561006544, + "grad_norm": 1.0995409488677979, + "learning_rate": 4.6336011599105904e-05, + "loss": 2.5259, + "step": 25441 + }, + { + "epoch": 2.3049987542751014, + "grad_norm": 0.8330086469650269, + "learning_rate": 4.6329970398115145e-05, + "loss": 1.973, + "step": 25442 + }, + { + "epoch": 2.3050893524495484, + "grad_norm": 0.9086726903915405, + "learning_rate": 4.6323929197124385e-05, + "loss": 2.2024, + "step": 25443 + }, + { + "epoch": 2.305179950623995, + "grad_norm": 1.130287528038025, + "learning_rate": 4.631788799613363e-05, + "loss": 2.6132, + "step": 25444 + }, + { + "epoch": 2.3052705487984415, + "grad_norm": 1.010349154472351, + "learning_rate": 4.6311846795142874e-05, + "loss": 2.7022, + "step": 25445 + }, + { + "epoch": 2.3053611469728885, + "grad_norm": 1.0198415517807007, + "learning_rate": 4.630580559415212e-05, + "loss": 2.5668, + "step": 25446 + }, + { + "epoch": 2.3054517451473355, + "grad_norm": 1.0402525663375854, + "learning_rate": 4.629976439316136e-05, + "loss": 2.6309, + "step": 25447 + }, + { + "epoch": 2.305542343321782, + "grad_norm": 1.0705960988998413, + "learning_rate": 4.629372319217061e-05, + "loss": 2.7127, + "step": 25448 + }, + { + "epoch": 2.3056329414962287, + "grad_norm": 1.149167537689209, + "learning_rate": 4.628768199117985e-05, + "loss": 2.4565, + "step": 25449 + }, + { + "epoch": 2.3057235396706757, + "grad_norm": 1.0198248624801636, + "learning_rate": 4.628164079018909e-05, + "loss": 2.6322, + "step": 25450 + }, + { + "epoch": 2.3058141378451222, + "grad_norm": 1.1621124744415283, + "learning_rate": 4.627559958919834e-05, + "loss": 2.719, + "step": 25451 + }, + { + "epoch": 2.3059047360195692, + "grad_norm": 0.9914981722831726, + "learning_rate": 4.626955838820758e-05, + "loss": 1.8961, + "step": 25452 + }, + { + "epoch": 2.305995334194016, + "grad_norm": 0.9655652642250061, + "learning_rate": 4.626351718721682e-05, + "loss": 2.6551, + "step": 25453 + }, + { + "epoch": 2.306085932368463, + "grad_norm": 0.9916089177131653, + "learning_rate": 4.625747598622606e-05, + "loss": 2.6362, + "step": 25454 + }, + { + "epoch": 2.3061765305429094, + "grad_norm": 1.0764044523239136, + "learning_rate": 4.625143478523531e-05, + "loss": 2.3957, + "step": 25455 + }, + { + "epoch": 2.3062671287173564, + "grad_norm": 0.9319265484809875, + "learning_rate": 4.624539358424455e-05, + "loss": 2.5605, + "step": 25456 + }, + { + "epoch": 2.306357726891803, + "grad_norm": 1.0028140544891357, + "learning_rate": 4.623935238325379e-05, + "loss": 2.9168, + "step": 25457 + }, + { + "epoch": 2.30644832506625, + "grad_norm": 1.0589962005615234, + "learning_rate": 4.623331118226303e-05, + "loss": 2.6587, + "step": 25458 + }, + { + "epoch": 2.3065389232406965, + "grad_norm": 0.827878475189209, + "learning_rate": 4.622726998127228e-05, + "loss": 2.0105, + "step": 25459 + }, + { + "epoch": 2.3066295214151435, + "grad_norm": 1.0368341207504272, + "learning_rate": 4.622122878028152e-05, + "loss": 2.8015, + "step": 25460 + }, + { + "epoch": 2.30672011958959, + "grad_norm": 0.9787810444831848, + "learning_rate": 4.621518757929076e-05, + "loss": 2.6801, + "step": 25461 + }, + { + "epoch": 2.306810717764037, + "grad_norm": 1.0645915269851685, + "learning_rate": 4.620914637830001e-05, + "loss": 2.5392, + "step": 25462 + }, + { + "epoch": 2.3069013159384837, + "grad_norm": 1.0818742513656616, + "learning_rate": 4.620310517730925e-05, + "loss": 2.5665, + "step": 25463 + }, + { + "epoch": 2.3069919141129307, + "grad_norm": 1.023847222328186, + "learning_rate": 4.61970639763185e-05, + "loss": 2.7779, + "step": 25464 + }, + { + "epoch": 2.3070825122873773, + "grad_norm": 0.9995673298835754, + "learning_rate": 4.619102277532774e-05, + "loss": 2.6122, + "step": 25465 + }, + { + "epoch": 2.3071731104618243, + "grad_norm": 1.0317010879516602, + "learning_rate": 4.6184981574336986e-05, + "loss": 2.7841, + "step": 25466 + }, + { + "epoch": 2.307263708636271, + "grad_norm": 0.8201394081115723, + "learning_rate": 4.6178940373346226e-05, + "loss": 1.8163, + "step": 25467 + }, + { + "epoch": 2.307354306810718, + "grad_norm": 0.9837002158164978, + "learning_rate": 4.617289917235547e-05, + "loss": 2.5988, + "step": 25468 + }, + { + "epoch": 2.3074449049851644, + "grad_norm": 0.9901866316795349, + "learning_rate": 4.616685797136471e-05, + "loss": 2.6249, + "step": 25469 + }, + { + "epoch": 2.3075355031596114, + "grad_norm": 1.0545239448547363, + "learning_rate": 4.6160816770373956e-05, + "loss": 2.7126, + "step": 25470 + }, + { + "epoch": 2.307626101334058, + "grad_norm": 1.0452958345413208, + "learning_rate": 4.6154775569383196e-05, + "loss": 2.6413, + "step": 25471 + }, + { + "epoch": 2.307716699508505, + "grad_norm": 0.9440921545028687, + "learning_rate": 4.614873436839244e-05, + "loss": 2.389, + "step": 25472 + }, + { + "epoch": 2.3078072976829516, + "grad_norm": 1.0393967628479004, + "learning_rate": 4.614269316740168e-05, + "loss": 2.8376, + "step": 25473 + }, + { + "epoch": 2.3078978958573986, + "grad_norm": 0.8851323127746582, + "learning_rate": 4.6136651966410926e-05, + "loss": 1.8257, + "step": 25474 + }, + { + "epoch": 2.307988494031845, + "grad_norm": 0.9216623902320862, + "learning_rate": 4.6130610765420167e-05, + "loss": 1.9257, + "step": 25475 + }, + { + "epoch": 2.308079092206292, + "grad_norm": 0.9187763929367065, + "learning_rate": 4.612456956442941e-05, + "loss": 1.8121, + "step": 25476 + }, + { + "epoch": 2.3081696903807387, + "grad_norm": 0.8339969515800476, + "learning_rate": 4.611852836343865e-05, + "loss": 1.9291, + "step": 25477 + }, + { + "epoch": 2.3082602885551857, + "grad_norm": 0.9865372776985168, + "learning_rate": 4.6112487162447896e-05, + "loss": 2.5314, + "step": 25478 + }, + { + "epoch": 2.3083508867296323, + "grad_norm": 0.9756929874420166, + "learning_rate": 4.610644596145714e-05, + "loss": 2.3652, + "step": 25479 + }, + { + "epoch": 2.3084414849040793, + "grad_norm": 0.9283737540245056, + "learning_rate": 4.6100404760466384e-05, + "loss": 2.6575, + "step": 25480 + }, + { + "epoch": 2.308532083078526, + "grad_norm": 0.960020899772644, + "learning_rate": 4.6094363559475625e-05, + "loss": 2.6061, + "step": 25481 + }, + { + "epoch": 2.308622681252973, + "grad_norm": 1.0780084133148193, + "learning_rate": 4.608832235848487e-05, + "loss": 2.4497, + "step": 25482 + }, + { + "epoch": 2.3087132794274194, + "grad_norm": 0.9589098691940308, + "learning_rate": 4.6082281157494114e-05, + "loss": 2.5424, + "step": 25483 + }, + { + "epoch": 2.3088038776018664, + "grad_norm": 1.0514105558395386, + "learning_rate": 4.6076239956503354e-05, + "loss": 2.5398, + "step": 25484 + }, + { + "epoch": 2.308894475776313, + "grad_norm": 1.0160826444625854, + "learning_rate": 4.60701987555126e-05, + "loss": 2.6374, + "step": 25485 + }, + { + "epoch": 2.30898507395076, + "grad_norm": 0.9549992084503174, + "learning_rate": 4.606415755452184e-05, + "loss": 2.6814, + "step": 25486 + }, + { + "epoch": 2.3090756721252066, + "grad_norm": 1.0563851594924927, + "learning_rate": 4.6058116353531084e-05, + "loss": 2.687, + "step": 25487 + }, + { + "epoch": 2.3091662702996536, + "grad_norm": 0.9540027379989624, + "learning_rate": 4.6052075152540324e-05, + "loss": 1.837, + "step": 25488 + }, + { + "epoch": 2.3092568684741, + "grad_norm": 1.0346925258636475, + "learning_rate": 4.604603395154957e-05, + "loss": 2.9119, + "step": 25489 + }, + { + "epoch": 2.309347466648547, + "grad_norm": 1.0292716026306152, + "learning_rate": 4.603999275055881e-05, + "loss": 2.6941, + "step": 25490 + }, + { + "epoch": 2.3094380648229937, + "grad_norm": 1.0420433282852173, + "learning_rate": 4.6033951549568054e-05, + "loss": 2.6823, + "step": 25491 + }, + { + "epoch": 2.3095286629974408, + "grad_norm": 1.0087506771087646, + "learning_rate": 4.6027910348577295e-05, + "loss": 2.71, + "step": 25492 + }, + { + "epoch": 2.3096192611718873, + "grad_norm": 0.8864173889160156, + "learning_rate": 4.602186914758654e-05, + "loss": 2.0907, + "step": 25493 + }, + { + "epoch": 2.3097098593463343, + "grad_norm": 1.0178327560424805, + "learning_rate": 4.601582794659578e-05, + "loss": 2.5105, + "step": 25494 + }, + { + "epoch": 2.309800457520781, + "grad_norm": 0.9238826632499695, + "learning_rate": 4.6009786745605024e-05, + "loss": 2.5372, + "step": 25495 + }, + { + "epoch": 2.309891055695228, + "grad_norm": 1.0495141744613647, + "learning_rate": 4.600374554461427e-05, + "loss": 2.2337, + "step": 25496 + }, + { + "epoch": 2.3099816538696745, + "grad_norm": 0.9197319149971008, + "learning_rate": 4.599770434362351e-05, + "loss": 2.2885, + "step": 25497 + }, + { + "epoch": 2.3100722520441215, + "grad_norm": 1.0172758102416992, + "learning_rate": 4.599166314263276e-05, + "loss": 2.6073, + "step": 25498 + }, + { + "epoch": 2.310162850218568, + "grad_norm": 0.9808997511863708, + "learning_rate": 4.5985621941642e-05, + "loss": 2.4875, + "step": 25499 + }, + { + "epoch": 2.310253448393015, + "grad_norm": 1.0964553356170654, + "learning_rate": 4.597958074065125e-05, + "loss": 2.6197, + "step": 25500 + }, + { + "epoch": 2.3103440465674616, + "grad_norm": 0.9784706830978394, + "learning_rate": 4.597353953966049e-05, + "loss": 2.5904, + "step": 25501 + }, + { + "epoch": 2.3104346447419086, + "grad_norm": 0.8535811305046082, + "learning_rate": 4.596749833866973e-05, + "loss": 1.7512, + "step": 25502 + }, + { + "epoch": 2.310525242916355, + "grad_norm": 1.0073562860488892, + "learning_rate": 4.596145713767897e-05, + "loss": 2.8434, + "step": 25503 + }, + { + "epoch": 2.310615841090802, + "grad_norm": 0.8281400203704834, + "learning_rate": 4.595541593668822e-05, + "loss": 1.8944, + "step": 25504 + }, + { + "epoch": 2.3107064392652488, + "grad_norm": 0.9818280935287476, + "learning_rate": 4.594937473569746e-05, + "loss": 2.7994, + "step": 25505 + }, + { + "epoch": 2.310797037439696, + "grad_norm": 0.9747769832611084, + "learning_rate": 4.59433335347067e-05, + "loss": 2.3502, + "step": 25506 + }, + { + "epoch": 2.3108876356141423, + "grad_norm": 1.0228320360183716, + "learning_rate": 4.593729233371594e-05, + "loss": 2.7023, + "step": 25507 + }, + { + "epoch": 2.3109782337885894, + "grad_norm": 0.9956398010253906, + "learning_rate": 4.593125113272519e-05, + "loss": 2.5773, + "step": 25508 + }, + { + "epoch": 2.311068831963036, + "grad_norm": 1.0464816093444824, + "learning_rate": 4.592520993173443e-05, + "loss": 2.6539, + "step": 25509 + }, + { + "epoch": 2.311159430137483, + "grad_norm": 1.025909423828125, + "learning_rate": 4.591916873074367e-05, + "loss": 2.7548, + "step": 25510 + }, + { + "epoch": 2.3112500283119295, + "grad_norm": 1.0327881574630737, + "learning_rate": 4.591312752975292e-05, + "loss": 2.537, + "step": 25511 + }, + { + "epoch": 2.311340626486376, + "grad_norm": 0.9605927467346191, + "learning_rate": 4.590708632876216e-05, + "loss": 2.7112, + "step": 25512 + }, + { + "epoch": 2.311431224660823, + "grad_norm": 0.9956749677658081, + "learning_rate": 4.59010451277714e-05, + "loss": 2.3049, + "step": 25513 + }, + { + "epoch": 2.31152182283527, + "grad_norm": 0.9579891562461853, + "learning_rate": 4.589500392678065e-05, + "loss": 2.4788, + "step": 25514 + }, + { + "epoch": 2.3116124210097166, + "grad_norm": 1.0212938785552979, + "learning_rate": 4.5888962725789895e-05, + "loss": 2.6279, + "step": 25515 + }, + { + "epoch": 2.311703019184163, + "grad_norm": 1.0648843050003052, + "learning_rate": 4.5882921524799135e-05, + "loss": 2.5283, + "step": 25516 + }, + { + "epoch": 2.31179361735861, + "grad_norm": 1.0348693132400513, + "learning_rate": 4.5876880323808376e-05, + "loss": 2.4057, + "step": 25517 + }, + { + "epoch": 2.3118842155330572, + "grad_norm": 1.0302939414978027, + "learning_rate": 4.587083912281762e-05, + "loss": 2.8575, + "step": 25518 + }, + { + "epoch": 2.311974813707504, + "grad_norm": 0.9408901333808899, + "learning_rate": 4.5864797921826865e-05, + "loss": 2.6004, + "step": 25519 + }, + { + "epoch": 2.3120654118819504, + "grad_norm": 0.8798683881759644, + "learning_rate": 4.5858756720836106e-05, + "loss": 2.0411, + "step": 25520 + }, + { + "epoch": 2.3121560100563974, + "grad_norm": 0.9903228878974915, + "learning_rate": 4.5852715519845346e-05, + "loss": 2.692, + "step": 25521 + }, + { + "epoch": 2.3122466082308444, + "grad_norm": 1.0950844287872314, + "learning_rate": 4.584667431885459e-05, + "loss": 2.6047, + "step": 25522 + }, + { + "epoch": 2.312337206405291, + "grad_norm": 1.0674494504928589, + "learning_rate": 4.5840633117863835e-05, + "loss": 2.5374, + "step": 25523 + }, + { + "epoch": 2.3124278045797375, + "grad_norm": 0.9416976571083069, + "learning_rate": 4.5834591916873076e-05, + "loss": 2.6146, + "step": 25524 + }, + { + "epoch": 2.3125184027541845, + "grad_norm": 1.0085583925247192, + "learning_rate": 4.5828550715882316e-05, + "loss": 2.7443, + "step": 25525 + }, + { + "epoch": 2.3126090009286315, + "grad_norm": 1.0018692016601562, + "learning_rate": 4.5822509514891564e-05, + "loss": 2.3837, + "step": 25526 + }, + { + "epoch": 2.312699599103078, + "grad_norm": 1.0430307388305664, + "learning_rate": 4.5816468313900805e-05, + "loss": 2.5208, + "step": 25527 + }, + { + "epoch": 2.3127901972775247, + "grad_norm": 0.9469988346099854, + "learning_rate": 4.5810427112910046e-05, + "loss": 2.572, + "step": 25528 + }, + { + "epoch": 2.3128807954519717, + "grad_norm": 1.065376877784729, + "learning_rate": 4.5804385911919287e-05, + "loss": 2.9182, + "step": 25529 + }, + { + "epoch": 2.3129713936264182, + "grad_norm": 0.9663668870925903, + "learning_rate": 4.5798344710928534e-05, + "loss": 2.7958, + "step": 25530 + }, + { + "epoch": 2.3130619918008652, + "grad_norm": 1.029371738433838, + "learning_rate": 4.579230350993778e-05, + "loss": 2.6365, + "step": 25531 + }, + { + "epoch": 2.313152589975312, + "grad_norm": 1.1016902923583984, + "learning_rate": 4.578626230894702e-05, + "loss": 2.6217, + "step": 25532 + }, + { + "epoch": 2.313243188149759, + "grad_norm": 0.982734739780426, + "learning_rate": 4.5780221107956263e-05, + "loss": 2.5104, + "step": 25533 + }, + { + "epoch": 2.3133337863242054, + "grad_norm": 1.038835883140564, + "learning_rate": 4.577417990696551e-05, + "loss": 2.8037, + "step": 25534 + }, + { + "epoch": 2.3134243844986524, + "grad_norm": 1.0145342350006104, + "learning_rate": 4.576813870597475e-05, + "loss": 2.3202, + "step": 25535 + }, + { + "epoch": 2.313514982673099, + "grad_norm": 0.9601467847824097, + "learning_rate": 4.576209750498399e-05, + "loss": 2.8964, + "step": 25536 + }, + { + "epoch": 2.313605580847546, + "grad_norm": 1.1495014429092407, + "learning_rate": 4.5756056303993234e-05, + "loss": 2.6578, + "step": 25537 + }, + { + "epoch": 2.3136961790219925, + "grad_norm": 1.0643033981323242, + "learning_rate": 4.575001510300248e-05, + "loss": 2.6301, + "step": 25538 + }, + { + "epoch": 2.3137867771964395, + "grad_norm": 0.9767448306083679, + "learning_rate": 4.574397390201172e-05, + "loss": 2.4236, + "step": 25539 + }, + { + "epoch": 2.313877375370886, + "grad_norm": 0.9993828535079956, + "learning_rate": 4.573793270102096e-05, + "loss": 2.4748, + "step": 25540 + }, + { + "epoch": 2.313967973545333, + "grad_norm": 1.0616955757141113, + "learning_rate": 4.573189150003021e-05, + "loss": 2.6861, + "step": 25541 + }, + { + "epoch": 2.3140585717197797, + "grad_norm": 1.1348615884780884, + "learning_rate": 4.572585029903945e-05, + "loss": 2.4275, + "step": 25542 + }, + { + "epoch": 2.3141491698942267, + "grad_norm": 1.0195785760879517, + "learning_rate": 4.571980909804869e-05, + "loss": 2.7781, + "step": 25543 + }, + { + "epoch": 2.3142397680686733, + "grad_norm": 1.031156301498413, + "learning_rate": 4.571376789705793e-05, + "loss": 2.7526, + "step": 25544 + }, + { + "epoch": 2.3143303662431203, + "grad_norm": 1.0023436546325684, + "learning_rate": 4.570772669606718e-05, + "loss": 2.3814, + "step": 25545 + }, + { + "epoch": 2.314420964417567, + "grad_norm": 0.9531578421592712, + "learning_rate": 4.570168549507642e-05, + "loss": 2.521, + "step": 25546 + }, + { + "epoch": 2.314511562592014, + "grad_norm": 0.6760162711143494, + "learning_rate": 4.569564429408566e-05, + "loss": 1.1214, + "step": 25547 + }, + { + "epoch": 2.3146021607664604, + "grad_norm": 0.9980786442756653, + "learning_rate": 4.568960309309491e-05, + "loss": 2.5687, + "step": 25548 + }, + { + "epoch": 2.3146927589409074, + "grad_norm": 1.0208203792572021, + "learning_rate": 4.568356189210416e-05, + "loss": 2.628, + "step": 25549 + }, + { + "epoch": 2.314783357115354, + "grad_norm": 1.0922470092773438, + "learning_rate": 4.56775206911134e-05, + "loss": 2.481, + "step": 25550 + }, + { + "epoch": 2.314873955289801, + "grad_norm": 0.9933054447174072, + "learning_rate": 4.567147949012264e-05, + "loss": 2.6768, + "step": 25551 + }, + { + "epoch": 2.3149645534642476, + "grad_norm": 1.051722526550293, + "learning_rate": 4.5665438289131887e-05, + "loss": 2.7907, + "step": 25552 + }, + { + "epoch": 2.3150551516386946, + "grad_norm": 0.9971711039543152, + "learning_rate": 4.565939708814113e-05, + "loss": 3.0612, + "step": 25553 + }, + { + "epoch": 2.315145749813141, + "grad_norm": 0.8647014498710632, + "learning_rate": 4.565335588715037e-05, + "loss": 1.7281, + "step": 25554 + }, + { + "epoch": 2.315236347987588, + "grad_norm": 1.044979453086853, + "learning_rate": 4.564731468615961e-05, + "loss": 2.5998, + "step": 25555 + }, + { + "epoch": 2.3153269461620347, + "grad_norm": 1.1030486822128296, + "learning_rate": 4.564127348516886e-05, + "loss": 2.5911, + "step": 25556 + }, + { + "epoch": 2.3154175443364817, + "grad_norm": 1.0117645263671875, + "learning_rate": 4.56352322841781e-05, + "loss": 2.5898, + "step": 25557 + }, + { + "epoch": 2.3155081425109283, + "grad_norm": 1.094866394996643, + "learning_rate": 4.562919108318734e-05, + "loss": 2.5994, + "step": 25558 + }, + { + "epoch": 2.3155987406853753, + "grad_norm": 1.0379430055618286, + "learning_rate": 4.562314988219658e-05, + "loss": 2.7689, + "step": 25559 + }, + { + "epoch": 2.315689338859822, + "grad_norm": 1.0142688751220703, + "learning_rate": 4.561710868120583e-05, + "loss": 2.4188, + "step": 25560 + }, + { + "epoch": 2.315779937034269, + "grad_norm": 0.9196786284446716, + "learning_rate": 4.561106748021507e-05, + "loss": 2.6674, + "step": 25561 + }, + { + "epoch": 2.3158705352087154, + "grad_norm": 0.978605329990387, + "learning_rate": 4.560502627922431e-05, + "loss": 2.7593, + "step": 25562 + }, + { + "epoch": 2.3159611333831625, + "grad_norm": 0.9805887937545776, + "learning_rate": 4.559898507823355e-05, + "loss": 2.5916, + "step": 25563 + }, + { + "epoch": 2.316051731557609, + "grad_norm": 1.0993696451187134, + "learning_rate": 4.55929438772428e-05, + "loss": 2.5495, + "step": 25564 + }, + { + "epoch": 2.316142329732056, + "grad_norm": 1.039554238319397, + "learning_rate": 4.5586902676252044e-05, + "loss": 2.6912, + "step": 25565 + }, + { + "epoch": 2.3162329279065026, + "grad_norm": 0.9424550533294678, + "learning_rate": 4.5580861475261285e-05, + "loss": 2.1021, + "step": 25566 + }, + { + "epoch": 2.3163235260809496, + "grad_norm": 0.9149084687232971, + "learning_rate": 4.557482027427053e-05, + "loss": 2.5245, + "step": 25567 + }, + { + "epoch": 2.316414124255396, + "grad_norm": 0.990487813949585, + "learning_rate": 4.5568779073279774e-05, + "loss": 2.59, + "step": 25568 + }, + { + "epoch": 2.316504722429843, + "grad_norm": 1.0611110925674438, + "learning_rate": 4.5562737872289015e-05, + "loss": 2.5697, + "step": 25569 + }, + { + "epoch": 2.3165953206042897, + "grad_norm": 1.0259848833084106, + "learning_rate": 4.5556696671298255e-05, + "loss": 2.7783, + "step": 25570 + }, + { + "epoch": 2.3166859187787368, + "grad_norm": 0.9721906185150146, + "learning_rate": 4.55506554703075e-05, + "loss": 2.5418, + "step": 25571 + }, + { + "epoch": 2.3167765169531833, + "grad_norm": 0.8248410224914551, + "learning_rate": 4.5544614269316744e-05, + "loss": 1.9556, + "step": 25572 + }, + { + "epoch": 2.3168671151276303, + "grad_norm": 1.140650987625122, + "learning_rate": 4.5538573068325985e-05, + "loss": 2.8285, + "step": 25573 + }, + { + "epoch": 2.316957713302077, + "grad_norm": 0.8644620776176453, + "learning_rate": 4.5532531867335226e-05, + "loss": 1.9221, + "step": 25574 + }, + { + "epoch": 2.317048311476524, + "grad_norm": 0.9638432264328003, + "learning_rate": 4.552649066634447e-05, + "loss": 2.6479, + "step": 25575 + }, + { + "epoch": 2.3171389096509705, + "grad_norm": 1.2285711765289307, + "learning_rate": 4.5520449465353714e-05, + "loss": 2.2016, + "step": 25576 + }, + { + "epoch": 2.3172295078254175, + "grad_norm": 0.9679743647575378, + "learning_rate": 4.5514408264362955e-05, + "loss": 2.6669, + "step": 25577 + }, + { + "epoch": 2.317320105999864, + "grad_norm": 1.0355303287506104, + "learning_rate": 4.5508367063372196e-05, + "loss": 2.6651, + "step": 25578 + }, + { + "epoch": 2.317410704174311, + "grad_norm": 1.0255242586135864, + "learning_rate": 4.550232586238144e-05, + "loss": 2.3683, + "step": 25579 + }, + { + "epoch": 2.3175013023487576, + "grad_norm": 1.0770725011825562, + "learning_rate": 4.5496284661390684e-05, + "loss": 2.3712, + "step": 25580 + }, + { + "epoch": 2.3175919005232046, + "grad_norm": 1.004807710647583, + "learning_rate": 4.549024346039993e-05, + "loss": 2.7187, + "step": 25581 + }, + { + "epoch": 2.317682498697651, + "grad_norm": 0.9960920810699463, + "learning_rate": 4.548420225940917e-05, + "loss": 2.615, + "step": 25582 + }, + { + "epoch": 2.317773096872098, + "grad_norm": 1.003029704093933, + "learning_rate": 4.547816105841842e-05, + "loss": 2.5851, + "step": 25583 + }, + { + "epoch": 2.3178636950465448, + "grad_norm": 0.9464636445045471, + "learning_rate": 4.547211985742766e-05, + "loss": 2.5301, + "step": 25584 + }, + { + "epoch": 2.317954293220992, + "grad_norm": 1.0103206634521484, + "learning_rate": 4.54660786564369e-05, + "loss": 2.6818, + "step": 25585 + }, + { + "epoch": 2.3180448913954383, + "grad_norm": 1.0819834470748901, + "learning_rate": 4.546003745544615e-05, + "loss": 2.7253, + "step": 25586 + }, + { + "epoch": 2.3181354895698854, + "grad_norm": 1.1679075956344604, + "learning_rate": 4.545399625445539e-05, + "loss": 2.5517, + "step": 25587 + }, + { + "epoch": 2.318226087744332, + "grad_norm": 1.0110126733779907, + "learning_rate": 4.544795505346463e-05, + "loss": 2.7476, + "step": 25588 + }, + { + "epoch": 2.318316685918779, + "grad_norm": 1.0324302911758423, + "learning_rate": 4.544191385247387e-05, + "loss": 2.3051, + "step": 25589 + }, + { + "epoch": 2.3184072840932255, + "grad_norm": 1.0114245414733887, + "learning_rate": 4.543587265148312e-05, + "loss": 2.7914, + "step": 25590 + }, + { + "epoch": 2.318497882267672, + "grad_norm": 0.9566711187362671, + "learning_rate": 4.542983145049236e-05, + "loss": 3.0566, + "step": 25591 + }, + { + "epoch": 2.318588480442119, + "grad_norm": 1.0174779891967773, + "learning_rate": 4.54237902495016e-05, + "loss": 2.7932, + "step": 25592 + }, + { + "epoch": 2.318679078616566, + "grad_norm": 0.9590596556663513, + "learning_rate": 4.541774904851084e-05, + "loss": 2.4496, + "step": 25593 + }, + { + "epoch": 2.3187696767910126, + "grad_norm": 1.043812870979309, + "learning_rate": 4.541170784752009e-05, + "loss": 2.5503, + "step": 25594 + }, + { + "epoch": 2.318860274965459, + "grad_norm": 1.0052130222320557, + "learning_rate": 4.540566664652933e-05, + "loss": 2.9526, + "step": 25595 + }, + { + "epoch": 2.318950873139906, + "grad_norm": 1.2437313795089722, + "learning_rate": 4.539962544553857e-05, + "loss": 2.3286, + "step": 25596 + }, + { + "epoch": 2.3190414713143532, + "grad_norm": 0.9523200988769531, + "learning_rate": 4.539358424454782e-05, + "loss": 2.6468, + "step": 25597 + }, + { + "epoch": 2.3191320694888, + "grad_norm": 1.0069226026535034, + "learning_rate": 4.538754304355706e-05, + "loss": 2.6678, + "step": 25598 + }, + { + "epoch": 2.3192226676632464, + "grad_norm": 0.9281699061393738, + "learning_rate": 4.538150184256631e-05, + "loss": 2.4761, + "step": 25599 + }, + { + "epoch": 2.3193132658376934, + "grad_norm": 0.9721366167068481, + "learning_rate": 4.537546064157555e-05, + "loss": 2.6476, + "step": 25600 + }, + { + "epoch": 2.3194038640121404, + "grad_norm": 1.0246106386184692, + "learning_rate": 4.5369419440584796e-05, + "loss": 2.8798, + "step": 25601 + }, + { + "epoch": 2.319494462186587, + "grad_norm": 0.987236499786377, + "learning_rate": 4.5363378239594036e-05, + "loss": 2.6616, + "step": 25602 + }, + { + "epoch": 2.3195850603610335, + "grad_norm": 0.9982265830039978, + "learning_rate": 4.535733703860328e-05, + "loss": 2.6688, + "step": 25603 + }, + { + "epoch": 2.3196756585354805, + "grad_norm": 1.1062681674957275, + "learning_rate": 4.535129583761252e-05, + "loss": 2.6922, + "step": 25604 + }, + { + "epoch": 2.3197662567099275, + "grad_norm": 0.9799179434776306, + "learning_rate": 4.5345254636621766e-05, + "loss": 2.8202, + "step": 25605 + }, + { + "epoch": 2.319856854884374, + "grad_norm": 1.0291165113449097, + "learning_rate": 4.5339213435631007e-05, + "loss": 2.9541, + "step": 25606 + }, + { + "epoch": 2.3199474530588207, + "grad_norm": 0.9878459572792053, + "learning_rate": 4.533317223464025e-05, + "loss": 2.5614, + "step": 25607 + }, + { + "epoch": 2.3200380512332677, + "grad_norm": 1.2251347303390503, + "learning_rate": 4.532713103364949e-05, + "loss": 2.721, + "step": 25608 + }, + { + "epoch": 2.3201286494077147, + "grad_norm": 1.0592397451400757, + "learning_rate": 4.5321089832658736e-05, + "loss": 2.6518, + "step": 25609 + }, + { + "epoch": 2.3202192475821612, + "grad_norm": 1.0332915782928467, + "learning_rate": 4.531504863166798e-05, + "loss": 2.4935, + "step": 25610 + }, + { + "epoch": 2.320309845756608, + "grad_norm": 0.9640205502510071, + "learning_rate": 4.530900743067722e-05, + "loss": 2.5791, + "step": 25611 + }, + { + "epoch": 2.320400443931055, + "grad_norm": 1.012800931930542, + "learning_rate": 4.5302966229686465e-05, + "loss": 2.9049, + "step": 25612 + }, + { + "epoch": 2.3204910421055014, + "grad_norm": 1.010542392730713, + "learning_rate": 4.5296925028695706e-05, + "loss": 2.4471, + "step": 25613 + }, + { + "epoch": 2.3205816402799484, + "grad_norm": 1.0498234033584595, + "learning_rate": 4.529088382770495e-05, + "loss": 2.6861, + "step": 25614 + }, + { + "epoch": 2.320672238454395, + "grad_norm": 1.0556362867355347, + "learning_rate": 4.5284842626714194e-05, + "loss": 2.6207, + "step": 25615 + }, + { + "epoch": 2.320762836628842, + "grad_norm": 0.94496089220047, + "learning_rate": 4.5278801425723435e-05, + "loss": 2.6267, + "step": 25616 + }, + { + "epoch": 2.3208534348032885, + "grad_norm": 0.9294693470001221, + "learning_rate": 4.527276022473268e-05, + "loss": 1.9924, + "step": 25617 + }, + { + "epoch": 2.3209440329777355, + "grad_norm": 0.990776538848877, + "learning_rate": 4.5266719023741924e-05, + "loss": 2.5201, + "step": 25618 + }, + { + "epoch": 2.321034631152182, + "grad_norm": 1.106289029121399, + "learning_rate": 4.5260677822751164e-05, + "loss": 2.6056, + "step": 25619 + }, + { + "epoch": 2.321125229326629, + "grad_norm": 1.0063632726669312, + "learning_rate": 4.525463662176041e-05, + "loss": 2.5924, + "step": 25620 + }, + { + "epoch": 2.3212158275010757, + "grad_norm": 1.0071759223937988, + "learning_rate": 4.524859542076965e-05, + "loss": 2.609, + "step": 25621 + }, + { + "epoch": 2.3213064256755227, + "grad_norm": 0.9743877053260803, + "learning_rate": 4.5242554219778894e-05, + "loss": 3.0843, + "step": 25622 + }, + { + "epoch": 2.3213970238499693, + "grad_norm": 1.1130844354629517, + "learning_rate": 4.5236513018788135e-05, + "loss": 2.627, + "step": 25623 + }, + { + "epoch": 2.3214876220244163, + "grad_norm": 0.9825215935707092, + "learning_rate": 4.523047181779738e-05, + "loss": 2.7438, + "step": 25624 + }, + { + "epoch": 2.321578220198863, + "grad_norm": 1.0501866340637207, + "learning_rate": 4.522443061680662e-05, + "loss": 2.7266, + "step": 25625 + }, + { + "epoch": 2.32166881837331, + "grad_norm": 0.924709677696228, + "learning_rate": 4.5218389415815864e-05, + "loss": 2.0536, + "step": 25626 + }, + { + "epoch": 2.3217594165477564, + "grad_norm": 1.0464856624603271, + "learning_rate": 4.521234821482511e-05, + "loss": 2.4289, + "step": 25627 + }, + { + "epoch": 2.3218500147222034, + "grad_norm": 1.026293396949768, + "learning_rate": 4.520630701383435e-05, + "loss": 2.5796, + "step": 25628 + }, + { + "epoch": 2.32194061289665, + "grad_norm": 1.0230826139450073, + "learning_rate": 4.520026581284359e-05, + "loss": 2.7371, + "step": 25629 + }, + { + "epoch": 2.322031211071097, + "grad_norm": 1.0795913934707642, + "learning_rate": 4.5194224611852834e-05, + "loss": 2.4527, + "step": 25630 + }, + { + "epoch": 2.3221218092455436, + "grad_norm": 1.0450316667556763, + "learning_rate": 4.518818341086208e-05, + "loss": 2.6847, + "step": 25631 + }, + { + "epoch": 2.3222124074199906, + "grad_norm": 1.0134999752044678, + "learning_rate": 4.518214220987132e-05, + "loss": 2.6864, + "step": 25632 + }, + { + "epoch": 2.322303005594437, + "grad_norm": 0.9589763879776001, + "learning_rate": 4.517610100888057e-05, + "loss": 2.5276, + "step": 25633 + }, + { + "epoch": 2.322393603768884, + "grad_norm": 0.995726466178894, + "learning_rate": 4.517005980788981e-05, + "loss": 2.721, + "step": 25634 + }, + { + "epoch": 2.3224842019433307, + "grad_norm": 1.0566805601119995, + "learning_rate": 4.516401860689906e-05, + "loss": 2.2883, + "step": 25635 + }, + { + "epoch": 2.3225748001177777, + "grad_norm": 1.0234612226486206, + "learning_rate": 4.51579774059083e-05, + "loss": 2.7252, + "step": 25636 + }, + { + "epoch": 2.3226653982922243, + "grad_norm": 0.9401564002037048, + "learning_rate": 4.515193620491754e-05, + "loss": 2.5332, + "step": 25637 + }, + { + "epoch": 2.3227559964666713, + "grad_norm": 1.0407944917678833, + "learning_rate": 4.514589500392678e-05, + "loss": 2.0727, + "step": 25638 + }, + { + "epoch": 2.322846594641118, + "grad_norm": 0.9812082052230835, + "learning_rate": 4.513985380293603e-05, + "loss": 2.6442, + "step": 25639 + }, + { + "epoch": 2.322937192815565, + "grad_norm": 0.980491578578949, + "learning_rate": 4.513381260194527e-05, + "loss": 2.567, + "step": 25640 + }, + { + "epoch": 2.3230277909900114, + "grad_norm": 0.8484389185905457, + "learning_rate": 4.512777140095451e-05, + "loss": 1.997, + "step": 25641 + }, + { + "epoch": 2.3231183891644585, + "grad_norm": 0.9762370586395264, + "learning_rate": 4.512173019996376e-05, + "loss": 2.6037, + "step": 25642 + }, + { + "epoch": 2.323208987338905, + "grad_norm": 1.0981184244155884, + "learning_rate": 4.5115688998973e-05, + "loss": 2.5231, + "step": 25643 + }, + { + "epoch": 2.323299585513352, + "grad_norm": 1.0878410339355469, + "learning_rate": 4.510964779798224e-05, + "loss": 2.5355, + "step": 25644 + }, + { + "epoch": 2.3233901836877986, + "grad_norm": 1.050667405128479, + "learning_rate": 4.510360659699148e-05, + "loss": 2.392, + "step": 25645 + }, + { + "epoch": 2.3234807818622456, + "grad_norm": 0.9739794135093689, + "learning_rate": 4.509756539600073e-05, + "loss": 2.6086, + "step": 25646 + }, + { + "epoch": 2.323571380036692, + "grad_norm": 0.987821638584137, + "learning_rate": 4.509152419500997e-05, + "loss": 2.506, + "step": 25647 + }, + { + "epoch": 2.323661978211139, + "grad_norm": 1.0362753868103027, + "learning_rate": 4.508548299401921e-05, + "loss": 2.6717, + "step": 25648 + }, + { + "epoch": 2.3237525763855857, + "grad_norm": 0.9811427593231201, + "learning_rate": 4.507944179302846e-05, + "loss": 2.5204, + "step": 25649 + }, + { + "epoch": 2.3238431745600328, + "grad_norm": 0.9816723465919495, + "learning_rate": 4.50734005920377e-05, + "loss": 2.8133, + "step": 25650 + }, + { + "epoch": 2.3239337727344793, + "grad_norm": 1.149596095085144, + "learning_rate": 4.5067359391046946e-05, + "loss": 2.7027, + "step": 25651 + }, + { + "epoch": 2.3240243709089263, + "grad_norm": 0.9142692685127258, + "learning_rate": 4.5061318190056186e-05, + "loss": 2.2891, + "step": 25652 + }, + { + "epoch": 2.324114969083373, + "grad_norm": 0.9808663129806519, + "learning_rate": 4.505527698906543e-05, + "loss": 2.5415, + "step": 25653 + }, + { + "epoch": 2.32420556725782, + "grad_norm": 1.0305498838424683, + "learning_rate": 4.5049235788074675e-05, + "loss": 2.334, + "step": 25654 + }, + { + "epoch": 2.3242961654322665, + "grad_norm": 0.9878295063972473, + "learning_rate": 4.5043194587083916e-05, + "loss": 2.4916, + "step": 25655 + }, + { + "epoch": 2.3243867636067135, + "grad_norm": 0.9418854117393494, + "learning_rate": 4.5037153386093156e-05, + "loss": 2.56, + "step": 25656 + }, + { + "epoch": 2.32447736178116, + "grad_norm": 1.0932923555374146, + "learning_rate": 4.5031112185102404e-05, + "loss": 2.5222, + "step": 25657 + }, + { + "epoch": 2.324567959955607, + "grad_norm": 1.0742480754852295, + "learning_rate": 4.5025070984111645e-05, + "loss": 2.6239, + "step": 25658 + }, + { + "epoch": 2.3246585581300536, + "grad_norm": 0.8784226775169373, + "learning_rate": 4.5019029783120886e-05, + "loss": 2.0251, + "step": 25659 + }, + { + "epoch": 2.3247491563045006, + "grad_norm": 1.1055386066436768, + "learning_rate": 4.5012988582130127e-05, + "loss": 2.6322, + "step": 25660 + }, + { + "epoch": 2.324839754478947, + "grad_norm": 1.0334266424179077, + "learning_rate": 4.5006947381139374e-05, + "loss": 2.8538, + "step": 25661 + }, + { + "epoch": 2.324930352653394, + "grad_norm": 1.004436731338501, + "learning_rate": 4.5000906180148615e-05, + "loss": 2.4932, + "step": 25662 + }, + { + "epoch": 2.3250209508278408, + "grad_norm": 1.0890333652496338, + "learning_rate": 4.4994864979157856e-05, + "loss": 2.657, + "step": 25663 + }, + { + "epoch": 2.325111549002288, + "grad_norm": 0.9834780097007751, + "learning_rate": 4.49888237781671e-05, + "loss": 2.7375, + "step": 25664 + }, + { + "epoch": 2.3252021471767343, + "grad_norm": 1.0091420412063599, + "learning_rate": 4.4982782577176344e-05, + "loss": 2.9072, + "step": 25665 + }, + { + "epoch": 2.3252927453511814, + "grad_norm": 0.964179515838623, + "learning_rate": 4.4976741376185585e-05, + "loss": 2.3725, + "step": 25666 + }, + { + "epoch": 2.325383343525628, + "grad_norm": 0.9971972107887268, + "learning_rate": 4.497070017519483e-05, + "loss": 2.5767, + "step": 25667 + }, + { + "epoch": 2.325473941700075, + "grad_norm": 1.0613152980804443, + "learning_rate": 4.4964658974204074e-05, + "loss": 2.495, + "step": 25668 + }, + { + "epoch": 2.3255645398745215, + "grad_norm": 0.9745026230812073, + "learning_rate": 4.495861777321332e-05, + "loss": 2.5003, + "step": 25669 + }, + { + "epoch": 2.3256551380489685, + "grad_norm": 0.9294523000717163, + "learning_rate": 4.495257657222256e-05, + "loss": 2.5807, + "step": 25670 + }, + { + "epoch": 2.325745736223415, + "grad_norm": 1.012110948562622, + "learning_rate": 4.49465353712318e-05, + "loss": 2.7367, + "step": 25671 + }, + { + "epoch": 2.325836334397862, + "grad_norm": 1.0692180395126343, + "learning_rate": 4.494049417024105e-05, + "loss": 2.7186, + "step": 25672 + }, + { + "epoch": 2.3259269325723086, + "grad_norm": 1.0698822736740112, + "learning_rate": 4.493445296925029e-05, + "loss": 2.7592, + "step": 25673 + }, + { + "epoch": 2.326017530746755, + "grad_norm": 1.062374234199524, + "learning_rate": 4.492841176825953e-05, + "loss": 2.8223, + "step": 25674 + }, + { + "epoch": 2.3261081289212022, + "grad_norm": 1.0670396089553833, + "learning_rate": 4.492237056726877e-05, + "loss": 2.5123, + "step": 25675 + }, + { + "epoch": 2.3261987270956492, + "grad_norm": 0.8594669103622437, + "learning_rate": 4.491632936627802e-05, + "loss": 2.0281, + "step": 25676 + }, + { + "epoch": 2.326289325270096, + "grad_norm": 1.0101354122161865, + "learning_rate": 4.491028816528726e-05, + "loss": 2.8378, + "step": 25677 + }, + { + "epoch": 2.3263799234445424, + "grad_norm": 0.9616573452949524, + "learning_rate": 4.49042469642965e-05, + "loss": 2.6302, + "step": 25678 + }, + { + "epoch": 2.3264705216189894, + "grad_norm": 1.0405811071395874, + "learning_rate": 4.489820576330574e-05, + "loss": 2.5525, + "step": 25679 + }, + { + "epoch": 2.3265611197934364, + "grad_norm": 0.9965237379074097, + "learning_rate": 4.489216456231499e-05, + "loss": 2.3923, + "step": 25680 + }, + { + "epoch": 2.326651717967883, + "grad_norm": 0.9593727588653564, + "learning_rate": 4.488612336132423e-05, + "loss": 2.7172, + "step": 25681 + }, + { + "epoch": 2.3267423161423295, + "grad_norm": 1.0916848182678223, + "learning_rate": 4.488008216033347e-05, + "loss": 2.8012, + "step": 25682 + }, + { + "epoch": 2.3268329143167765, + "grad_norm": 0.9610121846199036, + "learning_rate": 4.487404095934272e-05, + "loss": 2.0506, + "step": 25683 + }, + { + "epoch": 2.3269235124912235, + "grad_norm": 0.7812389731407166, + "learning_rate": 4.486799975835196e-05, + "loss": 1.4015, + "step": 25684 + }, + { + "epoch": 2.32701411066567, + "grad_norm": 0.9608208537101746, + "learning_rate": 4.486195855736121e-05, + "loss": 2.6492, + "step": 25685 + }, + { + "epoch": 2.3271047088401167, + "grad_norm": 1.092355489730835, + "learning_rate": 4.485591735637045e-05, + "loss": 2.5881, + "step": 25686 + }, + { + "epoch": 2.3271953070145637, + "grad_norm": 1.0329207181930542, + "learning_rate": 4.48498761553797e-05, + "loss": 2.8338, + "step": 25687 + }, + { + "epoch": 2.3272859051890107, + "grad_norm": 1.0096960067749023, + "learning_rate": 4.484383495438894e-05, + "loss": 2.4478, + "step": 25688 + }, + { + "epoch": 2.3273765033634572, + "grad_norm": 0.9828053116798401, + "learning_rate": 4.483779375339818e-05, + "loss": 2.4877, + "step": 25689 + }, + { + "epoch": 2.327467101537904, + "grad_norm": 0.8972353339195251, + "learning_rate": 4.483175255240742e-05, + "loss": 1.9581, + "step": 25690 + }, + { + "epoch": 2.327557699712351, + "grad_norm": 0.9924226403236389, + "learning_rate": 4.482571135141667e-05, + "loss": 2.6298, + "step": 25691 + }, + { + "epoch": 2.3276482978867974, + "grad_norm": 1.0737969875335693, + "learning_rate": 4.481967015042591e-05, + "loss": 2.6709, + "step": 25692 + }, + { + "epoch": 2.3277388960612444, + "grad_norm": 1.0100208520889282, + "learning_rate": 4.481362894943515e-05, + "loss": 2.8195, + "step": 25693 + }, + { + "epoch": 2.327829494235691, + "grad_norm": 0.9748225212097168, + "learning_rate": 4.480758774844439e-05, + "loss": 2.5276, + "step": 25694 + }, + { + "epoch": 2.327920092410138, + "grad_norm": 1.0760607719421387, + "learning_rate": 4.480154654745364e-05, + "loss": 2.3885, + "step": 25695 + }, + { + "epoch": 2.3280106905845845, + "grad_norm": 1.0173808336257935, + "learning_rate": 4.479550534646288e-05, + "loss": 2.6098, + "step": 25696 + }, + { + "epoch": 2.3281012887590316, + "grad_norm": 1.0565483570098877, + "learning_rate": 4.478946414547212e-05, + "loss": 2.6951, + "step": 25697 + }, + { + "epoch": 2.328191886933478, + "grad_norm": 1.035897135734558, + "learning_rate": 4.478342294448136e-05, + "loss": 2.4819, + "step": 25698 + }, + { + "epoch": 2.328282485107925, + "grad_norm": 1.1131243705749512, + "learning_rate": 4.477738174349061e-05, + "loss": 2.6315, + "step": 25699 + }, + { + "epoch": 2.3283730832823717, + "grad_norm": 0.9309357404708862, + "learning_rate": 4.477134054249985e-05, + "loss": 2.6644, + "step": 25700 + }, + { + "epoch": 2.3284636814568187, + "grad_norm": 0.982332706451416, + "learning_rate": 4.4765299341509095e-05, + "loss": 2.5784, + "step": 25701 + }, + { + "epoch": 2.3285542796312653, + "grad_norm": 1.015336036682129, + "learning_rate": 4.475925814051834e-05, + "loss": 2.6076, + "step": 25702 + }, + { + "epoch": 2.3286448778057123, + "grad_norm": 1.1157954931259155, + "learning_rate": 4.4753216939527584e-05, + "loss": 2.3665, + "step": 25703 + }, + { + "epoch": 2.328735475980159, + "grad_norm": 1.2084934711456299, + "learning_rate": 4.4747175738536825e-05, + "loss": 2.5483, + "step": 25704 + }, + { + "epoch": 2.328826074154606, + "grad_norm": 1.0725675821304321, + "learning_rate": 4.4741134537546066e-05, + "loss": 2.4889, + "step": 25705 + }, + { + "epoch": 2.3289166723290524, + "grad_norm": 1.1157010793685913, + "learning_rate": 4.473509333655531e-05, + "loss": 2.3775, + "step": 25706 + }, + { + "epoch": 2.3290072705034994, + "grad_norm": 0.9463755488395691, + "learning_rate": 4.4729052135564554e-05, + "loss": 2.5602, + "step": 25707 + }, + { + "epoch": 2.329097868677946, + "grad_norm": 1.1698276996612549, + "learning_rate": 4.4723010934573795e-05, + "loss": 2.699, + "step": 25708 + }, + { + "epoch": 2.329188466852393, + "grad_norm": 0.997586727142334, + "learning_rate": 4.4716969733583036e-05, + "loss": 2.6226, + "step": 25709 + }, + { + "epoch": 2.3292790650268396, + "grad_norm": 1.0124458074569702, + "learning_rate": 4.471092853259228e-05, + "loss": 2.6814, + "step": 25710 + }, + { + "epoch": 2.3293696632012866, + "grad_norm": 0.9913476705551147, + "learning_rate": 4.4704887331601524e-05, + "loss": 2.9174, + "step": 25711 + }, + { + "epoch": 2.329460261375733, + "grad_norm": 1.1993045806884766, + "learning_rate": 4.4698846130610765e-05, + "loss": 2.5452, + "step": 25712 + }, + { + "epoch": 2.32955085955018, + "grad_norm": 1.0563286542892456, + "learning_rate": 4.4692804929620006e-05, + "loss": 2.5654, + "step": 25713 + }, + { + "epoch": 2.3296414577246267, + "grad_norm": 1.0142006874084473, + "learning_rate": 4.468676372862925e-05, + "loss": 2.5854, + "step": 25714 + }, + { + "epoch": 2.3297320558990737, + "grad_norm": 1.049128770828247, + "learning_rate": 4.4680722527638494e-05, + "loss": 2.7514, + "step": 25715 + }, + { + "epoch": 2.3298226540735203, + "grad_norm": 0.9918827414512634, + "learning_rate": 4.4674681326647735e-05, + "loss": 2.5237, + "step": 25716 + }, + { + "epoch": 2.3299132522479673, + "grad_norm": 1.0453622341156006, + "learning_rate": 4.466864012565698e-05, + "loss": 2.5273, + "step": 25717 + }, + { + "epoch": 2.330003850422414, + "grad_norm": 1.014262080192566, + "learning_rate": 4.466259892466623e-05, + "loss": 2.7176, + "step": 25718 + }, + { + "epoch": 2.330094448596861, + "grad_norm": 1.0226080417633057, + "learning_rate": 4.465655772367547e-05, + "loss": 2.447, + "step": 25719 + }, + { + "epoch": 2.3301850467713074, + "grad_norm": 1.0186114311218262, + "learning_rate": 4.465051652268471e-05, + "loss": 2.7097, + "step": 25720 + }, + { + "epoch": 2.3302756449457545, + "grad_norm": 1.0138452053070068, + "learning_rate": 4.464447532169396e-05, + "loss": 2.6431, + "step": 25721 + }, + { + "epoch": 2.330366243120201, + "grad_norm": 1.0439661741256714, + "learning_rate": 4.46384341207032e-05, + "loss": 2.8277, + "step": 25722 + }, + { + "epoch": 2.330456841294648, + "grad_norm": 1.0053322315216064, + "learning_rate": 4.463239291971244e-05, + "loss": 2.5396, + "step": 25723 + }, + { + "epoch": 2.3305474394690946, + "grad_norm": 1.0300110578536987, + "learning_rate": 4.462635171872168e-05, + "loss": 2.4694, + "step": 25724 + }, + { + "epoch": 2.3306380376435416, + "grad_norm": 0.9950382709503174, + "learning_rate": 4.462031051773093e-05, + "loss": 2.4179, + "step": 25725 + }, + { + "epoch": 2.330728635817988, + "grad_norm": 1.0308879613876343, + "learning_rate": 4.461426931674017e-05, + "loss": 2.5251, + "step": 25726 + }, + { + "epoch": 2.330819233992435, + "grad_norm": 1.0640753507614136, + "learning_rate": 4.460822811574941e-05, + "loss": 2.7163, + "step": 25727 + }, + { + "epoch": 2.3309098321668817, + "grad_norm": 0.9556722640991211, + "learning_rate": 4.460218691475865e-05, + "loss": 2.479, + "step": 25728 + }, + { + "epoch": 2.3310004303413288, + "grad_norm": 1.000243067741394, + "learning_rate": 4.45961457137679e-05, + "loss": 2.8044, + "step": 25729 + }, + { + "epoch": 2.3310910285157753, + "grad_norm": 1.0167102813720703, + "learning_rate": 4.459010451277714e-05, + "loss": 2.5537, + "step": 25730 + }, + { + "epoch": 2.3311816266902223, + "grad_norm": 0.9269869923591614, + "learning_rate": 4.458406331178638e-05, + "loss": 2.0129, + "step": 25731 + }, + { + "epoch": 2.331272224864669, + "grad_norm": 0.9919549822807312, + "learning_rate": 4.457802211079563e-05, + "loss": 2.4407, + "step": 25732 + }, + { + "epoch": 2.331362823039116, + "grad_norm": 1.1509771347045898, + "learning_rate": 4.457198090980487e-05, + "loss": 2.3347, + "step": 25733 + }, + { + "epoch": 2.3314534212135625, + "grad_norm": 0.9683167338371277, + "learning_rate": 4.456593970881411e-05, + "loss": 2.5034, + "step": 25734 + }, + { + "epoch": 2.3315440193880095, + "grad_norm": 0.9475666880607605, + "learning_rate": 4.455989850782336e-05, + "loss": 2.5427, + "step": 25735 + }, + { + "epoch": 2.331634617562456, + "grad_norm": 1.0833570957183838, + "learning_rate": 4.4553857306832606e-05, + "loss": 2.3406, + "step": 25736 + }, + { + "epoch": 2.331725215736903, + "grad_norm": 1.019152045249939, + "learning_rate": 4.454781610584185e-05, + "loss": 2.7977, + "step": 25737 + }, + { + "epoch": 2.3318158139113496, + "grad_norm": 0.987076461315155, + "learning_rate": 4.454177490485109e-05, + "loss": 2.6095, + "step": 25738 + }, + { + "epoch": 2.3319064120857966, + "grad_norm": 1.1459227800369263, + "learning_rate": 4.453573370386033e-05, + "loss": 2.4089, + "step": 25739 + }, + { + "epoch": 2.331997010260243, + "grad_norm": 1.012312889099121, + "learning_rate": 4.4529692502869576e-05, + "loss": 2.6485, + "step": 25740 + }, + { + "epoch": 2.33208760843469, + "grad_norm": 0.8726150393486023, + "learning_rate": 4.452365130187882e-05, + "loss": 2.0436, + "step": 25741 + }, + { + "epoch": 2.3321782066091368, + "grad_norm": 0.8696851134300232, + "learning_rate": 4.451761010088806e-05, + "loss": 1.8749, + "step": 25742 + }, + { + "epoch": 2.332268804783584, + "grad_norm": 0.8458625674247742, + "learning_rate": 4.45115688998973e-05, + "loss": 1.9701, + "step": 25743 + }, + { + "epoch": 2.3323594029580303, + "grad_norm": 1.0326942205429077, + "learning_rate": 4.4505527698906546e-05, + "loss": 2.6992, + "step": 25744 + }, + { + "epoch": 2.3324500011324774, + "grad_norm": 0.9778992533683777, + "learning_rate": 4.449948649791579e-05, + "loss": 2.4267, + "step": 25745 + }, + { + "epoch": 2.332540599306924, + "grad_norm": 0.9969066977500916, + "learning_rate": 4.449344529692503e-05, + "loss": 2.5422, + "step": 25746 + }, + { + "epoch": 2.332631197481371, + "grad_norm": 1.0138957500457764, + "learning_rate": 4.4487404095934275e-05, + "loss": 2.7619, + "step": 25747 + }, + { + "epoch": 2.3327217956558175, + "grad_norm": 1.0267937183380127, + "learning_rate": 4.4481362894943516e-05, + "loss": 2.7871, + "step": 25748 + }, + { + "epoch": 2.3328123938302645, + "grad_norm": 1.0050491094589233, + "learning_rate": 4.447532169395276e-05, + "loss": 2.6224, + "step": 25749 + }, + { + "epoch": 2.332902992004711, + "grad_norm": 1.064955234527588, + "learning_rate": 4.4469280492962e-05, + "loss": 2.5971, + "step": 25750 + }, + { + "epoch": 2.332993590179158, + "grad_norm": 1.0351558923721313, + "learning_rate": 4.4463239291971245e-05, + "loss": 2.6501, + "step": 25751 + }, + { + "epoch": 2.3330841883536046, + "grad_norm": 1.0456746816635132, + "learning_rate": 4.445719809098049e-05, + "loss": 2.6565, + "step": 25752 + }, + { + "epoch": 2.333174786528051, + "grad_norm": 0.9863004088401794, + "learning_rate": 4.4451156889989734e-05, + "loss": 2.534, + "step": 25753 + }, + { + "epoch": 2.3332653847024982, + "grad_norm": 0.9743459224700928, + "learning_rate": 4.4445115688998975e-05, + "loss": 2.526, + "step": 25754 + }, + { + "epoch": 2.3333559828769452, + "grad_norm": 1.185500979423523, + "learning_rate": 4.443907448800822e-05, + "loss": 2.4918, + "step": 25755 + }, + { + "epoch": 2.333446581051392, + "grad_norm": 0.9243214726448059, + "learning_rate": 4.443303328701746e-05, + "loss": 2.5366, + "step": 25756 + }, + { + "epoch": 2.3335371792258384, + "grad_norm": 1.1047409772872925, + "learning_rate": 4.4426992086026704e-05, + "loss": 2.5781, + "step": 25757 + }, + { + "epoch": 2.3336277774002854, + "grad_norm": 1.0293214321136475, + "learning_rate": 4.4420950885035945e-05, + "loss": 2.7232, + "step": 25758 + }, + { + "epoch": 2.3337183755747324, + "grad_norm": 0.9535521268844604, + "learning_rate": 4.441490968404519e-05, + "loss": 2.7097, + "step": 25759 + }, + { + "epoch": 2.333808973749179, + "grad_norm": 0.9498331546783447, + "learning_rate": 4.440886848305443e-05, + "loss": 1.9068, + "step": 25760 + }, + { + "epoch": 2.3338995719236255, + "grad_norm": 1.0194191932678223, + "learning_rate": 4.4402827282063674e-05, + "loss": 2.6536, + "step": 25761 + }, + { + "epoch": 2.3339901700980725, + "grad_norm": 0.8885258436203003, + "learning_rate": 4.439678608107292e-05, + "loss": 1.9213, + "step": 25762 + }, + { + "epoch": 2.3340807682725195, + "grad_norm": 1.0123984813690186, + "learning_rate": 4.439074488008216e-05, + "loss": 2.6073, + "step": 25763 + }, + { + "epoch": 2.334171366446966, + "grad_norm": 1.104027509689331, + "learning_rate": 4.43847036790914e-05, + "loss": 2.4448, + "step": 25764 + }, + { + "epoch": 2.3342619646214127, + "grad_norm": 1.037368655204773, + "learning_rate": 4.4378662478100644e-05, + "loss": 2.5744, + "step": 25765 + }, + { + "epoch": 2.3343525627958597, + "grad_norm": 0.9444676041603088, + "learning_rate": 4.437262127710989e-05, + "loss": 2.5911, + "step": 25766 + }, + { + "epoch": 2.3344431609703067, + "grad_norm": 0.9560856223106384, + "learning_rate": 4.436658007611913e-05, + "loss": 2.3907, + "step": 25767 + }, + { + "epoch": 2.3345337591447533, + "grad_norm": 1.0447864532470703, + "learning_rate": 4.436053887512838e-05, + "loss": 2.6625, + "step": 25768 + }, + { + "epoch": 2.3346243573192, + "grad_norm": 0.9862340688705444, + "learning_rate": 4.435449767413762e-05, + "loss": 2.5438, + "step": 25769 + }, + { + "epoch": 2.334714955493647, + "grad_norm": 0.9993987083435059, + "learning_rate": 4.434845647314687e-05, + "loss": 2.7245, + "step": 25770 + }, + { + "epoch": 2.334805553668094, + "grad_norm": 1.0724645853042603, + "learning_rate": 4.434241527215611e-05, + "loss": 2.7495, + "step": 25771 + }, + { + "epoch": 2.3348961518425404, + "grad_norm": 1.1023236513137817, + "learning_rate": 4.433637407116535e-05, + "loss": 2.531, + "step": 25772 + }, + { + "epoch": 2.334986750016987, + "grad_norm": 1.0443947315216064, + "learning_rate": 4.433033287017459e-05, + "loss": 2.5813, + "step": 25773 + }, + { + "epoch": 2.335077348191434, + "grad_norm": 0.9515095949172974, + "learning_rate": 4.432429166918384e-05, + "loss": 2.6335, + "step": 25774 + }, + { + "epoch": 2.3351679463658805, + "grad_norm": 1.0032784938812256, + "learning_rate": 4.431825046819308e-05, + "loss": 2.5353, + "step": 25775 + }, + { + "epoch": 2.3352585445403276, + "grad_norm": 0.9837586879730225, + "learning_rate": 4.431220926720232e-05, + "loss": 2.4138, + "step": 25776 + }, + { + "epoch": 2.335349142714774, + "grad_norm": 1.0230787992477417, + "learning_rate": 4.430616806621157e-05, + "loss": 2.5209, + "step": 25777 + }, + { + "epoch": 2.335439740889221, + "grad_norm": 1.0310205221176147, + "learning_rate": 4.430012686522081e-05, + "loss": 2.4481, + "step": 25778 + }, + { + "epoch": 2.3355303390636677, + "grad_norm": 1.0637941360473633, + "learning_rate": 4.429408566423005e-05, + "loss": 2.8739, + "step": 25779 + }, + { + "epoch": 2.3356209372381147, + "grad_norm": 0.9859551787376404, + "learning_rate": 4.428804446323929e-05, + "loss": 2.7339, + "step": 25780 + }, + { + "epoch": 2.3357115354125613, + "grad_norm": 1.11568021774292, + "learning_rate": 4.428200326224854e-05, + "loss": 2.8572, + "step": 25781 + }, + { + "epoch": 2.3358021335870083, + "grad_norm": 1.0266526937484741, + "learning_rate": 4.427596206125778e-05, + "loss": 2.7127, + "step": 25782 + }, + { + "epoch": 2.335892731761455, + "grad_norm": 1.0842840671539307, + "learning_rate": 4.426992086026702e-05, + "loss": 2.9225, + "step": 25783 + }, + { + "epoch": 2.335983329935902, + "grad_norm": 1.0372306108474731, + "learning_rate": 4.426387965927627e-05, + "loss": 3.123, + "step": 25784 + }, + { + "epoch": 2.3360739281103484, + "grad_norm": 0.987497866153717, + "learning_rate": 4.425783845828551e-05, + "loss": 2.7335, + "step": 25785 + }, + { + "epoch": 2.3361645262847954, + "grad_norm": 0.9671463370323181, + "learning_rate": 4.4251797257294756e-05, + "loss": 2.7663, + "step": 25786 + }, + { + "epoch": 2.336255124459242, + "grad_norm": 1.121861219406128, + "learning_rate": 4.4245756056303997e-05, + "loss": 2.5694, + "step": 25787 + }, + { + "epoch": 2.336345722633689, + "grad_norm": 0.8608109951019287, + "learning_rate": 4.423971485531324e-05, + "loss": 1.9215, + "step": 25788 + }, + { + "epoch": 2.3364363208081356, + "grad_norm": 0.9739224910736084, + "learning_rate": 4.4233673654322485e-05, + "loss": 2.3268, + "step": 25789 + }, + { + "epoch": 2.3365269189825826, + "grad_norm": 0.9731289148330688, + "learning_rate": 4.4227632453331726e-05, + "loss": 2.6872, + "step": 25790 + }, + { + "epoch": 2.336617517157029, + "grad_norm": 1.0165576934814453, + "learning_rate": 4.422159125234097e-05, + "loss": 2.5453, + "step": 25791 + }, + { + "epoch": 2.336708115331476, + "grad_norm": 1.0838383436203003, + "learning_rate": 4.4215550051350214e-05, + "loss": 2.4932, + "step": 25792 + }, + { + "epoch": 2.3367987135059227, + "grad_norm": 0.8608007431030273, + "learning_rate": 4.4209508850359455e-05, + "loss": 2.0457, + "step": 25793 + }, + { + "epoch": 2.3368893116803697, + "grad_norm": 0.9266611337661743, + "learning_rate": 4.4203467649368696e-05, + "loss": 2.1964, + "step": 25794 + }, + { + "epoch": 2.3369799098548163, + "grad_norm": 1.0044341087341309, + "learning_rate": 4.419742644837794e-05, + "loss": 2.4848, + "step": 25795 + }, + { + "epoch": 2.3370705080292633, + "grad_norm": 1.0630428791046143, + "learning_rate": 4.4191385247387184e-05, + "loss": 2.5033, + "step": 25796 + }, + { + "epoch": 2.33716110620371, + "grad_norm": 1.0168370008468628, + "learning_rate": 4.4185344046396425e-05, + "loss": 2.5298, + "step": 25797 + }, + { + "epoch": 2.337251704378157, + "grad_norm": 1.0460275411605835, + "learning_rate": 4.4179302845405666e-05, + "loss": 2.7715, + "step": 25798 + }, + { + "epoch": 2.3373423025526034, + "grad_norm": 0.9869781136512756, + "learning_rate": 4.417326164441491e-05, + "loss": 2.4473, + "step": 25799 + }, + { + "epoch": 2.3374329007270505, + "grad_norm": 0.9116135239601135, + "learning_rate": 4.4167220443424154e-05, + "loss": 2.0511, + "step": 25800 + }, + { + "epoch": 2.337523498901497, + "grad_norm": 1.0245976448059082, + "learning_rate": 4.4161179242433395e-05, + "loss": 2.547, + "step": 25801 + }, + { + "epoch": 2.337614097075944, + "grad_norm": 1.02591872215271, + "learning_rate": 4.415513804144264e-05, + "loss": 2.5236, + "step": 25802 + }, + { + "epoch": 2.3377046952503906, + "grad_norm": 1.0353425741195679, + "learning_rate": 4.4149096840451884e-05, + "loss": 2.7592, + "step": 25803 + }, + { + "epoch": 2.3377952934248376, + "grad_norm": 0.9601764678955078, + "learning_rate": 4.414305563946113e-05, + "loss": 2.3714, + "step": 25804 + }, + { + "epoch": 2.337885891599284, + "grad_norm": 1.0225406885147095, + "learning_rate": 4.413701443847037e-05, + "loss": 2.7609, + "step": 25805 + }, + { + "epoch": 2.337976489773731, + "grad_norm": 1.1368170976638794, + "learning_rate": 4.413097323747961e-05, + "loss": 2.4549, + "step": 25806 + }, + { + "epoch": 2.3380670879481777, + "grad_norm": 0.9708912372589111, + "learning_rate": 4.412493203648886e-05, + "loss": 2.7553, + "step": 25807 + }, + { + "epoch": 2.3381576861226248, + "grad_norm": 1.106641411781311, + "learning_rate": 4.41188908354981e-05, + "loss": 2.5753, + "step": 25808 + }, + { + "epoch": 2.3382482842970713, + "grad_norm": 1.0007805824279785, + "learning_rate": 4.411284963450734e-05, + "loss": 2.8788, + "step": 25809 + }, + { + "epoch": 2.3383388824715183, + "grad_norm": 1.0109256505966187, + "learning_rate": 4.410680843351658e-05, + "loss": 2.5222, + "step": 25810 + }, + { + "epoch": 2.338429480645965, + "grad_norm": 0.9821675419807434, + "learning_rate": 4.410076723252583e-05, + "loss": 2.7421, + "step": 25811 + }, + { + "epoch": 2.338520078820412, + "grad_norm": 0.868131697177887, + "learning_rate": 4.409472603153507e-05, + "loss": 1.7871, + "step": 25812 + }, + { + "epoch": 2.3386106769948585, + "grad_norm": 0.9570303559303284, + "learning_rate": 4.408868483054431e-05, + "loss": 2.0512, + "step": 25813 + }, + { + "epoch": 2.3387012751693055, + "grad_norm": 0.9896904230117798, + "learning_rate": 4.408264362955355e-05, + "loss": 2.7911, + "step": 25814 + }, + { + "epoch": 2.338791873343752, + "grad_norm": 0.9687044620513916, + "learning_rate": 4.40766024285628e-05, + "loss": 2.7505, + "step": 25815 + }, + { + "epoch": 2.338882471518199, + "grad_norm": 0.961916446685791, + "learning_rate": 4.407056122757204e-05, + "loss": 2.4088, + "step": 25816 + }, + { + "epoch": 2.3389730696926456, + "grad_norm": 0.993553638458252, + "learning_rate": 4.406452002658128e-05, + "loss": 2.3505, + "step": 25817 + }, + { + "epoch": 2.3390636678670926, + "grad_norm": 0.8818121552467346, + "learning_rate": 4.405847882559053e-05, + "loss": 2.1069, + "step": 25818 + }, + { + "epoch": 2.339154266041539, + "grad_norm": 1.0045289993286133, + "learning_rate": 4.405243762459977e-05, + "loss": 2.5897, + "step": 25819 + }, + { + "epoch": 2.339244864215986, + "grad_norm": 1.0317682027816772, + "learning_rate": 4.404639642360902e-05, + "loss": 2.6237, + "step": 25820 + }, + { + "epoch": 2.3393354623904328, + "grad_norm": 0.9798529744148254, + "learning_rate": 4.404035522261826e-05, + "loss": 2.629, + "step": 25821 + }, + { + "epoch": 2.33942606056488, + "grad_norm": 0.9782924056053162, + "learning_rate": 4.403431402162751e-05, + "loss": 2.5667, + "step": 25822 + }, + { + "epoch": 2.3395166587393263, + "grad_norm": 1.009764313697815, + "learning_rate": 4.402827282063675e-05, + "loss": 2.6518, + "step": 25823 + }, + { + "epoch": 2.3396072569137734, + "grad_norm": 1.0701802968978882, + "learning_rate": 4.402223161964599e-05, + "loss": 2.5041, + "step": 25824 + }, + { + "epoch": 2.33969785508822, + "grad_norm": 0.856500506401062, + "learning_rate": 4.401619041865523e-05, + "loss": 1.902, + "step": 25825 + }, + { + "epoch": 2.339788453262667, + "grad_norm": 1.0801185369491577, + "learning_rate": 4.401014921766448e-05, + "loss": 2.8034, + "step": 25826 + }, + { + "epoch": 2.3398790514371135, + "grad_norm": 0.9505841732025146, + "learning_rate": 4.400410801667372e-05, + "loss": 2.676, + "step": 25827 + }, + { + "epoch": 2.3399696496115605, + "grad_norm": 1.1064685583114624, + "learning_rate": 4.399806681568296e-05, + "loss": 2.8564, + "step": 25828 + }, + { + "epoch": 2.340060247786007, + "grad_norm": 1.0094823837280273, + "learning_rate": 4.39920256146922e-05, + "loss": 2.6863, + "step": 25829 + }, + { + "epoch": 2.340150845960454, + "grad_norm": 1.0929462909698486, + "learning_rate": 4.398598441370145e-05, + "loss": 2.5646, + "step": 25830 + }, + { + "epoch": 2.3402414441349007, + "grad_norm": 1.0466948747634888, + "learning_rate": 4.397994321271069e-05, + "loss": 2.5358, + "step": 25831 + }, + { + "epoch": 2.3403320423093477, + "grad_norm": 1.196447491645813, + "learning_rate": 4.397390201171993e-05, + "loss": 2.5539, + "step": 25832 + }, + { + "epoch": 2.3404226404837942, + "grad_norm": 1.0348209142684937, + "learning_rate": 4.396786081072917e-05, + "loss": 2.9212, + "step": 25833 + }, + { + "epoch": 2.3405132386582412, + "grad_norm": 1.0001728534698486, + "learning_rate": 4.396181960973842e-05, + "loss": 2.4661, + "step": 25834 + }, + { + "epoch": 2.340603836832688, + "grad_norm": 0.9948230981826782, + "learning_rate": 4.395577840874766e-05, + "loss": 2.6821, + "step": 25835 + }, + { + "epoch": 2.3406944350071344, + "grad_norm": 0.9707476496696472, + "learning_rate": 4.3949737207756906e-05, + "loss": 2.657, + "step": 25836 + }, + { + "epoch": 2.3407850331815814, + "grad_norm": 1.046905279159546, + "learning_rate": 4.3943696006766146e-05, + "loss": 2.6859, + "step": 25837 + }, + { + "epoch": 2.3408756313560284, + "grad_norm": 0.9831117391586304, + "learning_rate": 4.3937654805775394e-05, + "loss": 2.6099, + "step": 25838 + }, + { + "epoch": 2.340966229530475, + "grad_norm": 1.051926851272583, + "learning_rate": 4.3931613604784635e-05, + "loss": 2.7068, + "step": 25839 + }, + { + "epoch": 2.3410568277049215, + "grad_norm": 1.0951213836669922, + "learning_rate": 4.3925572403793876e-05, + "loss": 2.6304, + "step": 25840 + }, + { + "epoch": 2.3411474258793685, + "grad_norm": 1.0080182552337646, + "learning_rate": 4.391953120280312e-05, + "loss": 2.7916, + "step": 25841 + }, + { + "epoch": 2.3412380240538155, + "grad_norm": 1.057499647140503, + "learning_rate": 4.3913490001812364e-05, + "loss": 2.8686, + "step": 25842 + }, + { + "epoch": 2.341328622228262, + "grad_norm": 0.9419983625411987, + "learning_rate": 4.3907448800821605e-05, + "loss": 2.5324, + "step": 25843 + }, + { + "epoch": 2.3414192204027087, + "grad_norm": 1.0416978597640991, + "learning_rate": 4.3901407599830846e-05, + "loss": 2.516, + "step": 25844 + }, + { + "epoch": 2.3415098185771557, + "grad_norm": 1.0527808666229248, + "learning_rate": 4.3895366398840093e-05, + "loss": 2.7276, + "step": 25845 + }, + { + "epoch": 2.3416004167516027, + "grad_norm": 0.9664528965950012, + "learning_rate": 4.3889325197849334e-05, + "loss": 2.4896, + "step": 25846 + }, + { + "epoch": 2.3416910149260493, + "grad_norm": 1.0288699865341187, + "learning_rate": 4.3883283996858575e-05, + "loss": 2.5609, + "step": 25847 + }, + { + "epoch": 2.341781613100496, + "grad_norm": 1.0461831092834473, + "learning_rate": 4.387724279586782e-05, + "loss": 2.5799, + "step": 25848 + }, + { + "epoch": 2.341872211274943, + "grad_norm": 1.030181646347046, + "learning_rate": 4.3871201594877063e-05, + "loss": 2.9596, + "step": 25849 + }, + { + "epoch": 2.34196280944939, + "grad_norm": 0.9441593289375305, + "learning_rate": 4.3865160393886304e-05, + "loss": 2.6382, + "step": 25850 + }, + { + "epoch": 2.3420534076238364, + "grad_norm": 1.0133399963378906, + "learning_rate": 4.3859119192895545e-05, + "loss": 2.4953, + "step": 25851 + }, + { + "epoch": 2.342144005798283, + "grad_norm": 1.034334421157837, + "learning_rate": 4.385307799190479e-05, + "loss": 2.6899, + "step": 25852 + }, + { + "epoch": 2.34223460397273, + "grad_norm": 1.136427640914917, + "learning_rate": 4.3847036790914034e-05, + "loss": 2.4282, + "step": 25853 + }, + { + "epoch": 2.3423252021471765, + "grad_norm": 1.0287983417510986, + "learning_rate": 4.384099558992328e-05, + "loss": 2.8, + "step": 25854 + }, + { + "epoch": 2.3424158003216236, + "grad_norm": 0.9727857112884521, + "learning_rate": 4.383495438893252e-05, + "loss": 2.6417, + "step": 25855 + }, + { + "epoch": 2.34250639849607, + "grad_norm": 1.020886778831482, + "learning_rate": 4.382891318794177e-05, + "loss": 2.6084, + "step": 25856 + }, + { + "epoch": 2.342596996670517, + "grad_norm": 1.061618447303772, + "learning_rate": 4.382287198695101e-05, + "loss": 2.7016, + "step": 25857 + }, + { + "epoch": 2.3426875948449637, + "grad_norm": 0.8680286407470703, + "learning_rate": 4.381683078596025e-05, + "loss": 1.9367, + "step": 25858 + }, + { + "epoch": 2.3427781930194107, + "grad_norm": 0.8474513292312622, + "learning_rate": 4.381078958496949e-05, + "loss": 2.0004, + "step": 25859 + }, + { + "epoch": 2.3428687911938573, + "grad_norm": 0.9833216667175293, + "learning_rate": 4.380474838397874e-05, + "loss": 2.514, + "step": 25860 + }, + { + "epoch": 2.3429593893683043, + "grad_norm": 1.031589150428772, + "learning_rate": 4.379870718298798e-05, + "loss": 2.6055, + "step": 25861 + }, + { + "epoch": 2.343049987542751, + "grad_norm": 1.1426491737365723, + "learning_rate": 4.379266598199722e-05, + "loss": 2.5237, + "step": 25862 + }, + { + "epoch": 2.343140585717198, + "grad_norm": 1.0250810384750366, + "learning_rate": 4.378662478100647e-05, + "loss": 2.6207, + "step": 25863 + }, + { + "epoch": 2.3432311838916444, + "grad_norm": 0.9678384065628052, + "learning_rate": 4.378058358001571e-05, + "loss": 2.555, + "step": 25864 + }, + { + "epoch": 2.3433217820660914, + "grad_norm": 0.868121862411499, + "learning_rate": 4.377454237902495e-05, + "loss": 1.7921, + "step": 25865 + }, + { + "epoch": 2.343412380240538, + "grad_norm": 0.9874829053878784, + "learning_rate": 4.376850117803419e-05, + "loss": 2.6416, + "step": 25866 + }, + { + "epoch": 2.343502978414985, + "grad_norm": 1.054185152053833, + "learning_rate": 4.376245997704344e-05, + "loss": 2.5684, + "step": 25867 + }, + { + "epoch": 2.3435935765894316, + "grad_norm": 1.0155187845230103, + "learning_rate": 4.375641877605268e-05, + "loss": 2.6048, + "step": 25868 + }, + { + "epoch": 2.3436841747638786, + "grad_norm": 0.9602643251419067, + "learning_rate": 4.375037757506192e-05, + "loss": 2.4307, + "step": 25869 + }, + { + "epoch": 2.343774772938325, + "grad_norm": 1.024889588356018, + "learning_rate": 4.374433637407117e-05, + "loss": 2.8016, + "step": 25870 + }, + { + "epoch": 2.343865371112772, + "grad_norm": 1.0075796842575073, + "learning_rate": 4.373829517308041e-05, + "loss": 2.5844, + "step": 25871 + }, + { + "epoch": 2.3439559692872187, + "grad_norm": 0.9192505478858948, + "learning_rate": 4.373225397208966e-05, + "loss": 2.2004, + "step": 25872 + }, + { + "epoch": 2.3440465674616657, + "grad_norm": 1.046069622039795, + "learning_rate": 4.37262127710989e-05, + "loss": 2.4777, + "step": 25873 + }, + { + "epoch": 2.3441371656361123, + "grad_norm": 1.0191594362258911, + "learning_rate": 4.372017157010814e-05, + "loss": 2.776, + "step": 25874 + }, + { + "epoch": 2.3442277638105593, + "grad_norm": 1.0280033349990845, + "learning_rate": 4.3714130369117386e-05, + "loss": 2.7668, + "step": 25875 + }, + { + "epoch": 2.344318361985006, + "grad_norm": 1.0405820608139038, + "learning_rate": 4.370808916812663e-05, + "loss": 2.8903, + "step": 25876 + }, + { + "epoch": 2.344408960159453, + "grad_norm": 0.9678443074226379, + "learning_rate": 4.370204796713587e-05, + "loss": 2.589, + "step": 25877 + }, + { + "epoch": 2.3444995583338994, + "grad_norm": 1.047227144241333, + "learning_rate": 4.3696006766145115e-05, + "loss": 2.5278, + "step": 25878 + }, + { + "epoch": 2.3445901565083465, + "grad_norm": 1.0528403520584106, + "learning_rate": 4.3689965565154356e-05, + "loss": 2.8689, + "step": 25879 + }, + { + "epoch": 2.344680754682793, + "grad_norm": 0.9780125021934509, + "learning_rate": 4.36839243641636e-05, + "loss": 2.5638, + "step": 25880 + }, + { + "epoch": 2.34477135285724, + "grad_norm": 0.9969431757926941, + "learning_rate": 4.367788316317284e-05, + "loss": 2.7496, + "step": 25881 + }, + { + "epoch": 2.3448619510316866, + "grad_norm": 0.9791396856307983, + "learning_rate": 4.3671841962182085e-05, + "loss": 2.8057, + "step": 25882 + }, + { + "epoch": 2.3449525492061336, + "grad_norm": 0.9812139868736267, + "learning_rate": 4.3665800761191326e-05, + "loss": 2.6361, + "step": 25883 + }, + { + "epoch": 2.34504314738058, + "grad_norm": 1.0143861770629883, + "learning_rate": 4.365975956020057e-05, + "loss": 2.7028, + "step": 25884 + }, + { + "epoch": 2.345133745555027, + "grad_norm": 1.0629875659942627, + "learning_rate": 4.365371835920981e-05, + "loss": 2.666, + "step": 25885 + }, + { + "epoch": 2.3452243437294737, + "grad_norm": 1.0414987802505493, + "learning_rate": 4.3647677158219055e-05, + "loss": 2.6036, + "step": 25886 + }, + { + "epoch": 2.3453149419039208, + "grad_norm": 0.9309139847755432, + "learning_rate": 4.3641635957228296e-05, + "loss": 2.5433, + "step": 25887 + }, + { + "epoch": 2.3454055400783673, + "grad_norm": 1.0611320734024048, + "learning_rate": 4.3635594756237544e-05, + "loss": 2.611, + "step": 25888 + }, + { + "epoch": 2.3454961382528143, + "grad_norm": 0.9793844223022461, + "learning_rate": 4.3629553555246785e-05, + "loss": 2.3737, + "step": 25889 + }, + { + "epoch": 2.345586736427261, + "grad_norm": 1.0843250751495361, + "learning_rate": 4.362351235425603e-05, + "loss": 2.6838, + "step": 25890 + }, + { + "epoch": 2.345677334601708, + "grad_norm": 1.051897644996643, + "learning_rate": 4.361747115326527e-05, + "loss": 2.4988, + "step": 25891 + }, + { + "epoch": 2.3457679327761545, + "grad_norm": 0.9924985766410828, + "learning_rate": 4.3611429952274514e-05, + "loss": 2.6489, + "step": 25892 + }, + { + "epoch": 2.3458585309506015, + "grad_norm": 1.0145902633666992, + "learning_rate": 4.360538875128376e-05, + "loss": 2.636, + "step": 25893 + }, + { + "epoch": 2.345949129125048, + "grad_norm": 0.8762139081954956, + "learning_rate": 4.3599347550293e-05, + "loss": 1.9144, + "step": 25894 + }, + { + "epoch": 2.346039727299495, + "grad_norm": 1.089013695716858, + "learning_rate": 4.359330634930224e-05, + "loss": 2.5922, + "step": 25895 + }, + { + "epoch": 2.3461303254739416, + "grad_norm": 1.0571447610855103, + "learning_rate": 4.3587265148311484e-05, + "loss": 2.6088, + "step": 25896 + }, + { + "epoch": 2.3462209236483886, + "grad_norm": 1.0052770376205444, + "learning_rate": 4.358122394732073e-05, + "loss": 2.6598, + "step": 25897 + }, + { + "epoch": 2.346311521822835, + "grad_norm": 1.0129183530807495, + "learning_rate": 4.357518274632997e-05, + "loss": 2.9873, + "step": 25898 + }, + { + "epoch": 2.346402119997282, + "grad_norm": 1.0165022611618042, + "learning_rate": 4.356914154533921e-05, + "loss": 2.5066, + "step": 25899 + }, + { + "epoch": 2.3464927181717288, + "grad_norm": 1.0415141582489014, + "learning_rate": 4.3563100344348454e-05, + "loss": 2.6483, + "step": 25900 + }, + { + "epoch": 2.346583316346176, + "grad_norm": 0.9659922122955322, + "learning_rate": 4.35570591433577e-05, + "loss": 2.6605, + "step": 25901 + }, + { + "epoch": 2.3466739145206224, + "grad_norm": 1.0504415035247803, + "learning_rate": 4.355101794236694e-05, + "loss": 2.8824, + "step": 25902 + }, + { + "epoch": 2.3467645126950694, + "grad_norm": 1.0494959354400635, + "learning_rate": 4.3544976741376183e-05, + "loss": 2.3557, + "step": 25903 + }, + { + "epoch": 2.346855110869516, + "grad_norm": 0.7090479135513306, + "learning_rate": 4.353893554038543e-05, + "loss": 1.2703, + "step": 25904 + }, + { + "epoch": 2.346945709043963, + "grad_norm": 0.7266339659690857, + "learning_rate": 4.353289433939468e-05, + "loss": 1.308, + "step": 25905 + }, + { + "epoch": 2.3470363072184095, + "grad_norm": 1.016699194908142, + "learning_rate": 4.352685313840392e-05, + "loss": 2.6873, + "step": 25906 + }, + { + "epoch": 2.3471269053928565, + "grad_norm": 1.1048816442489624, + "learning_rate": 4.352081193741316e-05, + "loss": 2.2944, + "step": 25907 + }, + { + "epoch": 2.347217503567303, + "grad_norm": 0.9485506415367126, + "learning_rate": 4.351477073642241e-05, + "loss": 2.3648, + "step": 25908 + }, + { + "epoch": 2.34730810174175, + "grad_norm": 1.0121829509735107, + "learning_rate": 4.350872953543165e-05, + "loss": 2.6363, + "step": 25909 + }, + { + "epoch": 2.3473986999161967, + "grad_norm": 0.9043336510658264, + "learning_rate": 4.350268833444089e-05, + "loss": 2.0831, + "step": 25910 + }, + { + "epoch": 2.3474892980906437, + "grad_norm": 1.0133479833602905, + "learning_rate": 4.349664713345013e-05, + "loss": 2.7201, + "step": 25911 + }, + { + "epoch": 2.3475798962650902, + "grad_norm": 0.9776719808578491, + "learning_rate": 4.349060593245938e-05, + "loss": 2.5922, + "step": 25912 + }, + { + "epoch": 2.3476704944395372, + "grad_norm": 0.9553937315940857, + "learning_rate": 4.348456473146862e-05, + "loss": 2.4649, + "step": 25913 + }, + { + "epoch": 2.347761092613984, + "grad_norm": 0.9589071869850159, + "learning_rate": 4.347852353047786e-05, + "loss": 2.6184, + "step": 25914 + }, + { + "epoch": 2.3478516907884304, + "grad_norm": 1.004909634590149, + "learning_rate": 4.34724823294871e-05, + "loss": 2.656, + "step": 25915 + }, + { + "epoch": 2.3479422889628774, + "grad_norm": 1.0113234519958496, + "learning_rate": 4.346644112849635e-05, + "loss": 2.5556, + "step": 25916 + }, + { + "epoch": 2.3480328871373244, + "grad_norm": 1.1550477743148804, + "learning_rate": 4.346039992750559e-05, + "loss": 2.6434, + "step": 25917 + }, + { + "epoch": 2.348123485311771, + "grad_norm": 0.9491479992866516, + "learning_rate": 4.345435872651483e-05, + "loss": 2.4282, + "step": 25918 + }, + { + "epoch": 2.3482140834862175, + "grad_norm": 1.142259955406189, + "learning_rate": 4.344831752552407e-05, + "loss": 2.3746, + "step": 25919 + }, + { + "epoch": 2.3483046816606645, + "grad_norm": 1.0511069297790527, + "learning_rate": 4.344227632453332e-05, + "loss": 2.5496, + "step": 25920 + }, + { + "epoch": 2.3483952798351115, + "grad_norm": 1.0198017358779907, + "learning_rate": 4.343623512354256e-05, + "loss": 2.6251, + "step": 25921 + }, + { + "epoch": 2.348485878009558, + "grad_norm": 0.9234383702278137, + "learning_rate": 4.343019392255181e-05, + "loss": 2.4535, + "step": 25922 + }, + { + "epoch": 2.3485764761840047, + "grad_norm": 0.8711468577384949, + "learning_rate": 4.3424152721561054e-05, + "loss": 2.1093, + "step": 25923 + }, + { + "epoch": 2.3486670743584517, + "grad_norm": 1.016886591911316, + "learning_rate": 4.3418111520570295e-05, + "loss": 2.7055, + "step": 25924 + }, + { + "epoch": 2.3487576725328987, + "grad_norm": 0.9864359498023987, + "learning_rate": 4.3412070319579536e-05, + "loss": 2.6082, + "step": 25925 + }, + { + "epoch": 2.3488482707073453, + "grad_norm": 1.0886338949203491, + "learning_rate": 4.340602911858878e-05, + "loss": 2.7386, + "step": 25926 + }, + { + "epoch": 2.348938868881792, + "grad_norm": 0.9650104641914368, + "learning_rate": 4.3399987917598024e-05, + "loss": 2.6413, + "step": 25927 + }, + { + "epoch": 2.349029467056239, + "grad_norm": 1.0116486549377441, + "learning_rate": 4.3393946716607265e-05, + "loss": 2.7731, + "step": 25928 + }, + { + "epoch": 2.349120065230686, + "grad_norm": 0.9371576905250549, + "learning_rate": 4.3387905515616506e-05, + "loss": 2.6598, + "step": 25929 + }, + { + "epoch": 2.3492106634051324, + "grad_norm": 0.9917187094688416, + "learning_rate": 4.338186431462575e-05, + "loss": 2.7087, + "step": 25930 + }, + { + "epoch": 2.349301261579579, + "grad_norm": 1.04741632938385, + "learning_rate": 4.3375823113634994e-05, + "loss": 2.8706, + "step": 25931 + }, + { + "epoch": 2.349391859754026, + "grad_norm": 1.0193034410476685, + "learning_rate": 4.3369781912644235e-05, + "loss": 2.5487, + "step": 25932 + }, + { + "epoch": 2.349482457928473, + "grad_norm": 1.0060662031173706, + "learning_rate": 4.3363740711653476e-05, + "loss": 2.4601, + "step": 25933 + }, + { + "epoch": 2.3495730561029196, + "grad_norm": 0.9874469637870789, + "learning_rate": 4.335769951066272e-05, + "loss": 2.6413, + "step": 25934 + }, + { + "epoch": 2.349663654277366, + "grad_norm": 0.9843688011169434, + "learning_rate": 4.3351658309671965e-05, + "loss": 2.5754, + "step": 25935 + }, + { + "epoch": 2.349754252451813, + "grad_norm": 1.143080472946167, + "learning_rate": 4.3345617108681205e-05, + "loss": 2.8236, + "step": 25936 + }, + { + "epoch": 2.3498448506262597, + "grad_norm": 0.8561882376670837, + "learning_rate": 4.3339575907690446e-05, + "loss": 2.3161, + "step": 25937 + }, + { + "epoch": 2.3499354488007067, + "grad_norm": 0.994733989238739, + "learning_rate": 4.3333534706699694e-05, + "loss": 2.8008, + "step": 25938 + }, + { + "epoch": 2.3500260469751533, + "grad_norm": 1.1505566835403442, + "learning_rate": 4.332749350570894e-05, + "loss": 2.312, + "step": 25939 + }, + { + "epoch": 2.3501166451496003, + "grad_norm": 0.7614502906799316, + "learning_rate": 4.332145230471818e-05, + "loss": 1.1554, + "step": 25940 + }, + { + "epoch": 2.350207243324047, + "grad_norm": 1.0277965068817139, + "learning_rate": 4.331541110372742e-05, + "loss": 2.604, + "step": 25941 + }, + { + "epoch": 2.350297841498494, + "grad_norm": 1.024452805519104, + "learning_rate": 4.330936990273667e-05, + "loss": 2.68, + "step": 25942 + }, + { + "epoch": 2.3503884396729404, + "grad_norm": 0.9127251505851746, + "learning_rate": 4.330332870174591e-05, + "loss": 2.0548, + "step": 25943 + }, + { + "epoch": 2.3504790378473874, + "grad_norm": 0.9797850847244263, + "learning_rate": 4.329728750075515e-05, + "loss": 2.4846, + "step": 25944 + }, + { + "epoch": 2.350569636021834, + "grad_norm": 0.975502073764801, + "learning_rate": 4.329124629976439e-05, + "loss": 2.4563, + "step": 25945 + }, + { + "epoch": 2.350660234196281, + "grad_norm": 0.9904894232749939, + "learning_rate": 4.328520509877364e-05, + "loss": 2.4895, + "step": 25946 + }, + { + "epoch": 2.3507508323707276, + "grad_norm": 1.0314875841140747, + "learning_rate": 4.327916389778288e-05, + "loss": 2.6977, + "step": 25947 + }, + { + "epoch": 2.3508414305451746, + "grad_norm": 0.9822983741760254, + "learning_rate": 4.327312269679212e-05, + "loss": 2.6291, + "step": 25948 + }, + { + "epoch": 2.350932028719621, + "grad_norm": 0.9989455938339233, + "learning_rate": 4.326708149580136e-05, + "loss": 2.3793, + "step": 25949 + }, + { + "epoch": 2.351022626894068, + "grad_norm": 0.9783684611320496, + "learning_rate": 4.326104029481061e-05, + "loss": 2.7331, + "step": 25950 + }, + { + "epoch": 2.3511132250685147, + "grad_norm": 1.109778642654419, + "learning_rate": 4.325499909381985e-05, + "loss": 2.4173, + "step": 25951 + }, + { + "epoch": 2.3512038232429617, + "grad_norm": 0.9872234463691711, + "learning_rate": 4.324895789282909e-05, + "loss": 2.5058, + "step": 25952 + }, + { + "epoch": 2.3512944214174083, + "grad_norm": 0.8327586054801941, + "learning_rate": 4.324291669183834e-05, + "loss": 1.8581, + "step": 25953 + }, + { + "epoch": 2.3513850195918553, + "grad_norm": 1.0356502532958984, + "learning_rate": 4.323687549084758e-05, + "loss": 2.5105, + "step": 25954 + }, + { + "epoch": 2.351475617766302, + "grad_norm": 0.8740201592445374, + "learning_rate": 4.323083428985683e-05, + "loss": 2.0529, + "step": 25955 + }, + { + "epoch": 2.351566215940749, + "grad_norm": 1.0630909204483032, + "learning_rate": 4.322479308886607e-05, + "loss": 2.4355, + "step": 25956 + }, + { + "epoch": 2.3516568141151954, + "grad_norm": 1.0686742067337036, + "learning_rate": 4.321875188787532e-05, + "loss": 2.442, + "step": 25957 + }, + { + "epoch": 2.3517474122896425, + "grad_norm": 1.0328370332717896, + "learning_rate": 4.321271068688456e-05, + "loss": 2.8377, + "step": 25958 + }, + { + "epoch": 2.351838010464089, + "grad_norm": 1.077552080154419, + "learning_rate": 4.32066694858938e-05, + "loss": 2.4842, + "step": 25959 + }, + { + "epoch": 2.351928608638536, + "grad_norm": 0.9696338176727295, + "learning_rate": 4.320062828490304e-05, + "loss": 2.6951, + "step": 25960 + }, + { + "epoch": 2.3520192068129826, + "grad_norm": 1.0197911262512207, + "learning_rate": 4.319458708391229e-05, + "loss": 2.7567, + "step": 25961 + }, + { + "epoch": 2.3521098049874296, + "grad_norm": 0.9492219090461731, + "learning_rate": 4.318854588292153e-05, + "loss": 2.7645, + "step": 25962 + }, + { + "epoch": 2.352200403161876, + "grad_norm": 0.7626693844795227, + "learning_rate": 4.318250468193077e-05, + "loss": 1.9215, + "step": 25963 + }, + { + "epoch": 2.352291001336323, + "grad_norm": 1.0810885429382324, + "learning_rate": 4.317646348094001e-05, + "loss": 2.884, + "step": 25964 + }, + { + "epoch": 2.3523815995107697, + "grad_norm": 0.9932699799537659, + "learning_rate": 4.317042227994926e-05, + "loss": 2.624, + "step": 25965 + }, + { + "epoch": 2.3524721976852168, + "grad_norm": 0.9573240280151367, + "learning_rate": 4.31643810789585e-05, + "loss": 2.626, + "step": 25966 + }, + { + "epoch": 2.3525627958596633, + "grad_norm": 1.003549575805664, + "learning_rate": 4.315833987796774e-05, + "loss": 2.6557, + "step": 25967 + }, + { + "epoch": 2.3526533940341103, + "grad_norm": 1.0043116807937622, + "learning_rate": 4.3152298676976986e-05, + "loss": 2.5751, + "step": 25968 + }, + { + "epoch": 2.352743992208557, + "grad_norm": 0.9927775263786316, + "learning_rate": 4.314625747598623e-05, + "loss": 2.7606, + "step": 25969 + }, + { + "epoch": 2.352834590383004, + "grad_norm": 1.055342435836792, + "learning_rate": 4.314021627499547e-05, + "loss": 2.6328, + "step": 25970 + }, + { + "epoch": 2.3529251885574505, + "grad_norm": 1.0073020458221436, + "learning_rate": 4.3134175074004716e-05, + "loss": 2.5906, + "step": 25971 + }, + { + "epoch": 2.3530157867318975, + "grad_norm": 0.9472389817237854, + "learning_rate": 4.3128133873013957e-05, + "loss": 2.6929, + "step": 25972 + }, + { + "epoch": 2.353106384906344, + "grad_norm": 0.9686990976333618, + "learning_rate": 4.3122092672023204e-05, + "loss": 2.6417, + "step": 25973 + }, + { + "epoch": 2.353196983080791, + "grad_norm": 1.0808192491531372, + "learning_rate": 4.3116051471032445e-05, + "loss": 2.6439, + "step": 25974 + }, + { + "epoch": 2.3532875812552376, + "grad_norm": 0.9853067994117737, + "learning_rate": 4.3110010270041686e-05, + "loss": 2.6646, + "step": 25975 + }, + { + "epoch": 2.3533781794296846, + "grad_norm": 0.9627915024757385, + "learning_rate": 4.3103969069050933e-05, + "loss": 2.5167, + "step": 25976 + }, + { + "epoch": 2.353468777604131, + "grad_norm": 1.1111228466033936, + "learning_rate": 4.3097927868060174e-05, + "loss": 2.6685, + "step": 25977 + }, + { + "epoch": 2.353559375778578, + "grad_norm": 0.961297333240509, + "learning_rate": 4.3091886667069415e-05, + "loss": 2.3755, + "step": 25978 + }, + { + "epoch": 2.3536499739530248, + "grad_norm": 1.0619585514068604, + "learning_rate": 4.3085845466078656e-05, + "loss": 3.0377, + "step": 25979 + }, + { + "epoch": 2.353740572127472, + "grad_norm": 1.0132453441619873, + "learning_rate": 4.3079804265087904e-05, + "loss": 2.4704, + "step": 25980 + }, + { + "epoch": 2.3538311703019184, + "grad_norm": 1.0051738023757935, + "learning_rate": 4.3073763064097144e-05, + "loss": 2.7185, + "step": 25981 + }, + { + "epoch": 2.3539217684763654, + "grad_norm": 0.988314151763916, + "learning_rate": 4.3067721863106385e-05, + "loss": 2.7313, + "step": 25982 + }, + { + "epoch": 2.354012366650812, + "grad_norm": 0.9603517055511475, + "learning_rate": 4.306168066211563e-05, + "loss": 2.5094, + "step": 25983 + }, + { + "epoch": 2.354102964825259, + "grad_norm": 0.9719129800796509, + "learning_rate": 4.3055639461124874e-05, + "loss": 2.628, + "step": 25984 + }, + { + "epoch": 2.3541935629997055, + "grad_norm": 0.8109812140464783, + "learning_rate": 4.3049598260134114e-05, + "loss": 1.9894, + "step": 25985 + }, + { + "epoch": 2.3542841611741525, + "grad_norm": 1.0274351835250854, + "learning_rate": 4.3043557059143355e-05, + "loss": 2.6199, + "step": 25986 + }, + { + "epoch": 2.354374759348599, + "grad_norm": 0.9789609313011169, + "learning_rate": 4.30375158581526e-05, + "loss": 2.6096, + "step": 25987 + }, + { + "epoch": 2.354465357523046, + "grad_norm": 1.01669180393219, + "learning_rate": 4.3031474657161844e-05, + "loss": 2.6619, + "step": 25988 + }, + { + "epoch": 2.3545559556974927, + "grad_norm": 1.1625295877456665, + "learning_rate": 4.302543345617109e-05, + "loss": 2.4995, + "step": 25989 + }, + { + "epoch": 2.3546465538719397, + "grad_norm": 1.0731697082519531, + "learning_rate": 4.301939225518033e-05, + "loss": 2.7293, + "step": 25990 + }, + { + "epoch": 2.3547371520463862, + "grad_norm": 1.0644603967666626, + "learning_rate": 4.301335105418958e-05, + "loss": 2.4428, + "step": 25991 + }, + { + "epoch": 2.3548277502208332, + "grad_norm": 1.131136417388916, + "learning_rate": 4.300730985319882e-05, + "loss": 2.5965, + "step": 25992 + }, + { + "epoch": 2.35491834839528, + "grad_norm": 0.9968559741973877, + "learning_rate": 4.300126865220806e-05, + "loss": 2.6612, + "step": 25993 + }, + { + "epoch": 2.355008946569727, + "grad_norm": 1.0485702753067017, + "learning_rate": 4.29952274512173e-05, + "loss": 2.6047, + "step": 25994 + }, + { + "epoch": 2.3550995447441734, + "grad_norm": 0.9814636707305908, + "learning_rate": 4.298918625022655e-05, + "loss": 2.5735, + "step": 25995 + }, + { + "epoch": 2.3551901429186204, + "grad_norm": 1.028619647026062, + "learning_rate": 4.298314504923579e-05, + "loss": 2.64, + "step": 25996 + }, + { + "epoch": 2.355280741093067, + "grad_norm": 0.9782925844192505, + "learning_rate": 4.297710384824503e-05, + "loss": 2.5171, + "step": 25997 + }, + { + "epoch": 2.3553713392675135, + "grad_norm": 0.9524559378623962, + "learning_rate": 4.297106264725428e-05, + "loss": 2.3227, + "step": 25998 + }, + { + "epoch": 2.3554619374419605, + "grad_norm": 1.0556715726852417, + "learning_rate": 4.296502144626352e-05, + "loss": 2.432, + "step": 25999 + }, + { + "epoch": 2.3555525356164075, + "grad_norm": 0.9454003572463989, + "learning_rate": 4.295898024527276e-05, + "loss": 2.3934, + "step": 26000 + }, + { + "epoch": 2.355643133790854, + "grad_norm": 1.038979172706604, + "learning_rate": 4.2952939044282e-05, + "loss": 2.5036, + "step": 26001 + }, + { + "epoch": 2.3557337319653007, + "grad_norm": 1.0968446731567383, + "learning_rate": 4.294689784329125e-05, + "loss": 2.4679, + "step": 26002 + }, + { + "epoch": 2.3558243301397477, + "grad_norm": 0.9926896691322327, + "learning_rate": 4.294085664230049e-05, + "loss": 2.7584, + "step": 26003 + }, + { + "epoch": 2.3559149283141947, + "grad_norm": 0.954714834690094, + "learning_rate": 4.293481544130973e-05, + "loss": 2.4398, + "step": 26004 + }, + { + "epoch": 2.3560055264886413, + "grad_norm": 0.9743886590003967, + "learning_rate": 4.292877424031898e-05, + "loss": 2.4917, + "step": 26005 + }, + { + "epoch": 2.356096124663088, + "grad_norm": 1.00576913356781, + "learning_rate": 4.292273303932822e-05, + "loss": 2.4074, + "step": 26006 + }, + { + "epoch": 2.356186722837535, + "grad_norm": 0.9447089433670044, + "learning_rate": 4.291669183833747e-05, + "loss": 2.1966, + "step": 26007 + }, + { + "epoch": 2.356277321011982, + "grad_norm": 1.0171477794647217, + "learning_rate": 4.291065063734671e-05, + "loss": 2.9502, + "step": 26008 + }, + { + "epoch": 2.3563679191864284, + "grad_norm": 1.0054516792297363, + "learning_rate": 4.290460943635595e-05, + "loss": 2.678, + "step": 26009 + }, + { + "epoch": 2.356458517360875, + "grad_norm": 1.048703670501709, + "learning_rate": 4.2898568235365196e-05, + "loss": 2.355, + "step": 26010 + }, + { + "epoch": 2.356549115535322, + "grad_norm": 0.924824595451355, + "learning_rate": 4.289252703437444e-05, + "loss": 2.0621, + "step": 26011 + }, + { + "epoch": 2.356639713709769, + "grad_norm": 1.0279561281204224, + "learning_rate": 4.288648583338368e-05, + "loss": 2.793, + "step": 26012 + }, + { + "epoch": 2.3567303118842156, + "grad_norm": 1.0111579895019531, + "learning_rate": 4.2880444632392925e-05, + "loss": 2.7626, + "step": 26013 + }, + { + "epoch": 2.356820910058662, + "grad_norm": 1.0525524616241455, + "learning_rate": 4.2874403431402166e-05, + "loss": 2.9373, + "step": 26014 + }, + { + "epoch": 2.356911508233109, + "grad_norm": 1.0281187295913696, + "learning_rate": 4.286836223041141e-05, + "loss": 2.7976, + "step": 26015 + }, + { + "epoch": 2.3570021064075557, + "grad_norm": 1.0458163022994995, + "learning_rate": 4.286232102942065e-05, + "loss": 2.5198, + "step": 26016 + }, + { + "epoch": 2.3570927045820027, + "grad_norm": 0.9398977756500244, + "learning_rate": 4.2856279828429896e-05, + "loss": 1.9716, + "step": 26017 + }, + { + "epoch": 2.3571833027564493, + "grad_norm": 0.9975727200508118, + "learning_rate": 4.2850238627439136e-05, + "loss": 2.6427, + "step": 26018 + }, + { + "epoch": 2.3572739009308963, + "grad_norm": 1.0922166109085083, + "learning_rate": 4.284419742644838e-05, + "loss": 2.7509, + "step": 26019 + }, + { + "epoch": 2.357364499105343, + "grad_norm": 1.1806552410125732, + "learning_rate": 4.283815622545762e-05, + "loss": 2.4445, + "step": 26020 + }, + { + "epoch": 2.35745509727979, + "grad_norm": 0.815394401550293, + "learning_rate": 4.2832115024466866e-05, + "loss": 1.8142, + "step": 26021 + }, + { + "epoch": 2.3575456954542364, + "grad_norm": 0.9920897483825684, + "learning_rate": 4.2826073823476106e-05, + "loss": 2.5126, + "step": 26022 + }, + { + "epoch": 2.3576362936286834, + "grad_norm": 0.9688383936882019, + "learning_rate": 4.2820032622485354e-05, + "loss": 2.6138, + "step": 26023 + }, + { + "epoch": 2.35772689180313, + "grad_norm": 1.0170701742172241, + "learning_rate": 4.2813991421494595e-05, + "loss": 2.7294, + "step": 26024 + }, + { + "epoch": 2.357817489977577, + "grad_norm": 1.0169084072113037, + "learning_rate": 4.280795022050384e-05, + "loss": 2.5304, + "step": 26025 + }, + { + "epoch": 2.3579080881520236, + "grad_norm": 0.7728280425071716, + "learning_rate": 4.280190901951308e-05, + "loss": 1.9174, + "step": 26026 + }, + { + "epoch": 2.3579986863264706, + "grad_norm": 1.1603158712387085, + "learning_rate": 4.2795867818522324e-05, + "loss": 2.5046, + "step": 26027 + }, + { + "epoch": 2.358089284500917, + "grad_norm": 1.0172057151794434, + "learning_rate": 4.278982661753157e-05, + "loss": 2.6062, + "step": 26028 + }, + { + "epoch": 2.358179882675364, + "grad_norm": 0.8816168308258057, + "learning_rate": 4.278378541654081e-05, + "loss": 1.9996, + "step": 26029 + }, + { + "epoch": 2.3582704808498107, + "grad_norm": 0.8685718774795532, + "learning_rate": 4.2777744215550053e-05, + "loss": 2.0283, + "step": 26030 + }, + { + "epoch": 2.3583610790242577, + "grad_norm": 1.1366934776306152, + "learning_rate": 4.2771703014559294e-05, + "loss": 2.3771, + "step": 26031 + }, + { + "epoch": 2.3584516771987043, + "grad_norm": 1.0717381238937378, + "learning_rate": 4.276566181356854e-05, + "loss": 2.4914, + "step": 26032 + }, + { + "epoch": 2.3585422753731513, + "grad_norm": 0.9836916923522949, + "learning_rate": 4.275962061257778e-05, + "loss": 2.7705, + "step": 26033 + }, + { + "epoch": 2.358632873547598, + "grad_norm": 1.0142829418182373, + "learning_rate": 4.2753579411587024e-05, + "loss": 2.6477, + "step": 26034 + }, + { + "epoch": 2.358723471722045, + "grad_norm": 0.8411500453948975, + "learning_rate": 4.2747538210596264e-05, + "loss": 2.0693, + "step": 26035 + }, + { + "epoch": 2.3588140698964914, + "grad_norm": 0.9639889001846313, + "learning_rate": 4.274149700960551e-05, + "loss": 2.7819, + "step": 26036 + }, + { + "epoch": 2.3589046680709385, + "grad_norm": 1.0061599016189575, + "learning_rate": 4.273545580861475e-05, + "loss": 2.48, + "step": 26037 + }, + { + "epoch": 2.358995266245385, + "grad_norm": 0.9705473184585571, + "learning_rate": 4.2729414607623994e-05, + "loss": 2.4614, + "step": 26038 + }, + { + "epoch": 2.359085864419832, + "grad_norm": 1.0229111909866333, + "learning_rate": 4.272337340663324e-05, + "loss": 2.6669, + "step": 26039 + }, + { + "epoch": 2.3591764625942786, + "grad_norm": 0.984583854675293, + "learning_rate": 4.271733220564248e-05, + "loss": 2.5425, + "step": 26040 + }, + { + "epoch": 2.3592670607687256, + "grad_norm": 1.1113308668136597, + "learning_rate": 4.271129100465173e-05, + "loss": 2.3847, + "step": 26041 + }, + { + "epoch": 2.359357658943172, + "grad_norm": 1.0123260021209717, + "learning_rate": 4.270524980366097e-05, + "loss": 2.6841, + "step": 26042 + }, + { + "epoch": 2.359448257117619, + "grad_norm": 0.9468674659729004, + "learning_rate": 4.269920860267022e-05, + "loss": 1.9651, + "step": 26043 + }, + { + "epoch": 2.3595388552920658, + "grad_norm": 1.0443956851959229, + "learning_rate": 4.269316740167946e-05, + "loss": 2.7654, + "step": 26044 + }, + { + "epoch": 2.3596294534665128, + "grad_norm": 0.990503191947937, + "learning_rate": 4.26871262006887e-05, + "loss": 2.691, + "step": 26045 + }, + { + "epoch": 2.3597200516409593, + "grad_norm": 0.9237751364707947, + "learning_rate": 4.268108499969794e-05, + "loss": 2.01, + "step": 26046 + }, + { + "epoch": 2.3598106498154063, + "grad_norm": 1.0392459630966187, + "learning_rate": 4.267504379870719e-05, + "loss": 2.803, + "step": 26047 + }, + { + "epoch": 2.359901247989853, + "grad_norm": 1.0150765180587769, + "learning_rate": 4.266900259771643e-05, + "loss": 2.7067, + "step": 26048 + }, + { + "epoch": 2.3599918461643, + "grad_norm": 1.0932637453079224, + "learning_rate": 4.266296139672567e-05, + "loss": 2.4786, + "step": 26049 + }, + { + "epoch": 2.3600824443387465, + "grad_norm": 0.8381004929542542, + "learning_rate": 4.265692019573491e-05, + "loss": 1.8925, + "step": 26050 + }, + { + "epoch": 2.3601730425131935, + "grad_norm": 0.9224766492843628, + "learning_rate": 4.265087899474416e-05, + "loss": 2.1655, + "step": 26051 + }, + { + "epoch": 2.36026364068764, + "grad_norm": 1.0854591131210327, + "learning_rate": 4.26448377937534e-05, + "loss": 2.5398, + "step": 26052 + }, + { + "epoch": 2.360354238862087, + "grad_norm": 1.0536998510360718, + "learning_rate": 4.263879659276264e-05, + "loss": 2.4474, + "step": 26053 + }, + { + "epoch": 2.3604448370365336, + "grad_norm": 1.0894025564193726, + "learning_rate": 4.263275539177188e-05, + "loss": 2.4451, + "step": 26054 + }, + { + "epoch": 2.3605354352109806, + "grad_norm": 1.0225801467895508, + "learning_rate": 4.262671419078113e-05, + "loss": 2.6236, + "step": 26055 + }, + { + "epoch": 2.360626033385427, + "grad_norm": 1.0026028156280518, + "learning_rate": 4.262067298979037e-05, + "loss": 2.0914, + "step": 26056 + }, + { + "epoch": 2.360716631559874, + "grad_norm": 0.9528965353965759, + "learning_rate": 4.261463178879962e-05, + "loss": 2.7201, + "step": 26057 + }, + { + "epoch": 2.3608072297343208, + "grad_norm": 0.992428183555603, + "learning_rate": 4.260859058780886e-05, + "loss": 1.9726, + "step": 26058 + }, + { + "epoch": 2.360897827908768, + "grad_norm": 1.1112773418426514, + "learning_rate": 4.2602549386818105e-05, + "loss": 2.7857, + "step": 26059 + }, + { + "epoch": 2.3609884260832144, + "grad_norm": 1.080403208732605, + "learning_rate": 4.2596508185827346e-05, + "loss": 2.4003, + "step": 26060 + }, + { + "epoch": 2.3610790242576614, + "grad_norm": 0.9497026801109314, + "learning_rate": 4.259046698483659e-05, + "loss": 2.5006, + "step": 26061 + }, + { + "epoch": 2.361169622432108, + "grad_norm": 1.11782968044281, + "learning_rate": 4.2584425783845835e-05, + "loss": 2.4413, + "step": 26062 + }, + { + "epoch": 2.361260220606555, + "grad_norm": 1.0323822498321533, + "learning_rate": 4.2578384582855075e-05, + "loss": 2.42, + "step": 26063 + }, + { + "epoch": 2.3613508187810015, + "grad_norm": 1.0285528898239136, + "learning_rate": 4.2572343381864316e-05, + "loss": 2.6409, + "step": 26064 + }, + { + "epoch": 2.3614414169554485, + "grad_norm": 0.9569602608680725, + "learning_rate": 4.256630218087356e-05, + "loss": 2.5696, + "step": 26065 + }, + { + "epoch": 2.361532015129895, + "grad_norm": 1.1599246263504028, + "learning_rate": 4.2560260979882805e-05, + "loss": 2.22, + "step": 26066 + }, + { + "epoch": 2.361622613304342, + "grad_norm": 1.1113643646240234, + "learning_rate": 4.2554219778892045e-05, + "loss": 2.506, + "step": 26067 + }, + { + "epoch": 2.3617132114787887, + "grad_norm": 1.015333652496338, + "learning_rate": 4.2548178577901286e-05, + "loss": 2.3851, + "step": 26068 + }, + { + "epoch": 2.3618038096532357, + "grad_norm": 0.9958947896957397, + "learning_rate": 4.254213737691053e-05, + "loss": 2.6889, + "step": 26069 + }, + { + "epoch": 2.3618944078276822, + "grad_norm": 0.952400267124176, + "learning_rate": 4.2536096175919775e-05, + "loss": 2.4645, + "step": 26070 + }, + { + "epoch": 2.3619850060021292, + "grad_norm": 1.0808075666427612, + "learning_rate": 4.2530054974929016e-05, + "loss": 3.0339, + "step": 26071 + }, + { + "epoch": 2.362075604176576, + "grad_norm": 1.0576767921447754, + "learning_rate": 4.2524013773938256e-05, + "loss": 2.8261, + "step": 26072 + }, + { + "epoch": 2.362166202351023, + "grad_norm": 0.9583423733711243, + "learning_rate": 4.2517972572947504e-05, + "loss": 2.5977, + "step": 26073 + }, + { + "epoch": 2.3622568005254694, + "grad_norm": 1.1226757764816284, + "learning_rate": 4.2511931371956745e-05, + "loss": 2.8132, + "step": 26074 + }, + { + "epoch": 2.3623473986999164, + "grad_norm": 1.0170038938522339, + "learning_rate": 4.250589017096599e-05, + "loss": 2.77, + "step": 26075 + }, + { + "epoch": 2.362437996874363, + "grad_norm": 0.9717448353767395, + "learning_rate": 4.249984896997523e-05, + "loss": 2.7626, + "step": 26076 + }, + { + "epoch": 2.3625285950488095, + "grad_norm": 1.0413877964019775, + "learning_rate": 4.249380776898448e-05, + "loss": 2.6, + "step": 26077 + }, + { + "epoch": 2.3626191932232565, + "grad_norm": 1.0275579690933228, + "learning_rate": 4.248776656799372e-05, + "loss": 2.668, + "step": 26078 + }, + { + "epoch": 2.3627097913977035, + "grad_norm": 0.9733349084854126, + "learning_rate": 4.248172536700296e-05, + "loss": 2.5039, + "step": 26079 + }, + { + "epoch": 2.36280038957215, + "grad_norm": 1.0022709369659424, + "learning_rate": 4.24756841660122e-05, + "loss": 2.6149, + "step": 26080 + }, + { + "epoch": 2.3628909877465967, + "grad_norm": 0.9635748267173767, + "learning_rate": 4.246964296502145e-05, + "loss": 2.6228, + "step": 26081 + }, + { + "epoch": 2.3629815859210437, + "grad_norm": 1.0127241611480713, + "learning_rate": 4.246360176403069e-05, + "loss": 2.4662, + "step": 26082 + }, + { + "epoch": 2.3630721840954907, + "grad_norm": 0.9822820425033569, + "learning_rate": 4.245756056303993e-05, + "loss": 2.4563, + "step": 26083 + }, + { + "epoch": 2.3631627822699373, + "grad_norm": 1.0812759399414062, + "learning_rate": 4.2451519362049173e-05, + "loss": 2.6325, + "step": 26084 + }, + { + "epoch": 2.363253380444384, + "grad_norm": 1.0911545753479004, + "learning_rate": 4.244547816105842e-05, + "loss": 2.9225, + "step": 26085 + }, + { + "epoch": 2.363343978618831, + "grad_norm": 0.9938536286354065, + "learning_rate": 4.243943696006766e-05, + "loss": 2.6431, + "step": 26086 + }, + { + "epoch": 2.363434576793278, + "grad_norm": 0.7939939498901367, + "learning_rate": 4.24333957590769e-05, + "loss": 1.973, + "step": 26087 + }, + { + "epoch": 2.3635251749677244, + "grad_norm": 1.023284673690796, + "learning_rate": 4.242735455808615e-05, + "loss": 2.636, + "step": 26088 + }, + { + "epoch": 2.363615773142171, + "grad_norm": 1.0510460138320923, + "learning_rate": 4.242131335709539e-05, + "loss": 2.3296, + "step": 26089 + }, + { + "epoch": 2.363706371316618, + "grad_norm": 1.0377897024154663, + "learning_rate": 4.241527215610463e-05, + "loss": 2.3551, + "step": 26090 + }, + { + "epoch": 2.363796969491065, + "grad_norm": 0.9691430926322937, + "learning_rate": 4.240923095511388e-05, + "loss": 2.6004, + "step": 26091 + }, + { + "epoch": 2.3638875676655116, + "grad_norm": 1.0658897161483765, + "learning_rate": 4.240318975412313e-05, + "loss": 2.9676, + "step": 26092 + }, + { + "epoch": 2.363978165839958, + "grad_norm": 1.0367690324783325, + "learning_rate": 4.239714855313237e-05, + "loss": 2.4889, + "step": 26093 + }, + { + "epoch": 2.364068764014405, + "grad_norm": 1.006941318511963, + "learning_rate": 4.239110735214161e-05, + "loss": 2.8085, + "step": 26094 + }, + { + "epoch": 2.364159362188852, + "grad_norm": 0.9648860692977905, + "learning_rate": 4.238506615115085e-05, + "loss": 2.436, + "step": 26095 + }, + { + "epoch": 2.3642499603632987, + "grad_norm": 0.8256365656852722, + "learning_rate": 4.23790249501601e-05, + "loss": 1.8371, + "step": 26096 + }, + { + "epoch": 2.3643405585377453, + "grad_norm": 0.9220486283302307, + "learning_rate": 4.237298374916934e-05, + "loss": 2.2208, + "step": 26097 + }, + { + "epoch": 2.3644311567121923, + "grad_norm": 0.9837729334831238, + "learning_rate": 4.236694254817858e-05, + "loss": 2.5073, + "step": 26098 + }, + { + "epoch": 2.364521754886639, + "grad_norm": 0.9876716136932373, + "learning_rate": 4.236090134718782e-05, + "loss": 2.515, + "step": 26099 + }, + { + "epoch": 2.364612353061086, + "grad_norm": 0.9697338938713074, + "learning_rate": 4.235486014619707e-05, + "loss": 1.8236, + "step": 26100 + }, + { + "epoch": 2.3647029512355324, + "grad_norm": 1.0122010707855225, + "learning_rate": 4.234881894520631e-05, + "loss": 2.4622, + "step": 26101 + }, + { + "epoch": 2.3647935494099794, + "grad_norm": 1.015225887298584, + "learning_rate": 4.234277774421555e-05, + "loss": 2.5512, + "step": 26102 + }, + { + "epoch": 2.364884147584426, + "grad_norm": 1.0354862213134766, + "learning_rate": 4.2336736543224797e-05, + "loss": 2.5361, + "step": 26103 + }, + { + "epoch": 2.364974745758873, + "grad_norm": 1.032948613166809, + "learning_rate": 4.233069534223404e-05, + "loss": 2.4951, + "step": 26104 + }, + { + "epoch": 2.3650653439333196, + "grad_norm": 1.0197398662567139, + "learning_rate": 4.232465414124328e-05, + "loss": 2.4604, + "step": 26105 + }, + { + "epoch": 2.3651559421077666, + "grad_norm": 0.9585536122322083, + "learning_rate": 4.231861294025252e-05, + "loss": 2.4732, + "step": 26106 + }, + { + "epoch": 2.365246540282213, + "grad_norm": 1.0040416717529297, + "learning_rate": 4.231257173926177e-05, + "loss": 2.5917, + "step": 26107 + }, + { + "epoch": 2.36533713845666, + "grad_norm": 1.0515276193618774, + "learning_rate": 4.230653053827101e-05, + "loss": 2.5883, + "step": 26108 + }, + { + "epoch": 2.3654277366311067, + "grad_norm": 1.036458969116211, + "learning_rate": 4.2300489337280255e-05, + "loss": 2.9156, + "step": 26109 + }, + { + "epoch": 2.3655183348055537, + "grad_norm": 1.0473504066467285, + "learning_rate": 4.2294448136289496e-05, + "loss": 2.6956, + "step": 26110 + }, + { + "epoch": 2.3656089329800003, + "grad_norm": 1.0681971311569214, + "learning_rate": 4.2288406935298744e-05, + "loss": 2.8439, + "step": 26111 + }, + { + "epoch": 2.3656995311544473, + "grad_norm": 1.0747305154800415, + "learning_rate": 4.2282365734307984e-05, + "loss": 2.6365, + "step": 26112 + }, + { + "epoch": 2.365790129328894, + "grad_norm": 0.9965832829475403, + "learning_rate": 4.2276324533317225e-05, + "loss": 2.7185, + "step": 26113 + }, + { + "epoch": 2.365880727503341, + "grad_norm": 1.056749939918518, + "learning_rate": 4.2270283332326466e-05, + "loss": 2.4754, + "step": 26114 + }, + { + "epoch": 2.3659713256777875, + "grad_norm": 1.032705307006836, + "learning_rate": 4.2264242131335714e-05, + "loss": 2.5557, + "step": 26115 + }, + { + "epoch": 2.3660619238522345, + "grad_norm": 0.9699844121932983, + "learning_rate": 4.2258200930344954e-05, + "loss": 2.496, + "step": 26116 + }, + { + "epoch": 2.366152522026681, + "grad_norm": 0.9728602170944214, + "learning_rate": 4.2252159729354195e-05, + "loss": 2.4253, + "step": 26117 + }, + { + "epoch": 2.366243120201128, + "grad_norm": 1.1933000087738037, + "learning_rate": 4.224611852836344e-05, + "loss": 2.4349, + "step": 26118 + }, + { + "epoch": 2.3663337183755746, + "grad_norm": 0.9449526071548462, + "learning_rate": 4.2240077327372684e-05, + "loss": 2.7266, + "step": 26119 + }, + { + "epoch": 2.3664243165500216, + "grad_norm": 1.0314826965332031, + "learning_rate": 4.2234036126381925e-05, + "loss": 2.9258, + "step": 26120 + }, + { + "epoch": 2.366514914724468, + "grad_norm": 1.0510568618774414, + "learning_rate": 4.2227994925391165e-05, + "loss": 2.5848, + "step": 26121 + }, + { + "epoch": 2.366605512898915, + "grad_norm": 1.059167742729187, + "learning_rate": 4.222195372440041e-05, + "loss": 2.6856, + "step": 26122 + }, + { + "epoch": 2.3666961110733618, + "grad_norm": 0.9667261242866516, + "learning_rate": 4.2215912523409654e-05, + "loss": 2.0099, + "step": 26123 + }, + { + "epoch": 2.3667867092478088, + "grad_norm": 1.0022053718566895, + "learning_rate": 4.2209871322418895e-05, + "loss": 2.5361, + "step": 26124 + }, + { + "epoch": 2.3668773074222553, + "grad_norm": 0.8766599893569946, + "learning_rate": 4.220383012142814e-05, + "loss": 2.1514, + "step": 26125 + }, + { + "epoch": 2.3669679055967023, + "grad_norm": 0.9403641223907471, + "learning_rate": 4.219778892043739e-05, + "loss": 1.9847, + "step": 26126 + }, + { + "epoch": 2.367058503771149, + "grad_norm": 1.1001030206680298, + "learning_rate": 4.219174771944663e-05, + "loss": 2.8454, + "step": 26127 + }, + { + "epoch": 2.367149101945596, + "grad_norm": 1.03644597530365, + "learning_rate": 4.218570651845587e-05, + "loss": 2.7389, + "step": 26128 + }, + { + "epoch": 2.3672397001200425, + "grad_norm": 1.0020641088485718, + "learning_rate": 4.217966531746512e-05, + "loss": 2.5256, + "step": 26129 + }, + { + "epoch": 2.3673302982944895, + "grad_norm": 0.9975964426994324, + "learning_rate": 4.217362411647436e-05, + "loss": 2.5248, + "step": 26130 + }, + { + "epoch": 2.367420896468936, + "grad_norm": 1.056203842163086, + "learning_rate": 4.21675829154836e-05, + "loss": 2.4578, + "step": 26131 + }, + { + "epoch": 2.367511494643383, + "grad_norm": 1.0159159898757935, + "learning_rate": 4.216154171449284e-05, + "loss": 2.7207, + "step": 26132 + }, + { + "epoch": 2.3676020928178296, + "grad_norm": 0.9711387753486633, + "learning_rate": 4.215550051350209e-05, + "loss": 2.407, + "step": 26133 + }, + { + "epoch": 2.3676926909922766, + "grad_norm": 1.0061858892440796, + "learning_rate": 4.214945931251133e-05, + "loss": 2.6879, + "step": 26134 + }, + { + "epoch": 2.367783289166723, + "grad_norm": 1.1046706438064575, + "learning_rate": 4.214341811152057e-05, + "loss": 2.4916, + "step": 26135 + }, + { + "epoch": 2.36787388734117, + "grad_norm": 1.0471216440200806, + "learning_rate": 4.213737691052981e-05, + "loss": 2.7989, + "step": 26136 + }, + { + "epoch": 2.367964485515617, + "grad_norm": 1.020586609840393, + "learning_rate": 4.213133570953906e-05, + "loss": 2.7261, + "step": 26137 + }, + { + "epoch": 2.368055083690064, + "grad_norm": 1.036975622177124, + "learning_rate": 4.21252945085483e-05, + "loss": 2.8364, + "step": 26138 + }, + { + "epoch": 2.3681456818645104, + "grad_norm": 1.0727170705795288, + "learning_rate": 4.211925330755754e-05, + "loss": 2.6273, + "step": 26139 + }, + { + "epoch": 2.3682362800389574, + "grad_norm": 1.094641089439392, + "learning_rate": 4.211321210656678e-05, + "loss": 2.8609, + "step": 26140 + }, + { + "epoch": 2.368326878213404, + "grad_norm": 0.9803380370140076, + "learning_rate": 4.210717090557603e-05, + "loss": 2.3465, + "step": 26141 + }, + { + "epoch": 2.368417476387851, + "grad_norm": 0.966953456401825, + "learning_rate": 4.210112970458528e-05, + "loss": 2.744, + "step": 26142 + }, + { + "epoch": 2.3685080745622975, + "grad_norm": 1.014543890953064, + "learning_rate": 4.209508850359452e-05, + "loss": 2.6139, + "step": 26143 + }, + { + "epoch": 2.3685986727367445, + "grad_norm": 1.1487109661102295, + "learning_rate": 4.2089047302603765e-05, + "loss": 2.5785, + "step": 26144 + }, + { + "epoch": 2.368689270911191, + "grad_norm": 0.8450473546981812, + "learning_rate": 4.2083006101613006e-05, + "loss": 1.7147, + "step": 26145 + }, + { + "epoch": 2.368779869085638, + "grad_norm": 1.001044750213623, + "learning_rate": 4.207696490062225e-05, + "loss": 2.628, + "step": 26146 + }, + { + "epoch": 2.3688704672600847, + "grad_norm": 1.0150355100631714, + "learning_rate": 4.207092369963149e-05, + "loss": 2.6685, + "step": 26147 + }, + { + "epoch": 2.3689610654345317, + "grad_norm": 1.0899547338485718, + "learning_rate": 4.2064882498640736e-05, + "loss": 2.3698, + "step": 26148 + }, + { + "epoch": 2.3690516636089782, + "grad_norm": 0.8113897442817688, + "learning_rate": 4.2058841297649976e-05, + "loss": 2.1863, + "step": 26149 + }, + { + "epoch": 2.3691422617834252, + "grad_norm": 1.1058354377746582, + "learning_rate": 4.205280009665922e-05, + "loss": 2.8082, + "step": 26150 + }, + { + "epoch": 2.369232859957872, + "grad_norm": 0.9790117144584656, + "learning_rate": 4.204675889566846e-05, + "loss": 2.5511, + "step": 26151 + }, + { + "epoch": 2.369323458132319, + "grad_norm": 0.940104067325592, + "learning_rate": 4.2040717694677706e-05, + "loss": 2.6448, + "step": 26152 + }, + { + "epoch": 2.3694140563067654, + "grad_norm": 0.8821229934692383, + "learning_rate": 4.2034676493686947e-05, + "loss": 2.0716, + "step": 26153 + }, + { + "epoch": 2.3695046544812124, + "grad_norm": 1.0384951829910278, + "learning_rate": 4.202863529269619e-05, + "loss": 2.5531, + "step": 26154 + }, + { + "epoch": 2.369595252655659, + "grad_norm": 1.0519845485687256, + "learning_rate": 4.202259409170543e-05, + "loss": 2.4763, + "step": 26155 + }, + { + "epoch": 2.369685850830106, + "grad_norm": 0.9864610433578491, + "learning_rate": 4.2016552890714676e-05, + "loss": 2.6482, + "step": 26156 + }, + { + "epoch": 2.3697764490045525, + "grad_norm": 1.0642170906066895, + "learning_rate": 4.2010511689723917e-05, + "loss": 2.6542, + "step": 26157 + }, + { + "epoch": 2.3698670471789995, + "grad_norm": 1.0222053527832031, + "learning_rate": 4.2004470488733164e-05, + "loss": 2.6159, + "step": 26158 + }, + { + "epoch": 2.369957645353446, + "grad_norm": 0.9779535531997681, + "learning_rate": 4.1998429287742405e-05, + "loss": 2.4161, + "step": 26159 + }, + { + "epoch": 2.3700482435278927, + "grad_norm": 0.9929385781288147, + "learning_rate": 4.199238808675165e-05, + "loss": 2.4155, + "step": 26160 + }, + { + "epoch": 2.3701388417023397, + "grad_norm": 1.029885172843933, + "learning_rate": 4.1986346885760893e-05, + "loss": 2.8033, + "step": 26161 + }, + { + "epoch": 2.3702294398767867, + "grad_norm": 0.964356541633606, + "learning_rate": 4.1980305684770134e-05, + "loss": 2.2719, + "step": 26162 + }, + { + "epoch": 2.3703200380512333, + "grad_norm": 1.145465612411499, + "learning_rate": 4.197426448377938e-05, + "loss": 2.7359, + "step": 26163 + }, + { + "epoch": 2.37041063622568, + "grad_norm": 1.1159336566925049, + "learning_rate": 4.196822328278862e-05, + "loss": 2.5665, + "step": 26164 + }, + { + "epoch": 2.370501234400127, + "grad_norm": 0.946675181388855, + "learning_rate": 4.1962182081797864e-05, + "loss": 2.5427, + "step": 26165 + }, + { + "epoch": 2.370591832574574, + "grad_norm": 0.9926179051399231, + "learning_rate": 4.1956140880807104e-05, + "loss": 2.4217, + "step": 26166 + }, + { + "epoch": 2.3706824307490204, + "grad_norm": 1.032852292060852, + "learning_rate": 4.195009967981635e-05, + "loss": 2.4358, + "step": 26167 + }, + { + "epoch": 2.370773028923467, + "grad_norm": 1.004180908203125, + "learning_rate": 4.194405847882559e-05, + "loss": 2.7208, + "step": 26168 + }, + { + "epoch": 2.370863627097914, + "grad_norm": 0.976290762424469, + "learning_rate": 4.1938017277834834e-05, + "loss": 2.5128, + "step": 26169 + }, + { + "epoch": 2.370954225272361, + "grad_norm": 0.8357664346694946, + "learning_rate": 4.1931976076844074e-05, + "loss": 1.9279, + "step": 26170 + }, + { + "epoch": 2.3710448234468076, + "grad_norm": 0.9508832693099976, + "learning_rate": 4.192593487585332e-05, + "loss": 2.598, + "step": 26171 + }, + { + "epoch": 2.371135421621254, + "grad_norm": 0.9889442324638367, + "learning_rate": 4.191989367486256e-05, + "loss": 2.6026, + "step": 26172 + }, + { + "epoch": 2.371226019795701, + "grad_norm": 1.1087464094161987, + "learning_rate": 4.1913852473871804e-05, + "loss": 2.5765, + "step": 26173 + }, + { + "epoch": 2.371316617970148, + "grad_norm": 0.9974360466003418, + "learning_rate": 4.190781127288105e-05, + "loss": 2.7573, + "step": 26174 + }, + { + "epoch": 2.3714072161445947, + "grad_norm": 0.8695074319839478, + "learning_rate": 4.190177007189029e-05, + "loss": 2.0674, + "step": 26175 + }, + { + "epoch": 2.3714978143190413, + "grad_norm": 0.9753913879394531, + "learning_rate": 4.189572887089954e-05, + "loss": 2.5821, + "step": 26176 + }, + { + "epoch": 2.3715884124934883, + "grad_norm": 1.0206871032714844, + "learning_rate": 4.188968766990878e-05, + "loss": 2.5466, + "step": 26177 + }, + { + "epoch": 2.371679010667935, + "grad_norm": 0.9282582998275757, + "learning_rate": 4.188364646891803e-05, + "loss": 2.7751, + "step": 26178 + }, + { + "epoch": 2.371769608842382, + "grad_norm": 0.9471420049667358, + "learning_rate": 4.187760526792727e-05, + "loss": 2.5581, + "step": 26179 + }, + { + "epoch": 2.3718602070168284, + "grad_norm": 0.9788874983787537, + "learning_rate": 4.187156406693651e-05, + "loss": 2.3916, + "step": 26180 + }, + { + "epoch": 2.3719508051912754, + "grad_norm": 0.8725920915603638, + "learning_rate": 4.186552286594575e-05, + "loss": 2.1008, + "step": 26181 + }, + { + "epoch": 2.372041403365722, + "grad_norm": 1.0283044576644897, + "learning_rate": 4.1859481664955e-05, + "loss": 2.7182, + "step": 26182 + }, + { + "epoch": 2.372132001540169, + "grad_norm": 0.9927959442138672, + "learning_rate": 4.185344046396424e-05, + "loss": 2.6858, + "step": 26183 + }, + { + "epoch": 2.3722225997146156, + "grad_norm": 0.9888747334480286, + "learning_rate": 4.184739926297348e-05, + "loss": 2.6629, + "step": 26184 + }, + { + "epoch": 2.3723131978890626, + "grad_norm": 1.067835807800293, + "learning_rate": 4.184135806198272e-05, + "loss": 2.6196, + "step": 26185 + }, + { + "epoch": 2.372403796063509, + "grad_norm": 1.0360487699508667, + "learning_rate": 4.183531686099197e-05, + "loss": 2.647, + "step": 26186 + }, + { + "epoch": 2.372494394237956, + "grad_norm": 1.0185316801071167, + "learning_rate": 4.182927566000121e-05, + "loss": 2.5555, + "step": 26187 + }, + { + "epoch": 2.3725849924124027, + "grad_norm": 0.94598388671875, + "learning_rate": 4.182323445901045e-05, + "loss": 2.3848, + "step": 26188 + }, + { + "epoch": 2.3726755905868497, + "grad_norm": 1.012601613998413, + "learning_rate": 4.18171932580197e-05, + "loss": 2.5872, + "step": 26189 + }, + { + "epoch": 2.3727661887612963, + "grad_norm": 0.9430896043777466, + "learning_rate": 4.181115205702894e-05, + "loss": 2.5524, + "step": 26190 + }, + { + "epoch": 2.3728567869357433, + "grad_norm": 0.9822878241539001, + "learning_rate": 4.180511085603818e-05, + "loss": 2.813, + "step": 26191 + }, + { + "epoch": 2.37294738511019, + "grad_norm": 0.9920432567596436, + "learning_rate": 4.179906965504743e-05, + "loss": 2.9863, + "step": 26192 + }, + { + "epoch": 2.373037983284637, + "grad_norm": 0.8366100788116455, + "learning_rate": 4.179302845405667e-05, + "loss": 2.1429, + "step": 26193 + }, + { + "epoch": 2.3731285814590835, + "grad_norm": 1.0589748620986938, + "learning_rate": 4.1786987253065915e-05, + "loss": 2.6302, + "step": 26194 + }, + { + "epoch": 2.3732191796335305, + "grad_norm": 1.0234228372573853, + "learning_rate": 4.1780946052075156e-05, + "loss": 2.8373, + "step": 26195 + }, + { + "epoch": 2.373309777807977, + "grad_norm": 0.9420713782310486, + "learning_rate": 4.17749048510844e-05, + "loss": 2.7157, + "step": 26196 + }, + { + "epoch": 2.373400375982424, + "grad_norm": 0.9899038076400757, + "learning_rate": 4.1768863650093645e-05, + "loss": 2.5069, + "step": 26197 + }, + { + "epoch": 2.3734909741568706, + "grad_norm": 1.0725170373916626, + "learning_rate": 4.1762822449102885e-05, + "loss": 2.8513, + "step": 26198 + }, + { + "epoch": 2.3735815723313176, + "grad_norm": 0.9730929732322693, + "learning_rate": 4.1756781248112126e-05, + "loss": 2.5794, + "step": 26199 + }, + { + "epoch": 2.373672170505764, + "grad_norm": 1.1106386184692383, + "learning_rate": 4.175074004712137e-05, + "loss": 2.5299, + "step": 26200 + }, + { + "epoch": 2.373762768680211, + "grad_norm": 0.9702664017677307, + "learning_rate": 4.1744698846130615e-05, + "loss": 2.7572, + "step": 26201 + }, + { + "epoch": 2.3738533668546578, + "grad_norm": 1.020843744277954, + "learning_rate": 4.1738657645139856e-05, + "loss": 2.9608, + "step": 26202 + }, + { + "epoch": 2.3739439650291048, + "grad_norm": 0.9661363363265991, + "learning_rate": 4.1732616444149096e-05, + "loss": 2.6085, + "step": 26203 + }, + { + "epoch": 2.3740345632035513, + "grad_norm": 1.0583592653274536, + "learning_rate": 4.1726575243158344e-05, + "loss": 2.4318, + "step": 26204 + }, + { + "epoch": 2.3741251613779983, + "grad_norm": 0.9880945682525635, + "learning_rate": 4.1720534042167585e-05, + "loss": 2.7115, + "step": 26205 + }, + { + "epoch": 2.374215759552445, + "grad_norm": 0.9690455794334412, + "learning_rate": 4.1714492841176826e-05, + "loss": 2.6078, + "step": 26206 + }, + { + "epoch": 2.374306357726892, + "grad_norm": 1.0237674713134766, + "learning_rate": 4.1708451640186066e-05, + "loss": 2.3446, + "step": 26207 + }, + { + "epoch": 2.3743969559013385, + "grad_norm": 0.9561778903007507, + "learning_rate": 4.1702410439195314e-05, + "loss": 2.4904, + "step": 26208 + }, + { + "epoch": 2.3744875540757855, + "grad_norm": 1.0362908840179443, + "learning_rate": 4.1696369238204555e-05, + "loss": 2.8779, + "step": 26209 + }, + { + "epoch": 2.374578152250232, + "grad_norm": 1.0049774646759033, + "learning_rate": 4.16903280372138e-05, + "loss": 2.3681, + "step": 26210 + }, + { + "epoch": 2.374668750424679, + "grad_norm": 0.9291828870773315, + "learning_rate": 4.168428683622304e-05, + "loss": 2.5991, + "step": 26211 + }, + { + "epoch": 2.3747593485991256, + "grad_norm": 0.900080680847168, + "learning_rate": 4.167824563523229e-05, + "loss": 1.9247, + "step": 26212 + }, + { + "epoch": 2.3748499467735726, + "grad_norm": 0.9888082146644592, + "learning_rate": 4.167220443424153e-05, + "loss": 2.6516, + "step": 26213 + }, + { + "epoch": 2.374940544948019, + "grad_norm": 1.0830739736557007, + "learning_rate": 4.166616323325077e-05, + "loss": 2.7079, + "step": 26214 + }, + { + "epoch": 2.375031143122466, + "grad_norm": 0.8838075995445251, + "learning_rate": 4.1660122032260013e-05, + "loss": 2.0771, + "step": 26215 + }, + { + "epoch": 2.375121741296913, + "grad_norm": 0.9301424026489258, + "learning_rate": 4.165408083126926e-05, + "loss": 2.6482, + "step": 26216 + }, + { + "epoch": 2.37521233947136, + "grad_norm": 1.038427710533142, + "learning_rate": 4.16480396302785e-05, + "loss": 2.6673, + "step": 26217 + }, + { + "epoch": 2.3753029376458064, + "grad_norm": 1.0009102821350098, + "learning_rate": 4.164199842928774e-05, + "loss": 2.4866, + "step": 26218 + }, + { + "epoch": 2.3753935358202534, + "grad_norm": 0.8869525790214539, + "learning_rate": 4.163595722829699e-05, + "loss": 2.0428, + "step": 26219 + }, + { + "epoch": 2.3754841339947, + "grad_norm": 1.2686376571655273, + "learning_rate": 4.162991602730623e-05, + "loss": 2.7123, + "step": 26220 + }, + { + "epoch": 2.375574732169147, + "grad_norm": 0.9911590218544006, + "learning_rate": 4.162387482631547e-05, + "loss": 2.6921, + "step": 26221 + }, + { + "epoch": 2.3756653303435935, + "grad_norm": 1.0597003698349, + "learning_rate": 4.161783362532471e-05, + "loss": 2.9805, + "step": 26222 + }, + { + "epoch": 2.3757559285180405, + "grad_norm": 1.0022016763687134, + "learning_rate": 4.161179242433396e-05, + "loss": 2.5802, + "step": 26223 + }, + { + "epoch": 2.375846526692487, + "grad_norm": 1.0301984548568726, + "learning_rate": 4.16057512233432e-05, + "loss": 3.0095, + "step": 26224 + }, + { + "epoch": 2.375937124866934, + "grad_norm": 1.023799180984497, + "learning_rate": 4.159971002235244e-05, + "loss": 2.2602, + "step": 26225 + }, + { + "epoch": 2.3760277230413807, + "grad_norm": 1.0262608528137207, + "learning_rate": 4.159366882136169e-05, + "loss": 2.8268, + "step": 26226 + }, + { + "epoch": 2.3761183212158277, + "grad_norm": 1.0238537788391113, + "learning_rate": 4.158762762037093e-05, + "loss": 2.5808, + "step": 26227 + }, + { + "epoch": 2.3762089193902742, + "grad_norm": 0.9513641595840454, + "learning_rate": 4.158158641938018e-05, + "loss": 2.2112, + "step": 26228 + }, + { + "epoch": 2.3762995175647212, + "grad_norm": 0.9927204847335815, + "learning_rate": 4.157554521838942e-05, + "loss": 2.6937, + "step": 26229 + }, + { + "epoch": 2.376390115739168, + "grad_norm": 0.8402673006057739, + "learning_rate": 4.156950401739866e-05, + "loss": 1.7824, + "step": 26230 + }, + { + "epoch": 2.376480713913615, + "grad_norm": 0.9552843570709229, + "learning_rate": 4.156346281640791e-05, + "loss": 2.6613, + "step": 26231 + }, + { + "epoch": 2.3765713120880614, + "grad_norm": 0.9585720300674438, + "learning_rate": 4.155742161541715e-05, + "loss": 2.6104, + "step": 26232 + }, + { + "epoch": 2.3766619102625084, + "grad_norm": 0.8747907280921936, + "learning_rate": 4.155138041442639e-05, + "loss": 1.9474, + "step": 26233 + }, + { + "epoch": 2.376752508436955, + "grad_norm": 0.9556230306625366, + "learning_rate": 4.154533921343564e-05, + "loss": 2.5562, + "step": 26234 + }, + { + "epoch": 2.376843106611402, + "grad_norm": 1.0192015171051025, + "learning_rate": 4.153929801244488e-05, + "loss": 2.691, + "step": 26235 + }, + { + "epoch": 2.3769337047858485, + "grad_norm": 1.0557326078414917, + "learning_rate": 4.153325681145412e-05, + "loss": 2.6193, + "step": 26236 + }, + { + "epoch": 2.3770243029602955, + "grad_norm": 0.9871798157691956, + "learning_rate": 4.152721561046336e-05, + "loss": 2.6727, + "step": 26237 + }, + { + "epoch": 2.377114901134742, + "grad_norm": 1.0133050680160522, + "learning_rate": 4.152117440947261e-05, + "loss": 2.5195, + "step": 26238 + }, + { + "epoch": 2.3772054993091887, + "grad_norm": 1.044339656829834, + "learning_rate": 4.151513320848185e-05, + "loss": 2.8343, + "step": 26239 + }, + { + "epoch": 2.3772960974836357, + "grad_norm": 1.0050146579742432, + "learning_rate": 4.150909200749109e-05, + "loss": 2.6592, + "step": 26240 + }, + { + "epoch": 2.3773866956580827, + "grad_norm": 1.0207865238189697, + "learning_rate": 4.150305080650033e-05, + "loss": 2.6157, + "step": 26241 + }, + { + "epoch": 2.3774772938325293, + "grad_norm": 0.9799445271492004, + "learning_rate": 4.149700960550958e-05, + "loss": 2.6427, + "step": 26242 + }, + { + "epoch": 2.377567892006976, + "grad_norm": 1.0773286819458008, + "learning_rate": 4.149096840451882e-05, + "loss": 2.6797, + "step": 26243 + }, + { + "epoch": 2.377658490181423, + "grad_norm": 1.045074701309204, + "learning_rate": 4.1484927203528065e-05, + "loss": 2.7437, + "step": 26244 + }, + { + "epoch": 2.37774908835587, + "grad_norm": 0.9878819584846497, + "learning_rate": 4.1478886002537306e-05, + "loss": 2.7204, + "step": 26245 + }, + { + "epoch": 2.3778396865303164, + "grad_norm": 1.0187760591506958, + "learning_rate": 4.1472844801546554e-05, + "loss": 2.6554, + "step": 26246 + }, + { + "epoch": 2.377930284704763, + "grad_norm": 0.9147017598152161, + "learning_rate": 4.1466803600555795e-05, + "loss": 2.0395, + "step": 26247 + }, + { + "epoch": 2.37802088287921, + "grad_norm": 1.0012753009796143, + "learning_rate": 4.1460762399565035e-05, + "loss": 2.6513, + "step": 26248 + }, + { + "epoch": 2.378111481053657, + "grad_norm": 0.9897130727767944, + "learning_rate": 4.145472119857428e-05, + "loss": 2.7052, + "step": 26249 + }, + { + "epoch": 2.3782020792281036, + "grad_norm": 0.9693812727928162, + "learning_rate": 4.1448679997583524e-05, + "loss": 2.5872, + "step": 26250 + }, + { + "epoch": 2.37829267740255, + "grad_norm": 1.06975257396698, + "learning_rate": 4.1442638796592765e-05, + "loss": 2.5902, + "step": 26251 + }, + { + "epoch": 2.378383275576997, + "grad_norm": 1.0429229736328125, + "learning_rate": 4.1436597595602005e-05, + "loss": 2.4774, + "step": 26252 + }, + { + "epoch": 2.378473873751444, + "grad_norm": 0.9940812587738037, + "learning_rate": 4.143055639461125e-05, + "loss": 2.4474, + "step": 26253 + }, + { + "epoch": 2.3785644719258907, + "grad_norm": 0.9898779392242432, + "learning_rate": 4.1424515193620494e-05, + "loss": 2.5906, + "step": 26254 + }, + { + "epoch": 2.3786550701003373, + "grad_norm": 0.9936796426773071, + "learning_rate": 4.1418473992629735e-05, + "loss": 2.7753, + "step": 26255 + }, + { + "epoch": 2.3787456682747843, + "grad_norm": 1.0769323110580444, + "learning_rate": 4.1412432791638976e-05, + "loss": 2.6056, + "step": 26256 + }, + { + "epoch": 2.3788362664492313, + "grad_norm": 0.9706584215164185, + "learning_rate": 4.140639159064822e-05, + "loss": 1.8786, + "step": 26257 + }, + { + "epoch": 2.378926864623678, + "grad_norm": 1.0569607019424438, + "learning_rate": 4.1400350389657464e-05, + "loss": 2.6365, + "step": 26258 + }, + { + "epoch": 2.3790174627981244, + "grad_norm": 1.016508936882019, + "learning_rate": 4.1394309188666705e-05, + "loss": 2.7812, + "step": 26259 + }, + { + "epoch": 2.3791080609725714, + "grad_norm": 0.8849676847457886, + "learning_rate": 4.138826798767595e-05, + "loss": 2.2122, + "step": 26260 + }, + { + "epoch": 2.379198659147018, + "grad_norm": 0.9691453576087952, + "learning_rate": 4.138222678668519e-05, + "loss": 2.5404, + "step": 26261 + }, + { + "epoch": 2.379289257321465, + "grad_norm": 1.0231882333755493, + "learning_rate": 4.137618558569444e-05, + "loss": 2.6524, + "step": 26262 + }, + { + "epoch": 2.3793798554959116, + "grad_norm": 0.74179607629776, + "learning_rate": 4.137014438470368e-05, + "loss": 1.5133, + "step": 26263 + }, + { + "epoch": 2.3794704536703586, + "grad_norm": 0.9649655222892761, + "learning_rate": 4.136410318371293e-05, + "loss": 2.7752, + "step": 26264 + }, + { + "epoch": 2.379561051844805, + "grad_norm": 0.9626860618591309, + "learning_rate": 4.135806198272217e-05, + "loss": 2.5156, + "step": 26265 + }, + { + "epoch": 2.379651650019252, + "grad_norm": 0.9569485783576965, + "learning_rate": 4.135202078173141e-05, + "loss": 2.3839, + "step": 26266 + }, + { + "epoch": 2.3797422481936987, + "grad_norm": 1.0809054374694824, + "learning_rate": 4.134597958074065e-05, + "loss": 2.7173, + "step": 26267 + }, + { + "epoch": 2.3798328463681457, + "grad_norm": 1.0359928607940674, + "learning_rate": 4.13399383797499e-05, + "loss": 2.6754, + "step": 26268 + }, + { + "epoch": 2.3799234445425923, + "grad_norm": 0.843254804611206, + "learning_rate": 4.133389717875914e-05, + "loss": 2.0477, + "step": 26269 + }, + { + "epoch": 2.3800140427170393, + "grad_norm": 0.9724708795547485, + "learning_rate": 4.132785597776838e-05, + "loss": 2.7807, + "step": 26270 + }, + { + "epoch": 2.380104640891486, + "grad_norm": 0.9860073328018188, + "learning_rate": 4.132181477677762e-05, + "loss": 2.6417, + "step": 26271 + }, + { + "epoch": 2.380195239065933, + "grad_norm": 1.026530146598816, + "learning_rate": 4.131577357578687e-05, + "loss": 2.9087, + "step": 26272 + }, + { + "epoch": 2.3802858372403795, + "grad_norm": 0.9961255192756653, + "learning_rate": 4.130973237479611e-05, + "loss": 2.4878, + "step": 26273 + }, + { + "epoch": 2.3803764354148265, + "grad_norm": 1.0048675537109375, + "learning_rate": 4.130369117380535e-05, + "loss": 2.5115, + "step": 26274 + }, + { + "epoch": 2.380467033589273, + "grad_norm": 0.9588611125946045, + "learning_rate": 4.129764997281459e-05, + "loss": 2.4697, + "step": 26275 + }, + { + "epoch": 2.38055763176372, + "grad_norm": 0.9529452323913574, + "learning_rate": 4.129160877182384e-05, + "loss": 2.5217, + "step": 26276 + }, + { + "epoch": 2.3806482299381666, + "grad_norm": 1.0417300462722778, + "learning_rate": 4.128556757083308e-05, + "loss": 2.7182, + "step": 26277 + }, + { + "epoch": 2.3807388281126136, + "grad_norm": 0.9356862902641296, + "learning_rate": 4.127952636984233e-05, + "loss": 1.876, + "step": 26278 + }, + { + "epoch": 2.38082942628706, + "grad_norm": 0.996981680393219, + "learning_rate": 4.127348516885157e-05, + "loss": 2.4113, + "step": 26279 + }, + { + "epoch": 2.380920024461507, + "grad_norm": 1.0734738111495972, + "learning_rate": 4.1267443967860816e-05, + "loss": 2.8016, + "step": 26280 + }, + { + "epoch": 2.3810106226359538, + "grad_norm": 0.9836707711219788, + "learning_rate": 4.126140276687006e-05, + "loss": 2.6899, + "step": 26281 + }, + { + "epoch": 2.3811012208104008, + "grad_norm": 1.0051170587539673, + "learning_rate": 4.12553615658793e-05, + "loss": 2.5362, + "step": 26282 + }, + { + "epoch": 2.3811918189848473, + "grad_norm": 1.069390892982483, + "learning_rate": 4.1249320364888546e-05, + "loss": 2.803, + "step": 26283 + }, + { + "epoch": 2.3812824171592943, + "grad_norm": 1.0515453815460205, + "learning_rate": 4.1243279163897787e-05, + "loss": 2.9184, + "step": 26284 + }, + { + "epoch": 2.381373015333741, + "grad_norm": 1.012952446937561, + "learning_rate": 4.123723796290703e-05, + "loss": 2.5221, + "step": 26285 + }, + { + "epoch": 2.381463613508188, + "grad_norm": 1.0062543153762817, + "learning_rate": 4.123119676191627e-05, + "loss": 2.5695, + "step": 26286 + }, + { + "epoch": 2.3815542116826345, + "grad_norm": 1.0673315525054932, + "learning_rate": 4.1225155560925516e-05, + "loss": 1.8948, + "step": 26287 + }, + { + "epoch": 2.3816448098570815, + "grad_norm": 1.040205955505371, + "learning_rate": 4.121911435993476e-05, + "loss": 2.5863, + "step": 26288 + }, + { + "epoch": 2.381735408031528, + "grad_norm": 1.003312587738037, + "learning_rate": 4.1213073158944e-05, + "loss": 2.4535, + "step": 26289 + }, + { + "epoch": 2.381826006205975, + "grad_norm": 1.0076271295547485, + "learning_rate": 4.120703195795324e-05, + "loss": 2.633, + "step": 26290 + }, + { + "epoch": 2.3819166043804216, + "grad_norm": 1.0396651029586792, + "learning_rate": 4.1200990756962486e-05, + "loss": 2.7197, + "step": 26291 + }, + { + "epoch": 2.3820072025548686, + "grad_norm": 0.9850336313247681, + "learning_rate": 4.119494955597173e-05, + "loss": 2.3098, + "step": 26292 + }, + { + "epoch": 2.382097800729315, + "grad_norm": 0.9839861392974854, + "learning_rate": 4.118890835498097e-05, + "loss": 2.5652, + "step": 26293 + }, + { + "epoch": 2.382188398903762, + "grad_norm": 0.9911019802093506, + "learning_rate": 4.1182867153990215e-05, + "loss": 2.6971, + "step": 26294 + }, + { + "epoch": 2.382278997078209, + "grad_norm": 1.0195419788360596, + "learning_rate": 4.1176825952999456e-05, + "loss": 2.4799, + "step": 26295 + }, + { + "epoch": 2.382369595252656, + "grad_norm": 1.0191882848739624, + "learning_rate": 4.1170784752008704e-05, + "loss": 2.849, + "step": 26296 + }, + { + "epoch": 2.3824601934271024, + "grad_norm": 0.8505864143371582, + "learning_rate": 4.1164743551017944e-05, + "loss": 2.0608, + "step": 26297 + }, + { + "epoch": 2.3825507916015494, + "grad_norm": 0.9713411331176758, + "learning_rate": 4.115870235002719e-05, + "loss": 2.5562, + "step": 26298 + }, + { + "epoch": 2.382641389775996, + "grad_norm": 0.852245032787323, + "learning_rate": 4.115266114903643e-05, + "loss": 1.9042, + "step": 26299 + }, + { + "epoch": 2.382731987950443, + "grad_norm": 0.8010985851287842, + "learning_rate": 4.1146619948045674e-05, + "loss": 1.9015, + "step": 26300 + }, + { + "epoch": 2.3828225861248895, + "grad_norm": 1.0603961944580078, + "learning_rate": 4.1140578747054915e-05, + "loss": 2.667, + "step": 26301 + }, + { + "epoch": 2.3829131842993365, + "grad_norm": 0.9751574397087097, + "learning_rate": 4.113453754606416e-05, + "loss": 2.7948, + "step": 26302 + }, + { + "epoch": 2.383003782473783, + "grad_norm": 0.948072075843811, + "learning_rate": 4.11284963450734e-05, + "loss": 2.3731, + "step": 26303 + }, + { + "epoch": 2.38309438064823, + "grad_norm": 1.0378087759017944, + "learning_rate": 4.1122455144082644e-05, + "loss": 2.9204, + "step": 26304 + }, + { + "epoch": 2.3831849788226767, + "grad_norm": 0.9207001328468323, + "learning_rate": 4.1116413943091885e-05, + "loss": 1.9621, + "step": 26305 + }, + { + "epoch": 2.3832755769971237, + "grad_norm": 0.91225266456604, + "learning_rate": 4.111037274210113e-05, + "loss": 2.3685, + "step": 26306 + }, + { + "epoch": 2.3833661751715702, + "grad_norm": 1.0897574424743652, + "learning_rate": 4.110433154111037e-05, + "loss": 2.5517, + "step": 26307 + }, + { + "epoch": 2.3834567733460172, + "grad_norm": 0.995118260383606, + "learning_rate": 4.1098290340119614e-05, + "loss": 2.6543, + "step": 26308 + }, + { + "epoch": 2.383547371520464, + "grad_norm": 0.9903801679611206, + "learning_rate": 4.109224913912886e-05, + "loss": 2.7374, + "step": 26309 + }, + { + "epoch": 2.383637969694911, + "grad_norm": 0.970836341381073, + "learning_rate": 4.10862079381381e-05, + "loss": 2.6012, + "step": 26310 + }, + { + "epoch": 2.3837285678693574, + "grad_norm": 1.1638010740280151, + "learning_rate": 4.108016673714734e-05, + "loss": 2.4558, + "step": 26311 + }, + { + "epoch": 2.3838191660438044, + "grad_norm": 0.8592080473899841, + "learning_rate": 4.107412553615659e-05, + "loss": 2.0598, + "step": 26312 + }, + { + "epoch": 2.383909764218251, + "grad_norm": 1.0019890069961548, + "learning_rate": 4.106808433516584e-05, + "loss": 2.6275, + "step": 26313 + }, + { + "epoch": 2.384000362392698, + "grad_norm": 1.0273417234420776, + "learning_rate": 4.106204313417508e-05, + "loss": 2.9125, + "step": 26314 + }, + { + "epoch": 2.3840909605671445, + "grad_norm": 1.0707138776779175, + "learning_rate": 4.105600193318432e-05, + "loss": 2.4056, + "step": 26315 + }, + { + "epoch": 2.3841815587415915, + "grad_norm": 1.0149414539337158, + "learning_rate": 4.104996073219356e-05, + "loss": 2.8721, + "step": 26316 + }, + { + "epoch": 2.384272156916038, + "grad_norm": 1.0149188041687012, + "learning_rate": 4.104391953120281e-05, + "loss": 2.7632, + "step": 26317 + }, + { + "epoch": 2.384362755090485, + "grad_norm": 0.8575348258018494, + "learning_rate": 4.103787833021205e-05, + "loss": 2.3471, + "step": 26318 + }, + { + "epoch": 2.3844533532649317, + "grad_norm": 0.9934307932853699, + "learning_rate": 4.103183712922129e-05, + "loss": 2.8301, + "step": 26319 + }, + { + "epoch": 2.3845439514393787, + "grad_norm": 0.9745296239852905, + "learning_rate": 4.102579592823053e-05, + "loss": 2.3487, + "step": 26320 + }, + { + "epoch": 2.3846345496138253, + "grad_norm": 1.042808175086975, + "learning_rate": 4.101975472723978e-05, + "loss": 2.5451, + "step": 26321 + }, + { + "epoch": 2.384725147788272, + "grad_norm": 1.1943550109863281, + "learning_rate": 4.101371352624902e-05, + "loss": 2.4501, + "step": 26322 + }, + { + "epoch": 2.384815745962719, + "grad_norm": 1.0627434253692627, + "learning_rate": 4.100767232525826e-05, + "loss": 2.5279, + "step": 26323 + }, + { + "epoch": 2.384906344137166, + "grad_norm": 1.0402007102966309, + "learning_rate": 4.100163112426751e-05, + "loss": 2.7418, + "step": 26324 + }, + { + "epoch": 2.3849969423116124, + "grad_norm": 1.2376559972763062, + "learning_rate": 4.099558992327675e-05, + "loss": 2.533, + "step": 26325 + }, + { + "epoch": 2.385087540486059, + "grad_norm": 1.1809450387954712, + "learning_rate": 4.098954872228599e-05, + "loss": 2.5211, + "step": 26326 + }, + { + "epoch": 2.385178138660506, + "grad_norm": 0.9386411309242249, + "learning_rate": 4.098350752129523e-05, + "loss": 2.4613, + "step": 26327 + }, + { + "epoch": 2.385268736834953, + "grad_norm": 1.050384521484375, + "learning_rate": 4.097746632030448e-05, + "loss": 1.9582, + "step": 26328 + }, + { + "epoch": 2.3853593350093996, + "grad_norm": 0.9701513051986694, + "learning_rate": 4.0971425119313726e-05, + "loss": 2.5542, + "step": 26329 + }, + { + "epoch": 2.385449933183846, + "grad_norm": 0.9649271368980408, + "learning_rate": 4.0965383918322966e-05, + "loss": 2.5461, + "step": 26330 + }, + { + "epoch": 2.385540531358293, + "grad_norm": 0.9462831616401672, + "learning_rate": 4.095934271733221e-05, + "loss": 2.5965, + "step": 26331 + }, + { + "epoch": 2.38563112953274, + "grad_norm": 1.1443145275115967, + "learning_rate": 4.0953301516341455e-05, + "loss": 2.5475, + "step": 26332 + }, + { + "epoch": 2.3857217277071867, + "grad_norm": 0.9976209402084351, + "learning_rate": 4.0947260315350696e-05, + "loss": 2.6609, + "step": 26333 + }, + { + "epoch": 2.3858123258816333, + "grad_norm": 1.0117319822311401, + "learning_rate": 4.0941219114359936e-05, + "loss": 3.0861, + "step": 26334 + }, + { + "epoch": 2.3859029240560803, + "grad_norm": 0.9263996481895447, + "learning_rate": 4.093517791336918e-05, + "loss": 2.5479, + "step": 26335 + }, + { + "epoch": 2.3859935222305273, + "grad_norm": 1.0968633890151978, + "learning_rate": 4.0929136712378425e-05, + "loss": 2.6725, + "step": 26336 + }, + { + "epoch": 2.386084120404974, + "grad_norm": 0.8885363340377808, + "learning_rate": 4.0923095511387666e-05, + "loss": 1.9965, + "step": 26337 + }, + { + "epoch": 2.3861747185794204, + "grad_norm": 1.015403389930725, + "learning_rate": 4.0917054310396907e-05, + "loss": 2.6321, + "step": 26338 + }, + { + "epoch": 2.3862653167538674, + "grad_norm": 0.9620737433433533, + "learning_rate": 4.0911013109406154e-05, + "loss": 2.6449, + "step": 26339 + }, + { + "epoch": 2.386355914928314, + "grad_norm": 0.849327027797699, + "learning_rate": 4.0904971908415395e-05, + "loss": 1.9798, + "step": 26340 + }, + { + "epoch": 2.386446513102761, + "grad_norm": 1.015418291091919, + "learning_rate": 4.0898930707424636e-05, + "loss": 2.5856, + "step": 26341 + }, + { + "epoch": 2.3865371112772076, + "grad_norm": 1.0062475204467773, + "learning_rate": 4.089288950643388e-05, + "loss": 2.6191, + "step": 26342 + }, + { + "epoch": 2.3866277094516546, + "grad_norm": 0.8809382915496826, + "learning_rate": 4.0886848305443124e-05, + "loss": 1.907, + "step": 26343 + }, + { + "epoch": 2.386718307626101, + "grad_norm": 0.8676456212997437, + "learning_rate": 4.0880807104452365e-05, + "loss": 2.0034, + "step": 26344 + }, + { + "epoch": 2.386808905800548, + "grad_norm": 1.0107204914093018, + "learning_rate": 4.0874765903461606e-05, + "loss": 2.7345, + "step": 26345 + }, + { + "epoch": 2.3868995039749947, + "grad_norm": 1.0924848318099976, + "learning_rate": 4.0868724702470854e-05, + "loss": 2.5943, + "step": 26346 + }, + { + "epoch": 2.3869901021494417, + "grad_norm": 0.9802119731903076, + "learning_rate": 4.08626835014801e-05, + "loss": 2.5314, + "step": 26347 + }, + { + "epoch": 2.3870807003238883, + "grad_norm": 0.6953401565551758, + "learning_rate": 4.085664230048934e-05, + "loss": 1.4638, + "step": 26348 + }, + { + "epoch": 2.3871712984983353, + "grad_norm": 1.1039314270019531, + "learning_rate": 4.085060109949858e-05, + "loss": 2.7192, + "step": 26349 + }, + { + "epoch": 2.387261896672782, + "grad_norm": 0.9944344162940979, + "learning_rate": 4.0844559898507824e-05, + "loss": 2.3489, + "step": 26350 + }, + { + "epoch": 2.387352494847229, + "grad_norm": 1.0771582126617432, + "learning_rate": 4.083851869751707e-05, + "loss": 2.5577, + "step": 26351 + }, + { + "epoch": 2.3874430930216755, + "grad_norm": 0.9723890423774719, + "learning_rate": 4.083247749652631e-05, + "loss": 2.4935, + "step": 26352 + }, + { + "epoch": 2.3875336911961225, + "grad_norm": 1.033808946609497, + "learning_rate": 4.082643629553555e-05, + "loss": 2.867, + "step": 26353 + }, + { + "epoch": 2.387624289370569, + "grad_norm": 1.0748049020767212, + "learning_rate": 4.08203950945448e-05, + "loss": 2.6511, + "step": 26354 + }, + { + "epoch": 2.387714887545016, + "grad_norm": 0.9518089890480042, + "learning_rate": 4.081435389355404e-05, + "loss": 2.4784, + "step": 26355 + }, + { + "epoch": 2.3878054857194626, + "grad_norm": 0.9396284818649292, + "learning_rate": 4.080831269256328e-05, + "loss": 2.3308, + "step": 26356 + }, + { + "epoch": 2.3878960838939096, + "grad_norm": 1.0171717405319214, + "learning_rate": 4.080227149157252e-05, + "loss": 2.6706, + "step": 26357 + }, + { + "epoch": 2.387986682068356, + "grad_norm": 0.9689787030220032, + "learning_rate": 4.079623029058177e-05, + "loss": 2.7141, + "step": 26358 + }, + { + "epoch": 2.388077280242803, + "grad_norm": 0.9470161199569702, + "learning_rate": 4.079018908959101e-05, + "loss": 2.5849, + "step": 26359 + }, + { + "epoch": 2.3881678784172498, + "grad_norm": 1.1982556581497192, + "learning_rate": 4.078414788860025e-05, + "loss": 2.3985, + "step": 26360 + }, + { + "epoch": 2.3882584765916968, + "grad_norm": 1.0492254495620728, + "learning_rate": 4.077810668760949e-05, + "loss": 2.469, + "step": 26361 + }, + { + "epoch": 2.3883490747661433, + "grad_norm": 1.0429104566574097, + "learning_rate": 4.077206548661874e-05, + "loss": 2.6063, + "step": 26362 + }, + { + "epoch": 2.3884396729405903, + "grad_norm": 1.0524407625198364, + "learning_rate": 4.076602428562799e-05, + "loss": 2.5803, + "step": 26363 + }, + { + "epoch": 2.388530271115037, + "grad_norm": 1.1492230892181396, + "learning_rate": 4.075998308463723e-05, + "loss": 2.6027, + "step": 26364 + }, + { + "epoch": 2.388620869289484, + "grad_norm": 0.9809294939041138, + "learning_rate": 4.075394188364647e-05, + "loss": 2.4272, + "step": 26365 + }, + { + "epoch": 2.3887114674639305, + "grad_norm": 1.049279808998108, + "learning_rate": 4.074790068265572e-05, + "loss": 2.6942, + "step": 26366 + }, + { + "epoch": 2.3888020656383775, + "grad_norm": 1.056600570678711, + "learning_rate": 4.074185948166496e-05, + "loss": 2.6421, + "step": 26367 + }, + { + "epoch": 2.388892663812824, + "grad_norm": 0.9696863889694214, + "learning_rate": 4.07358182806742e-05, + "loss": 2.5208, + "step": 26368 + }, + { + "epoch": 2.388983261987271, + "grad_norm": 1.0300792455673218, + "learning_rate": 4.072977707968345e-05, + "loss": 2.7525, + "step": 26369 + }, + { + "epoch": 2.3890738601617176, + "grad_norm": 1.0442801713943481, + "learning_rate": 4.072373587869269e-05, + "loss": 2.7189, + "step": 26370 + }, + { + "epoch": 2.3891644583361646, + "grad_norm": 0.9822102785110474, + "learning_rate": 4.071769467770193e-05, + "loss": 2.6073, + "step": 26371 + }, + { + "epoch": 2.389255056510611, + "grad_norm": 0.98946613073349, + "learning_rate": 4.071165347671117e-05, + "loss": 2.6205, + "step": 26372 + }, + { + "epoch": 2.389345654685058, + "grad_norm": 0.9559176564216614, + "learning_rate": 4.070561227572042e-05, + "loss": 2.5774, + "step": 26373 + }, + { + "epoch": 2.389436252859505, + "grad_norm": 0.9547986388206482, + "learning_rate": 4.069957107472966e-05, + "loss": 2.0961, + "step": 26374 + }, + { + "epoch": 2.389526851033952, + "grad_norm": 1.034119725227356, + "learning_rate": 4.06935298737389e-05, + "loss": 2.6935, + "step": 26375 + }, + { + "epoch": 2.3896174492083984, + "grad_norm": 0.8757471442222595, + "learning_rate": 4.068748867274814e-05, + "loss": 1.8675, + "step": 26376 + }, + { + "epoch": 2.3897080473828454, + "grad_norm": 0.9026951789855957, + "learning_rate": 4.068144747175739e-05, + "loss": 1.9003, + "step": 26377 + }, + { + "epoch": 2.389798645557292, + "grad_norm": 0.9888184070587158, + "learning_rate": 4.067540627076663e-05, + "loss": 2.5483, + "step": 26378 + }, + { + "epoch": 2.389889243731739, + "grad_norm": 1.0453436374664307, + "learning_rate": 4.0669365069775875e-05, + "loss": 2.5067, + "step": 26379 + }, + { + "epoch": 2.3899798419061855, + "grad_norm": 0.9942750930786133, + "learning_rate": 4.0663323868785116e-05, + "loss": 2.9103, + "step": 26380 + }, + { + "epoch": 2.3900704400806325, + "grad_norm": 1.021937370300293, + "learning_rate": 4.0657282667794364e-05, + "loss": 2.4494, + "step": 26381 + }, + { + "epoch": 2.390161038255079, + "grad_norm": 0.9474743604660034, + "learning_rate": 4.0651241466803605e-05, + "loss": 2.2673, + "step": 26382 + }, + { + "epoch": 2.390251636429526, + "grad_norm": 0.9730942249298096, + "learning_rate": 4.0645200265812846e-05, + "loss": 2.7386, + "step": 26383 + }, + { + "epoch": 2.3903422346039727, + "grad_norm": 0.9942505359649658, + "learning_rate": 4.063915906482209e-05, + "loss": 2.4558, + "step": 26384 + }, + { + "epoch": 2.3904328327784197, + "grad_norm": 1.062713384628296, + "learning_rate": 4.0633117863831334e-05, + "loss": 2.2424, + "step": 26385 + }, + { + "epoch": 2.3905234309528662, + "grad_norm": 1.0643784999847412, + "learning_rate": 4.0627076662840575e-05, + "loss": 2.5733, + "step": 26386 + }, + { + "epoch": 2.3906140291273132, + "grad_norm": 1.0501656532287598, + "learning_rate": 4.0621035461849816e-05, + "loss": 2.7122, + "step": 26387 + }, + { + "epoch": 2.39070462730176, + "grad_norm": 0.9925141930580139, + "learning_rate": 4.061499426085906e-05, + "loss": 2.5346, + "step": 26388 + }, + { + "epoch": 2.390795225476207, + "grad_norm": 0.985958456993103, + "learning_rate": 4.0608953059868304e-05, + "loss": 2.7054, + "step": 26389 + }, + { + "epoch": 2.3908858236506534, + "grad_norm": 0.9817752838134766, + "learning_rate": 4.0602911858877545e-05, + "loss": 2.797, + "step": 26390 + }, + { + "epoch": 2.3909764218251004, + "grad_norm": 0.9202873110771179, + "learning_rate": 4.0596870657886786e-05, + "loss": 2.1582, + "step": 26391 + }, + { + "epoch": 2.391067019999547, + "grad_norm": 1.0682132244110107, + "learning_rate": 4.059082945689603e-05, + "loss": 2.7429, + "step": 26392 + }, + { + "epoch": 2.391157618173994, + "grad_norm": 1.0294455289840698, + "learning_rate": 4.0584788255905274e-05, + "loss": 2.8169, + "step": 26393 + }, + { + "epoch": 2.3912482163484405, + "grad_norm": 1.0176295042037964, + "learning_rate": 4.0578747054914515e-05, + "loss": 2.6759, + "step": 26394 + }, + { + "epoch": 2.3913388145228875, + "grad_norm": 0.9441274404525757, + "learning_rate": 4.057270585392376e-05, + "loss": 1.9371, + "step": 26395 + }, + { + "epoch": 2.391429412697334, + "grad_norm": 1.0053138732910156, + "learning_rate": 4.0566664652933003e-05, + "loss": 2.353, + "step": 26396 + }, + { + "epoch": 2.391520010871781, + "grad_norm": 0.8606649041175842, + "learning_rate": 4.056062345194225e-05, + "loss": 1.7901, + "step": 26397 + }, + { + "epoch": 2.3916106090462277, + "grad_norm": 0.9508804678916931, + "learning_rate": 4.055458225095149e-05, + "loss": 2.5024, + "step": 26398 + }, + { + "epoch": 2.3917012072206747, + "grad_norm": 0.9209039807319641, + "learning_rate": 4.054854104996074e-05, + "loss": 2.3197, + "step": 26399 + }, + { + "epoch": 2.3917918053951213, + "grad_norm": 1.024868369102478, + "learning_rate": 4.054249984896998e-05, + "loss": 2.5942, + "step": 26400 + }, + { + "epoch": 2.391882403569568, + "grad_norm": 1.0648105144500732, + "learning_rate": 4.053645864797922e-05, + "loss": 2.6596, + "step": 26401 + }, + { + "epoch": 2.391973001744015, + "grad_norm": 1.100703239440918, + "learning_rate": 4.053041744698846e-05, + "loss": 2.5298, + "step": 26402 + }, + { + "epoch": 2.392063599918462, + "grad_norm": 0.8925734162330627, + "learning_rate": 4.052437624599771e-05, + "loss": 2.166, + "step": 26403 + }, + { + "epoch": 2.3921541980929084, + "grad_norm": 0.87971431016922, + "learning_rate": 4.051833504500695e-05, + "loss": 1.7186, + "step": 26404 + }, + { + "epoch": 2.392244796267355, + "grad_norm": 1.0159530639648438, + "learning_rate": 4.051229384401619e-05, + "loss": 2.6401, + "step": 26405 + }, + { + "epoch": 2.392335394441802, + "grad_norm": 1.036870002746582, + "learning_rate": 4.050625264302543e-05, + "loss": 2.5652, + "step": 26406 + }, + { + "epoch": 2.392425992616249, + "grad_norm": 1.1049420833587646, + "learning_rate": 4.050021144203468e-05, + "loss": 2.4579, + "step": 26407 + }, + { + "epoch": 2.3925165907906956, + "grad_norm": 1.0443108081817627, + "learning_rate": 4.049417024104392e-05, + "loss": 2.6541, + "step": 26408 + }, + { + "epoch": 2.392607188965142, + "grad_norm": 1.0767465829849243, + "learning_rate": 4.048812904005316e-05, + "loss": 2.4913, + "step": 26409 + }, + { + "epoch": 2.392697787139589, + "grad_norm": 1.0745519399642944, + "learning_rate": 4.048208783906241e-05, + "loss": 2.6287, + "step": 26410 + }, + { + "epoch": 2.392788385314036, + "grad_norm": 1.0373425483703613, + "learning_rate": 4.047604663807165e-05, + "loss": 2.295, + "step": 26411 + }, + { + "epoch": 2.3928789834884827, + "grad_norm": 0.8798538446426392, + "learning_rate": 4.047000543708089e-05, + "loss": 2.1588, + "step": 26412 + }, + { + "epoch": 2.3929695816629293, + "grad_norm": 0.9514580965042114, + "learning_rate": 4.046396423609014e-05, + "loss": 2.558, + "step": 26413 + }, + { + "epoch": 2.3930601798373763, + "grad_norm": 0.9990853667259216, + "learning_rate": 4.045792303509938e-05, + "loss": 2.8335, + "step": 26414 + }, + { + "epoch": 2.3931507780118233, + "grad_norm": 1.0815681219100952, + "learning_rate": 4.0451881834108627e-05, + "loss": 2.8714, + "step": 26415 + }, + { + "epoch": 2.39324137618627, + "grad_norm": 1.0284279584884644, + "learning_rate": 4.044584063311787e-05, + "loss": 2.6684, + "step": 26416 + }, + { + "epoch": 2.3933319743607164, + "grad_norm": 1.0115371942520142, + "learning_rate": 4.043979943212711e-05, + "loss": 2.8417, + "step": 26417 + }, + { + "epoch": 2.3934225725351634, + "grad_norm": 0.9821946620941162, + "learning_rate": 4.0433758231136356e-05, + "loss": 2.6049, + "step": 26418 + }, + { + "epoch": 2.3935131707096104, + "grad_norm": 0.969927966594696, + "learning_rate": 4.04277170301456e-05, + "loss": 2.4223, + "step": 26419 + }, + { + "epoch": 2.393603768884057, + "grad_norm": 1.0781084299087524, + "learning_rate": 4.042167582915484e-05, + "loss": 2.5633, + "step": 26420 + }, + { + "epoch": 2.3936943670585036, + "grad_norm": 0.9788858890533447, + "learning_rate": 4.041563462816408e-05, + "loss": 2.5509, + "step": 26421 + }, + { + "epoch": 2.3937849652329506, + "grad_norm": 0.9751614332199097, + "learning_rate": 4.0409593427173326e-05, + "loss": 2.6015, + "step": 26422 + }, + { + "epoch": 2.393875563407397, + "grad_norm": 1.0180296897888184, + "learning_rate": 4.040355222618257e-05, + "loss": 2.58, + "step": 26423 + }, + { + "epoch": 2.393966161581844, + "grad_norm": 1.0470184087753296, + "learning_rate": 4.039751102519181e-05, + "loss": 2.2298, + "step": 26424 + }, + { + "epoch": 2.3940567597562907, + "grad_norm": 0.8943172097206116, + "learning_rate": 4.0391469824201055e-05, + "loss": 1.758, + "step": 26425 + }, + { + "epoch": 2.3941473579307377, + "grad_norm": 0.9644699096679688, + "learning_rate": 4.0385428623210296e-05, + "loss": 2.4003, + "step": 26426 + }, + { + "epoch": 2.3942379561051843, + "grad_norm": 1.1937750577926636, + "learning_rate": 4.037938742221954e-05, + "loss": 2.5453, + "step": 26427 + }, + { + "epoch": 2.3943285542796313, + "grad_norm": 1.0948944091796875, + "learning_rate": 4.037334622122878e-05, + "loss": 2.5428, + "step": 26428 + }, + { + "epoch": 2.394419152454078, + "grad_norm": 0.9918789863586426, + "learning_rate": 4.0367305020238025e-05, + "loss": 2.5419, + "step": 26429 + }, + { + "epoch": 2.394509750628525, + "grad_norm": 0.9323528409004211, + "learning_rate": 4.0361263819247266e-05, + "loss": 2.6487, + "step": 26430 + }, + { + "epoch": 2.3946003488029715, + "grad_norm": 1.0420631170272827, + "learning_rate": 4.0355222618256514e-05, + "loss": 2.7772, + "step": 26431 + }, + { + "epoch": 2.3946909469774185, + "grad_norm": 0.9448039531707764, + "learning_rate": 4.0349181417265755e-05, + "loss": 2.4256, + "step": 26432 + }, + { + "epoch": 2.394781545151865, + "grad_norm": 1.0399051904678345, + "learning_rate": 4.0343140216275e-05, + "loss": 2.7525, + "step": 26433 + }, + { + "epoch": 2.394872143326312, + "grad_norm": 0.8529791831970215, + "learning_rate": 4.033709901528424e-05, + "loss": 2.0559, + "step": 26434 + }, + { + "epoch": 2.3949627415007586, + "grad_norm": 0.9789939522743225, + "learning_rate": 4.0331057814293484e-05, + "loss": 2.4811, + "step": 26435 + }, + { + "epoch": 2.3950533396752056, + "grad_norm": 1.0331705808639526, + "learning_rate": 4.0325016613302725e-05, + "loss": 2.45, + "step": 26436 + }, + { + "epoch": 2.395143937849652, + "grad_norm": 1.0427234172821045, + "learning_rate": 4.031897541231197e-05, + "loss": 2.8069, + "step": 26437 + }, + { + "epoch": 2.395234536024099, + "grad_norm": 1.0021032094955444, + "learning_rate": 4.031293421132121e-05, + "loss": 2.7355, + "step": 26438 + }, + { + "epoch": 2.3953251341985458, + "grad_norm": 1.1920777559280396, + "learning_rate": 4.0306893010330454e-05, + "loss": 2.4251, + "step": 26439 + }, + { + "epoch": 2.3954157323729928, + "grad_norm": 0.8946499824523926, + "learning_rate": 4.03008518093397e-05, + "loss": 2.0282, + "step": 26440 + }, + { + "epoch": 2.3955063305474393, + "grad_norm": 1.042969822883606, + "learning_rate": 4.029481060834894e-05, + "loss": 2.5958, + "step": 26441 + }, + { + "epoch": 2.3955969287218863, + "grad_norm": 1.017568826675415, + "learning_rate": 4.028876940735818e-05, + "loss": 2.7731, + "step": 26442 + }, + { + "epoch": 2.395687526896333, + "grad_norm": 1.2511956691741943, + "learning_rate": 4.0282728206367424e-05, + "loss": 2.5071, + "step": 26443 + }, + { + "epoch": 2.39577812507078, + "grad_norm": 1.0258667469024658, + "learning_rate": 4.027668700537667e-05, + "loss": 2.6466, + "step": 26444 + }, + { + "epoch": 2.3958687232452265, + "grad_norm": 1.0224840641021729, + "learning_rate": 4.027064580438591e-05, + "loss": 2.673, + "step": 26445 + }, + { + "epoch": 2.3959593214196735, + "grad_norm": 0.9980807900428772, + "learning_rate": 4.026460460339515e-05, + "loss": 2.5685, + "step": 26446 + }, + { + "epoch": 2.39604991959412, + "grad_norm": 0.9415930509567261, + "learning_rate": 4.02585634024044e-05, + "loss": 2.5005, + "step": 26447 + }, + { + "epoch": 2.396140517768567, + "grad_norm": 1.002407431602478, + "learning_rate": 4.025252220141364e-05, + "loss": 2.75, + "step": 26448 + }, + { + "epoch": 2.3962311159430136, + "grad_norm": 1.005115270614624, + "learning_rate": 4.024648100042289e-05, + "loss": 3.068, + "step": 26449 + }, + { + "epoch": 2.3963217141174606, + "grad_norm": 1.003835678100586, + "learning_rate": 4.024043979943213e-05, + "loss": 2.6396, + "step": 26450 + }, + { + "epoch": 2.396412312291907, + "grad_norm": 0.9865524768829346, + "learning_rate": 4.023439859844137e-05, + "loss": 2.442, + "step": 26451 + }, + { + "epoch": 2.396502910466354, + "grad_norm": 1.0082283020019531, + "learning_rate": 4.022835739745062e-05, + "loss": 2.7571, + "step": 26452 + }, + { + "epoch": 2.396593508640801, + "grad_norm": 1.0606305599212646, + "learning_rate": 4.022231619645986e-05, + "loss": 2.7369, + "step": 26453 + }, + { + "epoch": 2.396684106815248, + "grad_norm": 0.9517646431922913, + "learning_rate": 4.02162749954691e-05, + "loss": 2.7757, + "step": 26454 + }, + { + "epoch": 2.3967747049896944, + "grad_norm": 1.0106146335601807, + "learning_rate": 4.021023379447835e-05, + "loss": 2.6049, + "step": 26455 + }, + { + "epoch": 2.3968653031641414, + "grad_norm": 1.0092695951461792, + "learning_rate": 4.020419259348759e-05, + "loss": 2.8063, + "step": 26456 + }, + { + "epoch": 2.396955901338588, + "grad_norm": 0.9765347838401794, + "learning_rate": 4.019815139249683e-05, + "loss": 2.4148, + "step": 26457 + }, + { + "epoch": 2.397046499513035, + "grad_norm": 0.9313143491744995, + "learning_rate": 4.019211019150607e-05, + "loss": 2.3269, + "step": 26458 + }, + { + "epoch": 2.3971370976874815, + "grad_norm": 0.9534542560577393, + "learning_rate": 4.018606899051532e-05, + "loss": 2.1836, + "step": 26459 + }, + { + "epoch": 2.3972276958619285, + "grad_norm": 0.9983850717544556, + "learning_rate": 4.018002778952456e-05, + "loss": 2.6521, + "step": 26460 + }, + { + "epoch": 2.397318294036375, + "grad_norm": 1.0419042110443115, + "learning_rate": 4.01739865885338e-05, + "loss": 2.7333, + "step": 26461 + }, + { + "epoch": 2.397408892210822, + "grad_norm": 1.0577086210250854, + "learning_rate": 4.016794538754304e-05, + "loss": 2.4496, + "step": 26462 + }, + { + "epoch": 2.3974994903852687, + "grad_norm": 0.9680010080337524, + "learning_rate": 4.016190418655229e-05, + "loss": 2.5555, + "step": 26463 + }, + { + "epoch": 2.3975900885597157, + "grad_norm": 0.9955122470855713, + "learning_rate": 4.015586298556153e-05, + "loss": 2.5268, + "step": 26464 + }, + { + "epoch": 2.3976806867341622, + "grad_norm": 1.0004793405532837, + "learning_rate": 4.0149821784570776e-05, + "loss": 2.6348, + "step": 26465 + }, + { + "epoch": 2.3977712849086092, + "grad_norm": 1.026973843574524, + "learning_rate": 4.014378058358002e-05, + "loss": 2.4979, + "step": 26466 + }, + { + "epoch": 2.397861883083056, + "grad_norm": 1.0181794166564941, + "learning_rate": 4.0137739382589265e-05, + "loss": 2.6717, + "step": 26467 + }, + { + "epoch": 2.397952481257503, + "grad_norm": 1.0550105571746826, + "learning_rate": 4.0131698181598506e-05, + "loss": 2.8079, + "step": 26468 + }, + { + "epoch": 2.3980430794319494, + "grad_norm": 1.0411319732666016, + "learning_rate": 4.0125656980607747e-05, + "loss": 2.3679, + "step": 26469 + }, + { + "epoch": 2.3981336776063964, + "grad_norm": 1.0526058673858643, + "learning_rate": 4.0119615779616994e-05, + "loss": 2.3854, + "step": 26470 + }, + { + "epoch": 2.398224275780843, + "grad_norm": 1.003678560256958, + "learning_rate": 4.0113574578626235e-05, + "loss": 2.8276, + "step": 26471 + }, + { + "epoch": 2.39831487395529, + "grad_norm": 0.9859017133712769, + "learning_rate": 4.0107533377635476e-05, + "loss": 2.6266, + "step": 26472 + }, + { + "epoch": 2.3984054721297365, + "grad_norm": 1.0611027479171753, + "learning_rate": 4.010149217664472e-05, + "loss": 2.4217, + "step": 26473 + }, + { + "epoch": 2.3984960703041835, + "grad_norm": 0.9716672301292419, + "learning_rate": 4.0095450975653964e-05, + "loss": 2.625, + "step": 26474 + }, + { + "epoch": 2.39858666847863, + "grad_norm": 0.9751306772232056, + "learning_rate": 4.0089409774663205e-05, + "loss": 2.4975, + "step": 26475 + }, + { + "epoch": 2.398677266653077, + "grad_norm": 0.9613768458366394, + "learning_rate": 4.0083368573672446e-05, + "loss": 2.4769, + "step": 26476 + }, + { + "epoch": 2.3987678648275237, + "grad_norm": 1.0082589387893677, + "learning_rate": 4.007732737268169e-05, + "loss": 2.9116, + "step": 26477 + }, + { + "epoch": 2.3988584630019707, + "grad_norm": 0.8514493703842163, + "learning_rate": 4.0071286171690934e-05, + "loss": 1.9216, + "step": 26478 + }, + { + "epoch": 2.3989490611764173, + "grad_norm": 0.9236847758293152, + "learning_rate": 4.0065244970700175e-05, + "loss": 1.8837, + "step": 26479 + }, + { + "epoch": 2.3990396593508643, + "grad_norm": 0.9951184391975403, + "learning_rate": 4.0059203769709416e-05, + "loss": 2.7089, + "step": 26480 + }, + { + "epoch": 2.399130257525311, + "grad_norm": 0.9685987830162048, + "learning_rate": 4.0053162568718664e-05, + "loss": 2.5028, + "step": 26481 + }, + { + "epoch": 2.399220855699758, + "grad_norm": 0.9851621985435486, + "learning_rate": 4.0047121367727904e-05, + "loss": 2.9023, + "step": 26482 + }, + { + "epoch": 2.3993114538742044, + "grad_norm": 1.1096552610397339, + "learning_rate": 4.004108016673715e-05, + "loss": 2.4823, + "step": 26483 + }, + { + "epoch": 2.399402052048651, + "grad_norm": 0.9986449480056763, + "learning_rate": 4.003503896574639e-05, + "loss": 2.6646, + "step": 26484 + }, + { + "epoch": 2.399492650223098, + "grad_norm": 1.0439515113830566, + "learning_rate": 4.002899776475564e-05, + "loss": 2.1111, + "step": 26485 + }, + { + "epoch": 2.399583248397545, + "grad_norm": 1.0408753156661987, + "learning_rate": 4.002295656376488e-05, + "loss": 2.4349, + "step": 26486 + }, + { + "epoch": 2.3996738465719916, + "grad_norm": 1.0032936334609985, + "learning_rate": 4.001691536277412e-05, + "loss": 2.8027, + "step": 26487 + }, + { + "epoch": 2.399764444746438, + "grad_norm": 1.0978515148162842, + "learning_rate": 4.001087416178336e-05, + "loss": 2.4282, + "step": 26488 + }, + { + "epoch": 2.399855042920885, + "grad_norm": 1.0622223615646362, + "learning_rate": 4.000483296079261e-05, + "loss": 2.5902, + "step": 26489 + }, + { + "epoch": 2.399945641095332, + "grad_norm": 0.8965999484062195, + "learning_rate": 3.999879175980185e-05, + "loss": 2.3749, + "step": 26490 + }, + { + "epoch": 2.4000362392697787, + "grad_norm": 1.1147260665893555, + "learning_rate": 3.999275055881109e-05, + "loss": 2.3751, + "step": 26491 + }, + { + "epoch": 2.4001268374442253, + "grad_norm": 1.010371446609497, + "learning_rate": 3.998670935782033e-05, + "loss": 2.6786, + "step": 26492 + }, + { + "epoch": 2.4002174356186723, + "grad_norm": 0.9508867859840393, + "learning_rate": 3.998066815682958e-05, + "loss": 2.4808, + "step": 26493 + }, + { + "epoch": 2.4003080337931193, + "grad_norm": 1.0038082599639893, + "learning_rate": 3.997462695583882e-05, + "loss": 2.5969, + "step": 26494 + }, + { + "epoch": 2.400398631967566, + "grad_norm": 1.0266671180725098, + "learning_rate": 3.996858575484806e-05, + "loss": 2.9849, + "step": 26495 + }, + { + "epoch": 2.4004892301420124, + "grad_norm": 1.0211859941482544, + "learning_rate": 3.99625445538573e-05, + "loss": 2.5551, + "step": 26496 + }, + { + "epoch": 2.4005798283164594, + "grad_norm": 1.0206782817840576, + "learning_rate": 3.995650335286655e-05, + "loss": 2.534, + "step": 26497 + }, + { + "epoch": 2.4006704264909065, + "grad_norm": 0.9683552384376526, + "learning_rate": 3.995046215187579e-05, + "loss": 2.4736, + "step": 26498 + }, + { + "epoch": 2.400761024665353, + "grad_norm": 0.9675029516220093, + "learning_rate": 3.994442095088504e-05, + "loss": 2.5327, + "step": 26499 + }, + { + "epoch": 2.4008516228397996, + "grad_norm": 0.9980500340461731, + "learning_rate": 3.993837974989429e-05, + "loss": 2.6534, + "step": 26500 + }, + { + "epoch": 2.4009422210142466, + "grad_norm": 0.9768445491790771, + "learning_rate": 3.993233854890353e-05, + "loss": 2.6927, + "step": 26501 + }, + { + "epoch": 2.401032819188693, + "grad_norm": 1.1578588485717773, + "learning_rate": 3.992629734791277e-05, + "loss": 2.6701, + "step": 26502 + }, + { + "epoch": 2.40112341736314, + "grad_norm": 1.0075645446777344, + "learning_rate": 3.992025614692201e-05, + "loss": 2.6749, + "step": 26503 + }, + { + "epoch": 2.4012140155375867, + "grad_norm": 1.067148208618164, + "learning_rate": 3.991421494593126e-05, + "loss": 2.8472, + "step": 26504 + }, + { + "epoch": 2.4013046137120337, + "grad_norm": 0.9945630431175232, + "learning_rate": 3.99081737449405e-05, + "loss": 2.5736, + "step": 26505 + }, + { + "epoch": 2.4013952118864803, + "grad_norm": 0.847726583480835, + "learning_rate": 3.990213254394974e-05, + "loss": 1.9555, + "step": 26506 + }, + { + "epoch": 2.4014858100609273, + "grad_norm": 1.0393855571746826, + "learning_rate": 3.989609134295898e-05, + "loss": 2.5535, + "step": 26507 + }, + { + "epoch": 2.401576408235374, + "grad_norm": 1.1098023653030396, + "learning_rate": 3.989005014196823e-05, + "loss": 2.594, + "step": 26508 + }, + { + "epoch": 2.401667006409821, + "grad_norm": 1.043357253074646, + "learning_rate": 3.988400894097747e-05, + "loss": 2.3847, + "step": 26509 + }, + { + "epoch": 2.4017576045842675, + "grad_norm": 1.0634459257125854, + "learning_rate": 3.987796773998671e-05, + "loss": 2.6238, + "step": 26510 + }, + { + "epoch": 2.4018482027587145, + "grad_norm": 1.1003499031066895, + "learning_rate": 3.987192653899595e-05, + "loss": 2.603, + "step": 26511 + }, + { + "epoch": 2.401938800933161, + "grad_norm": 0.9692444205284119, + "learning_rate": 3.98658853380052e-05, + "loss": 2.5634, + "step": 26512 + }, + { + "epoch": 2.402029399107608, + "grad_norm": 1.035446286201477, + "learning_rate": 3.985984413701444e-05, + "loss": 2.6087, + "step": 26513 + }, + { + "epoch": 2.4021199972820546, + "grad_norm": 0.9426827430725098, + "learning_rate": 3.985380293602368e-05, + "loss": 2.2968, + "step": 26514 + }, + { + "epoch": 2.4022105954565016, + "grad_norm": 0.9132274985313416, + "learning_rate": 3.9847761735032926e-05, + "loss": 2.3395, + "step": 26515 + }, + { + "epoch": 2.402301193630948, + "grad_norm": 1.023595929145813, + "learning_rate": 3.9841720534042174e-05, + "loss": 2.2299, + "step": 26516 + }, + { + "epoch": 2.402391791805395, + "grad_norm": 0.9912078976631165, + "learning_rate": 3.9835679333051415e-05, + "loss": 2.4879, + "step": 26517 + }, + { + "epoch": 2.4024823899798418, + "grad_norm": 0.9871009588241577, + "learning_rate": 3.9829638132060656e-05, + "loss": 2.7037, + "step": 26518 + }, + { + "epoch": 2.4025729881542888, + "grad_norm": 1.0339808464050293, + "learning_rate": 3.98235969310699e-05, + "loss": 2.612, + "step": 26519 + }, + { + "epoch": 2.4026635863287353, + "grad_norm": 0.9496550559997559, + "learning_rate": 3.9817555730079144e-05, + "loss": 2.7458, + "step": 26520 + }, + { + "epoch": 2.4027541845031823, + "grad_norm": 0.9794831275939941, + "learning_rate": 3.9811514529088385e-05, + "loss": 2.64, + "step": 26521 + }, + { + "epoch": 2.402844782677629, + "grad_norm": 1.019955039024353, + "learning_rate": 3.9805473328097626e-05, + "loss": 2.6377, + "step": 26522 + }, + { + "epoch": 2.402935380852076, + "grad_norm": 0.9989651441574097, + "learning_rate": 3.979943212710687e-05, + "loss": 2.7754, + "step": 26523 + }, + { + "epoch": 2.4030259790265225, + "grad_norm": 1.007280945777893, + "learning_rate": 3.9793390926116114e-05, + "loss": 2.5506, + "step": 26524 + }, + { + "epoch": 2.4031165772009695, + "grad_norm": 0.9879105091094971, + "learning_rate": 3.9787349725125355e-05, + "loss": 2.6327, + "step": 26525 + }, + { + "epoch": 2.403207175375416, + "grad_norm": 0.8961758613586426, + "learning_rate": 3.9781308524134596e-05, + "loss": 2.5288, + "step": 26526 + }, + { + "epoch": 2.403297773549863, + "grad_norm": 1.0926356315612793, + "learning_rate": 3.9775267323143843e-05, + "loss": 2.7398, + "step": 26527 + }, + { + "epoch": 2.4033883717243096, + "grad_norm": 1.0800861120224, + "learning_rate": 3.9769226122153084e-05, + "loss": 2.7554, + "step": 26528 + }, + { + "epoch": 2.4034789698987566, + "grad_norm": 0.9892880916595459, + "learning_rate": 3.9763184921162325e-05, + "loss": 2.6912, + "step": 26529 + }, + { + "epoch": 2.403569568073203, + "grad_norm": 1.0030503273010254, + "learning_rate": 3.975714372017157e-05, + "loss": 2.5973, + "step": 26530 + }, + { + "epoch": 2.40366016624765, + "grad_norm": 1.027464509010315, + "learning_rate": 3.9751102519180814e-05, + "loss": 2.697, + "step": 26531 + }, + { + "epoch": 2.403750764422097, + "grad_norm": 0.8246117234230042, + "learning_rate": 3.9745061318190054e-05, + "loss": 1.7414, + "step": 26532 + }, + { + "epoch": 2.403841362596544, + "grad_norm": 0.9756263494491577, + "learning_rate": 3.97390201171993e-05, + "loss": 2.7908, + "step": 26533 + }, + { + "epoch": 2.4039319607709904, + "grad_norm": 0.9134637117385864, + "learning_rate": 3.973297891620855e-05, + "loss": 2.5953, + "step": 26534 + }, + { + "epoch": 2.4040225589454374, + "grad_norm": 1.023589015007019, + "learning_rate": 3.972693771521779e-05, + "loss": 2.6565, + "step": 26535 + }, + { + "epoch": 2.404113157119884, + "grad_norm": 0.8966521620750427, + "learning_rate": 3.972089651422703e-05, + "loss": 1.8503, + "step": 26536 + }, + { + "epoch": 2.404203755294331, + "grad_norm": 1.0215867757797241, + "learning_rate": 3.971485531323627e-05, + "loss": 2.4975, + "step": 26537 + }, + { + "epoch": 2.4042943534687775, + "grad_norm": 0.9649685025215149, + "learning_rate": 3.970881411224552e-05, + "loss": 2.0926, + "step": 26538 + }, + { + "epoch": 2.4043849516432245, + "grad_norm": 1.0876742601394653, + "learning_rate": 3.970277291125476e-05, + "loss": 2.7444, + "step": 26539 + }, + { + "epoch": 2.404475549817671, + "grad_norm": 0.9147919416427612, + "learning_rate": 3.9696731710264e-05, + "loss": 1.9428, + "step": 26540 + }, + { + "epoch": 2.404566147992118, + "grad_norm": 0.9254249930381775, + "learning_rate": 3.969069050927324e-05, + "loss": 1.9193, + "step": 26541 + }, + { + "epoch": 2.4046567461665647, + "grad_norm": 1.0318843126296997, + "learning_rate": 3.968464930828249e-05, + "loss": 2.811, + "step": 26542 + }, + { + "epoch": 2.4047473443410117, + "grad_norm": 1.0741182565689087, + "learning_rate": 3.967860810729173e-05, + "loss": 2.6476, + "step": 26543 + }, + { + "epoch": 2.4048379425154582, + "grad_norm": 0.9722288250923157, + "learning_rate": 3.967256690630097e-05, + "loss": 2.4503, + "step": 26544 + }, + { + "epoch": 2.4049285406899052, + "grad_norm": 1.002345323562622, + "learning_rate": 3.966652570531022e-05, + "loss": 2.485, + "step": 26545 + }, + { + "epoch": 2.405019138864352, + "grad_norm": 1.0431673526763916, + "learning_rate": 3.966048450431946e-05, + "loss": 2.4514, + "step": 26546 + }, + { + "epoch": 2.405109737038799, + "grad_norm": 1.0232558250427246, + "learning_rate": 3.96544433033287e-05, + "loss": 2.9522, + "step": 26547 + }, + { + "epoch": 2.4052003352132454, + "grad_norm": 1.032553791999817, + "learning_rate": 3.964840210233794e-05, + "loss": 2.3271, + "step": 26548 + }, + { + "epoch": 2.4052909333876924, + "grad_norm": 0.9275535941123962, + "learning_rate": 3.964236090134719e-05, + "loss": 2.3367, + "step": 26549 + }, + { + "epoch": 2.405381531562139, + "grad_norm": 0.8339654207229614, + "learning_rate": 3.963631970035644e-05, + "loss": 1.8019, + "step": 26550 + }, + { + "epoch": 2.405472129736586, + "grad_norm": 1.1747877597808838, + "learning_rate": 3.963027849936568e-05, + "loss": 2.6302, + "step": 26551 + }, + { + "epoch": 2.4055627279110325, + "grad_norm": 0.9686987996101379, + "learning_rate": 3.962423729837492e-05, + "loss": 2.42, + "step": 26552 + }, + { + "epoch": 2.4056533260854795, + "grad_norm": 1.0086950063705444, + "learning_rate": 3.9618196097384166e-05, + "loss": 2.9808, + "step": 26553 + }, + { + "epoch": 2.405743924259926, + "grad_norm": 0.99753737449646, + "learning_rate": 3.961215489639341e-05, + "loss": 2.4478, + "step": 26554 + }, + { + "epoch": 2.405834522434373, + "grad_norm": 0.9935030937194824, + "learning_rate": 3.960611369540265e-05, + "loss": 2.4913, + "step": 26555 + }, + { + "epoch": 2.4059251206088197, + "grad_norm": 1.0267268419265747, + "learning_rate": 3.960007249441189e-05, + "loss": 2.3825, + "step": 26556 + }, + { + "epoch": 2.4060157187832667, + "grad_norm": 1.0406488180160522, + "learning_rate": 3.9594031293421136e-05, + "loss": 2.7232, + "step": 26557 + }, + { + "epoch": 2.4061063169577133, + "grad_norm": 0.9751798510551453, + "learning_rate": 3.958799009243038e-05, + "loss": 2.7526, + "step": 26558 + }, + { + "epoch": 2.4061969151321603, + "grad_norm": 1.0125607252120972, + "learning_rate": 3.958194889143962e-05, + "loss": 2.5178, + "step": 26559 + }, + { + "epoch": 2.406287513306607, + "grad_norm": 1.0252083539962769, + "learning_rate": 3.9575907690448865e-05, + "loss": 3.0419, + "step": 26560 + }, + { + "epoch": 2.406378111481054, + "grad_norm": 1.1670488119125366, + "learning_rate": 3.9569866489458106e-05, + "loss": 2.5834, + "step": 26561 + }, + { + "epoch": 2.4064687096555004, + "grad_norm": 1.0041160583496094, + "learning_rate": 3.956382528846735e-05, + "loss": 2.7296, + "step": 26562 + }, + { + "epoch": 2.406559307829947, + "grad_norm": 1.2030301094055176, + "learning_rate": 3.955778408747659e-05, + "loss": 2.442, + "step": 26563 + }, + { + "epoch": 2.406649906004394, + "grad_norm": 0.898256778717041, + "learning_rate": 3.9551742886485835e-05, + "loss": 2.0129, + "step": 26564 + }, + { + "epoch": 2.406740504178841, + "grad_norm": 0.9362576007843018, + "learning_rate": 3.9545701685495076e-05, + "loss": 2.3981, + "step": 26565 + }, + { + "epoch": 2.4068311023532876, + "grad_norm": 0.9925419688224792, + "learning_rate": 3.9539660484504324e-05, + "loss": 2.3525, + "step": 26566 + }, + { + "epoch": 2.406921700527734, + "grad_norm": 1.006656527519226, + "learning_rate": 3.9533619283513565e-05, + "loss": 2.7347, + "step": 26567 + }, + { + "epoch": 2.407012298702181, + "grad_norm": 1.0371756553649902, + "learning_rate": 3.952757808252281e-05, + "loss": 2.8419, + "step": 26568 + }, + { + "epoch": 2.407102896876628, + "grad_norm": 1.1736663579940796, + "learning_rate": 3.952153688153205e-05, + "loss": 2.6418, + "step": 26569 + }, + { + "epoch": 2.4071934950510747, + "grad_norm": 1.0250825881958008, + "learning_rate": 3.9515495680541294e-05, + "loss": 2.6907, + "step": 26570 + }, + { + "epoch": 2.4072840932255213, + "grad_norm": 1.0280438661575317, + "learning_rate": 3.9509454479550535e-05, + "loss": 2.7432, + "step": 26571 + }, + { + "epoch": 2.4073746913999683, + "grad_norm": 1.0276753902435303, + "learning_rate": 3.950341327855978e-05, + "loss": 2.6137, + "step": 26572 + }, + { + "epoch": 2.4074652895744153, + "grad_norm": 0.9679116010665894, + "learning_rate": 3.949737207756902e-05, + "loss": 2.2757, + "step": 26573 + }, + { + "epoch": 2.407555887748862, + "grad_norm": 0.9891310334205627, + "learning_rate": 3.9491330876578264e-05, + "loss": 2.6336, + "step": 26574 + }, + { + "epoch": 2.4076464859233084, + "grad_norm": 0.9891055226325989, + "learning_rate": 3.948528967558751e-05, + "loss": 2.7422, + "step": 26575 + }, + { + "epoch": 2.4077370840977554, + "grad_norm": 1.0326412916183472, + "learning_rate": 3.947924847459675e-05, + "loss": 2.7463, + "step": 26576 + }, + { + "epoch": 2.4078276822722025, + "grad_norm": 1.0185174942016602, + "learning_rate": 3.947320727360599e-05, + "loss": 2.8004, + "step": 26577 + }, + { + "epoch": 2.407918280446649, + "grad_norm": 1.0111736059188843, + "learning_rate": 3.9467166072615234e-05, + "loss": 2.4582, + "step": 26578 + }, + { + "epoch": 2.4080088786210956, + "grad_norm": 1.1836103200912476, + "learning_rate": 3.946112487162448e-05, + "loss": 2.7033, + "step": 26579 + }, + { + "epoch": 2.4080994767955426, + "grad_norm": 1.076438307762146, + "learning_rate": 3.945508367063372e-05, + "loss": 2.5733, + "step": 26580 + }, + { + "epoch": 2.4081900749699896, + "grad_norm": 1.1353336572647095, + "learning_rate": 3.9449042469642963e-05, + "loss": 2.3893, + "step": 26581 + }, + { + "epoch": 2.408280673144436, + "grad_norm": 0.9453515410423279, + "learning_rate": 3.944300126865221e-05, + "loss": 2.5438, + "step": 26582 + }, + { + "epoch": 2.4083712713188827, + "grad_norm": 1.0290429592132568, + "learning_rate": 3.943696006766145e-05, + "loss": 2.3341, + "step": 26583 + }, + { + "epoch": 2.4084618694933297, + "grad_norm": 0.9720745086669922, + "learning_rate": 3.94309188666707e-05, + "loss": 2.7366, + "step": 26584 + }, + { + "epoch": 2.4085524676677763, + "grad_norm": 1.066048502922058, + "learning_rate": 3.942487766567994e-05, + "loss": 2.5927, + "step": 26585 + }, + { + "epoch": 2.4086430658422233, + "grad_norm": 0.9985412359237671, + "learning_rate": 3.941883646468918e-05, + "loss": 2.7368, + "step": 26586 + }, + { + "epoch": 2.40873366401667, + "grad_norm": 1.0602713823318481, + "learning_rate": 3.941279526369843e-05, + "loss": 2.698, + "step": 26587 + }, + { + "epoch": 2.408824262191117, + "grad_norm": 1.0239757299423218, + "learning_rate": 3.940675406270767e-05, + "loss": 2.7418, + "step": 26588 + }, + { + "epoch": 2.4089148603655635, + "grad_norm": 1.0065847635269165, + "learning_rate": 3.940071286171691e-05, + "loss": 2.5629, + "step": 26589 + }, + { + "epoch": 2.4090054585400105, + "grad_norm": 0.9571665525436401, + "learning_rate": 3.939467166072616e-05, + "loss": 2.5114, + "step": 26590 + }, + { + "epoch": 2.409096056714457, + "grad_norm": 0.8638721108436584, + "learning_rate": 3.93886304597354e-05, + "loss": 1.9847, + "step": 26591 + }, + { + "epoch": 2.409186654888904, + "grad_norm": 0.9859691858291626, + "learning_rate": 3.938258925874464e-05, + "loss": 2.4844, + "step": 26592 + }, + { + "epoch": 2.4092772530633506, + "grad_norm": 1.0004444122314453, + "learning_rate": 3.937654805775388e-05, + "loss": 2.5759, + "step": 26593 + }, + { + "epoch": 2.4093678512377976, + "grad_norm": 1.0481053590774536, + "learning_rate": 3.937050685676313e-05, + "loss": 2.6565, + "step": 26594 + }, + { + "epoch": 2.409458449412244, + "grad_norm": 1.076888918876648, + "learning_rate": 3.936446565577237e-05, + "loss": 2.742, + "step": 26595 + }, + { + "epoch": 2.409549047586691, + "grad_norm": 0.8560807108879089, + "learning_rate": 3.935842445478161e-05, + "loss": 2.1922, + "step": 26596 + }, + { + "epoch": 2.4096396457611378, + "grad_norm": 0.9030781984329224, + "learning_rate": 3.935238325379085e-05, + "loss": 1.9432, + "step": 26597 + }, + { + "epoch": 2.4097302439355848, + "grad_norm": 0.8473883867263794, + "learning_rate": 3.93463420528001e-05, + "loss": 2.1199, + "step": 26598 + }, + { + "epoch": 2.4098208421100313, + "grad_norm": 1.0000948905944824, + "learning_rate": 3.934030085180934e-05, + "loss": 2.536, + "step": 26599 + }, + { + "epoch": 2.4099114402844783, + "grad_norm": 1.0348237752914429, + "learning_rate": 3.933425965081859e-05, + "loss": 2.7765, + "step": 26600 + }, + { + "epoch": 2.410002038458925, + "grad_norm": 1.0070536136627197, + "learning_rate": 3.932821844982783e-05, + "loss": 2.5858, + "step": 26601 + }, + { + "epoch": 2.410092636633372, + "grad_norm": 0.9382168054580688, + "learning_rate": 3.9322177248837075e-05, + "loss": 2.5492, + "step": 26602 + }, + { + "epoch": 2.4101832348078185, + "grad_norm": 0.9404515624046326, + "learning_rate": 3.9316136047846316e-05, + "loss": 2.5974, + "step": 26603 + }, + { + "epoch": 2.4102738329822655, + "grad_norm": 0.9965949654579163, + "learning_rate": 3.931009484685556e-05, + "loss": 2.6952, + "step": 26604 + }, + { + "epoch": 2.410364431156712, + "grad_norm": 0.9848105311393738, + "learning_rate": 3.9304053645864804e-05, + "loss": 2.5703, + "step": 26605 + }, + { + "epoch": 2.410455029331159, + "grad_norm": 1.0436426401138306, + "learning_rate": 3.9298012444874045e-05, + "loss": 2.7262, + "step": 26606 + }, + { + "epoch": 2.4105456275056056, + "grad_norm": 0.9425270557403564, + "learning_rate": 3.9291971243883286e-05, + "loss": 2.4252, + "step": 26607 + }, + { + "epoch": 2.4106362256800526, + "grad_norm": 1.0011699199676514, + "learning_rate": 3.928593004289253e-05, + "loss": 2.6342, + "step": 26608 + }, + { + "epoch": 2.410726823854499, + "grad_norm": 1.030839443206787, + "learning_rate": 3.9279888841901774e-05, + "loss": 2.6449, + "step": 26609 + }, + { + "epoch": 2.410817422028946, + "grad_norm": 1.0032750368118286, + "learning_rate": 3.9273847640911015e-05, + "loss": 2.2892, + "step": 26610 + }, + { + "epoch": 2.410908020203393, + "grad_norm": 1.1504812240600586, + "learning_rate": 3.9267806439920256e-05, + "loss": 2.5348, + "step": 26611 + }, + { + "epoch": 2.41099861837784, + "grad_norm": 0.9037500023841858, + "learning_rate": 3.92617652389295e-05, + "loss": 2.0006, + "step": 26612 + }, + { + "epoch": 2.4110892165522864, + "grad_norm": 1.009236216545105, + "learning_rate": 3.9255724037938745e-05, + "loss": 2.7632, + "step": 26613 + }, + { + "epoch": 2.4111798147267334, + "grad_norm": 1.0065538883209229, + "learning_rate": 3.9249682836947985e-05, + "loss": 2.6924, + "step": 26614 + }, + { + "epoch": 2.41127041290118, + "grad_norm": 1.2488288879394531, + "learning_rate": 3.9243641635957226e-05, + "loss": 2.4416, + "step": 26615 + }, + { + "epoch": 2.411361011075627, + "grad_norm": 1.0338679552078247, + "learning_rate": 3.9237600434966474e-05, + "loss": 2.7154, + "step": 26616 + }, + { + "epoch": 2.4114516092500735, + "grad_norm": 1.0312919616699219, + "learning_rate": 3.9231559233975715e-05, + "loss": 2.6127, + "step": 26617 + }, + { + "epoch": 2.4115422074245205, + "grad_norm": 1.0137197971343994, + "learning_rate": 3.922551803298496e-05, + "loss": 2.4946, + "step": 26618 + }, + { + "epoch": 2.411632805598967, + "grad_norm": 1.000023365020752, + "learning_rate": 3.92194768319942e-05, + "loss": 2.6551, + "step": 26619 + }, + { + "epoch": 2.411723403773414, + "grad_norm": 0.9578250646591187, + "learning_rate": 3.921343563100345e-05, + "loss": 2.4014, + "step": 26620 + }, + { + "epoch": 2.4118140019478607, + "grad_norm": 1.1240860223770142, + "learning_rate": 3.920739443001269e-05, + "loss": 2.6684, + "step": 26621 + }, + { + "epoch": 2.4119046001223077, + "grad_norm": 0.9271045327186584, + "learning_rate": 3.920135322902193e-05, + "loss": 2.5379, + "step": 26622 + }, + { + "epoch": 2.4119951982967542, + "grad_norm": 1.0742894411087036, + "learning_rate": 3.919531202803117e-05, + "loss": 2.2849, + "step": 26623 + }, + { + "epoch": 2.4120857964712012, + "grad_norm": 0.9783130884170532, + "learning_rate": 3.918927082704042e-05, + "loss": 2.5284, + "step": 26624 + }, + { + "epoch": 2.412176394645648, + "grad_norm": 1.0188339948654175, + "learning_rate": 3.918322962604966e-05, + "loss": 2.3897, + "step": 26625 + }, + { + "epoch": 2.412266992820095, + "grad_norm": 1.012695074081421, + "learning_rate": 3.91771884250589e-05, + "loss": 2.6608, + "step": 26626 + }, + { + "epoch": 2.4123575909945414, + "grad_norm": 0.9571444988250732, + "learning_rate": 3.917114722406814e-05, + "loss": 2.4776, + "step": 26627 + }, + { + "epoch": 2.4124481891689884, + "grad_norm": 0.9822997450828552, + "learning_rate": 3.916510602307739e-05, + "loss": 2.4762, + "step": 26628 + }, + { + "epoch": 2.412538787343435, + "grad_norm": 1.0867884159088135, + "learning_rate": 3.915906482208663e-05, + "loss": 2.5332, + "step": 26629 + }, + { + "epoch": 2.412629385517882, + "grad_norm": 1.0377522706985474, + "learning_rate": 3.915302362109587e-05, + "loss": 2.5249, + "step": 26630 + }, + { + "epoch": 2.4127199836923285, + "grad_norm": 1.004615068435669, + "learning_rate": 3.914698242010511e-05, + "loss": 2.608, + "step": 26631 + }, + { + "epoch": 2.4128105818667755, + "grad_norm": 1.0166687965393066, + "learning_rate": 3.914094121911436e-05, + "loss": 2.7289, + "step": 26632 + }, + { + "epoch": 2.412901180041222, + "grad_norm": 0.995424747467041, + "learning_rate": 3.91349000181236e-05, + "loss": 2.6125, + "step": 26633 + }, + { + "epoch": 2.412991778215669, + "grad_norm": 0.7606468200683594, + "learning_rate": 3.912885881713285e-05, + "loss": 1.8554, + "step": 26634 + }, + { + "epoch": 2.4130823763901157, + "grad_norm": 0.7511498928070068, + "learning_rate": 3.912281761614209e-05, + "loss": 1.4095, + "step": 26635 + }, + { + "epoch": 2.4131729745645627, + "grad_norm": 0.8215427994728088, + "learning_rate": 3.911677641515134e-05, + "loss": 2.0204, + "step": 26636 + }, + { + "epoch": 2.4132635727390093, + "grad_norm": 1.0128448009490967, + "learning_rate": 3.911073521416058e-05, + "loss": 2.4788, + "step": 26637 + }, + { + "epoch": 2.4133541709134563, + "grad_norm": 1.0447057485580444, + "learning_rate": 3.910469401316982e-05, + "loss": 2.4032, + "step": 26638 + }, + { + "epoch": 2.413444769087903, + "grad_norm": 0.8821820020675659, + "learning_rate": 3.909865281217907e-05, + "loss": 2.1778, + "step": 26639 + }, + { + "epoch": 2.41353536726235, + "grad_norm": 0.8677047491073608, + "learning_rate": 3.909261161118831e-05, + "loss": 1.9529, + "step": 26640 + }, + { + "epoch": 2.4136259654367964, + "grad_norm": 0.959113359451294, + "learning_rate": 3.908657041019755e-05, + "loss": 2.6165, + "step": 26641 + }, + { + "epoch": 2.4137165636112434, + "grad_norm": 0.9840253591537476, + "learning_rate": 3.908052920920679e-05, + "loss": 2.7188, + "step": 26642 + }, + { + "epoch": 2.41380716178569, + "grad_norm": 0.9580073952674866, + "learning_rate": 3.907448800821604e-05, + "loss": 2.5977, + "step": 26643 + }, + { + "epoch": 2.413897759960137, + "grad_norm": 0.9516295194625854, + "learning_rate": 3.906844680722528e-05, + "loss": 2.5259, + "step": 26644 + }, + { + "epoch": 2.4139883581345836, + "grad_norm": 1.0344451665878296, + "learning_rate": 3.906240560623452e-05, + "loss": 2.4771, + "step": 26645 + }, + { + "epoch": 2.41407895630903, + "grad_norm": 0.9485998153686523, + "learning_rate": 3.905636440524376e-05, + "loss": 2.7175, + "step": 26646 + }, + { + "epoch": 2.414169554483477, + "grad_norm": 1.0025224685668945, + "learning_rate": 3.905032320425301e-05, + "loss": 2.6183, + "step": 26647 + }, + { + "epoch": 2.414260152657924, + "grad_norm": 0.9699389934539795, + "learning_rate": 3.904428200326225e-05, + "loss": 2.6265, + "step": 26648 + }, + { + "epoch": 2.4143507508323707, + "grad_norm": 1.1023550033569336, + "learning_rate": 3.903824080227149e-05, + "loss": 2.5418, + "step": 26649 + }, + { + "epoch": 2.4144413490068173, + "grad_norm": 1.065027117729187, + "learning_rate": 3.9032199601280737e-05, + "loss": 3.0663, + "step": 26650 + }, + { + "epoch": 2.4145319471812643, + "grad_norm": 1.028035283088684, + "learning_rate": 3.902615840028998e-05, + "loss": 2.7152, + "step": 26651 + }, + { + "epoch": 2.4146225453557113, + "grad_norm": 1.0578968524932861, + "learning_rate": 3.9020117199299225e-05, + "loss": 2.554, + "step": 26652 + }, + { + "epoch": 2.414713143530158, + "grad_norm": 1.0396311283111572, + "learning_rate": 3.9014075998308466e-05, + "loss": 2.6074, + "step": 26653 + }, + { + "epoch": 2.4148037417046044, + "grad_norm": 1.0328165292739868, + "learning_rate": 3.9008034797317713e-05, + "loss": 2.5864, + "step": 26654 + }, + { + "epoch": 2.4148943398790514, + "grad_norm": 1.056753396987915, + "learning_rate": 3.9001993596326954e-05, + "loss": 2.5879, + "step": 26655 + }, + { + "epoch": 2.4149849380534985, + "grad_norm": 1.0192136764526367, + "learning_rate": 3.8995952395336195e-05, + "loss": 2.6331, + "step": 26656 + }, + { + "epoch": 2.415075536227945, + "grad_norm": 0.9443731904029846, + "learning_rate": 3.8989911194345436e-05, + "loss": 2.5937, + "step": 26657 + }, + { + "epoch": 2.4151661344023916, + "grad_norm": 0.9764237403869629, + "learning_rate": 3.8983869993354683e-05, + "loss": 2.6463, + "step": 26658 + }, + { + "epoch": 2.4152567325768386, + "grad_norm": 1.074278712272644, + "learning_rate": 3.8977828792363924e-05, + "loss": 2.5205, + "step": 26659 + }, + { + "epoch": 2.4153473307512856, + "grad_norm": 1.1331000328063965, + "learning_rate": 3.8971787591373165e-05, + "loss": 2.4585, + "step": 26660 + }, + { + "epoch": 2.415437928925732, + "grad_norm": 0.9678032398223877, + "learning_rate": 3.8965746390382406e-05, + "loss": 2.3784, + "step": 26661 + }, + { + "epoch": 2.4155285271001787, + "grad_norm": 1.032082438468933, + "learning_rate": 3.8959705189391654e-05, + "loss": 2.4363, + "step": 26662 + }, + { + "epoch": 2.4156191252746257, + "grad_norm": 0.8943324685096741, + "learning_rate": 3.8953663988400894e-05, + "loss": 1.847, + "step": 26663 + }, + { + "epoch": 2.4157097234490723, + "grad_norm": 1.0641251802444458, + "learning_rate": 3.8947622787410135e-05, + "loss": 2.749, + "step": 26664 + }, + { + "epoch": 2.4158003216235193, + "grad_norm": 0.9607957005500793, + "learning_rate": 3.894158158641938e-05, + "loss": 1.8903, + "step": 26665 + }, + { + "epoch": 2.415890919797966, + "grad_norm": 0.9601845741271973, + "learning_rate": 3.8935540385428624e-05, + "loss": 2.5065, + "step": 26666 + }, + { + "epoch": 2.415981517972413, + "grad_norm": 0.940957248210907, + "learning_rate": 3.8929499184437865e-05, + "loss": 2.5246, + "step": 26667 + }, + { + "epoch": 2.4160721161468595, + "grad_norm": 1.055527687072754, + "learning_rate": 3.892345798344711e-05, + "loss": 2.9534, + "step": 26668 + }, + { + "epoch": 2.4161627143213065, + "grad_norm": 1.072511076927185, + "learning_rate": 3.891741678245635e-05, + "loss": 2.4346, + "step": 26669 + }, + { + "epoch": 2.416253312495753, + "grad_norm": 1.0275704860687256, + "learning_rate": 3.89113755814656e-05, + "loss": 2.6901, + "step": 26670 + }, + { + "epoch": 2.4163439106702, + "grad_norm": 0.9940920472145081, + "learning_rate": 3.890533438047484e-05, + "loss": 2.471, + "step": 26671 + }, + { + "epoch": 2.4164345088446466, + "grad_norm": 0.8605744242668152, + "learning_rate": 3.889929317948408e-05, + "loss": 1.9081, + "step": 26672 + }, + { + "epoch": 2.4165251070190936, + "grad_norm": 1.1243817806243896, + "learning_rate": 3.889325197849333e-05, + "loss": 2.4325, + "step": 26673 + }, + { + "epoch": 2.41661570519354, + "grad_norm": 0.9621209502220154, + "learning_rate": 3.888721077750257e-05, + "loss": 2.6801, + "step": 26674 + }, + { + "epoch": 2.416706303367987, + "grad_norm": 0.9095824360847473, + "learning_rate": 3.888116957651181e-05, + "loss": 2.0632, + "step": 26675 + }, + { + "epoch": 2.4167969015424338, + "grad_norm": 1.0360186100006104, + "learning_rate": 3.887512837552105e-05, + "loss": 1.9407, + "step": 26676 + }, + { + "epoch": 2.4168874997168808, + "grad_norm": 1.0518039464950562, + "learning_rate": 3.88690871745303e-05, + "loss": 2.5461, + "step": 26677 + }, + { + "epoch": 2.4169780978913273, + "grad_norm": 0.9974620342254639, + "learning_rate": 3.886304597353954e-05, + "loss": 2.522, + "step": 26678 + }, + { + "epoch": 2.4170686960657743, + "grad_norm": 1.0182995796203613, + "learning_rate": 3.885700477254878e-05, + "loss": 2.3901, + "step": 26679 + }, + { + "epoch": 2.417159294240221, + "grad_norm": 1.0165351629257202, + "learning_rate": 3.885096357155803e-05, + "loss": 2.4869, + "step": 26680 + }, + { + "epoch": 2.417249892414668, + "grad_norm": 1.031701922416687, + "learning_rate": 3.884492237056727e-05, + "loss": 2.5363, + "step": 26681 + }, + { + "epoch": 2.4173404905891145, + "grad_norm": 0.8218780755996704, + "learning_rate": 3.883888116957651e-05, + "loss": 1.8231, + "step": 26682 + }, + { + "epoch": 2.4174310887635615, + "grad_norm": 1.045672059059143, + "learning_rate": 3.883283996858575e-05, + "loss": 2.7205, + "step": 26683 + }, + { + "epoch": 2.417521686938008, + "grad_norm": 1.0191956758499146, + "learning_rate": 3.8826798767595e-05, + "loss": 2.5863, + "step": 26684 + }, + { + "epoch": 2.417612285112455, + "grad_norm": 1.0704524517059326, + "learning_rate": 3.882075756660424e-05, + "loss": 2.5569, + "step": 26685 + }, + { + "epoch": 2.4177028832869016, + "grad_norm": 0.973703145980835, + "learning_rate": 3.881471636561349e-05, + "loss": 2.4619, + "step": 26686 + }, + { + "epoch": 2.4177934814613486, + "grad_norm": 1.1088054180145264, + "learning_rate": 3.880867516462273e-05, + "loss": 2.3351, + "step": 26687 + }, + { + "epoch": 2.417884079635795, + "grad_norm": 0.9348124861717224, + "learning_rate": 3.8802633963631976e-05, + "loss": 2.3137, + "step": 26688 + }, + { + "epoch": 2.4179746778102422, + "grad_norm": 1.1439473628997803, + "learning_rate": 3.879659276264122e-05, + "loss": 2.6606, + "step": 26689 + }, + { + "epoch": 2.418065275984689, + "grad_norm": 0.9713089466094971, + "learning_rate": 3.879055156165046e-05, + "loss": 1.7532, + "step": 26690 + }, + { + "epoch": 2.418155874159136, + "grad_norm": 1.004715085029602, + "learning_rate": 3.8784510360659705e-05, + "loss": 2.498, + "step": 26691 + }, + { + "epoch": 2.4182464723335824, + "grad_norm": 0.9628165364265442, + "learning_rate": 3.8778469159668946e-05, + "loss": 2.6025, + "step": 26692 + }, + { + "epoch": 2.4183370705080294, + "grad_norm": 1.0001825094223022, + "learning_rate": 3.877242795867819e-05, + "loss": 2.5607, + "step": 26693 + }, + { + "epoch": 2.418427668682476, + "grad_norm": 0.986325204372406, + "learning_rate": 3.876638675768743e-05, + "loss": 2.4877, + "step": 26694 + }, + { + "epoch": 2.418518266856923, + "grad_norm": 0.9936251044273376, + "learning_rate": 3.8760345556696675e-05, + "loss": 2.7706, + "step": 26695 + }, + { + "epoch": 2.4186088650313695, + "grad_norm": 0.8771023750305176, + "learning_rate": 3.8754304355705916e-05, + "loss": 1.7321, + "step": 26696 + }, + { + "epoch": 2.4186994632058165, + "grad_norm": 0.9753736257553101, + "learning_rate": 3.874826315471516e-05, + "loss": 2.5346, + "step": 26697 + }, + { + "epoch": 2.418790061380263, + "grad_norm": 1.056667447090149, + "learning_rate": 3.87422219537244e-05, + "loss": 2.7317, + "step": 26698 + }, + { + "epoch": 2.41888065955471, + "grad_norm": 0.9970550537109375, + "learning_rate": 3.8736180752733646e-05, + "loss": 2.5178, + "step": 26699 + }, + { + "epoch": 2.4189712577291567, + "grad_norm": 0.9530006051063538, + "learning_rate": 3.8730139551742886e-05, + "loss": 2.5194, + "step": 26700 + }, + { + "epoch": 2.4190618559036037, + "grad_norm": 0.8524312376976013, + "learning_rate": 3.872409835075213e-05, + "loss": 1.9164, + "step": 26701 + }, + { + "epoch": 2.4191524540780502, + "grad_norm": 0.9564450979232788, + "learning_rate": 3.8718057149761375e-05, + "loss": 2.626, + "step": 26702 + }, + { + "epoch": 2.4192430522524973, + "grad_norm": 0.8473471999168396, + "learning_rate": 3.871201594877062e-05, + "loss": 1.7944, + "step": 26703 + }, + { + "epoch": 2.419333650426944, + "grad_norm": 1.0138400793075562, + "learning_rate": 3.870597474777986e-05, + "loss": 2.7135, + "step": 26704 + }, + { + "epoch": 2.419424248601391, + "grad_norm": 1.0975555181503296, + "learning_rate": 3.8699933546789104e-05, + "loss": 2.8344, + "step": 26705 + }, + { + "epoch": 2.4195148467758374, + "grad_norm": 0.9864324331283569, + "learning_rate": 3.869389234579835e-05, + "loss": 2.5136, + "step": 26706 + }, + { + "epoch": 2.4196054449502844, + "grad_norm": 1.1391600370407104, + "learning_rate": 3.868785114480759e-05, + "loss": 2.6562, + "step": 26707 + }, + { + "epoch": 2.419696043124731, + "grad_norm": 1.1373950242996216, + "learning_rate": 3.868180994381683e-05, + "loss": 2.9803, + "step": 26708 + }, + { + "epoch": 2.419786641299178, + "grad_norm": 1.0286508798599243, + "learning_rate": 3.8675768742826074e-05, + "loss": 2.7331, + "step": 26709 + }, + { + "epoch": 2.4198772394736245, + "grad_norm": 1.045444369316101, + "learning_rate": 3.866972754183532e-05, + "loss": 2.4991, + "step": 26710 + }, + { + "epoch": 2.4199678376480716, + "grad_norm": 0.8685245513916016, + "learning_rate": 3.866368634084456e-05, + "loss": 1.8362, + "step": 26711 + }, + { + "epoch": 2.420058435822518, + "grad_norm": 1.1752407550811768, + "learning_rate": 3.8657645139853803e-05, + "loss": 2.5465, + "step": 26712 + }, + { + "epoch": 2.420149033996965, + "grad_norm": 1.0086233615875244, + "learning_rate": 3.8651603938863044e-05, + "loss": 2.5552, + "step": 26713 + }, + { + "epoch": 2.4202396321714117, + "grad_norm": 1.0394734144210815, + "learning_rate": 3.864556273787229e-05, + "loss": 2.4842, + "step": 26714 + }, + { + "epoch": 2.4203302303458587, + "grad_norm": 0.830082893371582, + "learning_rate": 3.863952153688153e-05, + "loss": 2.0629, + "step": 26715 + }, + { + "epoch": 2.4204208285203053, + "grad_norm": 0.9810564517974854, + "learning_rate": 3.8633480335890774e-05, + "loss": 2.4971, + "step": 26716 + }, + { + "epoch": 2.4205114266947523, + "grad_norm": 0.9750928282737732, + "learning_rate": 3.8627439134900014e-05, + "loss": 1.8721, + "step": 26717 + }, + { + "epoch": 2.420602024869199, + "grad_norm": 1.003403902053833, + "learning_rate": 3.862139793390926e-05, + "loss": 2.6761, + "step": 26718 + }, + { + "epoch": 2.420692623043646, + "grad_norm": 1.031300663948059, + "learning_rate": 3.86153567329185e-05, + "loss": 2.7743, + "step": 26719 + }, + { + "epoch": 2.4207832212180924, + "grad_norm": 0.9947080016136169, + "learning_rate": 3.860931553192775e-05, + "loss": 2.6916, + "step": 26720 + }, + { + "epoch": 2.4208738193925394, + "grad_norm": 1.0547690391540527, + "learning_rate": 3.8603274330937e-05, + "loss": 2.6618, + "step": 26721 + }, + { + "epoch": 2.420964417566986, + "grad_norm": 1.0409457683563232, + "learning_rate": 3.859723312994624e-05, + "loss": 2.6395, + "step": 26722 + }, + { + "epoch": 2.421055015741433, + "grad_norm": 1.0039416551589966, + "learning_rate": 3.859119192895548e-05, + "loss": 2.577, + "step": 26723 + }, + { + "epoch": 2.4211456139158796, + "grad_norm": 1.0232688188552856, + "learning_rate": 3.858515072796472e-05, + "loss": 2.6646, + "step": 26724 + }, + { + "epoch": 2.421236212090326, + "grad_norm": 0.9946345686912537, + "learning_rate": 3.857910952697397e-05, + "loss": 2.558, + "step": 26725 + }, + { + "epoch": 2.421326810264773, + "grad_norm": 1.1189872026443481, + "learning_rate": 3.857306832598321e-05, + "loss": 2.6848, + "step": 26726 + }, + { + "epoch": 2.42141740843922, + "grad_norm": 0.8464004397392273, + "learning_rate": 3.856702712499245e-05, + "loss": 1.2282, + "step": 26727 + }, + { + "epoch": 2.4215080066136667, + "grad_norm": 1.0018565654754639, + "learning_rate": 3.856098592400169e-05, + "loss": 2.3412, + "step": 26728 + }, + { + "epoch": 2.4215986047881133, + "grad_norm": 0.9426217079162598, + "learning_rate": 3.855494472301094e-05, + "loss": 2.5302, + "step": 26729 + }, + { + "epoch": 2.4216892029625603, + "grad_norm": 1.080057978630066, + "learning_rate": 3.854890352202018e-05, + "loss": 2.4423, + "step": 26730 + }, + { + "epoch": 2.4217798011370073, + "grad_norm": 0.9789841771125793, + "learning_rate": 3.854286232102942e-05, + "loss": 2.6496, + "step": 26731 + }, + { + "epoch": 2.421870399311454, + "grad_norm": 1.0008227825164795, + "learning_rate": 3.853682112003866e-05, + "loss": 2.6544, + "step": 26732 + }, + { + "epoch": 2.4219609974859004, + "grad_norm": 1.0156267881393433, + "learning_rate": 3.853077991904791e-05, + "loss": 2.6032, + "step": 26733 + }, + { + "epoch": 2.4220515956603474, + "grad_norm": 0.9106912016868591, + "learning_rate": 3.852473871805715e-05, + "loss": 2.054, + "step": 26734 + }, + { + "epoch": 2.4221421938347945, + "grad_norm": 1.0783205032348633, + "learning_rate": 3.851869751706639e-05, + "loss": 2.5607, + "step": 26735 + }, + { + "epoch": 2.422232792009241, + "grad_norm": 1.0986318588256836, + "learning_rate": 3.851265631607564e-05, + "loss": 2.9102, + "step": 26736 + }, + { + "epoch": 2.4223233901836876, + "grad_norm": 1.0268899202346802, + "learning_rate": 3.8506615115084885e-05, + "loss": 2.6962, + "step": 26737 + }, + { + "epoch": 2.4224139883581346, + "grad_norm": 1.0075719356536865, + "learning_rate": 3.8500573914094126e-05, + "loss": 2.3923, + "step": 26738 + }, + { + "epoch": 2.4225045865325816, + "grad_norm": 0.9912543892860413, + "learning_rate": 3.849453271310337e-05, + "loss": 2.7175, + "step": 26739 + }, + { + "epoch": 2.422595184707028, + "grad_norm": 1.243485927581787, + "learning_rate": 3.8488491512112614e-05, + "loss": 2.3988, + "step": 26740 + }, + { + "epoch": 2.4226857828814747, + "grad_norm": 1.0114864110946655, + "learning_rate": 3.8482450311121855e-05, + "loss": 2.5456, + "step": 26741 + }, + { + "epoch": 2.4227763810559217, + "grad_norm": 0.9719287753105164, + "learning_rate": 3.8476409110131096e-05, + "loss": 1.9187, + "step": 26742 + }, + { + "epoch": 2.4228669792303688, + "grad_norm": 1.0316509008407593, + "learning_rate": 3.847036790914034e-05, + "loss": 2.6441, + "step": 26743 + }, + { + "epoch": 2.4229575774048153, + "grad_norm": 1.068511724472046, + "learning_rate": 3.8464326708149585e-05, + "loss": 2.819, + "step": 26744 + }, + { + "epoch": 2.423048175579262, + "grad_norm": 1.2286368608474731, + "learning_rate": 3.8458285507158825e-05, + "loss": 2.593, + "step": 26745 + }, + { + "epoch": 2.423138773753709, + "grad_norm": 0.991195797920227, + "learning_rate": 3.8452244306168066e-05, + "loss": 2.4307, + "step": 26746 + }, + { + "epoch": 2.4232293719281555, + "grad_norm": 1.0748027563095093, + "learning_rate": 3.844620310517731e-05, + "loss": 2.4908, + "step": 26747 + }, + { + "epoch": 2.4233199701026025, + "grad_norm": 1.1045464277267456, + "learning_rate": 3.8440161904186555e-05, + "loss": 2.6199, + "step": 26748 + }, + { + "epoch": 2.423410568277049, + "grad_norm": 0.9665427207946777, + "learning_rate": 3.8434120703195795e-05, + "loss": 2.3819, + "step": 26749 + }, + { + "epoch": 2.423501166451496, + "grad_norm": 1.0831303596496582, + "learning_rate": 3.8428079502205036e-05, + "loss": 2.4289, + "step": 26750 + }, + { + "epoch": 2.4235917646259426, + "grad_norm": 0.9062042832374573, + "learning_rate": 3.8422038301214284e-05, + "loss": 2.4535, + "step": 26751 + }, + { + "epoch": 2.4236823628003896, + "grad_norm": 1.0625728368759155, + "learning_rate": 3.8415997100223525e-05, + "loss": 2.5467, + "step": 26752 + }, + { + "epoch": 2.423772960974836, + "grad_norm": 1.075595498085022, + "learning_rate": 3.840995589923277e-05, + "loss": 2.6184, + "step": 26753 + }, + { + "epoch": 2.423863559149283, + "grad_norm": 0.9547441005706787, + "learning_rate": 3.840391469824201e-05, + "loss": 2.4776, + "step": 26754 + }, + { + "epoch": 2.4239541573237298, + "grad_norm": 1.004565715789795, + "learning_rate": 3.839787349725126e-05, + "loss": 2.6997, + "step": 26755 + }, + { + "epoch": 2.4240447554981768, + "grad_norm": 1.0214319229125977, + "learning_rate": 3.83918322962605e-05, + "loss": 2.6135, + "step": 26756 + }, + { + "epoch": 2.4241353536726233, + "grad_norm": 1.075164794921875, + "learning_rate": 3.838579109526974e-05, + "loss": 2.5351, + "step": 26757 + }, + { + "epoch": 2.4242259518470703, + "grad_norm": 1.055470585823059, + "learning_rate": 3.837974989427898e-05, + "loss": 2.5317, + "step": 26758 + }, + { + "epoch": 2.424316550021517, + "grad_norm": 1.0531247854232788, + "learning_rate": 3.837370869328823e-05, + "loss": 2.5065, + "step": 26759 + }, + { + "epoch": 2.424407148195964, + "grad_norm": 1.0842725038528442, + "learning_rate": 3.836766749229747e-05, + "loss": 2.7121, + "step": 26760 + }, + { + "epoch": 2.4244977463704105, + "grad_norm": 1.009909987449646, + "learning_rate": 3.836162629130671e-05, + "loss": 2.5147, + "step": 26761 + }, + { + "epoch": 2.4245883445448575, + "grad_norm": 0.969342827796936, + "learning_rate": 3.835558509031595e-05, + "loss": 2.5111, + "step": 26762 + }, + { + "epoch": 2.424678942719304, + "grad_norm": 1.0488699674606323, + "learning_rate": 3.83495438893252e-05, + "loss": 2.6106, + "step": 26763 + }, + { + "epoch": 2.424769540893751, + "grad_norm": 1.0699561834335327, + "learning_rate": 3.834350268833444e-05, + "loss": 2.5194, + "step": 26764 + }, + { + "epoch": 2.4248601390681976, + "grad_norm": 1.0465468168258667, + "learning_rate": 3.833746148734368e-05, + "loss": 2.7259, + "step": 26765 + }, + { + "epoch": 2.4249507372426446, + "grad_norm": 1.1256299018859863, + "learning_rate": 3.833142028635293e-05, + "loss": 2.5451, + "step": 26766 + }, + { + "epoch": 2.425041335417091, + "grad_norm": 1.0120728015899658, + "learning_rate": 3.832537908536217e-05, + "loss": 2.7137, + "step": 26767 + }, + { + "epoch": 2.4251319335915382, + "grad_norm": 1.0318517684936523, + "learning_rate": 3.831933788437141e-05, + "loss": 2.6924, + "step": 26768 + }, + { + "epoch": 2.425222531765985, + "grad_norm": 1.0330615043640137, + "learning_rate": 3.831329668338066e-05, + "loss": 2.6802, + "step": 26769 + }, + { + "epoch": 2.425313129940432, + "grad_norm": 1.015593409538269, + "learning_rate": 3.83072554823899e-05, + "loss": 2.5521, + "step": 26770 + }, + { + "epoch": 2.4254037281148784, + "grad_norm": 0.9498448371887207, + "learning_rate": 3.830121428139915e-05, + "loss": 2.5334, + "step": 26771 + }, + { + "epoch": 2.4254943262893254, + "grad_norm": 1.1487064361572266, + "learning_rate": 3.829517308040839e-05, + "loss": 2.3085, + "step": 26772 + }, + { + "epoch": 2.425584924463772, + "grad_norm": 1.1966675519943237, + "learning_rate": 3.828913187941763e-05, + "loss": 2.5025, + "step": 26773 + }, + { + "epoch": 2.425675522638219, + "grad_norm": 0.9776469469070435, + "learning_rate": 3.828309067842688e-05, + "loss": 2.4595, + "step": 26774 + }, + { + "epoch": 2.4257661208126655, + "grad_norm": 1.013951063156128, + "learning_rate": 3.827704947743612e-05, + "loss": 2.4276, + "step": 26775 + }, + { + "epoch": 2.4258567189871125, + "grad_norm": 0.9320534467697144, + "learning_rate": 3.827100827644536e-05, + "loss": 2.1245, + "step": 26776 + }, + { + "epoch": 2.425947317161559, + "grad_norm": 0.9809989929199219, + "learning_rate": 3.82649670754546e-05, + "loss": 2.7054, + "step": 26777 + }, + { + "epoch": 2.426037915336006, + "grad_norm": 1.1131529808044434, + "learning_rate": 3.825892587446385e-05, + "loss": 2.7269, + "step": 26778 + }, + { + "epoch": 2.4261285135104527, + "grad_norm": 0.9461930990219116, + "learning_rate": 3.825288467347309e-05, + "loss": 2.3774, + "step": 26779 + }, + { + "epoch": 2.4262191116848997, + "grad_norm": 1.0001249313354492, + "learning_rate": 3.824684347248233e-05, + "loss": 2.7891, + "step": 26780 + }, + { + "epoch": 2.4263097098593462, + "grad_norm": 1.0415692329406738, + "learning_rate": 3.8240802271491577e-05, + "loss": 2.7559, + "step": 26781 + }, + { + "epoch": 2.4264003080337933, + "grad_norm": 1.0365426540374756, + "learning_rate": 3.823476107050082e-05, + "loss": 2.6827, + "step": 26782 + }, + { + "epoch": 2.42649090620824, + "grad_norm": 0.9844202995300293, + "learning_rate": 3.822871986951006e-05, + "loss": 2.5388, + "step": 26783 + }, + { + "epoch": 2.426581504382687, + "grad_norm": 1.0260510444641113, + "learning_rate": 3.82226786685193e-05, + "loss": 2.5479, + "step": 26784 + }, + { + "epoch": 2.4266721025571334, + "grad_norm": 1.046311616897583, + "learning_rate": 3.821663746752855e-05, + "loss": 2.6933, + "step": 26785 + }, + { + "epoch": 2.4267627007315804, + "grad_norm": 0.9388464689254761, + "learning_rate": 3.821059626653779e-05, + "loss": 2.1073, + "step": 26786 + }, + { + "epoch": 2.426853298906027, + "grad_norm": 0.9937494397163391, + "learning_rate": 3.8204555065547035e-05, + "loss": 2.5652, + "step": 26787 + }, + { + "epoch": 2.426943897080474, + "grad_norm": 1.0164029598236084, + "learning_rate": 3.8198513864556276e-05, + "loss": 2.2192, + "step": 26788 + }, + { + "epoch": 2.4270344952549205, + "grad_norm": 1.0600191354751587, + "learning_rate": 3.8192472663565524e-05, + "loss": 2.565, + "step": 26789 + }, + { + "epoch": 2.4271250934293676, + "grad_norm": 1.0262819528579712, + "learning_rate": 3.8186431462574764e-05, + "loss": 2.8096, + "step": 26790 + }, + { + "epoch": 2.427215691603814, + "grad_norm": 1.0369802713394165, + "learning_rate": 3.8180390261584005e-05, + "loss": 2.7758, + "step": 26791 + }, + { + "epoch": 2.427306289778261, + "grad_norm": 0.9954445362091064, + "learning_rate": 3.8174349060593246e-05, + "loss": 2.6392, + "step": 26792 + }, + { + "epoch": 2.4273968879527077, + "grad_norm": 1.1793659925460815, + "learning_rate": 3.8168307859602494e-05, + "loss": 2.6247, + "step": 26793 + }, + { + "epoch": 2.4274874861271547, + "grad_norm": 1.0352001190185547, + "learning_rate": 3.8162266658611734e-05, + "loss": 2.697, + "step": 26794 + }, + { + "epoch": 2.4275780843016013, + "grad_norm": 0.9558374285697937, + "learning_rate": 3.8156225457620975e-05, + "loss": 1.8932, + "step": 26795 + }, + { + "epoch": 2.4276686824760483, + "grad_norm": 0.9918084144592285, + "learning_rate": 3.815018425663022e-05, + "loss": 2.6581, + "step": 26796 + }, + { + "epoch": 2.427759280650495, + "grad_norm": 0.8897110223770142, + "learning_rate": 3.8144143055639464e-05, + "loss": 2.1615, + "step": 26797 + }, + { + "epoch": 2.427849878824942, + "grad_norm": 0.9464601278305054, + "learning_rate": 3.8138101854648705e-05, + "loss": 2.5446, + "step": 26798 + }, + { + "epoch": 2.4279404769993884, + "grad_norm": 0.9882115125656128, + "learning_rate": 3.8132060653657945e-05, + "loss": 2.5376, + "step": 26799 + }, + { + "epoch": 2.4280310751738354, + "grad_norm": 0.9930427670478821, + "learning_rate": 3.812601945266719e-05, + "loss": 2.6176, + "step": 26800 + }, + { + "epoch": 2.428121673348282, + "grad_norm": 0.9959698915481567, + "learning_rate": 3.8119978251676434e-05, + "loss": 2.6055, + "step": 26801 + }, + { + "epoch": 2.428212271522729, + "grad_norm": 1.009505033493042, + "learning_rate": 3.8113937050685675e-05, + "loss": 2.5084, + "step": 26802 + }, + { + "epoch": 2.4283028696971756, + "grad_norm": 0.9681984782218933, + "learning_rate": 3.810789584969492e-05, + "loss": 2.0525, + "step": 26803 + }, + { + "epoch": 2.4283934678716226, + "grad_norm": 1.1578667163848877, + "learning_rate": 3.810185464870416e-05, + "loss": 2.5797, + "step": 26804 + }, + { + "epoch": 2.428484066046069, + "grad_norm": 1.0635042190551758, + "learning_rate": 3.809581344771341e-05, + "loss": 2.6795, + "step": 26805 + }, + { + "epoch": 2.428574664220516, + "grad_norm": 1.0661804676055908, + "learning_rate": 3.808977224672265e-05, + "loss": 2.5341, + "step": 26806 + }, + { + "epoch": 2.4286652623949627, + "grad_norm": 0.9853881001472473, + "learning_rate": 3.808373104573189e-05, + "loss": 2.502, + "step": 26807 + }, + { + "epoch": 2.4287558605694093, + "grad_norm": 0.9821219444274902, + "learning_rate": 3.807768984474114e-05, + "loss": 2.7807, + "step": 26808 + }, + { + "epoch": 2.4288464587438563, + "grad_norm": 1.1411703824996948, + "learning_rate": 3.807164864375038e-05, + "loss": 2.4185, + "step": 26809 + }, + { + "epoch": 2.4289370569183033, + "grad_norm": 0.9514387845993042, + "learning_rate": 3.806560744275962e-05, + "loss": 2.6251, + "step": 26810 + }, + { + "epoch": 2.42902765509275, + "grad_norm": 0.8390663862228394, + "learning_rate": 3.805956624176887e-05, + "loss": 1.9129, + "step": 26811 + }, + { + "epoch": 2.4291182532671964, + "grad_norm": 1.039537787437439, + "learning_rate": 3.805352504077811e-05, + "loss": 2.9187, + "step": 26812 + }, + { + "epoch": 2.4292088514416434, + "grad_norm": 0.9615302085876465, + "learning_rate": 3.804748383978735e-05, + "loss": 2.6196, + "step": 26813 + }, + { + "epoch": 2.4292994496160905, + "grad_norm": 1.0613956451416016, + "learning_rate": 3.804144263879659e-05, + "loss": 2.4815, + "step": 26814 + }, + { + "epoch": 2.429390047790537, + "grad_norm": 1.0461691617965698, + "learning_rate": 3.803540143780584e-05, + "loss": 2.645, + "step": 26815 + }, + { + "epoch": 2.4294806459649836, + "grad_norm": 0.9815627932548523, + "learning_rate": 3.802936023681508e-05, + "loss": 2.3066, + "step": 26816 + }, + { + "epoch": 2.4295712441394306, + "grad_norm": 0.9974837899208069, + "learning_rate": 3.802331903582432e-05, + "loss": 2.3654, + "step": 26817 + }, + { + "epoch": 2.4296618423138776, + "grad_norm": 1.0126450061798096, + "learning_rate": 3.801727783483356e-05, + "loss": 2.8628, + "step": 26818 + }, + { + "epoch": 2.429752440488324, + "grad_norm": 1.011284351348877, + "learning_rate": 3.801123663384281e-05, + "loss": 2.4813, + "step": 26819 + }, + { + "epoch": 2.4298430386627707, + "grad_norm": 1.0760985612869263, + "learning_rate": 3.800519543285205e-05, + "loss": 2.8788, + "step": 26820 + }, + { + "epoch": 2.4299336368372177, + "grad_norm": 0.9664290547370911, + "learning_rate": 3.79991542318613e-05, + "loss": 2.4218, + "step": 26821 + }, + { + "epoch": 2.4300242350116648, + "grad_norm": 1.0160834789276123, + "learning_rate": 3.799311303087054e-05, + "loss": 2.8807, + "step": 26822 + }, + { + "epoch": 2.4301148331861113, + "grad_norm": 0.9930752515792847, + "learning_rate": 3.7987071829879786e-05, + "loss": 2.7519, + "step": 26823 + }, + { + "epoch": 2.430205431360558, + "grad_norm": 0.7822825908660889, + "learning_rate": 3.798103062888903e-05, + "loss": 1.6808, + "step": 26824 + }, + { + "epoch": 2.430296029535005, + "grad_norm": 0.9903915524482727, + "learning_rate": 3.797498942789827e-05, + "loss": 2.8491, + "step": 26825 + }, + { + "epoch": 2.4303866277094515, + "grad_norm": 1.05716872215271, + "learning_rate": 3.7968948226907516e-05, + "loss": 2.5098, + "step": 26826 + }, + { + "epoch": 2.4304772258838985, + "grad_norm": 0.9870560765266418, + "learning_rate": 3.7962907025916756e-05, + "loss": 2.6602, + "step": 26827 + }, + { + "epoch": 2.430567824058345, + "grad_norm": 0.9080556631088257, + "learning_rate": 3.7956865824926e-05, + "loss": 1.9295, + "step": 26828 + }, + { + "epoch": 2.430658422232792, + "grad_norm": 1.0962629318237305, + "learning_rate": 3.795082462393524e-05, + "loss": 2.8786, + "step": 26829 + }, + { + "epoch": 2.4307490204072386, + "grad_norm": 0.9905121326446533, + "learning_rate": 3.7944783422944486e-05, + "loss": 2.5443, + "step": 26830 + }, + { + "epoch": 2.4308396185816856, + "grad_norm": 0.9989166259765625, + "learning_rate": 3.7938742221953726e-05, + "loss": 2.7489, + "step": 26831 + }, + { + "epoch": 2.430930216756132, + "grad_norm": 1.1063822507858276, + "learning_rate": 3.793270102096297e-05, + "loss": 3.0038, + "step": 26832 + }, + { + "epoch": 2.431020814930579, + "grad_norm": 0.9709409475326538, + "learning_rate": 3.792665981997221e-05, + "loss": 2.6229, + "step": 26833 + }, + { + "epoch": 2.4311114131050258, + "grad_norm": 0.9965871572494507, + "learning_rate": 3.7920618618981456e-05, + "loss": 2.0269, + "step": 26834 + }, + { + "epoch": 2.4312020112794728, + "grad_norm": 1.02584707736969, + "learning_rate": 3.7914577417990697e-05, + "loss": 2.6513, + "step": 26835 + }, + { + "epoch": 2.4312926094539193, + "grad_norm": 1.0617872476577759, + "learning_rate": 3.790853621699994e-05, + "loss": 2.6471, + "step": 26836 + }, + { + "epoch": 2.4313832076283663, + "grad_norm": 0.9888498783111572, + "learning_rate": 3.7902495016009185e-05, + "loss": 2.4449, + "step": 26837 + }, + { + "epoch": 2.431473805802813, + "grad_norm": 1.0032910108566284, + "learning_rate": 3.7896453815018426e-05, + "loss": 2.607, + "step": 26838 + }, + { + "epoch": 2.43156440397726, + "grad_norm": 1.0460090637207031, + "learning_rate": 3.7890412614027673e-05, + "loss": 2.6589, + "step": 26839 + }, + { + "epoch": 2.4316550021517065, + "grad_norm": 0.9515749216079712, + "learning_rate": 3.7884371413036914e-05, + "loss": 2.5103, + "step": 26840 + }, + { + "epoch": 2.4317456003261535, + "grad_norm": 1.0229324102401733, + "learning_rate": 3.787833021204616e-05, + "loss": 2.6874, + "step": 26841 + }, + { + "epoch": 2.4318361985006, + "grad_norm": 1.0529963970184326, + "learning_rate": 3.78722890110554e-05, + "loss": 2.711, + "step": 26842 + }, + { + "epoch": 2.431926796675047, + "grad_norm": 1.0777614116668701, + "learning_rate": 3.7866247810064644e-05, + "loss": 2.3498, + "step": 26843 + }, + { + "epoch": 2.4320173948494936, + "grad_norm": 0.8868266344070435, + "learning_rate": 3.7860206609073884e-05, + "loss": 1.956, + "step": 26844 + }, + { + "epoch": 2.4321079930239407, + "grad_norm": 1.0243494510650635, + "learning_rate": 3.785416540808313e-05, + "loss": 2.8416, + "step": 26845 + }, + { + "epoch": 2.432198591198387, + "grad_norm": 0.946211576461792, + "learning_rate": 3.784812420709237e-05, + "loss": 2.5002, + "step": 26846 + }, + { + "epoch": 2.4322891893728342, + "grad_norm": 1.021122694015503, + "learning_rate": 3.7842083006101614e-05, + "loss": 2.4507, + "step": 26847 + }, + { + "epoch": 2.432379787547281, + "grad_norm": 0.9561730623245239, + "learning_rate": 3.7836041805110854e-05, + "loss": 2.4616, + "step": 26848 + }, + { + "epoch": 2.432470385721728, + "grad_norm": 0.9484532475471497, + "learning_rate": 3.78300006041201e-05, + "loss": 2.4683, + "step": 26849 + }, + { + "epoch": 2.4325609838961744, + "grad_norm": 0.9709742069244385, + "learning_rate": 3.782395940312934e-05, + "loss": 2.4825, + "step": 26850 + }, + { + "epoch": 2.4326515820706214, + "grad_norm": 0.9958620667457581, + "learning_rate": 3.7817918202138584e-05, + "loss": 2.4879, + "step": 26851 + }, + { + "epoch": 2.432742180245068, + "grad_norm": 1.117174744606018, + "learning_rate": 3.7811877001147825e-05, + "loss": 2.3655, + "step": 26852 + }, + { + "epoch": 2.432832778419515, + "grad_norm": 0.879124104976654, + "learning_rate": 3.780583580015707e-05, + "loss": 2.0357, + "step": 26853 + }, + { + "epoch": 2.4329233765939615, + "grad_norm": 0.9611133933067322, + "learning_rate": 3.779979459916631e-05, + "loss": 2.4595, + "step": 26854 + }, + { + "epoch": 2.4330139747684085, + "grad_norm": 0.86031174659729, + "learning_rate": 3.779375339817556e-05, + "loss": 1.8919, + "step": 26855 + }, + { + "epoch": 2.433104572942855, + "grad_norm": 0.9211136102676392, + "learning_rate": 3.77877121971848e-05, + "loss": 2.4897, + "step": 26856 + }, + { + "epoch": 2.433195171117302, + "grad_norm": 1.0866810083389282, + "learning_rate": 3.778167099619405e-05, + "loss": 2.8451, + "step": 26857 + }, + { + "epoch": 2.4332857692917487, + "grad_norm": 1.0466512441635132, + "learning_rate": 3.777562979520329e-05, + "loss": 2.7254, + "step": 26858 + }, + { + "epoch": 2.4333763674661957, + "grad_norm": 0.9827839136123657, + "learning_rate": 3.776958859421253e-05, + "loss": 2.4213, + "step": 26859 + }, + { + "epoch": 2.4334669656406422, + "grad_norm": 1.1027698516845703, + "learning_rate": 3.776354739322178e-05, + "loss": 2.5862, + "step": 26860 + }, + { + "epoch": 2.4335575638150893, + "grad_norm": 1.0524612665176392, + "learning_rate": 3.775750619223102e-05, + "loss": 2.7874, + "step": 26861 + }, + { + "epoch": 2.433648161989536, + "grad_norm": 1.0977696180343628, + "learning_rate": 3.775146499124026e-05, + "loss": 2.4989, + "step": 26862 + }, + { + "epoch": 2.433738760163983, + "grad_norm": 0.9905154705047607, + "learning_rate": 3.77454237902495e-05, + "loss": 2.651, + "step": 26863 + }, + { + "epoch": 2.4338293583384294, + "grad_norm": 1.0068061351776123, + "learning_rate": 3.773938258925875e-05, + "loss": 2.7212, + "step": 26864 + }, + { + "epoch": 2.4339199565128764, + "grad_norm": 1.054421305656433, + "learning_rate": 3.773334138826799e-05, + "loss": 2.8204, + "step": 26865 + }, + { + "epoch": 2.434010554687323, + "grad_norm": 1.1189312934875488, + "learning_rate": 3.772730018727723e-05, + "loss": 2.6841, + "step": 26866 + }, + { + "epoch": 2.43410115286177, + "grad_norm": 1.065980315208435, + "learning_rate": 3.772125898628647e-05, + "loss": 2.838, + "step": 26867 + }, + { + "epoch": 2.4341917510362165, + "grad_norm": 0.9866963624954224, + "learning_rate": 3.771521778529572e-05, + "loss": 2.4774, + "step": 26868 + }, + { + "epoch": 2.4342823492106636, + "grad_norm": 1.168217420578003, + "learning_rate": 3.770917658430496e-05, + "loss": 2.6066, + "step": 26869 + }, + { + "epoch": 2.43437294738511, + "grad_norm": 0.9952583312988281, + "learning_rate": 3.77031353833142e-05, + "loss": 2.7538, + "step": 26870 + }, + { + "epoch": 2.434463545559557, + "grad_norm": 0.9830792546272278, + "learning_rate": 3.769709418232345e-05, + "loss": 2.5861, + "step": 26871 + }, + { + "epoch": 2.4345541437340037, + "grad_norm": 1.071612000465393, + "learning_rate": 3.769105298133269e-05, + "loss": 2.508, + "step": 26872 + }, + { + "epoch": 2.4346447419084507, + "grad_norm": 0.9991208910942078, + "learning_rate": 3.7685011780341936e-05, + "loss": 2.7191, + "step": 26873 + }, + { + "epoch": 2.4347353400828973, + "grad_norm": 0.967732846736908, + "learning_rate": 3.767897057935118e-05, + "loss": 2.6043, + "step": 26874 + }, + { + "epoch": 2.4348259382573443, + "grad_norm": 0.9279183745384216, + "learning_rate": 3.7672929378360425e-05, + "loss": 2.121, + "step": 26875 + }, + { + "epoch": 2.434916536431791, + "grad_norm": 0.9706891179084778, + "learning_rate": 3.7666888177369665e-05, + "loss": 1.9739, + "step": 26876 + }, + { + "epoch": 2.435007134606238, + "grad_norm": 0.9776198267936707, + "learning_rate": 3.7660846976378906e-05, + "loss": 2.7786, + "step": 26877 + }, + { + "epoch": 2.4350977327806844, + "grad_norm": 0.9793291687965393, + "learning_rate": 3.765480577538815e-05, + "loss": 2.5669, + "step": 26878 + }, + { + "epoch": 2.4351883309551314, + "grad_norm": 0.9669315218925476, + "learning_rate": 3.7648764574397395e-05, + "loss": 2.6995, + "step": 26879 + }, + { + "epoch": 2.435278929129578, + "grad_norm": 0.9852941632270813, + "learning_rate": 3.7642723373406636e-05, + "loss": 2.7607, + "step": 26880 + }, + { + "epoch": 2.435369527304025, + "grad_norm": 0.9501402378082275, + "learning_rate": 3.7636682172415876e-05, + "loss": 2.8072, + "step": 26881 + }, + { + "epoch": 2.4354601254784716, + "grad_norm": 0.9901970624923706, + "learning_rate": 3.763064097142512e-05, + "loss": 2.4965, + "step": 26882 + }, + { + "epoch": 2.4355507236529186, + "grad_norm": 0.8152176141738892, + "learning_rate": 3.7624599770434365e-05, + "loss": 2.0409, + "step": 26883 + }, + { + "epoch": 2.435641321827365, + "grad_norm": 0.9817392826080322, + "learning_rate": 3.7618558569443606e-05, + "loss": 2.7446, + "step": 26884 + }, + { + "epoch": 2.435731920001812, + "grad_norm": 0.9448914527893066, + "learning_rate": 3.7612517368452846e-05, + "loss": 2.5134, + "step": 26885 + }, + { + "epoch": 2.4358225181762587, + "grad_norm": 0.986335277557373, + "learning_rate": 3.7606476167462094e-05, + "loss": 2.5043, + "step": 26886 + }, + { + "epoch": 2.4359131163507053, + "grad_norm": 0.964945375919342, + "learning_rate": 3.7600434966471335e-05, + "loss": 2.4906, + "step": 26887 + }, + { + "epoch": 2.4360037145251523, + "grad_norm": 1.0340853929519653, + "learning_rate": 3.7594393765480576e-05, + "loss": 2.6716, + "step": 26888 + }, + { + "epoch": 2.4360943126995993, + "grad_norm": 0.8462927341461182, + "learning_rate": 3.758835256448982e-05, + "loss": 1.9569, + "step": 26889 + }, + { + "epoch": 2.436184910874046, + "grad_norm": 0.9980928301811218, + "learning_rate": 3.758231136349907e-05, + "loss": 2.7028, + "step": 26890 + }, + { + "epoch": 2.4362755090484924, + "grad_norm": 0.8835949301719666, + "learning_rate": 3.757627016250831e-05, + "loss": 2.0665, + "step": 26891 + }, + { + "epoch": 2.4363661072229394, + "grad_norm": 0.9084327816963196, + "learning_rate": 3.757022896151755e-05, + "loss": 2.0338, + "step": 26892 + }, + { + "epoch": 2.4364567053973865, + "grad_norm": 0.994874894618988, + "learning_rate": 3.7564187760526793e-05, + "loss": 2.6031, + "step": 26893 + }, + { + "epoch": 2.436547303571833, + "grad_norm": 0.9911819100379944, + "learning_rate": 3.755814655953604e-05, + "loss": 2.5449, + "step": 26894 + }, + { + "epoch": 2.4366379017462796, + "grad_norm": 0.932616651058197, + "learning_rate": 3.755210535854528e-05, + "loss": 1.9311, + "step": 26895 + }, + { + "epoch": 2.4367284999207266, + "grad_norm": 1.0861430168151855, + "learning_rate": 3.754606415755452e-05, + "loss": 2.8082, + "step": 26896 + }, + { + "epoch": 2.4368190980951736, + "grad_norm": 0.9116972088813782, + "learning_rate": 3.7540022956563764e-05, + "loss": 2.0019, + "step": 26897 + }, + { + "epoch": 2.43690969626962, + "grad_norm": 0.9440469741821289, + "learning_rate": 3.753398175557301e-05, + "loss": 2.446, + "step": 26898 + }, + { + "epoch": 2.4370002944440667, + "grad_norm": 1.0351988077163696, + "learning_rate": 3.752794055458225e-05, + "loss": 2.4717, + "step": 26899 + }, + { + "epoch": 2.4370908926185137, + "grad_norm": 1.054604411125183, + "learning_rate": 3.752189935359149e-05, + "loss": 2.409, + "step": 26900 + }, + { + "epoch": 2.4371814907929608, + "grad_norm": 0.9647099375724792, + "learning_rate": 3.751585815260074e-05, + "loss": 2.3272, + "step": 26901 + }, + { + "epoch": 2.4372720889674073, + "grad_norm": 1.0034469366073608, + "learning_rate": 3.750981695160998e-05, + "loss": 2.7106, + "step": 26902 + }, + { + "epoch": 2.437362687141854, + "grad_norm": 1.1354902982711792, + "learning_rate": 3.750377575061922e-05, + "loss": 2.5815, + "step": 26903 + }, + { + "epoch": 2.437453285316301, + "grad_norm": 0.8858702182769775, + "learning_rate": 3.749773454962846e-05, + "loss": 2.4066, + "step": 26904 + }, + { + "epoch": 2.437543883490748, + "grad_norm": 1.0054341554641724, + "learning_rate": 3.749169334863771e-05, + "loss": 2.3267, + "step": 26905 + }, + { + "epoch": 2.4376344816651945, + "grad_norm": 0.9553573131561279, + "learning_rate": 3.748565214764695e-05, + "loss": 2.4293, + "step": 26906 + }, + { + "epoch": 2.437725079839641, + "grad_norm": 1.2101629972457886, + "learning_rate": 3.74796109466562e-05, + "loss": 2.6907, + "step": 26907 + }, + { + "epoch": 2.437815678014088, + "grad_norm": 1.1939454078674316, + "learning_rate": 3.747356974566544e-05, + "loss": 2.4289, + "step": 26908 + }, + { + "epoch": 2.4379062761885346, + "grad_norm": 0.9920828938484192, + "learning_rate": 3.746752854467469e-05, + "loss": 2.7116, + "step": 26909 + }, + { + "epoch": 2.4379968743629816, + "grad_norm": 0.9016709327697754, + "learning_rate": 3.746148734368393e-05, + "loss": 2.0132, + "step": 26910 + }, + { + "epoch": 2.438087472537428, + "grad_norm": 1.044134497642517, + "learning_rate": 3.745544614269317e-05, + "loss": 2.4393, + "step": 26911 + }, + { + "epoch": 2.438178070711875, + "grad_norm": 1.0631587505340576, + "learning_rate": 3.744940494170241e-05, + "loss": 2.5619, + "step": 26912 + }, + { + "epoch": 2.4382686688863218, + "grad_norm": 0.8631560802459717, + "learning_rate": 3.744336374071166e-05, + "loss": 2.0437, + "step": 26913 + }, + { + "epoch": 2.4383592670607688, + "grad_norm": 0.9318866729736328, + "learning_rate": 3.74373225397209e-05, + "loss": 2.7976, + "step": 26914 + }, + { + "epoch": 2.4384498652352153, + "grad_norm": 0.9531300067901611, + "learning_rate": 3.743128133873014e-05, + "loss": 2.542, + "step": 26915 + }, + { + "epoch": 2.4385404634096624, + "grad_norm": 1.097123146057129, + "learning_rate": 3.742524013773939e-05, + "loss": 2.6482, + "step": 26916 + }, + { + "epoch": 2.438631061584109, + "grad_norm": 1.1088770627975464, + "learning_rate": 3.741919893674863e-05, + "loss": 2.9243, + "step": 26917 + }, + { + "epoch": 2.438721659758556, + "grad_norm": 0.979672908782959, + "learning_rate": 3.741315773575787e-05, + "loss": 2.5644, + "step": 26918 + }, + { + "epoch": 2.4388122579330025, + "grad_norm": 1.014011263847351, + "learning_rate": 3.740711653476711e-05, + "loss": 2.6677, + "step": 26919 + }, + { + "epoch": 2.4389028561074495, + "grad_norm": 1.0876740217208862, + "learning_rate": 3.740107533377636e-05, + "loss": 2.6257, + "step": 26920 + }, + { + "epoch": 2.438993454281896, + "grad_norm": 1.0319304466247559, + "learning_rate": 3.73950341327856e-05, + "loss": 2.657, + "step": 26921 + }, + { + "epoch": 2.439084052456343, + "grad_norm": 1.1027896404266357, + "learning_rate": 3.738899293179484e-05, + "loss": 2.7028, + "step": 26922 + }, + { + "epoch": 2.4391746506307896, + "grad_norm": 0.9903475642204285, + "learning_rate": 3.7382951730804086e-05, + "loss": 2.758, + "step": 26923 + }, + { + "epoch": 2.4392652488052367, + "grad_norm": 0.9942598342895508, + "learning_rate": 3.7376910529813334e-05, + "loss": 2.6421, + "step": 26924 + }, + { + "epoch": 2.439355846979683, + "grad_norm": 0.9717987179756165, + "learning_rate": 3.7370869328822575e-05, + "loss": 2.6176, + "step": 26925 + }, + { + "epoch": 2.4394464451541302, + "grad_norm": 1.0776702165603638, + "learning_rate": 3.7364828127831815e-05, + "loss": 2.819, + "step": 26926 + }, + { + "epoch": 2.439537043328577, + "grad_norm": 1.0686614513397217, + "learning_rate": 3.7358786926841056e-05, + "loss": 2.7347, + "step": 26927 + }, + { + "epoch": 2.439627641503024, + "grad_norm": 0.9736006855964661, + "learning_rate": 3.7352745725850304e-05, + "loss": 2.8877, + "step": 26928 + }, + { + "epoch": 2.4397182396774704, + "grad_norm": 1.0001399517059326, + "learning_rate": 3.7346704524859545e-05, + "loss": 2.6476, + "step": 26929 + }, + { + "epoch": 2.4398088378519174, + "grad_norm": 1.0159722566604614, + "learning_rate": 3.7340663323868785e-05, + "loss": 2.8655, + "step": 26930 + }, + { + "epoch": 2.439899436026364, + "grad_norm": 1.0374970436096191, + "learning_rate": 3.733462212287803e-05, + "loss": 2.7342, + "step": 26931 + }, + { + "epoch": 2.439990034200811, + "grad_norm": 0.9881963729858398, + "learning_rate": 3.7328580921887274e-05, + "loss": 2.391, + "step": 26932 + }, + { + "epoch": 2.4400806323752575, + "grad_norm": 0.9976596236228943, + "learning_rate": 3.7322539720896515e-05, + "loss": 2.6438, + "step": 26933 + }, + { + "epoch": 2.4401712305497045, + "grad_norm": 1.0285300016403198, + "learning_rate": 3.7316498519905756e-05, + "loss": 2.8001, + "step": 26934 + }, + { + "epoch": 2.440261828724151, + "grad_norm": 1.0863162279129028, + "learning_rate": 3.7310457318915e-05, + "loss": 2.8656, + "step": 26935 + }, + { + "epoch": 2.440352426898598, + "grad_norm": 1.008754014968872, + "learning_rate": 3.7304416117924244e-05, + "loss": 2.7699, + "step": 26936 + }, + { + "epoch": 2.4404430250730447, + "grad_norm": 0.9811878204345703, + "learning_rate": 3.7298374916933485e-05, + "loss": 2.5906, + "step": 26937 + }, + { + "epoch": 2.4405336232474917, + "grad_norm": 1.0165777206420898, + "learning_rate": 3.7292333715942726e-05, + "loss": 2.746, + "step": 26938 + }, + { + "epoch": 2.4406242214219382, + "grad_norm": 1.0320308208465576, + "learning_rate": 3.728629251495197e-05, + "loss": 2.6999, + "step": 26939 + }, + { + "epoch": 2.4407148195963853, + "grad_norm": 1.0502232313156128, + "learning_rate": 3.728025131396122e-05, + "loss": 2.7151, + "step": 26940 + }, + { + "epoch": 2.440805417770832, + "grad_norm": 0.9579252600669861, + "learning_rate": 3.727421011297046e-05, + "loss": 2.4117, + "step": 26941 + }, + { + "epoch": 2.440896015945279, + "grad_norm": 1.0901700258255005, + "learning_rate": 3.72681689119797e-05, + "loss": 2.1868, + "step": 26942 + }, + { + "epoch": 2.4409866141197254, + "grad_norm": 1.0158967971801758, + "learning_rate": 3.726212771098895e-05, + "loss": 2.6609, + "step": 26943 + }, + { + "epoch": 2.4410772122941724, + "grad_norm": 0.9330418109893799, + "learning_rate": 3.725608650999819e-05, + "loss": 2.8328, + "step": 26944 + }, + { + "epoch": 2.441167810468619, + "grad_norm": 0.9900937676429749, + "learning_rate": 3.725004530900743e-05, + "loss": 2.7069, + "step": 26945 + }, + { + "epoch": 2.441258408643066, + "grad_norm": 1.000099778175354, + "learning_rate": 3.724400410801668e-05, + "loss": 2.5662, + "step": 26946 + }, + { + "epoch": 2.4413490068175125, + "grad_norm": 1.097737193107605, + "learning_rate": 3.723796290702592e-05, + "loss": 2.5162, + "step": 26947 + }, + { + "epoch": 2.4414396049919596, + "grad_norm": 0.9722066521644592, + "learning_rate": 3.723192170603516e-05, + "loss": 2.5582, + "step": 26948 + }, + { + "epoch": 2.441530203166406, + "grad_norm": 0.9998865127563477, + "learning_rate": 3.72258805050444e-05, + "loss": 2.6299, + "step": 26949 + }, + { + "epoch": 2.441620801340853, + "grad_norm": 1.0043584108352661, + "learning_rate": 3.721983930405365e-05, + "loss": 2.4402, + "step": 26950 + }, + { + "epoch": 2.4417113995152997, + "grad_norm": 1.0096276998519897, + "learning_rate": 3.721379810306289e-05, + "loss": 2.7362, + "step": 26951 + }, + { + "epoch": 2.4418019976897467, + "grad_norm": 1.0581822395324707, + "learning_rate": 3.720775690207213e-05, + "loss": 2.4586, + "step": 26952 + }, + { + "epoch": 2.4418925958641933, + "grad_norm": 1.0924155712127686, + "learning_rate": 3.720171570108137e-05, + "loss": 2.4676, + "step": 26953 + }, + { + "epoch": 2.4419831940386403, + "grad_norm": 1.1381399631500244, + "learning_rate": 3.719567450009062e-05, + "loss": 2.6627, + "step": 26954 + }, + { + "epoch": 2.442073792213087, + "grad_norm": 1.0727550983428955, + "learning_rate": 3.718963329909986e-05, + "loss": 2.4934, + "step": 26955 + }, + { + "epoch": 2.442164390387534, + "grad_norm": 0.9294736385345459, + "learning_rate": 3.718359209810911e-05, + "loss": 2.0825, + "step": 26956 + }, + { + "epoch": 2.4422549885619804, + "grad_norm": 0.9974209666252136, + "learning_rate": 3.717755089711835e-05, + "loss": 2.5738, + "step": 26957 + }, + { + "epoch": 2.4423455867364274, + "grad_norm": 0.9743835926055908, + "learning_rate": 3.7171509696127596e-05, + "loss": 2.5922, + "step": 26958 + }, + { + "epoch": 2.442436184910874, + "grad_norm": 1.015289545059204, + "learning_rate": 3.716546849513684e-05, + "loss": 2.6189, + "step": 26959 + }, + { + "epoch": 2.442526783085321, + "grad_norm": 0.908828854560852, + "learning_rate": 3.715942729414608e-05, + "loss": 2.0757, + "step": 26960 + }, + { + "epoch": 2.4426173812597676, + "grad_norm": 0.9192240238189697, + "learning_rate": 3.7153386093155326e-05, + "loss": 2.4945, + "step": 26961 + }, + { + "epoch": 2.4427079794342146, + "grad_norm": 0.9188807010650635, + "learning_rate": 3.7147344892164567e-05, + "loss": 2.3494, + "step": 26962 + }, + { + "epoch": 2.442798577608661, + "grad_norm": 1.0046805143356323, + "learning_rate": 3.714130369117381e-05, + "loss": 2.4836, + "step": 26963 + }, + { + "epoch": 2.442889175783108, + "grad_norm": 1.0157309770584106, + "learning_rate": 3.713526249018305e-05, + "loss": 2.7274, + "step": 26964 + }, + { + "epoch": 2.4429797739575547, + "grad_norm": 1.0070089101791382, + "learning_rate": 3.7129221289192296e-05, + "loss": 2.7172, + "step": 26965 + }, + { + "epoch": 2.4430703721320017, + "grad_norm": 1.0108455419540405, + "learning_rate": 3.7123180088201537e-05, + "loss": 3.0924, + "step": 26966 + }, + { + "epoch": 2.4431609703064483, + "grad_norm": 1.0074408054351807, + "learning_rate": 3.711713888721078e-05, + "loss": 2.8989, + "step": 26967 + }, + { + "epoch": 2.4432515684808953, + "grad_norm": 0.9729233980178833, + "learning_rate": 3.711109768622002e-05, + "loss": 2.5122, + "step": 26968 + }, + { + "epoch": 2.443342166655342, + "grad_norm": 0.9986327886581421, + "learning_rate": 3.7105056485229266e-05, + "loss": 2.6351, + "step": 26969 + }, + { + "epoch": 2.4434327648297884, + "grad_norm": 0.9872602224349976, + "learning_rate": 3.709901528423851e-05, + "loss": 2.5074, + "step": 26970 + }, + { + "epoch": 2.4435233630042354, + "grad_norm": 0.9058818221092224, + "learning_rate": 3.709297408324775e-05, + "loss": 2.1407, + "step": 26971 + }, + { + "epoch": 2.4436139611786825, + "grad_norm": 0.9045591950416565, + "learning_rate": 3.708693288225699e-05, + "loss": 1.9805, + "step": 26972 + }, + { + "epoch": 2.443704559353129, + "grad_norm": 1.031522274017334, + "learning_rate": 3.7080891681266236e-05, + "loss": 2.6737, + "step": 26973 + }, + { + "epoch": 2.4437951575275756, + "grad_norm": 1.0504547357559204, + "learning_rate": 3.7074850480275484e-05, + "loss": 2.6524, + "step": 26974 + }, + { + "epoch": 2.4438857557020226, + "grad_norm": 1.0244756937026978, + "learning_rate": 3.7068809279284724e-05, + "loss": 2.7666, + "step": 26975 + }, + { + "epoch": 2.4439763538764696, + "grad_norm": 1.0010751485824585, + "learning_rate": 3.706276807829397e-05, + "loss": 2.6863, + "step": 26976 + }, + { + "epoch": 2.444066952050916, + "grad_norm": 0.9419021010398865, + "learning_rate": 3.705672687730321e-05, + "loss": 2.3376, + "step": 26977 + }, + { + "epoch": 2.4441575502253627, + "grad_norm": 1.0073314905166626, + "learning_rate": 3.7050685676312454e-05, + "loss": 2.7309, + "step": 26978 + }, + { + "epoch": 2.4442481483998098, + "grad_norm": 1.0420914888381958, + "learning_rate": 3.7044644475321694e-05, + "loss": 2.6339, + "step": 26979 + }, + { + "epoch": 2.4443387465742568, + "grad_norm": 0.9834699630737305, + "learning_rate": 3.703860327433094e-05, + "loss": 2.7228, + "step": 26980 + }, + { + "epoch": 2.4444293447487033, + "grad_norm": 0.9707427024841309, + "learning_rate": 3.703256207334018e-05, + "loss": 2.7776, + "step": 26981 + }, + { + "epoch": 2.44451994292315, + "grad_norm": 1.0389320850372314, + "learning_rate": 3.7026520872349424e-05, + "loss": 2.6922, + "step": 26982 + }, + { + "epoch": 2.444610541097597, + "grad_norm": 0.9975647926330566, + "learning_rate": 3.7020479671358665e-05, + "loss": 2.5746, + "step": 26983 + }, + { + "epoch": 2.444701139272044, + "grad_norm": 0.9778184294700623, + "learning_rate": 3.701443847036791e-05, + "loss": 2.356, + "step": 26984 + }, + { + "epoch": 2.4447917374464905, + "grad_norm": 1.0606800317764282, + "learning_rate": 3.700839726937715e-05, + "loss": 2.6842, + "step": 26985 + }, + { + "epoch": 2.444882335620937, + "grad_norm": 1.0199482440948486, + "learning_rate": 3.7002356068386394e-05, + "loss": 2.5835, + "step": 26986 + }, + { + "epoch": 2.444972933795384, + "grad_norm": 0.867266833782196, + "learning_rate": 3.699631486739564e-05, + "loss": 2.085, + "step": 26987 + }, + { + "epoch": 2.4450635319698306, + "grad_norm": 1.038559913635254, + "learning_rate": 3.699027366640488e-05, + "loss": 2.5354, + "step": 26988 + }, + { + "epoch": 2.4451541301442776, + "grad_norm": 1.053205966949463, + "learning_rate": 3.698423246541412e-05, + "loss": 2.7358, + "step": 26989 + }, + { + "epoch": 2.445244728318724, + "grad_norm": 1.199270248413086, + "learning_rate": 3.697819126442337e-05, + "loss": 1.874, + "step": 26990 + }, + { + "epoch": 2.445335326493171, + "grad_norm": 0.9965556263923645, + "learning_rate": 3.697215006343261e-05, + "loss": 2.5243, + "step": 26991 + }, + { + "epoch": 2.4454259246676178, + "grad_norm": 1.0420047044754028, + "learning_rate": 3.696610886244186e-05, + "loss": 2.6057, + "step": 26992 + }, + { + "epoch": 2.4455165228420648, + "grad_norm": 0.9854512810707092, + "learning_rate": 3.69600676614511e-05, + "loss": 2.3076, + "step": 26993 + }, + { + "epoch": 2.4456071210165113, + "grad_norm": 1.0040597915649414, + "learning_rate": 3.695402646046034e-05, + "loss": 2.8293, + "step": 26994 + }, + { + "epoch": 2.4456977191909584, + "grad_norm": 0.9528499245643616, + "learning_rate": 3.694798525946959e-05, + "loss": 2.5624, + "step": 26995 + }, + { + "epoch": 2.445788317365405, + "grad_norm": 1.083795428276062, + "learning_rate": 3.694194405847883e-05, + "loss": 2.7235, + "step": 26996 + }, + { + "epoch": 2.445878915539852, + "grad_norm": 0.9515275955200195, + "learning_rate": 3.693590285748807e-05, + "loss": 2.613, + "step": 26997 + }, + { + "epoch": 2.4459695137142985, + "grad_norm": 1.039231538772583, + "learning_rate": 3.692986165649731e-05, + "loss": 2.6625, + "step": 26998 + }, + { + "epoch": 2.4460601118887455, + "grad_norm": 0.9914910197257996, + "learning_rate": 3.692382045550656e-05, + "loss": 2.4802, + "step": 26999 + }, + { + "epoch": 2.446150710063192, + "grad_norm": 1.0722192525863647, + "learning_rate": 3.69177792545158e-05, + "loss": 2.6011, + "step": 27000 + }, + { + "epoch": 2.446241308237639, + "grad_norm": 1.0258433818817139, + "learning_rate": 3.691173805352504e-05, + "loss": 2.6886, + "step": 27001 + }, + { + "epoch": 2.4463319064120856, + "grad_norm": 0.9599880576133728, + "learning_rate": 3.690569685253429e-05, + "loss": 2.4646, + "step": 27002 + }, + { + "epoch": 2.4464225045865327, + "grad_norm": 1.160568356513977, + "learning_rate": 3.689965565154353e-05, + "loss": 2.7513, + "step": 27003 + }, + { + "epoch": 2.446513102760979, + "grad_norm": 0.9879041910171509, + "learning_rate": 3.689361445055277e-05, + "loss": 2.6532, + "step": 27004 + }, + { + "epoch": 2.4466037009354262, + "grad_norm": 1.0282063484191895, + "learning_rate": 3.688757324956201e-05, + "loss": 2.5455, + "step": 27005 + }, + { + "epoch": 2.446694299109873, + "grad_norm": 1.0510443449020386, + "learning_rate": 3.688153204857126e-05, + "loss": 2.453, + "step": 27006 + }, + { + "epoch": 2.44678489728432, + "grad_norm": 0.9915019273757935, + "learning_rate": 3.68754908475805e-05, + "loss": 2.6255, + "step": 27007 + }, + { + "epoch": 2.4468754954587664, + "grad_norm": 0.8980015516281128, + "learning_rate": 3.6869449646589746e-05, + "loss": 2.0352, + "step": 27008 + }, + { + "epoch": 2.4469660936332134, + "grad_norm": 0.9158027768135071, + "learning_rate": 3.686340844559899e-05, + "loss": 2.0555, + "step": 27009 + }, + { + "epoch": 2.44705669180766, + "grad_norm": 1.0360021591186523, + "learning_rate": 3.6857367244608235e-05, + "loss": 2.6386, + "step": 27010 + }, + { + "epoch": 2.447147289982107, + "grad_norm": 0.9863322377204895, + "learning_rate": 3.6851326043617476e-05, + "loss": 1.9017, + "step": 27011 + }, + { + "epoch": 2.4472378881565535, + "grad_norm": 0.9810447692871094, + "learning_rate": 3.6845284842626716e-05, + "loss": 2.535, + "step": 27012 + }, + { + "epoch": 2.4473284863310005, + "grad_norm": 1.0598468780517578, + "learning_rate": 3.683924364163596e-05, + "loss": 2.6136, + "step": 27013 + }, + { + "epoch": 2.447419084505447, + "grad_norm": 1.0039654970169067, + "learning_rate": 3.6833202440645205e-05, + "loss": 2.5457, + "step": 27014 + }, + { + "epoch": 2.447509682679894, + "grad_norm": 0.9194020628929138, + "learning_rate": 3.6827161239654446e-05, + "loss": 2.5251, + "step": 27015 + }, + { + "epoch": 2.4476002808543407, + "grad_norm": 0.9876812100410461, + "learning_rate": 3.6821120038663686e-05, + "loss": 2.8767, + "step": 27016 + }, + { + "epoch": 2.4476908790287877, + "grad_norm": 1.1491022109985352, + "learning_rate": 3.6815078837672934e-05, + "loss": 3.0309, + "step": 27017 + }, + { + "epoch": 2.4477814772032342, + "grad_norm": 1.0371078252792358, + "learning_rate": 3.6809037636682175e-05, + "loss": 2.3387, + "step": 27018 + }, + { + "epoch": 2.4478720753776813, + "grad_norm": 1.014838695526123, + "learning_rate": 3.6802996435691416e-05, + "loss": 2.6684, + "step": 27019 + }, + { + "epoch": 2.447962673552128, + "grad_norm": 0.9088861346244812, + "learning_rate": 3.6796955234700657e-05, + "loss": 2.436, + "step": 27020 + }, + { + "epoch": 2.448053271726575, + "grad_norm": 1.0257972478866577, + "learning_rate": 3.6790914033709904e-05, + "loss": 2.6526, + "step": 27021 + }, + { + "epoch": 2.4481438699010214, + "grad_norm": 1.0230343341827393, + "learning_rate": 3.6784872832719145e-05, + "loss": 2.6897, + "step": 27022 + }, + { + "epoch": 2.4482344680754684, + "grad_norm": 0.9124354720115662, + "learning_rate": 3.6778831631728386e-05, + "loss": 1.8575, + "step": 27023 + }, + { + "epoch": 2.448325066249915, + "grad_norm": 0.910862386226654, + "learning_rate": 3.6772790430737633e-05, + "loss": 1.7026, + "step": 27024 + }, + { + "epoch": 2.448415664424362, + "grad_norm": 1.03577721118927, + "learning_rate": 3.6766749229746874e-05, + "loss": 2.6362, + "step": 27025 + }, + { + "epoch": 2.4485062625988085, + "grad_norm": 0.9900491833686829, + "learning_rate": 3.676070802875612e-05, + "loss": 2.5238, + "step": 27026 + }, + { + "epoch": 2.4485968607732556, + "grad_norm": 1.0324385166168213, + "learning_rate": 3.675466682776536e-05, + "loss": 2.556, + "step": 27027 + }, + { + "epoch": 2.448687458947702, + "grad_norm": 0.9696211814880371, + "learning_rate": 3.6748625626774604e-05, + "loss": 2.4815, + "step": 27028 + }, + { + "epoch": 2.448778057122149, + "grad_norm": 0.9839722514152527, + "learning_rate": 3.674258442578385e-05, + "loss": 2.5229, + "step": 27029 + }, + { + "epoch": 2.4488686552965957, + "grad_norm": 1.0016779899597168, + "learning_rate": 3.673654322479309e-05, + "loss": 2.4724, + "step": 27030 + }, + { + "epoch": 2.4489592534710427, + "grad_norm": 0.9623323678970337, + "learning_rate": 3.673050202380233e-05, + "loss": 2.4132, + "step": 27031 + }, + { + "epoch": 2.4490498516454893, + "grad_norm": 0.9885806441307068, + "learning_rate": 3.672446082281158e-05, + "loss": 2.6145, + "step": 27032 + }, + { + "epoch": 2.4491404498199363, + "grad_norm": 0.9952616095542908, + "learning_rate": 3.671841962182082e-05, + "loss": 2.7209, + "step": 27033 + }, + { + "epoch": 2.449231047994383, + "grad_norm": 0.9812711477279663, + "learning_rate": 3.671237842083006e-05, + "loss": 2.521, + "step": 27034 + }, + { + "epoch": 2.44932164616883, + "grad_norm": 1.1586782932281494, + "learning_rate": 3.67063372198393e-05, + "loss": 2.5613, + "step": 27035 + }, + { + "epoch": 2.4494122443432764, + "grad_norm": 1.0369431972503662, + "learning_rate": 3.670029601884855e-05, + "loss": 2.5358, + "step": 27036 + }, + { + "epoch": 2.4495028425177234, + "grad_norm": 1.0256162881851196, + "learning_rate": 3.669425481785779e-05, + "loss": 2.5522, + "step": 27037 + }, + { + "epoch": 2.44959344069217, + "grad_norm": 0.9497537612915039, + "learning_rate": 3.668821361686703e-05, + "loss": 1.9905, + "step": 27038 + }, + { + "epoch": 2.449684038866617, + "grad_norm": 1.0296200513839722, + "learning_rate": 3.668217241587627e-05, + "loss": 2.5138, + "step": 27039 + }, + { + "epoch": 2.4497746370410636, + "grad_norm": 1.030551791191101, + "learning_rate": 3.667613121488552e-05, + "loss": 2.7633, + "step": 27040 + }, + { + "epoch": 2.4498652352155106, + "grad_norm": 1.0439319610595703, + "learning_rate": 3.667009001389476e-05, + "loss": 2.6298, + "step": 27041 + }, + { + "epoch": 2.449955833389957, + "grad_norm": 1.0673996210098267, + "learning_rate": 3.666404881290401e-05, + "loss": 2.4592, + "step": 27042 + }, + { + "epoch": 2.450046431564404, + "grad_norm": 1.0177429914474487, + "learning_rate": 3.665800761191325e-05, + "loss": 2.5378, + "step": 27043 + }, + { + "epoch": 2.4501370297388507, + "grad_norm": 1.0102605819702148, + "learning_rate": 3.66519664109225e-05, + "loss": 2.5551, + "step": 27044 + }, + { + "epoch": 2.4502276279132977, + "grad_norm": 1.0126136541366577, + "learning_rate": 3.664592520993174e-05, + "loss": 2.7308, + "step": 27045 + }, + { + "epoch": 2.4503182260877443, + "grad_norm": 0.8869131803512573, + "learning_rate": 3.663988400894098e-05, + "loss": 2.1807, + "step": 27046 + }, + { + "epoch": 2.4504088242621913, + "grad_norm": 1.0099643468856812, + "learning_rate": 3.663384280795023e-05, + "loss": 2.7573, + "step": 27047 + }, + { + "epoch": 2.450499422436638, + "grad_norm": 1.0196336507797241, + "learning_rate": 3.662780160695947e-05, + "loss": 2.9773, + "step": 27048 + }, + { + "epoch": 2.4505900206110844, + "grad_norm": 0.9816144108772278, + "learning_rate": 3.662176040596871e-05, + "loss": 2.6993, + "step": 27049 + }, + { + "epoch": 2.4506806187855315, + "grad_norm": 1.1172821521759033, + "learning_rate": 3.661571920497795e-05, + "loss": 2.6155, + "step": 27050 + }, + { + "epoch": 2.4507712169599785, + "grad_norm": 1.038750171661377, + "learning_rate": 3.66096780039872e-05, + "loss": 2.9744, + "step": 27051 + }, + { + "epoch": 2.450861815134425, + "grad_norm": 1.1936744451522827, + "learning_rate": 3.660363680299644e-05, + "loss": 2.3459, + "step": 27052 + }, + { + "epoch": 2.4509524133088716, + "grad_norm": 0.9916218519210815, + "learning_rate": 3.659759560200568e-05, + "loss": 2.6996, + "step": 27053 + }, + { + "epoch": 2.4510430114833186, + "grad_norm": 1.0546194314956665, + "learning_rate": 3.659155440101492e-05, + "loss": 2.5209, + "step": 27054 + }, + { + "epoch": 2.4511336096577656, + "grad_norm": 1.0969719886779785, + "learning_rate": 3.658551320002417e-05, + "loss": 2.5027, + "step": 27055 + }, + { + "epoch": 2.451224207832212, + "grad_norm": 1.1018325090408325, + "learning_rate": 3.657947199903341e-05, + "loss": 2.79, + "step": 27056 + }, + { + "epoch": 2.4513148060066587, + "grad_norm": 0.971001148223877, + "learning_rate": 3.657343079804265e-05, + "loss": 2.5111, + "step": 27057 + }, + { + "epoch": 2.4514054041811058, + "grad_norm": 0.9802680611610413, + "learning_rate": 3.6567389597051896e-05, + "loss": 2.5981, + "step": 27058 + }, + { + "epoch": 2.4514960023555528, + "grad_norm": 0.9538020491600037, + "learning_rate": 3.656134839606114e-05, + "loss": 2.5686, + "step": 27059 + }, + { + "epoch": 2.4515866005299993, + "grad_norm": 0.9696673154830933, + "learning_rate": 3.6555307195070385e-05, + "loss": 2.0179, + "step": 27060 + }, + { + "epoch": 2.451677198704446, + "grad_norm": 1.0112372636795044, + "learning_rate": 3.6549265994079625e-05, + "loss": 2.8244, + "step": 27061 + }, + { + "epoch": 2.451767796878893, + "grad_norm": 0.9642331004142761, + "learning_rate": 3.654322479308887e-05, + "loss": 2.5239, + "step": 27062 + }, + { + "epoch": 2.45185839505334, + "grad_norm": 0.9847930073738098, + "learning_rate": 3.6537183592098114e-05, + "loss": 2.4976, + "step": 27063 + }, + { + "epoch": 2.4519489932277865, + "grad_norm": 1.1102081537246704, + "learning_rate": 3.6531142391107355e-05, + "loss": 2.4581, + "step": 27064 + }, + { + "epoch": 2.452039591402233, + "grad_norm": 1.0700870752334595, + "learning_rate": 3.6525101190116596e-05, + "loss": 2.5888, + "step": 27065 + }, + { + "epoch": 2.45213018957668, + "grad_norm": 0.964391827583313, + "learning_rate": 3.651905998912584e-05, + "loss": 2.5223, + "step": 27066 + }, + { + "epoch": 2.452220787751127, + "grad_norm": 1.0872520208358765, + "learning_rate": 3.6513018788135084e-05, + "loss": 2.3137, + "step": 27067 + }, + { + "epoch": 2.4523113859255736, + "grad_norm": 1.1357930898666382, + "learning_rate": 3.6506977587144325e-05, + "loss": 2.6083, + "step": 27068 + }, + { + "epoch": 2.45240198410002, + "grad_norm": 1.11483633518219, + "learning_rate": 3.6500936386153566e-05, + "loss": 2.6354, + "step": 27069 + }, + { + "epoch": 2.452492582274467, + "grad_norm": 0.9305627346038818, + "learning_rate": 3.649489518516281e-05, + "loss": 2.5792, + "step": 27070 + }, + { + "epoch": 2.4525831804489138, + "grad_norm": 1.0622897148132324, + "learning_rate": 3.6488853984172054e-05, + "loss": 2.5548, + "step": 27071 + }, + { + "epoch": 2.452673778623361, + "grad_norm": 0.9995141625404358, + "learning_rate": 3.6482812783181295e-05, + "loss": 2.53, + "step": 27072 + }, + { + "epoch": 2.4527643767978073, + "grad_norm": 0.9725211262702942, + "learning_rate": 3.6476771582190536e-05, + "loss": 2.4318, + "step": 27073 + }, + { + "epoch": 2.4528549749722544, + "grad_norm": 1.0056493282318115, + "learning_rate": 3.647073038119978e-05, + "loss": 2.9263, + "step": 27074 + }, + { + "epoch": 2.452945573146701, + "grad_norm": 1.061113953590393, + "learning_rate": 3.6464689180209024e-05, + "loss": 2.6442, + "step": 27075 + }, + { + "epoch": 2.453036171321148, + "grad_norm": 0.9827350378036499, + "learning_rate": 3.645864797921827e-05, + "loss": 2.858, + "step": 27076 + }, + { + "epoch": 2.4531267694955945, + "grad_norm": 1.0413298606872559, + "learning_rate": 3.645260677822752e-05, + "loss": 2.738, + "step": 27077 + }, + { + "epoch": 2.4532173676700415, + "grad_norm": 0.9422804713249207, + "learning_rate": 3.644656557723676e-05, + "loss": 2.3525, + "step": 27078 + }, + { + "epoch": 2.453307965844488, + "grad_norm": 0.9661418795585632, + "learning_rate": 3.6440524376246e-05, + "loss": 2.7139, + "step": 27079 + }, + { + "epoch": 2.453398564018935, + "grad_norm": 0.9802595376968384, + "learning_rate": 3.643448317525524e-05, + "loss": 2.5709, + "step": 27080 + }, + { + "epoch": 2.4534891621933816, + "grad_norm": 1.231153964996338, + "learning_rate": 3.642844197426449e-05, + "loss": 2.7237, + "step": 27081 + }, + { + "epoch": 2.4535797603678287, + "grad_norm": 0.9798353314399719, + "learning_rate": 3.642240077327373e-05, + "loss": 2.4594, + "step": 27082 + }, + { + "epoch": 2.453670358542275, + "grad_norm": 0.9793251752853394, + "learning_rate": 3.641635957228297e-05, + "loss": 1.995, + "step": 27083 + }, + { + "epoch": 2.4537609567167222, + "grad_norm": 0.9889220595359802, + "learning_rate": 3.641031837129221e-05, + "loss": 2.5445, + "step": 27084 + }, + { + "epoch": 2.453851554891169, + "grad_norm": 0.877716064453125, + "learning_rate": 3.640427717030146e-05, + "loss": 1.7707, + "step": 27085 + }, + { + "epoch": 2.453942153065616, + "grad_norm": 0.9849480986595154, + "learning_rate": 3.63982359693107e-05, + "loss": 2.6206, + "step": 27086 + }, + { + "epoch": 2.4540327512400624, + "grad_norm": 0.9197669625282288, + "learning_rate": 3.639219476831994e-05, + "loss": 1.9369, + "step": 27087 + }, + { + "epoch": 2.4541233494145094, + "grad_norm": 0.978350818157196, + "learning_rate": 3.638615356732918e-05, + "loss": 2.6985, + "step": 27088 + }, + { + "epoch": 2.454213947588956, + "grad_norm": 1.051605463027954, + "learning_rate": 3.638011236633843e-05, + "loss": 2.7355, + "step": 27089 + }, + { + "epoch": 2.454304545763403, + "grad_norm": 1.1307014226913452, + "learning_rate": 3.637407116534767e-05, + "loss": 2.6132, + "step": 27090 + }, + { + "epoch": 2.4543951439378495, + "grad_norm": 0.9588204026222229, + "learning_rate": 3.636802996435691e-05, + "loss": 2.5591, + "step": 27091 + }, + { + "epoch": 2.4544857421122965, + "grad_norm": 1.0058889389038086, + "learning_rate": 3.636198876336616e-05, + "loss": 2.6105, + "step": 27092 + }, + { + "epoch": 2.454576340286743, + "grad_norm": 0.9466814398765564, + "learning_rate": 3.63559475623754e-05, + "loss": 2.5498, + "step": 27093 + }, + { + "epoch": 2.45466693846119, + "grad_norm": 1.0270155668258667, + "learning_rate": 3.634990636138465e-05, + "loss": 2.7907, + "step": 27094 + }, + { + "epoch": 2.4547575366356367, + "grad_norm": 0.929981529712677, + "learning_rate": 3.634386516039389e-05, + "loss": 2.5504, + "step": 27095 + }, + { + "epoch": 2.4548481348100837, + "grad_norm": 0.9975142478942871, + "learning_rate": 3.6337823959403136e-05, + "loss": 2.5479, + "step": 27096 + }, + { + "epoch": 2.4549387329845302, + "grad_norm": 0.8420780897140503, + "learning_rate": 3.633178275841238e-05, + "loss": 1.7918, + "step": 27097 + }, + { + "epoch": 2.4550293311589773, + "grad_norm": 0.9616134166717529, + "learning_rate": 3.632574155742162e-05, + "loss": 2.4806, + "step": 27098 + }, + { + "epoch": 2.455119929333424, + "grad_norm": 1.0098426342010498, + "learning_rate": 3.631970035643086e-05, + "loss": 2.6447, + "step": 27099 + }, + { + "epoch": 2.455210527507871, + "grad_norm": 1.1181913614273071, + "learning_rate": 3.6313659155440106e-05, + "loss": 2.7145, + "step": 27100 + }, + { + "epoch": 2.4553011256823174, + "grad_norm": 1.0428426265716553, + "learning_rate": 3.630761795444935e-05, + "loss": 2.7156, + "step": 27101 + }, + { + "epoch": 2.4553917238567644, + "grad_norm": 0.985857367515564, + "learning_rate": 3.630157675345859e-05, + "loss": 2.4888, + "step": 27102 + }, + { + "epoch": 2.455482322031211, + "grad_norm": 1.0019865036010742, + "learning_rate": 3.629553555246783e-05, + "loss": 2.469, + "step": 27103 + }, + { + "epoch": 2.455572920205658, + "grad_norm": 1.0313911437988281, + "learning_rate": 3.6289494351477076e-05, + "loss": 2.8999, + "step": 27104 + }, + { + "epoch": 2.4556635183801045, + "grad_norm": 1.0972139835357666, + "learning_rate": 3.628345315048632e-05, + "loss": 2.4707, + "step": 27105 + }, + { + "epoch": 2.4557541165545516, + "grad_norm": 1.0242525339126587, + "learning_rate": 3.627741194949556e-05, + "loss": 2.817, + "step": 27106 + }, + { + "epoch": 2.455844714728998, + "grad_norm": 1.0592623949050903, + "learning_rate": 3.6271370748504805e-05, + "loss": 2.5781, + "step": 27107 + }, + { + "epoch": 2.455935312903445, + "grad_norm": 0.97732013463974, + "learning_rate": 3.6265329547514046e-05, + "loss": 2.4396, + "step": 27108 + }, + { + "epoch": 2.4560259110778917, + "grad_norm": 0.9527752995491028, + "learning_rate": 3.625928834652329e-05, + "loss": 2.4256, + "step": 27109 + }, + { + "epoch": 2.4561165092523387, + "grad_norm": 0.9953362941741943, + "learning_rate": 3.6253247145532535e-05, + "loss": 2.839, + "step": 27110 + }, + { + "epoch": 2.4562071074267853, + "grad_norm": 0.988492488861084, + "learning_rate": 3.624720594454178e-05, + "loss": 2.4666, + "step": 27111 + }, + { + "epoch": 2.4562977056012323, + "grad_norm": 0.9089058041572571, + "learning_rate": 3.624116474355102e-05, + "loss": 1.903, + "step": 27112 + }, + { + "epoch": 2.456388303775679, + "grad_norm": 0.935810387134552, + "learning_rate": 3.6235123542560264e-05, + "loss": 2.453, + "step": 27113 + }, + { + "epoch": 2.456478901950126, + "grad_norm": 1.0147756338119507, + "learning_rate": 3.6229082341569505e-05, + "loss": 2.647, + "step": 27114 + }, + { + "epoch": 2.4565695001245724, + "grad_norm": 1.0894817113876343, + "learning_rate": 3.622304114057875e-05, + "loss": 2.6164, + "step": 27115 + }, + { + "epoch": 2.4566600982990194, + "grad_norm": 0.955040454864502, + "learning_rate": 3.621699993958799e-05, + "loss": 2.6101, + "step": 27116 + }, + { + "epoch": 2.456750696473466, + "grad_norm": 1.0034074783325195, + "learning_rate": 3.6210958738597234e-05, + "loss": 2.7508, + "step": 27117 + }, + { + "epoch": 2.456841294647913, + "grad_norm": 1.0633865594863892, + "learning_rate": 3.6204917537606475e-05, + "loss": 2.7735, + "step": 27118 + }, + { + "epoch": 2.4569318928223596, + "grad_norm": 1.0497742891311646, + "learning_rate": 3.619887633661572e-05, + "loss": 2.6766, + "step": 27119 + }, + { + "epoch": 2.4570224909968066, + "grad_norm": 0.9659339189529419, + "learning_rate": 3.619283513562496e-05, + "loss": 2.5764, + "step": 27120 + }, + { + "epoch": 2.457113089171253, + "grad_norm": 1.0460968017578125, + "learning_rate": 3.6186793934634204e-05, + "loss": 2.6954, + "step": 27121 + }, + { + "epoch": 2.4572036873457, + "grad_norm": 0.9374131560325623, + "learning_rate": 3.618075273364345e-05, + "loss": 2.6859, + "step": 27122 + }, + { + "epoch": 2.4572942855201467, + "grad_norm": 1.0316096544265747, + "learning_rate": 3.617471153265269e-05, + "loss": 2.6708, + "step": 27123 + }, + { + "epoch": 2.4573848836945937, + "grad_norm": 1.044122576713562, + "learning_rate": 3.616867033166193e-05, + "loss": 2.4882, + "step": 27124 + }, + { + "epoch": 2.4574754818690403, + "grad_norm": 0.9957994818687439, + "learning_rate": 3.6162629130671174e-05, + "loss": 2.4358, + "step": 27125 + }, + { + "epoch": 2.4575660800434873, + "grad_norm": 1.1214555501937866, + "learning_rate": 3.615658792968042e-05, + "loss": 2.7731, + "step": 27126 + }, + { + "epoch": 2.457656678217934, + "grad_norm": 1.1227318048477173, + "learning_rate": 3.615054672868967e-05, + "loss": 2.6123, + "step": 27127 + }, + { + "epoch": 2.457747276392381, + "grad_norm": 1.071946620941162, + "learning_rate": 3.614450552769891e-05, + "loss": 2.4383, + "step": 27128 + }, + { + "epoch": 2.4578378745668275, + "grad_norm": 0.9847451448440552, + "learning_rate": 3.613846432670815e-05, + "loss": 2.4314, + "step": 27129 + }, + { + "epoch": 2.4579284727412745, + "grad_norm": 0.8884822726249695, + "learning_rate": 3.61324231257174e-05, + "loss": 1.992, + "step": 27130 + }, + { + "epoch": 2.458019070915721, + "grad_norm": 0.9013583064079285, + "learning_rate": 3.612638192472664e-05, + "loss": 1.8401, + "step": 27131 + }, + { + "epoch": 2.4581096690901676, + "grad_norm": 0.9498261213302612, + "learning_rate": 3.612034072373588e-05, + "loss": 2.5983, + "step": 27132 + }, + { + "epoch": 2.4582002672646146, + "grad_norm": 1.073261022567749, + "learning_rate": 3.611429952274512e-05, + "loss": 2.6944, + "step": 27133 + }, + { + "epoch": 2.4582908654390616, + "grad_norm": 0.9966921806335449, + "learning_rate": 3.610825832175437e-05, + "loss": 2.413, + "step": 27134 + }, + { + "epoch": 2.458381463613508, + "grad_norm": 0.9805190563201904, + "learning_rate": 3.610221712076361e-05, + "loss": 2.5791, + "step": 27135 + }, + { + "epoch": 2.4584720617879547, + "grad_norm": 1.0936895608901978, + "learning_rate": 3.609617591977285e-05, + "loss": 2.7093, + "step": 27136 + }, + { + "epoch": 2.4585626599624018, + "grad_norm": 0.9955267906188965, + "learning_rate": 3.60901347187821e-05, + "loss": 2.5529, + "step": 27137 + }, + { + "epoch": 2.4586532581368488, + "grad_norm": 1.0232363939285278, + "learning_rate": 3.608409351779134e-05, + "loss": 2.7315, + "step": 27138 + }, + { + "epoch": 2.4587438563112953, + "grad_norm": 1.0203155279159546, + "learning_rate": 3.607805231680058e-05, + "loss": 2.7076, + "step": 27139 + }, + { + "epoch": 2.458834454485742, + "grad_norm": 1.065098524093628, + "learning_rate": 3.607201111580982e-05, + "loss": 2.8704, + "step": 27140 + }, + { + "epoch": 2.458925052660189, + "grad_norm": 0.9477559328079224, + "learning_rate": 3.606596991481907e-05, + "loss": 2.8452, + "step": 27141 + }, + { + "epoch": 2.459015650834636, + "grad_norm": 0.9819745421409607, + "learning_rate": 3.605992871382831e-05, + "loss": 2.4446, + "step": 27142 + }, + { + "epoch": 2.4591062490090825, + "grad_norm": 0.9743182063102722, + "learning_rate": 3.605388751283755e-05, + "loss": 2.3988, + "step": 27143 + }, + { + "epoch": 2.459196847183529, + "grad_norm": 0.9795365929603577, + "learning_rate": 3.60478463118468e-05, + "loss": 2.7986, + "step": 27144 + }, + { + "epoch": 2.459287445357976, + "grad_norm": 0.9663891196250916, + "learning_rate": 3.6041805110856045e-05, + "loss": 2.8348, + "step": 27145 + }, + { + "epoch": 2.459378043532423, + "grad_norm": 1.0163798332214355, + "learning_rate": 3.6035763909865286e-05, + "loss": 2.4868, + "step": 27146 + }, + { + "epoch": 2.4594686417068696, + "grad_norm": 1.044036626815796, + "learning_rate": 3.6029722708874527e-05, + "loss": 2.881, + "step": 27147 + }, + { + "epoch": 2.459559239881316, + "grad_norm": 1.0639071464538574, + "learning_rate": 3.602368150788377e-05, + "loss": 2.6514, + "step": 27148 + }, + { + "epoch": 2.459649838055763, + "grad_norm": 0.9454957246780396, + "learning_rate": 3.6017640306893015e-05, + "loss": 2.5994, + "step": 27149 + }, + { + "epoch": 2.4597404362302098, + "grad_norm": 1.0028183460235596, + "learning_rate": 3.6011599105902256e-05, + "loss": 2.7212, + "step": 27150 + }, + { + "epoch": 2.459831034404657, + "grad_norm": 1.0200835466384888, + "learning_rate": 3.60055579049115e-05, + "loss": 2.2613, + "step": 27151 + }, + { + "epoch": 2.4599216325791033, + "grad_norm": 0.9411371350288391, + "learning_rate": 3.5999516703920744e-05, + "loss": 1.7561, + "step": 27152 + }, + { + "epoch": 2.4600122307535504, + "grad_norm": 0.9670097231864929, + "learning_rate": 3.5993475502929985e-05, + "loss": 2.6931, + "step": 27153 + }, + { + "epoch": 2.460102828927997, + "grad_norm": 1.011712670326233, + "learning_rate": 3.5987434301939226e-05, + "loss": 2.7021, + "step": 27154 + }, + { + "epoch": 2.460193427102444, + "grad_norm": 1.0452643632888794, + "learning_rate": 3.598139310094847e-05, + "loss": 2.5427, + "step": 27155 + }, + { + "epoch": 2.4602840252768905, + "grad_norm": 0.9757522940635681, + "learning_rate": 3.5975351899957714e-05, + "loss": 2.5582, + "step": 27156 + }, + { + "epoch": 2.4603746234513375, + "grad_norm": 1.0084552764892578, + "learning_rate": 3.5969310698966955e-05, + "loss": 2.6417, + "step": 27157 + }, + { + "epoch": 2.460465221625784, + "grad_norm": 0.8777901530265808, + "learning_rate": 3.5963269497976196e-05, + "loss": 1.8618, + "step": 27158 + }, + { + "epoch": 2.460555819800231, + "grad_norm": 1.1160616874694824, + "learning_rate": 3.595722829698544e-05, + "loss": 2.288, + "step": 27159 + }, + { + "epoch": 2.4606464179746776, + "grad_norm": 1.0261329412460327, + "learning_rate": 3.5951187095994684e-05, + "loss": 2.5796, + "step": 27160 + }, + { + "epoch": 2.4607370161491247, + "grad_norm": 1.0228744745254517, + "learning_rate": 3.594514589500393e-05, + "loss": 2.6281, + "step": 27161 + }, + { + "epoch": 2.460827614323571, + "grad_norm": 1.0047261714935303, + "learning_rate": 3.593910469401317e-05, + "loss": 2.7341, + "step": 27162 + }, + { + "epoch": 2.4609182124980182, + "grad_norm": 0.9593146443367004, + "learning_rate": 3.5933063493022414e-05, + "loss": 2.5382, + "step": 27163 + }, + { + "epoch": 2.461008810672465, + "grad_norm": 0.9769907593727112, + "learning_rate": 3.592702229203166e-05, + "loss": 2.4567, + "step": 27164 + }, + { + "epoch": 2.461099408846912, + "grad_norm": 1.1380605697631836, + "learning_rate": 3.59209810910409e-05, + "loss": 2.4364, + "step": 27165 + }, + { + "epoch": 2.4611900070213584, + "grad_norm": 0.9918513894081116, + "learning_rate": 3.591493989005014e-05, + "loss": 2.4721, + "step": 27166 + }, + { + "epoch": 2.4612806051958054, + "grad_norm": 1.0483719110488892, + "learning_rate": 3.590889868905939e-05, + "loss": 2.5985, + "step": 27167 + }, + { + "epoch": 2.461371203370252, + "grad_norm": 1.0826432704925537, + "learning_rate": 3.590285748806863e-05, + "loss": 2.7769, + "step": 27168 + }, + { + "epoch": 2.461461801544699, + "grad_norm": 0.9781640768051147, + "learning_rate": 3.589681628707787e-05, + "loss": 2.5787, + "step": 27169 + }, + { + "epoch": 2.4615523997191455, + "grad_norm": 1.0359841585159302, + "learning_rate": 3.589077508608711e-05, + "loss": 2.6884, + "step": 27170 + }, + { + "epoch": 2.4616429978935925, + "grad_norm": 0.8771339654922485, + "learning_rate": 3.588473388509636e-05, + "loss": 2.0556, + "step": 27171 + }, + { + "epoch": 2.461733596068039, + "grad_norm": 0.9963441491127014, + "learning_rate": 3.58786926841056e-05, + "loss": 2.4478, + "step": 27172 + }, + { + "epoch": 2.461824194242486, + "grad_norm": 0.9260521531105042, + "learning_rate": 3.587265148311484e-05, + "loss": 2.6243, + "step": 27173 + }, + { + "epoch": 2.4619147924169327, + "grad_norm": 0.9489021301269531, + "learning_rate": 3.586661028212408e-05, + "loss": 2.8171, + "step": 27174 + }, + { + "epoch": 2.4620053905913797, + "grad_norm": 0.9743290543556213, + "learning_rate": 3.586056908113333e-05, + "loss": 2.6341, + "step": 27175 + }, + { + "epoch": 2.4620959887658262, + "grad_norm": 1.016595721244812, + "learning_rate": 3.585452788014257e-05, + "loss": 2.5959, + "step": 27176 + }, + { + "epoch": 2.4621865869402733, + "grad_norm": 0.9796852469444275, + "learning_rate": 3.584848667915182e-05, + "loss": 2.6879, + "step": 27177 + }, + { + "epoch": 2.46227718511472, + "grad_norm": 0.9508451223373413, + "learning_rate": 3.584244547816106e-05, + "loss": 2.5834, + "step": 27178 + }, + { + "epoch": 2.462367783289167, + "grad_norm": 0.9862967133522034, + "learning_rate": 3.583640427717031e-05, + "loss": 2.5111, + "step": 27179 + }, + { + "epoch": 2.4624583814636134, + "grad_norm": 0.9895160794258118, + "learning_rate": 3.583036307617955e-05, + "loss": 2.3238, + "step": 27180 + }, + { + "epoch": 2.4625489796380604, + "grad_norm": 1.028256893157959, + "learning_rate": 3.582432187518879e-05, + "loss": 2.67, + "step": 27181 + }, + { + "epoch": 2.462639577812507, + "grad_norm": 0.9581395983695984, + "learning_rate": 3.581828067419804e-05, + "loss": 1.6588, + "step": 27182 + }, + { + "epoch": 2.462730175986954, + "grad_norm": 1.0334851741790771, + "learning_rate": 3.581223947320728e-05, + "loss": 2.649, + "step": 27183 + }, + { + "epoch": 2.4628207741614005, + "grad_norm": 1.1844723224639893, + "learning_rate": 3.580619827221652e-05, + "loss": 2.4761, + "step": 27184 + }, + { + "epoch": 2.4629113723358476, + "grad_norm": 0.9315896034240723, + "learning_rate": 3.580015707122576e-05, + "loss": 2.0623, + "step": 27185 + }, + { + "epoch": 2.463001970510294, + "grad_norm": 1.0812896490097046, + "learning_rate": 3.579411587023501e-05, + "loss": 2.717, + "step": 27186 + }, + { + "epoch": 2.463092568684741, + "grad_norm": 1.0726414918899536, + "learning_rate": 3.578807466924425e-05, + "loss": 2.7616, + "step": 27187 + }, + { + "epoch": 2.4631831668591877, + "grad_norm": 1.019016981124878, + "learning_rate": 3.578203346825349e-05, + "loss": 2.5545, + "step": 27188 + }, + { + "epoch": 2.4632737650336347, + "grad_norm": 1.021997094154358, + "learning_rate": 3.577599226726273e-05, + "loss": 2.4755, + "step": 27189 + }, + { + "epoch": 2.4633643632080813, + "grad_norm": 0.9225133061408997, + "learning_rate": 3.576995106627198e-05, + "loss": 2.1299, + "step": 27190 + }, + { + "epoch": 2.4634549613825283, + "grad_norm": 1.0044673681259155, + "learning_rate": 3.576390986528122e-05, + "loss": 2.5344, + "step": 27191 + }, + { + "epoch": 2.463545559556975, + "grad_norm": 1.0117093324661255, + "learning_rate": 3.575786866429046e-05, + "loss": 1.965, + "step": 27192 + }, + { + "epoch": 2.463636157731422, + "grad_norm": 1.0497311353683472, + "learning_rate": 3.5751827463299706e-05, + "loss": 2.3788, + "step": 27193 + }, + { + "epoch": 2.4637267559058684, + "grad_norm": 0.9023634195327759, + "learning_rate": 3.574578626230895e-05, + "loss": 2.0385, + "step": 27194 + }, + { + "epoch": 2.4638173540803154, + "grad_norm": 0.9632201790809631, + "learning_rate": 3.5739745061318195e-05, + "loss": 2.7856, + "step": 27195 + }, + { + "epoch": 2.463907952254762, + "grad_norm": 1.005077600479126, + "learning_rate": 3.5733703860327436e-05, + "loss": 2.4806, + "step": 27196 + }, + { + "epoch": 2.463998550429209, + "grad_norm": 0.9832914471626282, + "learning_rate": 3.572766265933668e-05, + "loss": 2.6316, + "step": 27197 + }, + { + "epoch": 2.4640891486036556, + "grad_norm": 1.0316290855407715, + "learning_rate": 3.5721621458345924e-05, + "loss": 2.6385, + "step": 27198 + }, + { + "epoch": 2.4641797467781026, + "grad_norm": 0.9304162859916687, + "learning_rate": 3.5715580257355165e-05, + "loss": 2.5535, + "step": 27199 + }, + { + "epoch": 2.464270344952549, + "grad_norm": 0.9739006161689758, + "learning_rate": 3.5709539056364406e-05, + "loss": 2.7815, + "step": 27200 + }, + { + "epoch": 2.464360943126996, + "grad_norm": 1.0069398880004883, + "learning_rate": 3.570349785537365e-05, + "loss": 2.5749, + "step": 27201 + }, + { + "epoch": 2.4644515413014427, + "grad_norm": 0.9775124192237854, + "learning_rate": 3.5697456654382894e-05, + "loss": 2.5809, + "step": 27202 + }, + { + "epoch": 2.4645421394758897, + "grad_norm": 1.0078752040863037, + "learning_rate": 3.5691415453392135e-05, + "loss": 2.5429, + "step": 27203 + }, + { + "epoch": 2.4646327376503363, + "grad_norm": 1.1514825820922852, + "learning_rate": 3.5685374252401376e-05, + "loss": 2.7156, + "step": 27204 + }, + { + "epoch": 2.4647233358247833, + "grad_norm": 1.0365729331970215, + "learning_rate": 3.5679333051410623e-05, + "loss": 2.6125, + "step": 27205 + }, + { + "epoch": 2.46481393399923, + "grad_norm": 1.041266679763794, + "learning_rate": 3.5673291850419864e-05, + "loss": 2.4256, + "step": 27206 + }, + { + "epoch": 2.464904532173677, + "grad_norm": 1.1032992601394653, + "learning_rate": 3.5667250649429105e-05, + "loss": 2.409, + "step": 27207 + }, + { + "epoch": 2.4649951303481235, + "grad_norm": 1.0203834772109985, + "learning_rate": 3.5661209448438346e-05, + "loss": 2.4501, + "step": 27208 + }, + { + "epoch": 2.4650857285225705, + "grad_norm": 1.0087130069732666, + "learning_rate": 3.5655168247447593e-05, + "loss": 2.6122, + "step": 27209 + }, + { + "epoch": 2.465176326697017, + "grad_norm": 0.9418136477470398, + "learning_rate": 3.5649127046456834e-05, + "loss": 1.9938, + "step": 27210 + }, + { + "epoch": 2.4652669248714636, + "grad_norm": 1.006762146949768, + "learning_rate": 3.564308584546608e-05, + "loss": 2.5182, + "step": 27211 + }, + { + "epoch": 2.4653575230459106, + "grad_norm": 0.9889546632766724, + "learning_rate": 3.563704464447532e-05, + "loss": 2.1003, + "step": 27212 + }, + { + "epoch": 2.4654481212203576, + "grad_norm": 0.9893085360527039, + "learning_rate": 3.563100344348457e-05, + "loss": 2.5958, + "step": 27213 + }, + { + "epoch": 2.465538719394804, + "grad_norm": 1.0781358480453491, + "learning_rate": 3.562496224249381e-05, + "loss": 2.5678, + "step": 27214 + }, + { + "epoch": 2.4656293175692507, + "grad_norm": 1.0364078283309937, + "learning_rate": 3.561892104150305e-05, + "loss": 2.1862, + "step": 27215 + }, + { + "epoch": 2.4657199157436978, + "grad_norm": 0.9546562433242798, + "learning_rate": 3.56128798405123e-05, + "loss": 2.6797, + "step": 27216 + }, + { + "epoch": 2.4658105139181448, + "grad_norm": 0.997566819190979, + "learning_rate": 3.560683863952154e-05, + "loss": 2.6533, + "step": 27217 + }, + { + "epoch": 2.4659011120925913, + "grad_norm": 0.9350143074989319, + "learning_rate": 3.560079743853078e-05, + "loss": 2.3515, + "step": 27218 + }, + { + "epoch": 2.465991710267038, + "grad_norm": 0.9386445879936218, + "learning_rate": 3.559475623754002e-05, + "loss": 2.1335, + "step": 27219 + }, + { + "epoch": 2.466082308441485, + "grad_norm": 0.9578209519386292, + "learning_rate": 3.558871503654927e-05, + "loss": 2.7284, + "step": 27220 + }, + { + "epoch": 2.466172906615932, + "grad_norm": 0.9841880202293396, + "learning_rate": 3.558267383555851e-05, + "loss": 2.629, + "step": 27221 + }, + { + "epoch": 2.4662635047903785, + "grad_norm": 1.0108617544174194, + "learning_rate": 3.557663263456775e-05, + "loss": 2.5605, + "step": 27222 + }, + { + "epoch": 2.466354102964825, + "grad_norm": 0.9443399310112, + "learning_rate": 3.557059143357699e-05, + "loss": 2.3703, + "step": 27223 + }, + { + "epoch": 2.466444701139272, + "grad_norm": 1.049899697303772, + "learning_rate": 3.556455023258624e-05, + "loss": 2.8147, + "step": 27224 + }, + { + "epoch": 2.466535299313719, + "grad_norm": 0.9827964901924133, + "learning_rate": 3.555850903159548e-05, + "loss": 2.5677, + "step": 27225 + }, + { + "epoch": 2.4666258974881656, + "grad_norm": 1.0329145193099976, + "learning_rate": 3.555246783060472e-05, + "loss": 2.6946, + "step": 27226 + }, + { + "epoch": 2.466716495662612, + "grad_norm": 1.1185581684112549, + "learning_rate": 3.554642662961397e-05, + "loss": 2.6512, + "step": 27227 + }, + { + "epoch": 2.466807093837059, + "grad_norm": 0.9964622855186462, + "learning_rate": 3.554038542862321e-05, + "loss": 2.5451, + "step": 27228 + }, + { + "epoch": 2.466897692011506, + "grad_norm": 1.0363693237304688, + "learning_rate": 3.553434422763246e-05, + "loss": 2.4093, + "step": 27229 + }, + { + "epoch": 2.466988290185953, + "grad_norm": 1.012795090675354, + "learning_rate": 3.55283030266417e-05, + "loss": 2.6174, + "step": 27230 + }, + { + "epoch": 2.4670788883603993, + "grad_norm": 1.2564254999160767, + "learning_rate": 3.5522261825650946e-05, + "loss": 2.3498, + "step": 27231 + }, + { + "epoch": 2.4671694865348464, + "grad_norm": 1.1091480255126953, + "learning_rate": 3.551622062466019e-05, + "loss": 2.5361, + "step": 27232 + }, + { + "epoch": 2.467260084709293, + "grad_norm": 1.080025553703308, + "learning_rate": 3.551017942366943e-05, + "loss": 2.6186, + "step": 27233 + }, + { + "epoch": 2.46735068288374, + "grad_norm": 0.9681399464607239, + "learning_rate": 3.550413822267867e-05, + "loss": 2.8165, + "step": 27234 + }, + { + "epoch": 2.4674412810581865, + "grad_norm": 0.9421623349189758, + "learning_rate": 3.5498097021687916e-05, + "loss": 2.8136, + "step": 27235 + }, + { + "epoch": 2.4675318792326335, + "grad_norm": 1.203343152999878, + "learning_rate": 3.549205582069716e-05, + "loss": 2.2782, + "step": 27236 + }, + { + "epoch": 2.46762247740708, + "grad_norm": 0.9699657559394836, + "learning_rate": 3.54860146197064e-05, + "loss": 2.5519, + "step": 27237 + }, + { + "epoch": 2.467713075581527, + "grad_norm": 1.0078518390655518, + "learning_rate": 3.547997341871564e-05, + "loss": 2.5624, + "step": 27238 + }, + { + "epoch": 2.4678036737559736, + "grad_norm": 1.2081613540649414, + "learning_rate": 3.5473932217724886e-05, + "loss": 2.6009, + "step": 27239 + }, + { + "epoch": 2.4678942719304207, + "grad_norm": 1.0049006938934326, + "learning_rate": 3.546789101673413e-05, + "loss": 2.4662, + "step": 27240 + }, + { + "epoch": 2.4679848701048672, + "grad_norm": 1.134531855583191, + "learning_rate": 3.546184981574337e-05, + "loss": 2.5741, + "step": 27241 + }, + { + "epoch": 2.4680754682793142, + "grad_norm": 0.9711933732032776, + "learning_rate": 3.5455808614752615e-05, + "loss": 2.3719, + "step": 27242 + }, + { + "epoch": 2.468166066453761, + "grad_norm": 1.0616040229797363, + "learning_rate": 3.5449767413761856e-05, + "loss": 2.5908, + "step": 27243 + }, + { + "epoch": 2.468256664628208, + "grad_norm": 1.01474928855896, + "learning_rate": 3.54437262127711e-05, + "loss": 2.4194, + "step": 27244 + }, + { + "epoch": 2.4683472628026544, + "grad_norm": 0.9633707404136658, + "learning_rate": 3.5437685011780345e-05, + "loss": 2.8152, + "step": 27245 + }, + { + "epoch": 2.4684378609771014, + "grad_norm": 1.0163726806640625, + "learning_rate": 3.5431643810789586e-05, + "loss": 2.6332, + "step": 27246 + }, + { + "epoch": 2.468528459151548, + "grad_norm": 1.047460675239563, + "learning_rate": 3.542560260979883e-05, + "loss": 2.7024, + "step": 27247 + }, + { + "epoch": 2.468619057325995, + "grad_norm": 1.0110112428665161, + "learning_rate": 3.5419561408808074e-05, + "loss": 2.6172, + "step": 27248 + }, + { + "epoch": 2.4687096555004415, + "grad_norm": 1.0075478553771973, + "learning_rate": 3.5413520207817315e-05, + "loss": 2.5497, + "step": 27249 + }, + { + "epoch": 2.4688002536748885, + "grad_norm": 1.0136492252349854, + "learning_rate": 3.540747900682656e-05, + "loss": 2.6445, + "step": 27250 + }, + { + "epoch": 2.468890851849335, + "grad_norm": 1.0279488563537598, + "learning_rate": 3.54014378058358e-05, + "loss": 2.4129, + "step": 27251 + }, + { + "epoch": 2.468981450023782, + "grad_norm": 0.9852150082588196, + "learning_rate": 3.5395396604845044e-05, + "loss": 2.5767, + "step": 27252 + }, + { + "epoch": 2.4690720481982287, + "grad_norm": 1.022430658340454, + "learning_rate": 3.5389355403854285e-05, + "loss": 2.6218, + "step": 27253 + }, + { + "epoch": 2.4691626463726757, + "grad_norm": 1.0017400979995728, + "learning_rate": 3.538331420286353e-05, + "loss": 2.3346, + "step": 27254 + }, + { + "epoch": 2.4692532445471223, + "grad_norm": 1.0018064975738525, + "learning_rate": 3.537727300187277e-05, + "loss": 2.7881, + "step": 27255 + }, + { + "epoch": 2.4693438427215693, + "grad_norm": 1.0501102209091187, + "learning_rate": 3.5371231800882014e-05, + "loss": 2.6197, + "step": 27256 + }, + { + "epoch": 2.469434440896016, + "grad_norm": 1.0023037195205688, + "learning_rate": 3.536519059989126e-05, + "loss": 2.6538, + "step": 27257 + }, + { + "epoch": 2.469525039070463, + "grad_norm": 1.0054082870483398, + "learning_rate": 3.53591493989005e-05, + "loss": 2.5511, + "step": 27258 + }, + { + "epoch": 2.4696156372449094, + "grad_norm": 1.046448826789856, + "learning_rate": 3.5353108197909743e-05, + "loss": 2.4962, + "step": 27259 + }, + { + "epoch": 2.4697062354193564, + "grad_norm": 0.9592337608337402, + "learning_rate": 3.5347066996918984e-05, + "loss": 2.2707, + "step": 27260 + }, + { + "epoch": 2.469796833593803, + "grad_norm": 0.9301201701164246, + "learning_rate": 3.534102579592823e-05, + "loss": 2.4094, + "step": 27261 + }, + { + "epoch": 2.46988743176825, + "grad_norm": 1.0572644472122192, + "learning_rate": 3.533498459493747e-05, + "loss": 2.8636, + "step": 27262 + }, + { + "epoch": 2.4699780299426966, + "grad_norm": 1.0429459810256958, + "learning_rate": 3.532894339394672e-05, + "loss": 2.4958, + "step": 27263 + }, + { + "epoch": 2.4700686281171436, + "grad_norm": 1.006643295288086, + "learning_rate": 3.532290219295596e-05, + "loss": 2.6822, + "step": 27264 + }, + { + "epoch": 2.47015922629159, + "grad_norm": 1.0092881917953491, + "learning_rate": 3.531686099196521e-05, + "loss": 2.5562, + "step": 27265 + }, + { + "epoch": 2.470249824466037, + "grad_norm": 0.9291585683822632, + "learning_rate": 3.531081979097445e-05, + "loss": 2.4917, + "step": 27266 + }, + { + "epoch": 2.4703404226404837, + "grad_norm": 0.8681920766830444, + "learning_rate": 3.530477858998369e-05, + "loss": 1.9336, + "step": 27267 + }, + { + "epoch": 2.4704310208149307, + "grad_norm": 1.0329490900039673, + "learning_rate": 3.529873738899294e-05, + "loss": 2.7418, + "step": 27268 + }, + { + "epoch": 2.4705216189893773, + "grad_norm": 0.9897164702415466, + "learning_rate": 3.529269618800218e-05, + "loss": 2.6359, + "step": 27269 + }, + { + "epoch": 2.4706122171638243, + "grad_norm": 1.0903749465942383, + "learning_rate": 3.528665498701142e-05, + "loss": 2.6786, + "step": 27270 + }, + { + "epoch": 2.470702815338271, + "grad_norm": 1.0673049688339233, + "learning_rate": 3.528061378602066e-05, + "loss": 2.5582, + "step": 27271 + }, + { + "epoch": 2.470793413512718, + "grad_norm": 0.9956898093223572, + "learning_rate": 3.527457258502991e-05, + "loss": 2.6497, + "step": 27272 + }, + { + "epoch": 2.4708840116871644, + "grad_norm": 0.9363039135932922, + "learning_rate": 3.526853138403915e-05, + "loss": 2.5257, + "step": 27273 + }, + { + "epoch": 2.4709746098616114, + "grad_norm": 0.997515857219696, + "learning_rate": 3.526249018304839e-05, + "loss": 2.6033, + "step": 27274 + }, + { + "epoch": 2.471065208036058, + "grad_norm": 1.0833325386047363, + "learning_rate": 3.525644898205763e-05, + "loss": 2.9219, + "step": 27275 + }, + { + "epoch": 2.471155806210505, + "grad_norm": 1.0100353956222534, + "learning_rate": 3.525040778106688e-05, + "loss": 2.809, + "step": 27276 + }, + { + "epoch": 2.4712464043849516, + "grad_norm": 1.0163263082504272, + "learning_rate": 3.524436658007612e-05, + "loss": 2.6094, + "step": 27277 + }, + { + "epoch": 2.4713370025593986, + "grad_norm": 0.9886258244514465, + "learning_rate": 3.523832537908536e-05, + "loss": 2.4369, + "step": 27278 + }, + { + "epoch": 2.471427600733845, + "grad_norm": 1.0187864303588867, + "learning_rate": 3.523228417809461e-05, + "loss": 2.5681, + "step": 27279 + }, + { + "epoch": 2.471518198908292, + "grad_norm": 0.9883895516395569, + "learning_rate": 3.522624297710385e-05, + "loss": 2.6865, + "step": 27280 + }, + { + "epoch": 2.4716087970827387, + "grad_norm": 0.9905745387077332, + "learning_rate": 3.5220201776113096e-05, + "loss": 2.803, + "step": 27281 + }, + { + "epoch": 2.4716993952571857, + "grad_norm": 0.9718431234359741, + "learning_rate": 3.521416057512234e-05, + "loss": 2.6225, + "step": 27282 + }, + { + "epoch": 2.4717899934316323, + "grad_norm": 1.0347058773040771, + "learning_rate": 3.5208119374131584e-05, + "loss": 2.633, + "step": 27283 + }, + { + "epoch": 2.4718805916060793, + "grad_norm": 0.9691718220710754, + "learning_rate": 3.5202078173140825e-05, + "loss": 2.6746, + "step": 27284 + }, + { + "epoch": 2.471971189780526, + "grad_norm": 1.1587473154067993, + "learning_rate": 3.5196036972150066e-05, + "loss": 2.4569, + "step": 27285 + }, + { + "epoch": 2.472061787954973, + "grad_norm": 1.0372605323791504, + "learning_rate": 3.518999577115931e-05, + "loss": 2.5516, + "step": 27286 + }, + { + "epoch": 2.4721523861294195, + "grad_norm": 1.026595950126648, + "learning_rate": 3.5183954570168554e-05, + "loss": 2.5297, + "step": 27287 + }, + { + "epoch": 2.4722429843038665, + "grad_norm": 1.0392403602600098, + "learning_rate": 3.5177913369177795e-05, + "loss": 2.6948, + "step": 27288 + }, + { + "epoch": 2.472333582478313, + "grad_norm": 0.9951345920562744, + "learning_rate": 3.5171872168187036e-05, + "loss": 2.679, + "step": 27289 + }, + { + "epoch": 2.47242418065276, + "grad_norm": 0.9665607213973999, + "learning_rate": 3.516583096719628e-05, + "loss": 2.5342, + "step": 27290 + }, + { + "epoch": 2.4725147788272066, + "grad_norm": 1.1001569032669067, + "learning_rate": 3.5159789766205524e-05, + "loss": 2.6425, + "step": 27291 + }, + { + "epoch": 2.4726053770016536, + "grad_norm": 0.9644256234169006, + "learning_rate": 3.5153748565214765e-05, + "loss": 2.7415, + "step": 27292 + }, + { + "epoch": 2.4726959751761, + "grad_norm": 1.0656228065490723, + "learning_rate": 3.5147707364224006e-05, + "loss": 2.5118, + "step": 27293 + }, + { + "epoch": 2.4727865733505467, + "grad_norm": 0.9856387972831726, + "learning_rate": 3.514166616323325e-05, + "loss": 1.9282, + "step": 27294 + }, + { + "epoch": 2.4728771715249938, + "grad_norm": 0.8600306510925293, + "learning_rate": 3.5135624962242495e-05, + "loss": 2.0343, + "step": 27295 + }, + { + "epoch": 2.4729677696994408, + "grad_norm": 1.139249324798584, + "learning_rate": 3.5129583761251735e-05, + "loss": 2.5824, + "step": 27296 + }, + { + "epoch": 2.4730583678738873, + "grad_norm": 1.0240987539291382, + "learning_rate": 3.512354256026098e-05, + "loss": 2.4731, + "step": 27297 + }, + { + "epoch": 2.473148966048334, + "grad_norm": 1.0165468454360962, + "learning_rate": 3.511750135927023e-05, + "loss": 2.4988, + "step": 27298 + }, + { + "epoch": 2.473239564222781, + "grad_norm": 0.9747100472450256, + "learning_rate": 3.511146015827947e-05, + "loss": 2.5891, + "step": 27299 + }, + { + "epoch": 2.473330162397228, + "grad_norm": 1.0059376955032349, + "learning_rate": 3.510541895728871e-05, + "loss": 2.2487, + "step": 27300 + }, + { + "epoch": 2.4734207605716745, + "grad_norm": 1.1526713371276855, + "learning_rate": 3.509937775629795e-05, + "loss": 2.6176, + "step": 27301 + }, + { + "epoch": 2.473511358746121, + "grad_norm": 0.9708913564682007, + "learning_rate": 3.50933365553072e-05, + "loss": 2.4461, + "step": 27302 + }, + { + "epoch": 2.473601956920568, + "grad_norm": 1.0814663171768188, + "learning_rate": 3.508729535431644e-05, + "loss": 2.6473, + "step": 27303 + }, + { + "epoch": 2.473692555095015, + "grad_norm": 1.0030646324157715, + "learning_rate": 3.508125415332568e-05, + "loss": 2.6368, + "step": 27304 + }, + { + "epoch": 2.4737831532694616, + "grad_norm": 0.923551619052887, + "learning_rate": 3.507521295233492e-05, + "loss": 1.8833, + "step": 27305 + }, + { + "epoch": 2.473873751443908, + "grad_norm": 1.0335938930511475, + "learning_rate": 3.506917175134417e-05, + "loss": 2.5494, + "step": 27306 + }, + { + "epoch": 2.473964349618355, + "grad_norm": 1.025915265083313, + "learning_rate": 3.506313055035341e-05, + "loss": 2.4813, + "step": 27307 + }, + { + "epoch": 2.474054947792802, + "grad_norm": 1.0246530771255493, + "learning_rate": 3.505708934936265e-05, + "loss": 2.7219, + "step": 27308 + }, + { + "epoch": 2.474145545967249, + "grad_norm": 1.0100975036621094, + "learning_rate": 3.505104814837189e-05, + "loss": 2.5934, + "step": 27309 + }, + { + "epoch": 2.4742361441416953, + "grad_norm": 1.0450623035430908, + "learning_rate": 3.504500694738114e-05, + "loss": 2.6946, + "step": 27310 + }, + { + "epoch": 2.4743267423161424, + "grad_norm": 0.9985696077346802, + "learning_rate": 3.503896574639038e-05, + "loss": 2.6341, + "step": 27311 + }, + { + "epoch": 2.474417340490589, + "grad_norm": 1.102407693862915, + "learning_rate": 3.503292454539962e-05, + "loss": 2.7979, + "step": 27312 + }, + { + "epoch": 2.474507938665036, + "grad_norm": 0.9947782158851624, + "learning_rate": 3.502688334440887e-05, + "loss": 2.8744, + "step": 27313 + }, + { + "epoch": 2.4745985368394825, + "grad_norm": 1.0840747356414795, + "learning_rate": 3.502084214341812e-05, + "loss": 2.5883, + "step": 27314 + }, + { + "epoch": 2.4746891350139295, + "grad_norm": 0.9657639861106873, + "learning_rate": 3.501480094242736e-05, + "loss": 2.356, + "step": 27315 + }, + { + "epoch": 2.474779733188376, + "grad_norm": 1.0394608974456787, + "learning_rate": 3.50087597414366e-05, + "loss": 2.6631, + "step": 27316 + }, + { + "epoch": 2.474870331362823, + "grad_norm": 0.851361095905304, + "learning_rate": 3.500271854044585e-05, + "loss": 2.0215, + "step": 27317 + }, + { + "epoch": 2.4749609295372696, + "grad_norm": 1.0310873985290527, + "learning_rate": 3.499667733945509e-05, + "loss": 2.4675, + "step": 27318 + }, + { + "epoch": 2.4750515277117167, + "grad_norm": 0.9971164464950562, + "learning_rate": 3.499063613846433e-05, + "loss": 2.7706, + "step": 27319 + }, + { + "epoch": 2.4751421258861632, + "grad_norm": 0.9698038697242737, + "learning_rate": 3.498459493747357e-05, + "loss": 2.4955, + "step": 27320 + }, + { + "epoch": 2.4752327240606102, + "grad_norm": 1.05073082447052, + "learning_rate": 3.497855373648282e-05, + "loss": 2.6035, + "step": 27321 + }, + { + "epoch": 2.475323322235057, + "grad_norm": 1.0875582695007324, + "learning_rate": 3.497251253549206e-05, + "loss": 2.793, + "step": 27322 + }, + { + "epoch": 2.475413920409504, + "grad_norm": 0.989181399345398, + "learning_rate": 3.49664713345013e-05, + "loss": 2.5868, + "step": 27323 + }, + { + "epoch": 2.4755045185839504, + "grad_norm": 0.8955581188201904, + "learning_rate": 3.496043013351054e-05, + "loss": 2.0746, + "step": 27324 + }, + { + "epoch": 2.4755951167583974, + "grad_norm": 1.0163919925689697, + "learning_rate": 3.495438893251979e-05, + "loss": 2.6605, + "step": 27325 + }, + { + "epoch": 2.475685714932844, + "grad_norm": 1.100640058517456, + "learning_rate": 3.494834773152903e-05, + "loss": 2.4908, + "step": 27326 + }, + { + "epoch": 2.475776313107291, + "grad_norm": 1.0446871519088745, + "learning_rate": 3.494230653053827e-05, + "loss": 2.534, + "step": 27327 + }, + { + "epoch": 2.4758669112817375, + "grad_norm": 0.9830228686332703, + "learning_rate": 3.4936265329547516e-05, + "loss": 2.4684, + "step": 27328 + }, + { + "epoch": 2.4759575094561845, + "grad_norm": 1.0958658456802368, + "learning_rate": 3.493022412855676e-05, + "loss": 2.6219, + "step": 27329 + }, + { + "epoch": 2.476048107630631, + "grad_norm": 1.0833319425582886, + "learning_rate": 3.4924182927566e-05, + "loss": 2.5027, + "step": 27330 + }, + { + "epoch": 2.476138705805078, + "grad_norm": 1.0479943752288818, + "learning_rate": 3.4918141726575246e-05, + "loss": 2.6888, + "step": 27331 + }, + { + "epoch": 2.4762293039795247, + "grad_norm": 0.9983671307563782, + "learning_rate": 3.491210052558449e-05, + "loss": 2.5441, + "step": 27332 + }, + { + "epoch": 2.4763199021539717, + "grad_norm": 1.0575830936431885, + "learning_rate": 3.4906059324593734e-05, + "loss": 2.8569, + "step": 27333 + }, + { + "epoch": 2.4764105003284183, + "grad_norm": 0.8646189570426941, + "learning_rate": 3.4900018123602975e-05, + "loss": 1.7126, + "step": 27334 + }, + { + "epoch": 2.4765010985028653, + "grad_norm": 0.8551718592643738, + "learning_rate": 3.4893976922612216e-05, + "loss": 1.817, + "step": 27335 + }, + { + "epoch": 2.476591696677312, + "grad_norm": 1.0278064012527466, + "learning_rate": 3.4887935721621463e-05, + "loss": 2.7456, + "step": 27336 + }, + { + "epoch": 2.476682294851759, + "grad_norm": 1.0269728899002075, + "learning_rate": 3.4881894520630704e-05, + "loss": 2.7172, + "step": 27337 + }, + { + "epoch": 2.4767728930262054, + "grad_norm": 1.1281635761260986, + "learning_rate": 3.4875853319639945e-05, + "loss": 2.5536, + "step": 27338 + }, + { + "epoch": 2.4768634912006524, + "grad_norm": 0.9979761242866516, + "learning_rate": 3.4869812118649186e-05, + "loss": 2.5828, + "step": 27339 + }, + { + "epoch": 2.476954089375099, + "grad_norm": 1.1473395824432373, + "learning_rate": 3.4863770917658434e-05, + "loss": 2.4597, + "step": 27340 + }, + { + "epoch": 2.477044687549546, + "grad_norm": 0.972741961479187, + "learning_rate": 3.4857729716667674e-05, + "loss": 2.546, + "step": 27341 + }, + { + "epoch": 2.4771352857239926, + "grad_norm": 0.9967737793922424, + "learning_rate": 3.4851688515676915e-05, + "loss": 2.5026, + "step": 27342 + }, + { + "epoch": 2.4772258838984396, + "grad_norm": 0.9878135323524475, + "learning_rate": 3.484564731468616e-05, + "loss": 2.5085, + "step": 27343 + }, + { + "epoch": 2.477316482072886, + "grad_norm": 1.0135385990142822, + "learning_rate": 3.4839606113695404e-05, + "loss": 2.6704, + "step": 27344 + }, + { + "epoch": 2.477407080247333, + "grad_norm": 0.9912946820259094, + "learning_rate": 3.4833564912704644e-05, + "loss": 2.4628, + "step": 27345 + }, + { + "epoch": 2.4774976784217797, + "grad_norm": 0.9309930801391602, + "learning_rate": 3.4827523711713885e-05, + "loss": 2.0508, + "step": 27346 + }, + { + "epoch": 2.4775882765962267, + "grad_norm": 1.0367287397384644, + "learning_rate": 3.482148251072313e-05, + "loss": 2.7299, + "step": 27347 + }, + { + "epoch": 2.4776788747706733, + "grad_norm": 1.0382239818572998, + "learning_rate": 3.481544130973238e-05, + "loss": 2.5881, + "step": 27348 + }, + { + "epoch": 2.4777694729451203, + "grad_norm": 1.0665769577026367, + "learning_rate": 3.480940010874162e-05, + "loss": 2.7416, + "step": 27349 + }, + { + "epoch": 2.477860071119567, + "grad_norm": 1.0014071464538574, + "learning_rate": 3.480335890775086e-05, + "loss": 2.8784, + "step": 27350 + }, + { + "epoch": 2.477950669294014, + "grad_norm": 0.9889076948165894, + "learning_rate": 3.479731770676011e-05, + "loss": 2.533, + "step": 27351 + }, + { + "epoch": 2.4780412674684604, + "grad_norm": 0.9928693771362305, + "learning_rate": 3.479127650576935e-05, + "loss": 1.9, + "step": 27352 + }, + { + "epoch": 2.4781318656429074, + "grad_norm": 1.0685920715332031, + "learning_rate": 3.478523530477859e-05, + "loss": 2.542, + "step": 27353 + }, + { + "epoch": 2.478222463817354, + "grad_norm": 0.9963719844818115, + "learning_rate": 3.477919410378783e-05, + "loss": 2.3453, + "step": 27354 + }, + { + "epoch": 2.478313061991801, + "grad_norm": 1.0471971035003662, + "learning_rate": 3.477315290279708e-05, + "loss": 2.8028, + "step": 27355 + }, + { + "epoch": 2.4784036601662476, + "grad_norm": 0.9707067012786865, + "learning_rate": 3.476711170180632e-05, + "loss": 2.6675, + "step": 27356 + }, + { + "epoch": 2.4784942583406946, + "grad_norm": 1.1043154001235962, + "learning_rate": 3.476107050081556e-05, + "loss": 2.6369, + "step": 27357 + }, + { + "epoch": 2.478584856515141, + "grad_norm": 1.0238697528839111, + "learning_rate": 3.475502929982481e-05, + "loss": 2.5652, + "step": 27358 + }, + { + "epoch": 2.478675454689588, + "grad_norm": 0.978294312953949, + "learning_rate": 3.474898809883405e-05, + "loss": 2.8013, + "step": 27359 + }, + { + "epoch": 2.4787660528640347, + "grad_norm": 0.9852452874183655, + "learning_rate": 3.474294689784329e-05, + "loss": 2.7507, + "step": 27360 + }, + { + "epoch": 2.4788566510384817, + "grad_norm": 0.9660177230834961, + "learning_rate": 3.473690569685253e-05, + "loss": 2.7524, + "step": 27361 + }, + { + "epoch": 2.4789472492129283, + "grad_norm": 0.9882683753967285, + "learning_rate": 3.473086449586178e-05, + "loss": 2.6315, + "step": 27362 + }, + { + "epoch": 2.4790378473873753, + "grad_norm": 0.9845373630523682, + "learning_rate": 3.472482329487102e-05, + "loss": 2.6748, + "step": 27363 + }, + { + "epoch": 2.479128445561822, + "grad_norm": 1.0559914112091064, + "learning_rate": 3.471878209388027e-05, + "loss": 2.7218, + "step": 27364 + }, + { + "epoch": 2.479219043736269, + "grad_norm": 1.162386417388916, + "learning_rate": 3.471274089288951e-05, + "loss": 2.4574, + "step": 27365 + }, + { + "epoch": 2.4793096419107155, + "grad_norm": 0.9763046503067017, + "learning_rate": 3.4706699691898756e-05, + "loss": 2.3587, + "step": 27366 + }, + { + "epoch": 2.4794002400851625, + "grad_norm": 1.010975956916809, + "learning_rate": 3.4700658490908e-05, + "loss": 2.6349, + "step": 27367 + }, + { + "epoch": 2.479490838259609, + "grad_norm": 1.1118437051773071, + "learning_rate": 3.469461728991724e-05, + "loss": 2.4864, + "step": 27368 + }, + { + "epoch": 2.479581436434056, + "grad_norm": 1.1021145582199097, + "learning_rate": 3.468857608892648e-05, + "loss": 2.5774, + "step": 27369 + }, + { + "epoch": 2.4796720346085026, + "grad_norm": 0.8978527784347534, + "learning_rate": 3.4682534887935726e-05, + "loss": 1.6608, + "step": 27370 + }, + { + "epoch": 2.4797626327829496, + "grad_norm": 1.003939151763916, + "learning_rate": 3.467649368694497e-05, + "loss": 2.709, + "step": 27371 + }, + { + "epoch": 2.479853230957396, + "grad_norm": 1.0264551639556885, + "learning_rate": 3.467045248595421e-05, + "loss": 2.454, + "step": 27372 + }, + { + "epoch": 2.4799438291318427, + "grad_norm": 1.049196720123291, + "learning_rate": 3.4664411284963455e-05, + "loss": 2.6414, + "step": 27373 + }, + { + "epoch": 2.4800344273062898, + "grad_norm": 1.1246525049209595, + "learning_rate": 3.4658370083972696e-05, + "loss": 2.5733, + "step": 27374 + }, + { + "epoch": 2.4801250254807368, + "grad_norm": 1.0864877700805664, + "learning_rate": 3.465232888298194e-05, + "loss": 2.6804, + "step": 27375 + }, + { + "epoch": 2.4802156236551833, + "grad_norm": 1.0764284133911133, + "learning_rate": 3.464628768199118e-05, + "loss": 2.4786, + "step": 27376 + }, + { + "epoch": 2.48030622182963, + "grad_norm": 1.2030445337295532, + "learning_rate": 3.4640246481000426e-05, + "loss": 2.3313, + "step": 27377 + }, + { + "epoch": 2.480396820004077, + "grad_norm": 1.1857749223709106, + "learning_rate": 3.4634205280009666e-05, + "loss": 2.5674, + "step": 27378 + }, + { + "epoch": 2.480487418178524, + "grad_norm": 0.8746626973152161, + "learning_rate": 3.462816407901891e-05, + "loss": 2.0988, + "step": 27379 + }, + { + "epoch": 2.4805780163529705, + "grad_norm": 1.0992653369903564, + "learning_rate": 3.4622122878028155e-05, + "loss": 2.5189, + "step": 27380 + }, + { + "epoch": 2.480668614527417, + "grad_norm": 0.9485092759132385, + "learning_rate": 3.4616081677037396e-05, + "loss": 2.0596, + "step": 27381 + }, + { + "epoch": 2.480759212701864, + "grad_norm": 0.9744150638580322, + "learning_rate": 3.461004047604664e-05, + "loss": 2.5232, + "step": 27382 + }, + { + "epoch": 2.480849810876311, + "grad_norm": 0.9934223294258118, + "learning_rate": 3.4603999275055884e-05, + "loss": 2.366, + "step": 27383 + }, + { + "epoch": 2.4809404090507576, + "grad_norm": 1.0150872468948364, + "learning_rate": 3.4597958074065125e-05, + "loss": 2.6059, + "step": 27384 + }, + { + "epoch": 2.481031007225204, + "grad_norm": 0.9312131404876709, + "learning_rate": 3.459191687307437e-05, + "loss": 2.3896, + "step": 27385 + }, + { + "epoch": 2.481121605399651, + "grad_norm": 1.0464179515838623, + "learning_rate": 3.458587567208361e-05, + "loss": 2.7038, + "step": 27386 + }, + { + "epoch": 2.481212203574098, + "grad_norm": 1.0334254503250122, + "learning_rate": 3.4579834471092854e-05, + "loss": 2.796, + "step": 27387 + }, + { + "epoch": 2.481302801748545, + "grad_norm": 1.007368564605713, + "learning_rate": 3.45737932701021e-05, + "loss": 2.3082, + "step": 27388 + }, + { + "epoch": 2.4813933999229913, + "grad_norm": 0.9327188730239868, + "learning_rate": 3.456775206911134e-05, + "loss": 2.6186, + "step": 27389 + }, + { + "epoch": 2.4814839980974384, + "grad_norm": 1.0355079174041748, + "learning_rate": 3.4561710868120583e-05, + "loss": 2.609, + "step": 27390 + }, + { + "epoch": 2.4815745962718854, + "grad_norm": 0.8600581288337708, + "learning_rate": 3.4555669667129824e-05, + "loss": 1.7624, + "step": 27391 + }, + { + "epoch": 2.481665194446332, + "grad_norm": 1.1285600662231445, + "learning_rate": 3.454962846613907e-05, + "loss": 2.5707, + "step": 27392 + }, + { + "epoch": 2.4817557926207785, + "grad_norm": 1.141281247138977, + "learning_rate": 3.454358726514831e-05, + "loss": 2.2838, + "step": 27393 + }, + { + "epoch": 2.4818463907952255, + "grad_norm": 1.053963303565979, + "learning_rate": 3.4537546064157554e-05, + "loss": 2.4027, + "step": 27394 + }, + { + "epoch": 2.481936988969672, + "grad_norm": 0.9750102758407593, + "learning_rate": 3.4531504863166794e-05, + "loss": 2.4936, + "step": 27395 + }, + { + "epoch": 2.482027587144119, + "grad_norm": 0.9892765283584595, + "learning_rate": 3.452546366217604e-05, + "loss": 2.4246, + "step": 27396 + }, + { + "epoch": 2.4821181853185657, + "grad_norm": 1.0052947998046875, + "learning_rate": 3.451942246118528e-05, + "loss": 2.6292, + "step": 27397 + }, + { + "epoch": 2.4822087834930127, + "grad_norm": 1.0165443420410156, + "learning_rate": 3.451338126019453e-05, + "loss": 2.5676, + "step": 27398 + }, + { + "epoch": 2.4822993816674592, + "grad_norm": 0.9904553890228271, + "learning_rate": 3.450734005920377e-05, + "loss": 2.6473, + "step": 27399 + }, + { + "epoch": 2.4823899798419062, + "grad_norm": 0.9669339060783386, + "learning_rate": 3.450129885821302e-05, + "loss": 2.5049, + "step": 27400 + }, + { + "epoch": 2.482480578016353, + "grad_norm": 0.9995520114898682, + "learning_rate": 3.449525765722226e-05, + "loss": 2.5156, + "step": 27401 + }, + { + "epoch": 2.4825711761908, + "grad_norm": 0.9584543704986572, + "learning_rate": 3.44892164562315e-05, + "loss": 2.7259, + "step": 27402 + }, + { + "epoch": 2.4826617743652464, + "grad_norm": 0.962647557258606, + "learning_rate": 3.448317525524075e-05, + "loss": 2.6432, + "step": 27403 + }, + { + "epoch": 2.4827523725396934, + "grad_norm": 1.066084384918213, + "learning_rate": 3.447713405424999e-05, + "loss": 2.5828, + "step": 27404 + }, + { + "epoch": 2.48284297071414, + "grad_norm": 1.029516577720642, + "learning_rate": 3.447109285325923e-05, + "loss": 2.8085, + "step": 27405 + }, + { + "epoch": 2.482933568888587, + "grad_norm": 0.9725853800773621, + "learning_rate": 3.446505165226847e-05, + "loss": 2.9583, + "step": 27406 + }, + { + "epoch": 2.4830241670630335, + "grad_norm": 0.9644931554794312, + "learning_rate": 3.445901045127772e-05, + "loss": 2.7007, + "step": 27407 + }, + { + "epoch": 2.4831147652374805, + "grad_norm": 1.0610723495483398, + "learning_rate": 3.445296925028696e-05, + "loss": 2.4308, + "step": 27408 + }, + { + "epoch": 2.483205363411927, + "grad_norm": 1.0166946649551392, + "learning_rate": 3.44469280492962e-05, + "loss": 2.9309, + "step": 27409 + }, + { + "epoch": 2.483295961586374, + "grad_norm": 0.941520631313324, + "learning_rate": 3.444088684830544e-05, + "loss": 2.4774, + "step": 27410 + }, + { + "epoch": 2.4833865597608207, + "grad_norm": 1.0788326263427734, + "learning_rate": 3.443484564731469e-05, + "loss": 2.4758, + "step": 27411 + }, + { + "epoch": 2.4834771579352677, + "grad_norm": 0.7972872257232666, + "learning_rate": 3.442880444632393e-05, + "loss": 1.9803, + "step": 27412 + }, + { + "epoch": 2.4835677561097143, + "grad_norm": 1.0409623384475708, + "learning_rate": 3.442276324533317e-05, + "loss": 2.6199, + "step": 27413 + }, + { + "epoch": 2.4836583542841613, + "grad_norm": 1.0195894241333008, + "learning_rate": 3.441672204434242e-05, + "loss": 2.7504, + "step": 27414 + }, + { + "epoch": 2.483748952458608, + "grad_norm": 0.9781420826911926, + "learning_rate": 3.441068084335166e-05, + "loss": 2.749, + "step": 27415 + }, + { + "epoch": 2.483839550633055, + "grad_norm": 1.1063117980957031, + "learning_rate": 3.4404639642360906e-05, + "loss": 2.4426, + "step": 27416 + }, + { + "epoch": 2.4839301488075014, + "grad_norm": 1.026212215423584, + "learning_rate": 3.439859844137015e-05, + "loss": 2.6089, + "step": 27417 + }, + { + "epoch": 2.4840207469819484, + "grad_norm": 0.929702639579773, + "learning_rate": 3.4392557240379394e-05, + "loss": 2.0872, + "step": 27418 + }, + { + "epoch": 2.484111345156395, + "grad_norm": 1.0021982192993164, + "learning_rate": 3.4386516039388635e-05, + "loss": 2.5939, + "step": 27419 + }, + { + "epoch": 2.484201943330842, + "grad_norm": 1.0138797760009766, + "learning_rate": 3.4380474838397876e-05, + "loss": 2.676, + "step": 27420 + }, + { + "epoch": 2.4842925415052886, + "grad_norm": 0.9877864122390747, + "learning_rate": 3.437443363740712e-05, + "loss": 2.4398, + "step": 27421 + }, + { + "epoch": 2.4843831396797356, + "grad_norm": 1.0374430418014526, + "learning_rate": 3.4368392436416365e-05, + "loss": 2.6787, + "step": 27422 + }, + { + "epoch": 2.484473737854182, + "grad_norm": 1.0640803575515747, + "learning_rate": 3.4362351235425605e-05, + "loss": 2.6151, + "step": 27423 + }, + { + "epoch": 2.484564336028629, + "grad_norm": 0.9730553030967712, + "learning_rate": 3.4356310034434846e-05, + "loss": 2.5851, + "step": 27424 + }, + { + "epoch": 2.4846549342030757, + "grad_norm": 1.1735260486602783, + "learning_rate": 3.435026883344409e-05, + "loss": 2.7386, + "step": 27425 + }, + { + "epoch": 2.4847455323775227, + "grad_norm": 1.090493083000183, + "learning_rate": 3.4344227632453335e-05, + "loss": 2.4529, + "step": 27426 + }, + { + "epoch": 2.4848361305519693, + "grad_norm": 1.0681828260421753, + "learning_rate": 3.4338186431462575e-05, + "loss": 2.469, + "step": 27427 + }, + { + "epoch": 2.4849267287264163, + "grad_norm": 0.941067099571228, + "learning_rate": 3.4332145230471816e-05, + "loss": 1.798, + "step": 27428 + }, + { + "epoch": 2.485017326900863, + "grad_norm": 0.9585873484611511, + "learning_rate": 3.432610402948106e-05, + "loss": 2.5923, + "step": 27429 + }, + { + "epoch": 2.48510792507531, + "grad_norm": 1.0638359785079956, + "learning_rate": 3.4320062828490305e-05, + "loss": 2.4166, + "step": 27430 + }, + { + "epoch": 2.4851985232497564, + "grad_norm": 1.0210055112838745, + "learning_rate": 3.4314021627499546e-05, + "loss": 2.5822, + "step": 27431 + }, + { + "epoch": 2.4852891214242034, + "grad_norm": 1.0031498670578003, + "learning_rate": 3.430798042650879e-05, + "loss": 2.4139, + "step": 27432 + }, + { + "epoch": 2.48537971959865, + "grad_norm": 0.975932240486145, + "learning_rate": 3.4301939225518034e-05, + "loss": 2.4229, + "step": 27433 + }, + { + "epoch": 2.485470317773097, + "grad_norm": 0.9035269021987915, + "learning_rate": 3.429589802452728e-05, + "loss": 2.1788, + "step": 27434 + }, + { + "epoch": 2.4855609159475436, + "grad_norm": 1.0433605909347534, + "learning_rate": 3.428985682353652e-05, + "loss": 3.0446, + "step": 27435 + }, + { + "epoch": 2.4856515141219906, + "grad_norm": 1.0027765035629272, + "learning_rate": 3.428381562254576e-05, + "loss": 2.5677, + "step": 27436 + }, + { + "epoch": 2.485742112296437, + "grad_norm": 0.9920461177825928, + "learning_rate": 3.427777442155501e-05, + "loss": 2.4927, + "step": 27437 + }, + { + "epoch": 2.485832710470884, + "grad_norm": 0.9058137536048889, + "learning_rate": 3.427173322056425e-05, + "loss": 1.9779, + "step": 27438 + }, + { + "epoch": 2.4859233086453307, + "grad_norm": 1.0090339183807373, + "learning_rate": 3.426569201957349e-05, + "loss": 2.6255, + "step": 27439 + }, + { + "epoch": 2.4860139068197777, + "grad_norm": 1.0405559539794922, + "learning_rate": 3.425965081858273e-05, + "loss": 2.3096, + "step": 27440 + }, + { + "epoch": 2.4861045049942243, + "grad_norm": 1.0738188028335571, + "learning_rate": 3.425360961759198e-05, + "loss": 2.7225, + "step": 27441 + }, + { + "epoch": 2.4861951031686713, + "grad_norm": 1.0826234817504883, + "learning_rate": 3.424756841660122e-05, + "loss": 2.4793, + "step": 27442 + }, + { + "epoch": 2.486285701343118, + "grad_norm": 0.96408611536026, + "learning_rate": 3.424152721561046e-05, + "loss": 2.4422, + "step": 27443 + }, + { + "epoch": 2.486376299517565, + "grad_norm": 1.0031287670135498, + "learning_rate": 3.4235486014619703e-05, + "loss": 2.3797, + "step": 27444 + }, + { + "epoch": 2.4864668976920115, + "grad_norm": 0.9829111695289612, + "learning_rate": 3.422944481362895e-05, + "loss": 2.502, + "step": 27445 + }, + { + "epoch": 2.4865574958664585, + "grad_norm": 1.088591456413269, + "learning_rate": 3.422340361263819e-05, + "loss": 2.6601, + "step": 27446 + }, + { + "epoch": 2.486648094040905, + "grad_norm": 0.9893772602081299, + "learning_rate": 3.421736241164743e-05, + "loss": 2.3936, + "step": 27447 + }, + { + "epoch": 2.486738692215352, + "grad_norm": 1.019051432609558, + "learning_rate": 3.421132121065668e-05, + "loss": 2.8377, + "step": 27448 + }, + { + "epoch": 2.4868292903897986, + "grad_norm": 1.0241211652755737, + "learning_rate": 3.420528000966592e-05, + "loss": 2.5264, + "step": 27449 + }, + { + "epoch": 2.4869198885642456, + "grad_norm": 0.9823741316795349, + "learning_rate": 3.419923880867517e-05, + "loss": 2.3979, + "step": 27450 + }, + { + "epoch": 2.487010486738692, + "grad_norm": 0.9921584725379944, + "learning_rate": 3.419319760768441e-05, + "loss": 2.4099, + "step": 27451 + }, + { + "epoch": 2.487101084913139, + "grad_norm": 0.8472718000411987, + "learning_rate": 3.418715640669366e-05, + "loss": 1.8681, + "step": 27452 + }, + { + "epoch": 2.4871916830875858, + "grad_norm": 0.9914552569389343, + "learning_rate": 3.41811152057029e-05, + "loss": 2.651, + "step": 27453 + }, + { + "epoch": 2.4872822812620328, + "grad_norm": 0.9749642014503479, + "learning_rate": 3.417507400471214e-05, + "loss": 2.4137, + "step": 27454 + }, + { + "epoch": 2.4873728794364793, + "grad_norm": 0.8209484219551086, + "learning_rate": 3.416903280372138e-05, + "loss": 1.7616, + "step": 27455 + }, + { + "epoch": 2.487463477610926, + "grad_norm": 1.0498993396759033, + "learning_rate": 3.416299160273063e-05, + "loss": 2.8587, + "step": 27456 + }, + { + "epoch": 2.487554075785373, + "grad_norm": 1.2377567291259766, + "learning_rate": 3.415695040173987e-05, + "loss": 2.2619, + "step": 27457 + }, + { + "epoch": 2.48764467395982, + "grad_norm": 1.084453821182251, + "learning_rate": 3.415090920074911e-05, + "loss": 2.4891, + "step": 27458 + }, + { + "epoch": 2.4877352721342665, + "grad_norm": 1.0104495286941528, + "learning_rate": 3.414486799975835e-05, + "loss": 2.7287, + "step": 27459 + }, + { + "epoch": 2.487825870308713, + "grad_norm": 1.3025989532470703, + "learning_rate": 3.41388267987676e-05, + "loss": 2.7455, + "step": 27460 + }, + { + "epoch": 2.48791646848316, + "grad_norm": 0.9808380603790283, + "learning_rate": 3.413278559777684e-05, + "loss": 2.7229, + "step": 27461 + }, + { + "epoch": 2.488007066657607, + "grad_norm": 0.9915830492973328, + "learning_rate": 3.412674439678608e-05, + "loss": 2.4824, + "step": 27462 + }, + { + "epoch": 2.4880976648320536, + "grad_norm": 1.059745192527771, + "learning_rate": 3.412070319579533e-05, + "loss": 2.6676, + "step": 27463 + }, + { + "epoch": 2.4881882630065, + "grad_norm": 1.001821517944336, + "learning_rate": 3.411466199480457e-05, + "loss": 2.537, + "step": 27464 + }, + { + "epoch": 2.488278861180947, + "grad_norm": 1.008873462677002, + "learning_rate": 3.410862079381381e-05, + "loss": 2.621, + "step": 27465 + }, + { + "epoch": 2.488369459355394, + "grad_norm": 1.088641881942749, + "learning_rate": 3.4102579592823056e-05, + "loss": 2.6915, + "step": 27466 + }, + { + "epoch": 2.488460057529841, + "grad_norm": 0.961846113204956, + "learning_rate": 3.40965383918323e-05, + "loss": 2.7774, + "step": 27467 + }, + { + "epoch": 2.4885506557042874, + "grad_norm": 0.8216055631637573, + "learning_rate": 3.4090497190841544e-05, + "loss": 1.8303, + "step": 27468 + }, + { + "epoch": 2.4886412538787344, + "grad_norm": 1.031324863433838, + "learning_rate": 3.4084455989850785e-05, + "loss": 2.3381, + "step": 27469 + }, + { + "epoch": 2.4887318520531814, + "grad_norm": 1.0200695991516113, + "learning_rate": 3.4078414788860026e-05, + "loss": 2.6586, + "step": 27470 + }, + { + "epoch": 2.488822450227628, + "grad_norm": 0.8377060890197754, + "learning_rate": 3.4072373587869274e-05, + "loss": 1.833, + "step": 27471 + }, + { + "epoch": 2.4889130484020745, + "grad_norm": 0.9814702868461609, + "learning_rate": 3.4066332386878514e-05, + "loss": 2.6618, + "step": 27472 + }, + { + "epoch": 2.4890036465765215, + "grad_norm": 0.9714720249176025, + "learning_rate": 3.4060291185887755e-05, + "loss": 2.7621, + "step": 27473 + }, + { + "epoch": 2.489094244750968, + "grad_norm": 0.9865705370903015, + "learning_rate": 3.4054249984896996e-05, + "loss": 2.806, + "step": 27474 + }, + { + "epoch": 2.489184842925415, + "grad_norm": 1.12827730178833, + "learning_rate": 3.4048208783906244e-05, + "loss": 2.4819, + "step": 27475 + }, + { + "epoch": 2.4892754410998617, + "grad_norm": 0.9487974047660828, + "learning_rate": 3.4042167582915485e-05, + "loss": 2.4486, + "step": 27476 + }, + { + "epoch": 2.4893660392743087, + "grad_norm": 0.9963285326957703, + "learning_rate": 3.4036126381924725e-05, + "loss": 2.4334, + "step": 27477 + }, + { + "epoch": 2.4894566374487552, + "grad_norm": 0.9784010648727417, + "learning_rate": 3.403008518093397e-05, + "loss": 2.6712, + "step": 27478 + }, + { + "epoch": 2.4895472356232022, + "grad_norm": 1.027772307395935, + "learning_rate": 3.4024043979943214e-05, + "loss": 2.5141, + "step": 27479 + }, + { + "epoch": 2.489637833797649, + "grad_norm": 0.9724069237709045, + "learning_rate": 3.4018002778952455e-05, + "loss": 2.4863, + "step": 27480 + }, + { + "epoch": 2.489728431972096, + "grad_norm": 0.8594902753829956, + "learning_rate": 3.4011961577961695e-05, + "loss": 1.8069, + "step": 27481 + }, + { + "epoch": 2.4898190301465424, + "grad_norm": 1.0055580139160156, + "learning_rate": 3.400592037697094e-05, + "loss": 2.7653, + "step": 27482 + }, + { + "epoch": 2.4899096283209894, + "grad_norm": 0.9855490922927856, + "learning_rate": 3.3999879175980184e-05, + "loss": 2.6013, + "step": 27483 + }, + { + "epoch": 2.490000226495436, + "grad_norm": 1.022646188735962, + "learning_rate": 3.399383797498943e-05, + "loss": 2.4981, + "step": 27484 + }, + { + "epoch": 2.490090824669883, + "grad_norm": 0.9331412315368652, + "learning_rate": 3.398779677399867e-05, + "loss": 2.2652, + "step": 27485 + }, + { + "epoch": 2.4901814228443295, + "grad_norm": 1.0300887823104858, + "learning_rate": 3.398175557300792e-05, + "loss": 2.919, + "step": 27486 + }, + { + "epoch": 2.4902720210187765, + "grad_norm": 0.9549657106399536, + "learning_rate": 3.397571437201716e-05, + "loss": 2.681, + "step": 27487 + }, + { + "epoch": 2.490362619193223, + "grad_norm": 1.0643794536590576, + "learning_rate": 3.39696731710264e-05, + "loss": 2.5574, + "step": 27488 + }, + { + "epoch": 2.49045321736767, + "grad_norm": 0.9930906295776367, + "learning_rate": 3.396363197003564e-05, + "loss": 2.4569, + "step": 27489 + }, + { + "epoch": 2.4905438155421167, + "grad_norm": 0.9423810243606567, + "learning_rate": 3.395759076904489e-05, + "loss": 2.5458, + "step": 27490 + }, + { + "epoch": 2.4906344137165637, + "grad_norm": 0.9525013566017151, + "learning_rate": 3.395154956805413e-05, + "loss": 2.466, + "step": 27491 + }, + { + "epoch": 2.4907250118910103, + "grad_norm": 0.9604390859603882, + "learning_rate": 3.394550836706337e-05, + "loss": 2.5938, + "step": 27492 + }, + { + "epoch": 2.4908156100654573, + "grad_norm": 0.9515600800514221, + "learning_rate": 3.393946716607262e-05, + "loss": 2.6997, + "step": 27493 + }, + { + "epoch": 2.490906208239904, + "grad_norm": 1.0583301782608032, + "learning_rate": 3.393342596508186e-05, + "loss": 2.7974, + "step": 27494 + }, + { + "epoch": 2.490996806414351, + "grad_norm": 0.9379804730415344, + "learning_rate": 3.39273847640911e-05, + "loss": 2.6218, + "step": 27495 + }, + { + "epoch": 2.4910874045887974, + "grad_norm": 1.0012134313583374, + "learning_rate": 3.392134356310034e-05, + "loss": 2.3811, + "step": 27496 + }, + { + "epoch": 2.4911780027632444, + "grad_norm": 0.969407320022583, + "learning_rate": 3.391530236210959e-05, + "loss": 2.4514, + "step": 27497 + }, + { + "epoch": 2.491268600937691, + "grad_norm": 1.0493568181991577, + "learning_rate": 3.390926116111883e-05, + "loss": 2.6801, + "step": 27498 + }, + { + "epoch": 2.491359199112138, + "grad_norm": 0.956220269203186, + "learning_rate": 3.390321996012807e-05, + "loss": 2.7536, + "step": 27499 + }, + { + "epoch": 2.4914497972865846, + "grad_norm": 1.0267661809921265, + "learning_rate": 3.389717875913732e-05, + "loss": 2.3835, + "step": 27500 + }, + { + "epoch": 2.4915403954610316, + "grad_norm": 1.0187827348709106, + "learning_rate": 3.3891137558146566e-05, + "loss": 2.7939, + "step": 27501 + }, + { + "epoch": 2.491630993635478, + "grad_norm": 0.7069545984268188, + "learning_rate": 3.388509635715581e-05, + "loss": 1.4765, + "step": 27502 + }, + { + "epoch": 2.491721591809925, + "grad_norm": 1.0105412006378174, + "learning_rate": 3.387905515616505e-05, + "loss": 2.5852, + "step": 27503 + }, + { + "epoch": 2.4918121899843717, + "grad_norm": 0.990220308303833, + "learning_rate": 3.387301395517429e-05, + "loss": 2.7039, + "step": 27504 + }, + { + "epoch": 2.4919027881588187, + "grad_norm": 1.0319221019744873, + "learning_rate": 3.3866972754183536e-05, + "loss": 2.6394, + "step": 27505 + }, + { + "epoch": 2.4919933863332653, + "grad_norm": 1.0050116777420044, + "learning_rate": 3.386093155319278e-05, + "loss": 2.663, + "step": 27506 + }, + { + "epoch": 2.4920839845077123, + "grad_norm": 1.0294852256774902, + "learning_rate": 3.385489035220202e-05, + "loss": 2.7775, + "step": 27507 + }, + { + "epoch": 2.492174582682159, + "grad_norm": 0.996131420135498, + "learning_rate": 3.3848849151211266e-05, + "loss": 2.5614, + "step": 27508 + }, + { + "epoch": 2.492265180856606, + "grad_norm": 0.9541469216346741, + "learning_rate": 3.3842807950220506e-05, + "loss": 2.3791, + "step": 27509 + }, + { + "epoch": 2.4923557790310524, + "grad_norm": 1.012386441230774, + "learning_rate": 3.383676674922975e-05, + "loss": 2.8199, + "step": 27510 + }, + { + "epoch": 2.4924463772054994, + "grad_norm": 0.9611616730690002, + "learning_rate": 3.383072554823899e-05, + "loss": 2.7711, + "step": 27511 + }, + { + "epoch": 2.492536975379946, + "grad_norm": 0.9580291509628296, + "learning_rate": 3.3824684347248236e-05, + "loss": 2.4627, + "step": 27512 + }, + { + "epoch": 2.492627573554393, + "grad_norm": 1.0599744319915771, + "learning_rate": 3.3818643146257477e-05, + "loss": 2.5711, + "step": 27513 + }, + { + "epoch": 2.4927181717288396, + "grad_norm": 0.9771770238876343, + "learning_rate": 3.381260194526672e-05, + "loss": 2.7225, + "step": 27514 + }, + { + "epoch": 2.4928087699032866, + "grad_norm": 1.0608030557632446, + "learning_rate": 3.380656074427596e-05, + "loss": 2.6286, + "step": 27515 + }, + { + "epoch": 2.492899368077733, + "grad_norm": 1.0612187385559082, + "learning_rate": 3.3800519543285206e-05, + "loss": 2.6368, + "step": 27516 + }, + { + "epoch": 2.49298996625218, + "grad_norm": 0.8978808522224426, + "learning_rate": 3.379447834229445e-05, + "loss": 1.9045, + "step": 27517 + }, + { + "epoch": 2.4930805644266267, + "grad_norm": 0.9684039950370789, + "learning_rate": 3.3788437141303694e-05, + "loss": 2.5238, + "step": 27518 + }, + { + "epoch": 2.4931711626010737, + "grad_norm": 1.2384529113769531, + "learning_rate": 3.3782395940312935e-05, + "loss": 2.2338, + "step": 27519 + }, + { + "epoch": 2.4932617607755203, + "grad_norm": 0.9540917873382568, + "learning_rate": 3.377635473932218e-05, + "loss": 2.6221, + "step": 27520 + }, + { + "epoch": 2.4933523589499673, + "grad_norm": 1.0157397985458374, + "learning_rate": 3.3770313538331423e-05, + "loss": 2.5888, + "step": 27521 + }, + { + "epoch": 2.493442957124414, + "grad_norm": 1.123481273651123, + "learning_rate": 3.3764272337340664e-05, + "loss": 2.4434, + "step": 27522 + }, + { + "epoch": 2.493533555298861, + "grad_norm": 1.0606461763381958, + "learning_rate": 3.375823113634991e-05, + "loss": 2.4131, + "step": 27523 + }, + { + "epoch": 2.4936241534733075, + "grad_norm": 0.9752703309059143, + "learning_rate": 3.375218993535915e-05, + "loss": 2.8488, + "step": 27524 + }, + { + "epoch": 2.4937147516477545, + "grad_norm": 1.1318758726119995, + "learning_rate": 3.3746148734368394e-05, + "loss": 2.4742, + "step": 27525 + }, + { + "epoch": 2.493805349822201, + "grad_norm": 0.9851961135864258, + "learning_rate": 3.3740107533377634e-05, + "loss": 2.7636, + "step": 27526 + }, + { + "epoch": 2.493895947996648, + "grad_norm": 0.986544132232666, + "learning_rate": 3.373406633238688e-05, + "loss": 2.9673, + "step": 27527 + }, + { + "epoch": 2.4939865461710946, + "grad_norm": 1.0135732889175415, + "learning_rate": 3.372802513139612e-05, + "loss": 2.5682, + "step": 27528 + }, + { + "epoch": 2.4940771443455416, + "grad_norm": 1.0294257402420044, + "learning_rate": 3.3721983930405364e-05, + "loss": 2.6208, + "step": 27529 + }, + { + "epoch": 2.494167742519988, + "grad_norm": 0.9823210835456848, + "learning_rate": 3.3715942729414605e-05, + "loss": 2.4578, + "step": 27530 + }, + { + "epoch": 2.494258340694435, + "grad_norm": 1.0534569025039673, + "learning_rate": 3.370990152842385e-05, + "loss": 2.5868, + "step": 27531 + }, + { + "epoch": 2.4943489388688818, + "grad_norm": 0.941628098487854, + "learning_rate": 3.370386032743309e-05, + "loss": 2.4976, + "step": 27532 + }, + { + "epoch": 2.4944395370433288, + "grad_norm": 1.0150576829910278, + "learning_rate": 3.3697819126442334e-05, + "loss": 2.553, + "step": 27533 + }, + { + "epoch": 2.4945301352177753, + "grad_norm": 0.9829752445220947, + "learning_rate": 3.369177792545158e-05, + "loss": 2.9268, + "step": 27534 + }, + { + "epoch": 2.494620733392222, + "grad_norm": 1.0832507610321045, + "learning_rate": 3.368573672446083e-05, + "loss": 2.8033, + "step": 27535 + }, + { + "epoch": 2.494711331566669, + "grad_norm": 1.0535801649093628, + "learning_rate": 3.367969552347007e-05, + "loss": 2.6167, + "step": 27536 + }, + { + "epoch": 2.494801929741116, + "grad_norm": 0.8377740383148193, + "learning_rate": 3.367365432247931e-05, + "loss": 1.8797, + "step": 27537 + }, + { + "epoch": 2.4948925279155625, + "grad_norm": 1.063681721687317, + "learning_rate": 3.366761312148856e-05, + "loss": 2.5318, + "step": 27538 + }, + { + "epoch": 2.494983126090009, + "grad_norm": 1.0521537065505981, + "learning_rate": 3.36615719204978e-05, + "loss": 2.6931, + "step": 27539 + }, + { + "epoch": 2.495073724264456, + "grad_norm": 0.8189927339553833, + "learning_rate": 3.365553071950704e-05, + "loss": 1.8609, + "step": 27540 + }, + { + "epoch": 2.495164322438903, + "grad_norm": 1.0710645914077759, + "learning_rate": 3.364948951851628e-05, + "loss": 2.5036, + "step": 27541 + }, + { + "epoch": 2.4952549206133496, + "grad_norm": 1.0530973672866821, + "learning_rate": 3.364344831752553e-05, + "loss": 2.5457, + "step": 27542 + }, + { + "epoch": 2.495345518787796, + "grad_norm": 0.987583339214325, + "learning_rate": 3.363740711653477e-05, + "loss": 1.8753, + "step": 27543 + }, + { + "epoch": 2.495436116962243, + "grad_norm": 1.0161371231079102, + "learning_rate": 3.363136591554401e-05, + "loss": 2.6187, + "step": 27544 + }, + { + "epoch": 2.49552671513669, + "grad_norm": 1.0164666175842285, + "learning_rate": 3.362532471455325e-05, + "loss": 2.4542, + "step": 27545 + }, + { + "epoch": 2.495617313311137, + "grad_norm": 0.9217551350593567, + "learning_rate": 3.36192835135625e-05, + "loss": 2.0459, + "step": 27546 + }, + { + "epoch": 2.4957079114855834, + "grad_norm": 0.9825547933578491, + "learning_rate": 3.361324231257174e-05, + "loss": 2.6081, + "step": 27547 + }, + { + "epoch": 2.4957985096600304, + "grad_norm": 0.914415180683136, + "learning_rate": 3.360720111158098e-05, + "loss": 2.0434, + "step": 27548 + }, + { + "epoch": 2.4958891078344774, + "grad_norm": 0.9972822070121765, + "learning_rate": 3.360115991059023e-05, + "loss": 2.6629, + "step": 27549 + }, + { + "epoch": 2.495979706008924, + "grad_norm": 1.0131453275680542, + "learning_rate": 3.359511870959947e-05, + "loss": 2.6221, + "step": 27550 + }, + { + "epoch": 2.4960703041833705, + "grad_norm": 1.143847107887268, + "learning_rate": 3.3589077508608716e-05, + "loss": 2.9221, + "step": 27551 + }, + { + "epoch": 2.4961609023578175, + "grad_norm": 1.0202006101608276, + "learning_rate": 3.358303630761796e-05, + "loss": 2.5537, + "step": 27552 + }, + { + "epoch": 2.4962515005322645, + "grad_norm": 0.9955464601516724, + "learning_rate": 3.3576995106627205e-05, + "loss": 2.5136, + "step": 27553 + }, + { + "epoch": 2.496342098706711, + "grad_norm": 1.0453671216964722, + "learning_rate": 3.3570953905636445e-05, + "loss": 2.4631, + "step": 27554 + }, + { + "epoch": 2.4964326968811577, + "grad_norm": 1.1135283708572388, + "learning_rate": 3.3564912704645686e-05, + "loss": 2.4801, + "step": 27555 + }, + { + "epoch": 2.4965232950556047, + "grad_norm": 1.1112065315246582, + "learning_rate": 3.355887150365493e-05, + "loss": 2.5885, + "step": 27556 + }, + { + "epoch": 2.4966138932300512, + "grad_norm": 1.0688928365707397, + "learning_rate": 3.3552830302664175e-05, + "loss": 2.5299, + "step": 27557 + }, + { + "epoch": 2.4967044914044982, + "grad_norm": 0.9527790546417236, + "learning_rate": 3.3546789101673415e-05, + "loss": 2.6925, + "step": 27558 + }, + { + "epoch": 2.496795089578945, + "grad_norm": 0.9664731621742249, + "learning_rate": 3.3540747900682656e-05, + "loss": 2.5443, + "step": 27559 + }, + { + "epoch": 2.496885687753392, + "grad_norm": 1.1598702669143677, + "learning_rate": 3.35347066996919e-05, + "loss": 2.7991, + "step": 27560 + }, + { + "epoch": 2.4969762859278384, + "grad_norm": 0.9320375323295593, + "learning_rate": 3.3528665498701145e-05, + "loss": 2.6645, + "step": 27561 + }, + { + "epoch": 2.4970668841022854, + "grad_norm": 1.0301433801651, + "learning_rate": 3.3522624297710386e-05, + "loss": 2.6788, + "step": 27562 + }, + { + "epoch": 2.497157482276732, + "grad_norm": 1.0910378694534302, + "learning_rate": 3.3516583096719626e-05, + "loss": 2.4992, + "step": 27563 + }, + { + "epoch": 2.497248080451179, + "grad_norm": 0.9272887706756592, + "learning_rate": 3.3510541895728874e-05, + "loss": 2.4864, + "step": 27564 + }, + { + "epoch": 2.4973386786256255, + "grad_norm": 1.0553609132766724, + "learning_rate": 3.3504500694738115e-05, + "loss": 2.6143, + "step": 27565 + }, + { + "epoch": 2.4974292768000725, + "grad_norm": 0.8948726654052734, + "learning_rate": 3.3498459493747356e-05, + "loss": 2.4443, + "step": 27566 + }, + { + "epoch": 2.497519874974519, + "grad_norm": 0.9834989905357361, + "learning_rate": 3.34924182927566e-05, + "loss": 2.6484, + "step": 27567 + }, + { + "epoch": 2.497610473148966, + "grad_norm": 0.8601419925689697, + "learning_rate": 3.3486377091765844e-05, + "loss": 1.7755, + "step": 27568 + }, + { + "epoch": 2.4977010713234127, + "grad_norm": 1.0005007982254028, + "learning_rate": 3.348033589077509e-05, + "loss": 2.5645, + "step": 27569 + }, + { + "epoch": 2.4977916694978597, + "grad_norm": 1.011488676071167, + "learning_rate": 3.347429468978433e-05, + "loss": 2.5963, + "step": 27570 + }, + { + "epoch": 2.4978822676723063, + "grad_norm": 1.0360571146011353, + "learning_rate": 3.346825348879357e-05, + "loss": 2.7115, + "step": 27571 + }, + { + "epoch": 2.4979728658467533, + "grad_norm": 1.0138239860534668, + "learning_rate": 3.346221228780282e-05, + "loss": 2.487, + "step": 27572 + }, + { + "epoch": 2.4980634640212, + "grad_norm": 0.9154064059257507, + "learning_rate": 3.345617108681206e-05, + "loss": 2.0572, + "step": 27573 + }, + { + "epoch": 2.498154062195647, + "grad_norm": 0.9796592593193054, + "learning_rate": 3.34501298858213e-05, + "loss": 2.6781, + "step": 27574 + }, + { + "epoch": 2.4982446603700934, + "grad_norm": 1.0425701141357422, + "learning_rate": 3.3444088684830543e-05, + "loss": 2.8568, + "step": 27575 + }, + { + "epoch": 2.4983352585445404, + "grad_norm": 0.984289288520813, + "learning_rate": 3.343804748383979e-05, + "loss": 2.5818, + "step": 27576 + }, + { + "epoch": 2.498425856718987, + "grad_norm": 1.0170128345489502, + "learning_rate": 3.343200628284903e-05, + "loss": 2.689, + "step": 27577 + }, + { + "epoch": 2.498516454893434, + "grad_norm": 1.0232266187667847, + "learning_rate": 3.342596508185827e-05, + "loss": 2.6384, + "step": 27578 + }, + { + "epoch": 2.4986070530678806, + "grad_norm": 0.8979201912879944, + "learning_rate": 3.341992388086752e-05, + "loss": 1.8634, + "step": 27579 + }, + { + "epoch": 2.4986976512423276, + "grad_norm": 1.0367647409439087, + "learning_rate": 3.341388267987676e-05, + "loss": 2.6111, + "step": 27580 + }, + { + "epoch": 2.498788249416774, + "grad_norm": 1.0311766862869263, + "learning_rate": 3.3407841478886e-05, + "loss": 2.8322, + "step": 27581 + }, + { + "epoch": 2.498878847591221, + "grad_norm": 0.9319051504135132, + "learning_rate": 3.340180027789524e-05, + "loss": 2.7779, + "step": 27582 + }, + { + "epoch": 2.4989694457656677, + "grad_norm": 1.0239249467849731, + "learning_rate": 3.339575907690449e-05, + "loss": 2.6869, + "step": 27583 + }, + { + "epoch": 2.4990600439401147, + "grad_norm": 0.9779316782951355, + "learning_rate": 3.338971787591373e-05, + "loss": 2.7458, + "step": 27584 + }, + { + "epoch": 2.4991506421145613, + "grad_norm": 1.0376938581466675, + "learning_rate": 3.338367667492298e-05, + "loss": 2.657, + "step": 27585 + }, + { + "epoch": 2.4992412402890083, + "grad_norm": 1.0591659545898438, + "learning_rate": 3.337763547393222e-05, + "loss": 2.7024, + "step": 27586 + }, + { + "epoch": 2.499331838463455, + "grad_norm": 1.0496838092803955, + "learning_rate": 3.337159427294147e-05, + "loss": 2.2232, + "step": 27587 + }, + { + "epoch": 2.499422436637902, + "grad_norm": 1.0895984172821045, + "learning_rate": 3.336555307195071e-05, + "loss": 2.7268, + "step": 27588 + }, + { + "epoch": 2.4995130348123484, + "grad_norm": 0.9989398121833801, + "learning_rate": 3.335951187095995e-05, + "loss": 2.6188, + "step": 27589 + }, + { + "epoch": 2.4996036329867954, + "grad_norm": 1.1604113578796387, + "learning_rate": 3.335347066996919e-05, + "loss": 2.459, + "step": 27590 + }, + { + "epoch": 2.499694231161242, + "grad_norm": 0.9474701285362244, + "learning_rate": 3.334742946897844e-05, + "loss": 2.6618, + "step": 27591 + }, + { + "epoch": 2.499784829335689, + "grad_norm": 0.9423670768737793, + "learning_rate": 3.334138826798768e-05, + "loss": 2.7387, + "step": 27592 + }, + { + "epoch": 2.4998754275101356, + "grad_norm": 1.1851800680160522, + "learning_rate": 3.333534706699692e-05, + "loss": 2.8198, + "step": 27593 + }, + { + "epoch": 2.4999660256845826, + "grad_norm": 0.985687255859375, + "learning_rate": 3.332930586600617e-05, + "loss": 2.5927, + "step": 27594 + }, + { + "epoch": 2.500056623859029, + "grad_norm": 0.9808753132820129, + "learning_rate": 3.332326466501541e-05, + "loss": 2.4929, + "step": 27595 + }, + { + "epoch": 2.500147222033476, + "grad_norm": 1.0811487436294556, + "learning_rate": 3.331722346402465e-05, + "loss": 2.6461, + "step": 27596 + }, + { + "epoch": 2.5002378202079227, + "grad_norm": 0.9965894222259521, + "learning_rate": 3.331118226303389e-05, + "loss": 2.5776, + "step": 27597 + }, + { + "epoch": 2.5003284183823697, + "grad_norm": 1.023468017578125, + "learning_rate": 3.330514106204314e-05, + "loss": 2.7055, + "step": 27598 + }, + { + "epoch": 2.5004190165568163, + "grad_norm": 0.9711852073669434, + "learning_rate": 3.329909986105238e-05, + "loss": 2.4267, + "step": 27599 + }, + { + "epoch": 2.5005096147312633, + "grad_norm": 0.9700057506561279, + "learning_rate": 3.329305866006162e-05, + "loss": 2.4517, + "step": 27600 + }, + { + "epoch": 2.50060021290571, + "grad_norm": 1.0172697305679321, + "learning_rate": 3.3287017459070866e-05, + "loss": 2.48, + "step": 27601 + }, + { + "epoch": 2.500690811080157, + "grad_norm": 1.0276730060577393, + "learning_rate": 3.328097625808011e-05, + "loss": 2.9045, + "step": 27602 + }, + { + "epoch": 2.5007814092546035, + "grad_norm": 0.9980876445770264, + "learning_rate": 3.3274935057089354e-05, + "loss": 2.6217, + "step": 27603 + }, + { + "epoch": 2.5008720074290505, + "grad_norm": 1.0318232774734497, + "learning_rate": 3.3268893856098595e-05, + "loss": 2.691, + "step": 27604 + }, + { + "epoch": 2.500962605603497, + "grad_norm": 1.0277533531188965, + "learning_rate": 3.3262852655107836e-05, + "loss": 2.4486, + "step": 27605 + }, + { + "epoch": 2.5010532037779436, + "grad_norm": 0.9802619814872742, + "learning_rate": 3.3256811454117084e-05, + "loss": 1.8892, + "step": 27606 + }, + { + "epoch": 2.5011438019523906, + "grad_norm": 1.060105562210083, + "learning_rate": 3.3250770253126325e-05, + "loss": 2.7439, + "step": 27607 + }, + { + "epoch": 2.5012344001268376, + "grad_norm": 1.0872520208358765, + "learning_rate": 3.3244729052135565e-05, + "loss": 2.6555, + "step": 27608 + }, + { + "epoch": 2.501324998301284, + "grad_norm": 1.010972499847412, + "learning_rate": 3.323868785114481e-05, + "loss": 2.7087, + "step": 27609 + }, + { + "epoch": 2.5014155964757308, + "grad_norm": 0.9884101748466492, + "learning_rate": 3.3232646650154054e-05, + "loss": 2.5629, + "step": 27610 + }, + { + "epoch": 2.5015061946501778, + "grad_norm": 0.8793758153915405, + "learning_rate": 3.3226605449163295e-05, + "loss": 2.3755, + "step": 27611 + }, + { + "epoch": 2.5015967928246248, + "grad_norm": 1.0987606048583984, + "learning_rate": 3.3220564248172535e-05, + "loss": 2.794, + "step": 27612 + }, + { + "epoch": 2.5016873909990713, + "grad_norm": 0.9464471340179443, + "learning_rate": 3.321452304718178e-05, + "loss": 2.5318, + "step": 27613 + }, + { + "epoch": 2.501777989173518, + "grad_norm": 1.0226424932479858, + "learning_rate": 3.3208481846191024e-05, + "loss": 2.3278, + "step": 27614 + }, + { + "epoch": 2.501868587347965, + "grad_norm": 1.0016428232192993, + "learning_rate": 3.3202440645200265e-05, + "loss": 2.7848, + "step": 27615 + }, + { + "epoch": 2.501959185522412, + "grad_norm": 0.9941603541374207, + "learning_rate": 3.3196399444209506e-05, + "loss": 2.6509, + "step": 27616 + }, + { + "epoch": 2.5020497836968585, + "grad_norm": 1.0194125175476074, + "learning_rate": 3.319035824321875e-05, + "loss": 2.7474, + "step": 27617 + }, + { + "epoch": 2.502140381871305, + "grad_norm": 1.040023922920227, + "learning_rate": 3.3184317042227994e-05, + "loss": 2.6167, + "step": 27618 + }, + { + "epoch": 2.502230980045752, + "grad_norm": 1.0116386413574219, + "learning_rate": 3.317827584123724e-05, + "loss": 2.7334, + "step": 27619 + }, + { + "epoch": 2.502321578220199, + "grad_norm": 0.9973770380020142, + "learning_rate": 3.317223464024648e-05, + "loss": 2.6124, + "step": 27620 + }, + { + "epoch": 2.5024121763946456, + "grad_norm": 0.9794725775718689, + "learning_rate": 3.316619343925573e-05, + "loss": 2.4396, + "step": 27621 + }, + { + "epoch": 2.502502774569092, + "grad_norm": 0.9920920133590698, + "learning_rate": 3.316015223826497e-05, + "loss": 2.4856, + "step": 27622 + }, + { + "epoch": 2.502593372743539, + "grad_norm": 1.0442430973052979, + "learning_rate": 3.315411103727421e-05, + "loss": 2.3536, + "step": 27623 + }, + { + "epoch": 2.5026839709179862, + "grad_norm": 1.2030162811279297, + "learning_rate": 3.314806983628346e-05, + "loss": 2.7073, + "step": 27624 + }, + { + "epoch": 2.502774569092433, + "grad_norm": 0.9900228977203369, + "learning_rate": 3.31420286352927e-05, + "loss": 2.7096, + "step": 27625 + }, + { + "epoch": 2.5028651672668794, + "grad_norm": 1.067720651626587, + "learning_rate": 3.313598743430194e-05, + "loss": 2.45, + "step": 27626 + }, + { + "epoch": 2.5029557654413264, + "grad_norm": 1.042359471321106, + "learning_rate": 3.312994623331118e-05, + "loss": 2.6628, + "step": 27627 + }, + { + "epoch": 2.5030463636157734, + "grad_norm": 1.1317468881607056, + "learning_rate": 3.312390503232043e-05, + "loss": 2.4078, + "step": 27628 + }, + { + "epoch": 2.50313696179022, + "grad_norm": 0.9842556118965149, + "learning_rate": 3.311786383132967e-05, + "loss": 2.8884, + "step": 27629 + }, + { + "epoch": 2.5032275599646665, + "grad_norm": 0.9851131439208984, + "learning_rate": 3.311182263033891e-05, + "loss": 2.4673, + "step": 27630 + }, + { + "epoch": 2.5033181581391135, + "grad_norm": 1.064497470855713, + "learning_rate": 3.310578142934815e-05, + "loss": 2.416, + "step": 27631 + }, + { + "epoch": 2.5034087563135605, + "grad_norm": 1.0550508499145508, + "learning_rate": 3.30997402283574e-05, + "loss": 2.367, + "step": 27632 + }, + { + "epoch": 2.503499354488007, + "grad_norm": 1.0646998882293701, + "learning_rate": 3.309369902736664e-05, + "loss": 2.625, + "step": 27633 + }, + { + "epoch": 2.5035899526624537, + "grad_norm": 1.058776617050171, + "learning_rate": 3.308765782637588e-05, + "loss": 2.484, + "step": 27634 + }, + { + "epoch": 2.5036805508369007, + "grad_norm": 0.9885905385017395, + "learning_rate": 3.308161662538513e-05, + "loss": 2.6291, + "step": 27635 + }, + { + "epoch": 2.5037711490113477, + "grad_norm": 0.8879197239875793, + "learning_rate": 3.307557542439437e-05, + "loss": 2.0857, + "step": 27636 + }, + { + "epoch": 2.5038617471857942, + "grad_norm": 0.9889165759086609, + "learning_rate": 3.306953422340362e-05, + "loss": 2.6253, + "step": 27637 + }, + { + "epoch": 2.503952345360241, + "grad_norm": 0.9723260402679443, + "learning_rate": 3.306349302241286e-05, + "loss": 2.1327, + "step": 27638 + }, + { + "epoch": 2.504042943534688, + "grad_norm": 0.9797160029411316, + "learning_rate": 3.3057451821422106e-05, + "loss": 2.6804, + "step": 27639 + }, + { + "epoch": 2.504133541709135, + "grad_norm": 1.0400062799453735, + "learning_rate": 3.3051410620431346e-05, + "loss": 2.535, + "step": 27640 + }, + { + "epoch": 2.5042241398835814, + "grad_norm": 1.012284517288208, + "learning_rate": 3.304536941944059e-05, + "loss": 2.7576, + "step": 27641 + }, + { + "epoch": 2.504314738058028, + "grad_norm": 1.0014877319335938, + "learning_rate": 3.303932821844983e-05, + "loss": 2.3723, + "step": 27642 + }, + { + "epoch": 2.504405336232475, + "grad_norm": 1.0331532955169678, + "learning_rate": 3.3033287017459076e-05, + "loss": 2.6385, + "step": 27643 + }, + { + "epoch": 2.504495934406922, + "grad_norm": 0.9879505038261414, + "learning_rate": 3.3027245816468317e-05, + "loss": 2.77, + "step": 27644 + }, + { + "epoch": 2.5045865325813685, + "grad_norm": 1.1544827222824097, + "learning_rate": 3.302120461547756e-05, + "loss": 2.5545, + "step": 27645 + }, + { + "epoch": 2.504677130755815, + "grad_norm": 0.8805121779441833, + "learning_rate": 3.30151634144868e-05, + "loss": 1.8526, + "step": 27646 + }, + { + "epoch": 2.504767728930262, + "grad_norm": 0.990354597568512, + "learning_rate": 3.3009122213496046e-05, + "loss": 2.6667, + "step": 27647 + }, + { + "epoch": 2.5048583271047087, + "grad_norm": 1.0410155057907104, + "learning_rate": 3.300308101250529e-05, + "loss": 2.512, + "step": 27648 + }, + { + "epoch": 2.5049489252791557, + "grad_norm": 0.8897408843040466, + "learning_rate": 3.299703981151453e-05, + "loss": 1.8705, + "step": 27649 + }, + { + "epoch": 2.5050395234536023, + "grad_norm": 1.1156939268112183, + "learning_rate": 3.299099861052377e-05, + "loss": 2.4993, + "step": 27650 + }, + { + "epoch": 2.5051301216280493, + "grad_norm": 0.9922839999198914, + "learning_rate": 3.2984957409533016e-05, + "loss": 2.6911, + "step": 27651 + }, + { + "epoch": 2.505220719802496, + "grad_norm": 1.029855489730835, + "learning_rate": 3.297891620854226e-05, + "loss": 2.5529, + "step": 27652 + }, + { + "epoch": 2.505311317976943, + "grad_norm": 1.0003297328948975, + "learning_rate": 3.2972875007551504e-05, + "loss": 2.4078, + "step": 27653 + }, + { + "epoch": 2.5054019161513894, + "grad_norm": 1.1206145286560059, + "learning_rate": 3.2966833806560745e-05, + "loss": 2.3362, + "step": 27654 + }, + { + "epoch": 2.5054925143258364, + "grad_norm": 1.1624616384506226, + "learning_rate": 3.296079260556999e-05, + "loss": 2.6347, + "step": 27655 + }, + { + "epoch": 2.505583112500283, + "grad_norm": 0.9517157077789307, + "learning_rate": 3.2954751404579234e-05, + "loss": 2.6003, + "step": 27656 + }, + { + "epoch": 2.50567371067473, + "grad_norm": 0.8881055116653442, + "learning_rate": 3.2948710203588474e-05, + "loss": 2.0269, + "step": 27657 + }, + { + "epoch": 2.5057643088491766, + "grad_norm": 1.030887484550476, + "learning_rate": 3.294266900259772e-05, + "loss": 2.8535, + "step": 27658 + }, + { + "epoch": 2.5058549070236236, + "grad_norm": 1.029808521270752, + "learning_rate": 3.293662780160696e-05, + "loss": 2.8329, + "step": 27659 + }, + { + "epoch": 2.50594550519807, + "grad_norm": 0.8234435319900513, + "learning_rate": 3.2930586600616204e-05, + "loss": 1.8838, + "step": 27660 + }, + { + "epoch": 2.506036103372517, + "grad_norm": 1.223894476890564, + "learning_rate": 3.2924545399625445e-05, + "loss": 2.3726, + "step": 27661 + }, + { + "epoch": 2.5061267015469637, + "grad_norm": 1.0610721111297607, + "learning_rate": 3.291850419863469e-05, + "loss": 2.6307, + "step": 27662 + }, + { + "epoch": 2.5062172997214107, + "grad_norm": 1.0025242567062378, + "learning_rate": 3.291246299764393e-05, + "loss": 2.5355, + "step": 27663 + }, + { + "epoch": 2.5063078978958573, + "grad_norm": 1.0759462118148804, + "learning_rate": 3.2906421796653174e-05, + "loss": 2.7615, + "step": 27664 + }, + { + "epoch": 2.5063984960703043, + "grad_norm": 0.917276918888092, + "learning_rate": 3.2900380595662415e-05, + "loss": 1.8625, + "step": 27665 + }, + { + "epoch": 2.506489094244751, + "grad_norm": 1.044822335243225, + "learning_rate": 3.289433939467166e-05, + "loss": 2.5313, + "step": 27666 + }, + { + "epoch": 2.506579692419198, + "grad_norm": 0.9789858460426331, + "learning_rate": 3.28882981936809e-05, + "loss": 2.815, + "step": 27667 + }, + { + "epoch": 2.5066702905936444, + "grad_norm": 0.9735116958618164, + "learning_rate": 3.2882256992690144e-05, + "loss": 2.4951, + "step": 27668 + }, + { + "epoch": 2.5067608887680914, + "grad_norm": 0.9764034152030945, + "learning_rate": 3.287621579169939e-05, + "loss": 2.7733, + "step": 27669 + }, + { + "epoch": 2.506851486942538, + "grad_norm": 0.8608284592628479, + "learning_rate": 3.287017459070863e-05, + "loss": 1.9361, + "step": 27670 + }, + { + "epoch": 2.506942085116985, + "grad_norm": 0.992238461971283, + "learning_rate": 3.286413338971788e-05, + "loss": 2.4834, + "step": 27671 + }, + { + "epoch": 2.5070326832914316, + "grad_norm": 1.030786395072937, + "learning_rate": 3.285809218872712e-05, + "loss": 2.8752, + "step": 27672 + }, + { + "epoch": 2.5071232814658786, + "grad_norm": 0.9570466876029968, + "learning_rate": 3.285205098773637e-05, + "loss": 2.7473, + "step": 27673 + }, + { + "epoch": 2.507213879640325, + "grad_norm": 0.8201632499694824, + "learning_rate": 3.284600978674561e-05, + "loss": 2.0174, + "step": 27674 + }, + { + "epoch": 2.507304477814772, + "grad_norm": 0.9057826399803162, + "learning_rate": 3.283996858575485e-05, + "loss": 1.997, + "step": 27675 + }, + { + "epoch": 2.5073950759892187, + "grad_norm": 0.8310666084289551, + "learning_rate": 3.283392738476409e-05, + "loss": 1.8574, + "step": 27676 + }, + { + "epoch": 2.5074856741636657, + "grad_norm": 1.0226924419403076, + "learning_rate": 3.282788618377334e-05, + "loss": 2.6549, + "step": 27677 + }, + { + "epoch": 2.5075762723381123, + "grad_norm": 1.0588829517364502, + "learning_rate": 3.282184498278258e-05, + "loss": 2.7235, + "step": 27678 + }, + { + "epoch": 2.5076668705125593, + "grad_norm": 0.9637079238891602, + "learning_rate": 3.281580378179182e-05, + "loss": 2.5228, + "step": 27679 + }, + { + "epoch": 2.507757468687006, + "grad_norm": 0.9428008198738098, + "learning_rate": 3.280976258080106e-05, + "loss": 2.3969, + "step": 27680 + }, + { + "epoch": 2.507848066861453, + "grad_norm": 1.0235968828201294, + "learning_rate": 3.280372137981031e-05, + "loss": 2.5207, + "step": 27681 + }, + { + "epoch": 2.5079386650358995, + "grad_norm": 0.9246338605880737, + "learning_rate": 3.279768017881955e-05, + "loss": 2.407, + "step": 27682 + }, + { + "epoch": 2.5080292632103465, + "grad_norm": 1.0682109594345093, + "learning_rate": 3.279163897782879e-05, + "loss": 3.0171, + "step": 27683 + }, + { + "epoch": 2.508119861384793, + "grad_norm": 0.9903019070625305, + "learning_rate": 3.278559777683804e-05, + "loss": 2.6229, + "step": 27684 + }, + { + "epoch": 2.50821045955924, + "grad_norm": 1.0843760967254639, + "learning_rate": 3.277955657584728e-05, + "loss": 2.723, + "step": 27685 + }, + { + "epoch": 2.5083010577336866, + "grad_norm": 1.0079647302627563, + "learning_rate": 3.277351537485652e-05, + "loss": 2.5236, + "step": 27686 + }, + { + "epoch": 2.5083916559081336, + "grad_norm": 1.2417701482772827, + "learning_rate": 3.276747417386577e-05, + "loss": 2.6885, + "step": 27687 + }, + { + "epoch": 2.50848225408258, + "grad_norm": 1.1082704067230225, + "learning_rate": 3.2761432972875015e-05, + "loss": 2.5049, + "step": 27688 + }, + { + "epoch": 2.5085728522570268, + "grad_norm": 0.9964885115623474, + "learning_rate": 3.2755391771884256e-05, + "loss": 2.5944, + "step": 27689 + }, + { + "epoch": 2.5086634504314738, + "grad_norm": 1.009381890296936, + "learning_rate": 3.2749350570893496e-05, + "loss": 2.6362, + "step": 27690 + }, + { + "epoch": 2.5087540486059208, + "grad_norm": 0.9967193007469177, + "learning_rate": 3.274330936990274e-05, + "loss": 2.9097, + "step": 27691 + }, + { + "epoch": 2.5088446467803673, + "grad_norm": 0.8927245736122131, + "learning_rate": 3.2737268168911985e-05, + "loss": 1.8374, + "step": 27692 + }, + { + "epoch": 2.508935244954814, + "grad_norm": 0.9505090117454529, + "learning_rate": 3.2731226967921226e-05, + "loss": 2.6782, + "step": 27693 + }, + { + "epoch": 2.509025843129261, + "grad_norm": 1.0573087930679321, + "learning_rate": 3.2725185766930466e-05, + "loss": 2.5801, + "step": 27694 + }, + { + "epoch": 2.509116441303708, + "grad_norm": 0.9817312359809875, + "learning_rate": 3.271914456593971e-05, + "loss": 2.9171, + "step": 27695 + }, + { + "epoch": 2.5092070394781545, + "grad_norm": 0.9968918561935425, + "learning_rate": 3.2713103364948955e-05, + "loss": 2.3946, + "step": 27696 + }, + { + "epoch": 2.509297637652601, + "grad_norm": 0.9519995450973511, + "learning_rate": 3.2707062163958196e-05, + "loss": 2.5598, + "step": 27697 + }, + { + "epoch": 2.509388235827048, + "grad_norm": 1.0185781717300415, + "learning_rate": 3.2701020962967437e-05, + "loss": 2.4114, + "step": 27698 + }, + { + "epoch": 2.509478834001495, + "grad_norm": 0.9496009945869446, + "learning_rate": 3.2694979761976684e-05, + "loss": 2.3798, + "step": 27699 + }, + { + "epoch": 2.5095694321759416, + "grad_norm": 1.0103857517242432, + "learning_rate": 3.2688938560985925e-05, + "loss": 2.6215, + "step": 27700 + }, + { + "epoch": 2.509660030350388, + "grad_norm": 1.1941142082214355, + "learning_rate": 3.2682897359995166e-05, + "loss": 2.699, + "step": 27701 + }, + { + "epoch": 2.509750628524835, + "grad_norm": 1.0549604892730713, + "learning_rate": 3.267685615900441e-05, + "loss": 2.6374, + "step": 27702 + }, + { + "epoch": 2.5098412266992822, + "grad_norm": 0.99135422706604, + "learning_rate": 3.2670814958013654e-05, + "loss": 2.601, + "step": 27703 + }, + { + "epoch": 2.509931824873729, + "grad_norm": 1.049094319343567, + "learning_rate": 3.2664773757022895e-05, + "loss": 2.6652, + "step": 27704 + }, + { + "epoch": 2.5100224230481754, + "grad_norm": 0.9664808511734009, + "learning_rate": 3.265873255603214e-05, + "loss": 2.6699, + "step": 27705 + }, + { + "epoch": 2.5101130212226224, + "grad_norm": 1.1924629211425781, + "learning_rate": 3.2652691355041384e-05, + "loss": 2.5114, + "step": 27706 + }, + { + "epoch": 2.5102036193970694, + "grad_norm": 1.0247176885604858, + "learning_rate": 3.264665015405063e-05, + "loss": 2.5236, + "step": 27707 + }, + { + "epoch": 2.510294217571516, + "grad_norm": 1.0361928939819336, + "learning_rate": 3.264060895305987e-05, + "loss": 2.6855, + "step": 27708 + }, + { + "epoch": 2.5103848157459625, + "grad_norm": 0.870244026184082, + "learning_rate": 3.263456775206911e-05, + "loss": 1.9775, + "step": 27709 + }, + { + "epoch": 2.5104754139204095, + "grad_norm": 0.8519884347915649, + "learning_rate": 3.2628526551078354e-05, + "loss": 2.0271, + "step": 27710 + }, + { + "epoch": 2.5105660120948565, + "grad_norm": 0.9986400604248047, + "learning_rate": 3.26224853500876e-05, + "loss": 2.7566, + "step": 27711 + }, + { + "epoch": 2.510656610269303, + "grad_norm": 1.0334420204162598, + "learning_rate": 3.261644414909684e-05, + "loss": 2.0125, + "step": 27712 + }, + { + "epoch": 2.5107472084437497, + "grad_norm": 1.0270073413848877, + "learning_rate": 3.261040294810608e-05, + "loss": 2.6414, + "step": 27713 + }, + { + "epoch": 2.5108378066181967, + "grad_norm": 1.0707560777664185, + "learning_rate": 3.260436174711533e-05, + "loss": 2.6664, + "step": 27714 + }, + { + "epoch": 2.5109284047926437, + "grad_norm": 0.9673702716827393, + "learning_rate": 3.259832054612457e-05, + "loss": 2.6602, + "step": 27715 + }, + { + "epoch": 2.5110190029670902, + "grad_norm": 1.0345144271850586, + "learning_rate": 3.259227934513381e-05, + "loss": 2.7001, + "step": 27716 + }, + { + "epoch": 2.511109601141537, + "grad_norm": 1.0410932302474976, + "learning_rate": 3.258623814414305e-05, + "loss": 2.7651, + "step": 27717 + }, + { + "epoch": 2.511200199315984, + "grad_norm": 1.0012781620025635, + "learning_rate": 3.25801969431523e-05, + "loss": 2.7062, + "step": 27718 + }, + { + "epoch": 2.511290797490431, + "grad_norm": 0.8812430500984192, + "learning_rate": 3.257415574216154e-05, + "loss": 1.8485, + "step": 27719 + }, + { + "epoch": 2.5113813956648774, + "grad_norm": 0.9587365388870239, + "learning_rate": 3.256811454117078e-05, + "loss": 2.4209, + "step": 27720 + }, + { + "epoch": 2.511471993839324, + "grad_norm": 1.0389372110366821, + "learning_rate": 3.256207334018003e-05, + "loss": 2.6441, + "step": 27721 + }, + { + "epoch": 2.511562592013771, + "grad_norm": 1.1714636087417603, + "learning_rate": 3.255603213918928e-05, + "loss": 2.8094, + "step": 27722 + }, + { + "epoch": 2.511653190188218, + "grad_norm": 1.0019875764846802, + "learning_rate": 3.254999093819852e-05, + "loss": 2.8571, + "step": 27723 + }, + { + "epoch": 2.5117437883626645, + "grad_norm": 0.8888403177261353, + "learning_rate": 3.254394973720776e-05, + "loss": 1.9427, + "step": 27724 + }, + { + "epoch": 2.511834386537111, + "grad_norm": 1.0604829788208008, + "learning_rate": 3.2537908536217e-05, + "loss": 2.6, + "step": 27725 + }, + { + "epoch": 2.511924984711558, + "grad_norm": 0.9581702351570129, + "learning_rate": 3.253186733522625e-05, + "loss": 2.3736, + "step": 27726 + }, + { + "epoch": 2.5120155828860047, + "grad_norm": 1.0179766416549683, + "learning_rate": 3.252582613423549e-05, + "loss": 2.5325, + "step": 27727 + }, + { + "epoch": 2.5121061810604517, + "grad_norm": 0.9788075089454651, + "learning_rate": 3.251978493324473e-05, + "loss": 2.6192, + "step": 27728 + }, + { + "epoch": 2.5121967792348983, + "grad_norm": 1.0920777320861816, + "learning_rate": 3.251374373225398e-05, + "loss": 2.3967, + "step": 27729 + }, + { + "epoch": 2.5122873774093453, + "grad_norm": 1.0401387214660645, + "learning_rate": 3.250770253126322e-05, + "loss": 2.499, + "step": 27730 + }, + { + "epoch": 2.512377975583792, + "grad_norm": 1.032505989074707, + "learning_rate": 3.250166133027246e-05, + "loss": 2.8761, + "step": 27731 + }, + { + "epoch": 2.512468573758239, + "grad_norm": 1.1034846305847168, + "learning_rate": 3.24956201292817e-05, + "loss": 2.5215, + "step": 27732 + }, + { + "epoch": 2.5125591719326854, + "grad_norm": 1.0644872188568115, + "learning_rate": 3.248957892829095e-05, + "loss": 2.4775, + "step": 27733 + }, + { + "epoch": 2.5126497701071324, + "grad_norm": 1.1406043767929077, + "learning_rate": 3.248353772730019e-05, + "loss": 2.6913, + "step": 27734 + }, + { + "epoch": 2.512740368281579, + "grad_norm": 1.0558403730392456, + "learning_rate": 3.247749652630943e-05, + "loss": 2.4217, + "step": 27735 + }, + { + "epoch": 2.512830966456026, + "grad_norm": 0.9914082884788513, + "learning_rate": 3.247145532531867e-05, + "loss": 2.7973, + "step": 27736 + }, + { + "epoch": 2.5129215646304726, + "grad_norm": 1.0138163566589355, + "learning_rate": 3.246541412432792e-05, + "loss": 2.5435, + "step": 27737 + }, + { + "epoch": 2.5130121628049196, + "grad_norm": 0.9837296009063721, + "learning_rate": 3.2459372923337165e-05, + "loss": 1.8875, + "step": 27738 + }, + { + "epoch": 2.513102760979366, + "grad_norm": 1.0499509572982788, + "learning_rate": 3.2453331722346405e-05, + "loss": 2.6181, + "step": 27739 + }, + { + "epoch": 2.513193359153813, + "grad_norm": 0.9683258533477783, + "learning_rate": 3.2447290521355646e-05, + "loss": 2.578, + "step": 27740 + }, + { + "epoch": 2.5132839573282597, + "grad_norm": 0.9182964563369751, + "learning_rate": 3.2441249320364894e-05, + "loss": 2.4942, + "step": 27741 + }, + { + "epoch": 2.5133745555027067, + "grad_norm": 1.0333894491195679, + "learning_rate": 3.2435208119374135e-05, + "loss": 2.3921, + "step": 27742 + }, + { + "epoch": 2.5134651536771533, + "grad_norm": 1.0560160875320435, + "learning_rate": 3.2429166918383376e-05, + "loss": 2.5888, + "step": 27743 + }, + { + "epoch": 2.5135557518516003, + "grad_norm": 1.070266604423523, + "learning_rate": 3.242312571739262e-05, + "loss": 2.7031, + "step": 27744 + }, + { + "epoch": 2.513646350026047, + "grad_norm": 1.014907956123352, + "learning_rate": 3.2417084516401864e-05, + "loss": 2.6009, + "step": 27745 + }, + { + "epoch": 2.513736948200494, + "grad_norm": 1.0375587940216064, + "learning_rate": 3.2411043315411105e-05, + "loss": 2.4591, + "step": 27746 + }, + { + "epoch": 2.5138275463749404, + "grad_norm": 1.004060983657837, + "learning_rate": 3.2405002114420346e-05, + "loss": 2.4288, + "step": 27747 + }, + { + "epoch": 2.5139181445493874, + "grad_norm": 1.0504478216171265, + "learning_rate": 3.239896091342959e-05, + "loss": 2.7524, + "step": 27748 + }, + { + "epoch": 2.514008742723834, + "grad_norm": 0.9116109013557434, + "learning_rate": 3.2392919712438834e-05, + "loss": 1.8137, + "step": 27749 + }, + { + "epoch": 2.514099340898281, + "grad_norm": 1.0372834205627441, + "learning_rate": 3.2386878511448075e-05, + "loss": 2.5869, + "step": 27750 + }, + { + "epoch": 2.5141899390727276, + "grad_norm": 1.0382795333862305, + "learning_rate": 3.2380837310457316e-05, + "loss": 2.615, + "step": 27751 + }, + { + "epoch": 2.5142805372471746, + "grad_norm": 1.0300434827804565, + "learning_rate": 3.237479610946656e-05, + "loss": 2.7901, + "step": 27752 + }, + { + "epoch": 2.514371135421621, + "grad_norm": 1.0607061386108398, + "learning_rate": 3.2368754908475804e-05, + "loss": 2.6895, + "step": 27753 + }, + { + "epoch": 2.514461733596068, + "grad_norm": 1.080600380897522, + "learning_rate": 3.236271370748505e-05, + "loss": 2.6441, + "step": 27754 + }, + { + "epoch": 2.5145523317705147, + "grad_norm": 1.0237197875976562, + "learning_rate": 3.235667250649429e-05, + "loss": 2.55, + "step": 27755 + }, + { + "epoch": 2.5146429299449617, + "grad_norm": 0.9946667551994324, + "learning_rate": 3.235063130550354e-05, + "loss": 2.5124, + "step": 27756 + }, + { + "epoch": 2.5147335281194083, + "grad_norm": 1.0086066722869873, + "learning_rate": 3.234459010451278e-05, + "loss": 2.5483, + "step": 27757 + }, + { + "epoch": 2.5148241262938553, + "grad_norm": 0.9880803823471069, + "learning_rate": 3.233854890352202e-05, + "loss": 2.5942, + "step": 27758 + }, + { + "epoch": 2.514914724468302, + "grad_norm": 1.136169672012329, + "learning_rate": 3.233250770253127e-05, + "loss": 2.8108, + "step": 27759 + }, + { + "epoch": 2.515005322642749, + "grad_norm": 0.8806406259536743, + "learning_rate": 3.232646650154051e-05, + "loss": 2.0818, + "step": 27760 + }, + { + "epoch": 2.5150959208171955, + "grad_norm": 0.9723238348960876, + "learning_rate": 3.232042530054975e-05, + "loss": 2.687, + "step": 27761 + }, + { + "epoch": 2.5151865189916425, + "grad_norm": 1.062362790107727, + "learning_rate": 3.231438409955899e-05, + "loss": 2.8552, + "step": 27762 + }, + { + "epoch": 2.515277117166089, + "grad_norm": 1.127951979637146, + "learning_rate": 3.230834289856824e-05, + "loss": 2.7243, + "step": 27763 + }, + { + "epoch": 2.515367715340536, + "grad_norm": 0.9731011986732483, + "learning_rate": 3.230230169757748e-05, + "loss": 1.9809, + "step": 27764 + }, + { + "epoch": 2.5154583135149826, + "grad_norm": 0.9828032851219177, + "learning_rate": 3.229626049658672e-05, + "loss": 2.9314, + "step": 27765 + }, + { + "epoch": 2.5155489116894296, + "grad_norm": 0.9912621974945068, + "learning_rate": 3.229021929559596e-05, + "loss": 2.6933, + "step": 27766 + }, + { + "epoch": 2.515639509863876, + "grad_norm": 1.0261847972869873, + "learning_rate": 3.228417809460521e-05, + "loss": 2.8696, + "step": 27767 + }, + { + "epoch": 2.5157301080383228, + "grad_norm": 0.8631082773208618, + "learning_rate": 3.227813689361445e-05, + "loss": 1.865, + "step": 27768 + }, + { + "epoch": 2.5158207062127698, + "grad_norm": 1.1366562843322754, + "learning_rate": 3.227209569262369e-05, + "loss": 2.4407, + "step": 27769 + }, + { + "epoch": 2.5159113043872168, + "grad_norm": 0.9995933175086975, + "learning_rate": 3.226605449163293e-05, + "loss": 2.5218, + "step": 27770 + }, + { + "epoch": 2.5160019025616633, + "grad_norm": 1.1567652225494385, + "learning_rate": 3.226001329064218e-05, + "loss": 2.5792, + "step": 27771 + }, + { + "epoch": 2.51609250073611, + "grad_norm": 1.0703537464141846, + "learning_rate": 3.225397208965143e-05, + "loss": 2.4181, + "step": 27772 + }, + { + "epoch": 2.516183098910557, + "grad_norm": 1.2346217632293701, + "learning_rate": 3.224793088866067e-05, + "loss": 2.6823, + "step": 27773 + }, + { + "epoch": 2.516273697085004, + "grad_norm": 0.8714615106582642, + "learning_rate": 3.2241889687669916e-05, + "loss": 1.9698, + "step": 27774 + }, + { + "epoch": 2.5163642952594505, + "grad_norm": 1.0252790451049805, + "learning_rate": 3.2235848486679157e-05, + "loss": 2.7073, + "step": 27775 + }, + { + "epoch": 2.516454893433897, + "grad_norm": 1.0247242450714111, + "learning_rate": 3.22298072856884e-05, + "loss": 2.5025, + "step": 27776 + }, + { + "epoch": 2.516545491608344, + "grad_norm": 1.1556525230407715, + "learning_rate": 3.222376608469764e-05, + "loss": 2.3716, + "step": 27777 + }, + { + "epoch": 2.516636089782791, + "grad_norm": 1.1026719808578491, + "learning_rate": 3.2217724883706886e-05, + "loss": 2.5289, + "step": 27778 + }, + { + "epoch": 2.5167266879572376, + "grad_norm": 1.1941322088241577, + "learning_rate": 3.221168368271613e-05, + "loss": 2.6631, + "step": 27779 + }, + { + "epoch": 2.516817286131684, + "grad_norm": 1.093363881111145, + "learning_rate": 3.220564248172537e-05, + "loss": 2.5911, + "step": 27780 + }, + { + "epoch": 2.516907884306131, + "grad_norm": 0.9741074442863464, + "learning_rate": 3.219960128073461e-05, + "loss": 2.3974, + "step": 27781 + }, + { + "epoch": 2.5169984824805782, + "grad_norm": 1.0292381048202515, + "learning_rate": 3.2193560079743856e-05, + "loss": 2.6012, + "step": 27782 + }, + { + "epoch": 2.517089080655025, + "grad_norm": 0.9738761782646179, + "learning_rate": 3.21875188787531e-05, + "loss": 1.9284, + "step": 27783 + }, + { + "epoch": 2.5171796788294714, + "grad_norm": 0.9100828170776367, + "learning_rate": 3.218147767776234e-05, + "loss": 2.6068, + "step": 27784 + }, + { + "epoch": 2.5172702770039184, + "grad_norm": 0.9496859312057495, + "learning_rate": 3.217543647677158e-05, + "loss": 2.3214, + "step": 27785 + }, + { + "epoch": 2.5173608751783654, + "grad_norm": 1.003645896911621, + "learning_rate": 3.2169395275780826e-05, + "loss": 2.6769, + "step": 27786 + }, + { + "epoch": 2.517451473352812, + "grad_norm": 1.0733996629714966, + "learning_rate": 3.216335407479007e-05, + "loss": 2.6099, + "step": 27787 + }, + { + "epoch": 2.5175420715272585, + "grad_norm": 0.8603838682174683, + "learning_rate": 3.2157312873799314e-05, + "loss": 2.234, + "step": 27788 + }, + { + "epoch": 2.5176326697017055, + "grad_norm": 1.0602672100067139, + "learning_rate": 3.2151271672808555e-05, + "loss": 2.7577, + "step": 27789 + }, + { + "epoch": 2.5177232678761525, + "grad_norm": 1.0449546575546265, + "learning_rate": 3.21452304718178e-05, + "loss": 2.6182, + "step": 27790 + }, + { + "epoch": 2.517813866050599, + "grad_norm": 1.0024476051330566, + "learning_rate": 3.2139189270827044e-05, + "loss": 2.8038, + "step": 27791 + }, + { + "epoch": 2.5179044642250457, + "grad_norm": 0.9844589233398438, + "learning_rate": 3.2133148069836285e-05, + "loss": 2.8099, + "step": 27792 + }, + { + "epoch": 2.5179950623994927, + "grad_norm": 1.0322166681289673, + "learning_rate": 3.212710686884553e-05, + "loss": 2.6265, + "step": 27793 + }, + { + "epoch": 2.5180856605739397, + "grad_norm": 0.9679667353630066, + "learning_rate": 3.212106566785477e-05, + "loss": 2.6792, + "step": 27794 + }, + { + "epoch": 2.5181762587483862, + "grad_norm": 0.9971910715103149, + "learning_rate": 3.2115024466864014e-05, + "loss": 2.6449, + "step": 27795 + }, + { + "epoch": 2.518266856922833, + "grad_norm": 0.9773272275924683, + "learning_rate": 3.2108983265873255e-05, + "loss": 2.539, + "step": 27796 + }, + { + "epoch": 2.51835745509728, + "grad_norm": 1.0620296001434326, + "learning_rate": 3.21029420648825e-05, + "loss": 2.6709, + "step": 27797 + }, + { + "epoch": 2.518448053271727, + "grad_norm": 1.0044865608215332, + "learning_rate": 3.209690086389174e-05, + "loss": 2.5617, + "step": 27798 + }, + { + "epoch": 2.5185386514461734, + "grad_norm": 1.019270420074463, + "learning_rate": 3.2090859662900984e-05, + "loss": 2.6272, + "step": 27799 + }, + { + "epoch": 2.51862924962062, + "grad_norm": 1.0277137756347656, + "learning_rate": 3.2084818461910225e-05, + "loss": 2.5995, + "step": 27800 + }, + { + "epoch": 2.518719847795067, + "grad_norm": 0.9067503213882446, + "learning_rate": 3.207877726091947e-05, + "loss": 1.9269, + "step": 27801 + }, + { + "epoch": 2.518810445969514, + "grad_norm": 1.0199077129364014, + "learning_rate": 3.207273605992871e-05, + "loss": 2.7665, + "step": 27802 + }, + { + "epoch": 2.5189010441439605, + "grad_norm": 1.0021644830703735, + "learning_rate": 3.2066694858937954e-05, + "loss": 2.4743, + "step": 27803 + }, + { + "epoch": 2.518991642318407, + "grad_norm": 0.9352452754974365, + "learning_rate": 3.20606536579472e-05, + "loss": 2.4715, + "step": 27804 + }, + { + "epoch": 2.519082240492854, + "grad_norm": 0.9505132436752319, + "learning_rate": 3.205461245695644e-05, + "loss": 2.6041, + "step": 27805 + }, + { + "epoch": 2.519172838667301, + "grad_norm": 1.0939024686813354, + "learning_rate": 3.204857125596569e-05, + "loss": 2.6686, + "step": 27806 + }, + { + "epoch": 2.5192634368417477, + "grad_norm": 1.0360170602798462, + "learning_rate": 3.204253005497493e-05, + "loss": 3.0762, + "step": 27807 + }, + { + "epoch": 2.5193540350161943, + "grad_norm": 0.9629941582679749, + "learning_rate": 3.203648885398418e-05, + "loss": 2.4719, + "step": 27808 + }, + { + "epoch": 2.5194446331906413, + "grad_norm": 0.9998902082443237, + "learning_rate": 3.203044765299342e-05, + "loss": 2.6131, + "step": 27809 + }, + { + "epoch": 2.519535231365088, + "grad_norm": 1.0246717929840088, + "learning_rate": 3.202440645200266e-05, + "loss": 2.6113, + "step": 27810 + }, + { + "epoch": 2.519625829539535, + "grad_norm": 0.9349135756492615, + "learning_rate": 3.20183652510119e-05, + "loss": 2.0157, + "step": 27811 + }, + { + "epoch": 2.5197164277139814, + "grad_norm": 0.9814527630805969, + "learning_rate": 3.201232405002115e-05, + "loss": 2.7036, + "step": 27812 + }, + { + "epoch": 2.5198070258884284, + "grad_norm": 1.067328691482544, + "learning_rate": 3.200628284903039e-05, + "loss": 2.4791, + "step": 27813 + }, + { + "epoch": 2.519897624062875, + "grad_norm": 1.036312460899353, + "learning_rate": 3.200024164803963e-05, + "loss": 2.4051, + "step": 27814 + }, + { + "epoch": 2.519988222237322, + "grad_norm": 0.9847208261489868, + "learning_rate": 3.199420044704887e-05, + "loss": 2.8095, + "step": 27815 + }, + { + "epoch": 2.5200788204117686, + "grad_norm": 1.0132520198822021, + "learning_rate": 3.198815924605812e-05, + "loss": 2.7066, + "step": 27816 + }, + { + "epoch": 2.5201694185862156, + "grad_norm": 0.979019820690155, + "learning_rate": 3.198211804506736e-05, + "loss": 2.5301, + "step": 27817 + }, + { + "epoch": 2.520260016760662, + "grad_norm": 1.045721173286438, + "learning_rate": 3.19760768440766e-05, + "loss": 2.423, + "step": 27818 + }, + { + "epoch": 2.520350614935109, + "grad_norm": 1.00835382938385, + "learning_rate": 3.197003564308585e-05, + "loss": 2.645, + "step": 27819 + }, + { + "epoch": 2.5204412131095557, + "grad_norm": 1.0367711782455444, + "learning_rate": 3.196399444209509e-05, + "loss": 2.9774, + "step": 27820 + }, + { + "epoch": 2.5205318112840027, + "grad_norm": 0.9855578541755676, + "learning_rate": 3.195795324110433e-05, + "loss": 2.7473, + "step": 27821 + }, + { + "epoch": 2.5206224094584493, + "grad_norm": 1.0227538347244263, + "learning_rate": 3.195191204011358e-05, + "loss": 2.4779, + "step": 27822 + }, + { + "epoch": 2.5207130076328963, + "grad_norm": 1.006934404373169, + "learning_rate": 3.194587083912282e-05, + "loss": 2.8556, + "step": 27823 + }, + { + "epoch": 2.520803605807343, + "grad_norm": 0.9967920780181885, + "learning_rate": 3.1939829638132066e-05, + "loss": 2.5834, + "step": 27824 + }, + { + "epoch": 2.52089420398179, + "grad_norm": 0.9984090924263, + "learning_rate": 3.1933788437141307e-05, + "loss": 2.7625, + "step": 27825 + }, + { + "epoch": 2.5209848021562364, + "grad_norm": 0.9255580902099609, + "learning_rate": 3.192774723615055e-05, + "loss": 2.1037, + "step": 27826 + }, + { + "epoch": 2.5210754003306834, + "grad_norm": 1.1000702381134033, + "learning_rate": 3.1921706035159795e-05, + "loss": 2.6369, + "step": 27827 + }, + { + "epoch": 2.52116599850513, + "grad_norm": 1.0669870376586914, + "learning_rate": 3.1915664834169036e-05, + "loss": 2.4376, + "step": 27828 + }, + { + "epoch": 2.521256596679577, + "grad_norm": 0.9523299932479858, + "learning_rate": 3.1909623633178277e-05, + "loss": 2.5988, + "step": 27829 + }, + { + "epoch": 2.5213471948540236, + "grad_norm": 1.1592743396759033, + "learning_rate": 3.190358243218752e-05, + "loss": 2.5552, + "step": 27830 + }, + { + "epoch": 2.5214377930284706, + "grad_norm": 1.0736746788024902, + "learning_rate": 3.1897541231196765e-05, + "loss": 2.3633, + "step": 27831 + }, + { + "epoch": 2.521528391202917, + "grad_norm": 1.0539932250976562, + "learning_rate": 3.1891500030206006e-05, + "loss": 2.8254, + "step": 27832 + }, + { + "epoch": 2.521618989377364, + "grad_norm": 0.9931973218917847, + "learning_rate": 3.188545882921525e-05, + "loss": 2.5443, + "step": 27833 + }, + { + "epoch": 2.5217095875518107, + "grad_norm": 1.0637316703796387, + "learning_rate": 3.1879417628224494e-05, + "loss": 2.7108, + "step": 27834 + }, + { + "epoch": 2.5218001857262577, + "grad_norm": 1.0691642761230469, + "learning_rate": 3.1873376427233735e-05, + "loss": 2.6109, + "step": 27835 + }, + { + "epoch": 2.5218907839007043, + "grad_norm": 0.9986197352409363, + "learning_rate": 3.1867335226242976e-05, + "loss": 2.6581, + "step": 27836 + }, + { + "epoch": 2.5219813820751513, + "grad_norm": 1.0081639289855957, + "learning_rate": 3.186129402525222e-05, + "loss": 2.5047, + "step": 27837 + }, + { + "epoch": 2.522071980249598, + "grad_norm": 0.9211063981056213, + "learning_rate": 3.1855252824261464e-05, + "loss": 1.7577, + "step": 27838 + }, + { + "epoch": 2.522162578424045, + "grad_norm": 1.0596359968185425, + "learning_rate": 3.1849211623270705e-05, + "loss": 2.7059, + "step": 27839 + }, + { + "epoch": 2.5222531765984915, + "grad_norm": 1.0647518634796143, + "learning_rate": 3.184317042227995e-05, + "loss": 2.511, + "step": 27840 + }, + { + "epoch": 2.5223437747729385, + "grad_norm": 1.012990951538086, + "learning_rate": 3.1837129221289194e-05, + "loss": 2.6439, + "step": 27841 + }, + { + "epoch": 2.522434372947385, + "grad_norm": 0.9937217831611633, + "learning_rate": 3.183108802029844e-05, + "loss": 2.8556, + "step": 27842 + }, + { + "epoch": 2.522524971121832, + "grad_norm": 0.9466270804405212, + "learning_rate": 3.182504681930768e-05, + "loss": 2.6089, + "step": 27843 + }, + { + "epoch": 2.5226155692962786, + "grad_norm": 1.008934497833252, + "learning_rate": 3.181900561831692e-05, + "loss": 2.7255, + "step": 27844 + }, + { + "epoch": 2.5227061674707256, + "grad_norm": 0.9397927522659302, + "learning_rate": 3.181296441732617e-05, + "loss": 2.8385, + "step": 27845 + }, + { + "epoch": 2.522796765645172, + "grad_norm": 1.0176877975463867, + "learning_rate": 3.180692321633541e-05, + "loss": 2.5514, + "step": 27846 + }, + { + "epoch": 2.522887363819619, + "grad_norm": 0.9967371821403503, + "learning_rate": 3.180088201534465e-05, + "loss": 2.4583, + "step": 27847 + }, + { + "epoch": 2.5229779619940658, + "grad_norm": 1.0093659162521362, + "learning_rate": 3.179484081435389e-05, + "loss": 2.6331, + "step": 27848 + }, + { + "epoch": 2.5230685601685128, + "grad_norm": 1.033589482307434, + "learning_rate": 3.178879961336314e-05, + "loss": 2.8188, + "step": 27849 + }, + { + "epoch": 2.5231591583429593, + "grad_norm": 0.949713945388794, + "learning_rate": 3.178275841237238e-05, + "loss": 2.3774, + "step": 27850 + }, + { + "epoch": 2.523249756517406, + "grad_norm": 1.0315814018249512, + "learning_rate": 3.177671721138162e-05, + "loss": 2.6571, + "step": 27851 + }, + { + "epoch": 2.523340354691853, + "grad_norm": 0.8411725163459778, + "learning_rate": 3.177067601039086e-05, + "loss": 2.036, + "step": 27852 + }, + { + "epoch": 2.5234309528663, + "grad_norm": 0.9800949096679688, + "learning_rate": 3.176463480940011e-05, + "loss": 2.6171, + "step": 27853 + }, + { + "epoch": 2.5235215510407465, + "grad_norm": 0.9787057638168335, + "learning_rate": 3.175859360840935e-05, + "loss": 2.5909, + "step": 27854 + }, + { + "epoch": 2.523612149215193, + "grad_norm": 1.1197973489761353, + "learning_rate": 3.175255240741859e-05, + "loss": 2.3228, + "step": 27855 + }, + { + "epoch": 2.52370274738964, + "grad_norm": 1.0714035034179688, + "learning_rate": 3.174651120642784e-05, + "loss": 2.682, + "step": 27856 + }, + { + "epoch": 2.523793345564087, + "grad_norm": 1.0079271793365479, + "learning_rate": 3.174047000543708e-05, + "loss": 2.6797, + "step": 27857 + }, + { + "epoch": 2.5238839437385336, + "grad_norm": 1.1327112913131714, + "learning_rate": 3.173442880444633e-05, + "loss": 2.7063, + "step": 27858 + }, + { + "epoch": 2.52397454191298, + "grad_norm": 1.0202689170837402, + "learning_rate": 3.172838760345557e-05, + "loss": 2.7162, + "step": 27859 + }, + { + "epoch": 2.524065140087427, + "grad_norm": 0.9319111704826355, + "learning_rate": 3.172234640246482e-05, + "loss": 2.5858, + "step": 27860 + }, + { + "epoch": 2.5241557382618742, + "grad_norm": 1.0316600799560547, + "learning_rate": 3.171630520147406e-05, + "loss": 2.7855, + "step": 27861 + }, + { + "epoch": 2.524246336436321, + "grad_norm": 0.9509663581848145, + "learning_rate": 3.17102640004833e-05, + "loss": 2.5797, + "step": 27862 + }, + { + "epoch": 2.5243369346107674, + "grad_norm": 1.0277554988861084, + "learning_rate": 3.170422279949254e-05, + "loss": 2.4298, + "step": 27863 + }, + { + "epoch": 2.5244275327852144, + "grad_norm": 1.0537149906158447, + "learning_rate": 3.169818159850179e-05, + "loss": 2.4902, + "step": 27864 + }, + { + "epoch": 2.5245181309596614, + "grad_norm": 1.018640398979187, + "learning_rate": 3.169214039751103e-05, + "loss": 2.7468, + "step": 27865 + }, + { + "epoch": 2.524608729134108, + "grad_norm": 0.8467735052108765, + "learning_rate": 3.168609919652027e-05, + "loss": 2.0601, + "step": 27866 + }, + { + "epoch": 2.5246993273085545, + "grad_norm": 0.9415774345397949, + "learning_rate": 3.168005799552951e-05, + "loss": 2.5136, + "step": 27867 + }, + { + "epoch": 2.5247899254830015, + "grad_norm": 0.9943996667861938, + "learning_rate": 3.167401679453876e-05, + "loss": 2.6415, + "step": 27868 + }, + { + "epoch": 2.5248805236574485, + "grad_norm": 1.0246424674987793, + "learning_rate": 3.1667975593548e-05, + "loss": 1.7042, + "step": 27869 + }, + { + "epoch": 2.524971121831895, + "grad_norm": 0.9510373473167419, + "learning_rate": 3.166193439255724e-05, + "loss": 2.4964, + "step": 27870 + }, + { + "epoch": 2.5250617200063417, + "grad_norm": 0.9915392994880676, + "learning_rate": 3.165589319156648e-05, + "loss": 2.5995, + "step": 27871 + }, + { + "epoch": 2.5251523181807887, + "grad_norm": 0.9590114951133728, + "learning_rate": 3.164985199057573e-05, + "loss": 2.7262, + "step": 27872 + }, + { + "epoch": 2.5252429163552357, + "grad_norm": 1.0225673913955688, + "learning_rate": 3.164381078958497e-05, + "loss": 2.4971, + "step": 27873 + }, + { + "epoch": 2.5253335145296822, + "grad_norm": 1.0372016429901123, + "learning_rate": 3.1637769588594216e-05, + "loss": 2.6403, + "step": 27874 + }, + { + "epoch": 2.525424112704129, + "grad_norm": 0.9406722784042358, + "learning_rate": 3.163172838760346e-05, + "loss": 2.3231, + "step": 27875 + }, + { + "epoch": 2.525514710878576, + "grad_norm": 0.9607231020927429, + "learning_rate": 3.1625687186612704e-05, + "loss": 2.5955, + "step": 27876 + }, + { + "epoch": 2.525605309053023, + "grad_norm": 1.0083866119384766, + "learning_rate": 3.1619645985621945e-05, + "loss": 2.7146, + "step": 27877 + }, + { + "epoch": 2.5256959072274694, + "grad_norm": 0.961741030216217, + "learning_rate": 3.1613604784631186e-05, + "loss": 2.6168, + "step": 27878 + }, + { + "epoch": 2.525786505401916, + "grad_norm": 1.0647568702697754, + "learning_rate": 3.160756358364043e-05, + "loss": 2.7014, + "step": 27879 + }, + { + "epoch": 2.525877103576363, + "grad_norm": 1.1064633131027222, + "learning_rate": 3.1601522382649674e-05, + "loss": 2.8646, + "step": 27880 + }, + { + "epoch": 2.52596770175081, + "grad_norm": 1.1099566221237183, + "learning_rate": 3.1595481181658915e-05, + "loss": 2.7733, + "step": 27881 + }, + { + "epoch": 2.5260582999252565, + "grad_norm": 1.015801191329956, + "learning_rate": 3.1589439980668156e-05, + "loss": 2.6541, + "step": 27882 + }, + { + "epoch": 2.526148898099703, + "grad_norm": 1.0531281232833862, + "learning_rate": 3.15833987796774e-05, + "loss": 2.5069, + "step": 27883 + }, + { + "epoch": 2.52623949627415, + "grad_norm": 0.9864223599433899, + "learning_rate": 3.1577357578686644e-05, + "loss": 2.4041, + "step": 27884 + }, + { + "epoch": 2.526330094448597, + "grad_norm": 0.9803937673568726, + "learning_rate": 3.1571316377695885e-05, + "loss": 2.7054, + "step": 27885 + }, + { + "epoch": 2.5264206926230437, + "grad_norm": 0.9742732048034668, + "learning_rate": 3.1565275176705126e-05, + "loss": 2.4526, + "step": 27886 + }, + { + "epoch": 2.5265112907974903, + "grad_norm": 0.8834808468818665, + "learning_rate": 3.1559233975714373e-05, + "loss": 2.1616, + "step": 27887 + }, + { + "epoch": 2.5266018889719373, + "grad_norm": 1.0425710678100586, + "learning_rate": 3.1553192774723614e-05, + "loss": 2.5606, + "step": 27888 + }, + { + "epoch": 2.526692487146384, + "grad_norm": 1.0418559312820435, + "learning_rate": 3.1547151573732855e-05, + "loss": 2.7727, + "step": 27889 + }, + { + "epoch": 2.526783085320831, + "grad_norm": 1.1473335027694702, + "learning_rate": 3.15411103727421e-05, + "loss": 2.4366, + "step": 27890 + }, + { + "epoch": 2.5268736834952774, + "grad_norm": 1.0629147291183472, + "learning_rate": 3.1535069171751344e-05, + "loss": 2.4537, + "step": 27891 + }, + { + "epoch": 2.5269642816697244, + "grad_norm": 0.9924492835998535, + "learning_rate": 3.152902797076059e-05, + "loss": 2.7533, + "step": 27892 + }, + { + "epoch": 2.527054879844171, + "grad_norm": 1.121718406677246, + "learning_rate": 3.152298676976983e-05, + "loss": 2.4876, + "step": 27893 + }, + { + "epoch": 2.527145478018618, + "grad_norm": 0.8462633490562439, + "learning_rate": 3.151694556877908e-05, + "loss": 1.9335, + "step": 27894 + }, + { + "epoch": 2.5272360761930646, + "grad_norm": 1.0347907543182373, + "learning_rate": 3.151090436778832e-05, + "loss": 2.6314, + "step": 27895 + }, + { + "epoch": 2.5273266743675116, + "grad_norm": 1.091894268989563, + "learning_rate": 3.150486316679756e-05, + "loss": 2.8342, + "step": 27896 + }, + { + "epoch": 2.527417272541958, + "grad_norm": 1.1594336032867432, + "learning_rate": 3.14988219658068e-05, + "loss": 2.6952, + "step": 27897 + }, + { + "epoch": 2.527507870716405, + "grad_norm": 0.8646700382232666, + "learning_rate": 3.149278076481605e-05, + "loss": 1.7999, + "step": 27898 + }, + { + "epoch": 2.5275984688908517, + "grad_norm": 1.087796688079834, + "learning_rate": 3.148673956382529e-05, + "loss": 2.6328, + "step": 27899 + }, + { + "epoch": 2.5276890670652987, + "grad_norm": 1.0494986772537231, + "learning_rate": 3.148069836283453e-05, + "loss": 2.4433, + "step": 27900 + }, + { + "epoch": 2.5277796652397453, + "grad_norm": 0.990834653377533, + "learning_rate": 3.147465716184377e-05, + "loss": 2.5872, + "step": 27901 + }, + { + "epoch": 2.5278702634141923, + "grad_norm": 0.9897601008415222, + "learning_rate": 3.146861596085302e-05, + "loss": 2.4807, + "step": 27902 + }, + { + "epoch": 2.527960861588639, + "grad_norm": 1.0070184469223022, + "learning_rate": 3.146257475986226e-05, + "loss": 2.4036, + "step": 27903 + }, + { + "epoch": 2.528051459763086, + "grad_norm": 1.171865701675415, + "learning_rate": 3.14565335588715e-05, + "loss": 2.3034, + "step": 27904 + }, + { + "epoch": 2.5281420579375324, + "grad_norm": 1.002251148223877, + "learning_rate": 3.145049235788075e-05, + "loss": 2.5848, + "step": 27905 + }, + { + "epoch": 2.5282326561119794, + "grad_norm": 0.9081368446350098, + "learning_rate": 3.144445115688999e-05, + "loss": 2.0538, + "step": 27906 + }, + { + "epoch": 2.528323254286426, + "grad_norm": 0.806614100933075, + "learning_rate": 3.143840995589923e-05, + "loss": 1.8974, + "step": 27907 + }, + { + "epoch": 2.528413852460873, + "grad_norm": 0.8695312738418579, + "learning_rate": 3.143236875490848e-05, + "loss": 1.9553, + "step": 27908 + }, + { + "epoch": 2.5285044506353196, + "grad_norm": 0.9148284792900085, + "learning_rate": 3.1426327553917726e-05, + "loss": 1.9935, + "step": 27909 + }, + { + "epoch": 2.5285950488097666, + "grad_norm": 1.0105117559432983, + "learning_rate": 3.142028635292697e-05, + "loss": 2.6267, + "step": 27910 + }, + { + "epoch": 2.528685646984213, + "grad_norm": 0.9921701550483704, + "learning_rate": 3.141424515193621e-05, + "loss": 1.7962, + "step": 27911 + }, + { + "epoch": 2.52877624515866, + "grad_norm": 1.0197502374649048, + "learning_rate": 3.140820395094545e-05, + "loss": 1.8627, + "step": 27912 + }, + { + "epoch": 2.5288668433331067, + "grad_norm": 1.048256754875183, + "learning_rate": 3.1402162749954696e-05, + "loss": 2.6332, + "step": 27913 + }, + { + "epoch": 2.5289574415075537, + "grad_norm": 0.7475205659866333, + "learning_rate": 3.139612154896394e-05, + "loss": 1.3361, + "step": 27914 + }, + { + "epoch": 2.5290480396820003, + "grad_norm": 1.0027265548706055, + "learning_rate": 3.139008034797318e-05, + "loss": 2.6462, + "step": 27915 + }, + { + "epoch": 2.5291386378564473, + "grad_norm": 1.0325289964675903, + "learning_rate": 3.138403914698242e-05, + "loss": 2.7469, + "step": 27916 + }, + { + "epoch": 2.529229236030894, + "grad_norm": 1.0767874717712402, + "learning_rate": 3.1377997945991666e-05, + "loss": 2.488, + "step": 27917 + }, + { + "epoch": 2.529319834205341, + "grad_norm": 0.9585093855857849, + "learning_rate": 3.137195674500091e-05, + "loss": 2.5758, + "step": 27918 + }, + { + "epoch": 2.5294104323797875, + "grad_norm": 1.1408942937850952, + "learning_rate": 3.136591554401015e-05, + "loss": 2.9049, + "step": 27919 + }, + { + "epoch": 2.5295010305542345, + "grad_norm": 0.9586656093597412, + "learning_rate": 3.1359874343019395e-05, + "loss": 2.5763, + "step": 27920 + }, + { + "epoch": 2.529591628728681, + "grad_norm": 1.0696943998336792, + "learning_rate": 3.1353833142028636e-05, + "loss": 2.7821, + "step": 27921 + }, + { + "epoch": 2.529682226903128, + "grad_norm": 1.0280262231826782, + "learning_rate": 3.134779194103788e-05, + "loss": 2.5225, + "step": 27922 + }, + { + "epoch": 2.5297728250775746, + "grad_norm": 1.025492787361145, + "learning_rate": 3.134175074004712e-05, + "loss": 2.4629, + "step": 27923 + }, + { + "epoch": 2.5298634232520216, + "grad_norm": 1.0325123071670532, + "learning_rate": 3.1335709539056365e-05, + "loss": 2.9005, + "step": 27924 + }, + { + "epoch": 2.529954021426468, + "grad_norm": 1.0123194456100464, + "learning_rate": 3.132966833806561e-05, + "loss": 2.6868, + "step": 27925 + }, + { + "epoch": 2.530044619600915, + "grad_norm": 1.1700412034988403, + "learning_rate": 3.1323627137074854e-05, + "loss": 2.5993, + "step": 27926 + }, + { + "epoch": 2.5301352177753618, + "grad_norm": 1.0537261962890625, + "learning_rate": 3.1317585936084095e-05, + "loss": 2.4111, + "step": 27927 + }, + { + "epoch": 2.5302258159498088, + "grad_norm": 1.032569169998169, + "learning_rate": 3.131154473509334e-05, + "loss": 2.5657, + "step": 27928 + }, + { + "epoch": 2.5303164141242553, + "grad_norm": 1.1321746110916138, + "learning_rate": 3.130550353410258e-05, + "loss": 2.6729, + "step": 27929 + }, + { + "epoch": 2.530407012298702, + "grad_norm": 1.1104708909988403, + "learning_rate": 3.1299462333111824e-05, + "loss": 2.5369, + "step": 27930 + }, + { + "epoch": 2.530497610473149, + "grad_norm": 0.9704563021659851, + "learning_rate": 3.1293421132121065e-05, + "loss": 2.4825, + "step": 27931 + }, + { + "epoch": 2.530588208647596, + "grad_norm": 1.058717131614685, + "learning_rate": 3.128737993113031e-05, + "loss": 2.4797, + "step": 27932 + }, + { + "epoch": 2.5306788068220425, + "grad_norm": 1.054203987121582, + "learning_rate": 3.128133873013955e-05, + "loss": 2.7233, + "step": 27933 + }, + { + "epoch": 2.530769404996489, + "grad_norm": 0.9866122007369995, + "learning_rate": 3.1275297529148794e-05, + "loss": 2.4083, + "step": 27934 + }, + { + "epoch": 2.530860003170936, + "grad_norm": 0.9880120754241943, + "learning_rate": 3.126925632815804e-05, + "loss": 2.5822, + "step": 27935 + }, + { + "epoch": 2.530950601345383, + "grad_norm": 1.0276731252670288, + "learning_rate": 3.126321512716728e-05, + "loss": 2.5836, + "step": 27936 + }, + { + "epoch": 2.5310411995198296, + "grad_norm": 1.085072636604309, + "learning_rate": 3.125717392617652e-05, + "loss": 2.8171, + "step": 27937 + }, + { + "epoch": 2.531131797694276, + "grad_norm": 0.9544817805290222, + "learning_rate": 3.1251132725185764e-05, + "loss": 2.4218, + "step": 27938 + }, + { + "epoch": 2.531222395868723, + "grad_norm": 1.0584725141525269, + "learning_rate": 3.124509152419501e-05, + "loss": 2.514, + "step": 27939 + }, + { + "epoch": 2.5313129940431702, + "grad_norm": 1.0780357122421265, + "learning_rate": 3.123905032320425e-05, + "loss": 2.7402, + "step": 27940 + }, + { + "epoch": 2.531403592217617, + "grad_norm": 1.04417085647583, + "learning_rate": 3.12330091222135e-05, + "loss": 2.5688, + "step": 27941 + }, + { + "epoch": 2.5314941903920634, + "grad_norm": 1.0196261405944824, + "learning_rate": 3.122696792122274e-05, + "loss": 2.6698, + "step": 27942 + }, + { + "epoch": 2.5315847885665104, + "grad_norm": 1.059329867362976, + "learning_rate": 3.122092672023199e-05, + "loss": 2.6385, + "step": 27943 + }, + { + "epoch": 2.5316753867409574, + "grad_norm": 1.0126808881759644, + "learning_rate": 3.121488551924123e-05, + "loss": 2.3927, + "step": 27944 + }, + { + "epoch": 2.531765984915404, + "grad_norm": 0.9819774031639099, + "learning_rate": 3.120884431825047e-05, + "loss": 2.7223, + "step": 27945 + }, + { + "epoch": 2.5318565830898505, + "grad_norm": 1.07384192943573, + "learning_rate": 3.120280311725971e-05, + "loss": 2.6797, + "step": 27946 + }, + { + "epoch": 2.5319471812642975, + "grad_norm": 1.038372278213501, + "learning_rate": 3.119676191626896e-05, + "loss": 2.8969, + "step": 27947 + }, + { + "epoch": 2.5320377794387445, + "grad_norm": 1.1127232313156128, + "learning_rate": 3.11907207152782e-05, + "loss": 2.6335, + "step": 27948 + }, + { + "epoch": 2.532128377613191, + "grad_norm": 1.0514757633209229, + "learning_rate": 3.118467951428744e-05, + "loss": 2.6915, + "step": 27949 + }, + { + "epoch": 2.5322189757876377, + "grad_norm": 0.954721987247467, + "learning_rate": 3.117863831329669e-05, + "loss": 2.3983, + "step": 27950 + }, + { + "epoch": 2.5323095739620847, + "grad_norm": 1.0266015529632568, + "learning_rate": 3.117259711230593e-05, + "loss": 2.5115, + "step": 27951 + }, + { + "epoch": 2.5324001721365317, + "grad_norm": 0.9948520660400391, + "learning_rate": 3.116655591131517e-05, + "loss": 2.6622, + "step": 27952 + }, + { + "epoch": 2.5324907703109782, + "grad_norm": 1.0669480562210083, + "learning_rate": 3.116051471032441e-05, + "loss": 2.5619, + "step": 27953 + }, + { + "epoch": 2.532581368485425, + "grad_norm": 0.9699499607086182, + "learning_rate": 3.115447350933366e-05, + "loss": 2.4654, + "step": 27954 + }, + { + "epoch": 2.532671966659872, + "grad_norm": 1.041367530822754, + "learning_rate": 3.11484323083429e-05, + "loss": 2.3266, + "step": 27955 + }, + { + "epoch": 2.532762564834319, + "grad_norm": 0.8584282398223877, + "learning_rate": 3.114239110735214e-05, + "loss": 2.089, + "step": 27956 + }, + { + "epoch": 2.5328531630087654, + "grad_norm": 1.0487550497055054, + "learning_rate": 3.113634990636138e-05, + "loss": 2.6129, + "step": 27957 + }, + { + "epoch": 2.532943761183212, + "grad_norm": 0.986311674118042, + "learning_rate": 3.113030870537063e-05, + "loss": 2.6705, + "step": 27958 + }, + { + "epoch": 2.533034359357659, + "grad_norm": 1.0443322658538818, + "learning_rate": 3.1124267504379876e-05, + "loss": 2.6627, + "step": 27959 + }, + { + "epoch": 2.533124957532106, + "grad_norm": 1.1004093885421753, + "learning_rate": 3.111822630338912e-05, + "loss": 2.636, + "step": 27960 + }, + { + "epoch": 2.5332155557065525, + "grad_norm": 1.0323925018310547, + "learning_rate": 3.111218510239836e-05, + "loss": 2.6065, + "step": 27961 + }, + { + "epoch": 2.533306153880999, + "grad_norm": 1.0189458131790161, + "learning_rate": 3.1106143901407605e-05, + "loss": 2.6296, + "step": 27962 + }, + { + "epoch": 2.533396752055446, + "grad_norm": 1.003721833229065, + "learning_rate": 3.1100102700416846e-05, + "loss": 2.4662, + "step": 27963 + }, + { + "epoch": 2.533487350229893, + "grad_norm": 1.003250241279602, + "learning_rate": 3.109406149942609e-05, + "loss": 2.7176, + "step": 27964 + }, + { + "epoch": 2.5335779484043397, + "grad_norm": 0.9936999082565308, + "learning_rate": 3.1088020298435334e-05, + "loss": 2.6407, + "step": 27965 + }, + { + "epoch": 2.5336685465787863, + "grad_norm": 1.0174282789230347, + "learning_rate": 3.1081979097444575e-05, + "loss": 2.4305, + "step": 27966 + }, + { + "epoch": 2.5337591447532333, + "grad_norm": 0.9946056604385376, + "learning_rate": 3.1075937896453816e-05, + "loss": 2.6017, + "step": 27967 + }, + { + "epoch": 2.5338497429276803, + "grad_norm": 1.0402954816818237, + "learning_rate": 3.106989669546306e-05, + "loss": 2.5539, + "step": 27968 + }, + { + "epoch": 2.533940341102127, + "grad_norm": 1.0410678386688232, + "learning_rate": 3.1063855494472304e-05, + "loss": 2.4298, + "step": 27969 + }, + { + "epoch": 2.5340309392765734, + "grad_norm": 0.9543876051902771, + "learning_rate": 3.1057814293481545e-05, + "loss": 2.4436, + "step": 27970 + }, + { + "epoch": 2.5341215374510204, + "grad_norm": 1.0144144296646118, + "learning_rate": 3.1051773092490786e-05, + "loss": 2.6011, + "step": 27971 + }, + { + "epoch": 2.534212135625467, + "grad_norm": 1.0732359886169434, + "learning_rate": 3.104573189150003e-05, + "loss": 2.5759, + "step": 27972 + }, + { + "epoch": 2.534302733799914, + "grad_norm": 0.9981427788734436, + "learning_rate": 3.1039690690509275e-05, + "loss": 2.601, + "step": 27973 + }, + { + "epoch": 2.5343933319743606, + "grad_norm": 1.033430814743042, + "learning_rate": 3.1033649489518515e-05, + "loss": 2.7417, + "step": 27974 + }, + { + "epoch": 2.5344839301488076, + "grad_norm": 0.9637141227722168, + "learning_rate": 3.102760828852776e-05, + "loss": 2.561, + "step": 27975 + }, + { + "epoch": 2.534574528323254, + "grad_norm": 1.0066744089126587, + "learning_rate": 3.1021567087537004e-05, + "loss": 2.5952, + "step": 27976 + }, + { + "epoch": 2.534665126497701, + "grad_norm": 1.125330924987793, + "learning_rate": 3.101552588654625e-05, + "loss": 2.5767, + "step": 27977 + }, + { + "epoch": 2.5347557246721477, + "grad_norm": 1.011897087097168, + "learning_rate": 3.100948468555549e-05, + "loss": 2.5748, + "step": 27978 + }, + { + "epoch": 2.5348463228465947, + "grad_norm": 1.0362948179244995, + "learning_rate": 3.100344348456473e-05, + "loss": 2.8662, + "step": 27979 + }, + { + "epoch": 2.5349369210210413, + "grad_norm": 1.0341682434082031, + "learning_rate": 3.099740228357398e-05, + "loss": 2.7232, + "step": 27980 + }, + { + "epoch": 2.5350275191954883, + "grad_norm": 1.0157181024551392, + "learning_rate": 3.099136108258322e-05, + "loss": 2.7305, + "step": 27981 + }, + { + "epoch": 2.535118117369935, + "grad_norm": 1.0286270380020142, + "learning_rate": 3.098531988159246e-05, + "loss": 2.5472, + "step": 27982 + }, + { + "epoch": 2.535208715544382, + "grad_norm": 1.014508605003357, + "learning_rate": 3.09792786806017e-05, + "loss": 2.5763, + "step": 27983 + }, + { + "epoch": 2.5352993137188284, + "grad_norm": 1.0319552421569824, + "learning_rate": 3.097323747961095e-05, + "loss": 2.6954, + "step": 27984 + }, + { + "epoch": 2.5353899118932754, + "grad_norm": 1.0435707569122314, + "learning_rate": 3.096719627862019e-05, + "loss": 2.6868, + "step": 27985 + }, + { + "epoch": 2.535480510067722, + "grad_norm": 1.0421943664550781, + "learning_rate": 3.096115507762943e-05, + "loss": 2.9612, + "step": 27986 + }, + { + "epoch": 2.535571108242169, + "grad_norm": 1.05232834815979, + "learning_rate": 3.095511387663867e-05, + "loss": 2.6379, + "step": 27987 + }, + { + "epoch": 2.5356617064166156, + "grad_norm": 1.0340251922607422, + "learning_rate": 3.094907267564792e-05, + "loss": 2.7117, + "step": 27988 + }, + { + "epoch": 2.5357523045910626, + "grad_norm": 1.076103925704956, + "learning_rate": 3.094303147465716e-05, + "loss": 2.7441, + "step": 27989 + }, + { + "epoch": 2.535842902765509, + "grad_norm": 1.0514965057373047, + "learning_rate": 3.09369902736664e-05, + "loss": 2.528, + "step": 27990 + }, + { + "epoch": 2.535933500939956, + "grad_norm": 1.0455461740493774, + "learning_rate": 3.093094907267565e-05, + "loss": 2.9023, + "step": 27991 + }, + { + "epoch": 2.5360240991144027, + "grad_norm": 1.05954110622406, + "learning_rate": 3.092490787168489e-05, + "loss": 2.4807, + "step": 27992 + }, + { + "epoch": 2.5361146972888498, + "grad_norm": 1.0304324626922607, + "learning_rate": 3.091886667069414e-05, + "loss": 2.6827, + "step": 27993 + }, + { + "epoch": 2.5362052954632963, + "grad_norm": 1.0222721099853516, + "learning_rate": 3.091282546970338e-05, + "loss": 2.607, + "step": 27994 + }, + { + "epoch": 2.5362958936377433, + "grad_norm": 1.0185213088989258, + "learning_rate": 3.090678426871263e-05, + "loss": 2.6927, + "step": 27995 + }, + { + "epoch": 2.53638649181219, + "grad_norm": 1.05205237865448, + "learning_rate": 3.090074306772187e-05, + "loss": 2.5324, + "step": 27996 + }, + { + "epoch": 2.536477089986637, + "grad_norm": 0.9733694195747375, + "learning_rate": 3.089470186673111e-05, + "loss": 2.9362, + "step": 27997 + }, + { + "epoch": 2.5365676881610835, + "grad_norm": 0.9830822944641113, + "learning_rate": 3.088866066574035e-05, + "loss": 2.3502, + "step": 27998 + }, + { + "epoch": 2.5366582863355305, + "grad_norm": 1.0388203859329224, + "learning_rate": 3.08826194647496e-05, + "loss": 2.6297, + "step": 27999 + }, + { + "epoch": 2.536748884509977, + "grad_norm": 0.87519770860672, + "learning_rate": 3.087657826375884e-05, + "loss": 1.7983, + "step": 28000 + }, + { + "epoch": 2.536839482684424, + "grad_norm": 0.9593524932861328, + "learning_rate": 3.087053706276808e-05, + "loss": 2.5042, + "step": 28001 + }, + { + "epoch": 2.5369300808588706, + "grad_norm": 1.0250335931777954, + "learning_rate": 3.086449586177732e-05, + "loss": 2.5849, + "step": 28002 + }, + { + "epoch": 2.5370206790333176, + "grad_norm": 0.7920342087745667, + "learning_rate": 3.085845466078657e-05, + "loss": 1.4413, + "step": 28003 + }, + { + "epoch": 2.537111277207764, + "grad_norm": 0.9742071628570557, + "learning_rate": 3.085241345979581e-05, + "loss": 2.4516, + "step": 28004 + }, + { + "epoch": 2.537201875382211, + "grad_norm": 1.058195948600769, + "learning_rate": 3.084637225880505e-05, + "loss": 2.5336, + "step": 28005 + }, + { + "epoch": 2.5372924735566578, + "grad_norm": 1.0209015607833862, + "learning_rate": 3.084033105781429e-05, + "loss": 2.7083, + "step": 28006 + }, + { + "epoch": 2.5373830717311048, + "grad_norm": 1.0117028951644897, + "learning_rate": 3.083428985682354e-05, + "loss": 2.607, + "step": 28007 + }, + { + "epoch": 2.5374736699055513, + "grad_norm": 0.9954072833061218, + "learning_rate": 3.082824865583278e-05, + "loss": 2.7407, + "step": 28008 + }, + { + "epoch": 2.5375642680799984, + "grad_norm": 1.0595048666000366, + "learning_rate": 3.0822207454842026e-05, + "loss": 2.6722, + "step": 28009 + }, + { + "epoch": 2.537654866254445, + "grad_norm": 1.0064653158187866, + "learning_rate": 3.0816166253851267e-05, + "loss": 2.6773, + "step": 28010 + }, + { + "epoch": 2.537745464428892, + "grad_norm": 0.9512438178062439, + "learning_rate": 3.0810125052860514e-05, + "loss": 1.7861, + "step": 28011 + }, + { + "epoch": 2.5378360626033385, + "grad_norm": 0.9475626349449158, + "learning_rate": 3.0804083851869755e-05, + "loss": 2.3742, + "step": 28012 + }, + { + "epoch": 2.537926660777785, + "grad_norm": 1.0528920888900757, + "learning_rate": 3.0798042650878996e-05, + "loss": 2.3384, + "step": 28013 + }, + { + "epoch": 2.538017258952232, + "grad_norm": 1.0105575323104858, + "learning_rate": 3.0792001449888243e-05, + "loss": 2.5926, + "step": 28014 + }, + { + "epoch": 2.538107857126679, + "grad_norm": 1.0231026411056519, + "learning_rate": 3.0785960248897484e-05, + "loss": 2.5036, + "step": 28015 + }, + { + "epoch": 2.5381984553011256, + "grad_norm": 0.9710310101509094, + "learning_rate": 3.0779919047906725e-05, + "loss": 2.4788, + "step": 28016 + }, + { + "epoch": 2.538289053475572, + "grad_norm": 0.9969069361686707, + "learning_rate": 3.0773877846915966e-05, + "loss": 2.5376, + "step": 28017 + }, + { + "epoch": 2.538379651650019, + "grad_norm": 1.2859888076782227, + "learning_rate": 3.0767836645925214e-05, + "loss": 2.6196, + "step": 28018 + }, + { + "epoch": 2.5384702498244662, + "grad_norm": 0.9635736346244812, + "learning_rate": 3.0761795444934454e-05, + "loss": 2.5797, + "step": 28019 + }, + { + "epoch": 2.538560847998913, + "grad_norm": 1.0814679861068726, + "learning_rate": 3.0755754243943695e-05, + "loss": 2.8594, + "step": 28020 + }, + { + "epoch": 2.5386514461733594, + "grad_norm": 1.0359835624694824, + "learning_rate": 3.0749713042952936e-05, + "loss": 2.5153, + "step": 28021 + }, + { + "epoch": 2.5387420443478064, + "grad_norm": 0.9890246391296387, + "learning_rate": 3.0743671841962184e-05, + "loss": 2.5199, + "step": 28022 + }, + { + "epoch": 2.5388326425222534, + "grad_norm": 0.8725444078445435, + "learning_rate": 3.0737630640971424e-05, + "loss": 2.0221, + "step": 28023 + }, + { + "epoch": 2.5389232406967, + "grad_norm": 1.0475202798843384, + "learning_rate": 3.0731589439980665e-05, + "loss": 2.5697, + "step": 28024 + }, + { + "epoch": 2.5390138388711465, + "grad_norm": 0.9767577648162842, + "learning_rate": 3.072554823898991e-05, + "loss": 1.8036, + "step": 28025 + }, + { + "epoch": 2.5391044370455935, + "grad_norm": 1.0074717998504639, + "learning_rate": 3.0719507037999154e-05, + "loss": 2.6472, + "step": 28026 + }, + { + "epoch": 2.5391950352200405, + "grad_norm": 1.126904010772705, + "learning_rate": 3.07134658370084e-05, + "loss": 2.3092, + "step": 28027 + }, + { + "epoch": 2.539285633394487, + "grad_norm": 0.903751015663147, + "learning_rate": 3.070742463601764e-05, + "loss": 1.8094, + "step": 28028 + }, + { + "epoch": 2.5393762315689337, + "grad_norm": 0.9582602977752686, + "learning_rate": 3.070138343502689e-05, + "loss": 2.7015, + "step": 28029 + }, + { + "epoch": 2.5394668297433807, + "grad_norm": 0.9831751585006714, + "learning_rate": 3.069534223403613e-05, + "loss": 2.5847, + "step": 28030 + }, + { + "epoch": 2.5395574279178277, + "grad_norm": 0.9457942843437195, + "learning_rate": 3.068930103304537e-05, + "loss": 2.5886, + "step": 28031 + }, + { + "epoch": 2.5396480260922742, + "grad_norm": 1.0620334148406982, + "learning_rate": 3.068325983205461e-05, + "loss": 2.5391, + "step": 28032 + }, + { + "epoch": 2.539738624266721, + "grad_norm": 0.9696319699287415, + "learning_rate": 3.067721863106386e-05, + "loss": 2.5814, + "step": 28033 + }, + { + "epoch": 2.539829222441168, + "grad_norm": 0.9754471778869629, + "learning_rate": 3.06711774300731e-05, + "loss": 2.8339, + "step": 28034 + }, + { + "epoch": 2.539919820615615, + "grad_norm": 0.9874913692474365, + "learning_rate": 3.066513622908234e-05, + "loss": 2.7689, + "step": 28035 + }, + { + "epoch": 2.5400104187900614, + "grad_norm": 0.8518908023834229, + "learning_rate": 3.065909502809158e-05, + "loss": 1.8701, + "step": 28036 + }, + { + "epoch": 2.540101016964508, + "grad_norm": 0.858862578868866, + "learning_rate": 3.065305382710083e-05, + "loss": 1.8974, + "step": 28037 + }, + { + "epoch": 2.540191615138955, + "grad_norm": 1.0152751207351685, + "learning_rate": 3.064701262611007e-05, + "loss": 2.783, + "step": 28038 + }, + { + "epoch": 2.540282213313402, + "grad_norm": 1.2359763383865356, + "learning_rate": 3.064097142511931e-05, + "loss": 2.5849, + "step": 28039 + }, + { + "epoch": 2.5403728114878485, + "grad_norm": 1.0510908365249634, + "learning_rate": 3.063493022412856e-05, + "loss": 2.5945, + "step": 28040 + }, + { + "epoch": 2.540463409662295, + "grad_norm": 1.0631799697875977, + "learning_rate": 3.06288890231378e-05, + "loss": 2.5224, + "step": 28041 + }, + { + "epoch": 2.540554007836742, + "grad_norm": 1.0225768089294434, + "learning_rate": 3.062284782214704e-05, + "loss": 2.6266, + "step": 28042 + }, + { + "epoch": 2.540644606011189, + "grad_norm": 1.0679765939712524, + "learning_rate": 3.061680662115629e-05, + "loss": 2.5776, + "step": 28043 + }, + { + "epoch": 2.5407352041856357, + "grad_norm": 0.9468295574188232, + "learning_rate": 3.061076542016553e-05, + "loss": 2.6741, + "step": 28044 + }, + { + "epoch": 2.5408258023600823, + "grad_norm": 1.0670920610427856, + "learning_rate": 3.060472421917478e-05, + "loss": 2.4402, + "step": 28045 + }, + { + "epoch": 2.5409164005345293, + "grad_norm": 0.9679888486862183, + "learning_rate": 3.059868301818402e-05, + "loss": 2.4176, + "step": 28046 + }, + { + "epoch": 2.5410069987089763, + "grad_norm": 0.8680529594421387, + "learning_rate": 3.059264181719326e-05, + "loss": 1.9902, + "step": 28047 + }, + { + "epoch": 2.541097596883423, + "grad_norm": 0.9370946288108826, + "learning_rate": 3.0586600616202506e-05, + "loss": 2.5144, + "step": 28048 + }, + { + "epoch": 2.5411881950578694, + "grad_norm": 1.028241753578186, + "learning_rate": 3.058055941521175e-05, + "loss": 2.588, + "step": 28049 + }, + { + "epoch": 2.5412787932323164, + "grad_norm": 1.0352778434753418, + "learning_rate": 3.057451821422099e-05, + "loss": 1.9082, + "step": 28050 + }, + { + "epoch": 2.541369391406763, + "grad_norm": 0.8937753438949585, + "learning_rate": 3.056847701323023e-05, + "loss": 2.0486, + "step": 28051 + }, + { + "epoch": 2.54145998958121, + "grad_norm": 1.0717589855194092, + "learning_rate": 3.0562435812239476e-05, + "loss": 2.4815, + "step": 28052 + }, + { + "epoch": 2.5415505877556566, + "grad_norm": 1.1854398250579834, + "learning_rate": 3.055639461124872e-05, + "loss": 2.3792, + "step": 28053 + }, + { + "epoch": 2.5416411859301036, + "grad_norm": 1.0051286220550537, + "learning_rate": 3.055035341025796e-05, + "loss": 2.3827, + "step": 28054 + }, + { + "epoch": 2.54173178410455, + "grad_norm": 1.015460729598999, + "learning_rate": 3.0544312209267206e-05, + "loss": 3.0573, + "step": 28055 + }, + { + "epoch": 2.541822382278997, + "grad_norm": 1.1873581409454346, + "learning_rate": 3.0538271008276446e-05, + "loss": 2.6563, + "step": 28056 + }, + { + "epoch": 2.5419129804534437, + "grad_norm": 0.9137253165245056, + "learning_rate": 3.053222980728569e-05, + "loss": 2.4169, + "step": 28057 + }, + { + "epoch": 2.5420035786278907, + "grad_norm": 1.016302466392517, + "learning_rate": 3.052618860629493e-05, + "loss": 2.5897, + "step": 28058 + }, + { + "epoch": 2.5420941768023373, + "grad_norm": 0.9572266340255737, + "learning_rate": 3.0520147405304176e-05, + "loss": 2.5049, + "step": 28059 + }, + { + "epoch": 2.5421847749767843, + "grad_norm": 0.9945127367973328, + "learning_rate": 3.051410620431342e-05, + "loss": 2.5691, + "step": 28060 + }, + { + "epoch": 2.542275373151231, + "grad_norm": 1.0075938701629639, + "learning_rate": 3.050806500332266e-05, + "loss": 2.3953, + "step": 28061 + }, + { + "epoch": 2.542365971325678, + "grad_norm": 1.0225014686584473, + "learning_rate": 3.05020238023319e-05, + "loss": 2.6683, + "step": 28062 + }, + { + "epoch": 2.5424565695001244, + "grad_norm": 1.0644279718399048, + "learning_rate": 3.049598260134115e-05, + "loss": 2.6272, + "step": 28063 + }, + { + "epoch": 2.5425471676745715, + "grad_norm": 1.0529613494873047, + "learning_rate": 3.048994140035039e-05, + "loss": 2.6147, + "step": 28064 + }, + { + "epoch": 2.542637765849018, + "grad_norm": 0.9701792001724243, + "learning_rate": 3.0483900199359634e-05, + "loss": 2.8831, + "step": 28065 + }, + { + "epoch": 2.542728364023465, + "grad_norm": 0.9683768153190613, + "learning_rate": 3.0477858998368875e-05, + "loss": 2.6515, + "step": 28066 + }, + { + "epoch": 2.5428189621979116, + "grad_norm": 1.030328392982483, + "learning_rate": 3.0471817797378123e-05, + "loss": 2.6967, + "step": 28067 + }, + { + "epoch": 2.5429095603723586, + "grad_norm": 1.1424689292907715, + "learning_rate": 3.0465776596387363e-05, + "loss": 2.6938, + "step": 28068 + }, + { + "epoch": 2.543000158546805, + "grad_norm": 0.9222508072853088, + "learning_rate": 3.0459735395396604e-05, + "loss": 2.5317, + "step": 28069 + }, + { + "epoch": 2.543090756721252, + "grad_norm": 0.7599748969078064, + "learning_rate": 3.0453694194405852e-05, + "loss": 1.205, + "step": 28070 + }, + { + "epoch": 2.5431813548956987, + "grad_norm": 1.024778962135315, + "learning_rate": 3.0447652993415093e-05, + "loss": 2.56, + "step": 28071 + }, + { + "epoch": 2.5432719530701458, + "grad_norm": 1.0841625928878784, + "learning_rate": 3.0441611792424333e-05, + "loss": 2.8628, + "step": 28072 + }, + { + "epoch": 2.5433625512445923, + "grad_norm": 1.0723527669906616, + "learning_rate": 3.0435570591433578e-05, + "loss": 2.618, + "step": 28073 + }, + { + "epoch": 2.5434531494190393, + "grad_norm": 0.9651447534561157, + "learning_rate": 3.0429529390442822e-05, + "loss": 2.7502, + "step": 28074 + }, + { + "epoch": 2.543543747593486, + "grad_norm": 1.0456345081329346, + "learning_rate": 3.0423488189452066e-05, + "loss": 2.8541, + "step": 28075 + }, + { + "epoch": 2.543634345767933, + "grad_norm": 0.998799741268158, + "learning_rate": 3.0417446988461307e-05, + "loss": 2.6548, + "step": 28076 + }, + { + "epoch": 2.5437249439423795, + "grad_norm": 1.1917630434036255, + "learning_rate": 3.0411405787470548e-05, + "loss": 2.538, + "step": 28077 + }, + { + "epoch": 2.5438155421168265, + "grad_norm": 0.9599166512489319, + "learning_rate": 3.0405364586479795e-05, + "loss": 2.6698, + "step": 28078 + }, + { + "epoch": 2.543906140291273, + "grad_norm": 0.9829027652740479, + "learning_rate": 3.0399323385489036e-05, + "loss": 2.5967, + "step": 28079 + }, + { + "epoch": 2.54399673846572, + "grad_norm": 1.1305032968521118, + "learning_rate": 3.0393282184498277e-05, + "loss": 2.4909, + "step": 28080 + }, + { + "epoch": 2.5440873366401666, + "grad_norm": 1.0403956174850464, + "learning_rate": 3.038724098350752e-05, + "loss": 2.6496, + "step": 28081 + }, + { + "epoch": 2.5441779348146136, + "grad_norm": 1.0314606428146362, + "learning_rate": 3.0381199782516766e-05, + "loss": 2.9602, + "step": 28082 + }, + { + "epoch": 2.54426853298906, + "grad_norm": 1.0889766216278076, + "learning_rate": 3.037515858152601e-05, + "loss": 2.9131, + "step": 28083 + }, + { + "epoch": 2.544359131163507, + "grad_norm": 0.9945451617240906, + "learning_rate": 3.036911738053525e-05, + "loss": 2.6149, + "step": 28084 + }, + { + "epoch": 2.5444497293379538, + "grad_norm": 0.9763860702514648, + "learning_rate": 3.0363076179544498e-05, + "loss": 2.6109, + "step": 28085 + }, + { + "epoch": 2.544540327512401, + "grad_norm": 0.9598534107208252, + "learning_rate": 3.035703497855374e-05, + "loss": 2.7807, + "step": 28086 + }, + { + "epoch": 2.5446309256868473, + "grad_norm": 1.0371387004852295, + "learning_rate": 3.035099377756298e-05, + "loss": 2.9212, + "step": 28087 + }, + { + "epoch": 2.5447215238612944, + "grad_norm": 1.073263168334961, + "learning_rate": 3.034495257657222e-05, + "loss": 2.7375, + "step": 28088 + }, + { + "epoch": 2.544812122035741, + "grad_norm": 1.030087947845459, + "learning_rate": 3.0338911375581468e-05, + "loss": 2.7062, + "step": 28089 + }, + { + "epoch": 2.544902720210188, + "grad_norm": 1.1483032703399658, + "learning_rate": 3.033287017459071e-05, + "loss": 2.5259, + "step": 28090 + }, + { + "epoch": 2.5449933183846345, + "grad_norm": 1.0922526121139526, + "learning_rate": 3.0326828973599953e-05, + "loss": 2.6284, + "step": 28091 + }, + { + "epoch": 2.545083916559081, + "grad_norm": 0.9498128294944763, + "learning_rate": 3.0320787772609194e-05, + "loss": 2.4062, + "step": 28092 + }, + { + "epoch": 2.545174514733528, + "grad_norm": 0.9780092835426331, + "learning_rate": 3.0314746571618442e-05, + "loss": 2.6428, + "step": 28093 + }, + { + "epoch": 2.545265112907975, + "grad_norm": 0.8217872977256775, + "learning_rate": 3.0308705370627683e-05, + "loss": 1.9355, + "step": 28094 + }, + { + "epoch": 2.5453557110824216, + "grad_norm": 1.0361207723617554, + "learning_rate": 3.0302664169636923e-05, + "loss": 2.6093, + "step": 28095 + }, + { + "epoch": 2.545446309256868, + "grad_norm": 1.1957825422286987, + "learning_rate": 3.0296622968646164e-05, + "loss": 2.382, + "step": 28096 + }, + { + "epoch": 2.545536907431315, + "grad_norm": 1.1777061223983765, + "learning_rate": 3.0290581767655412e-05, + "loss": 2.439, + "step": 28097 + }, + { + "epoch": 2.5456275056057622, + "grad_norm": 0.9963244199752808, + "learning_rate": 3.0284540566664653e-05, + "loss": 2.419, + "step": 28098 + }, + { + "epoch": 2.545718103780209, + "grad_norm": 1.0520775318145752, + "learning_rate": 3.0278499365673897e-05, + "loss": 2.6261, + "step": 28099 + }, + { + "epoch": 2.5458087019546554, + "grad_norm": 1.1285241842269897, + "learning_rate": 3.0272458164683144e-05, + "loss": 2.5507, + "step": 28100 + }, + { + "epoch": 2.5458993001291024, + "grad_norm": 1.2259225845336914, + "learning_rate": 3.0266416963692385e-05, + "loss": 2.5729, + "step": 28101 + }, + { + "epoch": 2.5459898983035494, + "grad_norm": 0.986599862575531, + "learning_rate": 3.0260375762701626e-05, + "loss": 2.4449, + "step": 28102 + }, + { + "epoch": 2.546080496477996, + "grad_norm": 1.007869005203247, + "learning_rate": 3.0254334561710867e-05, + "loss": 2.8982, + "step": 28103 + }, + { + "epoch": 2.5461710946524425, + "grad_norm": 0.9760690331459045, + "learning_rate": 3.0248293360720115e-05, + "loss": 2.588, + "step": 28104 + }, + { + "epoch": 2.5462616928268895, + "grad_norm": 0.9426671862602234, + "learning_rate": 3.0242252159729355e-05, + "loss": 2.612, + "step": 28105 + }, + { + "epoch": 2.5463522910013365, + "grad_norm": 1.0683274269104004, + "learning_rate": 3.0236210958738596e-05, + "loss": 2.6161, + "step": 28106 + }, + { + "epoch": 2.546442889175783, + "grad_norm": 1.0334959030151367, + "learning_rate": 3.023016975774784e-05, + "loss": 2.7063, + "step": 28107 + }, + { + "epoch": 2.5465334873502297, + "grad_norm": 1.0281176567077637, + "learning_rate": 3.0224128556757088e-05, + "loss": 2.4272, + "step": 28108 + }, + { + "epoch": 2.5466240855246767, + "grad_norm": 0.9796651601791382, + "learning_rate": 3.021808735576633e-05, + "loss": 2.4876, + "step": 28109 + }, + { + "epoch": 2.5467146836991237, + "grad_norm": 0.962152898311615, + "learning_rate": 3.021204615477557e-05, + "loss": 2.5166, + "step": 28110 + }, + { + "epoch": 2.5468052818735702, + "grad_norm": 1.1947284936904907, + "learning_rate": 3.020600495378481e-05, + "loss": 2.5297, + "step": 28111 + }, + { + "epoch": 2.546895880048017, + "grad_norm": 1.0858519077301025, + "learning_rate": 3.0199963752794058e-05, + "loss": 2.6834, + "step": 28112 + }, + { + "epoch": 2.546986478222464, + "grad_norm": 0.9971588850021362, + "learning_rate": 3.01939225518033e-05, + "loss": 2.5718, + "step": 28113 + }, + { + "epoch": 2.547077076396911, + "grad_norm": 1.0297894477844238, + "learning_rate": 3.018788135081254e-05, + "loss": 2.8445, + "step": 28114 + }, + { + "epoch": 2.5471676745713574, + "grad_norm": 1.0445764064788818, + "learning_rate": 3.0181840149821787e-05, + "loss": 2.7606, + "step": 28115 + }, + { + "epoch": 2.547258272745804, + "grad_norm": 0.9030144214630127, + "learning_rate": 3.017579894883103e-05, + "loss": 2.075, + "step": 28116 + }, + { + "epoch": 2.547348870920251, + "grad_norm": 0.9915078282356262, + "learning_rate": 3.0169757747840272e-05, + "loss": 2.732, + "step": 28117 + }, + { + "epoch": 2.547439469094698, + "grad_norm": 1.0671945810317993, + "learning_rate": 3.0163716546849513e-05, + "loss": 2.3148, + "step": 28118 + }, + { + "epoch": 2.5475300672691445, + "grad_norm": 1.0907788276672363, + "learning_rate": 3.015767534585876e-05, + "loss": 2.4952, + "step": 28119 + }, + { + "epoch": 2.547620665443591, + "grad_norm": 0.9856995940208435, + "learning_rate": 3.0151634144868002e-05, + "loss": 2.9404, + "step": 28120 + }, + { + "epoch": 2.547711263618038, + "grad_norm": 1.0359607934951782, + "learning_rate": 3.0145592943877243e-05, + "loss": 2.594, + "step": 28121 + }, + { + "epoch": 2.547801861792485, + "grad_norm": 0.9891136884689331, + "learning_rate": 3.0139551742886483e-05, + "loss": 2.7618, + "step": 28122 + }, + { + "epoch": 2.5478924599669317, + "grad_norm": 1.0065104961395264, + "learning_rate": 3.013351054189573e-05, + "loss": 2.5404, + "step": 28123 + }, + { + "epoch": 2.5479830581413783, + "grad_norm": 0.881070613861084, + "learning_rate": 3.0127469340904975e-05, + "loss": 2.019, + "step": 28124 + }, + { + "epoch": 2.5480736563158253, + "grad_norm": 1.045283555984497, + "learning_rate": 3.0121428139914216e-05, + "loss": 2.6797, + "step": 28125 + }, + { + "epoch": 2.5481642544902723, + "grad_norm": 1.0588750839233398, + "learning_rate": 3.0115386938923464e-05, + "loss": 2.632, + "step": 28126 + }, + { + "epoch": 2.548254852664719, + "grad_norm": 0.928350567817688, + "learning_rate": 3.0109345737932704e-05, + "loss": 2.1688, + "step": 28127 + }, + { + "epoch": 2.5483454508391654, + "grad_norm": 1.0977184772491455, + "learning_rate": 3.0103304536941945e-05, + "loss": 2.6129, + "step": 28128 + }, + { + "epoch": 2.5484360490136124, + "grad_norm": 1.0324937105178833, + "learning_rate": 3.0097263335951186e-05, + "loss": 2.7323, + "step": 28129 + }, + { + "epoch": 2.5485266471880594, + "grad_norm": 1.0501246452331543, + "learning_rate": 3.0091222134960434e-05, + "loss": 2.6855, + "step": 28130 + }, + { + "epoch": 2.548617245362506, + "grad_norm": 0.9473429918289185, + "learning_rate": 3.0085180933969675e-05, + "loss": 2.5414, + "step": 28131 + }, + { + "epoch": 2.5487078435369526, + "grad_norm": 0.9840118288993835, + "learning_rate": 3.0079139732978915e-05, + "loss": 2.6313, + "step": 28132 + }, + { + "epoch": 2.5487984417113996, + "grad_norm": 0.9917880892753601, + "learning_rate": 3.007309853198816e-05, + "loss": 2.6487, + "step": 28133 + }, + { + "epoch": 2.548889039885846, + "grad_norm": 1.0956532955169678, + "learning_rate": 3.0067057330997407e-05, + "loss": 3.0974, + "step": 28134 + }, + { + "epoch": 2.548979638060293, + "grad_norm": 1.0421390533447266, + "learning_rate": 3.0061016130006648e-05, + "loss": 2.5567, + "step": 28135 + }, + { + "epoch": 2.5490702362347397, + "grad_norm": 1.0744075775146484, + "learning_rate": 3.005497492901589e-05, + "loss": 2.6676, + "step": 28136 + }, + { + "epoch": 2.5491608344091867, + "grad_norm": 1.1152000427246094, + "learning_rate": 3.004893372802513e-05, + "loss": 2.4355, + "step": 28137 + }, + { + "epoch": 2.5492514325836333, + "grad_norm": 0.9707445502281189, + "learning_rate": 3.0042892527034377e-05, + "loss": 2.3501, + "step": 28138 + }, + { + "epoch": 2.5493420307580803, + "grad_norm": 1.0634574890136719, + "learning_rate": 3.0036851326043618e-05, + "loss": 2.7652, + "step": 28139 + }, + { + "epoch": 2.549432628932527, + "grad_norm": 0.9793853163719177, + "learning_rate": 3.003081012505286e-05, + "loss": 2.5808, + "step": 28140 + }, + { + "epoch": 2.549523227106974, + "grad_norm": 1.041408658027649, + "learning_rate": 3.0024768924062107e-05, + "loss": 2.8493, + "step": 28141 + }, + { + "epoch": 2.5496138252814204, + "grad_norm": 0.9863114356994629, + "learning_rate": 3.001872772307135e-05, + "loss": 2.5095, + "step": 28142 + }, + { + "epoch": 2.5497044234558675, + "grad_norm": 1.1916006803512573, + "learning_rate": 3.001268652208059e-05, + "loss": 2.5556, + "step": 28143 + }, + { + "epoch": 2.549795021630314, + "grad_norm": 0.9884935617446899, + "learning_rate": 3.0006645321089832e-05, + "loss": 2.4511, + "step": 28144 + }, + { + "epoch": 2.549885619804761, + "grad_norm": 1.077507495880127, + "learning_rate": 3.000060412009908e-05, + "loss": 2.7421, + "step": 28145 + }, + { + "epoch": 2.5499762179792076, + "grad_norm": 0.9376364946365356, + "learning_rate": 2.999456291910832e-05, + "loss": 1.8965, + "step": 28146 + }, + { + "epoch": 2.5500668161536546, + "grad_norm": 1.0469551086425781, + "learning_rate": 2.9988521718117562e-05, + "loss": 2.5999, + "step": 28147 + }, + { + "epoch": 2.550157414328101, + "grad_norm": 0.9821861386299133, + "learning_rate": 2.9982480517126803e-05, + "loss": 2.6285, + "step": 28148 + }, + { + "epoch": 2.550248012502548, + "grad_norm": 1.01646089553833, + "learning_rate": 2.997643931613605e-05, + "loss": 2.5199, + "step": 28149 + }, + { + "epoch": 2.5503386106769947, + "grad_norm": 0.9595254063606262, + "learning_rate": 2.9970398115145294e-05, + "loss": 2.5764, + "step": 28150 + }, + { + "epoch": 2.5504292088514418, + "grad_norm": 0.9322342276573181, + "learning_rate": 2.9964356914154535e-05, + "loss": 1.9259, + "step": 28151 + }, + { + "epoch": 2.5505198070258883, + "grad_norm": 0.9890906810760498, + "learning_rate": 2.9958315713163776e-05, + "loss": 2.7279, + "step": 28152 + }, + { + "epoch": 2.5506104052003353, + "grad_norm": 1.013867974281311, + "learning_rate": 2.9952274512173024e-05, + "loss": 2.523, + "step": 28153 + }, + { + "epoch": 2.550701003374782, + "grad_norm": 0.9432204365730286, + "learning_rate": 2.9946233311182264e-05, + "loss": 2.5716, + "step": 28154 + }, + { + "epoch": 2.550791601549229, + "grad_norm": 1.0828640460968018, + "learning_rate": 2.9940192110191505e-05, + "loss": 1.7787, + "step": 28155 + }, + { + "epoch": 2.5508821997236755, + "grad_norm": 1.0527247190475464, + "learning_rate": 2.9934150909200753e-05, + "loss": 2.3574, + "step": 28156 + }, + { + "epoch": 2.5509727978981225, + "grad_norm": 1.0543012619018555, + "learning_rate": 2.9928109708209994e-05, + "loss": 2.6217, + "step": 28157 + }, + { + "epoch": 2.551063396072569, + "grad_norm": 0.986025869846344, + "learning_rate": 2.9922068507219238e-05, + "loss": 2.5919, + "step": 28158 + }, + { + "epoch": 2.551153994247016, + "grad_norm": 1.0237247943878174, + "learning_rate": 2.991602730622848e-05, + "loss": 2.6796, + "step": 28159 + }, + { + "epoch": 2.5512445924214626, + "grad_norm": 1.0028064250946045, + "learning_rate": 2.9909986105237726e-05, + "loss": 2.5311, + "step": 28160 + }, + { + "epoch": 2.5513351905959096, + "grad_norm": 1.0595531463623047, + "learning_rate": 2.9903944904246967e-05, + "loss": 2.7508, + "step": 28161 + }, + { + "epoch": 2.551425788770356, + "grad_norm": 0.9653079509735107, + "learning_rate": 2.9897903703256208e-05, + "loss": 2.4961, + "step": 28162 + }, + { + "epoch": 2.551516386944803, + "grad_norm": 0.9782003164291382, + "learning_rate": 2.989186250226545e-05, + "loss": 2.8639, + "step": 28163 + }, + { + "epoch": 2.5516069851192498, + "grad_norm": 0.9451918005943298, + "learning_rate": 2.9885821301274696e-05, + "loss": 2.5604, + "step": 28164 + }, + { + "epoch": 2.551697583293697, + "grad_norm": 0.9527831673622131, + "learning_rate": 2.9879780100283937e-05, + "loss": 2.7613, + "step": 28165 + }, + { + "epoch": 2.5517881814681433, + "grad_norm": 0.9895327091217041, + "learning_rate": 2.987373889929318e-05, + "loss": 2.4793, + "step": 28166 + }, + { + "epoch": 2.5518787796425904, + "grad_norm": 0.9751509428024292, + "learning_rate": 2.9867697698302422e-05, + "loss": 2.5114, + "step": 28167 + }, + { + "epoch": 2.551969377817037, + "grad_norm": 0.9973841905593872, + "learning_rate": 2.986165649731167e-05, + "loss": 2.6108, + "step": 28168 + }, + { + "epoch": 2.552059975991484, + "grad_norm": 0.914553165435791, + "learning_rate": 2.985561529632091e-05, + "loss": 2.0115, + "step": 28169 + }, + { + "epoch": 2.5521505741659305, + "grad_norm": 0.9971249103546143, + "learning_rate": 2.984957409533015e-05, + "loss": 2.7839, + "step": 28170 + }, + { + "epoch": 2.5522411723403775, + "grad_norm": 1.0210868120193481, + "learning_rate": 2.98435328943394e-05, + "loss": 2.8701, + "step": 28171 + }, + { + "epoch": 2.552331770514824, + "grad_norm": 1.0602959394454956, + "learning_rate": 2.983749169334864e-05, + "loss": 2.6573, + "step": 28172 + }, + { + "epoch": 2.552422368689271, + "grad_norm": 0.961524486541748, + "learning_rate": 2.983145049235788e-05, + "loss": 2.5141, + "step": 28173 + }, + { + "epoch": 2.5525129668637176, + "grad_norm": 0.9172956943511963, + "learning_rate": 2.9825409291367125e-05, + "loss": 1.9458, + "step": 28174 + }, + { + "epoch": 2.552603565038164, + "grad_norm": 0.9498351216316223, + "learning_rate": 2.981936809037637e-05, + "loss": 2.5233, + "step": 28175 + }, + { + "epoch": 2.5526941632126112, + "grad_norm": 1.012453556060791, + "learning_rate": 2.9813326889385614e-05, + "loss": 2.6859, + "step": 28176 + }, + { + "epoch": 2.5527847613870582, + "grad_norm": 0.9946523308753967, + "learning_rate": 2.9807285688394854e-05, + "loss": 2.5198, + "step": 28177 + }, + { + "epoch": 2.552875359561505, + "grad_norm": 1.1480456590652466, + "learning_rate": 2.9801244487404095e-05, + "loss": 2.6293, + "step": 28178 + }, + { + "epoch": 2.5529659577359514, + "grad_norm": 0.9592020511627197, + "learning_rate": 2.9795203286413343e-05, + "loss": 2.4436, + "step": 28179 + }, + { + "epoch": 2.5530565559103984, + "grad_norm": 1.0352215766906738, + "learning_rate": 2.9789162085422584e-05, + "loss": 2.3761, + "step": 28180 + }, + { + "epoch": 2.5531471540848454, + "grad_norm": 1.0178697109222412, + "learning_rate": 2.9783120884431824e-05, + "loss": 2.5827, + "step": 28181 + }, + { + "epoch": 2.553237752259292, + "grad_norm": 0.9623971581459045, + "learning_rate": 2.977707968344107e-05, + "loss": 2.4257, + "step": 28182 + }, + { + "epoch": 2.5533283504337385, + "grad_norm": 0.9324342012405396, + "learning_rate": 2.9771038482450313e-05, + "loss": 2.5857, + "step": 28183 + }, + { + "epoch": 2.5534189486081855, + "grad_norm": 1.1039528846740723, + "learning_rate": 2.9764997281459557e-05, + "loss": 2.5291, + "step": 28184 + }, + { + "epoch": 2.5535095467826325, + "grad_norm": 1.0185415744781494, + "learning_rate": 2.9758956080468798e-05, + "loss": 2.7647, + "step": 28185 + }, + { + "epoch": 2.553600144957079, + "grad_norm": 0.8815171718597412, + "learning_rate": 2.9752914879478046e-05, + "loss": 1.9202, + "step": 28186 + }, + { + "epoch": 2.5536907431315257, + "grad_norm": 1.0125722885131836, + "learning_rate": 2.9746873678487286e-05, + "loss": 2.7819, + "step": 28187 + }, + { + "epoch": 2.5537813413059727, + "grad_norm": 1.0143606662750244, + "learning_rate": 2.9740832477496527e-05, + "loss": 2.5971, + "step": 28188 + }, + { + "epoch": 2.5538719394804197, + "grad_norm": 1.047937273979187, + "learning_rate": 2.9734791276505768e-05, + "loss": 2.6142, + "step": 28189 + }, + { + "epoch": 2.5539625376548662, + "grad_norm": 1.0924688577651978, + "learning_rate": 2.9728750075515016e-05, + "loss": 2.5301, + "step": 28190 + }, + { + "epoch": 2.554053135829313, + "grad_norm": 1.0803059339523315, + "learning_rate": 2.9722708874524256e-05, + "loss": 2.4691, + "step": 28191 + }, + { + "epoch": 2.55414373400376, + "grad_norm": 1.1072914600372314, + "learning_rate": 2.97166676735335e-05, + "loss": 2.5086, + "step": 28192 + }, + { + "epoch": 2.554234332178207, + "grad_norm": 0.872814953327179, + "learning_rate": 2.971062647254274e-05, + "loss": 2.0241, + "step": 28193 + }, + { + "epoch": 2.5543249303526534, + "grad_norm": 0.9247955679893494, + "learning_rate": 2.970458527155199e-05, + "loss": 2.3323, + "step": 28194 + }, + { + "epoch": 2.5544155285271, + "grad_norm": 0.9761650562286377, + "learning_rate": 2.969854407056123e-05, + "loss": 2.6006, + "step": 28195 + }, + { + "epoch": 2.554506126701547, + "grad_norm": 0.9970954060554504, + "learning_rate": 2.969250286957047e-05, + "loss": 2.7265, + "step": 28196 + }, + { + "epoch": 2.554596724875994, + "grad_norm": 1.0050091743469238, + "learning_rate": 2.968646166857971e-05, + "loss": 2.5015, + "step": 28197 + }, + { + "epoch": 2.5546873230504406, + "grad_norm": 1.0739481449127197, + "learning_rate": 2.968042046758896e-05, + "loss": 2.662, + "step": 28198 + }, + { + "epoch": 2.554777921224887, + "grad_norm": 1.0172584056854248, + "learning_rate": 2.96743792665982e-05, + "loss": 2.643, + "step": 28199 + }, + { + "epoch": 2.554868519399334, + "grad_norm": 1.026818037033081, + "learning_rate": 2.9668338065607444e-05, + "loss": 2.6233, + "step": 28200 + }, + { + "epoch": 2.554959117573781, + "grad_norm": 1.0670360326766968, + "learning_rate": 2.966229686461669e-05, + "loss": 2.7777, + "step": 28201 + }, + { + "epoch": 2.5550497157482277, + "grad_norm": 0.8899913430213928, + "learning_rate": 2.9656255663625933e-05, + "loss": 2.2711, + "step": 28202 + }, + { + "epoch": 2.5551403139226743, + "grad_norm": 1.073209524154663, + "learning_rate": 2.9650214462635174e-05, + "loss": 2.5605, + "step": 28203 + }, + { + "epoch": 2.5552309120971213, + "grad_norm": 1.0249491930007935, + "learning_rate": 2.9644173261644414e-05, + "loss": 2.7698, + "step": 28204 + }, + { + "epoch": 2.5553215102715683, + "grad_norm": 1.0369606018066406, + "learning_rate": 2.9638132060653662e-05, + "loss": 2.6253, + "step": 28205 + }, + { + "epoch": 2.555412108446015, + "grad_norm": 1.032401204109192, + "learning_rate": 2.9632090859662903e-05, + "loss": 2.6061, + "step": 28206 + }, + { + "epoch": 2.5555027066204614, + "grad_norm": 1.0222423076629639, + "learning_rate": 2.9626049658672144e-05, + "loss": 2.7016, + "step": 28207 + }, + { + "epoch": 2.5555933047949084, + "grad_norm": 1.1004599332809448, + "learning_rate": 2.9620008457681388e-05, + "loss": 2.8728, + "step": 28208 + }, + { + "epoch": 2.5556839029693554, + "grad_norm": 0.9370786547660828, + "learning_rate": 2.9613967256690632e-05, + "loss": 2.6415, + "step": 28209 + }, + { + "epoch": 2.555774501143802, + "grad_norm": 1.0612494945526123, + "learning_rate": 2.9607926055699876e-05, + "loss": 2.422, + "step": 28210 + }, + { + "epoch": 2.5558650993182486, + "grad_norm": 0.9809315800666809, + "learning_rate": 2.9601884854709117e-05, + "loss": 2.6055, + "step": 28211 + }, + { + "epoch": 2.5559556974926956, + "grad_norm": 0.8326271176338196, + "learning_rate": 2.9595843653718358e-05, + "loss": 2.0935, + "step": 28212 + }, + { + "epoch": 2.556046295667142, + "grad_norm": 0.9280330538749695, + "learning_rate": 2.9589802452727606e-05, + "loss": 2.6109, + "step": 28213 + }, + { + "epoch": 2.556136893841589, + "grad_norm": 1.0030380487442017, + "learning_rate": 2.9583761251736846e-05, + "loss": 2.5503, + "step": 28214 + }, + { + "epoch": 2.5562274920160357, + "grad_norm": 1.04789137840271, + "learning_rate": 2.9577720050746087e-05, + "loss": 2.1989, + "step": 28215 + }, + { + "epoch": 2.5563180901904827, + "grad_norm": 1.0219180583953857, + "learning_rate": 2.9571678849755335e-05, + "loss": 2.6281, + "step": 28216 + }, + { + "epoch": 2.5564086883649293, + "grad_norm": 1.0705993175506592, + "learning_rate": 2.9565637648764576e-05, + "loss": 2.638, + "step": 28217 + }, + { + "epoch": 2.5564992865393763, + "grad_norm": 1.101702094078064, + "learning_rate": 2.955959644777382e-05, + "loss": 2.5465, + "step": 28218 + }, + { + "epoch": 2.556589884713823, + "grad_norm": 1.0530390739440918, + "learning_rate": 2.955355524678306e-05, + "loss": 2.4984, + "step": 28219 + }, + { + "epoch": 2.55668048288827, + "grad_norm": 0.9041100740432739, + "learning_rate": 2.9547514045792308e-05, + "loss": 2.2959, + "step": 28220 + }, + { + "epoch": 2.5567710810627164, + "grad_norm": 0.9041721820831299, + "learning_rate": 2.954147284480155e-05, + "loss": 1.8925, + "step": 28221 + }, + { + "epoch": 2.5568616792371635, + "grad_norm": 1.0244654417037964, + "learning_rate": 2.953543164381079e-05, + "loss": 2.7626, + "step": 28222 + }, + { + "epoch": 2.55695227741161, + "grad_norm": 1.051100492477417, + "learning_rate": 2.952939044282003e-05, + "loss": 2.5837, + "step": 28223 + }, + { + "epoch": 2.557042875586057, + "grad_norm": 0.9367727041244507, + "learning_rate": 2.952334924182928e-05, + "loss": 2.5194, + "step": 28224 + }, + { + "epoch": 2.5571334737605036, + "grad_norm": 0.9785082340240479, + "learning_rate": 2.951730804083852e-05, + "loss": 2.5416, + "step": 28225 + }, + { + "epoch": 2.5572240719349506, + "grad_norm": 1.0368554592132568, + "learning_rate": 2.9511266839847763e-05, + "loss": 2.926, + "step": 28226 + }, + { + "epoch": 2.557314670109397, + "grad_norm": 1.0616636276245117, + "learning_rate": 2.9505225638857004e-05, + "loss": 2.6716, + "step": 28227 + }, + { + "epoch": 2.557405268283844, + "grad_norm": 0.8894646167755127, + "learning_rate": 2.9499184437866252e-05, + "loss": 1.9806, + "step": 28228 + }, + { + "epoch": 2.5574958664582907, + "grad_norm": 0.9816880226135254, + "learning_rate": 2.9493143236875493e-05, + "loss": 2.4924, + "step": 28229 + }, + { + "epoch": 2.5575864646327378, + "grad_norm": 1.0808273553848267, + "learning_rate": 2.9487102035884734e-05, + "loss": 2.495, + "step": 28230 + }, + { + "epoch": 2.5576770628071843, + "grad_norm": 0.9493873119354248, + "learning_rate": 2.948106083489398e-05, + "loss": 1.9315, + "step": 28231 + }, + { + "epoch": 2.5577676609816313, + "grad_norm": 1.0105392932891846, + "learning_rate": 2.9475019633903222e-05, + "loss": 2.5388, + "step": 28232 + }, + { + "epoch": 2.557858259156078, + "grad_norm": 1.07523775100708, + "learning_rate": 2.9468978432912463e-05, + "loss": 2.6956, + "step": 28233 + }, + { + "epoch": 2.557948857330525, + "grad_norm": 1.0082786083221436, + "learning_rate": 2.9462937231921707e-05, + "loss": 2.5733, + "step": 28234 + }, + { + "epoch": 2.5580394555049715, + "grad_norm": 1.0670756101608276, + "learning_rate": 2.945689603093095e-05, + "loss": 2.9031, + "step": 28235 + }, + { + "epoch": 2.5581300536794185, + "grad_norm": 1.0610564947128296, + "learning_rate": 2.9450854829940195e-05, + "loss": 2.7346, + "step": 28236 + }, + { + "epoch": 2.558220651853865, + "grad_norm": 1.0748157501220703, + "learning_rate": 2.9444813628949436e-05, + "loss": 2.7446, + "step": 28237 + }, + { + "epoch": 2.558311250028312, + "grad_norm": 1.029723048210144, + "learning_rate": 2.9438772427958677e-05, + "loss": 2.6573, + "step": 28238 + }, + { + "epoch": 2.5584018482027586, + "grad_norm": 1.0438830852508545, + "learning_rate": 2.9432731226967925e-05, + "loss": 2.57, + "step": 28239 + }, + { + "epoch": 2.5584924463772056, + "grad_norm": 0.9858278036117554, + "learning_rate": 2.9426690025977166e-05, + "loss": 2.3478, + "step": 28240 + }, + { + "epoch": 2.558583044551652, + "grad_norm": 1.030290126800537, + "learning_rate": 2.9420648824986406e-05, + "loss": 2.6808, + "step": 28241 + }, + { + "epoch": 2.558673642726099, + "grad_norm": 1.0435410737991333, + "learning_rate": 2.941460762399565e-05, + "loss": 2.6593, + "step": 28242 + }, + { + "epoch": 2.5587642409005458, + "grad_norm": 0.8728961944580078, + "learning_rate": 2.9408566423004895e-05, + "loss": 1.8702, + "step": 28243 + }, + { + "epoch": 2.558854839074993, + "grad_norm": 1.0608214139938354, + "learning_rate": 2.940252522201414e-05, + "loss": 2.6377, + "step": 28244 + }, + { + "epoch": 2.5589454372494393, + "grad_norm": 0.8794126510620117, + "learning_rate": 2.939648402102338e-05, + "loss": 1.8774, + "step": 28245 + }, + { + "epoch": 2.5590360354238864, + "grad_norm": 0.9786323308944702, + "learning_rate": 2.9390442820032627e-05, + "loss": 2.5175, + "step": 28246 + }, + { + "epoch": 2.559126633598333, + "grad_norm": 0.9609363675117493, + "learning_rate": 2.9384401619041868e-05, + "loss": 2.593, + "step": 28247 + }, + { + "epoch": 2.55921723177278, + "grad_norm": 0.8349510431289673, + "learning_rate": 2.937836041805111e-05, + "loss": 2.0635, + "step": 28248 + }, + { + "epoch": 2.5593078299472265, + "grad_norm": 1.0648053884506226, + "learning_rate": 2.937231921706035e-05, + "loss": 2.7158, + "step": 28249 + }, + { + "epoch": 2.5593984281216735, + "grad_norm": 0.9566590189933777, + "learning_rate": 2.9366278016069598e-05, + "loss": 2.6231, + "step": 28250 + }, + { + "epoch": 2.55948902629612, + "grad_norm": 0.9375921487808228, + "learning_rate": 2.936023681507884e-05, + "loss": 2.3364, + "step": 28251 + }, + { + "epoch": 2.559579624470567, + "grad_norm": 1.0249574184417725, + "learning_rate": 2.9354195614088083e-05, + "loss": 2.7736, + "step": 28252 + }, + { + "epoch": 2.5596702226450136, + "grad_norm": 1.0145009756088257, + "learning_rate": 2.9348154413097323e-05, + "loss": 2.5624, + "step": 28253 + }, + { + "epoch": 2.55976082081946, + "grad_norm": 0.9715798497200012, + "learning_rate": 2.934211321210657e-05, + "loss": 2.4252, + "step": 28254 + }, + { + "epoch": 2.5598514189939072, + "grad_norm": 1.0295336246490479, + "learning_rate": 2.9336072011115812e-05, + "loss": 2.7895, + "step": 28255 + }, + { + "epoch": 2.5599420171683542, + "grad_norm": 0.9941121935844421, + "learning_rate": 2.9330030810125053e-05, + "loss": 2.6948, + "step": 28256 + }, + { + "epoch": 2.560032615342801, + "grad_norm": 1.0657228231430054, + "learning_rate": 2.9323989609134294e-05, + "loss": 2.7949, + "step": 28257 + }, + { + "epoch": 2.5601232135172474, + "grad_norm": 0.9968532919883728, + "learning_rate": 2.931794840814354e-05, + "loss": 2.8886, + "step": 28258 + }, + { + "epoch": 2.5602138116916944, + "grad_norm": 1.1615488529205322, + "learning_rate": 2.9311907207152782e-05, + "loss": 2.836, + "step": 28259 + }, + { + "epoch": 2.5603044098661414, + "grad_norm": 1.020612359046936, + "learning_rate": 2.9305866006162026e-05, + "loss": 2.7329, + "step": 28260 + }, + { + "epoch": 2.560395008040588, + "grad_norm": 1.0192745923995972, + "learning_rate": 2.929982480517127e-05, + "loss": 2.6441, + "step": 28261 + }, + { + "epoch": 2.5604856062150345, + "grad_norm": 0.9970499277114868, + "learning_rate": 2.9293783604180515e-05, + "loss": 2.6744, + "step": 28262 + }, + { + "epoch": 2.5605762043894815, + "grad_norm": 0.9681770205497742, + "learning_rate": 2.9287742403189755e-05, + "loss": 2.7107, + "step": 28263 + }, + { + "epoch": 2.5606668025639285, + "grad_norm": 1.0695298910140991, + "learning_rate": 2.9281701202198996e-05, + "loss": 2.6642, + "step": 28264 + }, + { + "epoch": 2.560757400738375, + "grad_norm": 1.0788697004318237, + "learning_rate": 2.9275660001208244e-05, + "loss": 2.4814, + "step": 28265 + }, + { + "epoch": 2.5608479989128217, + "grad_norm": 0.9791141748428345, + "learning_rate": 2.9269618800217485e-05, + "loss": 2.5159, + "step": 28266 + }, + { + "epoch": 2.5609385970872687, + "grad_norm": 1.0031400918960571, + "learning_rate": 2.9263577599226726e-05, + "loss": 2.5544, + "step": 28267 + }, + { + "epoch": 2.5610291952617157, + "grad_norm": 1.1707382202148438, + "learning_rate": 2.925753639823597e-05, + "loss": 2.4273, + "step": 28268 + }, + { + "epoch": 2.5611197934361623, + "grad_norm": 0.9555999040603638, + "learning_rate": 2.9251495197245214e-05, + "loss": 2.5185, + "step": 28269 + }, + { + "epoch": 2.561210391610609, + "grad_norm": 1.0483683347702026, + "learning_rate": 2.9245453996254458e-05, + "loss": 2.5986, + "step": 28270 + }, + { + "epoch": 2.561300989785056, + "grad_norm": 0.9672346115112305, + "learning_rate": 2.92394127952637e-05, + "loss": 2.7792, + "step": 28271 + }, + { + "epoch": 2.561391587959503, + "grad_norm": 1.013685941696167, + "learning_rate": 2.923337159427294e-05, + "loss": 2.5405, + "step": 28272 + }, + { + "epoch": 2.5614821861339494, + "grad_norm": 1.0877920389175415, + "learning_rate": 2.9227330393282187e-05, + "loss": 2.5817, + "step": 28273 + }, + { + "epoch": 2.561572784308396, + "grad_norm": 0.9581306576728821, + "learning_rate": 2.9221289192291428e-05, + "loss": 2.5779, + "step": 28274 + }, + { + "epoch": 2.561663382482843, + "grad_norm": 1.1870574951171875, + "learning_rate": 2.921524799130067e-05, + "loss": 2.2411, + "step": 28275 + }, + { + "epoch": 2.56175398065729, + "grad_norm": 1.1042262315750122, + "learning_rate": 2.9209206790309917e-05, + "loss": 2.6581, + "step": 28276 + }, + { + "epoch": 2.5618445788317366, + "grad_norm": 0.953034520149231, + "learning_rate": 2.9203165589319158e-05, + "loss": 2.3522, + "step": 28277 + }, + { + "epoch": 2.561935177006183, + "grad_norm": 0.8112918138504028, + "learning_rate": 2.9197124388328402e-05, + "loss": 1.9427, + "step": 28278 + }, + { + "epoch": 2.56202577518063, + "grad_norm": 1.071689248085022, + "learning_rate": 2.9191083187337643e-05, + "loss": 2.7334, + "step": 28279 + }, + { + "epoch": 2.562116373355077, + "grad_norm": 1.058824896812439, + "learning_rate": 2.918504198634689e-05, + "loss": 2.5519, + "step": 28280 + }, + { + "epoch": 2.5622069715295237, + "grad_norm": 0.969674825668335, + "learning_rate": 2.917900078535613e-05, + "loss": 2.1671, + "step": 28281 + }, + { + "epoch": 2.5622975697039703, + "grad_norm": 0.8399945497512817, + "learning_rate": 2.9172959584365372e-05, + "loss": 1.9953, + "step": 28282 + }, + { + "epoch": 2.5623881678784173, + "grad_norm": 0.9352213144302368, + "learning_rate": 2.9166918383374613e-05, + "loss": 2.0793, + "step": 28283 + }, + { + "epoch": 2.5624787660528643, + "grad_norm": 1.0554958581924438, + "learning_rate": 2.916087718238386e-05, + "loss": 2.619, + "step": 28284 + }, + { + "epoch": 2.562569364227311, + "grad_norm": 0.8747959733009338, + "learning_rate": 2.91548359813931e-05, + "loss": 2.1701, + "step": 28285 + }, + { + "epoch": 2.5626599624017574, + "grad_norm": 1.1514767408370972, + "learning_rate": 2.9148794780402345e-05, + "loss": 2.296, + "step": 28286 + }, + { + "epoch": 2.5627505605762044, + "grad_norm": 1.0553057193756104, + "learning_rate": 2.9142753579411586e-05, + "loss": 2.7624, + "step": 28287 + }, + { + "epoch": 2.5628411587506514, + "grad_norm": 1.048258662223816, + "learning_rate": 2.9136712378420834e-05, + "loss": 2.4985, + "step": 28288 + }, + { + "epoch": 2.562931756925098, + "grad_norm": 0.8306698203086853, + "learning_rate": 2.9130671177430075e-05, + "loss": 1.7844, + "step": 28289 + }, + { + "epoch": 2.5630223550995446, + "grad_norm": 0.9548990726470947, + "learning_rate": 2.9124629976439315e-05, + "loss": 2.5316, + "step": 28290 + }, + { + "epoch": 2.5631129532739916, + "grad_norm": 0.9401902556419373, + "learning_rate": 2.9118588775448563e-05, + "loss": 2.6162, + "step": 28291 + }, + { + "epoch": 2.5632035514484386, + "grad_norm": 0.9713405966758728, + "learning_rate": 2.9112547574457804e-05, + "loss": 2.6846, + "step": 28292 + }, + { + "epoch": 2.563294149622885, + "grad_norm": 1.1885266304016113, + "learning_rate": 2.9106506373467045e-05, + "loss": 2.2311, + "step": 28293 + }, + { + "epoch": 2.5633847477973317, + "grad_norm": 1.1618138551712036, + "learning_rate": 2.910046517247629e-05, + "loss": 2.4708, + "step": 28294 + }, + { + "epoch": 2.5634753459717787, + "grad_norm": 0.9694071412086487, + "learning_rate": 2.9094423971485537e-05, + "loss": 2.4929, + "step": 28295 + }, + { + "epoch": 2.5635659441462253, + "grad_norm": 0.9907936453819275, + "learning_rate": 2.9088382770494777e-05, + "loss": 2.8522, + "step": 28296 + }, + { + "epoch": 2.5636565423206723, + "grad_norm": 0.8896963596343994, + "learning_rate": 2.9082341569504018e-05, + "loss": 2.2485, + "step": 28297 + }, + { + "epoch": 2.563747140495119, + "grad_norm": 1.080242395401001, + "learning_rate": 2.907630036851326e-05, + "loss": 2.6938, + "step": 28298 + }, + { + "epoch": 2.563837738669566, + "grad_norm": 1.0369408130645752, + "learning_rate": 2.9070259167522507e-05, + "loss": 2.7225, + "step": 28299 + }, + { + "epoch": 2.5639283368440124, + "grad_norm": 1.081396222114563, + "learning_rate": 2.9064217966531747e-05, + "loss": 2.7798, + "step": 28300 + }, + { + "epoch": 2.5640189350184595, + "grad_norm": 1.1391382217407227, + "learning_rate": 2.9058176765540988e-05, + "loss": 2.8109, + "step": 28301 + }, + { + "epoch": 2.564109533192906, + "grad_norm": 1.0315297842025757, + "learning_rate": 2.9052135564550232e-05, + "loss": 2.5371, + "step": 28302 + }, + { + "epoch": 2.564200131367353, + "grad_norm": 1.005595326423645, + "learning_rate": 2.904609436355948e-05, + "loss": 2.6605, + "step": 28303 + }, + { + "epoch": 2.5642907295417996, + "grad_norm": 1.0631234645843506, + "learning_rate": 2.904005316256872e-05, + "loss": 2.632, + "step": 28304 + }, + { + "epoch": 2.5643813277162466, + "grad_norm": 0.9608191847801208, + "learning_rate": 2.9034011961577962e-05, + "loss": 2.5486, + "step": 28305 + }, + { + "epoch": 2.564471925890693, + "grad_norm": 1.065076470375061, + "learning_rate": 2.902797076058721e-05, + "loss": 2.601, + "step": 28306 + }, + { + "epoch": 2.56456252406514, + "grad_norm": 0.9545571208000183, + "learning_rate": 2.902192955959645e-05, + "loss": 2.5126, + "step": 28307 + }, + { + "epoch": 2.5646531222395867, + "grad_norm": 1.0252090692520142, + "learning_rate": 2.901588835860569e-05, + "loss": 2.8535, + "step": 28308 + }, + { + "epoch": 2.5647437204140338, + "grad_norm": 1.0462368726730347, + "learning_rate": 2.9009847157614932e-05, + "loss": 2.5915, + "step": 28309 + }, + { + "epoch": 2.5648343185884803, + "grad_norm": 1.0391924381256104, + "learning_rate": 2.900380595662418e-05, + "loss": 2.9372, + "step": 28310 + }, + { + "epoch": 2.5649249167629273, + "grad_norm": 1.0870647430419922, + "learning_rate": 2.899776475563342e-05, + "loss": 2.7984, + "step": 28311 + }, + { + "epoch": 2.565015514937374, + "grad_norm": 1.0021578073501587, + "learning_rate": 2.8991723554642665e-05, + "loss": 2.4984, + "step": 28312 + }, + { + "epoch": 2.565106113111821, + "grad_norm": 1.027647852897644, + "learning_rate": 2.8985682353651905e-05, + "loss": 2.7901, + "step": 28313 + }, + { + "epoch": 2.5651967112862675, + "grad_norm": 0.9866224527359009, + "learning_rate": 2.8979641152661153e-05, + "loss": 1.9849, + "step": 28314 + }, + { + "epoch": 2.5652873094607145, + "grad_norm": 0.9945634007453918, + "learning_rate": 2.8973599951670394e-05, + "loss": 2.6001, + "step": 28315 + }, + { + "epoch": 2.565377907635161, + "grad_norm": 0.965005099773407, + "learning_rate": 2.8967558750679635e-05, + "loss": 2.8208, + "step": 28316 + }, + { + "epoch": 2.565468505809608, + "grad_norm": 1.0263173580169678, + "learning_rate": 2.8961517549688875e-05, + "loss": 2.5793, + "step": 28317 + }, + { + "epoch": 2.5655591039840546, + "grad_norm": 1.095881700515747, + "learning_rate": 2.8955476348698123e-05, + "loss": 2.694, + "step": 28318 + }, + { + "epoch": 2.5656497021585016, + "grad_norm": 1.0068556070327759, + "learning_rate": 2.8949435147707364e-05, + "loss": 2.7713, + "step": 28319 + }, + { + "epoch": 2.565740300332948, + "grad_norm": 1.035058617591858, + "learning_rate": 2.8943393946716608e-05, + "loss": 2.6456, + "step": 28320 + }, + { + "epoch": 2.565830898507395, + "grad_norm": 1.0783262252807617, + "learning_rate": 2.8937352745725856e-05, + "loss": 2.6367, + "step": 28321 + }, + { + "epoch": 2.5659214966818418, + "grad_norm": 1.009781837463379, + "learning_rate": 2.8931311544735097e-05, + "loss": 2.3934, + "step": 28322 + }, + { + "epoch": 2.566012094856289, + "grad_norm": 1.0051114559173584, + "learning_rate": 2.8925270343744337e-05, + "loss": 2.6306, + "step": 28323 + }, + { + "epoch": 2.5661026930307353, + "grad_norm": 0.985057532787323, + "learning_rate": 2.8919229142753578e-05, + "loss": 2.5381, + "step": 28324 + }, + { + "epoch": 2.5661932912051824, + "grad_norm": 1.031529188156128, + "learning_rate": 2.8913187941762826e-05, + "loss": 2.5796, + "step": 28325 + }, + { + "epoch": 2.566283889379629, + "grad_norm": 1.0078189373016357, + "learning_rate": 2.8907146740772067e-05, + "loss": 2.4827, + "step": 28326 + }, + { + "epoch": 2.566374487554076, + "grad_norm": 1.032320499420166, + "learning_rate": 2.8901105539781307e-05, + "loss": 2.6056, + "step": 28327 + }, + { + "epoch": 2.5664650857285225, + "grad_norm": 1.0843064785003662, + "learning_rate": 2.889506433879055e-05, + "loss": 2.7731, + "step": 28328 + }, + { + "epoch": 2.5665556839029695, + "grad_norm": 0.9532425403594971, + "learning_rate": 2.88890231377998e-05, + "loss": 2.4687, + "step": 28329 + }, + { + "epoch": 2.566646282077416, + "grad_norm": 0.9636434316635132, + "learning_rate": 2.888298193680904e-05, + "loss": 2.4394, + "step": 28330 + }, + { + "epoch": 2.566736880251863, + "grad_norm": 1.0036115646362305, + "learning_rate": 2.887694073581828e-05, + "loss": 2.5135, + "step": 28331 + }, + { + "epoch": 2.5668274784263096, + "grad_norm": 1.1112911701202393, + "learning_rate": 2.8870899534827522e-05, + "loss": 2.5341, + "step": 28332 + }, + { + "epoch": 2.5669180766007567, + "grad_norm": 0.956378161907196, + "learning_rate": 2.886485833383677e-05, + "loss": 2.3326, + "step": 28333 + }, + { + "epoch": 2.5670086747752032, + "grad_norm": 1.045473575592041, + "learning_rate": 2.885881713284601e-05, + "loss": 2.5687, + "step": 28334 + }, + { + "epoch": 2.5670992729496502, + "grad_norm": 1.0990488529205322, + "learning_rate": 2.885277593185525e-05, + "loss": 2.6889, + "step": 28335 + }, + { + "epoch": 2.567189871124097, + "grad_norm": 1.1050848960876465, + "learning_rate": 2.88467347308645e-05, + "loss": 2.7507, + "step": 28336 + }, + { + "epoch": 2.5672804692985434, + "grad_norm": 0.919455885887146, + "learning_rate": 2.8840693529873743e-05, + "loss": 2.0015, + "step": 28337 + }, + { + "epoch": 2.5673710674729904, + "grad_norm": 0.9918840527534485, + "learning_rate": 2.8834652328882984e-05, + "loss": 2.5836, + "step": 28338 + }, + { + "epoch": 2.5674616656474374, + "grad_norm": 1.0632083415985107, + "learning_rate": 2.8828611127892225e-05, + "loss": 2.8273, + "step": 28339 + }, + { + "epoch": 2.567552263821884, + "grad_norm": 0.8701878786087036, + "learning_rate": 2.8822569926901472e-05, + "loss": 1.9972, + "step": 28340 + }, + { + "epoch": 2.5676428619963305, + "grad_norm": 0.935057520866394, + "learning_rate": 2.8816528725910713e-05, + "loss": 2.0741, + "step": 28341 + }, + { + "epoch": 2.5677334601707775, + "grad_norm": 1.0537543296813965, + "learning_rate": 2.8810487524919954e-05, + "loss": 2.5313, + "step": 28342 + }, + { + "epoch": 2.5678240583452245, + "grad_norm": 1.0357381105422974, + "learning_rate": 2.8804446323929195e-05, + "loss": 2.5397, + "step": 28343 + }, + { + "epoch": 2.567914656519671, + "grad_norm": 1.0155420303344727, + "learning_rate": 2.8798405122938442e-05, + "loss": 2.4517, + "step": 28344 + }, + { + "epoch": 2.5680052546941177, + "grad_norm": 0.9853700995445251, + "learning_rate": 2.8792363921947686e-05, + "loss": 2.6914, + "step": 28345 + }, + { + "epoch": 2.5680958528685647, + "grad_norm": 0.9822298288345337, + "learning_rate": 2.8786322720956927e-05, + "loss": 2.5464, + "step": 28346 + }, + { + "epoch": 2.5681864510430117, + "grad_norm": 1.0284864902496338, + "learning_rate": 2.8780281519966168e-05, + "loss": 2.4833, + "step": 28347 + }, + { + "epoch": 2.5682770492174583, + "grad_norm": 0.8296863436698914, + "learning_rate": 2.8774240318975416e-05, + "loss": 1.9024, + "step": 28348 + }, + { + "epoch": 2.568367647391905, + "grad_norm": 0.9637714624404907, + "learning_rate": 2.8768199117984657e-05, + "loss": 2.3691, + "step": 28349 + }, + { + "epoch": 2.568458245566352, + "grad_norm": 0.9216776490211487, + "learning_rate": 2.8762157916993897e-05, + "loss": 2.2022, + "step": 28350 + }, + { + "epoch": 2.568548843740799, + "grad_norm": 1.0420634746551514, + "learning_rate": 2.8756116716003145e-05, + "loss": 2.7322, + "step": 28351 + }, + { + "epoch": 2.5686394419152454, + "grad_norm": 0.9913337826728821, + "learning_rate": 2.8750075515012386e-05, + "loss": 2.5857, + "step": 28352 + }, + { + "epoch": 2.568730040089692, + "grad_norm": 1.0936412811279297, + "learning_rate": 2.874403431402163e-05, + "loss": 2.5135, + "step": 28353 + }, + { + "epoch": 2.568820638264139, + "grad_norm": 0.9921934604644775, + "learning_rate": 2.873799311303087e-05, + "loss": 2.5773, + "step": 28354 + }, + { + "epoch": 2.568911236438586, + "grad_norm": 1.051408052444458, + "learning_rate": 2.873195191204012e-05, + "loss": 2.8737, + "step": 28355 + }, + { + "epoch": 2.5690018346130326, + "grad_norm": 1.0155366659164429, + "learning_rate": 2.872591071104936e-05, + "loss": 2.4524, + "step": 28356 + }, + { + "epoch": 2.569092432787479, + "grad_norm": 1.0258235931396484, + "learning_rate": 2.87198695100586e-05, + "loss": 2.5304, + "step": 28357 + }, + { + "epoch": 2.569183030961926, + "grad_norm": 1.028236746788025, + "learning_rate": 2.871382830906784e-05, + "loss": 2.8249, + "step": 28358 + }, + { + "epoch": 2.569273629136373, + "grad_norm": 0.9995954036712646, + "learning_rate": 2.870778710807709e-05, + "loss": 2.5567, + "step": 28359 + }, + { + "epoch": 2.5693642273108197, + "grad_norm": 1.0274927616119385, + "learning_rate": 2.870174590708633e-05, + "loss": 2.6055, + "step": 28360 + }, + { + "epoch": 2.5694548254852663, + "grad_norm": 1.0083595514297485, + "learning_rate": 2.8695704706095574e-05, + "loss": 2.8796, + "step": 28361 + }, + { + "epoch": 2.5695454236597133, + "grad_norm": 0.9419438242912292, + "learning_rate": 2.8689663505104814e-05, + "loss": 2.3439, + "step": 28362 + }, + { + "epoch": 2.5696360218341603, + "grad_norm": 0.9661315083503723, + "learning_rate": 2.8683622304114062e-05, + "loss": 2.4377, + "step": 28363 + }, + { + "epoch": 2.569726620008607, + "grad_norm": 1.081638216972351, + "learning_rate": 2.8677581103123303e-05, + "loss": 2.6284, + "step": 28364 + }, + { + "epoch": 2.5698172181830534, + "grad_norm": 1.0466248989105225, + "learning_rate": 2.8671539902132544e-05, + "loss": 2.7195, + "step": 28365 + }, + { + "epoch": 2.5699078163575004, + "grad_norm": 1.0629996061325073, + "learning_rate": 2.866549870114179e-05, + "loss": 2.3269, + "step": 28366 + }, + { + "epoch": 2.5699984145319474, + "grad_norm": 0.9490298628807068, + "learning_rate": 2.8659457500151032e-05, + "loss": 2.5322, + "step": 28367 + }, + { + "epoch": 2.570089012706394, + "grad_norm": 1.056740641593933, + "learning_rate": 2.8653416299160273e-05, + "loss": 2.8452, + "step": 28368 + }, + { + "epoch": 2.5701796108808406, + "grad_norm": 1.0202735662460327, + "learning_rate": 2.8647375098169517e-05, + "loss": 2.6098, + "step": 28369 + }, + { + "epoch": 2.5702702090552876, + "grad_norm": 1.0547854900360107, + "learning_rate": 2.864133389717876e-05, + "loss": 2.7403, + "step": 28370 + }, + { + "epoch": 2.5703608072297346, + "grad_norm": 1.037095069885254, + "learning_rate": 2.8635292696188006e-05, + "loss": 2.613, + "step": 28371 + }, + { + "epoch": 2.570451405404181, + "grad_norm": 1.0596156120300293, + "learning_rate": 2.8629251495197246e-05, + "loss": 2.4327, + "step": 28372 + }, + { + "epoch": 2.5705420035786277, + "grad_norm": 0.9950723648071289, + "learning_rate": 2.8623210294206487e-05, + "loss": 2.5983, + "step": 28373 + }, + { + "epoch": 2.5706326017530747, + "grad_norm": 1.0968785285949707, + "learning_rate": 2.8617169093215735e-05, + "loss": 2.516, + "step": 28374 + }, + { + "epoch": 2.5707231999275213, + "grad_norm": 0.9983850121498108, + "learning_rate": 2.8611127892224976e-05, + "loss": 2.608, + "step": 28375 + }, + { + "epoch": 2.5708137981019683, + "grad_norm": 1.0301506519317627, + "learning_rate": 2.8605086691234217e-05, + "loss": 2.6649, + "step": 28376 + }, + { + "epoch": 2.570904396276415, + "grad_norm": 1.088161826133728, + "learning_rate": 2.8599045490243457e-05, + "loss": 2.601, + "step": 28377 + }, + { + "epoch": 2.570994994450862, + "grad_norm": 0.9957136511802673, + "learning_rate": 2.8593004289252705e-05, + "loss": 2.7924, + "step": 28378 + }, + { + "epoch": 2.5710855926253084, + "grad_norm": 0.729956865310669, + "learning_rate": 2.858696308826195e-05, + "loss": 1.3327, + "step": 28379 + }, + { + "epoch": 2.5711761907997555, + "grad_norm": 0.8967634439468384, + "learning_rate": 2.858092188727119e-05, + "loss": 1.9279, + "step": 28380 + }, + { + "epoch": 2.571266788974202, + "grad_norm": 1.0754616260528564, + "learning_rate": 2.8574880686280438e-05, + "loss": 2.7934, + "step": 28381 + }, + { + "epoch": 2.571357387148649, + "grad_norm": 0.995401918888092, + "learning_rate": 2.856883948528968e-05, + "loss": 2.7277, + "step": 28382 + }, + { + "epoch": 2.5714479853230956, + "grad_norm": 1.1908408403396606, + "learning_rate": 2.856279828429892e-05, + "loss": 2.5405, + "step": 28383 + }, + { + "epoch": 2.5715385834975426, + "grad_norm": 0.8616782426834106, + "learning_rate": 2.855675708330816e-05, + "loss": 1.9154, + "step": 28384 + }, + { + "epoch": 2.571629181671989, + "grad_norm": 1.038501501083374, + "learning_rate": 2.8550715882317408e-05, + "loss": 2.8187, + "step": 28385 + }, + { + "epoch": 2.571719779846436, + "grad_norm": 1.0167676210403442, + "learning_rate": 2.854467468132665e-05, + "loss": 2.2657, + "step": 28386 + }, + { + "epoch": 2.5718103780208827, + "grad_norm": 0.9939191341400146, + "learning_rate": 2.8538633480335893e-05, + "loss": 2.5493, + "step": 28387 + }, + { + "epoch": 2.5719009761953298, + "grad_norm": 1.0297402143478394, + "learning_rate": 2.8532592279345134e-05, + "loss": 2.6105, + "step": 28388 + }, + { + "epoch": 2.5719915743697763, + "grad_norm": 1.0092782974243164, + "learning_rate": 2.852655107835438e-05, + "loss": 2.4801, + "step": 28389 + }, + { + "epoch": 2.5720821725442233, + "grad_norm": 1.04916512966156, + "learning_rate": 2.8520509877363622e-05, + "loss": 2.7035, + "step": 28390 + }, + { + "epoch": 2.57217277071867, + "grad_norm": 0.9696593880653381, + "learning_rate": 2.8514468676372863e-05, + "loss": 2.4155, + "step": 28391 + }, + { + "epoch": 2.572263368893117, + "grad_norm": 0.8737934231758118, + "learning_rate": 2.8508427475382104e-05, + "loss": 2.0457, + "step": 28392 + }, + { + "epoch": 2.5723539670675635, + "grad_norm": 1.0089707374572754, + "learning_rate": 2.850238627439135e-05, + "loss": 2.8099, + "step": 28393 + }, + { + "epoch": 2.5724445652420105, + "grad_norm": 1.0488723516464233, + "learning_rate": 2.8496345073400592e-05, + "loss": 2.5925, + "step": 28394 + }, + { + "epoch": 2.572535163416457, + "grad_norm": 1.1260002851486206, + "learning_rate": 2.8490303872409836e-05, + "loss": 2.6105, + "step": 28395 + }, + { + "epoch": 2.572625761590904, + "grad_norm": 0.9150083661079407, + "learning_rate": 2.848426267141908e-05, + "loss": 1.8976, + "step": 28396 + }, + { + "epoch": 2.5727163597653506, + "grad_norm": 1.0504734516143799, + "learning_rate": 2.8478221470428325e-05, + "loss": 2.7879, + "step": 28397 + }, + { + "epoch": 2.5728069579397976, + "grad_norm": 1.0490210056304932, + "learning_rate": 2.8472180269437566e-05, + "loss": 2.5372, + "step": 28398 + }, + { + "epoch": 2.572897556114244, + "grad_norm": 0.9836129546165466, + "learning_rate": 2.8466139068446806e-05, + "loss": 2.6825, + "step": 28399 + }, + { + "epoch": 2.572988154288691, + "grad_norm": 1.0230512619018555, + "learning_rate": 2.8460097867456054e-05, + "loss": 2.6929, + "step": 28400 + }, + { + "epoch": 2.5730787524631378, + "grad_norm": 1.1383286714553833, + "learning_rate": 2.8454056666465295e-05, + "loss": 2.54, + "step": 28401 + }, + { + "epoch": 2.573169350637585, + "grad_norm": 1.0063177347183228, + "learning_rate": 2.8448015465474536e-05, + "loss": 2.6656, + "step": 28402 + }, + { + "epoch": 2.5732599488120314, + "grad_norm": 1.034000277519226, + "learning_rate": 2.844197426448378e-05, + "loss": 2.4902, + "step": 28403 + }, + { + "epoch": 2.5733505469864784, + "grad_norm": 1.01467764377594, + "learning_rate": 2.8435933063493024e-05, + "loss": 2.8369, + "step": 28404 + }, + { + "epoch": 2.573441145160925, + "grad_norm": 1.0147162675857544, + "learning_rate": 2.842989186250227e-05, + "loss": 2.6604, + "step": 28405 + }, + { + "epoch": 2.573531743335372, + "grad_norm": 1.044044017791748, + "learning_rate": 2.842385066151151e-05, + "loss": 2.8367, + "step": 28406 + }, + { + "epoch": 2.5736223415098185, + "grad_norm": 1.0428383350372314, + "learning_rate": 2.8417809460520757e-05, + "loss": 2.8318, + "step": 28407 + }, + { + "epoch": 2.5737129396842655, + "grad_norm": 1.0454336404800415, + "learning_rate": 2.8411768259529998e-05, + "loss": 2.5407, + "step": 28408 + }, + { + "epoch": 2.573803537858712, + "grad_norm": 0.9497042298316956, + "learning_rate": 2.840572705853924e-05, + "loss": 2.7457, + "step": 28409 + }, + { + "epoch": 2.573894136033159, + "grad_norm": 1.058821678161621, + "learning_rate": 2.839968585754848e-05, + "loss": 2.7682, + "step": 28410 + }, + { + "epoch": 2.5739847342076057, + "grad_norm": 0.8974762558937073, + "learning_rate": 2.8393644656557727e-05, + "loss": 1.9093, + "step": 28411 + }, + { + "epoch": 2.5740753323820527, + "grad_norm": 1.051661729812622, + "learning_rate": 2.8387603455566968e-05, + "loss": 2.8756, + "step": 28412 + }, + { + "epoch": 2.5741659305564992, + "grad_norm": 1.0979467630386353, + "learning_rate": 2.8381562254576212e-05, + "loss": 2.6771, + "step": 28413 + }, + { + "epoch": 2.5742565287309462, + "grad_norm": 1.0879908800125122, + "learning_rate": 2.8375521053585453e-05, + "loss": 2.5735, + "step": 28414 + }, + { + "epoch": 2.574347126905393, + "grad_norm": 1.1369240283966064, + "learning_rate": 2.83694798525947e-05, + "loss": 2.8271, + "step": 28415 + }, + { + "epoch": 2.5744377250798394, + "grad_norm": 0.8640952706336975, + "learning_rate": 2.836343865160394e-05, + "loss": 1.9945, + "step": 28416 + }, + { + "epoch": 2.5745283232542864, + "grad_norm": 0.9096302390098572, + "learning_rate": 2.8357397450613182e-05, + "loss": 2.4949, + "step": 28417 + }, + { + "epoch": 2.5746189214287334, + "grad_norm": 1.1909630298614502, + "learning_rate": 2.8351356249622423e-05, + "loss": 2.5956, + "step": 28418 + }, + { + "epoch": 2.57470951960318, + "grad_norm": 1.0614932775497437, + "learning_rate": 2.834531504863167e-05, + "loss": 2.652, + "step": 28419 + }, + { + "epoch": 2.5748001177776265, + "grad_norm": 1.089420199394226, + "learning_rate": 2.833927384764091e-05, + "loss": 2.556, + "step": 28420 + }, + { + "epoch": 2.5748907159520735, + "grad_norm": 1.2115647792816162, + "learning_rate": 2.8333232646650155e-05, + "loss": 2.5822, + "step": 28421 + }, + { + "epoch": 2.5749813141265205, + "grad_norm": 1.0842310190200806, + "learning_rate": 2.83271914456594e-05, + "loss": 2.7641, + "step": 28422 + }, + { + "epoch": 2.575071912300967, + "grad_norm": 0.9046067595481873, + "learning_rate": 2.8321150244668644e-05, + "loss": 2.0238, + "step": 28423 + }, + { + "epoch": 2.5751625104754137, + "grad_norm": 0.9859452843666077, + "learning_rate": 2.8315109043677885e-05, + "loss": 2.6721, + "step": 28424 + }, + { + "epoch": 2.5752531086498607, + "grad_norm": 0.988021969795227, + "learning_rate": 2.8309067842687126e-05, + "loss": 2.7421, + "step": 28425 + }, + { + "epoch": 2.5753437068243077, + "grad_norm": 0.9984407424926758, + "learning_rate": 2.8303026641696373e-05, + "loss": 2.6666, + "step": 28426 + }, + { + "epoch": 2.5754343049987543, + "grad_norm": 0.8319879770278931, + "learning_rate": 2.8296985440705614e-05, + "loss": 2.0839, + "step": 28427 + }, + { + "epoch": 2.575524903173201, + "grad_norm": 0.9038972854614258, + "learning_rate": 2.8290944239714855e-05, + "loss": 1.7792, + "step": 28428 + }, + { + "epoch": 2.575615501347648, + "grad_norm": 0.9813900589942932, + "learning_rate": 2.82849030387241e-05, + "loss": 2.7096, + "step": 28429 + }, + { + "epoch": 2.575706099522095, + "grad_norm": 1.1453783512115479, + "learning_rate": 2.8278861837733343e-05, + "loss": 2.7266, + "step": 28430 + }, + { + "epoch": 2.5757966976965414, + "grad_norm": 1.0442111492156982, + "learning_rate": 2.8272820636742587e-05, + "loss": 2.8696, + "step": 28431 + }, + { + "epoch": 2.575887295870988, + "grad_norm": 1.031966209411621, + "learning_rate": 2.826677943575183e-05, + "loss": 2.5682, + "step": 28432 + }, + { + "epoch": 2.575977894045435, + "grad_norm": 1.0146911144256592, + "learning_rate": 2.826073823476107e-05, + "loss": 2.5723, + "step": 28433 + }, + { + "epoch": 2.576068492219882, + "grad_norm": 0.9781644344329834, + "learning_rate": 2.8254697033770317e-05, + "loss": 2.6284, + "step": 28434 + }, + { + "epoch": 2.5761590903943286, + "grad_norm": 1.0148475170135498, + "learning_rate": 2.8248655832779558e-05, + "loss": 2.659, + "step": 28435 + }, + { + "epoch": 2.576249688568775, + "grad_norm": 0.9615299701690674, + "learning_rate": 2.82426146317888e-05, + "loss": 2.4391, + "step": 28436 + }, + { + "epoch": 2.576340286743222, + "grad_norm": 0.9793601036071777, + "learning_rate": 2.8236573430798046e-05, + "loss": 2.5598, + "step": 28437 + }, + { + "epoch": 2.576430884917669, + "grad_norm": 1.028250813484192, + "learning_rate": 2.8230532229807287e-05, + "loss": 2.773, + "step": 28438 + }, + { + "epoch": 2.5765214830921157, + "grad_norm": 0.9072570204734802, + "learning_rate": 2.822449102881653e-05, + "loss": 2.5333, + "step": 28439 + }, + { + "epoch": 2.5766120812665623, + "grad_norm": 0.9306108951568604, + "learning_rate": 2.8218449827825772e-05, + "loss": 2.4818, + "step": 28440 + }, + { + "epoch": 2.5767026794410093, + "grad_norm": 1.1113553047180176, + "learning_rate": 2.821240862683502e-05, + "loss": 2.6976, + "step": 28441 + }, + { + "epoch": 2.5767932776154563, + "grad_norm": 1.1375454664230347, + "learning_rate": 2.820636742584426e-05, + "loss": 2.6764, + "step": 28442 + }, + { + "epoch": 2.576883875789903, + "grad_norm": 1.1361660957336426, + "learning_rate": 2.82003262248535e-05, + "loss": 2.7833, + "step": 28443 + }, + { + "epoch": 2.5769744739643494, + "grad_norm": 1.0752943754196167, + "learning_rate": 2.8194285023862742e-05, + "loss": 2.6206, + "step": 28444 + }, + { + "epoch": 2.5770650721387964, + "grad_norm": 1.1375374794006348, + "learning_rate": 2.818824382287199e-05, + "loss": 2.6855, + "step": 28445 + }, + { + "epoch": 2.5771556703132434, + "grad_norm": 1.074056625366211, + "learning_rate": 2.818220262188123e-05, + "loss": 2.6504, + "step": 28446 + }, + { + "epoch": 2.57724626848769, + "grad_norm": 1.002465009689331, + "learning_rate": 2.8176161420890475e-05, + "loss": 2.6042, + "step": 28447 + }, + { + "epoch": 2.5773368666621366, + "grad_norm": 1.1116645336151123, + "learning_rate": 2.8170120219899715e-05, + "loss": 2.3536, + "step": 28448 + }, + { + "epoch": 2.5774274648365836, + "grad_norm": 1.015047311782837, + "learning_rate": 2.8164079018908963e-05, + "loss": 2.5422, + "step": 28449 + }, + { + "epoch": 2.5775180630110306, + "grad_norm": 1.008710265159607, + "learning_rate": 2.8158037817918204e-05, + "loss": 2.6325, + "step": 28450 + }, + { + "epoch": 2.577608661185477, + "grad_norm": 0.9884365797042847, + "learning_rate": 2.8151996616927445e-05, + "loss": 2.8226, + "step": 28451 + }, + { + "epoch": 2.5776992593599237, + "grad_norm": 1.0683159828186035, + "learning_rate": 2.8145955415936692e-05, + "loss": 2.5258, + "step": 28452 + }, + { + "epoch": 2.5777898575343707, + "grad_norm": 1.0422779321670532, + "learning_rate": 2.8139914214945933e-05, + "loss": 2.5771, + "step": 28453 + }, + { + "epoch": 2.5778804557088177, + "grad_norm": 1.0838055610656738, + "learning_rate": 2.8133873013955174e-05, + "loss": 2.7867, + "step": 28454 + }, + { + "epoch": 2.5779710538832643, + "grad_norm": 1.0357208251953125, + "learning_rate": 2.8127831812964418e-05, + "loss": 2.5094, + "step": 28455 + }, + { + "epoch": 2.578061652057711, + "grad_norm": 0.8638319969177246, + "learning_rate": 2.8121790611973662e-05, + "loss": 2.1279, + "step": 28456 + }, + { + "epoch": 2.578152250232158, + "grad_norm": 1.0324554443359375, + "learning_rate": 2.8115749410982907e-05, + "loss": 2.7177, + "step": 28457 + }, + { + "epoch": 2.5782428484066044, + "grad_norm": 1.0666698217391968, + "learning_rate": 2.8109708209992147e-05, + "loss": 2.6429, + "step": 28458 + }, + { + "epoch": 2.5783334465810515, + "grad_norm": 1.0068883895874023, + "learning_rate": 2.810366700900139e-05, + "loss": 2.4607, + "step": 28459 + }, + { + "epoch": 2.578424044755498, + "grad_norm": 0.9974794983863831, + "learning_rate": 2.8097625808010636e-05, + "loss": 2.7032, + "step": 28460 + }, + { + "epoch": 2.578514642929945, + "grad_norm": 0.8466158509254456, + "learning_rate": 2.8091584607019877e-05, + "loss": 1.7784, + "step": 28461 + }, + { + "epoch": 2.5786052411043916, + "grad_norm": 0.9870588779449463, + "learning_rate": 2.8085543406029118e-05, + "loss": 2.593, + "step": 28462 + }, + { + "epoch": 2.5786958392788386, + "grad_norm": 0.9872276782989502, + "learning_rate": 2.8079502205038362e-05, + "loss": 2.6967, + "step": 28463 + }, + { + "epoch": 2.578786437453285, + "grad_norm": 1.0286692380905151, + "learning_rate": 2.8073461004047606e-05, + "loss": 2.5808, + "step": 28464 + }, + { + "epoch": 2.578877035627732, + "grad_norm": 1.0880019664764404, + "learning_rate": 2.806741980305685e-05, + "loss": 2.5917, + "step": 28465 + }, + { + "epoch": 2.5789676338021787, + "grad_norm": 1.0738927125930786, + "learning_rate": 2.806137860206609e-05, + "loss": 2.7365, + "step": 28466 + }, + { + "epoch": 2.5790582319766258, + "grad_norm": 1.2491785287857056, + "learning_rate": 2.805533740107534e-05, + "loss": 2.6579, + "step": 28467 + }, + { + "epoch": 2.5791488301510723, + "grad_norm": 1.0048325061798096, + "learning_rate": 2.804929620008458e-05, + "loss": 2.6994, + "step": 28468 + }, + { + "epoch": 2.5792394283255193, + "grad_norm": 1.0044740438461304, + "learning_rate": 2.804325499909382e-05, + "loss": 2.7735, + "step": 28469 + }, + { + "epoch": 2.579330026499966, + "grad_norm": 0.9995697140693665, + "learning_rate": 2.803721379810306e-05, + "loss": 2.9377, + "step": 28470 + }, + { + "epoch": 2.579420624674413, + "grad_norm": 0.9853288531303406, + "learning_rate": 2.803117259711231e-05, + "loss": 1.7941, + "step": 28471 + }, + { + "epoch": 2.5795112228488595, + "grad_norm": 1.043015718460083, + "learning_rate": 2.802513139612155e-05, + "loss": 2.8644, + "step": 28472 + }, + { + "epoch": 2.5796018210233065, + "grad_norm": 0.9995205998420715, + "learning_rate": 2.8019090195130794e-05, + "loss": 2.9712, + "step": 28473 + }, + { + "epoch": 2.579692419197753, + "grad_norm": 1.1116538047790527, + "learning_rate": 2.8013048994140035e-05, + "loss": 2.5606, + "step": 28474 + }, + { + "epoch": 2.5797830173722, + "grad_norm": 1.1769388914108276, + "learning_rate": 2.8007007793149282e-05, + "loss": 2.7037, + "step": 28475 + }, + { + "epoch": 2.5798736155466466, + "grad_norm": 1.1467934846878052, + "learning_rate": 2.8000966592158523e-05, + "loss": 2.7428, + "step": 28476 + }, + { + "epoch": 2.5799642137210936, + "grad_norm": 1.004278302192688, + "learning_rate": 2.7994925391167764e-05, + "loss": 2.6447, + "step": 28477 + }, + { + "epoch": 2.58005481189554, + "grad_norm": 0.9975390434265137, + "learning_rate": 2.7988884190177005e-05, + "loss": 2.4396, + "step": 28478 + }, + { + "epoch": 2.580145410069987, + "grad_norm": 0.9339754581451416, + "learning_rate": 2.7982842989186252e-05, + "loss": 2.1659, + "step": 28479 + }, + { + "epoch": 2.5802360082444338, + "grad_norm": 1.0677993297576904, + "learning_rate": 2.7976801788195493e-05, + "loss": 2.8409, + "step": 28480 + }, + { + "epoch": 2.580326606418881, + "grad_norm": 0.9465197324752808, + "learning_rate": 2.7970760587204737e-05, + "loss": 2.8846, + "step": 28481 + }, + { + "epoch": 2.5804172045933274, + "grad_norm": 0.9230384230613708, + "learning_rate": 2.7964719386213985e-05, + "loss": 2.0985, + "step": 28482 + }, + { + "epoch": 2.5805078027677744, + "grad_norm": 0.9779885411262512, + "learning_rate": 2.7958678185223226e-05, + "loss": 2.6095, + "step": 28483 + }, + { + "epoch": 2.580598400942221, + "grad_norm": 1.105844497680664, + "learning_rate": 2.7952636984232467e-05, + "loss": 2.7449, + "step": 28484 + }, + { + "epoch": 2.580688999116668, + "grad_norm": 0.9614375233650208, + "learning_rate": 2.7946595783241707e-05, + "loss": 2.549, + "step": 28485 + }, + { + "epoch": 2.5807795972911145, + "grad_norm": 0.8269302248954773, + "learning_rate": 2.7940554582250955e-05, + "loss": 1.79, + "step": 28486 + }, + { + "epoch": 2.5808701954655615, + "grad_norm": 0.9685688018798828, + "learning_rate": 2.7934513381260196e-05, + "loss": 2.4525, + "step": 28487 + }, + { + "epoch": 2.580960793640008, + "grad_norm": 0.9134342670440674, + "learning_rate": 2.7928472180269437e-05, + "loss": 2.412, + "step": 28488 + }, + { + "epoch": 2.581051391814455, + "grad_norm": 1.0450855493545532, + "learning_rate": 2.792243097927868e-05, + "loss": 2.5845, + "step": 28489 + }, + { + "epoch": 2.5811419899889017, + "grad_norm": 1.0936076641082764, + "learning_rate": 2.791638977828793e-05, + "loss": 2.6375, + "step": 28490 + }, + { + "epoch": 2.5812325881633487, + "grad_norm": 1.0281821489334106, + "learning_rate": 2.791034857729717e-05, + "loss": 2.6098, + "step": 28491 + }, + { + "epoch": 2.5813231863377952, + "grad_norm": 0.9935274124145508, + "learning_rate": 2.790430737630641e-05, + "loss": 2.6082, + "step": 28492 + }, + { + "epoch": 2.5814137845122422, + "grad_norm": 1.1090166568756104, + "learning_rate": 2.789826617531565e-05, + "loss": 2.4825, + "step": 28493 + }, + { + "epoch": 2.581504382686689, + "grad_norm": 0.8472970128059387, + "learning_rate": 2.78922249743249e-05, + "loss": 1.87, + "step": 28494 + }, + { + "epoch": 2.581594980861136, + "grad_norm": 1.0744051933288574, + "learning_rate": 2.788618377333414e-05, + "loss": 2.7395, + "step": 28495 + }, + { + "epoch": 2.5816855790355824, + "grad_norm": 0.8275654315948486, + "learning_rate": 2.788014257234338e-05, + "loss": 1.8239, + "step": 28496 + }, + { + "epoch": 2.5817761772100294, + "grad_norm": 1.018463373184204, + "learning_rate": 2.7874101371352628e-05, + "loss": 2.8465, + "step": 28497 + }, + { + "epoch": 2.581866775384476, + "grad_norm": 1.0056869983673096, + "learning_rate": 2.786806017036187e-05, + "loss": 2.5729, + "step": 28498 + }, + { + "epoch": 2.5819573735589225, + "grad_norm": 1.0533407926559448, + "learning_rate": 2.7862018969371113e-05, + "loss": 2.6228, + "step": 28499 + }, + { + "epoch": 2.5820479717333695, + "grad_norm": 1.0015556812286377, + "learning_rate": 2.7855977768380354e-05, + "loss": 2.3916, + "step": 28500 + }, + { + "epoch": 2.5821385699078165, + "grad_norm": 1.022661566734314, + "learning_rate": 2.78499365673896e-05, + "loss": 2.6171, + "step": 28501 + }, + { + "epoch": 2.582229168082263, + "grad_norm": 1.034009575843811, + "learning_rate": 2.7843895366398842e-05, + "loss": 2.6968, + "step": 28502 + }, + { + "epoch": 2.5823197662567097, + "grad_norm": 1.0876432657241821, + "learning_rate": 2.7837854165408083e-05, + "loss": 2.291, + "step": 28503 + }, + { + "epoch": 2.5824103644311567, + "grad_norm": 1.031313180923462, + "learning_rate": 2.7831812964417324e-05, + "loss": 3.0048, + "step": 28504 + }, + { + "epoch": 2.5825009626056037, + "grad_norm": 0.9004068374633789, + "learning_rate": 2.782577176342657e-05, + "loss": 1.9201, + "step": 28505 + }, + { + "epoch": 2.5825915607800503, + "grad_norm": 1.1680511236190796, + "learning_rate": 2.7819730562435812e-05, + "loss": 2.7736, + "step": 28506 + }, + { + "epoch": 2.582682158954497, + "grad_norm": 1.0199304819107056, + "learning_rate": 2.7813689361445057e-05, + "loss": 2.9189, + "step": 28507 + }, + { + "epoch": 2.582772757128944, + "grad_norm": 1.0222270488739014, + "learning_rate": 2.7807648160454297e-05, + "loss": 2.5023, + "step": 28508 + }, + { + "epoch": 2.582863355303391, + "grad_norm": 1.0185935497283936, + "learning_rate": 2.7801606959463545e-05, + "loss": 2.7524, + "step": 28509 + }, + { + "epoch": 2.5829539534778374, + "grad_norm": 1.1326971054077148, + "learning_rate": 2.7795565758472786e-05, + "loss": 2.3427, + "step": 28510 + }, + { + "epoch": 2.583044551652284, + "grad_norm": 1.0692416429519653, + "learning_rate": 2.7789524557482027e-05, + "loss": 2.8333, + "step": 28511 + }, + { + "epoch": 2.583135149826731, + "grad_norm": 1.0248109102249146, + "learning_rate": 2.7783483356491274e-05, + "loss": 2.5149, + "step": 28512 + }, + { + "epoch": 2.583225748001178, + "grad_norm": 1.0246700048446655, + "learning_rate": 2.7777442155500515e-05, + "loss": 2.6622, + "step": 28513 + }, + { + "epoch": 2.5833163461756246, + "grad_norm": 0.9616072177886963, + "learning_rate": 2.7771400954509756e-05, + "loss": 2.5729, + "step": 28514 + }, + { + "epoch": 2.583406944350071, + "grad_norm": 1.0359872579574585, + "learning_rate": 2.7765359753519e-05, + "loss": 2.6605, + "step": 28515 + }, + { + "epoch": 2.583497542524518, + "grad_norm": 0.9954844117164612, + "learning_rate": 2.7759318552528248e-05, + "loss": 2.7597, + "step": 28516 + }, + { + "epoch": 2.583588140698965, + "grad_norm": 1.0405069589614868, + "learning_rate": 2.775327735153749e-05, + "loss": 2.8821, + "step": 28517 + }, + { + "epoch": 2.5836787388734117, + "grad_norm": 1.003001093864441, + "learning_rate": 2.774723615054673e-05, + "loss": 2.6433, + "step": 28518 + }, + { + "epoch": 2.5837693370478583, + "grad_norm": 1.0680850744247437, + "learning_rate": 2.774119494955597e-05, + "loss": 2.4568, + "step": 28519 + }, + { + "epoch": 2.5838599352223053, + "grad_norm": 1.0257662534713745, + "learning_rate": 2.7735153748565218e-05, + "loss": 2.5585, + "step": 28520 + }, + { + "epoch": 2.5839505333967523, + "grad_norm": 1.0560818910598755, + "learning_rate": 2.772911254757446e-05, + "loss": 2.4557, + "step": 28521 + }, + { + "epoch": 2.584041131571199, + "grad_norm": 1.0959190130233765, + "learning_rate": 2.77230713465837e-05, + "loss": 2.7288, + "step": 28522 + }, + { + "epoch": 2.5841317297456454, + "grad_norm": 1.0005916357040405, + "learning_rate": 2.7717030145592944e-05, + "loss": 2.5806, + "step": 28523 + }, + { + "epoch": 2.5842223279200924, + "grad_norm": 0.9876531958580017, + "learning_rate": 2.771098894460219e-05, + "loss": 2.9236, + "step": 28524 + }, + { + "epoch": 2.5843129260945394, + "grad_norm": 1.1094334125518799, + "learning_rate": 2.7704947743611432e-05, + "loss": 2.6733, + "step": 28525 + }, + { + "epoch": 2.584403524268986, + "grad_norm": 0.933120846748352, + "learning_rate": 2.7698906542620673e-05, + "loss": 2.5807, + "step": 28526 + }, + { + "epoch": 2.5844941224434326, + "grad_norm": 1.0193710327148438, + "learning_rate": 2.769286534162992e-05, + "loss": 2.4535, + "step": 28527 + }, + { + "epoch": 2.5845847206178796, + "grad_norm": 1.066829800605774, + "learning_rate": 2.768682414063916e-05, + "loss": 2.597, + "step": 28528 + }, + { + "epoch": 2.5846753187923266, + "grad_norm": 0.9797355532646179, + "learning_rate": 2.7680782939648402e-05, + "loss": 2.5785, + "step": 28529 + }, + { + "epoch": 2.584765916966773, + "grad_norm": 0.9493997693061829, + "learning_rate": 2.7674741738657643e-05, + "loss": 2.4777, + "step": 28530 + }, + { + "epoch": 2.5848565151412197, + "grad_norm": 1.0147753953933716, + "learning_rate": 2.766870053766689e-05, + "loss": 2.6626, + "step": 28531 + }, + { + "epoch": 2.5849471133156667, + "grad_norm": 1.0427567958831787, + "learning_rate": 2.7662659336676135e-05, + "loss": 2.4583, + "step": 28532 + }, + { + "epoch": 2.5850377114901137, + "grad_norm": 1.062558889389038, + "learning_rate": 2.7656618135685376e-05, + "loss": 3.0477, + "step": 28533 + }, + { + "epoch": 2.5851283096645603, + "grad_norm": 1.0276075601577759, + "learning_rate": 2.7650576934694617e-05, + "loss": 2.857, + "step": 28534 + }, + { + "epoch": 2.585218907839007, + "grad_norm": 0.9954867362976074, + "learning_rate": 2.7644535733703864e-05, + "loss": 2.464, + "step": 28535 + }, + { + "epoch": 2.585309506013454, + "grad_norm": 0.9954818487167358, + "learning_rate": 2.7638494532713105e-05, + "loss": 2.6482, + "step": 28536 + }, + { + "epoch": 2.5854001041879004, + "grad_norm": 0.9922828674316406, + "learning_rate": 2.7632453331722346e-05, + "loss": 2.8141, + "step": 28537 + }, + { + "epoch": 2.5854907023623475, + "grad_norm": 1.058327078819275, + "learning_rate": 2.7626412130731587e-05, + "loss": 2.7482, + "step": 28538 + }, + { + "epoch": 2.585581300536794, + "grad_norm": 0.9922651052474976, + "learning_rate": 2.7620370929740834e-05, + "loss": 2.5427, + "step": 28539 + }, + { + "epoch": 2.585671898711241, + "grad_norm": 1.0060672760009766, + "learning_rate": 2.761432972875008e-05, + "loss": 2.5506, + "step": 28540 + }, + { + "epoch": 2.5857624968856876, + "grad_norm": 0.9732863903045654, + "learning_rate": 2.760828852775932e-05, + "loss": 2.8388, + "step": 28541 + }, + { + "epoch": 2.5858530950601346, + "grad_norm": 0.9481502771377563, + "learning_rate": 2.7602247326768567e-05, + "loss": 2.6425, + "step": 28542 + }, + { + "epoch": 2.585943693234581, + "grad_norm": 0.9890325665473938, + "learning_rate": 2.7596206125777808e-05, + "loss": 2.4213, + "step": 28543 + }, + { + "epoch": 2.586034291409028, + "grad_norm": 0.9939655065536499, + "learning_rate": 2.759016492478705e-05, + "loss": 2.7597, + "step": 28544 + }, + { + "epoch": 2.5861248895834748, + "grad_norm": 0.947820782661438, + "learning_rate": 2.758412372379629e-05, + "loss": 2.5151, + "step": 28545 + }, + { + "epoch": 2.5862154877579218, + "grad_norm": 0.9844976663589478, + "learning_rate": 2.7578082522805537e-05, + "loss": 2.5964, + "step": 28546 + }, + { + "epoch": 2.5863060859323683, + "grad_norm": 0.9928586483001709, + "learning_rate": 2.7572041321814778e-05, + "loss": 2.5588, + "step": 28547 + }, + { + "epoch": 2.5863966841068153, + "grad_norm": 1.0153498649597168, + "learning_rate": 2.7566000120824022e-05, + "loss": 2.8861, + "step": 28548 + }, + { + "epoch": 2.586487282281262, + "grad_norm": 0.9363626837730408, + "learning_rate": 2.7559958919833263e-05, + "loss": 2.4719, + "step": 28549 + }, + { + "epoch": 2.586577880455709, + "grad_norm": 0.9268759489059448, + "learning_rate": 2.755391771884251e-05, + "loss": 2.7229, + "step": 28550 + }, + { + "epoch": 2.5866684786301555, + "grad_norm": 0.9959967732429504, + "learning_rate": 2.754787651785175e-05, + "loss": 2.6843, + "step": 28551 + }, + { + "epoch": 2.5867590768046025, + "grad_norm": 0.9333212375640869, + "learning_rate": 2.7541835316860992e-05, + "loss": 2.4729, + "step": 28552 + }, + { + "epoch": 2.586849674979049, + "grad_norm": 1.0499482154846191, + "learning_rate": 2.7535794115870233e-05, + "loss": 2.5894, + "step": 28553 + }, + { + "epoch": 2.586940273153496, + "grad_norm": 0.9151393175125122, + "learning_rate": 2.752975291487948e-05, + "loss": 2.0193, + "step": 28554 + }, + { + "epoch": 2.5870308713279426, + "grad_norm": 0.8933428525924683, + "learning_rate": 2.752371171388872e-05, + "loss": 1.7581, + "step": 28555 + }, + { + "epoch": 2.5871214695023896, + "grad_norm": 1.0407967567443848, + "learning_rate": 2.7517670512897966e-05, + "loss": 2.6169, + "step": 28556 + }, + { + "epoch": 2.587212067676836, + "grad_norm": 1.021500825881958, + "learning_rate": 2.751162931190721e-05, + "loss": 2.762, + "step": 28557 + }, + { + "epoch": 2.587302665851283, + "grad_norm": 0.9328601956367493, + "learning_rate": 2.7505588110916454e-05, + "loss": 2.2731, + "step": 28558 + }, + { + "epoch": 2.5873932640257298, + "grad_norm": 1.1110918521881104, + "learning_rate": 2.7499546909925695e-05, + "loss": 2.6039, + "step": 28559 + }, + { + "epoch": 2.587483862200177, + "grad_norm": 0.966049313545227, + "learning_rate": 2.7493505708934936e-05, + "loss": 2.6705, + "step": 28560 + }, + { + "epoch": 2.5875744603746234, + "grad_norm": 1.104556679725647, + "learning_rate": 2.7487464507944183e-05, + "loss": 2.6444, + "step": 28561 + }, + { + "epoch": 2.5876650585490704, + "grad_norm": 0.9555646777153015, + "learning_rate": 2.7481423306953424e-05, + "loss": 2.4282, + "step": 28562 + }, + { + "epoch": 2.587755656723517, + "grad_norm": 0.9796559810638428, + "learning_rate": 2.7475382105962665e-05, + "loss": 2.6611, + "step": 28563 + }, + { + "epoch": 2.587846254897964, + "grad_norm": 0.9730734825134277, + "learning_rate": 2.7469340904971906e-05, + "loss": 2.435, + "step": 28564 + }, + { + "epoch": 2.5879368530724105, + "grad_norm": 1.0055497884750366, + "learning_rate": 2.7463299703981153e-05, + "loss": 2.5963, + "step": 28565 + }, + { + "epoch": 2.5880274512468575, + "grad_norm": 1.0169192552566528, + "learning_rate": 2.7457258502990398e-05, + "loss": 2.5894, + "step": 28566 + }, + { + "epoch": 2.588118049421304, + "grad_norm": 0.9956843256950378, + "learning_rate": 2.745121730199964e-05, + "loss": 2.5445, + "step": 28567 + }, + { + "epoch": 2.588208647595751, + "grad_norm": 1.2150760889053345, + "learning_rate": 2.744517610100888e-05, + "loss": 2.4891, + "step": 28568 + }, + { + "epoch": 2.5882992457701977, + "grad_norm": 0.9957037568092346, + "learning_rate": 2.7439134900018127e-05, + "loss": 2.5319, + "step": 28569 + }, + { + "epoch": 2.5883898439446447, + "grad_norm": 1.1052690744400024, + "learning_rate": 2.7433093699027368e-05, + "loss": 2.6475, + "step": 28570 + }, + { + "epoch": 2.5884804421190912, + "grad_norm": 1.0407114028930664, + "learning_rate": 2.742705249803661e-05, + "loss": 2.5848, + "step": 28571 + }, + { + "epoch": 2.5885710402935382, + "grad_norm": 0.9908226132392883, + "learning_rate": 2.7421011297045856e-05, + "loss": 2.5428, + "step": 28572 + }, + { + "epoch": 2.588661638467985, + "grad_norm": 1.0080175399780273, + "learning_rate": 2.7414970096055097e-05, + "loss": 2.8197, + "step": 28573 + }, + { + "epoch": 2.588752236642432, + "grad_norm": 1.0551722049713135, + "learning_rate": 2.740892889506434e-05, + "loss": 2.5707, + "step": 28574 + }, + { + "epoch": 2.5888428348168784, + "grad_norm": 1.0133918523788452, + "learning_rate": 2.7402887694073582e-05, + "loss": 2.5658, + "step": 28575 + }, + { + "epoch": 2.5889334329913254, + "grad_norm": 1.1123892068862915, + "learning_rate": 2.739684649308283e-05, + "loss": 2.5027, + "step": 28576 + }, + { + "epoch": 2.589024031165772, + "grad_norm": 0.9063557386398315, + "learning_rate": 2.739080529209207e-05, + "loss": 2.024, + "step": 28577 + }, + { + "epoch": 2.5891146293402185, + "grad_norm": 1.0310887098312378, + "learning_rate": 2.738476409110131e-05, + "loss": 2.3512, + "step": 28578 + }, + { + "epoch": 2.5892052275146655, + "grad_norm": 1.0667670965194702, + "learning_rate": 2.7378722890110552e-05, + "loss": 2.5683, + "step": 28579 + }, + { + "epoch": 2.5892958256891125, + "grad_norm": 1.2180708646774292, + "learning_rate": 2.73726816891198e-05, + "loss": 2.6666, + "step": 28580 + }, + { + "epoch": 2.589386423863559, + "grad_norm": 1.0463860034942627, + "learning_rate": 2.736664048812904e-05, + "loss": 2.7567, + "step": 28581 + }, + { + "epoch": 2.5894770220380057, + "grad_norm": 1.2264879941940308, + "learning_rate": 2.7360599287138285e-05, + "loss": 2.4427, + "step": 28582 + }, + { + "epoch": 2.5895676202124527, + "grad_norm": 1.0142735242843628, + "learning_rate": 2.7354558086147526e-05, + "loss": 2.5555, + "step": 28583 + }, + { + "epoch": 2.5896582183868997, + "grad_norm": 1.0854319334030151, + "learning_rate": 2.7348516885156773e-05, + "loss": 2.5565, + "step": 28584 + }, + { + "epoch": 2.5897488165613463, + "grad_norm": 1.0391877889633179, + "learning_rate": 2.7342475684166014e-05, + "loss": 2.7268, + "step": 28585 + }, + { + "epoch": 2.589839414735793, + "grad_norm": 0.9273542165756226, + "learning_rate": 2.7336434483175255e-05, + "loss": 1.8417, + "step": 28586 + }, + { + "epoch": 2.58993001291024, + "grad_norm": 1.0160449743270874, + "learning_rate": 2.7330393282184502e-05, + "loss": 2.71, + "step": 28587 + }, + { + "epoch": 2.590020611084687, + "grad_norm": 1.041961669921875, + "learning_rate": 2.7324352081193743e-05, + "loss": 2.6414, + "step": 28588 + }, + { + "epoch": 2.5901112092591334, + "grad_norm": 0.9588469862937927, + "learning_rate": 2.7318310880202984e-05, + "loss": 2.5559, + "step": 28589 + }, + { + "epoch": 2.59020180743358, + "grad_norm": 0.9941871762275696, + "learning_rate": 2.731226967921223e-05, + "loss": 2.4553, + "step": 28590 + }, + { + "epoch": 2.590292405608027, + "grad_norm": 1.038806438446045, + "learning_rate": 2.7306228478221473e-05, + "loss": 2.9433, + "step": 28591 + }, + { + "epoch": 2.590383003782474, + "grad_norm": 1.031529426574707, + "learning_rate": 2.7300187277230717e-05, + "loss": 2.7369, + "step": 28592 + }, + { + "epoch": 2.5904736019569206, + "grad_norm": 0.9906982183456421, + "learning_rate": 2.7294146076239958e-05, + "loss": 2.7454, + "step": 28593 + }, + { + "epoch": 2.590564200131367, + "grad_norm": 1.008229374885559, + "learning_rate": 2.72881048752492e-05, + "loss": 2.8408, + "step": 28594 + }, + { + "epoch": 2.590654798305814, + "grad_norm": 1.055298089981079, + "learning_rate": 2.7282063674258446e-05, + "loss": 2.6748, + "step": 28595 + }, + { + "epoch": 2.590745396480261, + "grad_norm": 1.0172884464263916, + "learning_rate": 2.7276022473267687e-05, + "loss": 2.5948, + "step": 28596 + }, + { + "epoch": 2.5908359946547077, + "grad_norm": 1.0180271863937378, + "learning_rate": 2.7269981272276928e-05, + "loss": 2.7554, + "step": 28597 + }, + { + "epoch": 2.5909265928291543, + "grad_norm": 1.0822232961654663, + "learning_rate": 2.7263940071286172e-05, + "loss": 2.7983, + "step": 28598 + }, + { + "epoch": 2.5910171910036013, + "grad_norm": 1.0659008026123047, + "learning_rate": 2.7257898870295416e-05, + "loss": 2.5426, + "step": 28599 + }, + { + "epoch": 2.5911077891780483, + "grad_norm": 1.0177873373031616, + "learning_rate": 2.725185766930466e-05, + "loss": 2.9248, + "step": 28600 + }, + { + "epoch": 2.591198387352495, + "grad_norm": 0.9539422392845154, + "learning_rate": 2.72458164683139e-05, + "loss": 2.4526, + "step": 28601 + }, + { + "epoch": 2.5912889855269414, + "grad_norm": 1.0043349266052246, + "learning_rate": 2.723977526732315e-05, + "loss": 2.6442, + "step": 28602 + }, + { + "epoch": 2.5913795837013884, + "grad_norm": 0.9918823838233948, + "learning_rate": 2.723373406633239e-05, + "loss": 2.6221, + "step": 28603 + }, + { + "epoch": 2.5914701818758354, + "grad_norm": 0.9454717636108398, + "learning_rate": 2.722769286534163e-05, + "loss": 2.3376, + "step": 28604 + }, + { + "epoch": 2.591560780050282, + "grad_norm": 1.0160744190216064, + "learning_rate": 2.722165166435087e-05, + "loss": 2.5497, + "step": 28605 + }, + { + "epoch": 2.5916513782247286, + "grad_norm": 0.9379852414131165, + "learning_rate": 2.721561046336012e-05, + "loss": 2.4288, + "step": 28606 + }, + { + "epoch": 2.5917419763991756, + "grad_norm": 0.9990365505218506, + "learning_rate": 2.720956926236936e-05, + "loss": 2.8977, + "step": 28607 + }, + { + "epoch": 2.5918325745736226, + "grad_norm": 1.0727591514587402, + "learning_rate": 2.7203528061378604e-05, + "loss": 2.4971, + "step": 28608 + }, + { + "epoch": 2.591923172748069, + "grad_norm": 1.025066614151001, + "learning_rate": 2.7197486860387845e-05, + "loss": 2.6666, + "step": 28609 + }, + { + "epoch": 2.5920137709225157, + "grad_norm": 0.9888936877250671, + "learning_rate": 2.7191445659397092e-05, + "loss": 2.6489, + "step": 28610 + }, + { + "epoch": 2.5921043690969627, + "grad_norm": 0.9710732698440552, + "learning_rate": 2.7185404458406333e-05, + "loss": 2.5634, + "step": 28611 + }, + { + "epoch": 2.5921949672714097, + "grad_norm": 1.31568443775177, + "learning_rate": 2.7179363257415574e-05, + "loss": 2.4194, + "step": 28612 + }, + { + "epoch": 2.5922855654458563, + "grad_norm": 1.0462268590927124, + "learning_rate": 2.7173322056424815e-05, + "loss": 2.5923, + "step": 28613 + }, + { + "epoch": 2.592376163620303, + "grad_norm": 1.034132719039917, + "learning_rate": 2.7167280855434062e-05, + "loss": 2.8975, + "step": 28614 + }, + { + "epoch": 2.59246676179475, + "grad_norm": 0.8982876539230347, + "learning_rate": 2.7161239654443303e-05, + "loss": 2.2153, + "step": 28615 + }, + { + "epoch": 2.592557359969197, + "grad_norm": 0.9996770024299622, + "learning_rate": 2.7155198453452548e-05, + "loss": 2.5379, + "step": 28616 + }, + { + "epoch": 2.5926479581436435, + "grad_norm": 0.9883580207824707, + "learning_rate": 2.7149157252461792e-05, + "loss": 2.4968, + "step": 28617 + }, + { + "epoch": 2.59273855631809, + "grad_norm": 1.0353463888168335, + "learning_rate": 2.7143116051471036e-05, + "loss": 2.6137, + "step": 28618 + }, + { + "epoch": 2.592829154492537, + "grad_norm": 0.8914060592651367, + "learning_rate": 2.7137074850480277e-05, + "loss": 1.9975, + "step": 28619 + }, + { + "epoch": 2.5929197526669836, + "grad_norm": 0.7585070133209229, + "learning_rate": 2.7131033649489518e-05, + "loss": 1.2458, + "step": 28620 + }, + { + "epoch": 2.5930103508414306, + "grad_norm": 1.0307084321975708, + "learning_rate": 2.7124992448498765e-05, + "loss": 2.6563, + "step": 28621 + }, + { + "epoch": 2.593100949015877, + "grad_norm": 1.0296392440795898, + "learning_rate": 2.7118951247508006e-05, + "loss": 2.7042, + "step": 28622 + }, + { + "epoch": 2.593191547190324, + "grad_norm": 0.9860687255859375, + "learning_rate": 2.7112910046517247e-05, + "loss": 2.7039, + "step": 28623 + }, + { + "epoch": 2.5932821453647708, + "grad_norm": 1.0663992166519165, + "learning_rate": 2.710686884552649e-05, + "loss": 2.7501, + "step": 28624 + }, + { + "epoch": 2.5933727435392178, + "grad_norm": 0.9666402339935303, + "learning_rate": 2.7100827644535735e-05, + "loss": 2.3913, + "step": 28625 + }, + { + "epoch": 2.5934633417136643, + "grad_norm": 1.0758209228515625, + "learning_rate": 2.709478644354498e-05, + "loss": 2.6064, + "step": 28626 + }, + { + "epoch": 2.5935539398881113, + "grad_norm": 0.9920579791069031, + "learning_rate": 2.708874524255422e-05, + "loss": 2.8056, + "step": 28627 + }, + { + "epoch": 2.593644538062558, + "grad_norm": 1.0956233739852905, + "learning_rate": 2.708270404156346e-05, + "loss": 2.5086, + "step": 28628 + }, + { + "epoch": 2.593735136237005, + "grad_norm": 0.9456515908241272, + "learning_rate": 2.707666284057271e-05, + "loss": 2.7723, + "step": 28629 + }, + { + "epoch": 2.5938257344114515, + "grad_norm": 1.107696771621704, + "learning_rate": 2.707062163958195e-05, + "loss": 2.5332, + "step": 28630 + }, + { + "epoch": 2.5939163325858985, + "grad_norm": 0.8619837760925293, + "learning_rate": 2.706458043859119e-05, + "loss": 2.1745, + "step": 28631 + }, + { + "epoch": 2.594006930760345, + "grad_norm": 1.031760811805725, + "learning_rate": 2.7058539237600438e-05, + "loss": 2.4386, + "step": 28632 + }, + { + "epoch": 2.594097528934792, + "grad_norm": 1.1059931516647339, + "learning_rate": 2.705249803660968e-05, + "loss": 2.5601, + "step": 28633 + }, + { + "epoch": 2.5941881271092386, + "grad_norm": 0.9769114851951599, + "learning_rate": 2.7046456835618923e-05, + "loss": 2.6267, + "step": 28634 + }, + { + "epoch": 2.5942787252836856, + "grad_norm": 0.9835904836654663, + "learning_rate": 2.7040415634628164e-05, + "loss": 2.952, + "step": 28635 + }, + { + "epoch": 2.594369323458132, + "grad_norm": 1.0670452117919922, + "learning_rate": 2.703437443363741e-05, + "loss": 2.7013, + "step": 28636 + }, + { + "epoch": 2.594459921632579, + "grad_norm": 1.0597362518310547, + "learning_rate": 2.7028333232646652e-05, + "loss": 2.6453, + "step": 28637 + }, + { + "epoch": 2.594550519807026, + "grad_norm": 1.0297337770462036, + "learning_rate": 2.7022292031655893e-05, + "loss": 2.4789, + "step": 28638 + }, + { + "epoch": 2.594641117981473, + "grad_norm": 0.998053252696991, + "learning_rate": 2.7016250830665134e-05, + "loss": 2.5735, + "step": 28639 + }, + { + "epoch": 2.5947317161559194, + "grad_norm": 1.0247689485549927, + "learning_rate": 2.701020962967438e-05, + "loss": 2.4732, + "step": 28640 + }, + { + "epoch": 2.5948223143303664, + "grad_norm": 1.0225127935409546, + "learning_rate": 2.7004168428683622e-05, + "loss": 2.4649, + "step": 28641 + }, + { + "epoch": 2.594912912504813, + "grad_norm": 0.9711358547210693, + "learning_rate": 2.6998127227692867e-05, + "loss": 2.4237, + "step": 28642 + }, + { + "epoch": 2.59500351067926, + "grad_norm": 1.0263595581054688, + "learning_rate": 2.6992086026702108e-05, + "loss": 2.3051, + "step": 28643 + }, + { + "epoch": 2.5950941088537065, + "grad_norm": 1.009169340133667, + "learning_rate": 2.6986044825711355e-05, + "loss": 2.4317, + "step": 28644 + }, + { + "epoch": 2.5951847070281535, + "grad_norm": 0.9974600672721863, + "learning_rate": 2.6980003624720596e-05, + "loss": 2.54, + "step": 28645 + }, + { + "epoch": 2.5952753052026, + "grad_norm": 0.9961544275283813, + "learning_rate": 2.6973962423729837e-05, + "loss": 2.5074, + "step": 28646 + }, + { + "epoch": 2.595365903377047, + "grad_norm": 1.016213297843933, + "learning_rate": 2.6967921222739084e-05, + "loss": 2.5353, + "step": 28647 + }, + { + "epoch": 2.5954565015514937, + "grad_norm": 1.1201566457748413, + "learning_rate": 2.6961880021748325e-05, + "loss": 2.5544, + "step": 28648 + }, + { + "epoch": 2.5955470997259407, + "grad_norm": 1.0005443096160889, + "learning_rate": 2.6955838820757566e-05, + "loss": 2.8301, + "step": 28649 + }, + { + "epoch": 2.5956376979003872, + "grad_norm": 1.0255190134048462, + "learning_rate": 2.694979761976681e-05, + "loss": 2.7436, + "step": 28650 + }, + { + "epoch": 2.5957282960748342, + "grad_norm": 0.9868202805519104, + "learning_rate": 2.6943756418776054e-05, + "loss": 2.5229, + "step": 28651 + }, + { + "epoch": 2.595818894249281, + "grad_norm": 1.0032459497451782, + "learning_rate": 2.69377152177853e-05, + "loss": 2.7941, + "step": 28652 + }, + { + "epoch": 2.595909492423728, + "grad_norm": 0.9813753962516785, + "learning_rate": 2.693167401679454e-05, + "loss": 2.4901, + "step": 28653 + }, + { + "epoch": 2.5960000905981744, + "grad_norm": 1.0040680170059204, + "learning_rate": 2.692563281580378e-05, + "loss": 2.6772, + "step": 28654 + }, + { + "epoch": 2.5960906887726214, + "grad_norm": 0.8887565732002258, + "learning_rate": 2.6919591614813028e-05, + "loss": 1.9378, + "step": 28655 + }, + { + "epoch": 2.596181286947068, + "grad_norm": 1.0277154445648193, + "learning_rate": 2.691355041382227e-05, + "loss": 2.5928, + "step": 28656 + }, + { + "epoch": 2.596271885121515, + "grad_norm": 0.9783146977424622, + "learning_rate": 2.690750921283151e-05, + "loss": 2.3995, + "step": 28657 + }, + { + "epoch": 2.5963624832959615, + "grad_norm": 0.9726536273956299, + "learning_rate": 2.6901468011840754e-05, + "loss": 2.6314, + "step": 28658 + }, + { + "epoch": 2.5964530814704085, + "grad_norm": 1.013647198677063, + "learning_rate": 2.6895426810849998e-05, + "loss": 2.4578, + "step": 28659 + }, + { + "epoch": 2.596543679644855, + "grad_norm": 1.0139870643615723, + "learning_rate": 2.6889385609859242e-05, + "loss": 2.684, + "step": 28660 + }, + { + "epoch": 2.5966342778193017, + "grad_norm": 1.0213069915771484, + "learning_rate": 2.6883344408868483e-05, + "loss": 2.485, + "step": 28661 + }, + { + "epoch": 2.5967248759937487, + "grad_norm": 1.0082828998565674, + "learning_rate": 2.687730320787773e-05, + "loss": 2.8854, + "step": 28662 + }, + { + "epoch": 2.5968154741681957, + "grad_norm": 1.051259160041809, + "learning_rate": 2.687126200688697e-05, + "loss": 2.7949, + "step": 28663 + }, + { + "epoch": 2.5969060723426423, + "grad_norm": 0.9873660206794739, + "learning_rate": 2.6865220805896212e-05, + "loss": 2.5279, + "step": 28664 + }, + { + "epoch": 2.596996670517089, + "grad_norm": 0.9233992695808411, + "learning_rate": 2.6859179604905453e-05, + "loss": 2.5251, + "step": 28665 + }, + { + "epoch": 2.597087268691536, + "grad_norm": 1.055496335029602, + "learning_rate": 2.68531384039147e-05, + "loss": 2.9504, + "step": 28666 + }, + { + "epoch": 2.597177866865983, + "grad_norm": 1.01252281665802, + "learning_rate": 2.684709720292394e-05, + "loss": 2.5862, + "step": 28667 + }, + { + "epoch": 2.5972684650404294, + "grad_norm": 1.0053809881210327, + "learning_rate": 2.6841056001933186e-05, + "loss": 2.8013, + "step": 28668 + }, + { + "epoch": 2.597359063214876, + "grad_norm": 0.8819801807403564, + "learning_rate": 2.6835014800942427e-05, + "loss": 2.3234, + "step": 28669 + }, + { + "epoch": 2.597449661389323, + "grad_norm": 0.9731669425964355, + "learning_rate": 2.6828973599951674e-05, + "loss": 2.5707, + "step": 28670 + }, + { + "epoch": 2.59754025956377, + "grad_norm": 1.0161110162734985, + "learning_rate": 2.6822932398960915e-05, + "loss": 2.5879, + "step": 28671 + }, + { + "epoch": 2.5976308577382166, + "grad_norm": 0.8292002081871033, + "learning_rate": 2.6816891197970156e-05, + "loss": 1.9469, + "step": 28672 + }, + { + "epoch": 2.597721455912663, + "grad_norm": 1.0033129453659058, + "learning_rate": 2.6810849996979397e-05, + "loss": 2.7098, + "step": 28673 + }, + { + "epoch": 2.59781205408711, + "grad_norm": 1.0347447395324707, + "learning_rate": 2.6804808795988644e-05, + "loss": 2.9676, + "step": 28674 + }, + { + "epoch": 2.597902652261557, + "grad_norm": 1.0056979656219482, + "learning_rate": 2.6798767594997885e-05, + "loss": 2.4846, + "step": 28675 + }, + { + "epoch": 2.5979932504360037, + "grad_norm": 0.9854031205177307, + "learning_rate": 2.679272639400713e-05, + "loss": 2.4455, + "step": 28676 + }, + { + "epoch": 2.5980838486104503, + "grad_norm": 1.0960967540740967, + "learning_rate": 2.6786685193016377e-05, + "loss": 2.5381, + "step": 28677 + }, + { + "epoch": 2.5981744467848973, + "grad_norm": 0.9295430779457092, + "learning_rate": 2.6780643992025618e-05, + "loss": 2.4978, + "step": 28678 + }, + { + "epoch": 2.5982650449593443, + "grad_norm": 1.026086449623108, + "learning_rate": 2.677460279103486e-05, + "loss": 2.695, + "step": 28679 + }, + { + "epoch": 2.598355643133791, + "grad_norm": 1.0688766241073608, + "learning_rate": 2.67685615900441e-05, + "loss": 2.5164, + "step": 28680 + }, + { + "epoch": 2.5984462413082374, + "grad_norm": 1.1019020080566406, + "learning_rate": 2.6762520389053347e-05, + "loss": 2.2565, + "step": 28681 + }, + { + "epoch": 2.5985368394826844, + "grad_norm": 1.0191415548324585, + "learning_rate": 2.6756479188062588e-05, + "loss": 2.4489, + "step": 28682 + }, + { + "epoch": 2.5986274376571314, + "grad_norm": 0.994073748588562, + "learning_rate": 2.675043798707183e-05, + "loss": 2.6622, + "step": 28683 + }, + { + "epoch": 2.598718035831578, + "grad_norm": 1.1104518175125122, + "learning_rate": 2.6744396786081073e-05, + "loss": 2.4661, + "step": 28684 + }, + { + "epoch": 2.5988086340060246, + "grad_norm": 1.0154144763946533, + "learning_rate": 2.6738355585090317e-05, + "loss": 2.437, + "step": 28685 + }, + { + "epoch": 2.5988992321804716, + "grad_norm": 0.9936385750770569, + "learning_rate": 2.673231438409956e-05, + "loss": 2.3082, + "step": 28686 + }, + { + "epoch": 2.5989898303549186, + "grad_norm": 0.8745002150535583, + "learning_rate": 2.6726273183108802e-05, + "loss": 2.1967, + "step": 28687 + }, + { + "epoch": 2.599080428529365, + "grad_norm": 1.0207041501998901, + "learning_rate": 2.672023198211805e-05, + "loss": 2.5842, + "step": 28688 + }, + { + "epoch": 2.5991710267038117, + "grad_norm": 0.96004319190979, + "learning_rate": 2.671419078112729e-05, + "loss": 2.7426, + "step": 28689 + }, + { + "epoch": 2.5992616248782587, + "grad_norm": 1.0164967775344849, + "learning_rate": 2.670814958013653e-05, + "loss": 2.6989, + "step": 28690 + }, + { + "epoch": 2.5993522230527057, + "grad_norm": 1.0629414319992065, + "learning_rate": 2.6702108379145772e-05, + "loss": 2.6615, + "step": 28691 + }, + { + "epoch": 2.5994428212271523, + "grad_norm": 1.0217804908752441, + "learning_rate": 2.669606717815502e-05, + "loss": 2.3786, + "step": 28692 + }, + { + "epoch": 2.599533419401599, + "grad_norm": 1.0827096700668335, + "learning_rate": 2.669002597716426e-05, + "loss": 2.497, + "step": 28693 + }, + { + "epoch": 2.599624017576046, + "grad_norm": 0.9928130507469177, + "learning_rate": 2.6683984776173505e-05, + "loss": 2.6559, + "step": 28694 + }, + { + "epoch": 2.599714615750493, + "grad_norm": 0.8429438471794128, + "learning_rate": 2.6677943575182746e-05, + "loss": 1.8448, + "step": 28695 + }, + { + "epoch": 2.5998052139249395, + "grad_norm": 0.9640179872512817, + "learning_rate": 2.6671902374191993e-05, + "loss": 2.6819, + "step": 28696 + }, + { + "epoch": 2.599895812099386, + "grad_norm": 1.0039108991622925, + "learning_rate": 2.6665861173201234e-05, + "loss": 2.4678, + "step": 28697 + }, + { + "epoch": 2.599986410273833, + "grad_norm": 1.0464253425598145, + "learning_rate": 2.6659819972210475e-05, + "loss": 2.542, + "step": 28698 + }, + { + "epoch": 2.6000770084482796, + "grad_norm": 0.8976888656616211, + "learning_rate": 2.6653778771219716e-05, + "loss": 1.8378, + "step": 28699 + }, + { + "epoch": 2.6001676066227266, + "grad_norm": 1.0692930221557617, + "learning_rate": 2.6647737570228964e-05, + "loss": 2.6912, + "step": 28700 + }, + { + "epoch": 2.600258204797173, + "grad_norm": 1.0292410850524902, + "learning_rate": 2.6641696369238204e-05, + "loss": 2.5124, + "step": 28701 + }, + { + "epoch": 2.60034880297162, + "grad_norm": 0.989410936832428, + "learning_rate": 2.663565516824745e-05, + "loss": 2.713, + "step": 28702 + }, + { + "epoch": 2.6004394011460668, + "grad_norm": 1.0550925731658936, + "learning_rate": 2.6629613967256696e-05, + "loss": 1.923, + "step": 28703 + }, + { + "epoch": 2.6005299993205138, + "grad_norm": 0.7940321564674377, + "learning_rate": 2.6623572766265937e-05, + "loss": 1.9021, + "step": 28704 + }, + { + "epoch": 2.6006205974949603, + "grad_norm": 0.9407736659049988, + "learning_rate": 2.6617531565275178e-05, + "loss": 2.0345, + "step": 28705 + }, + { + "epoch": 2.6007111956694073, + "grad_norm": 0.975691556930542, + "learning_rate": 2.661149036428442e-05, + "loss": 2.3309, + "step": 28706 + }, + { + "epoch": 2.600801793843854, + "grad_norm": 0.9816084504127502, + "learning_rate": 2.6605449163293666e-05, + "loss": 2.2925, + "step": 28707 + }, + { + "epoch": 2.600892392018301, + "grad_norm": 0.9339619278907776, + "learning_rate": 2.6599407962302907e-05, + "loss": 2.6606, + "step": 28708 + }, + { + "epoch": 2.6009829901927475, + "grad_norm": 1.0455305576324463, + "learning_rate": 2.6593366761312148e-05, + "loss": 2.7607, + "step": 28709 + }, + { + "epoch": 2.6010735883671945, + "grad_norm": 1.0659452676773071, + "learning_rate": 2.6587325560321392e-05, + "loss": 2.9771, + "step": 28710 + }, + { + "epoch": 2.601164186541641, + "grad_norm": 1.037669062614441, + "learning_rate": 2.658128435933064e-05, + "loss": 2.6402, + "step": 28711 + }, + { + "epoch": 2.601254784716088, + "grad_norm": 0.9544963240623474, + "learning_rate": 2.657524315833988e-05, + "loss": 2.5122, + "step": 28712 + }, + { + "epoch": 2.6013453828905346, + "grad_norm": 0.9823341369628906, + "learning_rate": 2.656920195734912e-05, + "loss": 2.5103, + "step": 28713 + }, + { + "epoch": 2.6014359810649816, + "grad_norm": 0.9496473073959351, + "learning_rate": 2.6563160756358362e-05, + "loss": 2.4792, + "step": 28714 + }, + { + "epoch": 2.601526579239428, + "grad_norm": 1.0249592065811157, + "learning_rate": 2.655711955536761e-05, + "loss": 2.6101, + "step": 28715 + }, + { + "epoch": 2.601617177413875, + "grad_norm": 1.0154227018356323, + "learning_rate": 2.655107835437685e-05, + "loss": 2.8853, + "step": 28716 + }, + { + "epoch": 2.601707775588322, + "grad_norm": 0.9823999404907227, + "learning_rate": 2.654503715338609e-05, + "loss": 2.3464, + "step": 28717 + }, + { + "epoch": 2.601798373762769, + "grad_norm": 1.0134456157684326, + "learning_rate": 2.653899595239534e-05, + "loss": 2.6016, + "step": 28718 + }, + { + "epoch": 2.6018889719372154, + "grad_norm": 1.0659911632537842, + "learning_rate": 2.6532954751404583e-05, + "loss": 2.6262, + "step": 28719 + }, + { + "epoch": 2.6019795701116624, + "grad_norm": 1.0238784551620483, + "learning_rate": 2.6526913550413824e-05, + "loss": 2.7814, + "step": 28720 + }, + { + "epoch": 2.602070168286109, + "grad_norm": 1.0675419569015503, + "learning_rate": 2.6520872349423065e-05, + "loss": 2.6238, + "step": 28721 + }, + { + "epoch": 2.602160766460556, + "grad_norm": 0.9951993227005005, + "learning_rate": 2.6514831148432313e-05, + "loss": 2.6172, + "step": 28722 + }, + { + "epoch": 2.6022513646350025, + "grad_norm": 1.0861268043518066, + "learning_rate": 2.6508789947441553e-05, + "loss": 2.6179, + "step": 28723 + }, + { + "epoch": 2.6023419628094495, + "grad_norm": 0.8950884938240051, + "learning_rate": 2.6502748746450794e-05, + "loss": 2.035, + "step": 28724 + }, + { + "epoch": 2.602432560983896, + "grad_norm": 1.059024453163147, + "learning_rate": 2.6496707545460035e-05, + "loss": 2.7495, + "step": 28725 + }, + { + "epoch": 2.602523159158343, + "grad_norm": 0.956468403339386, + "learning_rate": 2.6490666344469283e-05, + "loss": 2.4322, + "step": 28726 + }, + { + "epoch": 2.6026137573327897, + "grad_norm": 0.9791015386581421, + "learning_rate": 2.6484625143478527e-05, + "loss": 2.7205, + "step": 28727 + }, + { + "epoch": 2.6027043555072367, + "grad_norm": 1.1188774108886719, + "learning_rate": 2.6478583942487768e-05, + "loss": 2.4967, + "step": 28728 + }, + { + "epoch": 2.6027949536816832, + "grad_norm": 1.073055386543274, + "learning_rate": 2.647254274149701e-05, + "loss": 2.6424, + "step": 28729 + }, + { + "epoch": 2.6028855518561302, + "grad_norm": 1.117303729057312, + "learning_rate": 2.6466501540506256e-05, + "loss": 2.6446, + "step": 28730 + }, + { + "epoch": 2.602976150030577, + "grad_norm": 1.1385046243667603, + "learning_rate": 2.6460460339515497e-05, + "loss": 2.6555, + "step": 28731 + }, + { + "epoch": 2.603066748205024, + "grad_norm": 1.017391562461853, + "learning_rate": 2.6454419138524738e-05, + "loss": 2.4886, + "step": 28732 + }, + { + "epoch": 2.6031573463794704, + "grad_norm": 0.9809550642967224, + "learning_rate": 2.6448377937533985e-05, + "loss": 2.556, + "step": 28733 + }, + { + "epoch": 2.6032479445539174, + "grad_norm": 0.9619052410125732, + "learning_rate": 2.6442336736543226e-05, + "loss": 2.5416, + "step": 28734 + }, + { + "epoch": 2.603338542728364, + "grad_norm": 0.9569469690322876, + "learning_rate": 2.643629553555247e-05, + "loss": 2.8786, + "step": 28735 + }, + { + "epoch": 2.603429140902811, + "grad_norm": 1.0258773565292358, + "learning_rate": 2.643025433456171e-05, + "loss": 2.5901, + "step": 28736 + }, + { + "epoch": 2.6035197390772575, + "grad_norm": 1.0600025653839111, + "learning_rate": 2.642421313357096e-05, + "loss": 2.7271, + "step": 28737 + }, + { + "epoch": 2.6036103372517045, + "grad_norm": 1.1697158813476562, + "learning_rate": 2.64181719325802e-05, + "loss": 2.4977, + "step": 28738 + }, + { + "epoch": 2.603700935426151, + "grad_norm": 0.9300900101661682, + "learning_rate": 2.641213073158944e-05, + "loss": 2.398, + "step": 28739 + }, + { + "epoch": 2.6037915336005977, + "grad_norm": 1.2132799625396729, + "learning_rate": 2.640608953059868e-05, + "loss": 2.287, + "step": 28740 + }, + { + "epoch": 2.6038821317750447, + "grad_norm": 1.1048500537872314, + "learning_rate": 2.640004832960793e-05, + "loss": 2.5311, + "step": 28741 + }, + { + "epoch": 2.6039727299494917, + "grad_norm": 1.0525429248809814, + "learning_rate": 2.639400712861717e-05, + "loss": 2.7657, + "step": 28742 + }, + { + "epoch": 2.6040633281239383, + "grad_norm": 1.073649525642395, + "learning_rate": 2.638796592762641e-05, + "loss": 2.6589, + "step": 28743 + }, + { + "epoch": 2.604153926298385, + "grad_norm": 1.128576636314392, + "learning_rate": 2.6381924726635655e-05, + "loss": 2.5482, + "step": 28744 + }, + { + "epoch": 2.604244524472832, + "grad_norm": 1.0314632654190063, + "learning_rate": 2.6375883525644903e-05, + "loss": 2.6212, + "step": 28745 + }, + { + "epoch": 2.604335122647279, + "grad_norm": 1.1372603178024292, + "learning_rate": 2.6369842324654143e-05, + "loss": 2.6459, + "step": 28746 + }, + { + "epoch": 2.6044257208217254, + "grad_norm": 1.04656982421875, + "learning_rate": 2.6363801123663384e-05, + "loss": 2.67, + "step": 28747 + }, + { + "epoch": 2.604516318996172, + "grad_norm": 1.0535850524902344, + "learning_rate": 2.6357759922672632e-05, + "loss": 2.4591, + "step": 28748 + }, + { + "epoch": 2.604606917170619, + "grad_norm": 1.1759188175201416, + "learning_rate": 2.6351718721681873e-05, + "loss": 2.5201, + "step": 28749 + }, + { + "epoch": 2.604697515345066, + "grad_norm": 0.9333315491676331, + "learning_rate": 2.6345677520691113e-05, + "loss": 2.4922, + "step": 28750 + }, + { + "epoch": 2.6047881135195126, + "grad_norm": 1.1067659854888916, + "learning_rate": 2.6339636319700354e-05, + "loss": 2.6815, + "step": 28751 + }, + { + "epoch": 2.604878711693959, + "grad_norm": 1.095880389213562, + "learning_rate": 2.6333595118709602e-05, + "loss": 2.6563, + "step": 28752 + }, + { + "epoch": 2.604969309868406, + "grad_norm": 1.1148656606674194, + "learning_rate": 2.6327553917718846e-05, + "loss": 2.5763, + "step": 28753 + }, + { + "epoch": 2.605059908042853, + "grad_norm": 0.9860131740570068, + "learning_rate": 2.6321512716728087e-05, + "loss": 2.4757, + "step": 28754 + }, + { + "epoch": 2.6051505062172997, + "grad_norm": 0.8410740494728088, + "learning_rate": 2.6315471515737328e-05, + "loss": 2.1132, + "step": 28755 + }, + { + "epoch": 2.6052411043917463, + "grad_norm": 0.8657092452049255, + "learning_rate": 2.6309430314746575e-05, + "loss": 1.8333, + "step": 28756 + }, + { + "epoch": 2.6053317025661933, + "grad_norm": 1.0130383968353271, + "learning_rate": 2.6303389113755816e-05, + "loss": 2.536, + "step": 28757 + }, + { + "epoch": 2.6054223007406403, + "grad_norm": 0.9708025455474854, + "learning_rate": 2.6297347912765057e-05, + "loss": 1.9267, + "step": 28758 + }, + { + "epoch": 2.605512898915087, + "grad_norm": 1.0077314376831055, + "learning_rate": 2.6291306711774298e-05, + "loss": 2.5676, + "step": 28759 + }, + { + "epoch": 2.6056034970895334, + "grad_norm": 0.8770456314086914, + "learning_rate": 2.6285265510783545e-05, + "loss": 1.9394, + "step": 28760 + }, + { + "epoch": 2.6056940952639804, + "grad_norm": 1.0194517374038696, + "learning_rate": 2.627922430979279e-05, + "loss": 2.6535, + "step": 28761 + }, + { + "epoch": 2.6057846934384274, + "grad_norm": 0.9703195095062256, + "learning_rate": 2.627318310880203e-05, + "loss": 2.4767, + "step": 28762 + }, + { + "epoch": 2.605875291612874, + "grad_norm": 0.9639017581939697, + "learning_rate": 2.6267141907811278e-05, + "loss": 2.5493, + "step": 28763 + }, + { + "epoch": 2.6059658897873206, + "grad_norm": 0.9704294800758362, + "learning_rate": 2.626110070682052e-05, + "loss": 2.6782, + "step": 28764 + }, + { + "epoch": 2.6060564879617676, + "grad_norm": 1.0472162961959839, + "learning_rate": 2.625505950582976e-05, + "loss": 2.682, + "step": 28765 + }, + { + "epoch": 2.6061470861362146, + "grad_norm": 1.0679553747177124, + "learning_rate": 2.6249018304839e-05, + "loss": 2.6232, + "step": 28766 + }, + { + "epoch": 2.606237684310661, + "grad_norm": 0.9812053442001343, + "learning_rate": 2.6242977103848248e-05, + "loss": 2.429, + "step": 28767 + }, + { + "epoch": 2.6063282824851077, + "grad_norm": 1.0405161380767822, + "learning_rate": 2.623693590285749e-05, + "loss": 2.6744, + "step": 28768 + }, + { + "epoch": 2.6064188806595547, + "grad_norm": 0.9485151171684265, + "learning_rate": 2.6230894701866733e-05, + "loss": 2.5808, + "step": 28769 + }, + { + "epoch": 2.6065094788340017, + "grad_norm": 0.9605273008346558, + "learning_rate": 2.6224853500875974e-05, + "loss": 2.2892, + "step": 28770 + }, + { + "epoch": 2.6066000770084483, + "grad_norm": 0.9731546640396118, + "learning_rate": 2.621881229988522e-05, + "loss": 2.6171, + "step": 28771 + }, + { + "epoch": 2.606690675182895, + "grad_norm": 0.8094351887702942, + "learning_rate": 2.6212771098894463e-05, + "loss": 1.9746, + "step": 28772 + }, + { + "epoch": 2.606781273357342, + "grad_norm": 0.9875834584236145, + "learning_rate": 2.6206729897903703e-05, + "loss": 2.4576, + "step": 28773 + }, + { + "epoch": 2.606871871531789, + "grad_norm": 0.8475181460380554, + "learning_rate": 2.6200688696912944e-05, + "loss": 2.0572, + "step": 28774 + }, + { + "epoch": 2.6069624697062355, + "grad_norm": 0.9655445218086243, + "learning_rate": 2.6194647495922192e-05, + "loss": 2.5734, + "step": 28775 + }, + { + "epoch": 2.607053067880682, + "grad_norm": 1.0829799175262451, + "learning_rate": 2.6188606294931433e-05, + "loss": 2.765, + "step": 28776 + }, + { + "epoch": 2.607143666055129, + "grad_norm": 0.9789988398551941, + "learning_rate": 2.6182565093940677e-05, + "loss": 2.419, + "step": 28777 + }, + { + "epoch": 2.607234264229576, + "grad_norm": 1.0028082132339478, + "learning_rate": 2.617652389294992e-05, + "loss": 2.5254, + "step": 28778 + }, + { + "epoch": 2.6073248624040226, + "grad_norm": 1.0234721899032593, + "learning_rate": 2.6170482691959165e-05, + "loss": 2.5421, + "step": 28779 + }, + { + "epoch": 2.607415460578469, + "grad_norm": 1.1811484098434448, + "learning_rate": 2.6164441490968406e-05, + "loss": 2.5298, + "step": 28780 + }, + { + "epoch": 2.607506058752916, + "grad_norm": 1.0079553127288818, + "learning_rate": 2.6158400289977647e-05, + "loss": 2.6487, + "step": 28781 + }, + { + "epoch": 2.6075966569273628, + "grad_norm": 0.9007257223129272, + "learning_rate": 2.6152359088986895e-05, + "loss": 2.3924, + "step": 28782 + }, + { + "epoch": 2.6076872551018098, + "grad_norm": 1.167409062385559, + "learning_rate": 2.6146317887996135e-05, + "loss": 2.6715, + "step": 28783 + }, + { + "epoch": 2.6077778532762563, + "grad_norm": 0.9831781983375549, + "learning_rate": 2.6140276687005376e-05, + "loss": 2.6553, + "step": 28784 + }, + { + "epoch": 2.6078684514507033, + "grad_norm": 0.9990386366844177, + "learning_rate": 2.613423548601462e-05, + "loss": 2.5921, + "step": 28785 + }, + { + "epoch": 2.60795904962515, + "grad_norm": 0.9603080749511719, + "learning_rate": 2.6128194285023865e-05, + "loss": 2.649, + "step": 28786 + }, + { + "epoch": 2.608049647799597, + "grad_norm": 0.9879866242408752, + "learning_rate": 2.612215308403311e-05, + "loss": 2.5776, + "step": 28787 + }, + { + "epoch": 2.6081402459740435, + "grad_norm": 0.9745803475379944, + "learning_rate": 2.611611188304235e-05, + "loss": 2.5381, + "step": 28788 + }, + { + "epoch": 2.6082308441484905, + "grad_norm": 1.050648808479309, + "learning_rate": 2.611007068205159e-05, + "loss": 2.7311, + "step": 28789 + }, + { + "epoch": 2.608321442322937, + "grad_norm": 1.0316784381866455, + "learning_rate": 2.6104029481060838e-05, + "loss": 2.6191, + "step": 28790 + }, + { + "epoch": 2.608412040497384, + "grad_norm": 1.0598220825195312, + "learning_rate": 2.609798828007008e-05, + "loss": 2.5543, + "step": 28791 + }, + { + "epoch": 2.6085026386718306, + "grad_norm": 1.1065952777862549, + "learning_rate": 2.609194707907932e-05, + "loss": 2.6693, + "step": 28792 + }, + { + "epoch": 2.6085932368462776, + "grad_norm": 1.0245791673660278, + "learning_rate": 2.6085905878088567e-05, + "loss": 2.5993, + "step": 28793 + }, + { + "epoch": 2.608683835020724, + "grad_norm": 1.0114161968231201, + "learning_rate": 2.6079864677097808e-05, + "loss": 2.4686, + "step": 28794 + }, + { + "epoch": 2.608774433195171, + "grad_norm": 0.9191794395446777, + "learning_rate": 2.6073823476107052e-05, + "loss": 2.0366, + "step": 28795 + }, + { + "epoch": 2.608865031369618, + "grad_norm": 0.9538415670394897, + "learning_rate": 2.6067782275116293e-05, + "loss": 2.7751, + "step": 28796 + }, + { + "epoch": 2.608955629544065, + "grad_norm": 0.9668097496032715, + "learning_rate": 2.606174107412554e-05, + "loss": 2.8054, + "step": 28797 + }, + { + "epoch": 2.6090462277185114, + "grad_norm": 0.9987263083457947, + "learning_rate": 2.605569987313478e-05, + "loss": 2.5994, + "step": 28798 + }, + { + "epoch": 2.6091368258929584, + "grad_norm": 0.9191100597381592, + "learning_rate": 2.6049658672144023e-05, + "loss": 2.2241, + "step": 28799 + }, + { + "epoch": 2.609227424067405, + "grad_norm": 1.0013939142227173, + "learning_rate": 2.6043617471153263e-05, + "loss": 2.4998, + "step": 28800 + }, + { + "epoch": 2.609318022241852, + "grad_norm": 0.966893196105957, + "learning_rate": 2.603757627016251e-05, + "loss": 2.4318, + "step": 28801 + }, + { + "epoch": 2.6094086204162985, + "grad_norm": 1.1083208322525024, + "learning_rate": 2.6031535069171752e-05, + "loss": 2.4745, + "step": 28802 + }, + { + "epoch": 2.6094992185907455, + "grad_norm": 1.1036256551742554, + "learning_rate": 2.6025493868180996e-05, + "loss": 2.6079, + "step": 28803 + }, + { + "epoch": 2.609589816765192, + "grad_norm": 0.7003099322319031, + "learning_rate": 2.6019452667190237e-05, + "loss": 1.3643, + "step": 28804 + }, + { + "epoch": 2.609680414939639, + "grad_norm": 0.8582046031951904, + "learning_rate": 2.6013411466199484e-05, + "loss": 2.1346, + "step": 28805 + }, + { + "epoch": 2.6097710131140857, + "grad_norm": 0.9772330522537231, + "learning_rate": 2.6007370265208725e-05, + "loss": 2.9388, + "step": 28806 + }, + { + "epoch": 2.6098616112885327, + "grad_norm": 0.9263828992843628, + "learning_rate": 2.6001329064217966e-05, + "loss": 2.2368, + "step": 28807 + }, + { + "epoch": 2.6099522094629792, + "grad_norm": 1.0486308336257935, + "learning_rate": 2.5995287863227214e-05, + "loss": 2.6279, + "step": 28808 + }, + { + "epoch": 2.6100428076374262, + "grad_norm": 1.0038644075393677, + "learning_rate": 2.5989246662236455e-05, + "loss": 2.5788, + "step": 28809 + }, + { + "epoch": 2.610133405811873, + "grad_norm": 0.8689230680465698, + "learning_rate": 2.5983205461245695e-05, + "loss": 2.0649, + "step": 28810 + }, + { + "epoch": 2.61022400398632, + "grad_norm": 0.9876354932785034, + "learning_rate": 2.597716426025494e-05, + "loss": 2.4187, + "step": 28811 + }, + { + "epoch": 2.6103146021607664, + "grad_norm": 1.0385544300079346, + "learning_rate": 2.5971123059264184e-05, + "loss": 2.4516, + "step": 28812 + }, + { + "epoch": 2.6104052003352134, + "grad_norm": 0.962929904460907, + "learning_rate": 2.5965081858273428e-05, + "loss": 2.566, + "step": 28813 + }, + { + "epoch": 2.61049579850966, + "grad_norm": 1.0542962551116943, + "learning_rate": 2.595904065728267e-05, + "loss": 2.4797, + "step": 28814 + }, + { + "epoch": 2.610586396684107, + "grad_norm": 1.0220714807510376, + "learning_rate": 2.595299945629191e-05, + "loss": 2.8274, + "step": 28815 + }, + { + "epoch": 2.6106769948585535, + "grad_norm": 1.0324578285217285, + "learning_rate": 2.5946958255301157e-05, + "loss": 2.4923, + "step": 28816 + }, + { + "epoch": 2.6107675930330005, + "grad_norm": 0.930962085723877, + "learning_rate": 2.5940917054310398e-05, + "loss": 2.6429, + "step": 28817 + }, + { + "epoch": 2.610858191207447, + "grad_norm": 0.9681732654571533, + "learning_rate": 2.593487585331964e-05, + "loss": 2.5518, + "step": 28818 + }, + { + "epoch": 2.610948789381894, + "grad_norm": 0.9858750700950623, + "learning_rate": 2.5928834652328883e-05, + "loss": 2.7388, + "step": 28819 + }, + { + "epoch": 2.6110393875563407, + "grad_norm": 1.1034586429595947, + "learning_rate": 2.5922793451338127e-05, + "loss": 2.5624, + "step": 28820 + }, + { + "epoch": 2.6111299857307877, + "grad_norm": 1.13870108127594, + "learning_rate": 2.591675225034737e-05, + "loss": 2.5019, + "step": 28821 + }, + { + "epoch": 2.6112205839052343, + "grad_norm": 1.0663832426071167, + "learning_rate": 2.5910711049356612e-05, + "loss": 2.7189, + "step": 28822 + }, + { + "epoch": 2.611311182079681, + "grad_norm": 0.8258037567138672, + "learning_rate": 2.590466984836586e-05, + "loss": 1.6576, + "step": 28823 + }, + { + "epoch": 2.611401780254128, + "grad_norm": 0.856999933719635, + "learning_rate": 2.58986286473751e-05, + "loss": 1.8611, + "step": 28824 + }, + { + "epoch": 2.611492378428575, + "grad_norm": 1.0090053081512451, + "learning_rate": 2.589258744638434e-05, + "loss": 2.3701, + "step": 28825 + }, + { + "epoch": 2.6115829766030214, + "grad_norm": 0.9895356297492981, + "learning_rate": 2.5886546245393583e-05, + "loss": 2.5801, + "step": 28826 + }, + { + "epoch": 2.611673574777468, + "grad_norm": 1.06200110912323, + "learning_rate": 2.588050504440283e-05, + "loss": 2.617, + "step": 28827 + }, + { + "epoch": 2.611764172951915, + "grad_norm": 1.028821587562561, + "learning_rate": 2.587446384341207e-05, + "loss": 2.402, + "step": 28828 + }, + { + "epoch": 2.611854771126362, + "grad_norm": 0.9497833251953125, + "learning_rate": 2.5868422642421315e-05, + "loss": 2.6968, + "step": 28829 + }, + { + "epoch": 2.6119453693008086, + "grad_norm": 1.0602648258209229, + "learning_rate": 2.5862381441430556e-05, + "loss": 2.7533, + "step": 28830 + }, + { + "epoch": 2.612035967475255, + "grad_norm": 0.9343637228012085, + "learning_rate": 2.5856340240439804e-05, + "loss": 2.5396, + "step": 28831 + }, + { + "epoch": 2.612126565649702, + "grad_norm": 0.835864782333374, + "learning_rate": 2.5850299039449044e-05, + "loss": 1.9309, + "step": 28832 + }, + { + "epoch": 2.612217163824149, + "grad_norm": 1.0056720972061157, + "learning_rate": 2.5844257838458285e-05, + "loss": 2.6038, + "step": 28833 + }, + { + "epoch": 2.6123077619985957, + "grad_norm": 1.0223182439804077, + "learning_rate": 2.5838216637467526e-05, + "loss": 2.6791, + "step": 28834 + }, + { + "epoch": 2.6123983601730423, + "grad_norm": 0.98411625623703, + "learning_rate": 2.5832175436476774e-05, + "loss": 2.5795, + "step": 28835 + }, + { + "epoch": 2.6124889583474893, + "grad_norm": 1.1714482307434082, + "learning_rate": 2.5826134235486015e-05, + "loss": 2.7265, + "step": 28836 + }, + { + "epoch": 2.6125795565219363, + "grad_norm": 0.9565488696098328, + "learning_rate": 2.582009303449526e-05, + "loss": 2.6111, + "step": 28837 + }, + { + "epoch": 2.612670154696383, + "grad_norm": 0.9048640727996826, + "learning_rate": 2.5814051833504503e-05, + "loss": 1.8959, + "step": 28838 + }, + { + "epoch": 2.6127607528708294, + "grad_norm": 0.9450054168701172, + "learning_rate": 2.5808010632513747e-05, + "loss": 2.4748, + "step": 28839 + }, + { + "epoch": 2.6128513510452764, + "grad_norm": 1.0048199892044067, + "learning_rate": 2.5801969431522988e-05, + "loss": 2.7881, + "step": 28840 + }, + { + "epoch": 2.6129419492197234, + "grad_norm": 1.0566517114639282, + "learning_rate": 2.579592823053223e-05, + "loss": 2.7188, + "step": 28841 + }, + { + "epoch": 2.61303254739417, + "grad_norm": 1.08887779712677, + "learning_rate": 2.5789887029541476e-05, + "loss": 2.3789, + "step": 28842 + }, + { + "epoch": 2.6131231455686166, + "grad_norm": 0.8963090181350708, + "learning_rate": 2.5783845828550717e-05, + "loss": 2.108, + "step": 28843 + }, + { + "epoch": 2.6132137437430636, + "grad_norm": 0.9781869053840637, + "learning_rate": 2.5777804627559958e-05, + "loss": 2.5374, + "step": 28844 + }, + { + "epoch": 2.6133043419175106, + "grad_norm": 0.9281021356582642, + "learning_rate": 2.5771763426569202e-05, + "loss": 2.4549, + "step": 28845 + }, + { + "epoch": 2.613394940091957, + "grad_norm": 1.0223808288574219, + "learning_rate": 2.5765722225578447e-05, + "loss": 2.6069, + "step": 28846 + }, + { + "epoch": 2.6134855382664037, + "grad_norm": 1.1130551099777222, + "learning_rate": 2.575968102458769e-05, + "loss": 2.4501, + "step": 28847 + }, + { + "epoch": 2.6135761364408507, + "grad_norm": 0.8567325472831726, + "learning_rate": 2.575363982359693e-05, + "loss": 1.6698, + "step": 28848 + }, + { + "epoch": 2.6136667346152977, + "grad_norm": 0.9281505346298218, + "learning_rate": 2.5747598622606172e-05, + "loss": 2.2605, + "step": 28849 + }, + { + "epoch": 2.6137573327897443, + "grad_norm": 0.9486412405967712, + "learning_rate": 2.574155742161542e-05, + "loss": 1.982, + "step": 28850 + }, + { + "epoch": 2.613847930964191, + "grad_norm": 1.0054715871810913, + "learning_rate": 2.573551622062466e-05, + "loss": 2.6547, + "step": 28851 + }, + { + "epoch": 2.613938529138638, + "grad_norm": 1.0286738872528076, + "learning_rate": 2.57294750196339e-05, + "loss": 2.6434, + "step": 28852 + }, + { + "epoch": 2.614029127313085, + "grad_norm": 1.1096450090408325, + "learning_rate": 2.572343381864315e-05, + "loss": 2.674, + "step": 28853 + }, + { + "epoch": 2.6141197254875315, + "grad_norm": 1.1624637842178345, + "learning_rate": 2.571739261765239e-05, + "loss": 2.5637, + "step": 28854 + }, + { + "epoch": 2.614210323661978, + "grad_norm": 1.0971366167068481, + "learning_rate": 2.5711351416661634e-05, + "loss": 2.4632, + "step": 28855 + }, + { + "epoch": 2.614300921836425, + "grad_norm": 0.9729316234588623, + "learning_rate": 2.5705310215670875e-05, + "loss": 2.8252, + "step": 28856 + }, + { + "epoch": 2.614391520010872, + "grad_norm": 0.9615283608436584, + "learning_rate": 2.5699269014680123e-05, + "loss": 2.4961, + "step": 28857 + }, + { + "epoch": 2.6144821181853186, + "grad_norm": 0.9973322153091431, + "learning_rate": 2.5693227813689364e-05, + "loss": 2.6295, + "step": 28858 + }, + { + "epoch": 2.614572716359765, + "grad_norm": 1.004199743270874, + "learning_rate": 2.5687186612698604e-05, + "loss": 2.68, + "step": 28859 + }, + { + "epoch": 2.614663314534212, + "grad_norm": 1.0307753086090088, + "learning_rate": 2.5681145411707845e-05, + "loss": 2.6848, + "step": 28860 + }, + { + "epoch": 2.6147539127086588, + "grad_norm": 1.0978872776031494, + "learning_rate": 2.5675104210717093e-05, + "loss": 2.7923, + "step": 28861 + }, + { + "epoch": 2.6148445108831058, + "grad_norm": 1.0459861755371094, + "learning_rate": 2.5669063009726334e-05, + "loss": 2.7206, + "step": 28862 + }, + { + "epoch": 2.6149351090575523, + "grad_norm": 0.9645904302597046, + "learning_rate": 2.5663021808735578e-05, + "loss": 1.9255, + "step": 28863 + }, + { + "epoch": 2.6150257072319993, + "grad_norm": 1.0114537477493286, + "learning_rate": 2.565698060774482e-05, + "loss": 2.4918, + "step": 28864 + }, + { + "epoch": 2.615116305406446, + "grad_norm": 0.9919339418411255, + "learning_rate": 2.5650939406754066e-05, + "loss": 2.6309, + "step": 28865 + }, + { + "epoch": 2.615206903580893, + "grad_norm": 1.0192902088165283, + "learning_rate": 2.5644898205763307e-05, + "loss": 2.5075, + "step": 28866 + }, + { + "epoch": 2.6152975017553395, + "grad_norm": 0.9863771796226501, + "learning_rate": 2.5638857004772548e-05, + "loss": 2.7016, + "step": 28867 + }, + { + "epoch": 2.6153880999297865, + "grad_norm": 1.031923532485962, + "learning_rate": 2.5632815803781796e-05, + "loss": 2.8728, + "step": 28868 + }, + { + "epoch": 2.615478698104233, + "grad_norm": 0.9979265928268433, + "learning_rate": 2.5626774602791036e-05, + "loss": 2.5289, + "step": 28869 + }, + { + "epoch": 2.61556929627868, + "grad_norm": 1.0384007692337036, + "learning_rate": 2.5620733401800277e-05, + "loss": 2.6315, + "step": 28870 + }, + { + "epoch": 2.6156598944531266, + "grad_norm": 0.9868913292884827, + "learning_rate": 2.561469220080952e-05, + "loss": 2.7336, + "step": 28871 + }, + { + "epoch": 2.6157504926275736, + "grad_norm": 1.0834778547286987, + "learning_rate": 2.5608650999818766e-05, + "loss": 2.9884, + "step": 28872 + }, + { + "epoch": 2.61584109080202, + "grad_norm": 0.9010059237480164, + "learning_rate": 2.560260979882801e-05, + "loss": 1.8918, + "step": 28873 + }, + { + "epoch": 2.615931688976467, + "grad_norm": 1.0039212703704834, + "learning_rate": 2.559656859783725e-05, + "loss": 2.7999, + "step": 28874 + }, + { + "epoch": 2.616022287150914, + "grad_norm": 1.0548691749572754, + "learning_rate": 2.559052739684649e-05, + "loss": 2.6925, + "step": 28875 + }, + { + "epoch": 2.616112885325361, + "grad_norm": 1.0343170166015625, + "learning_rate": 2.558448619585574e-05, + "loss": 1.7979, + "step": 28876 + }, + { + "epoch": 2.6162034834998074, + "grad_norm": 1.0267574787139893, + "learning_rate": 2.557844499486498e-05, + "loss": 2.4965, + "step": 28877 + }, + { + "epoch": 2.6162940816742544, + "grad_norm": 0.9642609357833862, + "learning_rate": 2.557240379387422e-05, + "loss": 2.6906, + "step": 28878 + }, + { + "epoch": 2.616384679848701, + "grad_norm": 1.031558632850647, + "learning_rate": 2.5566362592883465e-05, + "loss": 2.4359, + "step": 28879 + }, + { + "epoch": 2.616475278023148, + "grad_norm": 1.037596583366394, + "learning_rate": 2.556032139189271e-05, + "loss": 2.486, + "step": 28880 + }, + { + "epoch": 2.6165658761975945, + "grad_norm": 1.0696988105773926, + "learning_rate": 2.5554280190901953e-05, + "loss": 2.9493, + "step": 28881 + }, + { + "epoch": 2.6166564743720415, + "grad_norm": 1.0007909536361694, + "learning_rate": 2.5548238989911194e-05, + "loss": 2.8196, + "step": 28882 + }, + { + "epoch": 2.616747072546488, + "grad_norm": 1.0303955078125, + "learning_rate": 2.5542197788920442e-05, + "loss": 2.4745, + "step": 28883 + }, + { + "epoch": 2.616837670720935, + "grad_norm": 0.9936689138412476, + "learning_rate": 2.5536156587929683e-05, + "loss": 2.75, + "step": 28884 + }, + { + "epoch": 2.6169282688953817, + "grad_norm": 1.0714036226272583, + "learning_rate": 2.5530115386938924e-05, + "loss": 2.548, + "step": 28885 + }, + { + "epoch": 2.6170188670698287, + "grad_norm": 0.9595810174942017, + "learning_rate": 2.5524074185948164e-05, + "loss": 2.424, + "step": 28886 + }, + { + "epoch": 2.6171094652442752, + "grad_norm": 1.0261993408203125, + "learning_rate": 2.5518032984957412e-05, + "loss": 2.5625, + "step": 28887 + }, + { + "epoch": 2.6172000634187222, + "grad_norm": 1.0131137371063232, + "learning_rate": 2.5511991783966653e-05, + "loss": 2.5736, + "step": 28888 + }, + { + "epoch": 2.617290661593169, + "grad_norm": 1.0025720596313477, + "learning_rate": 2.5505950582975897e-05, + "loss": 2.939, + "step": 28889 + }, + { + "epoch": 2.617381259767616, + "grad_norm": 1.0725079774856567, + "learning_rate": 2.5499909381985138e-05, + "loss": 2.6485, + "step": 28890 + }, + { + "epoch": 2.6174718579420624, + "grad_norm": 0.9487033486366272, + "learning_rate": 2.5493868180994386e-05, + "loss": 1.8359, + "step": 28891 + }, + { + "epoch": 2.6175624561165094, + "grad_norm": 1.0132302045822144, + "learning_rate": 2.5487826980003626e-05, + "loss": 2.5442, + "step": 28892 + }, + { + "epoch": 2.617653054290956, + "grad_norm": 1.0701698064804077, + "learning_rate": 2.5481785779012867e-05, + "loss": 2.5447, + "step": 28893 + }, + { + "epoch": 2.617743652465403, + "grad_norm": 0.8254140615463257, + "learning_rate": 2.5475744578022108e-05, + "loss": 2.0027, + "step": 28894 + }, + { + "epoch": 2.6178342506398495, + "grad_norm": 1.071158766746521, + "learning_rate": 2.5469703377031356e-05, + "loss": 2.809, + "step": 28895 + }, + { + "epoch": 2.6179248488142965, + "grad_norm": 0.9174321889877319, + "learning_rate": 2.5463662176040596e-05, + "loss": 2.3763, + "step": 28896 + }, + { + "epoch": 2.618015446988743, + "grad_norm": 1.030063509941101, + "learning_rate": 2.545762097504984e-05, + "loss": 2.7054, + "step": 28897 + }, + { + "epoch": 2.61810604516319, + "grad_norm": 0.9536631107330322, + "learning_rate": 2.5451579774059088e-05, + "loss": 2.2016, + "step": 28898 + }, + { + "epoch": 2.6181966433376367, + "grad_norm": 1.033951997756958, + "learning_rate": 2.544553857306833e-05, + "loss": 2.5033, + "step": 28899 + }, + { + "epoch": 2.6182872415120837, + "grad_norm": 1.0235880613327026, + "learning_rate": 2.543949737207757e-05, + "loss": 2.2041, + "step": 28900 + }, + { + "epoch": 2.6183778396865303, + "grad_norm": 1.1567645072937012, + "learning_rate": 2.543345617108681e-05, + "loss": 2.6095, + "step": 28901 + }, + { + "epoch": 2.618468437860977, + "grad_norm": 1.0286766290664673, + "learning_rate": 2.542741497009606e-05, + "loss": 2.4083, + "step": 28902 + }, + { + "epoch": 2.618559036035424, + "grad_norm": 0.8753343224525452, + "learning_rate": 2.54213737691053e-05, + "loss": 1.994, + "step": 28903 + }, + { + "epoch": 2.618649634209871, + "grad_norm": 1.0360430479049683, + "learning_rate": 2.541533256811454e-05, + "loss": 2.4459, + "step": 28904 + }, + { + "epoch": 2.6187402323843174, + "grad_norm": 0.9419375061988831, + "learning_rate": 2.5409291367123784e-05, + "loss": 2.0657, + "step": 28905 + }, + { + "epoch": 2.618830830558764, + "grad_norm": 0.9454533457756042, + "learning_rate": 2.5403250166133032e-05, + "loss": 2.4952, + "step": 28906 + }, + { + "epoch": 2.618921428733211, + "grad_norm": 1.0552233457565308, + "learning_rate": 2.5397208965142273e-05, + "loss": 2.7305, + "step": 28907 + }, + { + "epoch": 2.619012026907658, + "grad_norm": 1.005175232887268, + "learning_rate": 2.5391167764151513e-05, + "loss": 2.6262, + "step": 28908 + }, + { + "epoch": 2.6191026250821046, + "grad_norm": 1.0551912784576416, + "learning_rate": 2.5385126563160754e-05, + "loss": 2.5678, + "step": 28909 + }, + { + "epoch": 2.619193223256551, + "grad_norm": 1.0489223003387451, + "learning_rate": 2.5379085362170002e-05, + "loss": 2.561, + "step": 28910 + }, + { + "epoch": 2.619283821430998, + "grad_norm": 1.116159200668335, + "learning_rate": 2.5373044161179243e-05, + "loss": 2.4646, + "step": 28911 + }, + { + "epoch": 2.619374419605445, + "grad_norm": 0.9805566668510437, + "learning_rate": 2.5367002960188484e-05, + "loss": 2.423, + "step": 28912 + }, + { + "epoch": 2.6194650177798917, + "grad_norm": 0.9772522449493408, + "learning_rate": 2.536096175919773e-05, + "loss": 2.465, + "step": 28913 + }, + { + "epoch": 2.6195556159543383, + "grad_norm": 0.9340047240257263, + "learning_rate": 2.5354920558206975e-05, + "loss": 2.5334, + "step": 28914 + }, + { + "epoch": 2.6196462141287853, + "grad_norm": 1.0142362117767334, + "learning_rate": 2.5348879357216216e-05, + "loss": 2.8077, + "step": 28915 + }, + { + "epoch": 2.6197368123032323, + "grad_norm": 0.980637788772583, + "learning_rate": 2.5342838156225457e-05, + "loss": 2.7018, + "step": 28916 + }, + { + "epoch": 2.619827410477679, + "grad_norm": 0.8950552940368652, + "learning_rate": 2.5336796955234705e-05, + "loss": 1.8002, + "step": 28917 + }, + { + "epoch": 2.6199180086521254, + "grad_norm": 1.085113286972046, + "learning_rate": 2.5330755754243946e-05, + "loss": 2.7674, + "step": 28918 + }, + { + "epoch": 2.6200086068265724, + "grad_norm": 1.0094001293182373, + "learning_rate": 2.5324714553253186e-05, + "loss": 2.5372, + "step": 28919 + }, + { + "epoch": 2.6200992050010194, + "grad_norm": 0.9536176919937134, + "learning_rate": 2.5318673352262427e-05, + "loss": 2.6358, + "step": 28920 + }, + { + "epoch": 2.620189803175466, + "grad_norm": 1.1283979415893555, + "learning_rate": 2.5312632151271675e-05, + "loss": 2.5725, + "step": 28921 + }, + { + "epoch": 2.6202804013499126, + "grad_norm": 1.0039118528366089, + "learning_rate": 2.530659095028092e-05, + "loss": 2.6351, + "step": 28922 + }, + { + "epoch": 2.6203709995243596, + "grad_norm": 0.9484413862228394, + "learning_rate": 2.530054974929016e-05, + "loss": 2.2025, + "step": 28923 + }, + { + "epoch": 2.6204615976988066, + "grad_norm": 1.069254755973816, + "learning_rate": 2.52945085482994e-05, + "loss": 2.7098, + "step": 28924 + }, + { + "epoch": 2.620552195873253, + "grad_norm": 0.9639595150947571, + "learning_rate": 2.5288467347308648e-05, + "loss": 2.8317, + "step": 28925 + }, + { + "epoch": 2.6206427940476997, + "grad_norm": 1.0443432331085205, + "learning_rate": 2.528242614631789e-05, + "loss": 2.6538, + "step": 28926 + }, + { + "epoch": 2.6207333922221467, + "grad_norm": 0.97618168592453, + "learning_rate": 2.527638494532713e-05, + "loss": 1.9198, + "step": 28927 + }, + { + "epoch": 2.6208239903965938, + "grad_norm": 1.0123411417007446, + "learning_rate": 2.5270343744336378e-05, + "loss": 2.6532, + "step": 28928 + }, + { + "epoch": 2.6209145885710403, + "grad_norm": 0.8592365384101868, + "learning_rate": 2.526430254334562e-05, + "loss": 2.0904, + "step": 28929 + }, + { + "epoch": 2.621005186745487, + "grad_norm": 0.9946090579032898, + "learning_rate": 2.525826134235486e-05, + "loss": 2.7245, + "step": 28930 + }, + { + "epoch": 2.621095784919934, + "grad_norm": 1.0128734111785889, + "learning_rate": 2.5252220141364103e-05, + "loss": 2.6334, + "step": 28931 + }, + { + "epoch": 2.621186383094381, + "grad_norm": 1.0185433626174927, + "learning_rate": 2.524617894037335e-05, + "loss": 2.7412, + "step": 28932 + }, + { + "epoch": 2.6212769812688275, + "grad_norm": 0.9825023412704468, + "learning_rate": 2.5240137739382592e-05, + "loss": 2.7138, + "step": 28933 + }, + { + "epoch": 2.621367579443274, + "grad_norm": 0.9362449645996094, + "learning_rate": 2.5234096538391833e-05, + "loss": 2.7768, + "step": 28934 + }, + { + "epoch": 2.621458177617721, + "grad_norm": 0.8624821305274963, + "learning_rate": 2.5228055337401073e-05, + "loss": 1.9326, + "step": 28935 + }, + { + "epoch": 2.621548775792168, + "grad_norm": 0.9619746208190918, + "learning_rate": 2.522201413641032e-05, + "loss": 2.4092, + "step": 28936 + }, + { + "epoch": 2.6216393739666146, + "grad_norm": 0.909321665763855, + "learning_rate": 2.5215972935419562e-05, + "loss": 2.0883, + "step": 28937 + }, + { + "epoch": 2.621729972141061, + "grad_norm": 0.9440787434577942, + "learning_rate": 2.5209931734428803e-05, + "loss": 1.7664, + "step": 28938 + }, + { + "epoch": 2.621820570315508, + "grad_norm": 0.9679258465766907, + "learning_rate": 2.5203890533438047e-05, + "loss": 2.3779, + "step": 28939 + }, + { + "epoch": 2.621911168489955, + "grad_norm": 1.01569402217865, + "learning_rate": 2.5197849332447295e-05, + "loss": 2.6965, + "step": 28940 + }, + { + "epoch": 2.6220017666644018, + "grad_norm": 0.8770269155502319, + "learning_rate": 2.5191808131456535e-05, + "loss": 1.8341, + "step": 28941 + }, + { + "epoch": 2.6220923648388483, + "grad_norm": 1.0334157943725586, + "learning_rate": 2.5185766930465776e-05, + "loss": 2.6325, + "step": 28942 + }, + { + "epoch": 2.6221829630132953, + "grad_norm": 1.0307587385177612, + "learning_rate": 2.5179725729475024e-05, + "loss": 2.8006, + "step": 28943 + }, + { + "epoch": 2.622273561187742, + "grad_norm": 0.9379618167877197, + "learning_rate": 2.5173684528484265e-05, + "loss": 2.4024, + "step": 28944 + }, + { + "epoch": 2.622364159362189, + "grad_norm": 1.0221787691116333, + "learning_rate": 2.5167643327493505e-05, + "loss": 2.8323, + "step": 28945 + }, + { + "epoch": 2.6224547575366355, + "grad_norm": 0.9977494478225708, + "learning_rate": 2.5161602126502746e-05, + "loss": 2.6235, + "step": 28946 + }, + { + "epoch": 2.6225453557110825, + "grad_norm": 1.0101873874664307, + "learning_rate": 2.5155560925511994e-05, + "loss": 2.61, + "step": 28947 + }, + { + "epoch": 2.622635953885529, + "grad_norm": 0.9771812558174133, + "learning_rate": 2.5149519724521238e-05, + "loss": 2.5821, + "step": 28948 + }, + { + "epoch": 2.622726552059976, + "grad_norm": 1.0588617324829102, + "learning_rate": 2.514347852353048e-05, + "loss": 2.6214, + "step": 28949 + }, + { + "epoch": 2.6228171502344226, + "grad_norm": 1.0070335865020752, + "learning_rate": 2.513743732253972e-05, + "loss": 2.7656, + "step": 28950 + }, + { + "epoch": 2.6229077484088696, + "grad_norm": 1.1152517795562744, + "learning_rate": 2.5131396121548967e-05, + "loss": 2.664, + "step": 28951 + }, + { + "epoch": 2.622998346583316, + "grad_norm": 1.0632882118225098, + "learning_rate": 2.5125354920558208e-05, + "loss": 2.3719, + "step": 28952 + }, + { + "epoch": 2.623088944757763, + "grad_norm": 1.126962661743164, + "learning_rate": 2.511931371956745e-05, + "loss": 2.6236, + "step": 28953 + }, + { + "epoch": 2.62317954293221, + "grad_norm": 1.048713207244873, + "learning_rate": 2.511327251857669e-05, + "loss": 2.5873, + "step": 28954 + }, + { + "epoch": 2.623270141106657, + "grad_norm": 1.02570378780365, + "learning_rate": 2.5107231317585938e-05, + "loss": 2.4303, + "step": 28955 + }, + { + "epoch": 2.6233607392811034, + "grad_norm": 0.9911465048789978, + "learning_rate": 2.5101190116595182e-05, + "loss": 2.6587, + "step": 28956 + }, + { + "epoch": 2.6234513374555504, + "grad_norm": 0.9057897329330444, + "learning_rate": 2.5095148915604423e-05, + "loss": 2.1284, + "step": 28957 + }, + { + "epoch": 2.623541935629997, + "grad_norm": 1.0204802751541138, + "learning_rate": 2.508910771461367e-05, + "loss": 2.5444, + "step": 28958 + }, + { + "epoch": 2.623632533804444, + "grad_norm": 1.0647964477539062, + "learning_rate": 2.508306651362291e-05, + "loss": 2.5676, + "step": 28959 + }, + { + "epoch": 2.6237231319788905, + "grad_norm": 0.9920529127120972, + "learning_rate": 2.5077025312632152e-05, + "loss": 2.4871, + "step": 28960 + }, + { + "epoch": 2.6238137301533375, + "grad_norm": 1.0581241846084595, + "learning_rate": 2.5070984111641393e-05, + "loss": 2.5078, + "step": 28961 + }, + { + "epoch": 2.623904328327784, + "grad_norm": 0.9886451959609985, + "learning_rate": 2.506494291065064e-05, + "loss": 2.5066, + "step": 28962 + }, + { + "epoch": 2.623994926502231, + "grad_norm": 1.0374581813812256, + "learning_rate": 2.505890170965988e-05, + "loss": 2.7436, + "step": 28963 + }, + { + "epoch": 2.6240855246766777, + "grad_norm": 0.9630736112594604, + "learning_rate": 2.5052860508669125e-05, + "loss": 2.4328, + "step": 28964 + }, + { + "epoch": 2.6241761228511247, + "grad_norm": 0.8896225690841675, + "learning_rate": 2.5046819307678366e-05, + "loss": 1.8147, + "step": 28965 + }, + { + "epoch": 2.6242667210255712, + "grad_norm": 1.0850088596343994, + "learning_rate": 2.5040778106687614e-05, + "loss": 2.6089, + "step": 28966 + }, + { + "epoch": 2.6243573192000182, + "grad_norm": 0.9770991206169128, + "learning_rate": 2.5034736905696855e-05, + "loss": 2.7716, + "step": 28967 + }, + { + "epoch": 2.624447917374465, + "grad_norm": 1.083247184753418, + "learning_rate": 2.5028695704706095e-05, + "loss": 2.713, + "step": 28968 + }, + { + "epoch": 2.624538515548912, + "grad_norm": 0.8397424221038818, + "learning_rate": 2.5022654503715336e-05, + "loss": 1.8579, + "step": 28969 + }, + { + "epoch": 2.6246291137233584, + "grad_norm": 0.9452641010284424, + "learning_rate": 2.5016613302724584e-05, + "loss": 2.3854, + "step": 28970 + }, + { + "epoch": 2.6247197118978054, + "grad_norm": 0.9595363140106201, + "learning_rate": 2.5010572101733825e-05, + "loss": 2.5888, + "step": 28971 + }, + { + "epoch": 2.624810310072252, + "grad_norm": 1.031072735786438, + "learning_rate": 2.500453090074307e-05, + "loss": 2.3837, + "step": 28972 + }, + { + "epoch": 2.624900908246699, + "grad_norm": 1.0170241594314575, + "learning_rate": 2.4998489699752313e-05, + "loss": 2.5052, + "step": 28973 + }, + { + "epoch": 2.6249915064211455, + "grad_norm": 0.9623063802719116, + "learning_rate": 2.4992448498761557e-05, + "loss": 2.6488, + "step": 28974 + }, + { + "epoch": 2.6250821045955925, + "grad_norm": 1.0056997537612915, + "learning_rate": 2.4986407297770798e-05, + "loss": 2.8795, + "step": 28975 + }, + { + "epoch": 2.625172702770039, + "grad_norm": 0.9666791558265686, + "learning_rate": 2.4980366096780042e-05, + "loss": 2.4099, + "step": 28976 + }, + { + "epoch": 2.625263300944486, + "grad_norm": 0.9667389392852783, + "learning_rate": 2.4974324895789283e-05, + "loss": 2.6793, + "step": 28977 + }, + { + "epoch": 2.6253538991189327, + "grad_norm": 0.9623873829841614, + "learning_rate": 2.4968283694798527e-05, + "loss": 2.8168, + "step": 28978 + }, + { + "epoch": 2.6254444972933797, + "grad_norm": 1.048866868019104, + "learning_rate": 2.4962242493807768e-05, + "loss": 2.4786, + "step": 28979 + }, + { + "epoch": 2.6255350954678263, + "grad_norm": 1.0144553184509277, + "learning_rate": 2.4956201292817012e-05, + "loss": 2.7151, + "step": 28980 + }, + { + "epoch": 2.6256256936422733, + "grad_norm": 0.9891345500946045, + "learning_rate": 2.4950160091826257e-05, + "loss": 2.3856, + "step": 28981 + }, + { + "epoch": 2.62571629181672, + "grad_norm": 0.9190739393234253, + "learning_rate": 2.49441188908355e-05, + "loss": 2.5899, + "step": 28982 + }, + { + "epoch": 2.625806889991167, + "grad_norm": 1.1401664018630981, + "learning_rate": 2.4938077689844742e-05, + "loss": 2.4709, + "step": 28983 + }, + { + "epoch": 2.6258974881656134, + "grad_norm": 1.0219447612762451, + "learning_rate": 2.4932036488853986e-05, + "loss": 2.6998, + "step": 28984 + }, + { + "epoch": 2.62598808634006, + "grad_norm": 0.995453417301178, + "learning_rate": 2.492599528786323e-05, + "loss": 2.5326, + "step": 28985 + }, + { + "epoch": 2.626078684514507, + "grad_norm": 1.0132991075515747, + "learning_rate": 2.491995408687247e-05, + "loss": 2.6316, + "step": 28986 + }, + { + "epoch": 2.626169282688954, + "grad_norm": 1.0997295379638672, + "learning_rate": 2.4913912885881715e-05, + "loss": 2.6264, + "step": 28987 + }, + { + "epoch": 2.6262598808634006, + "grad_norm": 1.0253456830978394, + "learning_rate": 2.4907871684890956e-05, + "loss": 2.5952, + "step": 28988 + }, + { + "epoch": 2.626350479037847, + "grad_norm": 1.0379732847213745, + "learning_rate": 2.49018304839002e-05, + "loss": 2.615, + "step": 28989 + }, + { + "epoch": 2.626441077212294, + "grad_norm": 0.9954696297645569, + "learning_rate": 2.4895789282909444e-05, + "loss": 2.5621, + "step": 28990 + }, + { + "epoch": 2.626531675386741, + "grad_norm": 0.9737968444824219, + "learning_rate": 2.488974808191869e-05, + "loss": 2.6543, + "step": 28991 + }, + { + "epoch": 2.6266222735611877, + "grad_norm": 1.00263512134552, + "learning_rate": 2.488370688092793e-05, + "loss": 2.5495, + "step": 28992 + }, + { + "epoch": 2.6267128717356343, + "grad_norm": 0.971217930316925, + "learning_rate": 2.4877665679937174e-05, + "loss": 2.6282, + "step": 28993 + }, + { + "epoch": 2.6268034699100813, + "grad_norm": 0.9469628930091858, + "learning_rate": 2.4871624478946415e-05, + "loss": 2.5272, + "step": 28994 + }, + { + "epoch": 2.6268940680845283, + "grad_norm": 0.9971373677253723, + "learning_rate": 2.486558327795566e-05, + "loss": 2.6868, + "step": 28995 + }, + { + "epoch": 2.626984666258975, + "grad_norm": 0.9533106684684753, + "learning_rate": 2.48595420769649e-05, + "loss": 2.7818, + "step": 28996 + }, + { + "epoch": 2.6270752644334214, + "grad_norm": 1.0817887783050537, + "learning_rate": 2.4853500875974144e-05, + "loss": 2.7201, + "step": 28997 + }, + { + "epoch": 2.6271658626078684, + "grad_norm": 0.7450571656227112, + "learning_rate": 2.4847459674983388e-05, + "loss": 1.3341, + "step": 28998 + }, + { + "epoch": 2.6272564607823155, + "grad_norm": 0.971695065498352, + "learning_rate": 2.4841418473992632e-05, + "loss": 2.8596, + "step": 28999 + }, + { + "epoch": 2.627347058956762, + "grad_norm": 1.0289843082427979, + "learning_rate": 2.4835377273001876e-05, + "loss": 2.7935, + "step": 29000 + }, + { + "epoch": 2.6274376571312086, + "grad_norm": 0.9724569320678711, + "learning_rate": 2.4829336072011117e-05, + "loss": 2.6279, + "step": 29001 + }, + { + "epoch": 2.6275282553056556, + "grad_norm": 1.0341355800628662, + "learning_rate": 2.482329487102036e-05, + "loss": 2.7623, + "step": 29002 + }, + { + "epoch": 2.6276188534801026, + "grad_norm": 1.086519718170166, + "learning_rate": 2.4817253670029602e-05, + "loss": 2.6704, + "step": 29003 + }, + { + "epoch": 2.627709451654549, + "grad_norm": 1.0697234869003296, + "learning_rate": 2.4811212469038847e-05, + "loss": 2.4448, + "step": 29004 + }, + { + "epoch": 2.6278000498289957, + "grad_norm": 0.7399821281433105, + "learning_rate": 2.4805171268048087e-05, + "loss": 1.4019, + "step": 29005 + }, + { + "epoch": 2.6278906480034427, + "grad_norm": 1.0475654602050781, + "learning_rate": 2.479913006705733e-05, + "loss": 2.6764, + "step": 29006 + }, + { + "epoch": 2.6279812461778898, + "grad_norm": 1.0790247917175293, + "learning_rate": 2.4793088866066576e-05, + "loss": 2.6797, + "step": 29007 + }, + { + "epoch": 2.6280718443523363, + "grad_norm": 0.9809174537658691, + "learning_rate": 2.478704766507582e-05, + "loss": 2.2861, + "step": 29008 + }, + { + "epoch": 2.628162442526783, + "grad_norm": 1.0521401166915894, + "learning_rate": 2.478100646408506e-05, + "loss": 2.5946, + "step": 29009 + }, + { + "epoch": 2.62825304070123, + "grad_norm": 0.9827528595924377, + "learning_rate": 2.4774965263094305e-05, + "loss": 2.5607, + "step": 29010 + }, + { + "epoch": 2.628343638875677, + "grad_norm": 1.0348106622695923, + "learning_rate": 2.4768924062103546e-05, + "loss": 2.5663, + "step": 29011 + }, + { + "epoch": 2.6284342370501235, + "grad_norm": 0.9374512434005737, + "learning_rate": 2.476288286111279e-05, + "loss": 2.8103, + "step": 29012 + }, + { + "epoch": 2.62852483522457, + "grad_norm": 0.9867594242095947, + "learning_rate": 2.475684166012203e-05, + "loss": 2.5997, + "step": 29013 + }, + { + "epoch": 2.628615433399017, + "grad_norm": 0.9737594127655029, + "learning_rate": 2.4750800459131275e-05, + "loss": 2.67, + "step": 29014 + }, + { + "epoch": 2.628706031573464, + "grad_norm": 1.0153295993804932, + "learning_rate": 2.474475925814052e-05, + "loss": 2.482, + "step": 29015 + }, + { + "epoch": 2.6287966297479106, + "grad_norm": 0.9856018424034119, + "learning_rate": 2.4738718057149764e-05, + "loss": 2.6041, + "step": 29016 + }, + { + "epoch": 2.628887227922357, + "grad_norm": 1.021597981452942, + "learning_rate": 2.4732676856159008e-05, + "loss": 2.7593, + "step": 29017 + }, + { + "epoch": 2.628977826096804, + "grad_norm": 0.8799956440925598, + "learning_rate": 2.472663565516825e-05, + "loss": 2.0513, + "step": 29018 + }, + { + "epoch": 2.629068424271251, + "grad_norm": 0.9854252934455872, + "learning_rate": 2.4720594454177493e-05, + "loss": 2.49, + "step": 29019 + }, + { + "epoch": 2.6291590224456978, + "grad_norm": 1.0984481573104858, + "learning_rate": 2.4714553253186734e-05, + "loss": 2.9059, + "step": 29020 + }, + { + "epoch": 2.6292496206201443, + "grad_norm": 1.0036826133728027, + "learning_rate": 2.4708512052195978e-05, + "loss": 2.3736, + "step": 29021 + }, + { + "epoch": 2.6293402187945913, + "grad_norm": 0.978705883026123, + "learning_rate": 2.470247085120522e-05, + "loss": 2.5513, + "step": 29022 + }, + { + "epoch": 2.629430816969038, + "grad_norm": 1.1081116199493408, + "learning_rate": 2.4696429650214463e-05, + "loss": 2.7015, + "step": 29023 + }, + { + "epoch": 2.629521415143485, + "grad_norm": 1.056781530380249, + "learning_rate": 2.4690388449223707e-05, + "loss": 2.8775, + "step": 29024 + }, + { + "epoch": 2.6296120133179315, + "grad_norm": 1.0051904916763306, + "learning_rate": 2.468434724823295e-05, + "loss": 2.6559, + "step": 29025 + }, + { + "epoch": 2.6297026114923785, + "grad_norm": 1.0288640260696411, + "learning_rate": 2.4678306047242192e-05, + "loss": 2.8454, + "step": 29026 + }, + { + "epoch": 2.629793209666825, + "grad_norm": 1.0300512313842773, + "learning_rate": 2.4672264846251436e-05, + "loss": 2.7903, + "step": 29027 + }, + { + "epoch": 2.629883807841272, + "grad_norm": 0.9633411169052124, + "learning_rate": 2.4666223645260677e-05, + "loss": 2.6274, + "step": 29028 + }, + { + "epoch": 2.6299744060157186, + "grad_norm": 1.0885637998580933, + "learning_rate": 2.466018244426992e-05, + "loss": 2.6304, + "step": 29029 + }, + { + "epoch": 2.6300650041901656, + "grad_norm": 1.0358951091766357, + "learning_rate": 2.4654141243279166e-05, + "loss": 2.3408, + "step": 29030 + }, + { + "epoch": 2.630155602364612, + "grad_norm": 0.9985037446022034, + "learning_rate": 2.4648100042288407e-05, + "loss": 2.6032, + "step": 29031 + }, + { + "epoch": 2.630246200539059, + "grad_norm": 1.0260868072509766, + "learning_rate": 2.464205884129765e-05, + "loss": 2.7052, + "step": 29032 + }, + { + "epoch": 2.630336798713506, + "grad_norm": 1.0370771884918213, + "learning_rate": 2.4636017640306895e-05, + "loss": 2.6979, + "step": 29033 + }, + { + "epoch": 2.630427396887953, + "grad_norm": 1.0810686349868774, + "learning_rate": 2.462997643931614e-05, + "loss": 2.6568, + "step": 29034 + }, + { + "epoch": 2.6305179950623994, + "grad_norm": 0.9873971939086914, + "learning_rate": 2.462393523832538e-05, + "loss": 2.6568, + "step": 29035 + }, + { + "epoch": 2.6306085932368464, + "grad_norm": 0.9997642040252686, + "learning_rate": 2.4617894037334624e-05, + "loss": 3.0183, + "step": 29036 + }, + { + "epoch": 2.630699191411293, + "grad_norm": 1.0819694995880127, + "learning_rate": 2.4611852836343865e-05, + "loss": 2.6622, + "step": 29037 + }, + { + "epoch": 2.63078978958574, + "grad_norm": 1.024583101272583, + "learning_rate": 2.460581163535311e-05, + "loss": 2.6651, + "step": 29038 + }, + { + "epoch": 2.6308803877601865, + "grad_norm": 0.9725327491760254, + "learning_rate": 2.459977043436235e-05, + "loss": 2.6765, + "step": 29039 + }, + { + "epoch": 2.6309709859346335, + "grad_norm": 0.9928376078605652, + "learning_rate": 2.4593729233371594e-05, + "loss": 2.5543, + "step": 29040 + }, + { + "epoch": 2.63106158410908, + "grad_norm": 1.006777048110962, + "learning_rate": 2.458768803238084e-05, + "loss": 2.6644, + "step": 29041 + }, + { + "epoch": 2.631152182283527, + "grad_norm": 0.9867772459983826, + "learning_rate": 2.4581646831390083e-05, + "loss": 2.8533, + "step": 29042 + }, + { + "epoch": 2.6312427804579737, + "grad_norm": 1.0116400718688965, + "learning_rate": 2.4575605630399324e-05, + "loss": 2.6271, + "step": 29043 + }, + { + "epoch": 2.6313333786324207, + "grad_norm": 1.0523087978363037, + "learning_rate": 2.4569564429408568e-05, + "loss": 2.4123, + "step": 29044 + }, + { + "epoch": 2.6314239768068672, + "grad_norm": 1.0116549730300903, + "learning_rate": 2.4563523228417812e-05, + "loss": 2.729, + "step": 29045 + }, + { + "epoch": 2.6315145749813142, + "grad_norm": 1.0001254081726074, + "learning_rate": 2.4557482027427053e-05, + "loss": 1.9351, + "step": 29046 + }, + { + "epoch": 2.631605173155761, + "grad_norm": 0.9676617383956909, + "learning_rate": 2.4551440826436297e-05, + "loss": 2.3498, + "step": 29047 + }, + { + "epoch": 2.631695771330208, + "grad_norm": 0.8709383606910706, + "learning_rate": 2.4545399625445538e-05, + "loss": 1.9103, + "step": 29048 + }, + { + "epoch": 2.6317863695046544, + "grad_norm": 1.0896824598312378, + "learning_rate": 2.4539358424454782e-05, + "loss": 2.7255, + "step": 29049 + }, + { + "epoch": 2.6318769676791014, + "grad_norm": 0.9806045889854431, + "learning_rate": 2.4533317223464026e-05, + "loss": 2.6103, + "step": 29050 + }, + { + "epoch": 2.631967565853548, + "grad_norm": 1.07316255569458, + "learning_rate": 2.452727602247327e-05, + "loss": 2.5998, + "step": 29051 + }, + { + "epoch": 2.632058164027995, + "grad_norm": 1.1389904022216797, + "learning_rate": 2.452123482148251e-05, + "loss": 2.1747, + "step": 29052 + }, + { + "epoch": 2.6321487622024415, + "grad_norm": 0.9983422160148621, + "learning_rate": 2.4515193620491756e-05, + "loss": 2.7003, + "step": 29053 + }, + { + "epoch": 2.6322393603768885, + "grad_norm": 0.9341314435005188, + "learning_rate": 2.4509152419500996e-05, + "loss": 2.5554, + "step": 29054 + }, + { + "epoch": 2.632329958551335, + "grad_norm": 0.9317852854728699, + "learning_rate": 2.450311121851024e-05, + "loss": 2.5293, + "step": 29055 + }, + { + "epoch": 2.632420556725782, + "grad_norm": 0.9481920599937439, + "learning_rate": 2.449707001751948e-05, + "loss": 2.4672, + "step": 29056 + }, + { + "epoch": 2.6325111549002287, + "grad_norm": 1.0795526504516602, + "learning_rate": 2.4491028816528726e-05, + "loss": 2.1851, + "step": 29057 + }, + { + "epoch": 2.6326017530746757, + "grad_norm": 0.8679572939872742, + "learning_rate": 2.448498761553797e-05, + "loss": 1.9427, + "step": 29058 + }, + { + "epoch": 2.6326923512491223, + "grad_norm": 1.1054061651229858, + "learning_rate": 2.4478946414547214e-05, + "loss": 2.4049, + "step": 29059 + }, + { + "epoch": 2.6327829494235693, + "grad_norm": 1.0856757164001465, + "learning_rate": 2.447290521355646e-05, + "loss": 2.7819, + "step": 29060 + }, + { + "epoch": 2.632873547598016, + "grad_norm": 1.0114182233810425, + "learning_rate": 2.44668640125657e-05, + "loss": 2.511, + "step": 29061 + }, + { + "epoch": 2.632964145772463, + "grad_norm": 1.1292784214019775, + "learning_rate": 2.4460822811574943e-05, + "loss": 2.6946, + "step": 29062 + }, + { + "epoch": 2.6330547439469094, + "grad_norm": 0.9733827114105225, + "learning_rate": 2.4454781610584184e-05, + "loss": 2.6217, + "step": 29063 + }, + { + "epoch": 2.633145342121356, + "grad_norm": 0.9622227549552917, + "learning_rate": 2.444874040959343e-05, + "loss": 1.9663, + "step": 29064 + }, + { + "epoch": 2.633235940295803, + "grad_norm": 0.8316214084625244, + "learning_rate": 2.444269920860267e-05, + "loss": 2.0195, + "step": 29065 + }, + { + "epoch": 2.63332653847025, + "grad_norm": 1.0995171070098877, + "learning_rate": 2.4436658007611914e-05, + "loss": 2.8627, + "step": 29066 + }, + { + "epoch": 2.6334171366446966, + "grad_norm": 1.0469181537628174, + "learning_rate": 2.4430616806621158e-05, + "loss": 2.7755, + "step": 29067 + }, + { + "epoch": 2.633507734819143, + "grad_norm": 0.9900240898132324, + "learning_rate": 2.4424575605630402e-05, + "loss": 2.6028, + "step": 29068 + }, + { + "epoch": 2.63359833299359, + "grad_norm": 1.0566514730453491, + "learning_rate": 2.4418534404639643e-05, + "loss": 2.4527, + "step": 29069 + }, + { + "epoch": 2.633688931168037, + "grad_norm": 1.047403335571289, + "learning_rate": 2.4412493203648887e-05, + "loss": 2.6728, + "step": 29070 + }, + { + "epoch": 2.6337795293424837, + "grad_norm": 0.9832709431648254, + "learning_rate": 2.4406452002658128e-05, + "loss": 2.5807, + "step": 29071 + }, + { + "epoch": 2.6338701275169303, + "grad_norm": 0.9500866532325745, + "learning_rate": 2.4400410801667372e-05, + "loss": 2.6122, + "step": 29072 + }, + { + "epoch": 2.6339607256913773, + "grad_norm": 1.0800994634628296, + "learning_rate": 2.4394369600676616e-05, + "loss": 2.7397, + "step": 29073 + }, + { + "epoch": 2.6340513238658243, + "grad_norm": 1.0374480485916138, + "learning_rate": 2.4388328399685857e-05, + "loss": 2.4938, + "step": 29074 + }, + { + "epoch": 2.634141922040271, + "grad_norm": 1.1107919216156006, + "learning_rate": 2.43822871986951e-05, + "loss": 2.6723, + "step": 29075 + }, + { + "epoch": 2.6342325202147174, + "grad_norm": 0.9897051453590393, + "learning_rate": 2.4376245997704346e-05, + "loss": 2.5483, + "step": 29076 + }, + { + "epoch": 2.6343231183891644, + "grad_norm": 0.9946452975273132, + "learning_rate": 2.437020479671359e-05, + "loss": 2.3106, + "step": 29077 + }, + { + "epoch": 2.6344137165636115, + "grad_norm": 1.028090238571167, + "learning_rate": 2.436416359572283e-05, + "loss": 2.8111, + "step": 29078 + }, + { + "epoch": 2.634504314738058, + "grad_norm": 0.9854893088340759, + "learning_rate": 2.4358122394732075e-05, + "loss": 2.5766, + "step": 29079 + }, + { + "epoch": 2.6345949129125046, + "grad_norm": 1.1188220977783203, + "learning_rate": 2.4352081193741316e-05, + "loss": 2.6389, + "step": 29080 + }, + { + "epoch": 2.6346855110869516, + "grad_norm": 1.022261619567871, + "learning_rate": 2.434603999275056e-05, + "loss": 2.684, + "step": 29081 + }, + { + "epoch": 2.6347761092613986, + "grad_norm": 0.9637933969497681, + "learning_rate": 2.43399987917598e-05, + "loss": 2.6509, + "step": 29082 + }, + { + "epoch": 2.634866707435845, + "grad_norm": 1.0286192893981934, + "learning_rate": 2.4333957590769045e-05, + "loss": 2.7925, + "step": 29083 + }, + { + "epoch": 2.6349573056102917, + "grad_norm": 0.978060245513916, + "learning_rate": 2.432791638977829e-05, + "loss": 2.5189, + "step": 29084 + }, + { + "epoch": 2.6350479037847387, + "grad_norm": 1.1131064891815186, + "learning_rate": 2.4321875188787533e-05, + "loss": 2.586, + "step": 29085 + }, + { + "epoch": 2.6351385019591858, + "grad_norm": 1.0152417421340942, + "learning_rate": 2.4315833987796774e-05, + "loss": 2.5621, + "step": 29086 + }, + { + "epoch": 2.6352291001336323, + "grad_norm": 0.9940605163574219, + "learning_rate": 2.430979278680602e-05, + "loss": 2.5222, + "step": 29087 + }, + { + "epoch": 2.635319698308079, + "grad_norm": 1.0161051750183105, + "learning_rate": 2.4303751585815263e-05, + "loss": 2.735, + "step": 29088 + }, + { + "epoch": 2.635410296482526, + "grad_norm": 1.0635606050491333, + "learning_rate": 2.4297710384824503e-05, + "loss": 2.8904, + "step": 29089 + }, + { + "epoch": 2.635500894656973, + "grad_norm": 1.1138131618499756, + "learning_rate": 2.4291669183833748e-05, + "loss": 2.531, + "step": 29090 + }, + { + "epoch": 2.6355914928314195, + "grad_norm": 1.047061800956726, + "learning_rate": 2.428562798284299e-05, + "loss": 2.3801, + "step": 29091 + }, + { + "epoch": 2.635682091005866, + "grad_norm": 1.0614417791366577, + "learning_rate": 2.4279586781852233e-05, + "loss": 2.616, + "step": 29092 + }, + { + "epoch": 2.635772689180313, + "grad_norm": 1.0205355882644653, + "learning_rate": 2.4273545580861477e-05, + "loss": 2.8354, + "step": 29093 + }, + { + "epoch": 2.63586328735476, + "grad_norm": 1.0036327838897705, + "learning_rate": 2.426750437987072e-05, + "loss": 2.4093, + "step": 29094 + }, + { + "epoch": 2.6359538855292066, + "grad_norm": 1.0487653017044067, + "learning_rate": 2.4261463178879962e-05, + "loss": 2.719, + "step": 29095 + }, + { + "epoch": 2.636044483703653, + "grad_norm": 0.8337616324424744, + "learning_rate": 2.4255421977889206e-05, + "loss": 2.0868, + "step": 29096 + }, + { + "epoch": 2.6361350818781, + "grad_norm": 0.9763174057006836, + "learning_rate": 2.4249380776898447e-05, + "loss": 2.5014, + "step": 29097 + }, + { + "epoch": 2.636225680052547, + "grad_norm": 1.0555301904678345, + "learning_rate": 2.424333957590769e-05, + "loss": 2.526, + "step": 29098 + }, + { + "epoch": 2.6363162782269938, + "grad_norm": 1.0036113262176514, + "learning_rate": 2.4237298374916932e-05, + "loss": 2.76, + "step": 29099 + }, + { + "epoch": 2.6364068764014403, + "grad_norm": 0.9762412309646606, + "learning_rate": 2.4231257173926176e-05, + "loss": 2.6261, + "step": 29100 + }, + { + "epoch": 2.6364974745758873, + "grad_norm": 1.1003409624099731, + "learning_rate": 2.422521597293542e-05, + "loss": 2.7437, + "step": 29101 + }, + { + "epoch": 2.6365880727503344, + "grad_norm": 1.0245656967163086, + "learning_rate": 2.4219174771944665e-05, + "loss": 2.8193, + "step": 29102 + }, + { + "epoch": 2.636678670924781, + "grad_norm": 1.1059061288833618, + "learning_rate": 2.421313357095391e-05, + "loss": 2.6724, + "step": 29103 + }, + { + "epoch": 2.6367692690992275, + "grad_norm": 1.009169340133667, + "learning_rate": 2.420709236996315e-05, + "loss": 2.7389, + "step": 29104 + }, + { + "epoch": 2.6368598672736745, + "grad_norm": 1.0526511669158936, + "learning_rate": 2.4201051168972394e-05, + "loss": 2.8574, + "step": 29105 + }, + { + "epoch": 2.636950465448121, + "grad_norm": 1.044662356376648, + "learning_rate": 2.4195009967981635e-05, + "loss": 2.7556, + "step": 29106 + }, + { + "epoch": 2.637041063622568, + "grad_norm": 0.9959329962730408, + "learning_rate": 2.418896876699088e-05, + "loss": 2.4102, + "step": 29107 + }, + { + "epoch": 2.6371316617970146, + "grad_norm": 1.0012770891189575, + "learning_rate": 2.418292756600012e-05, + "loss": 2.6463, + "step": 29108 + }, + { + "epoch": 2.6372222599714616, + "grad_norm": 0.9498288631439209, + "learning_rate": 2.4176886365009367e-05, + "loss": 2.5497, + "step": 29109 + }, + { + "epoch": 2.637312858145908, + "grad_norm": 0.9812195301055908, + "learning_rate": 2.4170845164018608e-05, + "loss": 2.7462, + "step": 29110 + }, + { + "epoch": 2.637403456320355, + "grad_norm": 1.0352824926376343, + "learning_rate": 2.4164803963027853e-05, + "loss": 2.6713, + "step": 29111 + }, + { + "epoch": 2.637494054494802, + "grad_norm": 0.9885072708129883, + "learning_rate": 2.4158762762037093e-05, + "loss": 2.5817, + "step": 29112 + }, + { + "epoch": 2.637584652669249, + "grad_norm": 1.1228071451187134, + "learning_rate": 2.4152721561046338e-05, + "loss": 2.5436, + "step": 29113 + }, + { + "epoch": 2.6376752508436954, + "grad_norm": 1.0754057168960571, + "learning_rate": 2.414668036005558e-05, + "loss": 2.3609, + "step": 29114 + }, + { + "epoch": 2.6377658490181424, + "grad_norm": 0.982236385345459, + "learning_rate": 2.4140639159064823e-05, + "loss": 2.6256, + "step": 29115 + }, + { + "epoch": 2.637856447192589, + "grad_norm": 0.976948618888855, + "learning_rate": 2.4134597958074063e-05, + "loss": 2.4484, + "step": 29116 + }, + { + "epoch": 2.637947045367036, + "grad_norm": 0.9858495593070984, + "learning_rate": 2.4128556757083308e-05, + "loss": 2.5044, + "step": 29117 + }, + { + "epoch": 2.6380376435414825, + "grad_norm": 1.0349215269088745, + "learning_rate": 2.4122515556092555e-05, + "loss": 2.3015, + "step": 29118 + }, + { + "epoch": 2.6381282417159295, + "grad_norm": 1.032065987586975, + "learning_rate": 2.4116474355101796e-05, + "loss": 2.8041, + "step": 29119 + }, + { + "epoch": 2.638218839890376, + "grad_norm": 0.983567476272583, + "learning_rate": 2.411043315411104e-05, + "loss": 2.5695, + "step": 29120 + }, + { + "epoch": 2.638309438064823, + "grad_norm": 0.993388831615448, + "learning_rate": 2.410439195312028e-05, + "loss": 2.6498, + "step": 29121 + }, + { + "epoch": 2.6384000362392697, + "grad_norm": 0.8740693926811218, + "learning_rate": 2.4098350752129525e-05, + "loss": 1.9815, + "step": 29122 + }, + { + "epoch": 2.6384906344137167, + "grad_norm": 1.0330885648727417, + "learning_rate": 2.4092309551138766e-05, + "loss": 2.5896, + "step": 29123 + }, + { + "epoch": 2.6385812325881632, + "grad_norm": 0.994254469871521, + "learning_rate": 2.408626835014801e-05, + "loss": 2.3575, + "step": 29124 + }, + { + "epoch": 2.6386718307626102, + "grad_norm": 1.1103297472000122, + "learning_rate": 2.408022714915725e-05, + "loss": 2.5974, + "step": 29125 + }, + { + "epoch": 2.638762428937057, + "grad_norm": 1.057851791381836, + "learning_rate": 2.40741859481665e-05, + "loss": 2.6196, + "step": 29126 + }, + { + "epoch": 2.638853027111504, + "grad_norm": 0.9863763451576233, + "learning_rate": 2.406814474717574e-05, + "loss": 2.8871, + "step": 29127 + }, + { + "epoch": 2.6389436252859504, + "grad_norm": 0.9873208999633789, + "learning_rate": 2.4062103546184984e-05, + "loss": 2.4487, + "step": 29128 + }, + { + "epoch": 2.6390342234603974, + "grad_norm": 0.871606707572937, + "learning_rate": 2.4056062345194225e-05, + "loss": 2.0897, + "step": 29129 + }, + { + "epoch": 2.639124821634844, + "grad_norm": 1.184149980545044, + "learning_rate": 2.405002114420347e-05, + "loss": 2.4332, + "step": 29130 + }, + { + "epoch": 2.639215419809291, + "grad_norm": 1.0248669385910034, + "learning_rate": 2.404397994321271e-05, + "loss": 2.7277, + "step": 29131 + }, + { + "epoch": 2.6393060179837375, + "grad_norm": 1.0505638122558594, + "learning_rate": 2.4037938742221954e-05, + "loss": 2.4023, + "step": 29132 + }, + { + "epoch": 2.6393966161581845, + "grad_norm": 0.90705806016922, + "learning_rate": 2.4031897541231198e-05, + "loss": 2.3179, + "step": 29133 + }, + { + "epoch": 2.639487214332631, + "grad_norm": 1.0783535242080688, + "learning_rate": 2.4025856340240442e-05, + "loss": 2.6003, + "step": 29134 + }, + { + "epoch": 2.639577812507078, + "grad_norm": 1.180431604385376, + "learning_rate": 2.4019815139249687e-05, + "loss": 2.464, + "step": 29135 + }, + { + "epoch": 2.6396684106815247, + "grad_norm": 0.9869813323020935, + "learning_rate": 2.4013773938258927e-05, + "loss": 2.5877, + "step": 29136 + }, + { + "epoch": 2.6397590088559717, + "grad_norm": 1.2369929552078247, + "learning_rate": 2.400773273726817e-05, + "loss": 2.7222, + "step": 29137 + }, + { + "epoch": 2.6398496070304183, + "grad_norm": 1.0408563613891602, + "learning_rate": 2.4001691536277412e-05, + "loss": 2.3402, + "step": 29138 + }, + { + "epoch": 2.6399402052048653, + "grad_norm": 1.015864372253418, + "learning_rate": 2.3995650335286657e-05, + "loss": 2.5892, + "step": 29139 + }, + { + "epoch": 2.640030803379312, + "grad_norm": 1.0149623155593872, + "learning_rate": 2.3989609134295898e-05, + "loss": 2.6522, + "step": 29140 + }, + { + "epoch": 2.640121401553759, + "grad_norm": 1.0578821897506714, + "learning_rate": 2.3983567933305142e-05, + "loss": 2.7655, + "step": 29141 + }, + { + "epoch": 2.6402119997282054, + "grad_norm": 0.803134024143219, + "learning_rate": 2.3977526732314386e-05, + "loss": 2.0209, + "step": 29142 + }, + { + "epoch": 2.6403025979026524, + "grad_norm": 0.9807721376419067, + "learning_rate": 2.397148553132363e-05, + "loss": 2.3253, + "step": 29143 + }, + { + "epoch": 2.640393196077099, + "grad_norm": 1.032403588294983, + "learning_rate": 2.396544433033287e-05, + "loss": 2.5978, + "step": 29144 + }, + { + "epoch": 2.640483794251546, + "grad_norm": 1.082077145576477, + "learning_rate": 2.3959403129342115e-05, + "loss": 2.7451, + "step": 29145 + }, + { + "epoch": 2.6405743924259926, + "grad_norm": 0.8677197694778442, + "learning_rate": 2.3953361928351356e-05, + "loss": 1.699, + "step": 29146 + }, + { + "epoch": 2.640664990600439, + "grad_norm": 1.0672463178634644, + "learning_rate": 2.39473207273606e-05, + "loss": 2.5324, + "step": 29147 + }, + { + "epoch": 2.640755588774886, + "grad_norm": 1.0864436626434326, + "learning_rate": 2.3941279526369845e-05, + "loss": 2.521, + "step": 29148 + }, + { + "epoch": 2.640846186949333, + "grad_norm": 0.9639827013015747, + "learning_rate": 2.3935238325379085e-05, + "loss": 2.7352, + "step": 29149 + }, + { + "epoch": 2.6409367851237797, + "grad_norm": 0.9784150719642639, + "learning_rate": 2.392919712438833e-05, + "loss": 2.6859, + "step": 29150 + }, + { + "epoch": 2.6410273832982263, + "grad_norm": 0.9839655756950378, + "learning_rate": 2.3923155923397574e-05, + "loss": 1.6779, + "step": 29151 + }, + { + "epoch": 2.6411179814726733, + "grad_norm": 0.9986762404441833, + "learning_rate": 2.3917114722406818e-05, + "loss": 2.6536, + "step": 29152 + }, + { + "epoch": 2.6412085796471203, + "grad_norm": 1.0667121410369873, + "learning_rate": 2.391107352141606e-05, + "loss": 2.4602, + "step": 29153 + }, + { + "epoch": 2.641299177821567, + "grad_norm": 0.811095654964447, + "learning_rate": 2.3905032320425303e-05, + "loss": 1.8455, + "step": 29154 + }, + { + "epoch": 2.6413897759960134, + "grad_norm": 0.9688177108764648, + "learning_rate": 2.3898991119434544e-05, + "loss": 2.4613, + "step": 29155 + }, + { + "epoch": 2.6414803741704604, + "grad_norm": 1.1061887741088867, + "learning_rate": 2.3892949918443788e-05, + "loss": 2.4808, + "step": 29156 + }, + { + "epoch": 2.6415709723449075, + "grad_norm": 0.9357749223709106, + "learning_rate": 2.388690871745303e-05, + "loss": 2.1639, + "step": 29157 + }, + { + "epoch": 2.641661570519354, + "grad_norm": 0.9928023815155029, + "learning_rate": 2.3880867516462273e-05, + "loss": 2.47, + "step": 29158 + }, + { + "epoch": 2.6417521686938006, + "grad_norm": 0.9801912903785706, + "learning_rate": 2.3874826315471517e-05, + "loss": 2.3549, + "step": 29159 + }, + { + "epoch": 2.6418427668682476, + "grad_norm": 1.0139636993408203, + "learning_rate": 2.386878511448076e-05, + "loss": 2.6139, + "step": 29160 + }, + { + "epoch": 2.6419333650426946, + "grad_norm": 0.8829042315483093, + "learning_rate": 2.3862743913490002e-05, + "loss": 2.1829, + "step": 29161 + }, + { + "epoch": 2.642023963217141, + "grad_norm": 0.9613895416259766, + "learning_rate": 2.3856702712499247e-05, + "loss": 2.6955, + "step": 29162 + }, + { + "epoch": 2.6421145613915877, + "grad_norm": 1.0502419471740723, + "learning_rate": 2.385066151150849e-05, + "loss": 2.8271, + "step": 29163 + }, + { + "epoch": 2.6422051595660347, + "grad_norm": 1.068243145942688, + "learning_rate": 2.384462031051773e-05, + "loss": 2.5374, + "step": 29164 + }, + { + "epoch": 2.6422957577404818, + "grad_norm": 0.9682178497314453, + "learning_rate": 2.3838579109526976e-05, + "loss": 2.5674, + "step": 29165 + }, + { + "epoch": 2.6423863559149283, + "grad_norm": 1.12442147731781, + "learning_rate": 2.3832537908536217e-05, + "loss": 2.7831, + "step": 29166 + }, + { + "epoch": 2.642476954089375, + "grad_norm": 1.0259208679199219, + "learning_rate": 2.382649670754546e-05, + "loss": 2.6033, + "step": 29167 + }, + { + "epoch": 2.642567552263822, + "grad_norm": 1.0941245555877686, + "learning_rate": 2.3820455506554705e-05, + "loss": 2.8631, + "step": 29168 + }, + { + "epoch": 2.642658150438269, + "grad_norm": 1.094390630722046, + "learning_rate": 2.381441430556395e-05, + "loss": 2.6499, + "step": 29169 + }, + { + "epoch": 2.6427487486127155, + "grad_norm": 1.200584053993225, + "learning_rate": 2.380837310457319e-05, + "loss": 2.6133, + "step": 29170 + }, + { + "epoch": 2.642839346787162, + "grad_norm": 1.0132397413253784, + "learning_rate": 2.3802331903582434e-05, + "loss": 2.4707, + "step": 29171 + }, + { + "epoch": 2.642929944961609, + "grad_norm": 1.0178104639053345, + "learning_rate": 2.3796290702591675e-05, + "loss": 2.5614, + "step": 29172 + }, + { + "epoch": 2.643020543136056, + "grad_norm": 0.9573476910591125, + "learning_rate": 2.379024950160092e-05, + "loss": 2.4328, + "step": 29173 + }, + { + "epoch": 2.6431111413105026, + "grad_norm": 0.9862550497055054, + "learning_rate": 2.378420830061016e-05, + "loss": 2.7321, + "step": 29174 + }, + { + "epoch": 2.643201739484949, + "grad_norm": 1.003406286239624, + "learning_rate": 2.3778167099619405e-05, + "loss": 2.8613, + "step": 29175 + }, + { + "epoch": 2.643292337659396, + "grad_norm": 1.001921534538269, + "learning_rate": 2.377212589862865e-05, + "loss": 2.6008, + "step": 29176 + }, + { + "epoch": 2.643382935833843, + "grad_norm": 0.9666783809661865, + "learning_rate": 2.3766084697637893e-05, + "loss": 2.3275, + "step": 29177 + }, + { + "epoch": 2.6434735340082898, + "grad_norm": 1.1462781429290771, + "learning_rate": 2.3760043496647137e-05, + "loss": 2.4539, + "step": 29178 + }, + { + "epoch": 2.6435641321827363, + "grad_norm": 0.8990371227264404, + "learning_rate": 2.3754002295656378e-05, + "loss": 2.4582, + "step": 29179 + }, + { + "epoch": 2.6436547303571833, + "grad_norm": 1.0181430578231812, + "learning_rate": 2.3747961094665622e-05, + "loss": 2.5621, + "step": 29180 + }, + { + "epoch": 2.6437453285316304, + "grad_norm": 0.8591440916061401, + "learning_rate": 2.3741919893674863e-05, + "loss": 1.9926, + "step": 29181 + }, + { + "epoch": 2.643835926706077, + "grad_norm": 0.953795850276947, + "learning_rate": 2.3735878692684107e-05, + "loss": 2.3467, + "step": 29182 + }, + { + "epoch": 2.6439265248805235, + "grad_norm": 1.095398187637329, + "learning_rate": 2.3729837491693348e-05, + "loss": 2.6156, + "step": 29183 + }, + { + "epoch": 2.6440171230549705, + "grad_norm": 0.9560208916664124, + "learning_rate": 2.3723796290702592e-05, + "loss": 2.57, + "step": 29184 + }, + { + "epoch": 2.644107721229417, + "grad_norm": 1.0610707998275757, + "learning_rate": 2.3717755089711837e-05, + "loss": 2.6875, + "step": 29185 + }, + { + "epoch": 2.644198319403864, + "grad_norm": 0.9945278167724609, + "learning_rate": 2.371171388872108e-05, + "loss": 2.8321, + "step": 29186 + }, + { + "epoch": 2.6442889175783106, + "grad_norm": 1.012621521949768, + "learning_rate": 2.370567268773032e-05, + "loss": 2.6549, + "step": 29187 + }, + { + "epoch": 2.6443795157527576, + "grad_norm": 1.0321840047836304, + "learning_rate": 2.3699631486739566e-05, + "loss": 2.5927, + "step": 29188 + }, + { + "epoch": 2.644470113927204, + "grad_norm": 1.0087605714797974, + "learning_rate": 2.3693590285748807e-05, + "loss": 2.5144, + "step": 29189 + }, + { + "epoch": 2.6445607121016512, + "grad_norm": 0.7314065098762512, + "learning_rate": 2.368754908475805e-05, + "loss": 1.1897, + "step": 29190 + }, + { + "epoch": 2.644651310276098, + "grad_norm": 0.9101048111915588, + "learning_rate": 2.3681507883767295e-05, + "loss": 2.5593, + "step": 29191 + }, + { + "epoch": 2.644741908450545, + "grad_norm": 1.0022263526916504, + "learning_rate": 2.3675466682776536e-05, + "loss": 2.3438, + "step": 29192 + }, + { + "epoch": 2.6448325066249914, + "grad_norm": 0.9430140852928162, + "learning_rate": 2.366942548178578e-05, + "loss": 2.607, + "step": 29193 + }, + { + "epoch": 2.6449231047994384, + "grad_norm": 1.1711559295654297, + "learning_rate": 2.3663384280795024e-05, + "loss": 2.4156, + "step": 29194 + }, + { + "epoch": 2.645013702973885, + "grad_norm": 1.0635015964508057, + "learning_rate": 2.365734307980427e-05, + "loss": 2.3725, + "step": 29195 + }, + { + "epoch": 2.645104301148332, + "grad_norm": 1.0146335363388062, + "learning_rate": 2.365130187881351e-05, + "loss": 2.6354, + "step": 29196 + }, + { + "epoch": 2.6451948993227785, + "grad_norm": 1.0256117582321167, + "learning_rate": 2.3645260677822754e-05, + "loss": 2.6706, + "step": 29197 + }, + { + "epoch": 2.6452854974972255, + "grad_norm": 1.08171808719635, + "learning_rate": 2.3639219476831994e-05, + "loss": 2.5845, + "step": 29198 + }, + { + "epoch": 2.645376095671672, + "grad_norm": 0.9431012272834778, + "learning_rate": 2.363317827584124e-05, + "loss": 2.423, + "step": 29199 + }, + { + "epoch": 2.645466693846119, + "grad_norm": 1.063209891319275, + "learning_rate": 2.362713707485048e-05, + "loss": 2.637, + "step": 29200 + }, + { + "epoch": 2.6455572920205657, + "grad_norm": 1.0564476251602173, + "learning_rate": 2.3621095873859724e-05, + "loss": 2.6017, + "step": 29201 + }, + { + "epoch": 2.6456478901950127, + "grad_norm": 0.946756899356842, + "learning_rate": 2.3615054672868968e-05, + "loss": 2.3728, + "step": 29202 + }, + { + "epoch": 2.6457384883694592, + "grad_norm": 1.0867502689361572, + "learning_rate": 2.3609013471878212e-05, + "loss": 2.4257, + "step": 29203 + }, + { + "epoch": 2.6458290865439063, + "grad_norm": 1.0228646993637085, + "learning_rate": 2.3602972270887453e-05, + "loss": 2.6415, + "step": 29204 + }, + { + "epoch": 2.645919684718353, + "grad_norm": 0.9434329867362976, + "learning_rate": 2.3596931069896697e-05, + "loss": 2.5688, + "step": 29205 + }, + { + "epoch": 2.6460102828928, + "grad_norm": 1.0705217123031616, + "learning_rate": 2.359088986890594e-05, + "loss": 2.553, + "step": 29206 + }, + { + "epoch": 2.6461008810672464, + "grad_norm": 0.9811322689056396, + "learning_rate": 2.3584848667915182e-05, + "loss": 2.7397, + "step": 29207 + }, + { + "epoch": 2.6461914792416934, + "grad_norm": 1.0384652614593506, + "learning_rate": 2.3578807466924426e-05, + "loss": 2.3552, + "step": 29208 + }, + { + "epoch": 2.64628207741614, + "grad_norm": 1.1221954822540283, + "learning_rate": 2.3572766265933667e-05, + "loss": 2.6976, + "step": 29209 + }, + { + "epoch": 2.646372675590587, + "grad_norm": 0.9834927916526794, + "learning_rate": 2.356672506494291e-05, + "loss": 2.5618, + "step": 29210 + }, + { + "epoch": 2.6464632737650335, + "grad_norm": 0.9557576179504395, + "learning_rate": 2.3560683863952156e-05, + "loss": 2.6969, + "step": 29211 + }, + { + "epoch": 2.6465538719394806, + "grad_norm": 0.9859456419944763, + "learning_rate": 2.35546426629614e-05, + "loss": 2.8881, + "step": 29212 + }, + { + "epoch": 2.646644470113927, + "grad_norm": 1.0329509973526, + "learning_rate": 2.354860146197064e-05, + "loss": 2.9532, + "step": 29213 + }, + { + "epoch": 2.646735068288374, + "grad_norm": 1.0573872327804565, + "learning_rate": 2.3542560260979885e-05, + "loss": 2.7929, + "step": 29214 + }, + { + "epoch": 2.6468256664628207, + "grad_norm": 0.9823874235153198, + "learning_rate": 2.3536519059989126e-05, + "loss": 2.7053, + "step": 29215 + }, + { + "epoch": 2.6469162646372677, + "grad_norm": 1.0072518587112427, + "learning_rate": 2.353047785899837e-05, + "loss": 2.671, + "step": 29216 + }, + { + "epoch": 2.6470068628117143, + "grad_norm": 0.9699189066886902, + "learning_rate": 2.352443665800761e-05, + "loss": 2.8571, + "step": 29217 + }, + { + "epoch": 2.6470974609861613, + "grad_norm": 0.9773995280265808, + "learning_rate": 2.3518395457016855e-05, + "loss": 2.6886, + "step": 29218 + }, + { + "epoch": 2.647188059160608, + "grad_norm": 1.0306562185287476, + "learning_rate": 2.35123542560261e-05, + "loss": 2.624, + "step": 29219 + }, + { + "epoch": 2.647278657335055, + "grad_norm": 0.8898968696594238, + "learning_rate": 2.3506313055035343e-05, + "loss": 2.0498, + "step": 29220 + }, + { + "epoch": 2.6473692555095014, + "grad_norm": 0.9985901117324829, + "learning_rate": 2.3500271854044588e-05, + "loss": 2.4258, + "step": 29221 + }, + { + "epoch": 2.6474598536839484, + "grad_norm": 0.9976523518562317, + "learning_rate": 2.349423065305383e-05, + "loss": 2.8019, + "step": 29222 + }, + { + "epoch": 2.647550451858395, + "grad_norm": 0.8565361499786377, + "learning_rate": 2.3488189452063073e-05, + "loss": 2.0595, + "step": 29223 + }, + { + "epoch": 2.647641050032842, + "grad_norm": 1.0237869024276733, + "learning_rate": 2.3482148251072314e-05, + "loss": 2.5119, + "step": 29224 + }, + { + "epoch": 2.6477316482072886, + "grad_norm": 0.9893431067466736, + "learning_rate": 2.3476107050081558e-05, + "loss": 2.5699, + "step": 29225 + }, + { + "epoch": 2.647822246381735, + "grad_norm": 1.0009210109710693, + "learning_rate": 2.34700658490908e-05, + "loss": 2.6948, + "step": 29226 + }, + { + "epoch": 2.647912844556182, + "grad_norm": 1.1108627319335938, + "learning_rate": 2.3464024648100043e-05, + "loss": 2.5682, + "step": 29227 + }, + { + "epoch": 2.648003442730629, + "grad_norm": 1.178315281867981, + "learning_rate": 2.3457983447109287e-05, + "loss": 2.4112, + "step": 29228 + }, + { + "epoch": 2.6480940409050757, + "grad_norm": 1.1196918487548828, + "learning_rate": 2.345194224611853e-05, + "loss": 2.5613, + "step": 29229 + }, + { + "epoch": 2.6481846390795223, + "grad_norm": 1.0336201190948486, + "learning_rate": 2.3445901045127772e-05, + "loss": 2.7443, + "step": 29230 + }, + { + "epoch": 2.6482752372539693, + "grad_norm": 1.10301673412323, + "learning_rate": 2.3439859844137016e-05, + "loss": 2.6062, + "step": 29231 + }, + { + "epoch": 2.6483658354284163, + "grad_norm": 0.9788155555725098, + "learning_rate": 2.3433818643146257e-05, + "loss": 2.4804, + "step": 29232 + }, + { + "epoch": 2.648456433602863, + "grad_norm": 0.8767409920692444, + "learning_rate": 2.34277774421555e-05, + "loss": 2.0157, + "step": 29233 + }, + { + "epoch": 2.6485470317773094, + "grad_norm": 1.0194573402404785, + "learning_rate": 2.3421736241164742e-05, + "loss": 2.5317, + "step": 29234 + }, + { + "epoch": 2.6486376299517564, + "grad_norm": 0.9218549728393555, + "learning_rate": 2.3415695040173986e-05, + "loss": 1.8881, + "step": 29235 + }, + { + "epoch": 2.6487282281262035, + "grad_norm": 1.047335147857666, + "learning_rate": 2.340965383918323e-05, + "loss": 3.0325, + "step": 29236 + }, + { + "epoch": 2.64881882630065, + "grad_norm": 1.0003198385238647, + "learning_rate": 2.3403612638192475e-05, + "loss": 2.412, + "step": 29237 + }, + { + "epoch": 2.6489094244750966, + "grad_norm": 1.0652647018432617, + "learning_rate": 2.339757143720172e-05, + "loss": 2.6837, + "step": 29238 + }, + { + "epoch": 2.6490000226495436, + "grad_norm": 1.0405133962631226, + "learning_rate": 2.339153023621096e-05, + "loss": 3.0041, + "step": 29239 + }, + { + "epoch": 2.6490906208239906, + "grad_norm": 0.9910460710525513, + "learning_rate": 2.3385489035220204e-05, + "loss": 2.3881, + "step": 29240 + }, + { + "epoch": 2.649181218998437, + "grad_norm": 0.9627813100814819, + "learning_rate": 2.3379447834229445e-05, + "loss": 2.4188, + "step": 29241 + }, + { + "epoch": 2.6492718171728837, + "grad_norm": 1.0675419569015503, + "learning_rate": 2.337340663323869e-05, + "loss": 2.4851, + "step": 29242 + }, + { + "epoch": 2.6493624153473307, + "grad_norm": 0.9921587705612183, + "learning_rate": 2.336736543224793e-05, + "loss": 2.5337, + "step": 29243 + }, + { + "epoch": 2.6494530135217778, + "grad_norm": 0.9912614226341248, + "learning_rate": 2.3361324231257174e-05, + "loss": 2.7233, + "step": 29244 + }, + { + "epoch": 2.6495436116962243, + "grad_norm": 0.9736371040344238, + "learning_rate": 2.335528303026642e-05, + "loss": 2.6189, + "step": 29245 + }, + { + "epoch": 2.649634209870671, + "grad_norm": 1.015974760055542, + "learning_rate": 2.3349241829275663e-05, + "loss": 2.4408, + "step": 29246 + }, + { + "epoch": 2.649724808045118, + "grad_norm": 1.0700922012329102, + "learning_rate": 2.3343200628284903e-05, + "loss": 2.7574, + "step": 29247 + }, + { + "epoch": 2.649815406219565, + "grad_norm": 1.0433896780014038, + "learning_rate": 2.3337159427294148e-05, + "loss": 2.6867, + "step": 29248 + }, + { + "epoch": 2.6499060043940115, + "grad_norm": 1.009939432144165, + "learning_rate": 2.333111822630339e-05, + "loss": 2.7154, + "step": 29249 + }, + { + "epoch": 2.649996602568458, + "grad_norm": 1.0345760583877563, + "learning_rate": 2.3325077025312633e-05, + "loss": 2.6079, + "step": 29250 + }, + { + "epoch": 2.650087200742905, + "grad_norm": 1.1550348997116089, + "learning_rate": 2.3319035824321877e-05, + "loss": 2.3311, + "step": 29251 + }, + { + "epoch": 2.650177798917352, + "grad_norm": 1.0754231214523315, + "learning_rate": 2.3312994623331118e-05, + "loss": 2.4874, + "step": 29252 + }, + { + "epoch": 2.6502683970917986, + "grad_norm": 1.0654608011245728, + "learning_rate": 2.3306953422340362e-05, + "loss": 2.6264, + "step": 29253 + }, + { + "epoch": 2.650358995266245, + "grad_norm": 0.9080896973609924, + "learning_rate": 2.3300912221349606e-05, + "loss": 1.8998, + "step": 29254 + }, + { + "epoch": 2.650449593440692, + "grad_norm": 1.0235165357589722, + "learning_rate": 2.329487102035885e-05, + "loss": 2.7947, + "step": 29255 + }, + { + "epoch": 2.650540191615139, + "grad_norm": 1.0840421915054321, + "learning_rate": 2.328882981936809e-05, + "loss": 2.502, + "step": 29256 + }, + { + "epoch": 2.6506307897895858, + "grad_norm": 1.0297975540161133, + "learning_rate": 2.3282788618377335e-05, + "loss": 2.7253, + "step": 29257 + }, + { + "epoch": 2.6507213879640323, + "grad_norm": 1.0139604806900024, + "learning_rate": 2.3276747417386576e-05, + "loss": 2.7283, + "step": 29258 + }, + { + "epoch": 2.6508119861384793, + "grad_norm": 1.0112252235412598, + "learning_rate": 2.327070621639582e-05, + "loss": 2.7193, + "step": 29259 + }, + { + "epoch": 2.6509025843129264, + "grad_norm": 0.9532920122146606, + "learning_rate": 2.326466501540506e-05, + "loss": 2.278, + "step": 29260 + }, + { + "epoch": 2.650993182487373, + "grad_norm": 0.9674574136734009, + "learning_rate": 2.3258623814414306e-05, + "loss": 2.5315, + "step": 29261 + }, + { + "epoch": 2.6510837806618195, + "grad_norm": 1.031347393989563, + "learning_rate": 2.325258261342355e-05, + "loss": 2.5836, + "step": 29262 + }, + { + "epoch": 2.6511743788362665, + "grad_norm": 0.9981238842010498, + "learning_rate": 2.3246541412432794e-05, + "loss": 2.5124, + "step": 29263 + }, + { + "epoch": 2.6512649770107135, + "grad_norm": 1.069319486618042, + "learning_rate": 2.3240500211442035e-05, + "loss": 2.491, + "step": 29264 + }, + { + "epoch": 2.65135557518516, + "grad_norm": 0.9248358011245728, + "learning_rate": 2.323445901045128e-05, + "loss": 2.4391, + "step": 29265 + }, + { + "epoch": 2.6514461733596066, + "grad_norm": 1.008965253829956, + "learning_rate": 2.3228417809460523e-05, + "loss": 2.6226, + "step": 29266 + }, + { + "epoch": 2.6515367715340536, + "grad_norm": 0.9943681955337524, + "learning_rate": 2.3222376608469764e-05, + "loss": 2.4956, + "step": 29267 + }, + { + "epoch": 2.6516273697085, + "grad_norm": 1.0506349802017212, + "learning_rate": 2.321633540747901e-05, + "loss": 2.6299, + "step": 29268 + }, + { + "epoch": 2.6517179678829472, + "grad_norm": 1.0277515649795532, + "learning_rate": 2.321029420648825e-05, + "loss": 2.8579, + "step": 29269 + }, + { + "epoch": 2.651808566057394, + "grad_norm": 0.9889166355133057, + "learning_rate": 2.3204253005497493e-05, + "loss": 2.5928, + "step": 29270 + }, + { + "epoch": 2.651899164231841, + "grad_norm": 1.003477931022644, + "learning_rate": 2.3198211804506738e-05, + "loss": 2.6887, + "step": 29271 + }, + { + "epoch": 2.6519897624062874, + "grad_norm": 1.0282024145126343, + "learning_rate": 2.3192170603515982e-05, + "loss": 2.8576, + "step": 29272 + }, + { + "epoch": 2.6520803605807344, + "grad_norm": 0.869127094745636, + "learning_rate": 2.3186129402525223e-05, + "loss": 2.2167, + "step": 29273 + }, + { + "epoch": 2.652170958755181, + "grad_norm": 1.1121983528137207, + "learning_rate": 2.3180088201534467e-05, + "loss": 2.7396, + "step": 29274 + }, + { + "epoch": 2.652261556929628, + "grad_norm": 1.0366439819335938, + "learning_rate": 2.3174047000543708e-05, + "loss": 2.514, + "step": 29275 + }, + { + "epoch": 2.6523521551040745, + "grad_norm": 1.0720665454864502, + "learning_rate": 2.3168005799552952e-05, + "loss": 2.6737, + "step": 29276 + }, + { + "epoch": 2.6524427532785215, + "grad_norm": 1.0090599060058594, + "learning_rate": 2.3161964598562193e-05, + "loss": 2.5775, + "step": 29277 + }, + { + "epoch": 2.652533351452968, + "grad_norm": 0.9425678253173828, + "learning_rate": 2.3155923397571437e-05, + "loss": 2.6036, + "step": 29278 + }, + { + "epoch": 2.652623949627415, + "grad_norm": 1.1951829195022583, + "learning_rate": 2.314988219658068e-05, + "loss": 2.7296, + "step": 29279 + }, + { + "epoch": 2.6527145478018617, + "grad_norm": 0.9572477340698242, + "learning_rate": 2.3143840995589925e-05, + "loss": 2.0983, + "step": 29280 + }, + { + "epoch": 2.6528051459763087, + "grad_norm": 0.9990147948265076, + "learning_rate": 2.313779979459917e-05, + "loss": 2.4775, + "step": 29281 + }, + { + "epoch": 2.6528957441507552, + "grad_norm": 1.0217524766921997, + "learning_rate": 2.313175859360841e-05, + "loss": 2.4934, + "step": 29282 + }, + { + "epoch": 2.6529863423252023, + "grad_norm": 1.098067045211792, + "learning_rate": 2.3125717392617655e-05, + "loss": 2.482, + "step": 29283 + }, + { + "epoch": 2.653076940499649, + "grad_norm": 0.998786449432373, + "learning_rate": 2.3119676191626895e-05, + "loss": 2.5699, + "step": 29284 + }, + { + "epoch": 2.653167538674096, + "grad_norm": 1.1695352792739868, + "learning_rate": 2.311363499063614e-05, + "loss": 2.6921, + "step": 29285 + }, + { + "epoch": 2.6532581368485424, + "grad_norm": 1.0088472366333008, + "learning_rate": 2.310759378964538e-05, + "loss": 2.5266, + "step": 29286 + }, + { + "epoch": 2.6533487350229894, + "grad_norm": 1.0896817445755005, + "learning_rate": 2.3101552588654625e-05, + "loss": 2.4161, + "step": 29287 + }, + { + "epoch": 2.653439333197436, + "grad_norm": 1.0057997703552246, + "learning_rate": 2.309551138766387e-05, + "loss": 2.6515, + "step": 29288 + }, + { + "epoch": 2.653529931371883, + "grad_norm": 0.9986514449119568, + "learning_rate": 2.3089470186673113e-05, + "loss": 2.7259, + "step": 29289 + }, + { + "epoch": 2.6536205295463295, + "grad_norm": 0.9734776020050049, + "learning_rate": 2.3083428985682354e-05, + "loss": 2.6681, + "step": 29290 + }, + { + "epoch": 2.6537111277207766, + "grad_norm": 1.02590012550354, + "learning_rate": 2.3077387784691598e-05, + "loss": 2.3725, + "step": 29291 + }, + { + "epoch": 2.653801725895223, + "grad_norm": 1.0604599714279175, + "learning_rate": 2.307134658370084e-05, + "loss": 2.5064, + "step": 29292 + }, + { + "epoch": 2.65389232406967, + "grad_norm": 1.0164927244186401, + "learning_rate": 2.3065305382710083e-05, + "loss": 2.4191, + "step": 29293 + }, + { + "epoch": 2.6539829222441167, + "grad_norm": 1.1058708429336548, + "learning_rate": 2.3059264181719324e-05, + "loss": 2.7917, + "step": 29294 + }, + { + "epoch": 2.6540735204185637, + "grad_norm": 1.0031447410583496, + "learning_rate": 2.305322298072857e-05, + "loss": 2.5529, + "step": 29295 + }, + { + "epoch": 2.6541641185930103, + "grad_norm": 1.1295990943908691, + "learning_rate": 2.3047181779737813e-05, + "loss": 2.5989, + "step": 29296 + }, + { + "epoch": 2.6542547167674573, + "grad_norm": 1.0041307210922241, + "learning_rate": 2.3041140578747057e-05, + "loss": 2.542, + "step": 29297 + }, + { + "epoch": 2.654345314941904, + "grad_norm": 0.9525985717773438, + "learning_rate": 2.30350993777563e-05, + "loss": 2.6205, + "step": 29298 + }, + { + "epoch": 2.654435913116351, + "grad_norm": 1.1616675853729248, + "learning_rate": 2.3029058176765542e-05, + "loss": 2.5502, + "step": 29299 + }, + { + "epoch": 2.6545265112907974, + "grad_norm": 0.9733635783195496, + "learning_rate": 2.3023016975774786e-05, + "loss": 2.4431, + "step": 29300 + }, + { + "epoch": 2.6546171094652444, + "grad_norm": 1.01566481590271, + "learning_rate": 2.3016975774784027e-05, + "loss": 2.7207, + "step": 29301 + }, + { + "epoch": 2.654707707639691, + "grad_norm": 0.8174760937690735, + "learning_rate": 2.301093457379327e-05, + "loss": 1.9452, + "step": 29302 + }, + { + "epoch": 2.654798305814138, + "grad_norm": 1.0769140720367432, + "learning_rate": 2.3004893372802512e-05, + "loss": 2.7284, + "step": 29303 + }, + { + "epoch": 2.6548889039885846, + "grad_norm": 0.9309031963348389, + "learning_rate": 2.2998852171811756e-05, + "loss": 1.9662, + "step": 29304 + }, + { + "epoch": 2.6549795021630316, + "grad_norm": 0.9799239039421082, + "learning_rate": 2.2992810970821e-05, + "loss": 2.4745, + "step": 29305 + }, + { + "epoch": 2.655070100337478, + "grad_norm": 1.0427978038787842, + "learning_rate": 2.2986769769830245e-05, + "loss": 2.4718, + "step": 29306 + }, + { + "epoch": 2.655160698511925, + "grad_norm": 0.9342843890190125, + "learning_rate": 2.2980728568839485e-05, + "loss": 2.6011, + "step": 29307 + }, + { + "epoch": 2.6552512966863717, + "grad_norm": 1.0159324407577515, + "learning_rate": 2.297468736784873e-05, + "loss": 2.7507, + "step": 29308 + }, + { + "epoch": 2.6553418948608183, + "grad_norm": 1.0129808187484741, + "learning_rate": 2.296864616685797e-05, + "loss": 2.601, + "step": 29309 + }, + { + "epoch": 2.6554324930352653, + "grad_norm": 0.95185786485672, + "learning_rate": 2.2962604965867215e-05, + "loss": 2.7565, + "step": 29310 + }, + { + "epoch": 2.6555230912097123, + "grad_norm": 0.8762996196746826, + "learning_rate": 2.295656376487646e-05, + "loss": 2.0952, + "step": 29311 + }, + { + "epoch": 2.655613689384159, + "grad_norm": 1.0078428983688354, + "learning_rate": 2.29505225638857e-05, + "loss": 2.6454, + "step": 29312 + }, + { + "epoch": 2.6557042875586054, + "grad_norm": 1.001407265663147, + "learning_rate": 2.2944481362894947e-05, + "loss": 2.4967, + "step": 29313 + }, + { + "epoch": 2.6557948857330524, + "grad_norm": 0.9995973706245422, + "learning_rate": 2.2938440161904188e-05, + "loss": 2.7837, + "step": 29314 + }, + { + "epoch": 2.6558854839074995, + "grad_norm": 1.0012542009353638, + "learning_rate": 2.2932398960913432e-05, + "loss": 2.6302, + "step": 29315 + }, + { + "epoch": 2.655976082081946, + "grad_norm": 0.943308413028717, + "learning_rate": 2.2926357759922673e-05, + "loss": 2.0496, + "step": 29316 + }, + { + "epoch": 2.6560666802563926, + "grad_norm": 0.880695104598999, + "learning_rate": 2.2920316558931917e-05, + "loss": 1.7144, + "step": 29317 + }, + { + "epoch": 2.6561572784308396, + "grad_norm": 0.9727616906166077, + "learning_rate": 2.2914275357941158e-05, + "loss": 2.4209, + "step": 29318 + }, + { + "epoch": 2.6562478766052866, + "grad_norm": 1.0761644840240479, + "learning_rate": 2.2908234156950402e-05, + "loss": 2.4743, + "step": 29319 + }, + { + "epoch": 2.656338474779733, + "grad_norm": 1.0024405717849731, + "learning_rate": 2.2902192955959643e-05, + "loss": 2.5202, + "step": 29320 + }, + { + "epoch": 2.6564290729541797, + "grad_norm": 1.0797654390335083, + "learning_rate": 2.289615175496889e-05, + "loss": 2.8848, + "step": 29321 + }, + { + "epoch": 2.6565196711286267, + "grad_norm": 0.9929060935974121, + "learning_rate": 2.2890110553978132e-05, + "loss": 2.7138, + "step": 29322 + }, + { + "epoch": 2.6566102693030738, + "grad_norm": 0.95534348487854, + "learning_rate": 2.2884069352987376e-05, + "loss": 2.4748, + "step": 29323 + }, + { + "epoch": 2.6567008674775203, + "grad_norm": 0.8048610091209412, + "learning_rate": 2.2878028151996617e-05, + "loss": 1.9697, + "step": 29324 + }, + { + "epoch": 2.656791465651967, + "grad_norm": 0.9878123998641968, + "learning_rate": 2.287198695100586e-05, + "loss": 2.5777, + "step": 29325 + }, + { + "epoch": 2.656882063826414, + "grad_norm": 1.0055601596832275, + "learning_rate": 2.2865945750015105e-05, + "loss": 2.901, + "step": 29326 + }, + { + "epoch": 2.656972662000861, + "grad_norm": 0.9365196824073792, + "learning_rate": 2.2859904549024346e-05, + "loss": 2.6554, + "step": 29327 + }, + { + "epoch": 2.6570632601753075, + "grad_norm": 1.0662089586257935, + "learning_rate": 2.285386334803359e-05, + "loss": 2.4954, + "step": 29328 + }, + { + "epoch": 2.657153858349754, + "grad_norm": 1.0378148555755615, + "learning_rate": 2.284782214704283e-05, + "loss": 2.9104, + "step": 29329 + }, + { + "epoch": 2.657244456524201, + "grad_norm": 0.99809730052948, + "learning_rate": 2.284178094605208e-05, + "loss": 2.684, + "step": 29330 + }, + { + "epoch": 2.657335054698648, + "grad_norm": 1.0437612533569336, + "learning_rate": 2.283573974506132e-05, + "loss": 2.7188, + "step": 29331 + }, + { + "epoch": 2.6574256528730946, + "grad_norm": 1.084702968597412, + "learning_rate": 2.2829698544070564e-05, + "loss": 2.4848, + "step": 29332 + }, + { + "epoch": 2.657516251047541, + "grad_norm": 0.9866193532943726, + "learning_rate": 2.2823657343079805e-05, + "loss": 2.8452, + "step": 29333 + }, + { + "epoch": 2.657606849221988, + "grad_norm": 0.9750553965568542, + "learning_rate": 2.281761614208905e-05, + "loss": 2.6646, + "step": 29334 + }, + { + "epoch": 2.657697447396435, + "grad_norm": 1.0240733623504639, + "learning_rate": 2.281157494109829e-05, + "loss": 2.6694, + "step": 29335 + }, + { + "epoch": 2.6577880455708818, + "grad_norm": 1.163097858428955, + "learning_rate": 2.2805533740107534e-05, + "loss": 2.6768, + "step": 29336 + }, + { + "epoch": 2.6578786437453283, + "grad_norm": 0.720166027545929, + "learning_rate": 2.2799492539116775e-05, + "loss": 1.3411, + "step": 29337 + }, + { + "epoch": 2.6579692419197753, + "grad_norm": 1.0049816370010376, + "learning_rate": 2.2793451338126022e-05, + "loss": 2.747, + "step": 29338 + }, + { + "epoch": 2.6580598400942224, + "grad_norm": 0.9745616316795349, + "learning_rate": 2.2787410137135266e-05, + "loss": 2.6956, + "step": 29339 + }, + { + "epoch": 2.658150438268669, + "grad_norm": 1.0711525678634644, + "learning_rate": 2.2781368936144507e-05, + "loss": 2.6283, + "step": 29340 + }, + { + "epoch": 2.6582410364431155, + "grad_norm": 1.1061971187591553, + "learning_rate": 2.277532773515375e-05, + "loss": 2.4798, + "step": 29341 + }, + { + "epoch": 2.6583316346175625, + "grad_norm": 1.0475245714187622, + "learning_rate": 2.2769286534162992e-05, + "loss": 2.7044, + "step": 29342 + }, + { + "epoch": 2.6584222327920095, + "grad_norm": 1.0968685150146484, + "learning_rate": 2.2763245333172237e-05, + "loss": 2.5858, + "step": 29343 + }, + { + "epoch": 2.658512830966456, + "grad_norm": 0.9686611890792847, + "learning_rate": 2.2757204132181477e-05, + "loss": 2.6912, + "step": 29344 + }, + { + "epoch": 2.6586034291409026, + "grad_norm": 1.0220223665237427, + "learning_rate": 2.275116293119072e-05, + "loss": 2.6762, + "step": 29345 + }, + { + "epoch": 2.6586940273153497, + "grad_norm": 0.9998987913131714, + "learning_rate": 2.2745121730199966e-05, + "loss": 2.6319, + "step": 29346 + }, + { + "epoch": 2.658784625489796, + "grad_norm": 1.0693881511688232, + "learning_rate": 2.273908052920921e-05, + "loss": 2.8921, + "step": 29347 + }, + { + "epoch": 2.6588752236642432, + "grad_norm": 1.0379716157913208, + "learning_rate": 2.273303932821845e-05, + "loss": 2.8341, + "step": 29348 + }, + { + "epoch": 2.65896582183869, + "grad_norm": 0.8601734638214111, + "learning_rate": 2.2726998127227695e-05, + "loss": 2.1161, + "step": 29349 + }, + { + "epoch": 2.659056420013137, + "grad_norm": 1.0071983337402344, + "learning_rate": 2.2720956926236936e-05, + "loss": 2.5076, + "step": 29350 + }, + { + "epoch": 2.6591470181875834, + "grad_norm": 1.2810428142547607, + "learning_rate": 2.271491572524618e-05, + "loss": 2.5834, + "step": 29351 + }, + { + "epoch": 2.6592376163620304, + "grad_norm": 0.9216343760490417, + "learning_rate": 2.270887452425542e-05, + "loss": 2.4047, + "step": 29352 + }, + { + "epoch": 2.659328214536477, + "grad_norm": 0.8503725528717041, + "learning_rate": 2.2702833323264665e-05, + "loss": 2.1237, + "step": 29353 + }, + { + "epoch": 2.659418812710924, + "grad_norm": 1.0565816164016724, + "learning_rate": 2.269679212227391e-05, + "loss": 2.7405, + "step": 29354 + }, + { + "epoch": 2.6595094108853705, + "grad_norm": 0.8079135417938232, + "learning_rate": 2.2690750921283154e-05, + "loss": 2.0491, + "step": 29355 + }, + { + "epoch": 2.6596000090598175, + "grad_norm": 1.0215144157409668, + "learning_rate": 2.2684709720292398e-05, + "loss": 2.6928, + "step": 29356 + }, + { + "epoch": 2.659690607234264, + "grad_norm": 1.008453369140625, + "learning_rate": 2.267866851930164e-05, + "loss": 2.4487, + "step": 29357 + }, + { + "epoch": 2.659781205408711, + "grad_norm": 0.952481210231781, + "learning_rate": 2.2672627318310883e-05, + "loss": 2.5139, + "step": 29358 + }, + { + "epoch": 2.6598718035831577, + "grad_norm": 1.0261725187301636, + "learning_rate": 2.2666586117320124e-05, + "loss": 2.7728, + "step": 29359 + }, + { + "epoch": 2.6599624017576047, + "grad_norm": 1.133545994758606, + "learning_rate": 2.2660544916329368e-05, + "loss": 2.6565, + "step": 29360 + }, + { + "epoch": 2.6600529999320512, + "grad_norm": 1.042002558708191, + "learning_rate": 2.265450371533861e-05, + "loss": 2.6948, + "step": 29361 + }, + { + "epoch": 2.6601435981064983, + "grad_norm": 0.993970513343811, + "learning_rate": 2.2648462514347853e-05, + "loss": 2.5262, + "step": 29362 + }, + { + "epoch": 2.660234196280945, + "grad_norm": 0.930397093296051, + "learning_rate": 2.2642421313357097e-05, + "loss": 2.69, + "step": 29363 + }, + { + "epoch": 2.660324794455392, + "grad_norm": 1.0932745933532715, + "learning_rate": 2.263638011236634e-05, + "loss": 2.4616, + "step": 29364 + }, + { + "epoch": 2.6604153926298384, + "grad_norm": 0.7983771562576294, + "learning_rate": 2.2630338911375582e-05, + "loss": 1.0967, + "step": 29365 + }, + { + "epoch": 2.6605059908042854, + "grad_norm": 0.8344699144363403, + "learning_rate": 2.2624297710384826e-05, + "loss": 2.0465, + "step": 29366 + }, + { + "epoch": 2.660596588978732, + "grad_norm": 1.0203462839126587, + "learning_rate": 2.2618256509394067e-05, + "loss": 2.689, + "step": 29367 + }, + { + "epoch": 2.660687187153179, + "grad_norm": 0.928325355052948, + "learning_rate": 2.261221530840331e-05, + "loss": 2.491, + "step": 29368 + }, + { + "epoch": 2.6607777853276255, + "grad_norm": 1.0351482629776, + "learning_rate": 2.2606174107412556e-05, + "loss": 2.6013, + "step": 29369 + }, + { + "epoch": 2.6608683835020726, + "grad_norm": 0.8773258924484253, + "learning_rate": 2.2600132906421797e-05, + "loss": 1.8938, + "step": 29370 + }, + { + "epoch": 2.660958981676519, + "grad_norm": 0.9734537601470947, + "learning_rate": 2.259409170543104e-05, + "loss": 2.4215, + "step": 29371 + }, + { + "epoch": 2.661049579850966, + "grad_norm": 1.0665363073349, + "learning_rate": 2.2588050504440285e-05, + "loss": 2.2549, + "step": 29372 + }, + { + "epoch": 2.6611401780254127, + "grad_norm": 0.9724802374839783, + "learning_rate": 2.258200930344953e-05, + "loss": 2.4895, + "step": 29373 + }, + { + "epoch": 2.6612307761998597, + "grad_norm": 1.010487675666809, + "learning_rate": 2.257596810245877e-05, + "loss": 2.6308, + "step": 29374 + }, + { + "epoch": 2.6613213743743063, + "grad_norm": 0.9635859131813049, + "learning_rate": 2.2569926901468014e-05, + "loss": 2.4983, + "step": 29375 + }, + { + "epoch": 2.6614119725487533, + "grad_norm": 1.0316234827041626, + "learning_rate": 2.2563885700477255e-05, + "loss": 2.6355, + "step": 29376 + }, + { + "epoch": 2.6615025707232, + "grad_norm": 1.0528830289840698, + "learning_rate": 2.25578444994865e-05, + "loss": 2.763, + "step": 29377 + }, + { + "epoch": 2.661593168897647, + "grad_norm": 0.975108802318573, + "learning_rate": 2.255180329849574e-05, + "loss": 2.5371, + "step": 29378 + }, + { + "epoch": 2.6616837670720934, + "grad_norm": 1.0940818786621094, + "learning_rate": 2.2545762097504984e-05, + "loss": 2.3564, + "step": 29379 + }, + { + "epoch": 2.6617743652465404, + "grad_norm": 1.075232744216919, + "learning_rate": 2.253972089651423e-05, + "loss": 2.522, + "step": 29380 + }, + { + "epoch": 2.661864963420987, + "grad_norm": 1.0048253536224365, + "learning_rate": 2.2533679695523473e-05, + "loss": 2.6464, + "step": 29381 + }, + { + "epoch": 2.661955561595434, + "grad_norm": 1.0008833408355713, + "learning_rate": 2.2527638494532714e-05, + "loss": 2.6627, + "step": 29382 + }, + { + "epoch": 2.6620461597698806, + "grad_norm": 0.9973824620246887, + "learning_rate": 2.2521597293541958e-05, + "loss": 2.4627, + "step": 29383 + }, + { + "epoch": 2.6621367579443276, + "grad_norm": 1.0474896430969238, + "learning_rate": 2.2515556092551202e-05, + "loss": 2.6523, + "step": 29384 + }, + { + "epoch": 2.662227356118774, + "grad_norm": 1.0957647562026978, + "learning_rate": 2.2509514891560443e-05, + "loss": 2.5556, + "step": 29385 + }, + { + "epoch": 2.662317954293221, + "grad_norm": 1.174452781677246, + "learning_rate": 2.2503473690569687e-05, + "loss": 2.5584, + "step": 29386 + }, + { + "epoch": 2.6624085524676677, + "grad_norm": 1.1595592498779297, + "learning_rate": 2.2497432489578928e-05, + "loss": 2.4781, + "step": 29387 + }, + { + "epoch": 2.6624991506421143, + "grad_norm": 0.9803510904312134, + "learning_rate": 2.2491391288588172e-05, + "loss": 2.6744, + "step": 29388 + }, + { + "epoch": 2.6625897488165613, + "grad_norm": 1.1508084535598755, + "learning_rate": 2.2485350087597416e-05, + "loss": 2.41, + "step": 29389 + }, + { + "epoch": 2.6626803469910083, + "grad_norm": 0.8973304629325867, + "learning_rate": 2.247930888660666e-05, + "loss": 2.1165, + "step": 29390 + }, + { + "epoch": 2.662770945165455, + "grad_norm": 0.8667932748794556, + "learning_rate": 2.24732676856159e-05, + "loss": 2.0498, + "step": 29391 + }, + { + "epoch": 2.6628615433399014, + "grad_norm": 0.9943073987960815, + "learning_rate": 2.2467226484625146e-05, + "loss": 2.6103, + "step": 29392 + }, + { + "epoch": 2.6629521415143484, + "grad_norm": 1.0613583326339722, + "learning_rate": 2.2461185283634386e-05, + "loss": 3.0024, + "step": 29393 + }, + { + "epoch": 2.6630427396887955, + "grad_norm": 0.9945619106292725, + "learning_rate": 2.245514408264363e-05, + "loss": 2.7379, + "step": 29394 + }, + { + "epoch": 2.663133337863242, + "grad_norm": 1.009132981300354, + "learning_rate": 2.244910288165287e-05, + "loss": 2.5576, + "step": 29395 + }, + { + "epoch": 2.6632239360376886, + "grad_norm": 0.9882352948188782, + "learning_rate": 2.2443061680662116e-05, + "loss": 2.574, + "step": 29396 + }, + { + "epoch": 2.6633145342121356, + "grad_norm": 1.0529775619506836, + "learning_rate": 2.243702047967136e-05, + "loss": 2.7827, + "step": 29397 + }, + { + "epoch": 2.6634051323865826, + "grad_norm": 1.0923819541931152, + "learning_rate": 2.2430979278680604e-05, + "loss": 2.5433, + "step": 29398 + }, + { + "epoch": 2.663495730561029, + "grad_norm": 1.110962986946106, + "learning_rate": 2.242493807768985e-05, + "loss": 2.8617, + "step": 29399 + }, + { + "epoch": 2.6635863287354757, + "grad_norm": 0.9550348520278931, + "learning_rate": 2.241889687669909e-05, + "loss": 2.3465, + "step": 29400 + }, + { + "epoch": 2.6636769269099227, + "grad_norm": 0.943656861782074, + "learning_rate": 2.2412855675708333e-05, + "loss": 2.7468, + "step": 29401 + }, + { + "epoch": 2.6637675250843698, + "grad_norm": 1.002632737159729, + "learning_rate": 2.2406814474717574e-05, + "loss": 2.9625, + "step": 29402 + }, + { + "epoch": 2.6638581232588163, + "grad_norm": 1.0239802598953247, + "learning_rate": 2.240077327372682e-05, + "loss": 2.7586, + "step": 29403 + }, + { + "epoch": 2.663948721433263, + "grad_norm": 0.9635593295097351, + "learning_rate": 2.239473207273606e-05, + "loss": 2.6165, + "step": 29404 + }, + { + "epoch": 2.66403931960771, + "grad_norm": 1.033217430114746, + "learning_rate": 2.2388690871745304e-05, + "loss": 2.6343, + "step": 29405 + }, + { + "epoch": 2.664129917782157, + "grad_norm": 0.9951211810112, + "learning_rate": 2.2382649670754548e-05, + "loss": 2.3041, + "step": 29406 + }, + { + "epoch": 2.6642205159566035, + "grad_norm": 0.908699095249176, + "learning_rate": 2.2376608469763792e-05, + "loss": 1.8402, + "step": 29407 + }, + { + "epoch": 2.66431111413105, + "grad_norm": 1.032052993774414, + "learning_rate": 2.2370567268773033e-05, + "loss": 2.7428, + "step": 29408 + }, + { + "epoch": 2.664401712305497, + "grad_norm": 0.9931343793869019, + "learning_rate": 2.2364526067782277e-05, + "loss": 2.5295, + "step": 29409 + }, + { + "epoch": 2.664492310479944, + "grad_norm": 1.0433017015457153, + "learning_rate": 2.2358484866791518e-05, + "loss": 3.0046, + "step": 29410 + }, + { + "epoch": 2.6645829086543906, + "grad_norm": 0.9925936460494995, + "learning_rate": 2.2352443665800762e-05, + "loss": 2.7767, + "step": 29411 + }, + { + "epoch": 2.664673506828837, + "grad_norm": 0.952045202255249, + "learning_rate": 2.2346402464810003e-05, + "loss": 2.4889, + "step": 29412 + }, + { + "epoch": 2.664764105003284, + "grad_norm": 1.0334841012954712, + "learning_rate": 2.2340361263819247e-05, + "loss": 2.4309, + "step": 29413 + }, + { + "epoch": 2.664854703177731, + "grad_norm": 1.0073645114898682, + "learning_rate": 2.233432006282849e-05, + "loss": 2.833, + "step": 29414 + }, + { + "epoch": 2.6649453013521778, + "grad_norm": 0.9821744561195374, + "learning_rate": 2.2328278861837736e-05, + "loss": 2.5738, + "step": 29415 + }, + { + "epoch": 2.6650358995266243, + "grad_norm": 0.9851236343383789, + "learning_rate": 2.232223766084698e-05, + "loss": 2.6936, + "step": 29416 + }, + { + "epoch": 2.6651264977010714, + "grad_norm": 1.070791482925415, + "learning_rate": 2.231619645985622e-05, + "loss": 2.5127, + "step": 29417 + }, + { + "epoch": 2.6652170958755184, + "grad_norm": 0.9445247054100037, + "learning_rate": 2.2310155258865465e-05, + "loss": 2.359, + "step": 29418 + }, + { + "epoch": 2.665307694049965, + "grad_norm": 0.9951275587081909, + "learning_rate": 2.2304114057874706e-05, + "loss": 2.6272, + "step": 29419 + }, + { + "epoch": 2.6653982922244115, + "grad_norm": 0.9605356454849243, + "learning_rate": 2.229807285688395e-05, + "loss": 2.5922, + "step": 29420 + }, + { + "epoch": 2.6654888903988585, + "grad_norm": 0.9687554836273193, + "learning_rate": 2.229203165589319e-05, + "loss": 2.3846, + "step": 29421 + }, + { + "epoch": 2.6655794885733055, + "grad_norm": 1.044725775718689, + "learning_rate": 2.2285990454902435e-05, + "loss": 2.9605, + "step": 29422 + }, + { + "epoch": 2.665670086747752, + "grad_norm": 0.8727520704269409, + "learning_rate": 2.227994925391168e-05, + "loss": 2.145, + "step": 29423 + }, + { + "epoch": 2.6657606849221986, + "grad_norm": 1.0098217725753784, + "learning_rate": 2.2273908052920923e-05, + "loss": 2.4793, + "step": 29424 + }, + { + "epoch": 2.6658512830966457, + "grad_norm": 1.0293124914169312, + "learning_rate": 2.2267866851930164e-05, + "loss": 2.9131, + "step": 29425 + }, + { + "epoch": 2.6659418812710927, + "grad_norm": 0.9647511839866638, + "learning_rate": 2.226182565093941e-05, + "loss": 2.6907, + "step": 29426 + }, + { + "epoch": 2.6660324794455392, + "grad_norm": 0.8478683829307556, + "learning_rate": 2.225578444994865e-05, + "loss": 2.1564, + "step": 29427 + }, + { + "epoch": 2.666123077619986, + "grad_norm": 1.0491890907287598, + "learning_rate": 2.2249743248957893e-05, + "loss": 2.602, + "step": 29428 + }, + { + "epoch": 2.666213675794433, + "grad_norm": 0.9943246245384216, + "learning_rate": 2.2243702047967138e-05, + "loss": 2.6573, + "step": 29429 + }, + { + "epoch": 2.6663042739688794, + "grad_norm": 0.980380117893219, + "learning_rate": 2.223766084697638e-05, + "loss": 2.5592, + "step": 29430 + }, + { + "epoch": 2.6663948721433264, + "grad_norm": 0.9682400226593018, + "learning_rate": 2.2231619645985623e-05, + "loss": 2.5075, + "step": 29431 + }, + { + "epoch": 2.666485470317773, + "grad_norm": 1.060307264328003, + "learning_rate": 2.2225578444994867e-05, + "loss": 2.7645, + "step": 29432 + }, + { + "epoch": 2.66657606849222, + "grad_norm": 1.0138729810714722, + "learning_rate": 2.221953724400411e-05, + "loss": 2.6249, + "step": 29433 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 1.180346131324768, + "learning_rate": 2.2213496043013352e-05, + "loss": 1.8828, + "step": 29434 + }, + { + "epoch": 2.6667572648411135, + "grad_norm": 1.0642105340957642, + "learning_rate": 2.2207454842022596e-05, + "loss": 2.7485, + "step": 29435 + }, + { + "epoch": 2.66684786301556, + "grad_norm": 1.105490803718567, + "learning_rate": 2.2201413641031837e-05, + "loss": 2.7866, + "step": 29436 + }, + { + "epoch": 2.666938461190007, + "grad_norm": 1.0452698469161987, + "learning_rate": 2.219537244004108e-05, + "loss": 2.6944, + "step": 29437 + }, + { + "epoch": 2.6670290593644537, + "grad_norm": 0.9644485116004944, + "learning_rate": 2.2189331239050322e-05, + "loss": 2.7089, + "step": 29438 + }, + { + "epoch": 2.6671196575389007, + "grad_norm": 0.9594945311546326, + "learning_rate": 2.2183290038059566e-05, + "loss": 2.7193, + "step": 29439 + }, + { + "epoch": 2.6672102557133472, + "grad_norm": 0.9941704869270325, + "learning_rate": 2.217724883706881e-05, + "loss": 2.6172, + "step": 29440 + }, + { + "epoch": 2.6673008538877943, + "grad_norm": 1.011109471321106, + "learning_rate": 2.2171207636078055e-05, + "loss": 2.5465, + "step": 29441 + }, + { + "epoch": 2.667391452062241, + "grad_norm": 1.0121064186096191, + "learning_rate": 2.2165166435087296e-05, + "loss": 2.4505, + "step": 29442 + }, + { + "epoch": 2.667482050236688, + "grad_norm": 0.9977006316184998, + "learning_rate": 2.215912523409654e-05, + "loss": 2.4529, + "step": 29443 + }, + { + "epoch": 2.6675726484111344, + "grad_norm": 1.0637454986572266, + "learning_rate": 2.2153084033105784e-05, + "loss": 3.1341, + "step": 29444 + }, + { + "epoch": 2.6676632465855814, + "grad_norm": 0.974476158618927, + "learning_rate": 2.2147042832115025e-05, + "loss": 2.7619, + "step": 29445 + }, + { + "epoch": 2.667753844760028, + "grad_norm": 1.01145601272583, + "learning_rate": 2.214100163112427e-05, + "loss": 2.6866, + "step": 29446 + }, + { + "epoch": 2.667844442934475, + "grad_norm": 1.075109839439392, + "learning_rate": 2.213496043013351e-05, + "loss": 2.7272, + "step": 29447 + }, + { + "epoch": 2.6679350411089215, + "grad_norm": 1.0130388736724854, + "learning_rate": 2.2128919229142754e-05, + "loss": 2.9037, + "step": 29448 + }, + { + "epoch": 2.6680256392833686, + "grad_norm": 0.9346802830696106, + "learning_rate": 2.2122878028151998e-05, + "loss": 2.4492, + "step": 29449 + }, + { + "epoch": 2.668116237457815, + "grad_norm": 0.9877626299858093, + "learning_rate": 2.2116836827161242e-05, + "loss": 2.3969, + "step": 29450 + }, + { + "epoch": 2.668206835632262, + "grad_norm": 1.0017714500427246, + "learning_rate": 2.2110795626170483e-05, + "loss": 2.5663, + "step": 29451 + }, + { + "epoch": 2.6682974338067087, + "grad_norm": 1.0762184858322144, + "learning_rate": 2.2104754425179728e-05, + "loss": 2.8629, + "step": 29452 + }, + { + "epoch": 2.6683880319811557, + "grad_norm": 1.002110242843628, + "learning_rate": 2.209871322418897e-05, + "loss": 2.8386, + "step": 29453 + }, + { + "epoch": 2.6684786301556023, + "grad_norm": 0.969412624835968, + "learning_rate": 2.2092672023198213e-05, + "loss": 2.5582, + "step": 29454 + }, + { + "epoch": 2.6685692283300493, + "grad_norm": 0.9702164530754089, + "learning_rate": 2.2086630822207453e-05, + "loss": 2.3949, + "step": 29455 + }, + { + "epoch": 2.668659826504496, + "grad_norm": 0.9960227608680725, + "learning_rate": 2.2080589621216698e-05, + "loss": 2.4486, + "step": 29456 + }, + { + "epoch": 2.668750424678943, + "grad_norm": 0.9290706515312195, + "learning_rate": 2.2074548420225942e-05, + "loss": 2.4977, + "step": 29457 + }, + { + "epoch": 2.6688410228533894, + "grad_norm": 1.0280215740203857, + "learning_rate": 2.2068507219235186e-05, + "loss": 2.8313, + "step": 29458 + }, + { + "epoch": 2.6689316210278364, + "grad_norm": 0.9862033128738403, + "learning_rate": 2.206246601824443e-05, + "loss": 2.7216, + "step": 29459 + }, + { + "epoch": 2.669022219202283, + "grad_norm": 0.9700080156326294, + "learning_rate": 2.205642481725367e-05, + "loss": 2.6016, + "step": 29460 + }, + { + "epoch": 2.66911281737673, + "grad_norm": 0.9895078539848328, + "learning_rate": 2.2050383616262915e-05, + "loss": 2.6009, + "step": 29461 + }, + { + "epoch": 2.6692034155511766, + "grad_norm": 1.0152006149291992, + "learning_rate": 2.2044342415272156e-05, + "loss": 2.6896, + "step": 29462 + }, + { + "epoch": 2.6692940137256236, + "grad_norm": 0.8723787665367126, + "learning_rate": 2.20383012142814e-05, + "loss": 1.9734, + "step": 29463 + }, + { + "epoch": 2.66938461190007, + "grad_norm": 1.2831473350524902, + "learning_rate": 2.203226001329064e-05, + "loss": 2.4449, + "step": 29464 + }, + { + "epoch": 2.669475210074517, + "grad_norm": 1.1582603454589844, + "learning_rate": 2.2026218812299885e-05, + "loss": 2.7385, + "step": 29465 + }, + { + "epoch": 2.6695658082489637, + "grad_norm": 0.8351774215698242, + "learning_rate": 2.202017761130913e-05, + "loss": 1.8779, + "step": 29466 + }, + { + "epoch": 2.6696564064234107, + "grad_norm": 1.005900263786316, + "learning_rate": 2.2014136410318374e-05, + "loss": 2.6671, + "step": 29467 + }, + { + "epoch": 2.6697470045978573, + "grad_norm": 1.003418207168579, + "learning_rate": 2.2008095209327615e-05, + "loss": 2.584, + "step": 29468 + }, + { + "epoch": 2.6698376027723043, + "grad_norm": 1.0153234004974365, + "learning_rate": 2.200205400833686e-05, + "loss": 2.5175, + "step": 29469 + }, + { + "epoch": 2.669928200946751, + "grad_norm": 1.0152256488800049, + "learning_rate": 2.19960128073461e-05, + "loss": 2.743, + "step": 29470 + }, + { + "epoch": 2.6700187991211974, + "grad_norm": 1.1663861274719238, + "learning_rate": 2.1989971606355344e-05, + "loss": 2.5679, + "step": 29471 + }, + { + "epoch": 2.6701093972956444, + "grad_norm": 1.0940403938293457, + "learning_rate": 2.1983930405364585e-05, + "loss": 2.8852, + "step": 29472 + }, + { + "epoch": 2.6701999954700915, + "grad_norm": 1.0350490808486938, + "learning_rate": 2.197788920437383e-05, + "loss": 2.8268, + "step": 29473 + }, + { + "epoch": 2.670290593644538, + "grad_norm": 1.0455154180526733, + "learning_rate": 2.1971848003383073e-05, + "loss": 2.62, + "step": 29474 + }, + { + "epoch": 2.6703811918189846, + "grad_norm": 1.0103988647460938, + "learning_rate": 2.1965806802392317e-05, + "loss": 2.7045, + "step": 29475 + }, + { + "epoch": 2.6704717899934316, + "grad_norm": 0.9688650369644165, + "learning_rate": 2.195976560140156e-05, + "loss": 2.6214, + "step": 29476 + }, + { + "epoch": 2.6705623881678786, + "grad_norm": 0.9988352060317993, + "learning_rate": 2.1953724400410802e-05, + "loss": 2.7582, + "step": 29477 + }, + { + "epoch": 2.670652986342325, + "grad_norm": 1.0123752355575562, + "learning_rate": 2.1947683199420047e-05, + "loss": 2.681, + "step": 29478 + }, + { + "epoch": 2.6707435845167717, + "grad_norm": 0.9908990859985352, + "learning_rate": 2.1941641998429288e-05, + "loss": 2.866, + "step": 29479 + }, + { + "epoch": 2.6708341826912187, + "grad_norm": 0.9739581942558289, + "learning_rate": 2.1935600797438532e-05, + "loss": 2.4586, + "step": 29480 + }, + { + "epoch": 2.6709247808656658, + "grad_norm": 0.9663794636726379, + "learning_rate": 2.1929559596447773e-05, + "loss": 2.573, + "step": 29481 + }, + { + "epoch": 2.6710153790401123, + "grad_norm": 1.0384711027145386, + "learning_rate": 2.1923518395457017e-05, + "loss": 2.4511, + "step": 29482 + }, + { + "epoch": 2.671105977214559, + "grad_norm": 0.9888169169425964, + "learning_rate": 2.191747719446626e-05, + "loss": 2.4351, + "step": 29483 + }, + { + "epoch": 2.671196575389006, + "grad_norm": 1.0106035470962524, + "learning_rate": 2.1911435993475505e-05, + "loss": 2.516, + "step": 29484 + }, + { + "epoch": 2.671287173563453, + "grad_norm": 0.967266857624054, + "learning_rate": 2.1905394792484746e-05, + "loss": 2.4921, + "step": 29485 + }, + { + "epoch": 2.6713777717378995, + "grad_norm": 0.948885440826416, + "learning_rate": 2.189935359149399e-05, + "loss": 2.4521, + "step": 29486 + }, + { + "epoch": 2.671468369912346, + "grad_norm": 1.170155644416809, + "learning_rate": 2.1893312390503234e-05, + "loss": 2.3943, + "step": 29487 + }, + { + "epoch": 2.671558968086793, + "grad_norm": 1.0779370069503784, + "learning_rate": 2.1887271189512475e-05, + "loss": 2.0774, + "step": 29488 + }, + { + "epoch": 2.67164956626124, + "grad_norm": 1.0735315084457397, + "learning_rate": 2.188122998852172e-05, + "loss": 2.7159, + "step": 29489 + }, + { + "epoch": 2.6717401644356866, + "grad_norm": 1.0386021137237549, + "learning_rate": 2.187518878753096e-05, + "loss": 2.6019, + "step": 29490 + }, + { + "epoch": 2.671830762610133, + "grad_norm": 0.867247462272644, + "learning_rate": 2.1869147586540205e-05, + "loss": 1.7021, + "step": 29491 + }, + { + "epoch": 2.67192136078458, + "grad_norm": 0.9824296236038208, + "learning_rate": 2.186310638554945e-05, + "loss": 2.5076, + "step": 29492 + }, + { + "epoch": 2.672011958959027, + "grad_norm": 1.0207438468933105, + "learning_rate": 2.1857065184558693e-05, + "loss": 2.5436, + "step": 29493 + }, + { + "epoch": 2.6721025571334738, + "grad_norm": 1.0671865940093994, + "learning_rate": 2.1851023983567934e-05, + "loss": 2.2289, + "step": 29494 + }, + { + "epoch": 2.6721931553079203, + "grad_norm": 1.0381301641464233, + "learning_rate": 2.1844982782577178e-05, + "loss": 2.6779, + "step": 29495 + }, + { + "epoch": 2.6722837534823674, + "grad_norm": 1.0351418256759644, + "learning_rate": 2.183894158158642e-05, + "loss": 2.99, + "step": 29496 + }, + { + "epoch": 2.6723743516568144, + "grad_norm": 0.9691254496574402, + "learning_rate": 2.1832900380595663e-05, + "loss": 2.6561, + "step": 29497 + }, + { + "epoch": 2.672464949831261, + "grad_norm": 1.0574465990066528, + "learning_rate": 2.1826859179604904e-05, + "loss": 2.5934, + "step": 29498 + }, + { + "epoch": 2.6725555480057075, + "grad_norm": 1.152770757675171, + "learning_rate": 2.1820817978614148e-05, + "loss": 2.2268, + "step": 29499 + }, + { + "epoch": 2.6726461461801545, + "grad_norm": 1.1072171926498413, + "learning_rate": 2.1814776777623392e-05, + "loss": 2.7698, + "step": 29500 + }, + { + "epoch": 2.6727367443546015, + "grad_norm": 0.9797632694244385, + "learning_rate": 2.1808735576632637e-05, + "loss": 2.6935, + "step": 29501 + }, + { + "epoch": 2.672827342529048, + "grad_norm": 0.9984073042869568, + "learning_rate": 2.180269437564188e-05, + "loss": 2.6811, + "step": 29502 + }, + { + "epoch": 2.6729179407034946, + "grad_norm": 0.9622427225112915, + "learning_rate": 2.179665317465112e-05, + "loss": 2.5567, + "step": 29503 + }, + { + "epoch": 2.6730085388779417, + "grad_norm": 1.1133460998535156, + "learning_rate": 2.1790611973660366e-05, + "loss": 2.382, + "step": 29504 + }, + { + "epoch": 2.6730991370523887, + "grad_norm": 1.022795557975769, + "learning_rate": 2.1784570772669607e-05, + "loss": 2.5302, + "step": 29505 + }, + { + "epoch": 2.6731897352268352, + "grad_norm": 1.0396087169647217, + "learning_rate": 2.177852957167885e-05, + "loss": 2.8371, + "step": 29506 + }, + { + "epoch": 2.673280333401282, + "grad_norm": 0.9976890087127686, + "learning_rate": 2.1772488370688092e-05, + "loss": 2.6445, + "step": 29507 + }, + { + "epoch": 2.673370931575729, + "grad_norm": 1.0150953531265259, + "learning_rate": 2.176644716969734e-05, + "loss": 2.6969, + "step": 29508 + }, + { + "epoch": 2.6734615297501754, + "grad_norm": 0.9720335006713867, + "learning_rate": 2.176040596870658e-05, + "loss": 2.7297, + "step": 29509 + }, + { + "epoch": 2.6735521279246224, + "grad_norm": 0.996324896812439, + "learning_rate": 2.1754364767715824e-05, + "loss": 2.6768, + "step": 29510 + }, + { + "epoch": 2.673642726099069, + "grad_norm": 1.0333144664764404, + "learning_rate": 2.1748323566725065e-05, + "loss": 2.8015, + "step": 29511 + }, + { + "epoch": 2.673733324273516, + "grad_norm": 1.0650222301483154, + "learning_rate": 2.174228236573431e-05, + "loss": 2.6115, + "step": 29512 + }, + { + "epoch": 2.6738239224479625, + "grad_norm": 0.984839141368866, + "learning_rate": 2.173624116474355e-05, + "loss": 2.721, + "step": 29513 + }, + { + "epoch": 2.6739145206224095, + "grad_norm": 1.1018545627593994, + "learning_rate": 2.1730199963752794e-05, + "loss": 2.3068, + "step": 29514 + }, + { + "epoch": 2.674005118796856, + "grad_norm": 1.0791094303131104, + "learning_rate": 2.1724158762762035e-05, + "loss": 2.6343, + "step": 29515 + }, + { + "epoch": 2.674095716971303, + "grad_norm": 1.055611491203308, + "learning_rate": 2.171811756177128e-05, + "loss": 2.6646, + "step": 29516 + }, + { + "epoch": 2.6741863151457497, + "grad_norm": 1.0378992557525635, + "learning_rate": 2.1712076360780527e-05, + "loss": 2.728, + "step": 29517 + }, + { + "epoch": 2.6742769133201967, + "grad_norm": 1.0686125755310059, + "learning_rate": 2.1706035159789768e-05, + "loss": 2.7256, + "step": 29518 + }, + { + "epoch": 2.6743675114946432, + "grad_norm": 1.0103486776351929, + "learning_rate": 2.1699993958799012e-05, + "loss": 2.5572, + "step": 29519 + }, + { + "epoch": 2.6744581096690903, + "grad_norm": 0.9721692204475403, + "learning_rate": 2.1693952757808253e-05, + "loss": 2.5613, + "step": 29520 + }, + { + "epoch": 2.674548707843537, + "grad_norm": 1.0556591749191284, + "learning_rate": 2.1687911556817497e-05, + "loss": 2.7628, + "step": 29521 + }, + { + "epoch": 2.674639306017984, + "grad_norm": 0.9885812401771545, + "learning_rate": 2.1681870355826738e-05, + "loss": 2.6104, + "step": 29522 + }, + { + "epoch": 2.6747299041924304, + "grad_norm": 0.9950326681137085, + "learning_rate": 2.1675829154835982e-05, + "loss": 2.6757, + "step": 29523 + }, + { + "epoch": 2.6748205023668774, + "grad_norm": 0.9961103796958923, + "learning_rate": 2.1669787953845223e-05, + "loss": 2.6879, + "step": 29524 + }, + { + "epoch": 2.674911100541324, + "grad_norm": 1.0193629264831543, + "learning_rate": 2.166374675285447e-05, + "loss": 2.7702, + "step": 29525 + }, + { + "epoch": 2.675001698715771, + "grad_norm": 0.8602124452590942, + "learning_rate": 2.165770555186371e-05, + "loss": 1.9946, + "step": 29526 + }, + { + "epoch": 2.6750922968902175, + "grad_norm": 0.9696493744850159, + "learning_rate": 2.1651664350872956e-05, + "loss": 2.6325, + "step": 29527 + }, + { + "epoch": 2.6751828950646646, + "grad_norm": 0.9895071387290955, + "learning_rate": 2.1645623149882197e-05, + "loss": 2.9786, + "step": 29528 + }, + { + "epoch": 2.675273493239111, + "grad_norm": 1.0369200706481934, + "learning_rate": 2.163958194889144e-05, + "loss": 2.6335, + "step": 29529 + }, + { + "epoch": 2.675364091413558, + "grad_norm": 1.1143181324005127, + "learning_rate": 2.163354074790068e-05, + "loss": 2.5561, + "step": 29530 + }, + { + "epoch": 2.6754546895880047, + "grad_norm": 1.1157358884811401, + "learning_rate": 2.1627499546909926e-05, + "loss": 2.6713, + "step": 29531 + }, + { + "epoch": 2.6755452877624517, + "grad_norm": 1.0479662418365479, + "learning_rate": 2.162145834591917e-05, + "loss": 2.4081, + "step": 29532 + }, + { + "epoch": 2.6756358859368983, + "grad_norm": 0.8742961287498474, + "learning_rate": 2.1615417144928414e-05, + "loss": 2.0079, + "step": 29533 + }, + { + "epoch": 2.6757264841113453, + "grad_norm": 0.9678521752357483, + "learning_rate": 2.160937594393766e-05, + "loss": 2.3833, + "step": 29534 + }, + { + "epoch": 2.675817082285792, + "grad_norm": 0.9746519923210144, + "learning_rate": 2.16033347429469e-05, + "loss": 2.5911, + "step": 29535 + }, + { + "epoch": 2.675907680460239, + "grad_norm": 1.0249353647232056, + "learning_rate": 2.1597293541956144e-05, + "loss": 2.776, + "step": 29536 + }, + { + "epoch": 2.6759982786346854, + "grad_norm": 0.9268456697463989, + "learning_rate": 2.1591252340965384e-05, + "loss": 2.6273, + "step": 29537 + }, + { + "epoch": 2.6760888768091324, + "grad_norm": 0.8459568023681641, + "learning_rate": 2.158521113997463e-05, + "loss": 1.9452, + "step": 29538 + }, + { + "epoch": 2.676179474983579, + "grad_norm": 1.081792950630188, + "learning_rate": 2.157916993898387e-05, + "loss": 2.4838, + "step": 29539 + }, + { + "epoch": 2.676270073158026, + "grad_norm": 1.0239619016647339, + "learning_rate": 2.1573128737993114e-05, + "loss": 2.5255, + "step": 29540 + }, + { + "epoch": 2.6763606713324726, + "grad_norm": 0.9548388123512268, + "learning_rate": 2.1567087537002358e-05, + "loss": 2.8115, + "step": 29541 + }, + { + "epoch": 2.6764512695069196, + "grad_norm": 1.102002501487732, + "learning_rate": 2.1561046336011602e-05, + "loss": 2.6751, + "step": 29542 + }, + { + "epoch": 2.676541867681366, + "grad_norm": 0.8839002251625061, + "learning_rate": 2.1555005135020843e-05, + "loss": 1.8702, + "step": 29543 + }, + { + "epoch": 2.676632465855813, + "grad_norm": 1.1458468437194824, + "learning_rate": 2.1548963934030087e-05, + "loss": 2.508, + "step": 29544 + }, + { + "epoch": 2.6767230640302597, + "grad_norm": 1.0329415798187256, + "learning_rate": 2.1542922733039328e-05, + "loss": 2.7634, + "step": 29545 + }, + { + "epoch": 2.6768136622047067, + "grad_norm": 1.0545008182525635, + "learning_rate": 2.1536881532048572e-05, + "loss": 2.7354, + "step": 29546 + }, + { + "epoch": 2.6769042603791533, + "grad_norm": 1.0434446334838867, + "learning_rate": 2.1530840331057816e-05, + "loss": 2.5947, + "step": 29547 + }, + { + "epoch": 2.6769948585536003, + "grad_norm": 0.9625966548919678, + "learning_rate": 2.1524799130067057e-05, + "loss": 2.5522, + "step": 29548 + }, + { + "epoch": 2.677085456728047, + "grad_norm": 1.0721596479415894, + "learning_rate": 2.15187579290763e-05, + "loss": 2.5905, + "step": 29549 + }, + { + "epoch": 2.6771760549024934, + "grad_norm": 0.9651902914047241, + "learning_rate": 2.1512716728085546e-05, + "loss": 2.556, + "step": 29550 + }, + { + "epoch": 2.6772666530769405, + "grad_norm": 1.0738518238067627, + "learning_rate": 2.150667552709479e-05, + "loss": 2.8498, + "step": 29551 + }, + { + "epoch": 2.6773572512513875, + "grad_norm": 1.1065092086791992, + "learning_rate": 2.150063432610403e-05, + "loss": 2.8002, + "step": 29552 + }, + { + "epoch": 2.677447849425834, + "grad_norm": 1.0448695421218872, + "learning_rate": 2.1494593125113275e-05, + "loss": 2.473, + "step": 29553 + }, + { + "epoch": 2.6775384476002806, + "grad_norm": 1.0299253463745117, + "learning_rate": 2.1488551924122516e-05, + "loss": 2.5972, + "step": 29554 + }, + { + "epoch": 2.6776290457747276, + "grad_norm": 1.0148279666900635, + "learning_rate": 2.148251072313176e-05, + "loss": 2.6242, + "step": 29555 + }, + { + "epoch": 2.6777196439491746, + "grad_norm": 0.9712324142456055, + "learning_rate": 2.1476469522141e-05, + "loss": 2.4248, + "step": 29556 + }, + { + "epoch": 2.677810242123621, + "grad_norm": 1.0093305110931396, + "learning_rate": 2.1470428321150245e-05, + "loss": 2.5848, + "step": 29557 + }, + { + "epoch": 2.6779008402980677, + "grad_norm": 1.0496037006378174, + "learning_rate": 2.146438712015949e-05, + "loss": 2.7292, + "step": 29558 + }, + { + "epoch": 2.6779914384725148, + "grad_norm": 0.9503282308578491, + "learning_rate": 2.1458345919168733e-05, + "loss": 2.5583, + "step": 29559 + }, + { + "epoch": 2.6780820366469618, + "grad_norm": 0.9919906258583069, + "learning_rate": 2.1452304718177974e-05, + "loss": 2.6058, + "step": 29560 + }, + { + "epoch": 2.6781726348214083, + "grad_norm": 0.9990535974502563, + "learning_rate": 2.144626351718722e-05, + "loss": 2.6842, + "step": 29561 + }, + { + "epoch": 2.678263232995855, + "grad_norm": 1.4162040948867798, + "learning_rate": 2.1440222316196463e-05, + "loss": 2.2803, + "step": 29562 + }, + { + "epoch": 2.678353831170302, + "grad_norm": 0.9760135412216187, + "learning_rate": 2.1434181115205704e-05, + "loss": 2.6789, + "step": 29563 + }, + { + "epoch": 2.678444429344749, + "grad_norm": 1.0041093826293945, + "learning_rate": 2.1428139914214948e-05, + "loss": 2.5939, + "step": 29564 + }, + { + "epoch": 2.6785350275191955, + "grad_norm": 1.0253866910934448, + "learning_rate": 2.142209871322419e-05, + "loss": 2.7002, + "step": 29565 + }, + { + "epoch": 2.678625625693642, + "grad_norm": 1.1592302322387695, + "learning_rate": 2.1416057512233433e-05, + "loss": 2.5163, + "step": 29566 + }, + { + "epoch": 2.678716223868089, + "grad_norm": 1.017030119895935, + "learning_rate": 2.1410016311242677e-05, + "loss": 2.5973, + "step": 29567 + }, + { + "epoch": 2.678806822042536, + "grad_norm": 0.94847172498703, + "learning_rate": 2.140397511025192e-05, + "loss": 2.0157, + "step": 29568 + }, + { + "epoch": 2.6788974202169826, + "grad_norm": 1.1742100715637207, + "learning_rate": 2.1397933909261162e-05, + "loss": 2.7743, + "step": 29569 + }, + { + "epoch": 2.678988018391429, + "grad_norm": 0.935032844543457, + "learning_rate": 2.1391892708270406e-05, + "loss": 1.9418, + "step": 29570 + }, + { + "epoch": 2.679078616565876, + "grad_norm": 0.9351357817649841, + "learning_rate": 2.1385851507279647e-05, + "loss": 2.4198, + "step": 29571 + }, + { + "epoch": 2.679169214740323, + "grad_norm": 1.090120553970337, + "learning_rate": 2.137981030628889e-05, + "loss": 2.7868, + "step": 29572 + }, + { + "epoch": 2.67925981291477, + "grad_norm": 1.0950915813446045, + "learning_rate": 2.1373769105298132e-05, + "loss": 2.8343, + "step": 29573 + }, + { + "epoch": 2.6793504110892163, + "grad_norm": 0.9771090149879456, + "learning_rate": 2.1367727904307376e-05, + "loss": 2.5565, + "step": 29574 + }, + { + "epoch": 2.6794410092636634, + "grad_norm": 0.9962331652641296, + "learning_rate": 2.136168670331662e-05, + "loss": 2.6391, + "step": 29575 + }, + { + "epoch": 2.6795316074381104, + "grad_norm": 1.0288045406341553, + "learning_rate": 2.1355645502325865e-05, + "loss": 2.6178, + "step": 29576 + }, + { + "epoch": 2.679622205612557, + "grad_norm": 1.0954772233963013, + "learning_rate": 2.134960430133511e-05, + "loss": 2.3252, + "step": 29577 + }, + { + "epoch": 2.6797128037870035, + "grad_norm": 1.1195589303970337, + "learning_rate": 2.134356310034435e-05, + "loss": 2.4446, + "step": 29578 + }, + { + "epoch": 2.6798034019614505, + "grad_norm": 1.0484567880630493, + "learning_rate": 2.1337521899353594e-05, + "loss": 2.6837, + "step": 29579 + }, + { + "epoch": 2.6798940001358975, + "grad_norm": 1.0101710557937622, + "learning_rate": 2.1331480698362835e-05, + "loss": 2.4027, + "step": 29580 + }, + { + "epoch": 2.679984598310344, + "grad_norm": 0.9873012900352478, + "learning_rate": 2.132543949737208e-05, + "loss": 2.4841, + "step": 29581 + }, + { + "epoch": 2.6800751964847906, + "grad_norm": 0.9784918427467346, + "learning_rate": 2.131939829638132e-05, + "loss": 2.5506, + "step": 29582 + }, + { + "epoch": 2.6801657946592377, + "grad_norm": 1.0063624382019043, + "learning_rate": 2.1313357095390564e-05, + "loss": 2.7804, + "step": 29583 + }, + { + "epoch": 2.6802563928336847, + "grad_norm": 0.9374464154243469, + "learning_rate": 2.130731589439981e-05, + "loss": 2.5859, + "step": 29584 + }, + { + "epoch": 2.6803469910081312, + "grad_norm": 1.0286626815795898, + "learning_rate": 2.1301274693409053e-05, + "loss": 2.6723, + "step": 29585 + }, + { + "epoch": 2.680437589182578, + "grad_norm": 1.0394890308380127, + "learning_rate": 2.1295233492418293e-05, + "loss": 2.7346, + "step": 29586 + }, + { + "epoch": 2.680528187357025, + "grad_norm": 0.9591361284255981, + "learning_rate": 2.1289192291427538e-05, + "loss": 2.4865, + "step": 29587 + }, + { + "epoch": 2.680618785531472, + "grad_norm": 1.0415470600128174, + "learning_rate": 2.128315109043678e-05, + "loss": 2.488, + "step": 29588 + }, + { + "epoch": 2.6807093837059184, + "grad_norm": 1.0262500047683716, + "learning_rate": 2.1277109889446023e-05, + "loss": 2.6321, + "step": 29589 + }, + { + "epoch": 2.680799981880365, + "grad_norm": 0.9911346435546875, + "learning_rate": 2.1271068688455264e-05, + "loss": 2.6475, + "step": 29590 + }, + { + "epoch": 2.680890580054812, + "grad_norm": 0.9608847498893738, + "learning_rate": 2.1265027487464508e-05, + "loss": 2.7872, + "step": 29591 + }, + { + "epoch": 2.6809811782292585, + "grad_norm": 1.0573846101760864, + "learning_rate": 2.1258986286473752e-05, + "loss": 2.6595, + "step": 29592 + }, + { + "epoch": 2.6810717764037055, + "grad_norm": 0.968748927116394, + "learning_rate": 2.1252945085482996e-05, + "loss": 2.586, + "step": 29593 + }, + { + "epoch": 2.681162374578152, + "grad_norm": 0.9537730813026428, + "learning_rate": 2.124690388449224e-05, + "loss": 2.5282, + "step": 29594 + }, + { + "epoch": 2.681252972752599, + "grad_norm": 1.0496939420700073, + "learning_rate": 2.124086268350148e-05, + "loss": 3.0639, + "step": 29595 + }, + { + "epoch": 2.6813435709270457, + "grad_norm": 1.0181702375411987, + "learning_rate": 2.1234821482510725e-05, + "loss": 2.935, + "step": 29596 + }, + { + "epoch": 2.6814341691014927, + "grad_norm": 1.0297905206680298, + "learning_rate": 2.1228780281519966e-05, + "loss": 2.8534, + "step": 29597 + }, + { + "epoch": 2.6815247672759392, + "grad_norm": 1.0155329704284668, + "learning_rate": 2.122273908052921e-05, + "loss": 2.6605, + "step": 29598 + }, + { + "epoch": 2.6816153654503863, + "grad_norm": 0.9736824631690979, + "learning_rate": 2.121669787953845e-05, + "loss": 2.9446, + "step": 29599 + }, + { + "epoch": 2.681705963624833, + "grad_norm": 0.9954028725624084, + "learning_rate": 2.1210656678547696e-05, + "loss": 2.7567, + "step": 29600 + }, + { + "epoch": 2.68179656179928, + "grad_norm": 0.9530680775642395, + "learning_rate": 2.120461547755694e-05, + "loss": 2.4976, + "step": 29601 + }, + { + "epoch": 2.6818871599737264, + "grad_norm": 0.8915514349937439, + "learning_rate": 2.1198574276566184e-05, + "loss": 1.9317, + "step": 29602 + }, + { + "epoch": 2.6819777581481734, + "grad_norm": 1.000116229057312, + "learning_rate": 2.1192533075575425e-05, + "loss": 2.5626, + "step": 29603 + }, + { + "epoch": 2.68206835632262, + "grad_norm": 0.895536482334137, + "learning_rate": 2.118649187458467e-05, + "loss": 1.9099, + "step": 29604 + }, + { + "epoch": 2.682158954497067, + "grad_norm": 0.9474380016326904, + "learning_rate": 2.118045067359391e-05, + "loss": 2.3209, + "step": 29605 + }, + { + "epoch": 2.6822495526715135, + "grad_norm": 0.9147542119026184, + "learning_rate": 2.1174409472603154e-05, + "loss": 1.9254, + "step": 29606 + }, + { + "epoch": 2.6823401508459606, + "grad_norm": 0.9997895956039429, + "learning_rate": 2.1168368271612398e-05, + "loss": 2.5433, + "step": 29607 + }, + { + "epoch": 2.682430749020407, + "grad_norm": 1.0684905052185059, + "learning_rate": 2.116232707062164e-05, + "loss": 2.831, + "step": 29608 + }, + { + "epoch": 2.682521347194854, + "grad_norm": 0.9584136605262756, + "learning_rate": 2.1156285869630883e-05, + "loss": 2.6049, + "step": 29609 + }, + { + "epoch": 2.6826119453693007, + "grad_norm": 0.9338745474815369, + "learning_rate": 2.1150244668640128e-05, + "loss": 2.4627, + "step": 29610 + }, + { + "epoch": 2.6827025435437477, + "grad_norm": 1.0944499969482422, + "learning_rate": 2.1144203467649372e-05, + "loss": 2.5047, + "step": 29611 + }, + { + "epoch": 2.6827931417181943, + "grad_norm": 1.1094696521759033, + "learning_rate": 2.1138162266658613e-05, + "loss": 2.4045, + "step": 29612 + }, + { + "epoch": 2.6828837398926413, + "grad_norm": 0.9763241410255432, + "learning_rate": 2.1132121065667857e-05, + "loss": 2.5108, + "step": 29613 + }, + { + "epoch": 2.682974338067088, + "grad_norm": 1.040058970451355, + "learning_rate": 2.1126079864677098e-05, + "loss": 2.7928, + "step": 29614 + }, + { + "epoch": 2.683064936241535, + "grad_norm": 1.0372459888458252, + "learning_rate": 2.1120038663686342e-05, + "loss": 2.6893, + "step": 29615 + }, + { + "epoch": 2.6831555344159814, + "grad_norm": 1.0854079723358154, + "learning_rate": 2.1113997462695583e-05, + "loss": 2.556, + "step": 29616 + }, + { + "epoch": 2.6832461325904284, + "grad_norm": 1.0067837238311768, + "learning_rate": 2.1107956261704827e-05, + "loss": 2.9351, + "step": 29617 + }, + { + "epoch": 2.683336730764875, + "grad_norm": 0.8842148184776306, + "learning_rate": 2.110191506071407e-05, + "loss": 1.6213, + "step": 29618 + }, + { + "epoch": 2.683427328939322, + "grad_norm": 0.8460096716880798, + "learning_rate": 2.1095873859723315e-05, + "loss": 1.9882, + "step": 29619 + }, + { + "epoch": 2.6835179271137686, + "grad_norm": 0.9912413358688354, + "learning_rate": 2.108983265873256e-05, + "loss": 2.3786, + "step": 29620 + }, + { + "epoch": 2.6836085252882156, + "grad_norm": 0.9303222298622131, + "learning_rate": 2.10837914577418e-05, + "loss": 2.538, + "step": 29621 + }, + { + "epoch": 2.683699123462662, + "grad_norm": 1.0080232620239258, + "learning_rate": 2.1077750256751045e-05, + "loss": 2.6437, + "step": 29622 + }, + { + "epoch": 2.683789721637109, + "grad_norm": 1.033993124961853, + "learning_rate": 2.1071709055760285e-05, + "loss": 2.6067, + "step": 29623 + }, + { + "epoch": 2.6838803198115557, + "grad_norm": 1.1295336484909058, + "learning_rate": 2.106566785476953e-05, + "loss": 2.7315, + "step": 29624 + }, + { + "epoch": 2.6839709179860027, + "grad_norm": 1.0039194822311401, + "learning_rate": 2.105962665377877e-05, + "loss": 2.341, + "step": 29625 + }, + { + "epoch": 2.6840615161604493, + "grad_norm": 1.1031415462493896, + "learning_rate": 2.1053585452788015e-05, + "loss": 2.4664, + "step": 29626 + }, + { + "epoch": 2.6841521143348963, + "grad_norm": 1.0113353729248047, + "learning_rate": 2.104754425179726e-05, + "loss": 2.681, + "step": 29627 + }, + { + "epoch": 2.684242712509343, + "grad_norm": 0.9678761959075928, + "learning_rate": 2.1041503050806503e-05, + "loss": 2.5162, + "step": 29628 + }, + { + "epoch": 2.68433331068379, + "grad_norm": 0.909782886505127, + "learning_rate": 2.1035461849815744e-05, + "loss": 2.5396, + "step": 29629 + }, + { + "epoch": 2.6844239088582365, + "grad_norm": 0.9642780423164368, + "learning_rate": 2.1029420648824988e-05, + "loss": 2.4719, + "step": 29630 + }, + { + "epoch": 2.6845145070326835, + "grad_norm": 1.0799517631530762, + "learning_rate": 2.102337944783423e-05, + "loss": 2.4231, + "step": 29631 + }, + { + "epoch": 2.68460510520713, + "grad_norm": 1.028730034828186, + "learning_rate": 2.1017338246843473e-05, + "loss": 2.6344, + "step": 29632 + }, + { + "epoch": 2.6846957033815766, + "grad_norm": 1.1288031339645386, + "learning_rate": 2.1011297045852714e-05, + "loss": 2.6513, + "step": 29633 + }, + { + "epoch": 2.6847863015560236, + "grad_norm": 1.0216403007507324, + "learning_rate": 2.1005255844861958e-05, + "loss": 2.6485, + "step": 29634 + }, + { + "epoch": 2.6848768997304706, + "grad_norm": 0.8747886419296265, + "learning_rate": 2.0999214643871203e-05, + "loss": 2.0875, + "step": 29635 + }, + { + "epoch": 2.684967497904917, + "grad_norm": 0.8830687999725342, + "learning_rate": 2.0993173442880447e-05, + "loss": 2.0207, + "step": 29636 + }, + { + "epoch": 2.6850580960793637, + "grad_norm": 0.9545493125915527, + "learning_rate": 2.098713224188969e-05, + "loss": 2.598, + "step": 29637 + }, + { + "epoch": 2.6851486942538108, + "grad_norm": 1.0545626878738403, + "learning_rate": 2.0981091040898932e-05, + "loss": 2.5996, + "step": 29638 + }, + { + "epoch": 2.6852392924282578, + "grad_norm": 1.0642414093017578, + "learning_rate": 2.0975049839908176e-05, + "loss": 2.5073, + "step": 29639 + }, + { + "epoch": 2.6853298906027043, + "grad_norm": 1.0246940851211548, + "learning_rate": 2.0969008638917417e-05, + "loss": 2.627, + "step": 29640 + }, + { + "epoch": 2.685420488777151, + "grad_norm": 1.3438708782196045, + "learning_rate": 2.096296743792666e-05, + "loss": 2.5039, + "step": 29641 + }, + { + "epoch": 2.685511086951598, + "grad_norm": 1.110903263092041, + "learning_rate": 2.0956926236935902e-05, + "loss": 2.7206, + "step": 29642 + }, + { + "epoch": 2.685601685126045, + "grad_norm": 0.9686195254325867, + "learning_rate": 2.0950885035945146e-05, + "loss": 2.5411, + "step": 29643 + }, + { + "epoch": 2.6856922833004915, + "grad_norm": 1.0021893978118896, + "learning_rate": 2.094484383495439e-05, + "loss": 2.7291, + "step": 29644 + }, + { + "epoch": 2.685782881474938, + "grad_norm": 1.079752802848816, + "learning_rate": 2.0938802633963635e-05, + "loss": 2.5824, + "step": 29645 + }, + { + "epoch": 2.685873479649385, + "grad_norm": 1.0448607206344604, + "learning_rate": 2.0932761432972875e-05, + "loss": 2.291, + "step": 29646 + }, + { + "epoch": 2.685964077823832, + "grad_norm": 1.0564806461334229, + "learning_rate": 2.092672023198212e-05, + "loss": 2.5036, + "step": 29647 + }, + { + "epoch": 2.6860546759982786, + "grad_norm": 0.9865442514419556, + "learning_rate": 2.092067903099136e-05, + "loss": 2.3993, + "step": 29648 + }, + { + "epoch": 2.686145274172725, + "grad_norm": 1.1027499437332153, + "learning_rate": 2.0914637830000605e-05, + "loss": 2.5974, + "step": 29649 + }, + { + "epoch": 2.686235872347172, + "grad_norm": 1.0224286317825317, + "learning_rate": 2.090859662900985e-05, + "loss": 2.8074, + "step": 29650 + }, + { + "epoch": 2.686326470521619, + "grad_norm": 1.078921914100647, + "learning_rate": 2.090255542801909e-05, + "loss": 2.554, + "step": 29651 + }, + { + "epoch": 2.686417068696066, + "grad_norm": 1.1178823709487915, + "learning_rate": 2.0896514227028334e-05, + "loss": 2.4536, + "step": 29652 + }, + { + "epoch": 2.6865076668705123, + "grad_norm": 1.1299428939819336, + "learning_rate": 2.0890473026037578e-05, + "loss": 2.7009, + "step": 29653 + }, + { + "epoch": 2.6865982650449594, + "grad_norm": 1.1953672170639038, + "learning_rate": 2.0884431825046822e-05, + "loss": 2.4465, + "step": 29654 + }, + { + "epoch": 2.6866888632194064, + "grad_norm": 0.9608029723167419, + "learning_rate": 2.0878390624056063e-05, + "loss": 2.6882, + "step": 29655 + }, + { + "epoch": 2.686779461393853, + "grad_norm": 0.866386890411377, + "learning_rate": 2.0872349423065307e-05, + "loss": 1.873, + "step": 29656 + }, + { + "epoch": 2.6868700595682995, + "grad_norm": 0.9948055744171143, + "learning_rate": 2.0866308222074548e-05, + "loss": 2.6144, + "step": 29657 + }, + { + "epoch": 2.6869606577427465, + "grad_norm": 1.0884655714035034, + "learning_rate": 2.0860267021083792e-05, + "loss": 2.7044, + "step": 29658 + }, + { + "epoch": 2.6870512559171935, + "grad_norm": 1.0161010026931763, + "learning_rate": 2.0854225820093033e-05, + "loss": 2.7312, + "step": 29659 + }, + { + "epoch": 2.68714185409164, + "grad_norm": 1.014182448387146, + "learning_rate": 2.0848184619102277e-05, + "loss": 2.7076, + "step": 29660 + }, + { + "epoch": 2.6872324522660866, + "grad_norm": 0.9427753686904907, + "learning_rate": 2.084214341811152e-05, + "loss": 2.3781, + "step": 29661 + }, + { + "epoch": 2.6873230504405337, + "grad_norm": 1.0341635942459106, + "learning_rate": 2.0836102217120766e-05, + "loss": 2.4517, + "step": 29662 + }, + { + "epoch": 2.6874136486149807, + "grad_norm": 1.0074093341827393, + "learning_rate": 2.0830061016130007e-05, + "loss": 2.6148, + "step": 29663 + }, + { + "epoch": 2.6875042467894272, + "grad_norm": 1.042081594467163, + "learning_rate": 2.082401981513925e-05, + "loss": 2.5422, + "step": 29664 + }, + { + "epoch": 2.687594844963874, + "grad_norm": 1.0642558336257935, + "learning_rate": 2.0817978614148495e-05, + "loss": 2.7994, + "step": 29665 + }, + { + "epoch": 2.687685443138321, + "grad_norm": 1.0664085149765015, + "learning_rate": 2.0811937413157736e-05, + "loss": 2.4745, + "step": 29666 + }, + { + "epoch": 2.687776041312768, + "grad_norm": 1.01088547706604, + "learning_rate": 2.080589621216698e-05, + "loss": 2.6116, + "step": 29667 + }, + { + "epoch": 2.6878666394872144, + "grad_norm": 0.99920654296875, + "learning_rate": 2.079985501117622e-05, + "loss": 2.5742, + "step": 29668 + }, + { + "epoch": 2.687957237661661, + "grad_norm": 0.9587351679801941, + "learning_rate": 2.0793813810185465e-05, + "loss": 2.675, + "step": 29669 + }, + { + "epoch": 2.688047835836108, + "grad_norm": 1.0102171897888184, + "learning_rate": 2.078777260919471e-05, + "loss": 2.6905, + "step": 29670 + }, + { + "epoch": 2.6881384340105545, + "grad_norm": 0.9568462371826172, + "learning_rate": 2.0781731408203954e-05, + "loss": 2.0336, + "step": 29671 + }, + { + "epoch": 2.6882290321850015, + "grad_norm": 1.059211015701294, + "learning_rate": 2.0775690207213195e-05, + "loss": 2.9004, + "step": 29672 + }, + { + "epoch": 2.688319630359448, + "grad_norm": 1.0040632486343384, + "learning_rate": 2.076964900622244e-05, + "loss": 2.4578, + "step": 29673 + }, + { + "epoch": 2.688410228533895, + "grad_norm": 0.9735854864120483, + "learning_rate": 2.076360780523168e-05, + "loss": 2.7998, + "step": 29674 + }, + { + "epoch": 2.6885008267083417, + "grad_norm": 0.9788171648979187, + "learning_rate": 2.0757566604240924e-05, + "loss": 2.4148, + "step": 29675 + }, + { + "epoch": 2.6885914248827887, + "grad_norm": 1.0522512197494507, + "learning_rate": 2.0751525403250165e-05, + "loss": 2.9084, + "step": 29676 + }, + { + "epoch": 2.6886820230572352, + "grad_norm": 0.9054359197616577, + "learning_rate": 2.074548420225941e-05, + "loss": 2.4316, + "step": 29677 + }, + { + "epoch": 2.6887726212316823, + "grad_norm": 0.9033638834953308, + "learning_rate": 2.0739443001268653e-05, + "loss": 2.1055, + "step": 29678 + }, + { + "epoch": 2.688863219406129, + "grad_norm": 1.0079525709152222, + "learning_rate": 2.0733401800277897e-05, + "loss": 2.5267, + "step": 29679 + }, + { + "epoch": 2.688953817580576, + "grad_norm": 0.992957592010498, + "learning_rate": 2.072736059928714e-05, + "loss": 2.4314, + "step": 29680 + }, + { + "epoch": 2.6890444157550224, + "grad_norm": 1.0353740453720093, + "learning_rate": 2.0721319398296382e-05, + "loss": 2.8162, + "step": 29681 + }, + { + "epoch": 2.6891350139294694, + "grad_norm": 1.0102741718292236, + "learning_rate": 2.0715278197305627e-05, + "loss": 2.6784, + "step": 29682 + }, + { + "epoch": 2.689225612103916, + "grad_norm": 0.9504556059837341, + "learning_rate": 2.0709236996314867e-05, + "loss": 2.0357, + "step": 29683 + }, + { + "epoch": 2.689316210278363, + "grad_norm": 0.9890999794006348, + "learning_rate": 2.070319579532411e-05, + "loss": 2.7631, + "step": 29684 + }, + { + "epoch": 2.6894068084528095, + "grad_norm": 1.0703924894332886, + "learning_rate": 2.0697154594333352e-05, + "loss": 2.4292, + "step": 29685 + }, + { + "epoch": 2.6894974066272566, + "grad_norm": 0.8997946977615356, + "learning_rate": 2.0691113393342597e-05, + "loss": 1.955, + "step": 29686 + }, + { + "epoch": 2.689588004801703, + "grad_norm": 1.1977102756500244, + "learning_rate": 2.068507219235184e-05, + "loss": 2.6064, + "step": 29687 + }, + { + "epoch": 2.68967860297615, + "grad_norm": 0.8402679562568665, + "learning_rate": 2.0679030991361085e-05, + "loss": 1.8188, + "step": 29688 + }, + { + "epoch": 2.6897692011505967, + "grad_norm": 1.076764702796936, + "learning_rate": 2.0672989790370326e-05, + "loss": 2.5919, + "step": 29689 + }, + { + "epoch": 2.6898597993250437, + "grad_norm": 1.047319769859314, + "learning_rate": 2.066694858937957e-05, + "loss": 2.6258, + "step": 29690 + }, + { + "epoch": 2.6899503974994903, + "grad_norm": 0.9923487305641174, + "learning_rate": 2.066090738838881e-05, + "loss": 2.7622, + "step": 29691 + }, + { + "epoch": 2.6900409956739373, + "grad_norm": 1.0168932676315308, + "learning_rate": 2.0654866187398055e-05, + "loss": 2.4664, + "step": 29692 + }, + { + "epoch": 2.690131593848384, + "grad_norm": 1.0326759815216064, + "learning_rate": 2.0648824986407296e-05, + "loss": 3.1128, + "step": 29693 + }, + { + "epoch": 2.690222192022831, + "grad_norm": 1.0743303298950195, + "learning_rate": 2.064278378541654e-05, + "loss": 2.4538, + "step": 29694 + }, + { + "epoch": 2.6903127901972774, + "grad_norm": 1.1186820268630981, + "learning_rate": 2.0636742584425784e-05, + "loss": 2.5401, + "step": 29695 + }, + { + "epoch": 2.6904033883717244, + "grad_norm": 0.9245012402534485, + "learning_rate": 2.063070138343503e-05, + "loss": 2.4693, + "step": 29696 + }, + { + "epoch": 2.690493986546171, + "grad_norm": 1.083834171295166, + "learning_rate": 2.0624660182444273e-05, + "loss": 2.6186, + "step": 29697 + }, + { + "epoch": 2.690584584720618, + "grad_norm": 1.1426146030426025, + "learning_rate": 2.0618618981453514e-05, + "loss": 2.6055, + "step": 29698 + }, + { + "epoch": 2.6906751828950646, + "grad_norm": 0.9849733114242554, + "learning_rate": 2.0612577780462758e-05, + "loss": 2.602, + "step": 29699 + }, + { + "epoch": 2.6907657810695116, + "grad_norm": 0.9917604327201843, + "learning_rate": 2.0606536579472e-05, + "loss": 2.5832, + "step": 29700 + }, + { + "epoch": 2.690856379243958, + "grad_norm": 0.9889087677001953, + "learning_rate": 2.0600495378481243e-05, + "loss": 2.6881, + "step": 29701 + }, + { + "epoch": 2.690946977418405, + "grad_norm": 0.9783139228820801, + "learning_rate": 2.0594454177490484e-05, + "loss": 2.4505, + "step": 29702 + }, + { + "epoch": 2.6910375755928517, + "grad_norm": 0.9381400346755981, + "learning_rate": 2.0588412976499728e-05, + "loss": 2.6775, + "step": 29703 + }, + { + "epoch": 2.6911281737672987, + "grad_norm": 0.9555516839027405, + "learning_rate": 2.0582371775508972e-05, + "loss": 2.3627, + "step": 29704 + }, + { + "epoch": 2.6912187719417453, + "grad_norm": 1.043545126914978, + "learning_rate": 2.0576330574518216e-05, + "loss": 2.5886, + "step": 29705 + }, + { + "epoch": 2.6913093701161923, + "grad_norm": 1.095318078994751, + "learning_rate": 2.0570289373527457e-05, + "loss": 2.802, + "step": 29706 + }, + { + "epoch": 2.691399968290639, + "grad_norm": 1.049457311630249, + "learning_rate": 2.05642481725367e-05, + "loss": 2.4906, + "step": 29707 + }, + { + "epoch": 2.691490566465086, + "grad_norm": 0.9704784154891968, + "learning_rate": 2.0558206971545942e-05, + "loss": 2.425, + "step": 29708 + }, + { + "epoch": 2.6915811646395325, + "grad_norm": 1.0744125843048096, + "learning_rate": 2.0552165770555187e-05, + "loss": 2.5197, + "step": 29709 + }, + { + "epoch": 2.6916717628139795, + "grad_norm": 1.050436019897461, + "learning_rate": 2.054612456956443e-05, + "loss": 2.6744, + "step": 29710 + }, + { + "epoch": 2.691762360988426, + "grad_norm": 0.870887279510498, + "learning_rate": 2.054008336857367e-05, + "loss": 1.982, + "step": 29711 + }, + { + "epoch": 2.6918529591628726, + "grad_norm": 1.0195883512496948, + "learning_rate": 2.053404216758292e-05, + "loss": 2.8306, + "step": 29712 + }, + { + "epoch": 2.6919435573373196, + "grad_norm": 0.9958652853965759, + "learning_rate": 2.052800096659216e-05, + "loss": 2.8764, + "step": 29713 + }, + { + "epoch": 2.6920341555117666, + "grad_norm": 0.9786666631698608, + "learning_rate": 2.0521959765601404e-05, + "loss": 2.555, + "step": 29714 + }, + { + "epoch": 2.692124753686213, + "grad_norm": 1.0215343236923218, + "learning_rate": 2.0515918564610645e-05, + "loss": 2.9252, + "step": 29715 + }, + { + "epoch": 2.6922153518606597, + "grad_norm": 1.0648386478424072, + "learning_rate": 2.050987736361989e-05, + "loss": 2.6982, + "step": 29716 + }, + { + "epoch": 2.6923059500351068, + "grad_norm": 1.067391037940979, + "learning_rate": 2.050383616262913e-05, + "loss": 2.7849, + "step": 29717 + }, + { + "epoch": 2.6923965482095538, + "grad_norm": 0.8720073103904724, + "learning_rate": 2.0497794961638374e-05, + "loss": 1.9694, + "step": 29718 + }, + { + "epoch": 2.6924871463840003, + "grad_norm": 1.0991606712341309, + "learning_rate": 2.0491753760647615e-05, + "loss": 2.6264, + "step": 29719 + }, + { + "epoch": 2.692577744558447, + "grad_norm": 0.9637977480888367, + "learning_rate": 2.0485712559656863e-05, + "loss": 2.6757, + "step": 29720 + }, + { + "epoch": 2.692668342732894, + "grad_norm": 1.0525190830230713, + "learning_rate": 2.0479671358666104e-05, + "loss": 2.7085, + "step": 29721 + }, + { + "epoch": 2.692758940907341, + "grad_norm": 0.9619733095169067, + "learning_rate": 2.0473630157675348e-05, + "loss": 2.5509, + "step": 29722 + }, + { + "epoch": 2.6928495390817875, + "grad_norm": 0.8426433801651001, + "learning_rate": 2.046758895668459e-05, + "loss": 1.9663, + "step": 29723 + }, + { + "epoch": 2.692940137256234, + "grad_norm": 0.9941441416740417, + "learning_rate": 2.0461547755693833e-05, + "loss": 2.7144, + "step": 29724 + }, + { + "epoch": 2.693030735430681, + "grad_norm": 1.0410562753677368, + "learning_rate": 2.0455506554703077e-05, + "loss": 2.7156, + "step": 29725 + }, + { + "epoch": 2.693121333605128, + "grad_norm": 1.0904626846313477, + "learning_rate": 2.0449465353712318e-05, + "loss": 2.4413, + "step": 29726 + }, + { + "epoch": 2.6932119317795746, + "grad_norm": 0.8485749959945679, + "learning_rate": 2.0443424152721562e-05, + "loss": 2.007, + "step": 29727 + }, + { + "epoch": 2.693302529954021, + "grad_norm": 1.0364521741867065, + "learning_rate": 2.0437382951730803e-05, + "loss": 2.8666, + "step": 29728 + }, + { + "epoch": 2.693393128128468, + "grad_norm": 0.9446043968200684, + "learning_rate": 2.043134175074005e-05, + "loss": 2.6002, + "step": 29729 + }, + { + "epoch": 2.693483726302915, + "grad_norm": 1.0363378524780273, + "learning_rate": 2.042530054974929e-05, + "loss": 2.8793, + "step": 29730 + }, + { + "epoch": 2.693574324477362, + "grad_norm": 0.9086282253265381, + "learning_rate": 2.0419259348758536e-05, + "loss": 2.0013, + "step": 29731 + }, + { + "epoch": 2.6936649226518083, + "grad_norm": 0.9607391953468323, + "learning_rate": 2.0413218147767776e-05, + "loss": 1.7407, + "step": 29732 + }, + { + "epoch": 2.6937555208262554, + "grad_norm": 1.004486083984375, + "learning_rate": 2.040717694677702e-05, + "loss": 2.5468, + "step": 29733 + }, + { + "epoch": 2.6938461190007024, + "grad_norm": 1.0406967401504517, + "learning_rate": 2.040113574578626e-05, + "loss": 2.5822, + "step": 29734 + }, + { + "epoch": 2.693936717175149, + "grad_norm": 0.9507969617843628, + "learning_rate": 2.0395094544795506e-05, + "loss": 2.5376, + "step": 29735 + }, + { + "epoch": 2.6940273153495955, + "grad_norm": 0.9823386073112488, + "learning_rate": 2.0389053343804747e-05, + "loss": 2.5054, + "step": 29736 + }, + { + "epoch": 2.6941179135240425, + "grad_norm": 1.0145443677902222, + "learning_rate": 2.0383012142813994e-05, + "loss": 2.5127, + "step": 29737 + }, + { + "epoch": 2.6942085116984895, + "grad_norm": 1.1683681011199951, + "learning_rate": 2.0376970941823235e-05, + "loss": 2.4525, + "step": 29738 + }, + { + "epoch": 2.694299109872936, + "grad_norm": 1.0543667078018188, + "learning_rate": 2.037092974083248e-05, + "loss": 2.6383, + "step": 29739 + }, + { + "epoch": 2.6943897080473826, + "grad_norm": 0.8958307504653931, + "learning_rate": 2.0364888539841723e-05, + "loss": 1.9192, + "step": 29740 + }, + { + "epoch": 2.6944803062218297, + "grad_norm": 0.8344907164573669, + "learning_rate": 2.0358847338850964e-05, + "loss": 2.0829, + "step": 29741 + }, + { + "epoch": 2.6945709043962767, + "grad_norm": 0.924370527267456, + "learning_rate": 2.035280613786021e-05, + "loss": 2.4862, + "step": 29742 + }, + { + "epoch": 2.6946615025707232, + "grad_norm": 1.0514675378799438, + "learning_rate": 2.034676493686945e-05, + "loss": 2.6025, + "step": 29743 + }, + { + "epoch": 2.69475210074517, + "grad_norm": 1.1349259614944458, + "learning_rate": 2.0340723735878693e-05, + "loss": 2.5068, + "step": 29744 + }, + { + "epoch": 2.694842698919617, + "grad_norm": 1.0162582397460938, + "learning_rate": 2.0334682534887938e-05, + "loss": 2.5937, + "step": 29745 + }, + { + "epoch": 2.694933297094064, + "grad_norm": 0.9260345101356506, + "learning_rate": 2.0328641333897182e-05, + "loss": 2.7984, + "step": 29746 + }, + { + "epoch": 2.6950238952685104, + "grad_norm": 0.984635591506958, + "learning_rate": 2.0322600132906423e-05, + "loss": 2.4962, + "step": 29747 + }, + { + "epoch": 2.695114493442957, + "grad_norm": 1.0035955905914307, + "learning_rate": 2.0316558931915667e-05, + "loss": 2.8263, + "step": 29748 + }, + { + "epoch": 2.695205091617404, + "grad_norm": 0.903089165687561, + "learning_rate": 2.0310517730924908e-05, + "loss": 1.9551, + "step": 29749 + }, + { + "epoch": 2.695295689791851, + "grad_norm": 0.9744765758514404, + "learning_rate": 2.0304476529934152e-05, + "loss": 2.6633, + "step": 29750 + }, + { + "epoch": 2.6953862879662975, + "grad_norm": 1.0519248247146606, + "learning_rate": 2.0298435328943393e-05, + "loss": 2.794, + "step": 29751 + }, + { + "epoch": 2.695476886140744, + "grad_norm": 1.0402469635009766, + "learning_rate": 2.0292394127952637e-05, + "loss": 2.6604, + "step": 29752 + }, + { + "epoch": 2.695567484315191, + "grad_norm": 0.9924309253692627, + "learning_rate": 2.028635292696188e-05, + "loss": 2.6577, + "step": 29753 + }, + { + "epoch": 2.6956580824896377, + "grad_norm": 0.9530954957008362, + "learning_rate": 2.0280311725971126e-05, + "loss": 2.6556, + "step": 29754 + }, + { + "epoch": 2.6957486806640847, + "grad_norm": 1.037682056427002, + "learning_rate": 2.027427052498037e-05, + "loss": 2.8803, + "step": 29755 + }, + { + "epoch": 2.6958392788385312, + "grad_norm": 1.025341510772705, + "learning_rate": 2.026822932398961e-05, + "loss": 2.83, + "step": 29756 + }, + { + "epoch": 2.6959298770129783, + "grad_norm": 1.0958731174468994, + "learning_rate": 2.0262188122998855e-05, + "loss": 2.2999, + "step": 29757 + }, + { + "epoch": 2.696020475187425, + "grad_norm": 1.0175954103469849, + "learning_rate": 2.0256146922008096e-05, + "loss": 2.3189, + "step": 29758 + }, + { + "epoch": 2.696111073361872, + "grad_norm": 1.0251694917678833, + "learning_rate": 2.025010572101734e-05, + "loss": 2.5608, + "step": 29759 + }, + { + "epoch": 2.6962016715363184, + "grad_norm": 1.0396621227264404, + "learning_rate": 2.024406452002658e-05, + "loss": 2.6172, + "step": 29760 + }, + { + "epoch": 2.6962922697107654, + "grad_norm": 0.9772084951400757, + "learning_rate": 2.0238023319035825e-05, + "loss": 2.6683, + "step": 29761 + }, + { + "epoch": 2.696382867885212, + "grad_norm": 0.9870798587799072, + "learning_rate": 2.023198211804507e-05, + "loss": 2.6105, + "step": 29762 + }, + { + "epoch": 2.696473466059659, + "grad_norm": 0.9537641406059265, + "learning_rate": 2.0225940917054313e-05, + "loss": 2.5143, + "step": 29763 + }, + { + "epoch": 2.6965640642341056, + "grad_norm": 0.941880464553833, + "learning_rate": 2.0219899716063554e-05, + "loss": 2.3767, + "step": 29764 + }, + { + "epoch": 2.6966546624085526, + "grad_norm": 0.9495505690574646, + "learning_rate": 2.02138585150728e-05, + "loss": 2.4919, + "step": 29765 + }, + { + "epoch": 2.696745260582999, + "grad_norm": 1.2292917966842651, + "learning_rate": 2.020781731408204e-05, + "loss": 2.5951, + "step": 29766 + }, + { + "epoch": 2.696835858757446, + "grad_norm": 0.994698166847229, + "learning_rate": 2.0201776113091283e-05, + "loss": 2.3963, + "step": 29767 + }, + { + "epoch": 2.6969264569318927, + "grad_norm": 0.8332136273384094, + "learning_rate": 2.0195734912100528e-05, + "loss": 2.0351, + "step": 29768 + }, + { + "epoch": 2.6970170551063397, + "grad_norm": 0.9395326375961304, + "learning_rate": 2.018969371110977e-05, + "loss": 2.3713, + "step": 29769 + }, + { + "epoch": 2.6971076532807863, + "grad_norm": 0.9787823557853699, + "learning_rate": 2.0183652510119013e-05, + "loss": 2.5739, + "step": 29770 + }, + { + "epoch": 2.6971982514552333, + "grad_norm": 0.9970622658729553, + "learning_rate": 2.0177611309128257e-05, + "loss": 2.5268, + "step": 29771 + }, + { + "epoch": 2.69728884962968, + "grad_norm": 1.0134197473526, + "learning_rate": 2.01715701081375e-05, + "loss": 2.757, + "step": 29772 + }, + { + "epoch": 2.697379447804127, + "grad_norm": 0.9852170348167419, + "learning_rate": 2.0165528907146742e-05, + "loss": 2.772, + "step": 29773 + }, + { + "epoch": 2.6974700459785734, + "grad_norm": 1.032658576965332, + "learning_rate": 2.0159487706155986e-05, + "loss": 2.5118, + "step": 29774 + }, + { + "epoch": 2.6975606441530204, + "grad_norm": 0.9689270257949829, + "learning_rate": 2.0153446505165227e-05, + "loss": 2.0503, + "step": 29775 + }, + { + "epoch": 2.697651242327467, + "grad_norm": 1.026226282119751, + "learning_rate": 2.014740530417447e-05, + "loss": 2.5856, + "step": 29776 + }, + { + "epoch": 2.697741840501914, + "grad_norm": 0.9900429844856262, + "learning_rate": 2.0141364103183712e-05, + "loss": 2.3822, + "step": 29777 + }, + { + "epoch": 2.6978324386763606, + "grad_norm": 1.0028607845306396, + "learning_rate": 2.0135322902192956e-05, + "loss": 2.4035, + "step": 29778 + }, + { + "epoch": 2.6979230368508076, + "grad_norm": 1.0640584230422974, + "learning_rate": 2.01292817012022e-05, + "loss": 2.6385, + "step": 29779 + }, + { + "epoch": 2.698013635025254, + "grad_norm": 0.993992805480957, + "learning_rate": 2.0123240500211445e-05, + "loss": 2.5575, + "step": 29780 + }, + { + "epoch": 2.698104233199701, + "grad_norm": 1.0507335662841797, + "learning_rate": 2.0117199299220685e-05, + "loss": 2.5475, + "step": 29781 + }, + { + "epoch": 2.6981948313741477, + "grad_norm": 0.9734666347503662, + "learning_rate": 2.011115809822993e-05, + "loss": 2.4206, + "step": 29782 + }, + { + "epoch": 2.6982854295485947, + "grad_norm": 1.0212757587432861, + "learning_rate": 2.0105116897239174e-05, + "loss": 2.5493, + "step": 29783 + }, + { + "epoch": 2.6983760277230413, + "grad_norm": 1.069698691368103, + "learning_rate": 2.0099075696248415e-05, + "loss": 2.6739, + "step": 29784 + }, + { + "epoch": 2.6984666258974883, + "grad_norm": 1.083485722541809, + "learning_rate": 2.009303449525766e-05, + "loss": 2.4531, + "step": 29785 + }, + { + "epoch": 2.698557224071935, + "grad_norm": 1.0093108415603638, + "learning_rate": 2.00869932942669e-05, + "loss": 2.4132, + "step": 29786 + }, + { + "epoch": 2.698647822246382, + "grad_norm": 1.045198917388916, + "learning_rate": 2.0080952093276144e-05, + "loss": 2.3011, + "step": 29787 + }, + { + "epoch": 2.6987384204208285, + "grad_norm": 1.0405876636505127, + "learning_rate": 2.0074910892285388e-05, + "loss": 2.6201, + "step": 29788 + }, + { + "epoch": 2.6988290185952755, + "grad_norm": 0.9892001152038574, + "learning_rate": 2.0068869691294632e-05, + "loss": 2.3219, + "step": 29789 + }, + { + "epoch": 2.698919616769722, + "grad_norm": 0.9833927154541016, + "learning_rate": 2.0062828490303873e-05, + "loss": 2.4615, + "step": 29790 + }, + { + "epoch": 2.699010214944169, + "grad_norm": 0.968481719493866, + "learning_rate": 2.0056787289313118e-05, + "loss": 2.6841, + "step": 29791 + }, + { + "epoch": 2.6991008131186156, + "grad_norm": 1.048549771308899, + "learning_rate": 2.005074608832236e-05, + "loss": 2.7177, + "step": 29792 + }, + { + "epoch": 2.6991914112930626, + "grad_norm": 0.93036288022995, + "learning_rate": 2.0044704887331603e-05, + "loss": 2.7399, + "step": 29793 + }, + { + "epoch": 2.699282009467509, + "grad_norm": 1.0771070718765259, + "learning_rate": 2.0038663686340843e-05, + "loss": 2.5498, + "step": 29794 + }, + { + "epoch": 2.6993726076419557, + "grad_norm": 0.9596938490867615, + "learning_rate": 2.0032622485350088e-05, + "loss": 2.7815, + "step": 29795 + }, + { + "epoch": 2.6994632058164028, + "grad_norm": 1.0800893306732178, + "learning_rate": 2.0026581284359332e-05, + "loss": 2.6223, + "step": 29796 + }, + { + "epoch": 2.6995538039908498, + "grad_norm": 0.9684368371963501, + "learning_rate": 2.0020540083368576e-05, + "loss": 2.6809, + "step": 29797 + }, + { + "epoch": 2.6996444021652963, + "grad_norm": 0.971270740032196, + "learning_rate": 2.001449888237782e-05, + "loss": 2.3398, + "step": 29798 + }, + { + "epoch": 2.699735000339743, + "grad_norm": 1.1052662134170532, + "learning_rate": 2.000845768138706e-05, + "loss": 2.4088, + "step": 29799 + }, + { + "epoch": 2.69982559851419, + "grad_norm": 0.9997886419296265, + "learning_rate": 2.0002416480396305e-05, + "loss": 2.5182, + "step": 29800 + }, + { + "epoch": 2.699916196688637, + "grad_norm": 0.7738324999809265, + "learning_rate": 1.9996375279405546e-05, + "loss": 1.3178, + "step": 29801 + }, + { + "epoch": 2.7000067948630835, + "grad_norm": 1.2301650047302246, + "learning_rate": 1.999033407841479e-05, + "loss": 2.5995, + "step": 29802 + }, + { + "epoch": 2.70009739303753, + "grad_norm": 1.018438458442688, + "learning_rate": 1.998429287742403e-05, + "loss": 2.345, + "step": 29803 + }, + { + "epoch": 2.700187991211977, + "grad_norm": 1.0656148195266724, + "learning_rate": 1.9978251676433275e-05, + "loss": 2.5585, + "step": 29804 + }, + { + "epoch": 2.700278589386424, + "grad_norm": 0.9719221591949463, + "learning_rate": 1.997221047544252e-05, + "loss": 2.6513, + "step": 29805 + }, + { + "epoch": 2.7003691875608706, + "grad_norm": 1.1062028408050537, + "learning_rate": 1.9966169274451764e-05, + "loss": 2.4573, + "step": 29806 + }, + { + "epoch": 2.700459785735317, + "grad_norm": 1.1795557737350464, + "learning_rate": 1.9960128073461005e-05, + "loss": 2.4244, + "step": 29807 + }, + { + "epoch": 2.700550383909764, + "grad_norm": 0.901666522026062, + "learning_rate": 1.995408687247025e-05, + "loss": 1.8414, + "step": 29808 + }, + { + "epoch": 2.700640982084211, + "grad_norm": 0.8886109590530396, + "learning_rate": 1.994804567147949e-05, + "loss": 2.1041, + "step": 29809 + }, + { + "epoch": 2.700731580258658, + "grad_norm": 1.0419977903366089, + "learning_rate": 1.9942004470488734e-05, + "loss": 2.7381, + "step": 29810 + }, + { + "epoch": 2.7008221784331043, + "grad_norm": 1.0335158109664917, + "learning_rate": 1.9935963269497975e-05, + "loss": 2.672, + "step": 29811 + }, + { + "epoch": 2.7009127766075514, + "grad_norm": 0.820887565612793, + "learning_rate": 1.992992206850722e-05, + "loss": 1.9125, + "step": 29812 + }, + { + "epoch": 2.7010033747819984, + "grad_norm": 0.8454146981239319, + "learning_rate": 1.9923880867516463e-05, + "loss": 2.0878, + "step": 29813 + }, + { + "epoch": 2.701093972956445, + "grad_norm": 1.0667017698287964, + "learning_rate": 1.9917839666525707e-05, + "loss": 2.8154, + "step": 29814 + }, + { + "epoch": 2.7011845711308915, + "grad_norm": 0.9980435371398926, + "learning_rate": 1.991179846553495e-05, + "loss": 2.6082, + "step": 29815 + }, + { + "epoch": 2.7012751693053385, + "grad_norm": 0.8382897973060608, + "learning_rate": 1.9905757264544192e-05, + "loss": 1.892, + "step": 29816 + }, + { + "epoch": 2.7013657674797855, + "grad_norm": 0.9603105187416077, + "learning_rate": 1.9899716063553437e-05, + "loss": 2.8301, + "step": 29817 + }, + { + "epoch": 2.701456365654232, + "grad_norm": 1.0635346174240112, + "learning_rate": 1.9893674862562678e-05, + "loss": 2.7375, + "step": 29818 + }, + { + "epoch": 2.7015469638286786, + "grad_norm": 1.0080870389938354, + "learning_rate": 1.9887633661571922e-05, + "loss": 2.4536, + "step": 29819 + }, + { + "epoch": 2.7016375620031257, + "grad_norm": 0.9883286952972412, + "learning_rate": 1.9881592460581163e-05, + "loss": 2.5438, + "step": 29820 + }, + { + "epoch": 2.7017281601775727, + "grad_norm": 0.9736708998680115, + "learning_rate": 1.9875551259590407e-05, + "loss": 2.4944, + "step": 29821 + }, + { + "epoch": 2.7018187583520192, + "grad_norm": 0.9620993137359619, + "learning_rate": 1.986951005859965e-05, + "loss": 2.412, + "step": 29822 + }, + { + "epoch": 2.701909356526466, + "grad_norm": 0.9828398823738098, + "learning_rate": 1.9863468857608895e-05, + "loss": 2.5573, + "step": 29823 + }, + { + "epoch": 2.701999954700913, + "grad_norm": 0.9492015838623047, + "learning_rate": 1.9857427656618136e-05, + "loss": 2.701, + "step": 29824 + }, + { + "epoch": 2.70209055287536, + "grad_norm": 0.855679988861084, + "learning_rate": 1.985138645562738e-05, + "loss": 1.8521, + "step": 29825 + }, + { + "epoch": 2.7021811510498064, + "grad_norm": 0.8783889412879944, + "learning_rate": 1.984534525463662e-05, + "loss": 2.5169, + "step": 29826 + }, + { + "epoch": 2.702271749224253, + "grad_norm": 0.9074050188064575, + "learning_rate": 1.9839304053645865e-05, + "loss": 2.1212, + "step": 29827 + }, + { + "epoch": 2.7023623473987, + "grad_norm": 1.0088996887207031, + "learning_rate": 1.983326285265511e-05, + "loss": 2.659, + "step": 29828 + }, + { + "epoch": 2.702452945573147, + "grad_norm": 1.1134461164474487, + "learning_rate": 1.982722165166435e-05, + "loss": 2.3624, + "step": 29829 + }, + { + "epoch": 2.7025435437475935, + "grad_norm": 1.039231777191162, + "learning_rate": 1.9821180450673595e-05, + "loss": 2.5676, + "step": 29830 + }, + { + "epoch": 2.70263414192204, + "grad_norm": 1.0682311058044434, + "learning_rate": 1.981513924968284e-05, + "loss": 2.6579, + "step": 29831 + }, + { + "epoch": 2.702724740096487, + "grad_norm": 0.9432011842727661, + "learning_rate": 1.9809098048692083e-05, + "loss": 2.4652, + "step": 29832 + }, + { + "epoch": 2.7028153382709337, + "grad_norm": 1.0306144952774048, + "learning_rate": 1.9803056847701324e-05, + "loss": 2.5735, + "step": 29833 + }, + { + "epoch": 2.7029059364453807, + "grad_norm": 1.0993577241897583, + "learning_rate": 1.9797015646710568e-05, + "loss": 2.673, + "step": 29834 + }, + { + "epoch": 2.7029965346198273, + "grad_norm": 0.996411919593811, + "learning_rate": 1.979097444571981e-05, + "loss": 2.7093, + "step": 29835 + }, + { + "epoch": 2.7030871327942743, + "grad_norm": 0.9663637280464172, + "learning_rate": 1.9784933244729053e-05, + "loss": 2.6846, + "step": 29836 + }, + { + "epoch": 2.703177730968721, + "grad_norm": 0.9406023621559143, + "learning_rate": 1.9778892043738294e-05, + "loss": 2.0973, + "step": 29837 + }, + { + "epoch": 2.703268329143168, + "grad_norm": 1.1277414560317993, + "learning_rate": 1.9772850842747538e-05, + "loss": 2.5127, + "step": 29838 + }, + { + "epoch": 2.7033589273176144, + "grad_norm": 1.1344658136367798, + "learning_rate": 1.9766809641756782e-05, + "loss": 2.5942, + "step": 29839 + }, + { + "epoch": 2.7034495254920614, + "grad_norm": 0.983563244342804, + "learning_rate": 1.9760768440766027e-05, + "loss": 2.6893, + "step": 29840 + }, + { + "epoch": 2.703540123666508, + "grad_norm": 0.9526764154434204, + "learning_rate": 1.9754727239775267e-05, + "loss": 2.5684, + "step": 29841 + }, + { + "epoch": 2.703630721840955, + "grad_norm": 1.1483445167541504, + "learning_rate": 1.974868603878451e-05, + "loss": 2.4492, + "step": 29842 + }, + { + "epoch": 2.7037213200154016, + "grad_norm": 1.0102589130401611, + "learning_rate": 1.9742644837793756e-05, + "loss": 2.5717, + "step": 29843 + }, + { + "epoch": 2.7038119181898486, + "grad_norm": 0.9814205765724182, + "learning_rate": 1.9736603636802997e-05, + "loss": 2.6346, + "step": 29844 + }, + { + "epoch": 2.703902516364295, + "grad_norm": 1.0073256492614746, + "learning_rate": 1.973056243581224e-05, + "loss": 2.85, + "step": 29845 + }, + { + "epoch": 2.703993114538742, + "grad_norm": 0.9674273729324341, + "learning_rate": 1.9724521234821482e-05, + "loss": 2.7453, + "step": 29846 + }, + { + "epoch": 2.7040837127131887, + "grad_norm": 1.1301164627075195, + "learning_rate": 1.9718480033830726e-05, + "loss": 2.6718, + "step": 29847 + }, + { + "epoch": 2.7041743108876357, + "grad_norm": 1.0418987274169922, + "learning_rate": 1.971243883283997e-05, + "loss": 2.9206, + "step": 29848 + }, + { + "epoch": 2.7042649090620823, + "grad_norm": 1.0286020040512085, + "learning_rate": 1.9706397631849214e-05, + "loss": 2.4902, + "step": 29849 + }, + { + "epoch": 2.7043555072365293, + "grad_norm": 1.0421135425567627, + "learning_rate": 1.9700356430858455e-05, + "loss": 2.7382, + "step": 29850 + }, + { + "epoch": 2.704446105410976, + "grad_norm": 1.0083495378494263, + "learning_rate": 1.96943152298677e-05, + "loss": 2.7439, + "step": 29851 + }, + { + "epoch": 2.704536703585423, + "grad_norm": 1.0503252744674683, + "learning_rate": 1.968827402887694e-05, + "loss": 2.6494, + "step": 29852 + }, + { + "epoch": 2.7046273017598694, + "grad_norm": 0.9929784536361694, + "learning_rate": 1.9682232827886184e-05, + "loss": 2.7134, + "step": 29853 + }, + { + "epoch": 2.7047178999343164, + "grad_norm": 1.0248712301254272, + "learning_rate": 1.9676191626895425e-05, + "loss": 2.7919, + "step": 29854 + }, + { + "epoch": 2.704808498108763, + "grad_norm": 1.010488510131836, + "learning_rate": 1.967015042590467e-05, + "loss": 2.7776, + "step": 29855 + }, + { + "epoch": 2.70489909628321, + "grad_norm": 0.8660452961921692, + "learning_rate": 1.9664109224913914e-05, + "loss": 2.0401, + "step": 29856 + }, + { + "epoch": 2.7049896944576566, + "grad_norm": 1.1037778854370117, + "learning_rate": 1.9658068023923158e-05, + "loss": 2.6226, + "step": 29857 + }, + { + "epoch": 2.7050802926321036, + "grad_norm": 1.0766316652297974, + "learning_rate": 1.9652026822932402e-05, + "loss": 2.659, + "step": 29858 + }, + { + "epoch": 2.70517089080655, + "grad_norm": 0.9471074938774109, + "learning_rate": 1.9645985621941643e-05, + "loss": 2.1201, + "step": 29859 + }, + { + "epoch": 2.705261488980997, + "grad_norm": 0.9981778860092163, + "learning_rate": 1.9639944420950887e-05, + "loss": 2.911, + "step": 29860 + }, + { + "epoch": 2.7053520871554437, + "grad_norm": 0.9993799924850464, + "learning_rate": 1.9633903219960128e-05, + "loss": 2.3766, + "step": 29861 + }, + { + "epoch": 2.7054426853298907, + "grad_norm": 1.0992059707641602, + "learning_rate": 1.9627862018969372e-05, + "loss": 2.3001, + "step": 29862 + }, + { + "epoch": 2.7055332835043373, + "grad_norm": 1.0471662282943726, + "learning_rate": 1.9621820817978613e-05, + "loss": 2.6617, + "step": 29863 + }, + { + "epoch": 2.7056238816787843, + "grad_norm": 0.9751003384590149, + "learning_rate": 1.9615779616987857e-05, + "loss": 2.65, + "step": 29864 + }, + { + "epoch": 2.705714479853231, + "grad_norm": 1.005250334739685, + "learning_rate": 1.96097384159971e-05, + "loss": 2.6223, + "step": 29865 + }, + { + "epoch": 2.705805078027678, + "grad_norm": 1.0321413278579712, + "learning_rate": 1.9603697215006346e-05, + "loss": 2.46, + "step": 29866 + }, + { + "epoch": 2.7058956762021245, + "grad_norm": 1.0098669528961182, + "learning_rate": 1.9597656014015587e-05, + "loss": 2.6495, + "step": 29867 + }, + { + "epoch": 2.7059862743765715, + "grad_norm": 0.9699143171310425, + "learning_rate": 1.959161481302483e-05, + "loss": 1.8509, + "step": 29868 + }, + { + "epoch": 2.706076872551018, + "grad_norm": 1.1093701124191284, + "learning_rate": 1.958557361203407e-05, + "loss": 2.2732, + "step": 29869 + }, + { + "epoch": 2.706167470725465, + "grad_norm": 1.054879069328308, + "learning_rate": 1.9579532411043316e-05, + "loss": 2.7496, + "step": 29870 + }, + { + "epoch": 2.7062580688999116, + "grad_norm": 0.9932588338851929, + "learning_rate": 1.9573491210052557e-05, + "loss": 2.5742, + "step": 29871 + }, + { + "epoch": 2.7063486670743586, + "grad_norm": 1.0048737525939941, + "learning_rate": 1.95674500090618e-05, + "loss": 2.592, + "step": 29872 + }, + { + "epoch": 2.706439265248805, + "grad_norm": 1.0323665142059326, + "learning_rate": 1.9561408808071045e-05, + "loss": 2.4382, + "step": 29873 + }, + { + "epoch": 2.7065298634232517, + "grad_norm": 0.9815744161605835, + "learning_rate": 1.955536760708029e-05, + "loss": 2.7768, + "step": 29874 + }, + { + "epoch": 2.7066204615976988, + "grad_norm": 1.056721568107605, + "learning_rate": 1.9549326406089534e-05, + "loss": 2.8938, + "step": 29875 + }, + { + "epoch": 2.7067110597721458, + "grad_norm": 1.0447895526885986, + "learning_rate": 1.9543285205098774e-05, + "loss": 2.7725, + "step": 29876 + }, + { + "epoch": 2.7068016579465923, + "grad_norm": 0.9695820212364197, + "learning_rate": 1.953724400410802e-05, + "loss": 2.5255, + "step": 29877 + }, + { + "epoch": 2.706892256121039, + "grad_norm": 0.9638631343841553, + "learning_rate": 1.953120280311726e-05, + "loss": 2.69, + "step": 29878 + }, + { + "epoch": 2.706982854295486, + "grad_norm": 1.033952236175537, + "learning_rate": 1.9525161602126504e-05, + "loss": 2.6553, + "step": 29879 + }, + { + "epoch": 2.707073452469933, + "grad_norm": 1.103268027305603, + "learning_rate": 1.9519120401135744e-05, + "loss": 2.7302, + "step": 29880 + }, + { + "epoch": 2.7071640506443795, + "grad_norm": 0.9509903788566589, + "learning_rate": 1.951307920014499e-05, + "loss": 2.6566, + "step": 29881 + }, + { + "epoch": 2.707254648818826, + "grad_norm": 0.9688193798065186, + "learning_rate": 1.9507037999154233e-05, + "loss": 2.553, + "step": 29882 + }, + { + "epoch": 2.707345246993273, + "grad_norm": 1.170229196548462, + "learning_rate": 1.9500996798163477e-05, + "loss": 2.5616, + "step": 29883 + }, + { + "epoch": 2.70743584516772, + "grad_norm": 0.9794600605964661, + "learning_rate": 1.9494955597172718e-05, + "loss": 2.6305, + "step": 29884 + }, + { + "epoch": 2.7075264433421666, + "grad_norm": 0.9156389832496643, + "learning_rate": 1.9488914396181962e-05, + "loss": 1.997, + "step": 29885 + }, + { + "epoch": 2.707617041516613, + "grad_norm": 0.9550330638885498, + "learning_rate": 1.9482873195191203e-05, + "loss": 2.6559, + "step": 29886 + }, + { + "epoch": 2.70770763969106, + "grad_norm": 0.9754215478897095, + "learning_rate": 1.9476831994200447e-05, + "loss": 2.1314, + "step": 29887 + }, + { + "epoch": 2.707798237865507, + "grad_norm": 0.94338458776474, + "learning_rate": 1.947079079320969e-05, + "loss": 2.6495, + "step": 29888 + }, + { + "epoch": 2.707888836039954, + "grad_norm": 0.9356971979141235, + "learning_rate": 1.9464749592218932e-05, + "loss": 2.5355, + "step": 29889 + }, + { + "epoch": 2.7079794342144003, + "grad_norm": 0.860453188419342, + "learning_rate": 1.9458708391228176e-05, + "loss": 1.8162, + "step": 29890 + }, + { + "epoch": 2.7080700323888474, + "grad_norm": 0.8781916499137878, + "learning_rate": 1.945266719023742e-05, + "loss": 1.7967, + "step": 29891 + }, + { + "epoch": 2.7081606305632944, + "grad_norm": 1.017090082168579, + "learning_rate": 1.9446625989246665e-05, + "loss": 2.8306, + "step": 29892 + }, + { + "epoch": 2.708251228737741, + "grad_norm": 0.9640507102012634, + "learning_rate": 1.9440584788255906e-05, + "loss": 2.7258, + "step": 29893 + }, + { + "epoch": 2.7083418269121875, + "grad_norm": 1.0356374979019165, + "learning_rate": 1.943454358726515e-05, + "loss": 2.7741, + "step": 29894 + }, + { + "epoch": 2.7084324250866345, + "grad_norm": 0.8262844085693359, + "learning_rate": 1.942850238627439e-05, + "loss": 1.9572, + "step": 29895 + }, + { + "epoch": 2.7085230232610815, + "grad_norm": 0.9887843132019043, + "learning_rate": 1.9422461185283635e-05, + "loss": 2.3351, + "step": 29896 + }, + { + "epoch": 2.708613621435528, + "grad_norm": 0.9865402579307556, + "learning_rate": 1.9416419984292876e-05, + "loss": 2.6519, + "step": 29897 + }, + { + "epoch": 2.7087042196099747, + "grad_norm": 0.9968584775924683, + "learning_rate": 1.941037878330212e-05, + "loss": 2.4614, + "step": 29898 + }, + { + "epoch": 2.7087948177844217, + "grad_norm": 0.9439123272895813, + "learning_rate": 1.9404337582311364e-05, + "loss": 2.4778, + "step": 29899 + }, + { + "epoch": 2.7088854159588687, + "grad_norm": 0.971246600151062, + "learning_rate": 1.939829638132061e-05, + "loss": 2.6004, + "step": 29900 + }, + { + "epoch": 2.7089760141333152, + "grad_norm": 1.210420846939087, + "learning_rate": 1.9392255180329853e-05, + "loss": 2.6239, + "step": 29901 + }, + { + "epoch": 2.709066612307762, + "grad_norm": 1.1058279275894165, + "learning_rate": 1.9386213979339094e-05, + "loss": 2.524, + "step": 29902 + }, + { + "epoch": 2.709157210482209, + "grad_norm": 0.9650577306747437, + "learning_rate": 1.9380172778348338e-05, + "loss": 2.6704, + "step": 29903 + }, + { + "epoch": 2.709247808656656, + "grad_norm": 0.9982506632804871, + "learning_rate": 1.937413157735758e-05, + "loss": 2.6648, + "step": 29904 + }, + { + "epoch": 2.7093384068311024, + "grad_norm": 1.0649796724319458, + "learning_rate": 1.9368090376366823e-05, + "loss": 2.6292, + "step": 29905 + }, + { + "epoch": 2.709429005005549, + "grad_norm": 0.9217544198036194, + "learning_rate": 1.9362049175376064e-05, + "loss": 2.2167, + "step": 29906 + }, + { + "epoch": 2.709519603179996, + "grad_norm": 0.9707314372062683, + "learning_rate": 1.935600797438531e-05, + "loss": 2.5333, + "step": 29907 + }, + { + "epoch": 2.709610201354443, + "grad_norm": 0.995161235332489, + "learning_rate": 1.9349966773394552e-05, + "loss": 2.5271, + "step": 29908 + }, + { + "epoch": 2.7097007995288895, + "grad_norm": 0.9670697450637817, + "learning_rate": 1.9343925572403796e-05, + "loss": 2.4058, + "step": 29909 + }, + { + "epoch": 2.709791397703336, + "grad_norm": 0.9454313516616821, + "learning_rate": 1.9337884371413037e-05, + "loss": 2.3972, + "step": 29910 + }, + { + "epoch": 2.709881995877783, + "grad_norm": 0.8410923480987549, + "learning_rate": 1.933184317042228e-05, + "loss": 1.9241, + "step": 29911 + }, + { + "epoch": 2.70997259405223, + "grad_norm": 1.0075404644012451, + "learning_rate": 1.9325801969431522e-05, + "loss": 2.571, + "step": 29912 + }, + { + "epoch": 2.7100631922266767, + "grad_norm": 1.0667481422424316, + "learning_rate": 1.9319760768440766e-05, + "loss": 2.8648, + "step": 29913 + }, + { + "epoch": 2.7101537904011233, + "grad_norm": 1.097699761390686, + "learning_rate": 1.9313719567450007e-05, + "loss": 2.8057, + "step": 29914 + }, + { + "epoch": 2.7102443885755703, + "grad_norm": 1.0339393615722656, + "learning_rate": 1.930767836645925e-05, + "loss": 2.8088, + "step": 29915 + }, + { + "epoch": 2.710334986750017, + "grad_norm": 1.0559041500091553, + "learning_rate": 1.93016371654685e-05, + "loss": 2.5568, + "step": 29916 + }, + { + "epoch": 2.710425584924464, + "grad_norm": 0.8369147181510925, + "learning_rate": 1.929559596447774e-05, + "loss": 1.9187, + "step": 29917 + }, + { + "epoch": 2.7105161830989104, + "grad_norm": 0.9630129337310791, + "learning_rate": 1.9289554763486984e-05, + "loss": 2.4896, + "step": 29918 + }, + { + "epoch": 2.7106067812733574, + "grad_norm": 0.9862433075904846, + "learning_rate": 1.9283513562496225e-05, + "loss": 2.3241, + "step": 29919 + }, + { + "epoch": 2.710697379447804, + "grad_norm": 1.038035273551941, + "learning_rate": 1.927747236150547e-05, + "loss": 2.7878, + "step": 29920 + }, + { + "epoch": 2.710787977622251, + "grad_norm": 1.0121344327926636, + "learning_rate": 1.927143116051471e-05, + "loss": 2.7303, + "step": 29921 + }, + { + "epoch": 2.7108785757966976, + "grad_norm": 1.2120181322097778, + "learning_rate": 1.9265389959523954e-05, + "loss": 2.6506, + "step": 29922 + }, + { + "epoch": 2.7109691739711446, + "grad_norm": 0.8920870423316956, + "learning_rate": 1.9259348758533195e-05, + "loss": 1.8914, + "step": 29923 + }, + { + "epoch": 2.711059772145591, + "grad_norm": 1.0343976020812988, + "learning_rate": 1.9253307557542443e-05, + "loss": 2.5828, + "step": 29924 + }, + { + "epoch": 2.711150370320038, + "grad_norm": 0.9889903664588928, + "learning_rate": 1.9247266356551683e-05, + "loss": 2.7446, + "step": 29925 + }, + { + "epoch": 2.7112409684944847, + "grad_norm": 0.9855011701583862, + "learning_rate": 1.9241225155560928e-05, + "loss": 1.8964, + "step": 29926 + }, + { + "epoch": 2.7113315666689317, + "grad_norm": 1.1344295740127563, + "learning_rate": 1.923518395457017e-05, + "loss": 2.4836, + "step": 29927 + }, + { + "epoch": 2.7114221648433783, + "grad_norm": 1.011059284210205, + "learning_rate": 1.9229142753579413e-05, + "loss": 2.602, + "step": 29928 + }, + { + "epoch": 2.7115127630178253, + "grad_norm": 0.9954521059989929, + "learning_rate": 1.9223101552588654e-05, + "loss": 2.7811, + "step": 29929 + }, + { + "epoch": 2.711603361192272, + "grad_norm": 1.0331761837005615, + "learning_rate": 1.9217060351597898e-05, + "loss": 2.651, + "step": 29930 + }, + { + "epoch": 2.711693959366719, + "grad_norm": 0.9880407452583313, + "learning_rate": 1.9211019150607142e-05, + "loss": 2.4828, + "step": 29931 + }, + { + "epoch": 2.7117845575411654, + "grad_norm": 1.231954574584961, + "learning_rate": 1.9204977949616386e-05, + "loss": 2.4586, + "step": 29932 + }, + { + "epoch": 2.7118751557156124, + "grad_norm": 0.9762619137763977, + "learning_rate": 1.919893674862563e-05, + "loss": 2.354, + "step": 29933 + }, + { + "epoch": 2.711965753890059, + "grad_norm": 0.9688543081283569, + "learning_rate": 1.919289554763487e-05, + "loss": 2.5458, + "step": 29934 + }, + { + "epoch": 2.712056352064506, + "grad_norm": 0.9958875179290771, + "learning_rate": 1.9186854346644115e-05, + "loss": 2.4713, + "step": 29935 + }, + { + "epoch": 2.7121469502389526, + "grad_norm": 0.9993278980255127, + "learning_rate": 1.9180813145653356e-05, + "loss": 2.4868, + "step": 29936 + }, + { + "epoch": 2.7122375484133996, + "grad_norm": 0.9616888761520386, + "learning_rate": 1.91747719446626e-05, + "loss": 2.8028, + "step": 29937 + }, + { + "epoch": 2.712328146587846, + "grad_norm": 1.0339726209640503, + "learning_rate": 1.916873074367184e-05, + "loss": 2.5533, + "step": 29938 + }, + { + "epoch": 2.712418744762293, + "grad_norm": 1.0313396453857422, + "learning_rate": 1.9162689542681086e-05, + "loss": 2.4087, + "step": 29939 + }, + { + "epoch": 2.7125093429367397, + "grad_norm": 1.0857338905334473, + "learning_rate": 1.915664834169033e-05, + "loss": 2.5166, + "step": 29940 + }, + { + "epoch": 2.7125999411111867, + "grad_norm": 1.0238924026489258, + "learning_rate": 1.9150607140699574e-05, + "loss": 2.6944, + "step": 29941 + }, + { + "epoch": 2.7126905392856333, + "grad_norm": 0.9583104252815247, + "learning_rate": 1.9144565939708815e-05, + "loss": 2.4728, + "step": 29942 + }, + { + "epoch": 2.7127811374600803, + "grad_norm": 1.1570144891738892, + "learning_rate": 1.913852473871806e-05, + "loss": 2.5151, + "step": 29943 + }, + { + "epoch": 2.712871735634527, + "grad_norm": 0.9810187816619873, + "learning_rate": 1.91324835377273e-05, + "loss": 2.7783, + "step": 29944 + }, + { + "epoch": 2.712962333808974, + "grad_norm": 1.0452542304992676, + "learning_rate": 1.9126442336736544e-05, + "loss": 2.6372, + "step": 29945 + }, + { + "epoch": 2.7130529319834205, + "grad_norm": 0.9653630256652832, + "learning_rate": 1.9120401135745788e-05, + "loss": 2.3222, + "step": 29946 + }, + { + "epoch": 2.7131435301578675, + "grad_norm": 0.96047043800354, + "learning_rate": 1.911435993475503e-05, + "loss": 2.5237, + "step": 29947 + }, + { + "epoch": 2.713234128332314, + "grad_norm": 0.9643552303314209, + "learning_rate": 1.9108318733764273e-05, + "loss": 2.6966, + "step": 29948 + }, + { + "epoch": 2.713324726506761, + "grad_norm": 0.9538619518280029, + "learning_rate": 1.9102277532773518e-05, + "loss": 2.6076, + "step": 29949 + }, + { + "epoch": 2.7134153246812076, + "grad_norm": 1.0402246713638306, + "learning_rate": 1.9096236331782762e-05, + "loss": 2.6285, + "step": 29950 + }, + { + "epoch": 2.7135059228556546, + "grad_norm": 0.9866176843643188, + "learning_rate": 1.9090195130792003e-05, + "loss": 2.7142, + "step": 29951 + }, + { + "epoch": 2.713596521030101, + "grad_norm": 1.0440937280654907, + "learning_rate": 1.9084153929801247e-05, + "loss": 2.7259, + "step": 29952 + }, + { + "epoch": 2.713687119204548, + "grad_norm": 1.1404703855514526, + "learning_rate": 1.9078112728810488e-05, + "loss": 2.7915, + "step": 29953 + }, + { + "epoch": 2.7137777173789948, + "grad_norm": 1.018790364265442, + "learning_rate": 1.9072071527819732e-05, + "loss": 2.5622, + "step": 29954 + }, + { + "epoch": 2.7138683155534418, + "grad_norm": 1.0036592483520508, + "learning_rate": 1.9066030326828973e-05, + "loss": 2.5708, + "step": 29955 + }, + { + "epoch": 2.7139589137278883, + "grad_norm": 0.9573577642440796, + "learning_rate": 1.9059989125838217e-05, + "loss": 2.6612, + "step": 29956 + }, + { + "epoch": 2.714049511902335, + "grad_norm": 1.0239818096160889, + "learning_rate": 1.905394792484746e-05, + "loss": 2.5944, + "step": 29957 + }, + { + "epoch": 2.714140110076782, + "grad_norm": 1.103481650352478, + "learning_rate": 1.9047906723856705e-05, + "loss": 2.5663, + "step": 29958 + }, + { + "epoch": 2.714230708251229, + "grad_norm": 1.0170060396194458, + "learning_rate": 1.9041865522865946e-05, + "loss": 2.5655, + "step": 29959 + }, + { + "epoch": 2.7143213064256755, + "grad_norm": 0.958075761795044, + "learning_rate": 1.903582432187519e-05, + "loss": 2.4705, + "step": 29960 + }, + { + "epoch": 2.714411904600122, + "grad_norm": 1.0568147897720337, + "learning_rate": 1.9029783120884435e-05, + "loss": 2.8715, + "step": 29961 + }, + { + "epoch": 2.714502502774569, + "grad_norm": 0.9771047234535217, + "learning_rate": 1.9023741919893675e-05, + "loss": 2.5398, + "step": 29962 + }, + { + "epoch": 2.714593100949016, + "grad_norm": 0.9804059863090515, + "learning_rate": 1.901770071890292e-05, + "loss": 2.4124, + "step": 29963 + }, + { + "epoch": 2.7146836991234626, + "grad_norm": 0.8113042712211609, + "learning_rate": 1.901165951791216e-05, + "loss": 1.7927, + "step": 29964 + }, + { + "epoch": 2.714774297297909, + "grad_norm": 0.9829920530319214, + "learning_rate": 1.9005618316921405e-05, + "loss": 2.6302, + "step": 29965 + }, + { + "epoch": 2.714864895472356, + "grad_norm": 1.0669485330581665, + "learning_rate": 1.899957711593065e-05, + "loss": 2.7557, + "step": 29966 + }, + { + "epoch": 2.714955493646803, + "grad_norm": 1.0017739534378052, + "learning_rate": 1.8993535914939893e-05, + "loss": 2.6243, + "step": 29967 + }, + { + "epoch": 2.71504609182125, + "grad_norm": 1.0926456451416016, + "learning_rate": 1.8987494713949134e-05, + "loss": 2.8972, + "step": 29968 + }, + { + "epoch": 2.7151366899956964, + "grad_norm": 1.0495531558990479, + "learning_rate": 1.8981453512958378e-05, + "loss": 2.3847, + "step": 29969 + }, + { + "epoch": 2.7152272881701434, + "grad_norm": 1.040475606918335, + "learning_rate": 1.897541231196762e-05, + "loss": 2.5951, + "step": 29970 + }, + { + "epoch": 2.7153178863445904, + "grad_norm": 0.9452334046363831, + "learning_rate": 1.8969371110976863e-05, + "loss": 2.6737, + "step": 29971 + }, + { + "epoch": 2.715408484519037, + "grad_norm": 1.0005958080291748, + "learning_rate": 1.8963329909986104e-05, + "loss": 2.7769, + "step": 29972 + }, + { + "epoch": 2.7154990826934835, + "grad_norm": 1.081061601638794, + "learning_rate": 1.8957288708995348e-05, + "loss": 2.6818, + "step": 29973 + }, + { + "epoch": 2.7155896808679305, + "grad_norm": 1.0445022583007812, + "learning_rate": 1.8951247508004592e-05, + "loss": 2.6231, + "step": 29974 + }, + { + "epoch": 2.7156802790423775, + "grad_norm": 1.0341289043426514, + "learning_rate": 1.8945206307013837e-05, + "loss": 2.6823, + "step": 29975 + }, + { + "epoch": 2.715770877216824, + "grad_norm": 1.0654103755950928, + "learning_rate": 1.893916510602308e-05, + "loss": 2.7301, + "step": 29976 + }, + { + "epoch": 2.7158614753912707, + "grad_norm": 1.034978985786438, + "learning_rate": 1.8933123905032322e-05, + "loss": 2.4721, + "step": 29977 + }, + { + "epoch": 2.7159520735657177, + "grad_norm": 1.0264045000076294, + "learning_rate": 1.8927082704041566e-05, + "loss": 2.6088, + "step": 29978 + }, + { + "epoch": 2.7160426717401647, + "grad_norm": 0.9503660202026367, + "learning_rate": 1.8921041503050807e-05, + "loss": 2.6001, + "step": 29979 + }, + { + "epoch": 2.7161332699146112, + "grad_norm": 0.9832233786582947, + "learning_rate": 1.891500030206005e-05, + "loss": 2.6935, + "step": 29980 + }, + { + "epoch": 2.716223868089058, + "grad_norm": 1.0060112476348877, + "learning_rate": 1.8908959101069292e-05, + "loss": 2.602, + "step": 29981 + }, + { + "epoch": 2.716314466263505, + "grad_norm": 1.0087429285049438, + "learning_rate": 1.8902917900078536e-05, + "loss": 2.7843, + "step": 29982 + }, + { + "epoch": 2.716405064437952, + "grad_norm": 1.0538153648376465, + "learning_rate": 1.889687669908778e-05, + "loss": 2.915, + "step": 29983 + }, + { + "epoch": 2.7164956626123984, + "grad_norm": 1.0058588981628418, + "learning_rate": 1.8890835498097025e-05, + "loss": 2.6238, + "step": 29984 + }, + { + "epoch": 2.716586260786845, + "grad_norm": 1.0431435108184814, + "learning_rate": 1.8884794297106265e-05, + "loss": 1.8643, + "step": 29985 + }, + { + "epoch": 2.716676858961292, + "grad_norm": 1.02146315574646, + "learning_rate": 1.887875309611551e-05, + "loss": 2.6921, + "step": 29986 + }, + { + "epoch": 2.716767457135739, + "grad_norm": 0.9959051012992859, + "learning_rate": 1.887271189512475e-05, + "loss": 2.8707, + "step": 29987 + }, + { + "epoch": 2.7168580553101855, + "grad_norm": 0.9944896697998047, + "learning_rate": 1.8866670694133995e-05, + "loss": 2.5124, + "step": 29988 + }, + { + "epoch": 2.716948653484632, + "grad_norm": 0.8416748046875, + "learning_rate": 1.8860629493143235e-05, + "loss": 2.158, + "step": 29989 + }, + { + "epoch": 2.717039251659079, + "grad_norm": 1.04457426071167, + "learning_rate": 1.885458829215248e-05, + "loss": 2.5971, + "step": 29990 + }, + { + "epoch": 2.717129849833526, + "grad_norm": 0.9383974075317383, + "learning_rate": 1.8848547091161724e-05, + "loss": 2.5202, + "step": 29991 + }, + { + "epoch": 2.7172204480079727, + "grad_norm": 0.8522913455963135, + "learning_rate": 1.8842505890170968e-05, + "loss": 1.7953, + "step": 29992 + }, + { + "epoch": 2.7173110461824193, + "grad_norm": 1.1740964651107788, + "learning_rate": 1.8836464689180212e-05, + "loss": 2.4064, + "step": 29993 + }, + { + "epoch": 2.7174016443568663, + "grad_norm": 1.056929111480713, + "learning_rate": 1.8830423488189453e-05, + "loss": 2.6825, + "step": 29994 + }, + { + "epoch": 2.717492242531313, + "grad_norm": 1.0558503866195679, + "learning_rate": 1.8824382287198697e-05, + "loss": 2.5547, + "step": 29995 + }, + { + "epoch": 2.71758284070576, + "grad_norm": 1.011692762374878, + "learning_rate": 1.8818341086207938e-05, + "loss": 2.6027, + "step": 29996 + }, + { + "epoch": 2.7176734388802064, + "grad_norm": 0.9632221460342407, + "learning_rate": 1.8812299885217182e-05, + "loss": 2.444, + "step": 29997 + }, + { + "epoch": 2.7177640370546534, + "grad_norm": 0.976111888885498, + "learning_rate": 1.8806258684226423e-05, + "loss": 2.6753, + "step": 29998 + }, + { + "epoch": 2.7178546352291, + "grad_norm": 0.7194995880126953, + "learning_rate": 1.8800217483235667e-05, + "loss": 1.4279, + "step": 29999 + }, + { + "epoch": 2.717945233403547, + "grad_norm": 1.0115454196929932, + "learning_rate": 1.879417628224491e-05, + "loss": 2.5092, + "step": 30000 + }, + { + "epoch": 2.7180358315779936, + "grad_norm": 1.0003799200057983, + "learning_rate": 1.8788135081254156e-05, + "loss": 2.5442, + "step": 30001 + }, + { + "epoch": 2.7181264297524406, + "grad_norm": 0.9824469089508057, + "learning_rate": 1.8782093880263397e-05, + "loss": 2.6763, + "step": 30002 + }, + { + "epoch": 2.718217027926887, + "grad_norm": 1.011413335800171, + "learning_rate": 1.877605267927264e-05, + "loss": 2.8241, + "step": 30003 + }, + { + "epoch": 2.718307626101334, + "grad_norm": 1.032773733139038, + "learning_rate": 1.8770011478281882e-05, + "loss": 2.7974, + "step": 30004 + }, + { + "epoch": 2.7183982242757807, + "grad_norm": 0.9245928525924683, + "learning_rate": 1.8763970277291126e-05, + "loss": 2.6218, + "step": 30005 + }, + { + "epoch": 2.7184888224502277, + "grad_norm": 0.983364999294281, + "learning_rate": 1.875792907630037e-05, + "loss": 2.8075, + "step": 30006 + }, + { + "epoch": 2.7185794206246743, + "grad_norm": 0.8906663656234741, + "learning_rate": 1.875188787530961e-05, + "loss": 1.7533, + "step": 30007 + }, + { + "epoch": 2.7186700187991213, + "grad_norm": 0.9873299598693848, + "learning_rate": 1.8745846674318855e-05, + "loss": 2.61, + "step": 30008 + }, + { + "epoch": 2.718760616973568, + "grad_norm": 1.0252156257629395, + "learning_rate": 1.87398054733281e-05, + "loss": 2.479, + "step": 30009 + }, + { + "epoch": 2.718851215148015, + "grad_norm": 1.015177607536316, + "learning_rate": 1.8733764272337344e-05, + "loss": 2.8185, + "step": 30010 + }, + { + "epoch": 2.7189418133224614, + "grad_norm": 0.9530559778213501, + "learning_rate": 1.8727723071346585e-05, + "loss": 2.4673, + "step": 30011 + }, + { + "epoch": 2.7190324114969084, + "grad_norm": 1.0222055912017822, + "learning_rate": 1.872168187035583e-05, + "loss": 2.5761, + "step": 30012 + }, + { + "epoch": 2.719123009671355, + "grad_norm": 0.9476825594902039, + "learning_rate": 1.871564066936507e-05, + "loss": 2.6322, + "step": 30013 + }, + { + "epoch": 2.719213607845802, + "grad_norm": 1.0208497047424316, + "learning_rate": 1.8709599468374314e-05, + "loss": 2.8231, + "step": 30014 + }, + { + "epoch": 2.7193042060202486, + "grad_norm": 1.039871335029602, + "learning_rate": 1.8703558267383555e-05, + "loss": 2.6212, + "step": 30015 + }, + { + "epoch": 2.7193948041946956, + "grad_norm": 1.0121725797653198, + "learning_rate": 1.86975170663928e-05, + "loss": 2.5119, + "step": 30016 + }, + { + "epoch": 2.719485402369142, + "grad_norm": 1.2004270553588867, + "learning_rate": 1.8691475865402043e-05, + "loss": 2.7604, + "step": 30017 + }, + { + "epoch": 2.719576000543589, + "grad_norm": 1.0453263521194458, + "learning_rate": 1.8685434664411287e-05, + "loss": 2.356, + "step": 30018 + }, + { + "epoch": 2.7196665987180357, + "grad_norm": 1.0697721242904663, + "learning_rate": 1.8679393463420528e-05, + "loss": 2.5372, + "step": 30019 + }, + { + "epoch": 2.7197571968924827, + "grad_norm": 0.9754611849784851, + "learning_rate": 1.8673352262429772e-05, + "loss": 2.3944, + "step": 30020 + }, + { + "epoch": 2.7198477950669293, + "grad_norm": 1.0602842569351196, + "learning_rate": 1.8667311061439017e-05, + "loss": 2.4528, + "step": 30021 + }, + { + "epoch": 2.7199383932413763, + "grad_norm": 1.0184153318405151, + "learning_rate": 1.8661269860448257e-05, + "loss": 2.5282, + "step": 30022 + }, + { + "epoch": 2.720028991415823, + "grad_norm": 1.085936188697815, + "learning_rate": 1.86552286594575e-05, + "loss": 2.4313, + "step": 30023 + }, + { + "epoch": 2.72011958959027, + "grad_norm": 0.9320369958877563, + "learning_rate": 1.8649187458466742e-05, + "loss": 2.3629, + "step": 30024 + }, + { + "epoch": 2.7202101877647165, + "grad_norm": 0.8828896284103394, + "learning_rate": 1.8643146257475987e-05, + "loss": 2.0448, + "step": 30025 + }, + { + "epoch": 2.7203007859391635, + "grad_norm": 0.9892870187759399, + "learning_rate": 1.863710505648523e-05, + "loss": 2.7748, + "step": 30026 + }, + { + "epoch": 2.72039138411361, + "grad_norm": 1.0170912742614746, + "learning_rate": 1.8631063855494475e-05, + "loss": 2.763, + "step": 30027 + }, + { + "epoch": 2.720481982288057, + "grad_norm": 1.0001689195632935, + "learning_rate": 1.8625022654503716e-05, + "loss": 2.7598, + "step": 30028 + }, + { + "epoch": 2.7205725804625036, + "grad_norm": 0.9828137755393982, + "learning_rate": 1.861898145351296e-05, + "loss": 2.3379, + "step": 30029 + }, + { + "epoch": 2.7206631786369506, + "grad_norm": 1.0537859201431274, + "learning_rate": 1.86129402525222e-05, + "loss": 2.5418, + "step": 30030 + }, + { + "epoch": 2.720753776811397, + "grad_norm": 1.0007827281951904, + "learning_rate": 1.8606899051531445e-05, + "loss": 2.7926, + "step": 30031 + }, + { + "epoch": 2.720844374985844, + "grad_norm": 0.9917354583740234, + "learning_rate": 1.8600857850540686e-05, + "loss": 2.6446, + "step": 30032 + }, + { + "epoch": 2.7209349731602908, + "grad_norm": 1.0438110828399658, + "learning_rate": 1.859481664954993e-05, + "loss": 2.5684, + "step": 30033 + }, + { + "epoch": 2.7210255713347378, + "grad_norm": 0.8862625360488892, + "learning_rate": 1.8588775448559174e-05, + "loss": 2.1492, + "step": 30034 + }, + { + "epoch": 2.7211161695091843, + "grad_norm": 1.0614262819290161, + "learning_rate": 1.858273424756842e-05, + "loss": 2.4144, + "step": 30035 + }, + { + "epoch": 2.721206767683631, + "grad_norm": 1.0892654657363892, + "learning_rate": 1.8576693046577663e-05, + "loss": 2.5107, + "step": 30036 + }, + { + "epoch": 2.721297365858078, + "grad_norm": 1.023148536682129, + "learning_rate": 1.8570651845586904e-05, + "loss": 2.4359, + "step": 30037 + }, + { + "epoch": 2.721387964032525, + "grad_norm": 0.8342653512954712, + "learning_rate": 1.8564610644596148e-05, + "loss": 2.0239, + "step": 30038 + }, + { + "epoch": 2.7214785622069715, + "grad_norm": 0.8572596311569214, + "learning_rate": 1.855856944360539e-05, + "loss": 1.9187, + "step": 30039 + }, + { + "epoch": 2.721569160381418, + "grad_norm": 1.0132114887237549, + "learning_rate": 1.8552528242614633e-05, + "loss": 2.5817, + "step": 30040 + }, + { + "epoch": 2.721659758555865, + "grad_norm": 1.0395731925964355, + "learning_rate": 1.8546487041623874e-05, + "loss": 2.735, + "step": 30041 + }, + { + "epoch": 2.721750356730312, + "grad_norm": 0.9628183841705322, + "learning_rate": 1.8540445840633118e-05, + "loss": 2.5079, + "step": 30042 + }, + { + "epoch": 2.7218409549047586, + "grad_norm": 1.0383057594299316, + "learning_rate": 1.8534404639642362e-05, + "loss": 2.6684, + "step": 30043 + }, + { + "epoch": 2.721931553079205, + "grad_norm": 1.1093432903289795, + "learning_rate": 1.8528363438651606e-05, + "loss": 3.0687, + "step": 30044 + }, + { + "epoch": 2.722022151253652, + "grad_norm": 1.009464144706726, + "learning_rate": 1.8522322237660847e-05, + "loss": 2.5578, + "step": 30045 + }, + { + "epoch": 2.722112749428099, + "grad_norm": 1.0921835899353027, + "learning_rate": 1.851628103667009e-05, + "loss": 2.8053, + "step": 30046 + }, + { + "epoch": 2.722203347602546, + "grad_norm": 1.0128191709518433, + "learning_rate": 1.8510239835679332e-05, + "loss": 2.4976, + "step": 30047 + }, + { + "epoch": 2.7222939457769924, + "grad_norm": 0.9868333339691162, + "learning_rate": 1.8504198634688577e-05, + "loss": 2.5843, + "step": 30048 + }, + { + "epoch": 2.7223845439514394, + "grad_norm": 1.1029683351516724, + "learning_rate": 1.849815743369782e-05, + "loss": 2.5496, + "step": 30049 + }, + { + "epoch": 2.7224751421258864, + "grad_norm": 0.858275294303894, + "learning_rate": 1.849211623270706e-05, + "loss": 2.0946, + "step": 30050 + }, + { + "epoch": 2.722565740300333, + "grad_norm": 0.9959114789962769, + "learning_rate": 1.8486075031716306e-05, + "loss": 1.67, + "step": 30051 + }, + { + "epoch": 2.7226563384747795, + "grad_norm": 1.0441654920578003, + "learning_rate": 1.848003383072555e-05, + "loss": 2.4634, + "step": 30052 + }, + { + "epoch": 2.7227469366492265, + "grad_norm": 1.0196316242218018, + "learning_rate": 1.8473992629734794e-05, + "loss": 2.5893, + "step": 30053 + }, + { + "epoch": 2.7228375348236735, + "grad_norm": 1.0322520732879639, + "learning_rate": 1.8467951428744035e-05, + "loss": 2.5696, + "step": 30054 + }, + { + "epoch": 2.72292813299812, + "grad_norm": 0.9007798433303833, + "learning_rate": 1.846191022775328e-05, + "loss": 1.9846, + "step": 30055 + }, + { + "epoch": 2.7230187311725667, + "grad_norm": 0.9003986120223999, + "learning_rate": 1.845586902676252e-05, + "loss": 2.1443, + "step": 30056 + }, + { + "epoch": 2.7231093293470137, + "grad_norm": 1.0044358968734741, + "learning_rate": 1.8449827825771764e-05, + "loss": 2.6837, + "step": 30057 + }, + { + "epoch": 2.7231999275214607, + "grad_norm": 1.0536417961120605, + "learning_rate": 1.8443786624781005e-05, + "loss": 2.5463, + "step": 30058 + }, + { + "epoch": 2.7232905256959072, + "grad_norm": 1.2081005573272705, + "learning_rate": 1.843774542379025e-05, + "loss": 2.4704, + "step": 30059 + }, + { + "epoch": 2.723381123870354, + "grad_norm": 1.043494701385498, + "learning_rate": 1.8431704222799494e-05, + "loss": 2.5805, + "step": 30060 + }, + { + "epoch": 2.723471722044801, + "grad_norm": 0.9982008337974548, + "learning_rate": 1.8425663021808738e-05, + "loss": 2.4418, + "step": 30061 + }, + { + "epoch": 2.723562320219248, + "grad_norm": 1.019333839416504, + "learning_rate": 1.841962182081798e-05, + "loss": 2.6442, + "step": 30062 + }, + { + "epoch": 2.7236529183936944, + "grad_norm": 1.0393915176391602, + "learning_rate": 1.8413580619827223e-05, + "loss": 2.8539, + "step": 30063 + }, + { + "epoch": 2.723743516568141, + "grad_norm": 1.0525416135787964, + "learning_rate": 1.8407539418836467e-05, + "loss": 2.7381, + "step": 30064 + }, + { + "epoch": 2.723834114742588, + "grad_norm": 1.0058071613311768, + "learning_rate": 1.8401498217845708e-05, + "loss": 2.5572, + "step": 30065 + }, + { + "epoch": 2.723924712917035, + "grad_norm": 1.0024867057800293, + "learning_rate": 1.8395457016854952e-05, + "loss": 2.4154, + "step": 30066 + }, + { + "epoch": 2.7240153110914815, + "grad_norm": 1.003949522972107, + "learning_rate": 1.8389415815864193e-05, + "loss": 2.6942, + "step": 30067 + }, + { + "epoch": 2.724105909265928, + "grad_norm": 1.0117489099502563, + "learning_rate": 1.8383374614873437e-05, + "loss": 2.6312, + "step": 30068 + }, + { + "epoch": 2.724196507440375, + "grad_norm": 0.9893313646316528, + "learning_rate": 1.837733341388268e-05, + "loss": 2.7419, + "step": 30069 + }, + { + "epoch": 2.724287105614822, + "grad_norm": 1.0523563623428345, + "learning_rate": 1.8371292212891926e-05, + "loss": 2.7212, + "step": 30070 + }, + { + "epoch": 2.7243777037892687, + "grad_norm": 0.9680182337760925, + "learning_rate": 1.8365251011901166e-05, + "loss": 2.4061, + "step": 30071 + }, + { + "epoch": 2.7244683019637153, + "grad_norm": 1.1253148317337036, + "learning_rate": 1.835920981091041e-05, + "loss": 2.605, + "step": 30072 + }, + { + "epoch": 2.7245589001381623, + "grad_norm": 1.0310695171356201, + "learning_rate": 1.835316860991965e-05, + "loss": 2.9334, + "step": 30073 + }, + { + "epoch": 2.7246494983126093, + "grad_norm": 0.9902133941650391, + "learning_rate": 1.8347127408928896e-05, + "loss": 2.3177, + "step": 30074 + }, + { + "epoch": 2.724740096487056, + "grad_norm": 1.0264689922332764, + "learning_rate": 1.8341086207938137e-05, + "loss": 2.5337, + "step": 30075 + }, + { + "epoch": 2.7248306946615024, + "grad_norm": 0.9806033968925476, + "learning_rate": 1.833504500694738e-05, + "loss": 2.4732, + "step": 30076 + }, + { + "epoch": 2.7249212928359494, + "grad_norm": 1.0237711668014526, + "learning_rate": 1.8329003805956625e-05, + "loss": 2.786, + "step": 30077 + }, + { + "epoch": 2.725011891010396, + "grad_norm": 0.9927219152450562, + "learning_rate": 1.832296260496587e-05, + "loss": 2.6256, + "step": 30078 + }, + { + "epoch": 2.725102489184843, + "grad_norm": 0.9831278920173645, + "learning_rate": 1.8316921403975113e-05, + "loss": 2.5995, + "step": 30079 + }, + { + "epoch": 2.7251930873592896, + "grad_norm": 0.9688553810119629, + "learning_rate": 1.8310880202984354e-05, + "loss": 2.7426, + "step": 30080 + }, + { + "epoch": 2.7252836855337366, + "grad_norm": 0.9736970067024231, + "learning_rate": 1.83048390019936e-05, + "loss": 2.7323, + "step": 30081 + }, + { + "epoch": 2.725374283708183, + "grad_norm": 0.9574857950210571, + "learning_rate": 1.829879780100284e-05, + "loss": 2.4825, + "step": 30082 + }, + { + "epoch": 2.72546488188263, + "grad_norm": 0.9878111481666565, + "learning_rate": 1.8292756600012083e-05, + "loss": 2.6991, + "step": 30083 + }, + { + "epoch": 2.7255554800570767, + "grad_norm": 0.8930012583732605, + "learning_rate": 1.8286715399021324e-05, + "loss": 1.868, + "step": 30084 + }, + { + "epoch": 2.7256460782315237, + "grad_norm": 1.0242868661880493, + "learning_rate": 1.828067419803057e-05, + "loss": 2.8076, + "step": 30085 + }, + { + "epoch": 2.7257366764059703, + "grad_norm": 1.0274415016174316, + "learning_rate": 1.8274632997039813e-05, + "loss": 2.9395, + "step": 30086 + }, + { + "epoch": 2.7258272745804173, + "grad_norm": 0.9602407217025757, + "learning_rate": 1.8268591796049057e-05, + "loss": 2.7033, + "step": 30087 + }, + { + "epoch": 2.725917872754864, + "grad_norm": 1.0412166118621826, + "learning_rate": 1.8262550595058298e-05, + "loss": 2.6231, + "step": 30088 + }, + { + "epoch": 2.726008470929311, + "grad_norm": 0.9940217137336731, + "learning_rate": 1.8256509394067542e-05, + "loss": 2.6504, + "step": 30089 + }, + { + "epoch": 2.7260990691037574, + "grad_norm": 1.1291285753250122, + "learning_rate": 1.8250468193076783e-05, + "loss": 2.6295, + "step": 30090 + }, + { + "epoch": 2.7261896672782044, + "grad_norm": 1.0399025678634644, + "learning_rate": 1.8244426992086027e-05, + "loss": 2.7408, + "step": 30091 + }, + { + "epoch": 2.726280265452651, + "grad_norm": 1.0122920274734497, + "learning_rate": 1.8238385791095268e-05, + "loss": 2.6762, + "step": 30092 + }, + { + "epoch": 2.726370863627098, + "grad_norm": 0.9929423332214355, + "learning_rate": 1.8232344590104512e-05, + "loss": 2.6329, + "step": 30093 + }, + { + "epoch": 2.7264614618015446, + "grad_norm": 1.0596965551376343, + "learning_rate": 1.822630338911376e-05, + "loss": 2.684, + "step": 30094 + }, + { + "epoch": 2.7265520599759916, + "grad_norm": 0.991981029510498, + "learning_rate": 1.8220262188123e-05, + "loss": 2.5582, + "step": 30095 + }, + { + "epoch": 2.726642658150438, + "grad_norm": 1.0149239301681519, + "learning_rate": 1.8214220987132245e-05, + "loss": 2.5563, + "step": 30096 + }, + { + "epoch": 2.726733256324885, + "grad_norm": 0.9913904666900635, + "learning_rate": 1.8208179786141486e-05, + "loss": 2.5411, + "step": 30097 + }, + { + "epoch": 2.7268238544993317, + "grad_norm": 1.1031745672225952, + "learning_rate": 1.820213858515073e-05, + "loss": 2.8704, + "step": 30098 + }, + { + "epoch": 2.7269144526737787, + "grad_norm": 1.0821926593780518, + "learning_rate": 1.819609738415997e-05, + "loss": 2.5428, + "step": 30099 + }, + { + "epoch": 2.7270050508482253, + "grad_norm": 0.8715039491653442, + "learning_rate": 1.8190056183169215e-05, + "loss": 1.729, + "step": 30100 + }, + { + "epoch": 2.7270956490226723, + "grad_norm": 1.09181809425354, + "learning_rate": 1.8184014982178456e-05, + "loss": 2.489, + "step": 30101 + }, + { + "epoch": 2.727186247197119, + "grad_norm": 0.8785011768341064, + "learning_rate": 1.81779737811877e-05, + "loss": 2.0091, + "step": 30102 + }, + { + "epoch": 2.727276845371566, + "grad_norm": 1.0459892749786377, + "learning_rate": 1.8171932580196944e-05, + "loss": 2.6779, + "step": 30103 + }, + { + "epoch": 2.7273674435460125, + "grad_norm": 0.9575698375701904, + "learning_rate": 1.816589137920619e-05, + "loss": 2.5544, + "step": 30104 + }, + { + "epoch": 2.7274580417204595, + "grad_norm": 1.1486200094223022, + "learning_rate": 1.815985017821543e-05, + "loss": 2.5611, + "step": 30105 + }, + { + "epoch": 2.727548639894906, + "grad_norm": 1.057253122329712, + "learning_rate": 1.8153808977224673e-05, + "loss": 2.6669, + "step": 30106 + }, + { + "epoch": 2.727639238069353, + "grad_norm": 0.8671687245368958, + "learning_rate": 1.8147767776233914e-05, + "loss": 1.9405, + "step": 30107 + }, + { + "epoch": 2.7277298362437996, + "grad_norm": 0.9939925074577332, + "learning_rate": 1.814172657524316e-05, + "loss": 2.6123, + "step": 30108 + }, + { + "epoch": 2.7278204344182466, + "grad_norm": 1.0756112337112427, + "learning_rate": 1.8135685374252403e-05, + "loss": 2.6665, + "step": 30109 + }, + { + "epoch": 2.727911032592693, + "grad_norm": 1.0152531862258911, + "learning_rate": 1.8129644173261643e-05, + "loss": 2.7147, + "step": 30110 + }, + { + "epoch": 2.72800163076714, + "grad_norm": 0.9600260853767395, + "learning_rate": 1.812360297227089e-05, + "loss": 2.6095, + "step": 30111 + }, + { + "epoch": 2.7280922289415868, + "grad_norm": 1.0594877004623413, + "learning_rate": 1.8117561771280132e-05, + "loss": 2.7375, + "step": 30112 + }, + { + "epoch": 2.7281828271160338, + "grad_norm": 1.0131057500839233, + "learning_rate": 1.8111520570289376e-05, + "loss": 2.559, + "step": 30113 + }, + { + "epoch": 2.7282734252904803, + "grad_norm": 1.0248054265975952, + "learning_rate": 1.8105479369298617e-05, + "loss": 2.5725, + "step": 30114 + }, + { + "epoch": 2.7283640234649273, + "grad_norm": 1.0933904647827148, + "learning_rate": 1.809943816830786e-05, + "loss": 2.5701, + "step": 30115 + }, + { + "epoch": 2.728454621639374, + "grad_norm": 0.9851205348968506, + "learning_rate": 1.8093396967317102e-05, + "loss": 2.7412, + "step": 30116 + }, + { + "epoch": 2.728545219813821, + "grad_norm": 1.0171127319335938, + "learning_rate": 1.8087355766326346e-05, + "loss": 1.8572, + "step": 30117 + }, + { + "epoch": 2.7286358179882675, + "grad_norm": 0.984619677066803, + "learning_rate": 1.8081314565335587e-05, + "loss": 2.4317, + "step": 30118 + }, + { + "epoch": 2.728726416162714, + "grad_norm": 1.04205322265625, + "learning_rate": 1.8075273364344835e-05, + "loss": 2.5424, + "step": 30119 + }, + { + "epoch": 2.728817014337161, + "grad_norm": 1.0064973831176758, + "learning_rate": 1.8069232163354075e-05, + "loss": 2.4858, + "step": 30120 + }, + { + "epoch": 2.728907612511608, + "grad_norm": 0.9981116652488708, + "learning_rate": 1.806319096236332e-05, + "loss": 2.3705, + "step": 30121 + }, + { + "epoch": 2.7289982106860546, + "grad_norm": 0.980212926864624, + "learning_rate": 1.805714976137256e-05, + "loss": 2.3169, + "step": 30122 + }, + { + "epoch": 2.729088808860501, + "grad_norm": 0.9624386429786682, + "learning_rate": 1.8051108560381805e-05, + "loss": 2.6, + "step": 30123 + }, + { + "epoch": 2.729179407034948, + "grad_norm": 1.1042540073394775, + "learning_rate": 1.804506735939105e-05, + "loss": 2.6606, + "step": 30124 + }, + { + "epoch": 2.7292700052093952, + "grad_norm": 1.0388123989105225, + "learning_rate": 1.803902615840029e-05, + "loss": 2.4609, + "step": 30125 + }, + { + "epoch": 2.729360603383842, + "grad_norm": 1.1910145282745361, + "learning_rate": 1.8032984957409534e-05, + "loss": 2.5944, + "step": 30126 + }, + { + "epoch": 2.7294512015582884, + "grad_norm": 1.051205039024353, + "learning_rate": 1.8026943756418775e-05, + "loss": 1.9599, + "step": 30127 + }, + { + "epoch": 2.7295417997327354, + "grad_norm": 0.9918797016143799, + "learning_rate": 1.8020902555428022e-05, + "loss": 2.6181, + "step": 30128 + }, + { + "epoch": 2.7296323979071824, + "grad_norm": 1.018236517906189, + "learning_rate": 1.8014861354437263e-05, + "loss": 2.4094, + "step": 30129 + }, + { + "epoch": 2.729722996081629, + "grad_norm": 1.0633589029312134, + "learning_rate": 1.8008820153446507e-05, + "loss": 2.5391, + "step": 30130 + }, + { + "epoch": 2.7298135942560755, + "grad_norm": 1.036093831062317, + "learning_rate": 1.800277895245575e-05, + "loss": 2.7271, + "step": 30131 + }, + { + "epoch": 2.7299041924305225, + "grad_norm": 0.9794623255729675, + "learning_rate": 1.7996737751464993e-05, + "loss": 2.7291, + "step": 30132 + }, + { + "epoch": 2.7299947906049695, + "grad_norm": 1.0126550197601318, + "learning_rate": 1.7990696550474233e-05, + "loss": 2.6025, + "step": 30133 + }, + { + "epoch": 2.730085388779416, + "grad_norm": 0.9751798510551453, + "learning_rate": 1.7984655349483478e-05, + "loss": 2.4623, + "step": 30134 + }, + { + "epoch": 2.7301759869538627, + "grad_norm": 0.9581044316291809, + "learning_rate": 1.797861414849272e-05, + "loss": 2.6705, + "step": 30135 + }, + { + "epoch": 2.7302665851283097, + "grad_norm": 0.9584725499153137, + "learning_rate": 1.7972572947501966e-05, + "loss": 2.7351, + "step": 30136 + }, + { + "epoch": 2.7303571833027567, + "grad_norm": 0.8992612957954407, + "learning_rate": 1.7966531746511207e-05, + "loss": 2.152, + "step": 30137 + }, + { + "epoch": 2.7304477814772032, + "grad_norm": 1.0387319326400757, + "learning_rate": 1.796049054552045e-05, + "loss": 2.3211, + "step": 30138 + }, + { + "epoch": 2.73053837965165, + "grad_norm": 1.0376783609390259, + "learning_rate": 1.7954449344529695e-05, + "loss": 2.7285, + "step": 30139 + }, + { + "epoch": 2.730628977826097, + "grad_norm": 1.186046838760376, + "learning_rate": 1.7948408143538936e-05, + "loss": 2.5335, + "step": 30140 + }, + { + "epoch": 2.730719576000544, + "grad_norm": 0.9440324902534485, + "learning_rate": 1.794236694254818e-05, + "loss": 2.56, + "step": 30141 + }, + { + "epoch": 2.7308101741749904, + "grad_norm": 1.0622680187225342, + "learning_rate": 1.793632574155742e-05, + "loss": 2.4846, + "step": 30142 + }, + { + "epoch": 2.730900772349437, + "grad_norm": 1.0277022123336792, + "learning_rate": 1.7930284540566665e-05, + "loss": 2.7337, + "step": 30143 + }, + { + "epoch": 2.730991370523884, + "grad_norm": 1.13233482837677, + "learning_rate": 1.792424333957591e-05, + "loss": 2.5843, + "step": 30144 + }, + { + "epoch": 2.731081968698331, + "grad_norm": 1.0326517820358276, + "learning_rate": 1.7918202138585154e-05, + "loss": 2.7543, + "step": 30145 + }, + { + "epoch": 2.7311725668727775, + "grad_norm": 0.8420886993408203, + "learning_rate": 1.7912160937594395e-05, + "loss": 1.9503, + "step": 30146 + }, + { + "epoch": 2.731263165047224, + "grad_norm": 0.9740003347396851, + "learning_rate": 1.790611973660364e-05, + "loss": 2.5176, + "step": 30147 + }, + { + "epoch": 2.731353763221671, + "grad_norm": 1.0113886594772339, + "learning_rate": 1.790007853561288e-05, + "loss": 2.4392, + "step": 30148 + }, + { + "epoch": 2.731444361396118, + "grad_norm": 0.856831431388855, + "learning_rate": 1.7894037334622124e-05, + "loss": 1.8583, + "step": 30149 + }, + { + "epoch": 2.7315349595705647, + "grad_norm": 1.0891780853271484, + "learning_rate": 1.7887996133631365e-05, + "loss": 2.541, + "step": 30150 + }, + { + "epoch": 2.7316255577450113, + "grad_norm": 0.9743140935897827, + "learning_rate": 1.788195493264061e-05, + "loss": 2.5758, + "step": 30151 + }, + { + "epoch": 2.7317161559194583, + "grad_norm": 1.0470519065856934, + "learning_rate": 1.7875913731649853e-05, + "loss": 2.7501, + "step": 30152 + }, + { + "epoch": 2.7318067540939053, + "grad_norm": 1.1430693864822388, + "learning_rate": 1.7869872530659097e-05, + "loss": 2.9475, + "step": 30153 + }, + { + "epoch": 2.731897352268352, + "grad_norm": 1.0632953643798828, + "learning_rate": 1.786383132966834e-05, + "loss": 2.9747, + "step": 30154 + }, + { + "epoch": 2.7319879504427984, + "grad_norm": 1.0160959959030151, + "learning_rate": 1.7857790128677582e-05, + "loss": 2.7044, + "step": 30155 + }, + { + "epoch": 2.7320785486172454, + "grad_norm": 1.0010062456130981, + "learning_rate": 1.7851748927686827e-05, + "loss": 2.5409, + "step": 30156 + }, + { + "epoch": 2.732169146791692, + "grad_norm": 0.9489310383796692, + "learning_rate": 1.7845707726696067e-05, + "loss": 2.5692, + "step": 30157 + }, + { + "epoch": 2.732259744966139, + "grad_norm": 1.1679649353027344, + "learning_rate": 1.7839666525705312e-05, + "loss": 2.6392, + "step": 30158 + }, + { + "epoch": 2.7323503431405856, + "grad_norm": 1.0112172365188599, + "learning_rate": 1.7833625324714553e-05, + "loss": 2.7373, + "step": 30159 + }, + { + "epoch": 2.7324409413150326, + "grad_norm": 1.0431948900222778, + "learning_rate": 1.7827584123723797e-05, + "loss": 2.7727, + "step": 30160 + }, + { + "epoch": 2.732531539489479, + "grad_norm": 0.8263986706733704, + "learning_rate": 1.782154292273304e-05, + "loss": 1.9362, + "step": 30161 + }, + { + "epoch": 2.732622137663926, + "grad_norm": 0.9759008884429932, + "learning_rate": 1.7815501721742285e-05, + "loss": 2.4086, + "step": 30162 + }, + { + "epoch": 2.7327127358383727, + "grad_norm": 0.9765159487724304, + "learning_rate": 1.7809460520751526e-05, + "loss": 2.2971, + "step": 30163 + }, + { + "epoch": 2.7328033340128197, + "grad_norm": 0.9425702691078186, + "learning_rate": 1.780341931976077e-05, + "loss": 2.4739, + "step": 30164 + }, + { + "epoch": 2.7328939321872663, + "grad_norm": 0.9294217228889465, + "learning_rate": 1.779737811877001e-05, + "loss": 2.4959, + "step": 30165 + }, + { + "epoch": 2.7329845303617133, + "grad_norm": 1.1150366067886353, + "learning_rate": 1.7791336917779255e-05, + "loss": 2.5827, + "step": 30166 + }, + { + "epoch": 2.73307512853616, + "grad_norm": 1.0136054754257202, + "learning_rate": 1.7785295716788496e-05, + "loss": 2.5917, + "step": 30167 + }, + { + "epoch": 2.733165726710607, + "grad_norm": 0.9029031991958618, + "learning_rate": 1.777925451579774e-05, + "loss": 2.53, + "step": 30168 + }, + { + "epoch": 2.7332563248850534, + "grad_norm": 0.9910502433776855, + "learning_rate": 1.7773213314806985e-05, + "loss": 2.499, + "step": 30169 + }, + { + "epoch": 2.7333469230595004, + "grad_norm": 0.8659118413925171, + "learning_rate": 1.776717211381623e-05, + "loss": 2.0355, + "step": 30170 + }, + { + "epoch": 2.733437521233947, + "grad_norm": 0.883441150188446, + "learning_rate": 1.7761130912825473e-05, + "loss": 2.0232, + "step": 30171 + }, + { + "epoch": 2.733528119408394, + "grad_norm": 1.0712414979934692, + "learning_rate": 1.7755089711834714e-05, + "loss": 2.5843, + "step": 30172 + }, + { + "epoch": 2.7336187175828406, + "grad_norm": 1.0344940423965454, + "learning_rate": 1.7749048510843958e-05, + "loss": 2.7403, + "step": 30173 + }, + { + "epoch": 2.7337093157572876, + "grad_norm": 1.057057499885559, + "learning_rate": 1.77430073098532e-05, + "loss": 2.5148, + "step": 30174 + }, + { + "epoch": 2.733799913931734, + "grad_norm": 1.1469171047210693, + "learning_rate": 1.7736966108862443e-05, + "loss": 2.6048, + "step": 30175 + }, + { + "epoch": 2.733890512106181, + "grad_norm": 0.9798122644424438, + "learning_rate": 1.7730924907871684e-05, + "loss": 2.3416, + "step": 30176 + }, + { + "epoch": 2.7339811102806277, + "grad_norm": 0.6908224821090698, + "learning_rate": 1.7724883706880928e-05, + "loss": 1.1835, + "step": 30177 + }, + { + "epoch": 2.7340717084550747, + "grad_norm": 1.025286078453064, + "learning_rate": 1.7718842505890172e-05, + "loss": 2.5364, + "step": 30178 + }, + { + "epoch": 2.7341623066295213, + "grad_norm": 0.9871158599853516, + "learning_rate": 1.7712801304899417e-05, + "loss": 2.7538, + "step": 30179 + }, + { + "epoch": 2.7342529048039683, + "grad_norm": 1.0108627080917358, + "learning_rate": 1.7706760103908657e-05, + "loss": 2.6279, + "step": 30180 + }, + { + "epoch": 2.734343502978415, + "grad_norm": 1.077749252319336, + "learning_rate": 1.77007189029179e-05, + "loss": 2.8396, + "step": 30181 + }, + { + "epoch": 2.734434101152862, + "grad_norm": 1.1533972024917603, + "learning_rate": 1.7694677701927142e-05, + "loss": 2.6846, + "step": 30182 + }, + { + "epoch": 2.7345246993273085, + "grad_norm": 1.1346362829208374, + "learning_rate": 1.7688636500936387e-05, + "loss": 2.5408, + "step": 30183 + }, + { + "epoch": 2.7346152975017555, + "grad_norm": 0.9646238684654236, + "learning_rate": 1.768259529994563e-05, + "loss": 2.6876, + "step": 30184 + }, + { + "epoch": 2.734705895676202, + "grad_norm": 1.0318191051483154, + "learning_rate": 1.7676554098954872e-05, + "loss": 2.5941, + "step": 30185 + }, + { + "epoch": 2.734796493850649, + "grad_norm": 1.0596206188201904, + "learning_rate": 1.7670512897964116e-05, + "loss": 2.6714, + "step": 30186 + }, + { + "epoch": 2.7348870920250956, + "grad_norm": 0.9564003944396973, + "learning_rate": 1.766447169697336e-05, + "loss": 2.5674, + "step": 30187 + }, + { + "epoch": 2.7349776901995426, + "grad_norm": 0.8609662055969238, + "learning_rate": 1.7658430495982604e-05, + "loss": 1.9309, + "step": 30188 + }, + { + "epoch": 2.735068288373989, + "grad_norm": 0.9818645715713501, + "learning_rate": 1.7652389294991845e-05, + "loss": 2.6154, + "step": 30189 + }, + { + "epoch": 2.735158886548436, + "grad_norm": 1.020961880683899, + "learning_rate": 1.764634809400109e-05, + "loss": 2.629, + "step": 30190 + }, + { + "epoch": 2.7352494847228828, + "grad_norm": 1.042472243309021, + "learning_rate": 1.764030689301033e-05, + "loss": 2.9562, + "step": 30191 + }, + { + "epoch": 2.7353400828973298, + "grad_norm": 1.029515266418457, + "learning_rate": 1.7634265692019574e-05, + "loss": 2.6355, + "step": 30192 + }, + { + "epoch": 2.7354306810717763, + "grad_norm": 1.0347020626068115, + "learning_rate": 1.7628224491028815e-05, + "loss": 2.4485, + "step": 30193 + }, + { + "epoch": 2.7355212792462233, + "grad_norm": 0.8409282565116882, + "learning_rate": 1.762218329003806e-05, + "loss": 1.8438, + "step": 30194 + }, + { + "epoch": 2.73561187742067, + "grad_norm": 0.9660091996192932, + "learning_rate": 1.7616142089047304e-05, + "loss": 2.534, + "step": 30195 + }, + { + "epoch": 2.735702475595117, + "grad_norm": 1.021012306213379, + "learning_rate": 1.7610100888056548e-05, + "loss": 2.5671, + "step": 30196 + }, + { + "epoch": 2.7357930737695635, + "grad_norm": 1.0678815841674805, + "learning_rate": 1.7604059687065792e-05, + "loss": 2.6898, + "step": 30197 + }, + { + "epoch": 2.73588367194401, + "grad_norm": 1.0731021165847778, + "learning_rate": 1.7598018486075033e-05, + "loss": 2.649, + "step": 30198 + }, + { + "epoch": 2.735974270118457, + "grad_norm": 1.052567481994629, + "learning_rate": 1.7591977285084277e-05, + "loss": 2.6838, + "step": 30199 + }, + { + "epoch": 2.736064868292904, + "grad_norm": 0.9090853333473206, + "learning_rate": 1.7585936084093518e-05, + "loss": 1.8257, + "step": 30200 + }, + { + "epoch": 2.7361554664673506, + "grad_norm": 0.9835206270217896, + "learning_rate": 1.7579894883102762e-05, + "loss": 2.4281, + "step": 30201 + }, + { + "epoch": 2.736246064641797, + "grad_norm": 1.0539562702178955, + "learning_rate": 1.7573853682112003e-05, + "loss": 2.5202, + "step": 30202 + }, + { + "epoch": 2.736336662816244, + "grad_norm": 0.9457879066467285, + "learning_rate": 1.7567812481121247e-05, + "loss": 2.4214, + "step": 30203 + }, + { + "epoch": 2.7364272609906912, + "grad_norm": 0.9532520174980164, + "learning_rate": 1.756177128013049e-05, + "loss": 1.8169, + "step": 30204 + }, + { + "epoch": 2.736517859165138, + "grad_norm": 1.002476453781128, + "learning_rate": 1.7555730079139736e-05, + "loss": 2.8467, + "step": 30205 + }, + { + "epoch": 2.7366084573395844, + "grad_norm": 1.1139129400253296, + "learning_rate": 1.7549688878148977e-05, + "loss": 2.7108, + "step": 30206 + }, + { + "epoch": 2.7366990555140314, + "grad_norm": 0.8271179795265198, + "learning_rate": 1.754364767715822e-05, + "loss": 1.9318, + "step": 30207 + }, + { + "epoch": 2.7367896536884784, + "grad_norm": 1.0703043937683105, + "learning_rate": 1.753760647616746e-05, + "loss": 2.3946, + "step": 30208 + }, + { + "epoch": 2.736880251862925, + "grad_norm": 0.9395318031311035, + "learning_rate": 1.7531565275176706e-05, + "loss": 2.407, + "step": 30209 + }, + { + "epoch": 2.7369708500373715, + "grad_norm": 1.0706453323364258, + "learning_rate": 1.7525524074185947e-05, + "loss": 2.6648, + "step": 30210 + }, + { + "epoch": 2.7370614482118185, + "grad_norm": 0.9199358224868774, + "learning_rate": 1.751948287319519e-05, + "loss": 2.0197, + "step": 30211 + }, + { + "epoch": 2.7371520463862655, + "grad_norm": 1.0668636560440063, + "learning_rate": 1.7513441672204435e-05, + "loss": 2.8079, + "step": 30212 + }, + { + "epoch": 2.737242644560712, + "grad_norm": 1.0031496286392212, + "learning_rate": 1.750740047121368e-05, + "loss": 2.9234, + "step": 30213 + }, + { + "epoch": 2.7373332427351587, + "grad_norm": 1.0259655714035034, + "learning_rate": 1.7501359270222924e-05, + "loss": 2.5884, + "step": 30214 + }, + { + "epoch": 2.7374238409096057, + "grad_norm": 1.041628360748291, + "learning_rate": 1.7495318069232164e-05, + "loss": 2.9108, + "step": 30215 + }, + { + "epoch": 2.7375144390840527, + "grad_norm": 1.1902610063552856, + "learning_rate": 1.748927686824141e-05, + "loss": 2.776, + "step": 30216 + }, + { + "epoch": 2.7376050372584992, + "grad_norm": 0.850022554397583, + "learning_rate": 1.748323566725065e-05, + "loss": 1.8863, + "step": 30217 + }, + { + "epoch": 2.737695635432946, + "grad_norm": 0.9991576671600342, + "learning_rate": 1.7477194466259894e-05, + "loss": 2.8371, + "step": 30218 + }, + { + "epoch": 2.737786233607393, + "grad_norm": 1.0071700811386108, + "learning_rate": 1.7471153265269134e-05, + "loss": 2.1154, + "step": 30219 + }, + { + "epoch": 2.73787683178184, + "grad_norm": 0.9923833012580872, + "learning_rate": 1.746511206427838e-05, + "loss": 2.6332, + "step": 30220 + }, + { + "epoch": 2.7379674299562864, + "grad_norm": 0.9906545877456665, + "learning_rate": 1.7459070863287623e-05, + "loss": 2.4437, + "step": 30221 + }, + { + "epoch": 2.738058028130733, + "grad_norm": 1.0195283889770508, + "learning_rate": 1.7453029662296867e-05, + "loss": 2.7622, + "step": 30222 + }, + { + "epoch": 2.73814862630518, + "grad_norm": 0.9972273707389832, + "learning_rate": 1.7446988461306108e-05, + "loss": 2.5998, + "step": 30223 + }, + { + "epoch": 2.738239224479627, + "grad_norm": 0.952884316444397, + "learning_rate": 1.7440947260315352e-05, + "loss": 2.4903, + "step": 30224 + }, + { + "epoch": 2.7383298226540735, + "grad_norm": 1.043121337890625, + "learning_rate": 1.7434906059324593e-05, + "loss": 2.801, + "step": 30225 + }, + { + "epoch": 2.73842042082852, + "grad_norm": 0.9613807201385498, + "learning_rate": 1.7428864858333837e-05, + "loss": 2.5492, + "step": 30226 + }, + { + "epoch": 2.738511019002967, + "grad_norm": 0.9905378818511963, + "learning_rate": 1.742282365734308e-05, + "loss": 2.5565, + "step": 30227 + }, + { + "epoch": 2.738601617177414, + "grad_norm": 1.0988353490829468, + "learning_rate": 1.7416782456352322e-05, + "loss": 2.9427, + "step": 30228 + }, + { + "epoch": 2.7386922153518607, + "grad_norm": 0.8548629283905029, + "learning_rate": 1.7410741255361566e-05, + "loss": 1.9898, + "step": 30229 + }, + { + "epoch": 2.7387828135263073, + "grad_norm": 1.0470997095108032, + "learning_rate": 1.740470005437081e-05, + "loss": 2.4185, + "step": 30230 + }, + { + "epoch": 2.7388734117007543, + "grad_norm": 1.0688362121582031, + "learning_rate": 1.7398658853380055e-05, + "loss": 2.2837, + "step": 30231 + }, + { + "epoch": 2.7389640098752013, + "grad_norm": 0.823630154132843, + "learning_rate": 1.7392617652389296e-05, + "loss": 2.0281, + "step": 30232 + }, + { + "epoch": 2.739054608049648, + "grad_norm": 1.0569090843200684, + "learning_rate": 1.738657645139854e-05, + "loss": 2.6797, + "step": 30233 + }, + { + "epoch": 2.7391452062240944, + "grad_norm": 1.0120468139648438, + "learning_rate": 1.738053525040778e-05, + "loss": 3.0243, + "step": 30234 + }, + { + "epoch": 2.7392358043985414, + "grad_norm": 0.9626218676567078, + "learning_rate": 1.7374494049417025e-05, + "loss": 2.3675, + "step": 30235 + }, + { + "epoch": 2.7393264025729884, + "grad_norm": 0.9962647557258606, + "learning_rate": 1.7368452848426266e-05, + "loss": 2.4757, + "step": 30236 + }, + { + "epoch": 2.739417000747435, + "grad_norm": 1.055300235748291, + "learning_rate": 1.736241164743551e-05, + "loss": 2.4557, + "step": 30237 + }, + { + "epoch": 2.7395075989218816, + "grad_norm": 1.0192184448242188, + "learning_rate": 1.7356370446444754e-05, + "loss": 2.4891, + "step": 30238 + }, + { + "epoch": 2.7395981970963286, + "grad_norm": 1.0019816160202026, + "learning_rate": 1.7350329245454e-05, + "loss": 2.6785, + "step": 30239 + }, + { + "epoch": 2.739688795270775, + "grad_norm": 1.0569597482681274, + "learning_rate": 1.734428804446324e-05, + "loss": 2.5496, + "step": 30240 + }, + { + "epoch": 2.739779393445222, + "grad_norm": 1.0599020719528198, + "learning_rate": 1.7338246843472484e-05, + "loss": 2.2112, + "step": 30241 + }, + { + "epoch": 2.7398699916196687, + "grad_norm": 0.9935908317565918, + "learning_rate": 1.7332205642481728e-05, + "loss": 2.5236, + "step": 30242 + }, + { + "epoch": 2.7399605897941157, + "grad_norm": 1.0299516916275024, + "learning_rate": 1.732616444149097e-05, + "loss": 2.4659, + "step": 30243 + }, + { + "epoch": 2.7400511879685623, + "grad_norm": 1.0061204433441162, + "learning_rate": 1.7320123240500213e-05, + "loss": 2.5304, + "step": 30244 + }, + { + "epoch": 2.7401417861430093, + "grad_norm": 0.9829526543617249, + "learning_rate": 1.7314082039509454e-05, + "loss": 2.5362, + "step": 30245 + }, + { + "epoch": 2.740232384317456, + "grad_norm": 1.0297954082489014, + "learning_rate": 1.7308040838518698e-05, + "loss": 2.5603, + "step": 30246 + }, + { + "epoch": 2.740322982491903, + "grad_norm": 1.0550256967544556, + "learning_rate": 1.7301999637527942e-05, + "loss": 2.7481, + "step": 30247 + }, + { + "epoch": 2.7404135806663494, + "grad_norm": 0.9994580149650574, + "learning_rate": 1.7295958436537186e-05, + "loss": 2.7714, + "step": 30248 + }, + { + "epoch": 2.7405041788407964, + "grad_norm": 1.0618077516555786, + "learning_rate": 1.7289917235546427e-05, + "loss": 2.6545, + "step": 30249 + }, + { + "epoch": 2.740594777015243, + "grad_norm": 1.018156886100769, + "learning_rate": 1.728387603455567e-05, + "loss": 2.4379, + "step": 30250 + }, + { + "epoch": 2.74068537518969, + "grad_norm": 1.0936459302902222, + "learning_rate": 1.7277834833564912e-05, + "loss": 2.7059, + "step": 30251 + }, + { + "epoch": 2.7407759733641366, + "grad_norm": 0.9791951179504395, + "learning_rate": 1.7271793632574156e-05, + "loss": 2.6218, + "step": 30252 + }, + { + "epoch": 2.7408665715385836, + "grad_norm": 0.9574499726295471, + "learning_rate": 1.7265752431583397e-05, + "loss": 2.6486, + "step": 30253 + }, + { + "epoch": 2.74095716971303, + "grad_norm": 0.9756530523300171, + "learning_rate": 1.725971123059264e-05, + "loss": 2.3481, + "step": 30254 + }, + { + "epoch": 2.741047767887477, + "grad_norm": 1.0312559604644775, + "learning_rate": 1.7253670029601886e-05, + "loss": 2.7663, + "step": 30255 + }, + { + "epoch": 2.7411383660619237, + "grad_norm": 0.9686684012413025, + "learning_rate": 1.724762882861113e-05, + "loss": 2.5684, + "step": 30256 + }, + { + "epoch": 2.7412289642363707, + "grad_norm": 0.9392175674438477, + "learning_rate": 1.7241587627620374e-05, + "loss": 2.4167, + "step": 30257 + }, + { + "epoch": 2.7413195624108173, + "grad_norm": 0.8477850556373596, + "learning_rate": 1.7235546426629615e-05, + "loss": 1.8691, + "step": 30258 + }, + { + "epoch": 2.7414101605852643, + "grad_norm": 1.067561149597168, + "learning_rate": 1.722950522563886e-05, + "loss": 2.6721, + "step": 30259 + }, + { + "epoch": 2.741500758759711, + "grad_norm": 1.0297892093658447, + "learning_rate": 1.72234640246481e-05, + "loss": 2.5357, + "step": 30260 + }, + { + "epoch": 2.741591356934158, + "grad_norm": 1.0596251487731934, + "learning_rate": 1.7217422823657344e-05, + "loss": 2.6581, + "step": 30261 + }, + { + "epoch": 2.7416819551086045, + "grad_norm": 0.9493852853775024, + "learning_rate": 1.7211381622666585e-05, + "loss": 2.5982, + "step": 30262 + }, + { + "epoch": 2.7417725532830515, + "grad_norm": 0.9471772313117981, + "learning_rate": 1.720534042167583e-05, + "loss": 2.5313, + "step": 30263 + }, + { + "epoch": 2.741863151457498, + "grad_norm": 0.977240264415741, + "learning_rate": 1.7199299220685073e-05, + "loss": 2.4298, + "step": 30264 + }, + { + "epoch": 2.741953749631945, + "grad_norm": 1.1132194995880127, + "learning_rate": 1.7193258019694318e-05, + "loss": 2.6762, + "step": 30265 + }, + { + "epoch": 2.7420443478063916, + "grad_norm": 1.0744916200637817, + "learning_rate": 1.718721681870356e-05, + "loss": 2.4246, + "step": 30266 + }, + { + "epoch": 2.7421349459808386, + "grad_norm": 1.0065689086914062, + "learning_rate": 1.7181175617712803e-05, + "loss": 2.7754, + "step": 30267 + }, + { + "epoch": 2.742225544155285, + "grad_norm": 1.0603960752487183, + "learning_rate": 1.7175134416722044e-05, + "loss": 2.5669, + "step": 30268 + }, + { + "epoch": 2.742316142329732, + "grad_norm": 1.0497119426727295, + "learning_rate": 1.7169093215731288e-05, + "loss": 2.624, + "step": 30269 + }, + { + "epoch": 2.7424067405041788, + "grad_norm": 1.041712999343872, + "learning_rate": 1.716305201474053e-05, + "loss": 2.5396, + "step": 30270 + }, + { + "epoch": 2.7424973386786258, + "grad_norm": 0.9718853235244751, + "learning_rate": 1.7157010813749773e-05, + "loss": 2.389, + "step": 30271 + }, + { + "epoch": 2.7425879368530723, + "grad_norm": 1.0541036128997803, + "learning_rate": 1.7150969612759017e-05, + "loss": 2.5783, + "step": 30272 + }, + { + "epoch": 2.7426785350275193, + "grad_norm": 0.9805343151092529, + "learning_rate": 1.714492841176826e-05, + "loss": 2.3975, + "step": 30273 + }, + { + "epoch": 2.742769133201966, + "grad_norm": 1.0926340818405151, + "learning_rate": 1.7138887210777505e-05, + "loss": 2.6244, + "step": 30274 + }, + { + "epoch": 2.742859731376413, + "grad_norm": 0.9957812428474426, + "learning_rate": 1.7132846009786746e-05, + "loss": 2.3772, + "step": 30275 + }, + { + "epoch": 2.7429503295508595, + "grad_norm": 0.9830483794212341, + "learning_rate": 1.712680480879599e-05, + "loss": 2.5252, + "step": 30276 + }, + { + "epoch": 2.7430409277253065, + "grad_norm": 0.8675696849822998, + "learning_rate": 1.712076360780523e-05, + "loss": 2.0199, + "step": 30277 + }, + { + "epoch": 2.743131525899753, + "grad_norm": 1.0183250904083252, + "learning_rate": 1.7114722406814476e-05, + "loss": 2.6204, + "step": 30278 + }, + { + "epoch": 2.7432221240742, + "grad_norm": 1.0011533498764038, + "learning_rate": 1.7108681205823716e-05, + "loss": 2.5047, + "step": 30279 + }, + { + "epoch": 2.7433127222486466, + "grad_norm": 1.0758548974990845, + "learning_rate": 1.710264000483296e-05, + "loss": 2.6035, + "step": 30280 + }, + { + "epoch": 2.743403320423093, + "grad_norm": 1.0418825149536133, + "learning_rate": 1.7096598803842205e-05, + "loss": 2.644, + "step": 30281 + }, + { + "epoch": 2.74349391859754, + "grad_norm": 0.9569724202156067, + "learning_rate": 1.709055760285145e-05, + "loss": 2.4066, + "step": 30282 + }, + { + "epoch": 2.7435845167719872, + "grad_norm": 1.1137722730636597, + "learning_rate": 1.708451640186069e-05, + "loss": 2.2804, + "step": 30283 + }, + { + "epoch": 2.743675114946434, + "grad_norm": 1.226462483406067, + "learning_rate": 1.7078475200869934e-05, + "loss": 2.4054, + "step": 30284 + }, + { + "epoch": 2.7437657131208804, + "grad_norm": 1.2383790016174316, + "learning_rate": 1.7072433999879175e-05, + "loss": 2.6016, + "step": 30285 + }, + { + "epoch": 2.7438563112953274, + "grad_norm": 1.0139293670654297, + "learning_rate": 1.706639279888842e-05, + "loss": 2.7201, + "step": 30286 + }, + { + "epoch": 2.7439469094697744, + "grad_norm": 1.0070164203643799, + "learning_rate": 1.7060351597897663e-05, + "loss": 2.6487, + "step": 30287 + }, + { + "epoch": 2.744037507644221, + "grad_norm": 0.992716372013092, + "learning_rate": 1.7054310396906904e-05, + "loss": 2.5921, + "step": 30288 + }, + { + "epoch": 2.7441281058186675, + "grad_norm": 1.0565648078918457, + "learning_rate": 1.704826919591615e-05, + "loss": 2.8631, + "step": 30289 + }, + { + "epoch": 2.7442187039931145, + "grad_norm": 1.0512813329696655, + "learning_rate": 1.7042227994925393e-05, + "loss": 2.5833, + "step": 30290 + }, + { + "epoch": 2.7443093021675615, + "grad_norm": 1.0036776065826416, + "learning_rate": 1.7036186793934637e-05, + "loss": 2.711, + "step": 30291 + }, + { + "epoch": 2.744399900342008, + "grad_norm": 1.0511590242385864, + "learning_rate": 1.7030145592943878e-05, + "loss": 2.6845, + "step": 30292 + }, + { + "epoch": 2.7444904985164547, + "grad_norm": 1.0075325965881348, + "learning_rate": 1.7024104391953122e-05, + "loss": 2.6662, + "step": 30293 + }, + { + "epoch": 2.7445810966909017, + "grad_norm": 1.041690468788147, + "learning_rate": 1.7018063190962363e-05, + "loss": 2.6405, + "step": 30294 + }, + { + "epoch": 2.7446716948653487, + "grad_norm": 0.9626472592353821, + "learning_rate": 1.7012021989971607e-05, + "loss": 2.1618, + "step": 30295 + }, + { + "epoch": 2.7447622930397952, + "grad_norm": 0.9942169189453125, + "learning_rate": 1.7005980788980848e-05, + "loss": 2.6589, + "step": 30296 + }, + { + "epoch": 2.744852891214242, + "grad_norm": 1.0093823671340942, + "learning_rate": 1.6999939587990092e-05, + "loss": 2.8126, + "step": 30297 + }, + { + "epoch": 2.744943489388689, + "grad_norm": 0.9667802453041077, + "learning_rate": 1.6993898386999336e-05, + "loss": 2.4751, + "step": 30298 + }, + { + "epoch": 2.745034087563136, + "grad_norm": 1.1108896732330322, + "learning_rate": 1.698785718600858e-05, + "loss": 2.4094, + "step": 30299 + }, + { + "epoch": 2.7451246857375824, + "grad_norm": 1.0732427835464478, + "learning_rate": 1.698181598501782e-05, + "loss": 2.4745, + "step": 30300 + }, + { + "epoch": 2.745215283912029, + "grad_norm": 0.9870272874832153, + "learning_rate": 1.6975774784027065e-05, + "loss": 2.5371, + "step": 30301 + }, + { + "epoch": 2.745305882086476, + "grad_norm": 0.9568662047386169, + "learning_rate": 1.696973358303631e-05, + "loss": 2.5234, + "step": 30302 + }, + { + "epoch": 2.745396480260923, + "grad_norm": 0.8624995946884155, + "learning_rate": 1.696369238204555e-05, + "loss": 1.9094, + "step": 30303 + }, + { + "epoch": 2.7454870784353695, + "grad_norm": 1.0099778175354004, + "learning_rate": 1.6957651181054795e-05, + "loss": 2.646, + "step": 30304 + }, + { + "epoch": 2.745577676609816, + "grad_norm": 0.9480730295181274, + "learning_rate": 1.6951609980064036e-05, + "loss": 2.3677, + "step": 30305 + }, + { + "epoch": 2.745668274784263, + "grad_norm": 0.9602519273757935, + "learning_rate": 1.6945568779073283e-05, + "loss": 2.6001, + "step": 30306 + }, + { + "epoch": 2.74575887295871, + "grad_norm": 1.0043128728866577, + "learning_rate": 1.6939527578082524e-05, + "loss": 2.6064, + "step": 30307 + }, + { + "epoch": 2.7458494711331567, + "grad_norm": 1.0080009698867798, + "learning_rate": 1.6933486377091768e-05, + "loss": 2.7492, + "step": 30308 + }, + { + "epoch": 2.7459400693076033, + "grad_norm": 1.025614619255066, + "learning_rate": 1.692744517610101e-05, + "loss": 2.4332, + "step": 30309 + }, + { + "epoch": 2.7460306674820503, + "grad_norm": 0.804908812046051, + "learning_rate": 1.6921403975110253e-05, + "loss": 1.8497, + "step": 30310 + }, + { + "epoch": 2.7461212656564973, + "grad_norm": 0.8744201064109802, + "learning_rate": 1.6915362774119494e-05, + "loss": 1.8523, + "step": 30311 + }, + { + "epoch": 2.746211863830944, + "grad_norm": 0.9732373952865601, + "learning_rate": 1.6909321573128738e-05, + "loss": 2.8084, + "step": 30312 + }, + { + "epoch": 2.7463024620053904, + "grad_norm": 1.0388308763504028, + "learning_rate": 1.690328037213798e-05, + "loss": 2.7048, + "step": 30313 + }, + { + "epoch": 2.7463930601798374, + "grad_norm": 1.0301955938339233, + "learning_rate": 1.6897239171147223e-05, + "loss": 2.5284, + "step": 30314 + }, + { + "epoch": 2.7464836583542844, + "grad_norm": 0.987192690372467, + "learning_rate": 1.6891197970156468e-05, + "loss": 2.6668, + "step": 30315 + }, + { + "epoch": 2.746574256528731, + "grad_norm": 0.9676798582077026, + "learning_rate": 1.6885156769165712e-05, + "loss": 2.3953, + "step": 30316 + }, + { + "epoch": 2.7466648547031776, + "grad_norm": 0.9880070686340332, + "learning_rate": 1.6879115568174956e-05, + "loss": 2.7216, + "step": 30317 + }, + { + "epoch": 2.7467554528776246, + "grad_norm": 1.0196326971054077, + "learning_rate": 1.6873074367184197e-05, + "loss": 2.8507, + "step": 30318 + }, + { + "epoch": 2.746846051052071, + "grad_norm": 1.1369041204452515, + "learning_rate": 1.686703316619344e-05, + "loss": 2.5182, + "step": 30319 + }, + { + "epoch": 2.746936649226518, + "grad_norm": 0.8461744785308838, + "learning_rate": 1.6860991965202682e-05, + "loss": 1.8419, + "step": 30320 + }, + { + "epoch": 2.7470272474009647, + "grad_norm": 1.016660451889038, + "learning_rate": 1.6854950764211926e-05, + "loss": 2.7529, + "step": 30321 + }, + { + "epoch": 2.7471178455754117, + "grad_norm": 0.9766238927841187, + "learning_rate": 1.6848909563221167e-05, + "loss": 2.6642, + "step": 30322 + }, + { + "epoch": 2.7472084437498583, + "grad_norm": 0.8536777496337891, + "learning_rate": 1.6842868362230414e-05, + "loss": 2.0442, + "step": 30323 + }, + { + "epoch": 2.7472990419243053, + "grad_norm": 0.9973830580711365, + "learning_rate": 1.6836827161239655e-05, + "loss": 2.3601, + "step": 30324 + }, + { + "epoch": 2.747389640098752, + "grad_norm": 1.1439626216888428, + "learning_rate": 1.68307859602489e-05, + "loss": 2.5084, + "step": 30325 + }, + { + "epoch": 2.747480238273199, + "grad_norm": 0.9963405132293701, + "learning_rate": 1.682474475925814e-05, + "loss": 2.4585, + "step": 30326 + }, + { + "epoch": 2.7475708364476454, + "grad_norm": 1.064565896987915, + "learning_rate": 1.6818703558267385e-05, + "loss": 2.9928, + "step": 30327 + }, + { + "epoch": 2.7476614346220924, + "grad_norm": 1.020103931427002, + "learning_rate": 1.6812662357276625e-05, + "loss": 2.5774, + "step": 30328 + }, + { + "epoch": 2.747752032796539, + "grad_norm": 1.0222448110580444, + "learning_rate": 1.680662115628587e-05, + "loss": 2.6222, + "step": 30329 + }, + { + "epoch": 2.747842630970986, + "grad_norm": 0.9756237268447876, + "learning_rate": 1.6800579955295114e-05, + "loss": 2.6014, + "step": 30330 + }, + { + "epoch": 2.7479332291454326, + "grad_norm": 0.8109404444694519, + "learning_rate": 1.6794538754304358e-05, + "loss": 1.8267, + "step": 30331 + }, + { + "epoch": 2.7480238273198796, + "grad_norm": 1.0191584825515747, + "learning_rate": 1.6788497553313602e-05, + "loss": 2.9538, + "step": 30332 + }, + { + "epoch": 2.748114425494326, + "grad_norm": 1.041604995727539, + "learning_rate": 1.6782456352322843e-05, + "loss": 2.6326, + "step": 30333 + }, + { + "epoch": 2.748205023668773, + "grad_norm": 0.9772804379463196, + "learning_rate": 1.6776415151332087e-05, + "loss": 2.696, + "step": 30334 + }, + { + "epoch": 2.7482956218432197, + "grad_norm": 1.0721404552459717, + "learning_rate": 1.6770373950341328e-05, + "loss": 2.8193, + "step": 30335 + }, + { + "epoch": 2.7483862200176667, + "grad_norm": 1.0172457695007324, + "learning_rate": 1.6764332749350572e-05, + "loss": 2.5565, + "step": 30336 + }, + { + "epoch": 2.7484768181921133, + "grad_norm": 1.1279348134994507, + "learning_rate": 1.6758291548359813e-05, + "loss": 2.6448, + "step": 30337 + }, + { + "epoch": 2.7485674163665603, + "grad_norm": 0.9895321726799011, + "learning_rate": 1.6752250347369057e-05, + "loss": 2.3618, + "step": 30338 + }, + { + "epoch": 2.748658014541007, + "grad_norm": 1.0171564817428589, + "learning_rate": 1.67462091463783e-05, + "loss": 2.4512, + "step": 30339 + }, + { + "epoch": 2.748748612715454, + "grad_norm": 1.0094581842422485, + "learning_rate": 1.6740167945387546e-05, + "loss": 2.8758, + "step": 30340 + }, + { + "epoch": 2.7488392108899005, + "grad_norm": 1.1308757066726685, + "learning_rate": 1.6734126744396787e-05, + "loss": 2.4609, + "step": 30341 + }, + { + "epoch": 2.7489298090643475, + "grad_norm": 0.921332597732544, + "learning_rate": 1.672808554340603e-05, + "loss": 2.6237, + "step": 30342 + }, + { + "epoch": 2.749020407238794, + "grad_norm": 0.9732847213745117, + "learning_rate": 1.6722044342415272e-05, + "loss": 2.6648, + "step": 30343 + }, + { + "epoch": 2.749111005413241, + "grad_norm": 1.0116044282913208, + "learning_rate": 1.6716003141424516e-05, + "loss": 2.7693, + "step": 30344 + }, + { + "epoch": 2.7492016035876876, + "grad_norm": 1.1291778087615967, + "learning_rate": 1.670996194043376e-05, + "loss": 2.4827, + "step": 30345 + }, + { + "epoch": 2.7492922017621346, + "grad_norm": 0.8582874536514282, + "learning_rate": 1.6703920739443e-05, + "loss": 1.8255, + "step": 30346 + }, + { + "epoch": 2.749382799936581, + "grad_norm": 0.97555011510849, + "learning_rate": 1.6697879538452245e-05, + "loss": 2.8298, + "step": 30347 + }, + { + "epoch": 2.749473398111028, + "grad_norm": 0.9882709980010986, + "learning_rate": 1.669183833746149e-05, + "loss": 2.5439, + "step": 30348 + }, + { + "epoch": 2.7495639962854748, + "grad_norm": 0.9838083982467651, + "learning_rate": 1.6685797136470734e-05, + "loss": 2.4295, + "step": 30349 + }, + { + "epoch": 2.7496545944599218, + "grad_norm": 1.0718040466308594, + "learning_rate": 1.6679755935479974e-05, + "loss": 2.6324, + "step": 30350 + }, + { + "epoch": 2.7497451926343683, + "grad_norm": 0.9976128339767456, + "learning_rate": 1.667371473448922e-05, + "loss": 2.4419, + "step": 30351 + }, + { + "epoch": 2.7498357908088154, + "grad_norm": 1.17500638961792, + "learning_rate": 1.666767353349846e-05, + "loss": 1.6825, + "step": 30352 + }, + { + "epoch": 2.749926388983262, + "grad_norm": 0.969074547290802, + "learning_rate": 1.6661632332507704e-05, + "loss": 2.5635, + "step": 30353 + }, + { + "epoch": 2.750016987157709, + "grad_norm": 0.9702615141868591, + "learning_rate": 1.6655591131516945e-05, + "loss": 2.4008, + "step": 30354 + }, + { + "epoch": 2.7501075853321555, + "grad_norm": 1.0137535333633423, + "learning_rate": 1.664954993052619e-05, + "loss": 2.4201, + "step": 30355 + }, + { + "epoch": 2.7501981835066025, + "grad_norm": 0.856239914894104, + "learning_rate": 1.6643508729535433e-05, + "loss": 1.6936, + "step": 30356 + }, + { + "epoch": 2.750288781681049, + "grad_norm": 0.8255318403244019, + "learning_rate": 1.6637467528544677e-05, + "loss": 1.9166, + "step": 30357 + }, + { + "epoch": 2.750379379855496, + "grad_norm": 0.9577494263648987, + "learning_rate": 1.6631426327553918e-05, + "loss": 2.4364, + "step": 30358 + }, + { + "epoch": 2.7504699780299426, + "grad_norm": 1.0350582599639893, + "learning_rate": 1.6625385126563162e-05, + "loss": 2.694, + "step": 30359 + }, + { + "epoch": 2.750560576204389, + "grad_norm": 0.9356775879859924, + "learning_rate": 1.6619343925572406e-05, + "loss": 2.5636, + "step": 30360 + }, + { + "epoch": 2.750651174378836, + "grad_norm": 0.9828737378120422, + "learning_rate": 1.6613302724581647e-05, + "loss": 2.7059, + "step": 30361 + }, + { + "epoch": 2.7507417725532832, + "grad_norm": 1.0235466957092285, + "learning_rate": 1.660726152359089e-05, + "loss": 2.686, + "step": 30362 + }, + { + "epoch": 2.75083237072773, + "grad_norm": 1.005030632019043, + "learning_rate": 1.6601220322600132e-05, + "loss": 2.6722, + "step": 30363 + }, + { + "epoch": 2.7509229689021764, + "grad_norm": 1.0318810939788818, + "learning_rate": 1.6595179121609377e-05, + "loss": 2.7148, + "step": 30364 + }, + { + "epoch": 2.7510135670766234, + "grad_norm": 1.096065878868103, + "learning_rate": 1.658913792061862e-05, + "loss": 3.0399, + "step": 30365 + }, + { + "epoch": 2.7511041652510704, + "grad_norm": 0.9638959765434265, + "learning_rate": 1.6583096719627865e-05, + "loss": 2.7138, + "step": 30366 + }, + { + "epoch": 2.751194763425517, + "grad_norm": 1.0289757251739502, + "learning_rate": 1.6577055518637106e-05, + "loss": 2.7403, + "step": 30367 + }, + { + "epoch": 2.7512853615999635, + "grad_norm": 1.0161951780319214, + "learning_rate": 1.657101431764635e-05, + "loss": 2.7481, + "step": 30368 + }, + { + "epoch": 2.7513759597744105, + "grad_norm": 0.9935818910598755, + "learning_rate": 1.656497311665559e-05, + "loss": 2.4529, + "step": 30369 + }, + { + "epoch": 2.7514665579488575, + "grad_norm": 0.9950863718986511, + "learning_rate": 1.6558931915664835e-05, + "loss": 2.6237, + "step": 30370 + }, + { + "epoch": 2.751557156123304, + "grad_norm": 1.057461142539978, + "learning_rate": 1.6552890714674076e-05, + "loss": 2.6753, + "step": 30371 + }, + { + "epoch": 2.7516477542977507, + "grad_norm": 0.9774749875068665, + "learning_rate": 1.654684951368332e-05, + "loss": 2.5401, + "step": 30372 + }, + { + "epoch": 2.7517383524721977, + "grad_norm": 1.003614068031311, + "learning_rate": 1.6540808312692564e-05, + "loss": 2.7516, + "step": 30373 + }, + { + "epoch": 2.7518289506466447, + "grad_norm": 0.9999943375587463, + "learning_rate": 1.653476711170181e-05, + "loss": 2.7346, + "step": 30374 + }, + { + "epoch": 2.7519195488210912, + "grad_norm": 0.9892454147338867, + "learning_rate": 1.6528725910711053e-05, + "loss": 2.5704, + "step": 30375 + }, + { + "epoch": 2.752010146995538, + "grad_norm": 1.111181378364563, + "learning_rate": 1.6522684709720294e-05, + "loss": 2.9902, + "step": 30376 + }, + { + "epoch": 2.752100745169985, + "grad_norm": 1.002604365348816, + "learning_rate": 1.6516643508729538e-05, + "loss": 2.6637, + "step": 30377 + }, + { + "epoch": 2.752191343344432, + "grad_norm": 1.1159766912460327, + "learning_rate": 1.651060230773878e-05, + "loss": 2.4972, + "step": 30378 + }, + { + "epoch": 2.7522819415188784, + "grad_norm": 0.9115878939628601, + "learning_rate": 1.6504561106748023e-05, + "loss": 2.0494, + "step": 30379 + }, + { + "epoch": 2.752372539693325, + "grad_norm": 0.8636176586151123, + "learning_rate": 1.6498519905757264e-05, + "loss": 2.0988, + "step": 30380 + }, + { + "epoch": 2.752463137867772, + "grad_norm": 1.0450316667556763, + "learning_rate": 1.6492478704766508e-05, + "loss": 2.4956, + "step": 30381 + }, + { + "epoch": 2.752553736042219, + "grad_norm": 1.032570242881775, + "learning_rate": 1.6486437503775752e-05, + "loss": 2.736, + "step": 30382 + }, + { + "epoch": 2.7526443342166655, + "grad_norm": 0.9926961660385132, + "learning_rate": 1.6480396302784996e-05, + "loss": 2.7604, + "step": 30383 + }, + { + "epoch": 2.752734932391112, + "grad_norm": 1.0089060068130493, + "learning_rate": 1.6474355101794237e-05, + "loss": 2.5337, + "step": 30384 + }, + { + "epoch": 2.752825530565559, + "grad_norm": 1.0086828470230103, + "learning_rate": 1.646831390080348e-05, + "loss": 2.5186, + "step": 30385 + }, + { + "epoch": 2.752916128740006, + "grad_norm": 0.9825090765953064, + "learning_rate": 1.6462272699812722e-05, + "loss": 2.6078, + "step": 30386 + }, + { + "epoch": 2.7530067269144527, + "grad_norm": 0.9331366419792175, + "learning_rate": 1.6456231498821966e-05, + "loss": 2.5065, + "step": 30387 + }, + { + "epoch": 2.7530973250888993, + "grad_norm": 1.0341222286224365, + "learning_rate": 1.6450190297831207e-05, + "loss": 2.4341, + "step": 30388 + }, + { + "epoch": 2.7531879232633463, + "grad_norm": 1.0130795240402222, + "learning_rate": 1.644414909684045e-05, + "loss": 2.6329, + "step": 30389 + }, + { + "epoch": 2.7532785214377933, + "grad_norm": 0.9600986838340759, + "learning_rate": 1.6438107895849696e-05, + "loss": 2.473, + "step": 30390 + }, + { + "epoch": 2.75336911961224, + "grad_norm": 1.1411577463150024, + "learning_rate": 1.643206669485894e-05, + "loss": 2.8741, + "step": 30391 + }, + { + "epoch": 2.7534597177866864, + "grad_norm": 1.0162407159805298, + "learning_rate": 1.6426025493868184e-05, + "loss": 2.4971, + "step": 30392 + }, + { + "epoch": 2.7535503159611334, + "grad_norm": 0.8540585041046143, + "learning_rate": 1.6419984292877425e-05, + "loss": 1.9089, + "step": 30393 + }, + { + "epoch": 2.7536409141355804, + "grad_norm": 0.9889951944351196, + "learning_rate": 1.641394309188667e-05, + "loss": 2.7001, + "step": 30394 + }, + { + "epoch": 2.753731512310027, + "grad_norm": 1.0037994384765625, + "learning_rate": 1.640790189089591e-05, + "loss": 2.6636, + "step": 30395 + }, + { + "epoch": 2.7538221104844736, + "grad_norm": 0.8830256462097168, + "learning_rate": 1.6401860689905154e-05, + "loss": 1.8754, + "step": 30396 + }, + { + "epoch": 2.7539127086589206, + "grad_norm": 0.817528247833252, + "learning_rate": 1.6395819488914395e-05, + "loss": 1.2117, + "step": 30397 + }, + { + "epoch": 2.7540033068333676, + "grad_norm": 0.9767885208129883, + "learning_rate": 1.638977828792364e-05, + "loss": 2.448, + "step": 30398 + }, + { + "epoch": 2.754093905007814, + "grad_norm": 1.0508575439453125, + "learning_rate": 1.6383737086932884e-05, + "loss": 2.7964, + "step": 30399 + }, + { + "epoch": 2.7541845031822607, + "grad_norm": 0.9704468846321106, + "learning_rate": 1.6377695885942128e-05, + "loss": 2.6008, + "step": 30400 + }, + { + "epoch": 2.7542751013567077, + "grad_norm": 1.0528732538223267, + "learning_rate": 1.637165468495137e-05, + "loss": 2.5141, + "step": 30401 + }, + { + "epoch": 2.7543656995311543, + "grad_norm": 1.109015941619873, + "learning_rate": 1.6365613483960613e-05, + "loss": 2.6488, + "step": 30402 + }, + { + "epoch": 2.7544562977056013, + "grad_norm": 0.9622800946235657, + "learning_rate": 1.6359572282969854e-05, + "loss": 2.6058, + "step": 30403 + }, + { + "epoch": 2.754546895880048, + "grad_norm": 1.0623301267623901, + "learning_rate": 1.6353531081979098e-05, + "loss": 2.4357, + "step": 30404 + }, + { + "epoch": 2.754637494054495, + "grad_norm": 0.9606809020042419, + "learning_rate": 1.6347489880988342e-05, + "loss": 2.7459, + "step": 30405 + }, + { + "epoch": 2.7547280922289414, + "grad_norm": 1.0578436851501465, + "learning_rate": 1.6341448679997583e-05, + "loss": 2.7466, + "step": 30406 + }, + { + "epoch": 2.7548186904033884, + "grad_norm": 0.9758076071739197, + "learning_rate": 1.6335407479006827e-05, + "loss": 2.092, + "step": 30407 + }, + { + "epoch": 2.754909288577835, + "grad_norm": 0.931661069393158, + "learning_rate": 1.632936627801607e-05, + "loss": 2.5202, + "step": 30408 + }, + { + "epoch": 2.754999886752282, + "grad_norm": 0.9961332082748413, + "learning_rate": 1.6323325077025316e-05, + "loss": 2.642, + "step": 30409 + }, + { + "epoch": 2.7550904849267286, + "grad_norm": 0.978498637676239, + "learning_rate": 1.6317283876034556e-05, + "loss": 1.8566, + "step": 30410 + }, + { + "epoch": 2.7551810831011756, + "grad_norm": 1.084617018699646, + "learning_rate": 1.63112426750438e-05, + "loss": 2.2887, + "step": 30411 + }, + { + "epoch": 2.755271681275622, + "grad_norm": 0.83155357837677, + "learning_rate": 1.630520147405304e-05, + "loss": 1.9654, + "step": 30412 + }, + { + "epoch": 2.755362279450069, + "grad_norm": 0.9860436916351318, + "learning_rate": 1.6299160273062286e-05, + "loss": 2.6562, + "step": 30413 + }, + { + "epoch": 2.7554528776245157, + "grad_norm": 0.987467885017395, + "learning_rate": 1.6293119072071526e-05, + "loss": 2.6117, + "step": 30414 + }, + { + "epoch": 2.7555434757989627, + "grad_norm": 0.9831976294517517, + "learning_rate": 1.628707787108077e-05, + "loss": 2.6161, + "step": 30415 + }, + { + "epoch": 2.7556340739734093, + "grad_norm": 1.1026064157485962, + "learning_rate": 1.6281036670090015e-05, + "loss": 2.5188, + "step": 30416 + }, + { + "epoch": 2.7557246721478563, + "grad_norm": 1.0073227882385254, + "learning_rate": 1.627499546909926e-05, + "loss": 2.6433, + "step": 30417 + }, + { + "epoch": 2.755815270322303, + "grad_norm": 1.0024210214614868, + "learning_rate": 1.62689542681085e-05, + "loss": 2.6858, + "step": 30418 + }, + { + "epoch": 2.75590586849675, + "grad_norm": 0.9907870292663574, + "learning_rate": 1.6262913067117744e-05, + "loss": 2.5303, + "step": 30419 + }, + { + "epoch": 2.7559964666711965, + "grad_norm": 1.0095807313919067, + "learning_rate": 1.625687186612699e-05, + "loss": 2.6434, + "step": 30420 + }, + { + "epoch": 2.7560870648456435, + "grad_norm": 1.099010944366455, + "learning_rate": 1.625083066513623e-05, + "loss": 2.2952, + "step": 30421 + }, + { + "epoch": 2.75617766302009, + "grad_norm": 1.1471115350723267, + "learning_rate": 1.6244789464145473e-05, + "loss": 2.5074, + "step": 30422 + }, + { + "epoch": 2.756268261194537, + "grad_norm": 0.9454275369644165, + "learning_rate": 1.6238748263154714e-05, + "loss": 2.5857, + "step": 30423 + }, + { + "epoch": 2.7563588593689836, + "grad_norm": 1.1981185674667358, + "learning_rate": 1.623270706216396e-05, + "loss": 2.5869, + "step": 30424 + }, + { + "epoch": 2.7564494575434306, + "grad_norm": 1.0457042455673218, + "learning_rate": 1.6226665861173203e-05, + "loss": 2.6234, + "step": 30425 + }, + { + "epoch": 2.756540055717877, + "grad_norm": 0.9755454659461975, + "learning_rate": 1.6220624660182447e-05, + "loss": 2.6404, + "step": 30426 + }, + { + "epoch": 2.756630653892324, + "grad_norm": 0.9991534948348999, + "learning_rate": 1.6214583459191688e-05, + "loss": 2.6116, + "step": 30427 + }, + { + "epoch": 2.7567212520667708, + "grad_norm": 1.1279350519180298, + "learning_rate": 1.6208542258200932e-05, + "loss": 2.8556, + "step": 30428 + }, + { + "epoch": 2.7568118502412178, + "grad_norm": 1.0346953868865967, + "learning_rate": 1.6202501057210173e-05, + "loss": 2.4895, + "step": 30429 + }, + { + "epoch": 2.7569024484156643, + "grad_norm": 0.9539931416511536, + "learning_rate": 1.6196459856219417e-05, + "loss": 2.5305, + "step": 30430 + }, + { + "epoch": 2.7569930465901114, + "grad_norm": 0.9902330040931702, + "learning_rate": 1.6190418655228658e-05, + "loss": 2.5392, + "step": 30431 + }, + { + "epoch": 2.757083644764558, + "grad_norm": 1.1116844415664673, + "learning_rate": 1.6184377454237902e-05, + "loss": 2.4382, + "step": 30432 + }, + { + "epoch": 2.757174242939005, + "grad_norm": 1.0313096046447754, + "learning_rate": 1.6178336253247146e-05, + "loss": 2.538, + "step": 30433 + }, + { + "epoch": 2.7572648411134515, + "grad_norm": 1.1398684978485107, + "learning_rate": 1.617229505225639e-05, + "loss": 2.7991, + "step": 30434 + }, + { + "epoch": 2.7573554392878985, + "grad_norm": 0.9023488759994507, + "learning_rate": 1.6166253851265635e-05, + "loss": 2.0342, + "step": 30435 + }, + { + "epoch": 2.757446037462345, + "grad_norm": 1.041001319885254, + "learning_rate": 1.6160212650274876e-05, + "loss": 2.6449, + "step": 30436 + }, + { + "epoch": 2.757536635636792, + "grad_norm": 0.9715256690979004, + "learning_rate": 1.615417144928412e-05, + "loss": 2.6922, + "step": 30437 + }, + { + "epoch": 2.7576272338112386, + "grad_norm": 0.9378881454467773, + "learning_rate": 1.614813024829336e-05, + "loss": 2.5167, + "step": 30438 + }, + { + "epoch": 2.7577178319856857, + "grad_norm": 1.2018814086914062, + "learning_rate": 1.6142089047302605e-05, + "loss": 2.8338, + "step": 30439 + }, + { + "epoch": 2.757808430160132, + "grad_norm": 0.9989275932312012, + "learning_rate": 1.6136047846311846e-05, + "loss": 2.799, + "step": 30440 + }, + { + "epoch": 2.7578990283345792, + "grad_norm": 1.0105304718017578, + "learning_rate": 1.613000664532109e-05, + "loss": 2.3906, + "step": 30441 + }, + { + "epoch": 2.757989626509026, + "grad_norm": 1.0421123504638672, + "learning_rate": 1.6123965444330334e-05, + "loss": 2.547, + "step": 30442 + }, + { + "epoch": 2.7580802246834724, + "grad_norm": 1.0450133085250854, + "learning_rate": 1.6117924243339578e-05, + "loss": 1.8801, + "step": 30443 + }, + { + "epoch": 2.7581708228579194, + "grad_norm": 1.0291887521743774, + "learning_rate": 1.611188304234882e-05, + "loss": 2.5681, + "step": 30444 + }, + { + "epoch": 2.7582614210323664, + "grad_norm": 0.9518498182296753, + "learning_rate": 1.6105841841358063e-05, + "loss": 2.1951, + "step": 30445 + }, + { + "epoch": 2.758352019206813, + "grad_norm": 0.9941827654838562, + "learning_rate": 1.6099800640367304e-05, + "loss": 2.712, + "step": 30446 + }, + { + "epoch": 2.7584426173812595, + "grad_norm": 1.0668519735336304, + "learning_rate": 1.609375943937655e-05, + "loss": 2.5715, + "step": 30447 + }, + { + "epoch": 2.7585332155557065, + "grad_norm": 1.0385873317718506, + "learning_rate": 1.608771823838579e-05, + "loss": 2.9052, + "step": 30448 + }, + { + "epoch": 2.7586238137301535, + "grad_norm": 0.5067463517189026, + "learning_rate": 1.6081677037395033e-05, + "loss": 0.7516, + "step": 30449 + }, + { + "epoch": 2.7587144119046, + "grad_norm": 1.0836455821990967, + "learning_rate": 1.6075635836404278e-05, + "loss": 2.3981, + "step": 30450 + }, + { + "epoch": 2.7588050100790467, + "grad_norm": 1.033060073852539, + "learning_rate": 1.6069594635413522e-05, + "loss": 2.6882, + "step": 30451 + }, + { + "epoch": 2.7588956082534937, + "grad_norm": 1.0085822343826294, + "learning_rate": 1.6063553434422766e-05, + "loss": 2.6604, + "step": 30452 + }, + { + "epoch": 2.7589862064279407, + "grad_norm": 0.971030056476593, + "learning_rate": 1.6057512233432007e-05, + "loss": 2.737, + "step": 30453 + }, + { + "epoch": 2.7590768046023872, + "grad_norm": 0.9401158690452576, + "learning_rate": 1.605147103244125e-05, + "loss": 2.4234, + "step": 30454 + }, + { + "epoch": 2.759167402776834, + "grad_norm": 0.9602075815200806, + "learning_rate": 1.6045429831450492e-05, + "loss": 2.5032, + "step": 30455 + }, + { + "epoch": 2.759258000951281, + "grad_norm": 1.0633286237716675, + "learning_rate": 1.6039388630459736e-05, + "loss": 2.8004, + "step": 30456 + }, + { + "epoch": 2.759348599125728, + "grad_norm": 1.0415369272232056, + "learning_rate": 1.6033347429468977e-05, + "loss": 2.5857, + "step": 30457 + }, + { + "epoch": 2.7594391973001744, + "grad_norm": 0.8404159545898438, + "learning_rate": 1.602730622847822e-05, + "loss": 2.0872, + "step": 30458 + }, + { + "epoch": 2.759529795474621, + "grad_norm": 0.9813336133956909, + "learning_rate": 1.6021265027487465e-05, + "loss": 2.8006, + "step": 30459 + }, + { + "epoch": 2.759620393649068, + "grad_norm": 1.0582910776138306, + "learning_rate": 1.601522382649671e-05, + "loss": 2.6239, + "step": 30460 + }, + { + "epoch": 2.759710991823515, + "grad_norm": 1.1048784255981445, + "learning_rate": 1.600918262550595e-05, + "loss": 2.6202, + "step": 30461 + }, + { + "epoch": 2.7598015899979615, + "grad_norm": 0.9755163192749023, + "learning_rate": 1.6003141424515195e-05, + "loss": 2.6273, + "step": 30462 + }, + { + "epoch": 2.759892188172408, + "grad_norm": 1.1134700775146484, + "learning_rate": 1.5997100223524436e-05, + "loss": 2.7956, + "step": 30463 + }, + { + "epoch": 2.759982786346855, + "grad_norm": 1.023697018623352, + "learning_rate": 1.599105902253368e-05, + "loss": 2.7083, + "step": 30464 + }, + { + "epoch": 2.760073384521302, + "grad_norm": 1.0945967435836792, + "learning_rate": 1.5985017821542924e-05, + "loss": 2.7698, + "step": 30465 + }, + { + "epoch": 2.7601639826957487, + "grad_norm": 0.9689931869506836, + "learning_rate": 1.5978976620552165e-05, + "loss": 2.5252, + "step": 30466 + }, + { + "epoch": 2.7602545808701953, + "grad_norm": 1.001976490020752, + "learning_rate": 1.597293541956141e-05, + "loss": 2.5635, + "step": 30467 + }, + { + "epoch": 2.7603451790446423, + "grad_norm": 1.0207668542861938, + "learning_rate": 1.5966894218570653e-05, + "loss": 2.8064, + "step": 30468 + }, + { + "epoch": 2.7604357772190893, + "grad_norm": 1.0645732879638672, + "learning_rate": 1.5960853017579897e-05, + "loss": 2.6778, + "step": 30469 + }, + { + "epoch": 2.760526375393536, + "grad_norm": 0.9734803438186646, + "learning_rate": 1.5954811816589138e-05, + "loss": 2.5076, + "step": 30470 + }, + { + "epoch": 2.7606169735679824, + "grad_norm": 0.877925455570221, + "learning_rate": 1.5948770615598383e-05, + "loss": 1.7868, + "step": 30471 + }, + { + "epoch": 2.7607075717424294, + "grad_norm": 1.074934959411621, + "learning_rate": 1.5942729414607623e-05, + "loss": 2.6348, + "step": 30472 + }, + { + "epoch": 2.7607981699168764, + "grad_norm": 1.0528513193130493, + "learning_rate": 1.5936688213616868e-05, + "loss": 2.7098, + "step": 30473 + }, + { + "epoch": 2.760888768091323, + "grad_norm": 1.006650686264038, + "learning_rate": 1.593064701262611e-05, + "loss": 2.7808, + "step": 30474 + }, + { + "epoch": 2.7609793662657696, + "grad_norm": 1.0237971544265747, + "learning_rate": 1.5924605811635353e-05, + "loss": 2.6304, + "step": 30475 + }, + { + "epoch": 2.7610699644402166, + "grad_norm": 0.9053958654403687, + "learning_rate": 1.5918564610644597e-05, + "loss": 2.027, + "step": 30476 + }, + { + "epoch": 2.7611605626146636, + "grad_norm": 0.9291535019874573, + "learning_rate": 1.591252340965384e-05, + "loss": 2.409, + "step": 30477 + }, + { + "epoch": 2.76125116078911, + "grad_norm": 0.9784006476402283, + "learning_rate": 1.5906482208663085e-05, + "loss": 2.5446, + "step": 30478 + }, + { + "epoch": 2.7613417589635567, + "grad_norm": 1.0382400751113892, + "learning_rate": 1.5900441007672326e-05, + "loss": 2.7232, + "step": 30479 + }, + { + "epoch": 2.7614323571380037, + "grad_norm": 1.0401579141616821, + "learning_rate": 1.589439980668157e-05, + "loss": 2.8069, + "step": 30480 + }, + { + "epoch": 2.7615229553124503, + "grad_norm": 1.0651694536209106, + "learning_rate": 1.588835860569081e-05, + "loss": 2.537, + "step": 30481 + }, + { + "epoch": 2.7616135534868973, + "grad_norm": 1.0433073043823242, + "learning_rate": 1.5882317404700055e-05, + "loss": 2.5965, + "step": 30482 + }, + { + "epoch": 2.761704151661344, + "grad_norm": 0.9706825613975525, + "learning_rate": 1.5876276203709296e-05, + "loss": 2.5173, + "step": 30483 + }, + { + "epoch": 2.761794749835791, + "grad_norm": 1.0368859767913818, + "learning_rate": 1.587023500271854e-05, + "loss": 2.8007, + "step": 30484 + }, + { + "epoch": 2.7618853480102374, + "grad_norm": 1.1895283460617065, + "learning_rate": 1.5864193801727785e-05, + "loss": 2.5501, + "step": 30485 + }, + { + "epoch": 2.7619759461846844, + "grad_norm": 0.997351348400116, + "learning_rate": 1.585815260073703e-05, + "loss": 2.5921, + "step": 30486 + }, + { + "epoch": 2.762066544359131, + "grad_norm": 1.0177216529846191, + "learning_rate": 1.585211139974627e-05, + "loss": 2.4391, + "step": 30487 + }, + { + "epoch": 2.762157142533578, + "grad_norm": 0.9294549822807312, + "learning_rate": 1.5846070198755514e-05, + "loss": 2.6342, + "step": 30488 + }, + { + "epoch": 2.7622477407080246, + "grad_norm": 1.005063772201538, + "learning_rate": 1.5840028997764755e-05, + "loss": 2.7979, + "step": 30489 + }, + { + "epoch": 2.7623383388824716, + "grad_norm": 0.9627499580383301, + "learning_rate": 1.5833987796774e-05, + "loss": 2.6891, + "step": 30490 + }, + { + "epoch": 2.762428937056918, + "grad_norm": 0.9430873990058899, + "learning_rate": 1.582794659578324e-05, + "loss": 2.5402, + "step": 30491 + }, + { + "epoch": 2.762519535231365, + "grad_norm": 1.0300143957138062, + "learning_rate": 1.5821905394792484e-05, + "loss": 2.6652, + "step": 30492 + }, + { + "epoch": 2.7626101334058117, + "grad_norm": 1.0115596055984497, + "learning_rate": 1.581586419380173e-05, + "loss": 2.5743, + "step": 30493 + }, + { + "epoch": 2.7627007315802588, + "grad_norm": 1.0315895080566406, + "learning_rate": 1.5809822992810972e-05, + "loss": 2.4168, + "step": 30494 + }, + { + "epoch": 2.7627913297547053, + "grad_norm": 0.9434531331062317, + "learning_rate": 1.5803781791820217e-05, + "loss": 2.6054, + "step": 30495 + }, + { + "epoch": 2.7628819279291523, + "grad_norm": 1.0283035039901733, + "learning_rate": 1.5797740590829457e-05, + "loss": 2.6321, + "step": 30496 + }, + { + "epoch": 2.762972526103599, + "grad_norm": 1.0698763132095337, + "learning_rate": 1.57916993898387e-05, + "loss": 2.6924, + "step": 30497 + }, + { + "epoch": 2.763063124278046, + "grad_norm": 1.006761908531189, + "learning_rate": 1.5785658188847943e-05, + "loss": 2.7046, + "step": 30498 + }, + { + "epoch": 2.7631537224524925, + "grad_norm": 1.0021342039108276, + "learning_rate": 1.5779616987857187e-05, + "loss": 2.6126, + "step": 30499 + }, + { + "epoch": 2.7632443206269395, + "grad_norm": 0.9546239376068115, + "learning_rate": 1.5773575786866428e-05, + "loss": 2.5952, + "step": 30500 + }, + { + "epoch": 2.763334918801386, + "grad_norm": 1.0647021532058716, + "learning_rate": 1.5767534585875672e-05, + "loss": 2.7272, + "step": 30501 + }, + { + "epoch": 2.763425516975833, + "grad_norm": 1.0356817245483398, + "learning_rate": 1.5761493384884916e-05, + "loss": 2.8511, + "step": 30502 + }, + { + "epoch": 2.7635161151502796, + "grad_norm": 0.8576956391334534, + "learning_rate": 1.575545218389416e-05, + "loss": 1.8055, + "step": 30503 + }, + { + "epoch": 2.7636067133247266, + "grad_norm": 0.9403589367866516, + "learning_rate": 1.57494109829034e-05, + "loss": 2.4946, + "step": 30504 + }, + { + "epoch": 2.763697311499173, + "grad_norm": 1.0221425294876099, + "learning_rate": 1.5743369781912645e-05, + "loss": 2.3642, + "step": 30505 + }, + { + "epoch": 2.76378790967362, + "grad_norm": 1.0246975421905518, + "learning_rate": 1.5737328580921886e-05, + "loss": 2.6804, + "step": 30506 + }, + { + "epoch": 2.7638785078480668, + "grad_norm": 1.033791422843933, + "learning_rate": 1.573128737993113e-05, + "loss": 2.6577, + "step": 30507 + }, + { + "epoch": 2.7639691060225138, + "grad_norm": 0.9973969459533691, + "learning_rate": 1.5725246178940375e-05, + "loss": 2.616, + "step": 30508 + }, + { + "epoch": 2.7640597041969603, + "grad_norm": 1.0556858777999878, + "learning_rate": 1.5719204977949615e-05, + "loss": 2.6116, + "step": 30509 + }, + { + "epoch": 2.7641503023714074, + "grad_norm": 1.1008501052856445, + "learning_rate": 1.5713163776958863e-05, + "loss": 2.5439, + "step": 30510 + }, + { + "epoch": 2.764240900545854, + "grad_norm": 1.0591274499893188, + "learning_rate": 1.5707122575968104e-05, + "loss": 2.8679, + "step": 30511 + }, + { + "epoch": 2.764331498720301, + "grad_norm": 0.9827492237091064, + "learning_rate": 1.5701081374977348e-05, + "loss": 2.4886, + "step": 30512 + }, + { + "epoch": 2.7644220968947475, + "grad_norm": 0.9954086542129517, + "learning_rate": 1.569504017398659e-05, + "loss": 2.3045, + "step": 30513 + }, + { + "epoch": 2.7645126950691945, + "grad_norm": 0.9885589480400085, + "learning_rate": 1.5688998972995833e-05, + "loss": 2.6989, + "step": 30514 + }, + { + "epoch": 2.764603293243641, + "grad_norm": 0.9697035551071167, + "learning_rate": 1.5682957772005074e-05, + "loss": 2.5778, + "step": 30515 + }, + { + "epoch": 2.764693891418088, + "grad_norm": 0.8839061260223389, + "learning_rate": 1.5676916571014318e-05, + "loss": 1.979, + "step": 30516 + }, + { + "epoch": 2.7647844895925346, + "grad_norm": 1.1355241537094116, + "learning_rate": 1.567087537002356e-05, + "loss": 2.4698, + "step": 30517 + }, + { + "epoch": 2.7648750877669817, + "grad_norm": 0.9537604451179504, + "learning_rate": 1.5664834169032807e-05, + "loss": 2.603, + "step": 30518 + }, + { + "epoch": 2.764965685941428, + "grad_norm": 1.0148488283157349, + "learning_rate": 1.5658792968042047e-05, + "loss": 2.5845, + "step": 30519 + }, + { + "epoch": 2.7650562841158752, + "grad_norm": 0.9908238649368286, + "learning_rate": 1.565275176705129e-05, + "loss": 2.4876, + "step": 30520 + }, + { + "epoch": 2.765146882290322, + "grad_norm": 0.8376297354698181, + "learning_rate": 1.5646710566060532e-05, + "loss": 1.9754, + "step": 30521 + }, + { + "epoch": 2.7652374804647684, + "grad_norm": 0.8615382313728333, + "learning_rate": 1.5640669365069777e-05, + "loss": 2.0266, + "step": 30522 + }, + { + "epoch": 2.7653280786392154, + "grad_norm": 0.9993582963943481, + "learning_rate": 1.563462816407902e-05, + "loss": 2.7756, + "step": 30523 + }, + { + "epoch": 2.7654186768136624, + "grad_norm": 0.989561915397644, + "learning_rate": 1.562858696308826e-05, + "loss": 2.5703, + "step": 30524 + }, + { + "epoch": 2.765509274988109, + "grad_norm": 0.8913472294807434, + "learning_rate": 1.5622545762097506e-05, + "loss": 1.9397, + "step": 30525 + }, + { + "epoch": 2.7655998731625555, + "grad_norm": 0.9457124471664429, + "learning_rate": 1.561650456110675e-05, + "loss": 2.4359, + "step": 30526 + }, + { + "epoch": 2.7656904713370025, + "grad_norm": 0.9354259371757507, + "learning_rate": 1.5610463360115994e-05, + "loss": 2.5205, + "step": 30527 + }, + { + "epoch": 2.7657810695114495, + "grad_norm": 0.8812004923820496, + "learning_rate": 1.5604422159125235e-05, + "loss": 2.1729, + "step": 30528 + }, + { + "epoch": 2.765871667685896, + "grad_norm": 1.035370945930481, + "learning_rate": 1.559838095813448e-05, + "loss": 2.6374, + "step": 30529 + }, + { + "epoch": 2.7659622658603427, + "grad_norm": 1.0128182172775269, + "learning_rate": 1.559233975714372e-05, + "loss": 2.5611, + "step": 30530 + }, + { + "epoch": 2.7660528640347897, + "grad_norm": 1.1356217861175537, + "learning_rate": 1.5586298556152964e-05, + "loss": 2.7008, + "step": 30531 + }, + { + "epoch": 2.7661434622092367, + "grad_norm": 1.035109281539917, + "learning_rate": 1.5580257355162205e-05, + "loss": 2.5966, + "step": 30532 + }, + { + "epoch": 2.7662340603836832, + "grad_norm": 1.013082504272461, + "learning_rate": 1.557421615417145e-05, + "loss": 2.482, + "step": 30533 + }, + { + "epoch": 2.76632465855813, + "grad_norm": 1.0419635772705078, + "learning_rate": 1.556817495318069e-05, + "loss": 2.9694, + "step": 30534 + }, + { + "epoch": 2.766415256732577, + "grad_norm": 0.9941002130508423, + "learning_rate": 1.5562133752189938e-05, + "loss": 2.7619, + "step": 30535 + }, + { + "epoch": 2.766505854907024, + "grad_norm": 1.00498628616333, + "learning_rate": 1.555609255119918e-05, + "loss": 2.7593, + "step": 30536 + }, + { + "epoch": 2.7665964530814704, + "grad_norm": 0.9977261424064636, + "learning_rate": 1.5550051350208423e-05, + "loss": 2.4412, + "step": 30537 + }, + { + "epoch": 2.766687051255917, + "grad_norm": 0.9984035491943359, + "learning_rate": 1.5544010149217667e-05, + "loss": 2.5002, + "step": 30538 + }, + { + "epoch": 2.766777649430364, + "grad_norm": 1.20274019241333, + "learning_rate": 1.5537968948226908e-05, + "loss": 2.3023, + "step": 30539 + }, + { + "epoch": 2.766868247604811, + "grad_norm": 0.9982714653015137, + "learning_rate": 1.5531927747236152e-05, + "loss": 2.7026, + "step": 30540 + }, + { + "epoch": 2.7669588457792575, + "grad_norm": 0.9583539962768555, + "learning_rate": 1.5525886546245393e-05, + "loss": 2.5111, + "step": 30541 + }, + { + "epoch": 2.767049443953704, + "grad_norm": 1.0151138305664062, + "learning_rate": 1.5519845345254637e-05, + "loss": 2.5459, + "step": 30542 + }, + { + "epoch": 2.767140042128151, + "grad_norm": 0.9230372309684753, + "learning_rate": 1.551380414426388e-05, + "loss": 1.8796, + "step": 30543 + }, + { + "epoch": 2.767230640302598, + "grad_norm": 1.0702500343322754, + "learning_rate": 1.5507762943273126e-05, + "loss": 2.6544, + "step": 30544 + }, + { + "epoch": 2.7673212384770447, + "grad_norm": 0.9712371826171875, + "learning_rate": 1.5501721742282367e-05, + "loss": 2.5957, + "step": 30545 + }, + { + "epoch": 2.7674118366514913, + "grad_norm": 1.0419217348098755, + "learning_rate": 1.549568054129161e-05, + "loss": 2.4308, + "step": 30546 + }, + { + "epoch": 2.7675024348259383, + "grad_norm": 1.0607059001922607, + "learning_rate": 1.548963934030085e-05, + "loss": 2.5622, + "step": 30547 + }, + { + "epoch": 2.7675930330003853, + "grad_norm": 1.1572022438049316, + "learning_rate": 1.5483598139310096e-05, + "loss": 2.4823, + "step": 30548 + }, + { + "epoch": 2.767683631174832, + "grad_norm": 0.8806160688400269, + "learning_rate": 1.5477556938319337e-05, + "loss": 2.128, + "step": 30549 + }, + { + "epoch": 2.7677742293492784, + "grad_norm": 0.9996864795684814, + "learning_rate": 1.547151573732858e-05, + "loss": 1.8663, + "step": 30550 + }, + { + "epoch": 2.7678648275237254, + "grad_norm": 1.041912317276001, + "learning_rate": 1.5465474536337825e-05, + "loss": 2.4104, + "step": 30551 + }, + { + "epoch": 2.7679554256981724, + "grad_norm": 1.0460177659988403, + "learning_rate": 1.545943333534707e-05, + "loss": 2.7135, + "step": 30552 + }, + { + "epoch": 2.768046023872619, + "grad_norm": 1.0801290273666382, + "learning_rate": 1.5453392134356313e-05, + "loss": 2.7254, + "step": 30553 + }, + { + "epoch": 2.7681366220470656, + "grad_norm": 1.028894305229187, + "learning_rate": 1.5447350933365554e-05, + "loss": 2.51, + "step": 30554 + }, + { + "epoch": 2.7682272202215126, + "grad_norm": 1.0963398218154907, + "learning_rate": 1.54413097323748e-05, + "loss": 2.567, + "step": 30555 + }, + { + "epoch": 2.7683178183959596, + "grad_norm": 1.147442102432251, + "learning_rate": 1.543526853138404e-05, + "loss": 2.6221, + "step": 30556 + }, + { + "epoch": 2.768408416570406, + "grad_norm": 0.9863128066062927, + "learning_rate": 1.5429227330393284e-05, + "loss": 2.6352, + "step": 30557 + }, + { + "epoch": 2.7684990147448527, + "grad_norm": 1.0393952131271362, + "learning_rate": 1.5423186129402524e-05, + "loss": 2.6791, + "step": 30558 + }, + { + "epoch": 2.7685896129192997, + "grad_norm": 1.037319302558899, + "learning_rate": 1.541714492841177e-05, + "loss": 2.8486, + "step": 30559 + }, + { + "epoch": 2.7686802110937467, + "grad_norm": 1.0777517557144165, + "learning_rate": 1.5411103727421013e-05, + "loss": 2.5679, + "step": 30560 + }, + { + "epoch": 2.7687708092681933, + "grad_norm": 1.04743230342865, + "learning_rate": 1.5405062526430257e-05, + "loss": 2.6425, + "step": 30561 + }, + { + "epoch": 2.76886140744264, + "grad_norm": 1.0474826097488403, + "learning_rate": 1.5399021325439498e-05, + "loss": 2.4735, + "step": 30562 + }, + { + "epoch": 2.768952005617087, + "grad_norm": 1.0358455181121826, + "learning_rate": 1.5392980124448742e-05, + "loss": 2.541, + "step": 30563 + }, + { + "epoch": 2.7690426037915334, + "grad_norm": 1.052635908126831, + "learning_rate": 1.5386938923457983e-05, + "loss": 2.701, + "step": 30564 + }, + { + "epoch": 2.7691332019659805, + "grad_norm": 0.9548643827438354, + "learning_rate": 1.5380897722467227e-05, + "loss": 2.4504, + "step": 30565 + }, + { + "epoch": 2.769223800140427, + "grad_norm": 1.0455342531204224, + "learning_rate": 1.5374856521476468e-05, + "loss": 2.7998, + "step": 30566 + }, + { + "epoch": 2.769314398314874, + "grad_norm": 1.0024902820587158, + "learning_rate": 1.5368815320485712e-05, + "loss": 2.5912, + "step": 30567 + }, + { + "epoch": 2.7694049964893206, + "grad_norm": 0.9851289987564087, + "learning_rate": 1.5362774119494956e-05, + "loss": 2.6402, + "step": 30568 + }, + { + "epoch": 2.7694955946637676, + "grad_norm": 1.0324554443359375, + "learning_rate": 1.53567329185042e-05, + "loss": 2.7961, + "step": 30569 + }, + { + "epoch": 2.769586192838214, + "grad_norm": 1.0567047595977783, + "learning_rate": 1.5350691717513445e-05, + "loss": 2.3542, + "step": 30570 + }, + { + "epoch": 2.769676791012661, + "grad_norm": 1.089593768119812, + "learning_rate": 1.5344650516522686e-05, + "loss": 2.6065, + "step": 30571 + }, + { + "epoch": 2.7697673891871077, + "grad_norm": 1.055456280708313, + "learning_rate": 1.533860931553193e-05, + "loss": 2.6519, + "step": 30572 + }, + { + "epoch": 2.7698579873615548, + "grad_norm": 1.0392351150512695, + "learning_rate": 1.533256811454117e-05, + "loss": 2.5267, + "step": 30573 + }, + { + "epoch": 2.7699485855360013, + "grad_norm": 0.9466044306755066, + "learning_rate": 1.5326526913550415e-05, + "loss": 1.8754, + "step": 30574 + }, + { + "epoch": 2.7700391837104483, + "grad_norm": 1.1619623899459839, + "learning_rate": 1.5320485712559656e-05, + "loss": 2.7634, + "step": 30575 + }, + { + "epoch": 2.770129781884895, + "grad_norm": 0.9531899094581604, + "learning_rate": 1.53144445115689e-05, + "loss": 2.5226, + "step": 30576 + }, + { + "epoch": 2.770220380059342, + "grad_norm": 1.0134555101394653, + "learning_rate": 1.5308403310578144e-05, + "loss": 2.7973, + "step": 30577 + }, + { + "epoch": 2.7703109782337885, + "grad_norm": 0.9816951751708984, + "learning_rate": 1.530236210958739e-05, + "loss": 2.5909, + "step": 30578 + }, + { + "epoch": 2.7704015764082355, + "grad_norm": 0.9276016354560852, + "learning_rate": 1.529632090859663e-05, + "loss": 2.3458, + "step": 30579 + }, + { + "epoch": 2.770492174582682, + "grad_norm": 0.9895367622375488, + "learning_rate": 1.5290279707605873e-05, + "loss": 2.511, + "step": 30580 + }, + { + "epoch": 2.770582772757129, + "grad_norm": 0.9974400997161865, + "learning_rate": 1.5284238506615114e-05, + "loss": 2.7495, + "step": 30581 + }, + { + "epoch": 2.7706733709315756, + "grad_norm": 0.9192203879356384, + "learning_rate": 1.527819730562436e-05, + "loss": 2.3667, + "step": 30582 + }, + { + "epoch": 2.7707639691060226, + "grad_norm": 0.9770763516426086, + "learning_rate": 1.5272156104633603e-05, + "loss": 2.4669, + "step": 30583 + }, + { + "epoch": 2.770854567280469, + "grad_norm": 1.0989278554916382, + "learning_rate": 1.5266114903642844e-05, + "loss": 2.7363, + "step": 30584 + }, + { + "epoch": 2.770945165454916, + "grad_norm": 0.9121161699295044, + "learning_rate": 1.5260073702652088e-05, + "loss": 1.7978, + "step": 30585 + }, + { + "epoch": 2.7710357636293628, + "grad_norm": 1.0488005876541138, + "learning_rate": 1.525403250166133e-05, + "loss": 2.6228, + "step": 30586 + }, + { + "epoch": 2.77112636180381, + "grad_norm": 1.075276255607605, + "learning_rate": 1.5247991300670575e-05, + "loss": 2.5291, + "step": 30587 + }, + { + "epoch": 2.7712169599782563, + "grad_norm": 0.9990129470825195, + "learning_rate": 1.5241950099679817e-05, + "loss": 2.4534, + "step": 30588 + }, + { + "epoch": 2.7713075581527034, + "grad_norm": 1.0368980169296265, + "learning_rate": 1.5235908898689061e-05, + "loss": 2.5338, + "step": 30589 + }, + { + "epoch": 2.77139815632715, + "grad_norm": 0.9263126254081726, + "learning_rate": 1.5229867697698302e-05, + "loss": 2.637, + "step": 30590 + }, + { + "epoch": 2.771488754501597, + "grad_norm": 1.0098952054977417, + "learning_rate": 1.5223826496707546e-05, + "loss": 2.6748, + "step": 30591 + }, + { + "epoch": 2.7715793526760435, + "grad_norm": 1.0543831586837769, + "learning_rate": 1.5217785295716789e-05, + "loss": 2.7215, + "step": 30592 + }, + { + "epoch": 2.7716699508504905, + "grad_norm": 1.0169414281845093, + "learning_rate": 1.5211744094726033e-05, + "loss": 2.372, + "step": 30593 + }, + { + "epoch": 2.771760549024937, + "grad_norm": 1.015069603919983, + "learning_rate": 1.5205702893735274e-05, + "loss": 2.7466, + "step": 30594 + }, + { + "epoch": 2.771851147199384, + "grad_norm": 1.0133742094039917, + "learning_rate": 1.5199661692744518e-05, + "loss": 2.5395, + "step": 30595 + }, + { + "epoch": 2.7719417453738306, + "grad_norm": 0.9815243482589722, + "learning_rate": 1.519362049175376e-05, + "loss": 2.5867, + "step": 30596 + }, + { + "epoch": 2.7720323435482777, + "grad_norm": 1.0000778436660767, + "learning_rate": 1.5187579290763005e-05, + "loss": 3.0191, + "step": 30597 + }, + { + "epoch": 2.772122941722724, + "grad_norm": 1.0221456289291382, + "learning_rate": 1.5181538089772249e-05, + "loss": 2.9402, + "step": 30598 + }, + { + "epoch": 2.7722135398971712, + "grad_norm": 1.0264086723327637, + "learning_rate": 1.517549688878149e-05, + "loss": 2.6385, + "step": 30599 + }, + { + "epoch": 2.772304138071618, + "grad_norm": 1.0017294883728027, + "learning_rate": 1.5169455687790734e-05, + "loss": 2.6628, + "step": 30600 + }, + { + "epoch": 2.772394736246065, + "grad_norm": 0.8219570517539978, + "learning_rate": 1.5163414486799977e-05, + "loss": 1.7208, + "step": 30601 + }, + { + "epoch": 2.7724853344205114, + "grad_norm": 1.0494778156280518, + "learning_rate": 1.5157373285809221e-05, + "loss": 2.9793, + "step": 30602 + }, + { + "epoch": 2.7725759325949584, + "grad_norm": 0.9595188498497009, + "learning_rate": 1.5151332084818462e-05, + "loss": 2.6351, + "step": 30603 + }, + { + "epoch": 2.772666530769405, + "grad_norm": 1.0247129201889038, + "learning_rate": 1.5145290883827706e-05, + "loss": 2.2592, + "step": 30604 + }, + { + "epoch": 2.7727571289438515, + "grad_norm": 1.0227947235107422, + "learning_rate": 1.5139249682836948e-05, + "loss": 2.4088, + "step": 30605 + }, + { + "epoch": 2.7728477271182985, + "grad_norm": 1.0904407501220703, + "learning_rate": 1.5133208481846193e-05, + "loss": 2.6644, + "step": 30606 + }, + { + "epoch": 2.7729383252927455, + "grad_norm": 0.9817543625831604, + "learning_rate": 1.5127167280855433e-05, + "loss": 2.4645, + "step": 30607 + }, + { + "epoch": 2.773028923467192, + "grad_norm": 1.0837420225143433, + "learning_rate": 1.5121126079864678e-05, + "loss": 2.804, + "step": 30608 + }, + { + "epoch": 2.7731195216416387, + "grad_norm": 1.0517717599868774, + "learning_rate": 1.511508487887392e-05, + "loss": 3.0057, + "step": 30609 + }, + { + "epoch": 2.7732101198160857, + "grad_norm": 1.0164240598678589, + "learning_rate": 1.5109043677883164e-05, + "loss": 2.5547, + "step": 30610 + }, + { + "epoch": 2.7733007179905327, + "grad_norm": 0.9947965741157532, + "learning_rate": 1.5103002476892405e-05, + "loss": 2.605, + "step": 30611 + }, + { + "epoch": 2.7733913161649792, + "grad_norm": 1.0807466506958008, + "learning_rate": 1.509696127590165e-05, + "loss": 2.4654, + "step": 30612 + }, + { + "epoch": 2.773481914339426, + "grad_norm": 1.0375819206237793, + "learning_rate": 1.5090920074910894e-05, + "loss": 2.6328, + "step": 30613 + }, + { + "epoch": 2.773572512513873, + "grad_norm": 0.9718475341796875, + "learning_rate": 1.5084878873920136e-05, + "loss": 2.5533, + "step": 30614 + }, + { + "epoch": 2.77366311068832, + "grad_norm": 1.054801106452942, + "learning_rate": 1.507883767292938e-05, + "loss": 2.7106, + "step": 30615 + }, + { + "epoch": 2.7737537088627664, + "grad_norm": 1.0365684032440186, + "learning_rate": 1.5072796471938621e-05, + "loss": 2.6628, + "step": 30616 + }, + { + "epoch": 2.773844307037213, + "grad_norm": 1.2155547142028809, + "learning_rate": 1.5066755270947865e-05, + "loss": 2.4579, + "step": 30617 + }, + { + "epoch": 2.77393490521166, + "grad_norm": 1.1276323795318604, + "learning_rate": 1.5060714069957108e-05, + "loss": 2.6669, + "step": 30618 + }, + { + "epoch": 2.774025503386107, + "grad_norm": 1.1526566743850708, + "learning_rate": 1.5054672868966352e-05, + "loss": 2.3989, + "step": 30619 + }, + { + "epoch": 2.7741161015605535, + "grad_norm": 1.0215851068496704, + "learning_rate": 1.5048631667975593e-05, + "loss": 2.6678, + "step": 30620 + }, + { + "epoch": 2.774206699735, + "grad_norm": 1.0310660600662231, + "learning_rate": 1.5042590466984837e-05, + "loss": 2.6805, + "step": 30621 + }, + { + "epoch": 2.774297297909447, + "grad_norm": 1.0286228656768799, + "learning_rate": 1.503654926599408e-05, + "loss": 2.6713, + "step": 30622 + }, + { + "epoch": 2.774387896083894, + "grad_norm": 1.0901798009872437, + "learning_rate": 1.5030508065003324e-05, + "loss": 2.62, + "step": 30623 + }, + { + "epoch": 2.7744784942583407, + "grad_norm": 1.0204217433929443, + "learning_rate": 1.5024466864012565e-05, + "loss": 2.8405, + "step": 30624 + }, + { + "epoch": 2.7745690924327873, + "grad_norm": 1.1006686687469482, + "learning_rate": 1.5018425663021809e-05, + "loss": 3.0285, + "step": 30625 + }, + { + "epoch": 2.7746596906072343, + "grad_norm": 1.0291608572006226, + "learning_rate": 1.5012384462031053e-05, + "loss": 3.023, + "step": 30626 + }, + { + "epoch": 2.7747502887816813, + "grad_norm": 0.8381088972091675, + "learning_rate": 1.5006343261040296e-05, + "loss": 1.9582, + "step": 30627 + }, + { + "epoch": 2.774840886956128, + "grad_norm": 0.8393864035606384, + "learning_rate": 1.500030206004954e-05, + "loss": 1.8969, + "step": 30628 + }, + { + "epoch": 2.7749314851305744, + "grad_norm": 0.9602174162864685, + "learning_rate": 1.4994260859058781e-05, + "loss": 2.6027, + "step": 30629 + }, + { + "epoch": 2.7750220833050214, + "grad_norm": 1.2527174949645996, + "learning_rate": 1.4988219658068025e-05, + "loss": 2.5176, + "step": 30630 + }, + { + "epoch": 2.7751126814794684, + "grad_norm": 1.054222822189331, + "learning_rate": 1.4982178457077268e-05, + "loss": 2.7207, + "step": 30631 + }, + { + "epoch": 2.775203279653915, + "grad_norm": 1.0258618593215942, + "learning_rate": 1.4976137256086512e-05, + "loss": 2.366, + "step": 30632 + }, + { + "epoch": 2.7752938778283616, + "grad_norm": 1.1207295656204224, + "learning_rate": 1.4970096055095753e-05, + "loss": 2.4895, + "step": 30633 + }, + { + "epoch": 2.7753844760028086, + "grad_norm": 0.9779919981956482, + "learning_rate": 1.4964054854104997e-05, + "loss": 2.2314, + "step": 30634 + }, + { + "epoch": 2.7754750741772556, + "grad_norm": 1.033231496810913, + "learning_rate": 1.495801365311424e-05, + "loss": 2.6857, + "step": 30635 + }, + { + "epoch": 2.775565672351702, + "grad_norm": 0.9611170291900635, + "learning_rate": 1.4951972452123484e-05, + "loss": 2.0824, + "step": 30636 + }, + { + "epoch": 2.7756562705261487, + "grad_norm": 0.8893700242042542, + "learning_rate": 1.4945931251132724e-05, + "loss": 2.0017, + "step": 30637 + }, + { + "epoch": 2.7757468687005957, + "grad_norm": 1.1084964275360107, + "learning_rate": 1.4939890050141969e-05, + "loss": 2.2962, + "step": 30638 + }, + { + "epoch": 2.7758374668750427, + "grad_norm": 1.1783068180084229, + "learning_rate": 1.4933848849151211e-05, + "loss": 2.5931, + "step": 30639 + }, + { + "epoch": 2.7759280650494893, + "grad_norm": 0.9882144331932068, + "learning_rate": 1.4927807648160455e-05, + "loss": 2.419, + "step": 30640 + }, + { + "epoch": 2.776018663223936, + "grad_norm": 0.9063900113105774, + "learning_rate": 1.49217664471697e-05, + "loss": 1.9378, + "step": 30641 + }, + { + "epoch": 2.776109261398383, + "grad_norm": 0.9205338358879089, + "learning_rate": 1.491572524617894e-05, + "loss": 2.4588, + "step": 30642 + }, + { + "epoch": 2.7761998595728294, + "grad_norm": 1.004314661026001, + "learning_rate": 1.4909684045188185e-05, + "loss": 2.6575, + "step": 30643 + }, + { + "epoch": 2.7762904577472765, + "grad_norm": 1.0426976680755615, + "learning_rate": 1.4903642844197427e-05, + "loss": 2.4612, + "step": 30644 + }, + { + "epoch": 2.776381055921723, + "grad_norm": 0.969062864780426, + "learning_rate": 1.4897601643206671e-05, + "loss": 2.5529, + "step": 30645 + }, + { + "epoch": 2.77647165409617, + "grad_norm": 1.0412765741348267, + "learning_rate": 1.4891560442215912e-05, + "loss": 2.6638, + "step": 30646 + }, + { + "epoch": 2.7765622522706166, + "grad_norm": 1.074257493019104, + "learning_rate": 1.4885519241225156e-05, + "loss": 2.2669, + "step": 30647 + }, + { + "epoch": 2.7766528504450636, + "grad_norm": 0.8907572031021118, + "learning_rate": 1.4879478040234399e-05, + "loss": 1.8514, + "step": 30648 + }, + { + "epoch": 2.77674344861951, + "grad_norm": 1.1551380157470703, + "learning_rate": 1.4873436839243643e-05, + "loss": 2.599, + "step": 30649 + }, + { + "epoch": 2.776834046793957, + "grad_norm": 1.0148274898529053, + "learning_rate": 1.4867395638252884e-05, + "loss": 2.4726, + "step": 30650 + }, + { + "epoch": 2.7769246449684037, + "grad_norm": 0.9996286034584045, + "learning_rate": 1.4861354437262128e-05, + "loss": 2.4952, + "step": 30651 + }, + { + "epoch": 2.7770152431428508, + "grad_norm": 1.0252982378005981, + "learning_rate": 1.485531323627137e-05, + "loss": 2.6388, + "step": 30652 + }, + { + "epoch": 2.7771058413172973, + "grad_norm": 1.0077656507492065, + "learning_rate": 1.4849272035280615e-05, + "loss": 2.6278, + "step": 30653 + }, + { + "epoch": 2.7771964394917443, + "grad_norm": 1.0237560272216797, + "learning_rate": 1.4843230834289856e-05, + "loss": 2.4071, + "step": 30654 + }, + { + "epoch": 2.777287037666191, + "grad_norm": 1.0308853387832642, + "learning_rate": 1.48371896332991e-05, + "loss": 2.8393, + "step": 30655 + }, + { + "epoch": 2.777377635840638, + "grad_norm": 1.0349682569503784, + "learning_rate": 1.4831148432308344e-05, + "loss": 2.9433, + "step": 30656 + }, + { + "epoch": 2.7774682340150845, + "grad_norm": 0.9931530952453613, + "learning_rate": 1.4825107231317587e-05, + "loss": 2.5234, + "step": 30657 + }, + { + "epoch": 2.7775588321895315, + "grad_norm": 0.9550544619560242, + "learning_rate": 1.4819066030326831e-05, + "loss": 2.5625, + "step": 30658 + }, + { + "epoch": 2.777649430363978, + "grad_norm": 0.9934197068214417, + "learning_rate": 1.4813024829336072e-05, + "loss": 2.6117, + "step": 30659 + }, + { + "epoch": 2.777740028538425, + "grad_norm": 1.0018748044967651, + "learning_rate": 1.4806983628345316e-05, + "loss": 2.7192, + "step": 30660 + }, + { + "epoch": 2.7778306267128716, + "grad_norm": 1.012794852256775, + "learning_rate": 1.4800942427354559e-05, + "loss": 2.4062, + "step": 30661 + }, + { + "epoch": 2.7779212248873186, + "grad_norm": 1.0580670833587646, + "learning_rate": 1.4794901226363803e-05, + "loss": 2.6988, + "step": 30662 + }, + { + "epoch": 2.778011823061765, + "grad_norm": 1.0048811435699463, + "learning_rate": 1.4788860025373044e-05, + "loss": 2.5199, + "step": 30663 + }, + { + "epoch": 2.778102421236212, + "grad_norm": 0.9390949606895447, + "learning_rate": 1.4782818824382288e-05, + "loss": 2.7683, + "step": 30664 + }, + { + "epoch": 2.7781930194106588, + "grad_norm": 0.959726095199585, + "learning_rate": 1.477677762339153e-05, + "loss": 2.7622, + "step": 30665 + }, + { + "epoch": 2.778283617585106, + "grad_norm": 1.016951560974121, + "learning_rate": 1.4770736422400775e-05, + "loss": 2.7446, + "step": 30666 + }, + { + "epoch": 2.7783742157595523, + "grad_norm": 1.0972555875778198, + "learning_rate": 1.4764695221410015e-05, + "loss": 2.3986, + "step": 30667 + }, + { + "epoch": 2.7784648139339994, + "grad_norm": 1.0066548585891724, + "learning_rate": 1.475865402041926e-05, + "loss": 2.5373, + "step": 30668 + }, + { + "epoch": 2.778555412108446, + "grad_norm": 0.9589036107063293, + "learning_rate": 1.4752612819428502e-05, + "loss": 2.5241, + "step": 30669 + }, + { + "epoch": 2.778646010282893, + "grad_norm": 1.1071535348892212, + "learning_rate": 1.4746571618437746e-05, + "loss": 2.4883, + "step": 30670 + }, + { + "epoch": 2.7787366084573395, + "grad_norm": 1.1653634309768677, + "learning_rate": 1.474053041744699e-05, + "loss": 2.5583, + "step": 30671 + }, + { + "epoch": 2.7788272066317865, + "grad_norm": 0.9855630993843079, + "learning_rate": 1.4734489216456231e-05, + "loss": 2.4714, + "step": 30672 + }, + { + "epoch": 2.778917804806233, + "grad_norm": 0.9315958023071289, + "learning_rate": 1.4728448015465476e-05, + "loss": 2.5344, + "step": 30673 + }, + { + "epoch": 2.77900840298068, + "grad_norm": 1.1058831214904785, + "learning_rate": 1.4722406814474718e-05, + "loss": 2.3735, + "step": 30674 + }, + { + "epoch": 2.7790990011551266, + "grad_norm": 1.004601001739502, + "learning_rate": 1.4716365613483962e-05, + "loss": 2.4715, + "step": 30675 + }, + { + "epoch": 2.7791895993295737, + "grad_norm": 0.8785958886146545, + "learning_rate": 1.4710324412493203e-05, + "loss": 2.1304, + "step": 30676 + }, + { + "epoch": 2.7792801975040202, + "grad_norm": 1.178537368774414, + "learning_rate": 1.4704283211502447e-05, + "loss": 2.4258, + "step": 30677 + }, + { + "epoch": 2.7793707956784672, + "grad_norm": 1.0906850099563599, + "learning_rate": 1.469824201051169e-05, + "loss": 2.6285, + "step": 30678 + }, + { + "epoch": 2.779461393852914, + "grad_norm": 1.0627676248550415, + "learning_rate": 1.4692200809520934e-05, + "loss": 2.6615, + "step": 30679 + }, + { + "epoch": 2.779551992027361, + "grad_norm": 0.8337739706039429, + "learning_rate": 1.4686159608530175e-05, + "loss": 1.9358, + "step": 30680 + }, + { + "epoch": 2.7796425902018074, + "grad_norm": 0.9547391533851624, + "learning_rate": 1.468011840753942e-05, + "loss": 2.4861, + "step": 30681 + }, + { + "epoch": 2.7797331883762544, + "grad_norm": 0.9584828019142151, + "learning_rate": 1.4674077206548662e-05, + "loss": 2.6524, + "step": 30682 + }, + { + "epoch": 2.779823786550701, + "grad_norm": 0.9319374561309814, + "learning_rate": 1.4668036005557906e-05, + "loss": 1.8143, + "step": 30683 + }, + { + "epoch": 2.7799143847251475, + "grad_norm": 1.0365021228790283, + "learning_rate": 1.4661994804567147e-05, + "loss": 2.5697, + "step": 30684 + }, + { + "epoch": 2.7800049828995945, + "grad_norm": 1.0592199563980103, + "learning_rate": 1.4655953603576391e-05, + "loss": 2.6481, + "step": 30685 + }, + { + "epoch": 2.7800955810740415, + "grad_norm": 1.0151066780090332, + "learning_rate": 1.4649912402585635e-05, + "loss": 2.4774, + "step": 30686 + }, + { + "epoch": 2.780186179248488, + "grad_norm": 1.037418007850647, + "learning_rate": 1.4643871201594878e-05, + "loss": 2.5165, + "step": 30687 + }, + { + "epoch": 2.7802767774229347, + "grad_norm": 0.9860263466835022, + "learning_rate": 1.4637830000604122e-05, + "loss": 2.608, + "step": 30688 + }, + { + "epoch": 2.7803673755973817, + "grad_norm": 1.022498369216919, + "learning_rate": 1.4631788799613363e-05, + "loss": 2.5132, + "step": 30689 + }, + { + "epoch": 2.7804579737718287, + "grad_norm": 0.9221192002296448, + "learning_rate": 1.4625747598622607e-05, + "loss": 2.6201, + "step": 30690 + }, + { + "epoch": 2.7805485719462752, + "grad_norm": 0.8825112581253052, + "learning_rate": 1.461970639763185e-05, + "loss": 1.955, + "step": 30691 + }, + { + "epoch": 2.780639170120722, + "grad_norm": 0.82472825050354, + "learning_rate": 1.4613665196641094e-05, + "loss": 1.9619, + "step": 30692 + }, + { + "epoch": 2.780729768295169, + "grad_norm": 1.101722002029419, + "learning_rate": 1.4607623995650335e-05, + "loss": 2.282, + "step": 30693 + }, + { + "epoch": 2.780820366469616, + "grad_norm": 1.1073544025421143, + "learning_rate": 1.4601582794659579e-05, + "loss": 2.5055, + "step": 30694 + }, + { + "epoch": 2.7809109646440624, + "grad_norm": 0.9401633739471436, + "learning_rate": 1.4595541593668821e-05, + "loss": 2.4468, + "step": 30695 + }, + { + "epoch": 2.781001562818509, + "grad_norm": 1.078884482383728, + "learning_rate": 1.4589500392678066e-05, + "loss": 2.4747, + "step": 30696 + }, + { + "epoch": 2.781092160992956, + "grad_norm": 1.0138064622879028, + "learning_rate": 1.4583459191687306e-05, + "loss": 2.6771, + "step": 30697 + }, + { + "epoch": 2.781182759167403, + "grad_norm": 1.0384936332702637, + "learning_rate": 1.457741799069655e-05, + "loss": 2.5277, + "step": 30698 + }, + { + "epoch": 2.7812733573418496, + "grad_norm": 0.9739617109298706, + "learning_rate": 1.4571376789705793e-05, + "loss": 2.4872, + "step": 30699 + }, + { + "epoch": 2.781363955516296, + "grad_norm": 0.997419536113739, + "learning_rate": 1.4565335588715037e-05, + "loss": 2.6861, + "step": 30700 + }, + { + "epoch": 2.781454553690743, + "grad_norm": 1.103402853012085, + "learning_rate": 1.4559294387724282e-05, + "loss": 2.9291, + "step": 30701 + }, + { + "epoch": 2.78154515186519, + "grad_norm": 0.9011237621307373, + "learning_rate": 1.4553253186733522e-05, + "loss": 1.9834, + "step": 30702 + }, + { + "epoch": 2.7816357500396367, + "grad_norm": 0.9879522323608398, + "learning_rate": 1.4547211985742768e-05, + "loss": 2.9231, + "step": 30703 + }, + { + "epoch": 2.7817263482140833, + "grad_norm": 1.0171970129013062, + "learning_rate": 1.4541170784752009e-05, + "loss": 2.4174, + "step": 30704 + }, + { + "epoch": 2.7818169463885303, + "grad_norm": 1.0119054317474365, + "learning_rate": 1.4535129583761253e-05, + "loss": 2.644, + "step": 30705 + }, + { + "epoch": 2.7819075445629773, + "grad_norm": 1.084101676940918, + "learning_rate": 1.4529088382770494e-05, + "loss": 2.4067, + "step": 30706 + }, + { + "epoch": 2.781998142737424, + "grad_norm": 0.9069783687591553, + "learning_rate": 1.452304718177974e-05, + "loss": 2.0759, + "step": 30707 + }, + { + "epoch": 2.7820887409118704, + "grad_norm": 0.9805590510368347, + "learning_rate": 1.4517005980788981e-05, + "loss": 2.5254, + "step": 30708 + }, + { + "epoch": 2.7821793390863174, + "grad_norm": 1.0468530654907227, + "learning_rate": 1.4510964779798225e-05, + "loss": 2.5609, + "step": 30709 + }, + { + "epoch": 2.7822699372607644, + "grad_norm": 1.1471421718597412, + "learning_rate": 1.4504923578807466e-05, + "loss": 2.6429, + "step": 30710 + }, + { + "epoch": 2.782360535435211, + "grad_norm": 0.9996014833450317, + "learning_rate": 1.449888237781671e-05, + "loss": 2.4983, + "step": 30711 + }, + { + "epoch": 2.7824511336096576, + "grad_norm": 0.9793367981910706, + "learning_rate": 1.4492841176825953e-05, + "loss": 2.4328, + "step": 30712 + }, + { + "epoch": 2.7825417317841046, + "grad_norm": 0.9486910104751587, + "learning_rate": 1.4486799975835197e-05, + "loss": 2.469, + "step": 30713 + }, + { + "epoch": 2.7826323299585516, + "grad_norm": 1.0070005655288696, + "learning_rate": 1.4480758774844438e-05, + "loss": 2.5205, + "step": 30714 + }, + { + "epoch": 2.782722928132998, + "grad_norm": 1.0344196557998657, + "learning_rate": 1.4474717573853682e-05, + "loss": 2.7072, + "step": 30715 + }, + { + "epoch": 2.7828135263074447, + "grad_norm": 1.084805965423584, + "learning_rate": 1.4468676372862928e-05, + "loss": 2.655, + "step": 30716 + }, + { + "epoch": 2.7829041244818917, + "grad_norm": 0.9775916934013367, + "learning_rate": 1.4462635171872169e-05, + "loss": 2.6682, + "step": 30717 + }, + { + "epoch": 2.7829947226563387, + "grad_norm": 1.0867539644241333, + "learning_rate": 1.4456593970881413e-05, + "loss": 2.7306, + "step": 30718 + }, + { + "epoch": 2.7830853208307853, + "grad_norm": 1.0301495790481567, + "learning_rate": 1.4450552769890654e-05, + "loss": 2.8581, + "step": 30719 + }, + { + "epoch": 2.783175919005232, + "grad_norm": 1.0432811975479126, + "learning_rate": 1.44445115688999e-05, + "loss": 2.6255, + "step": 30720 + }, + { + "epoch": 2.783266517179679, + "grad_norm": 1.0650197267532349, + "learning_rate": 1.443847036790914e-05, + "loss": 2.9579, + "step": 30721 + }, + { + "epoch": 2.783357115354126, + "grad_norm": 1.1206316947937012, + "learning_rate": 1.4432429166918385e-05, + "loss": 2.8728, + "step": 30722 + }, + { + "epoch": 2.7834477135285725, + "grad_norm": 1.0090876817703247, + "learning_rate": 1.4426387965927626e-05, + "loss": 1.79, + "step": 30723 + }, + { + "epoch": 2.783538311703019, + "grad_norm": 0.9586791396141052, + "learning_rate": 1.4420346764936871e-05, + "loss": 2.4583, + "step": 30724 + }, + { + "epoch": 2.783628909877466, + "grad_norm": 1.0700801610946655, + "learning_rate": 1.4414305563946112e-05, + "loss": 2.5599, + "step": 30725 + }, + { + "epoch": 2.7837195080519126, + "grad_norm": 0.9542999267578125, + "learning_rate": 1.4408264362955356e-05, + "loss": 2.6781, + "step": 30726 + }, + { + "epoch": 2.7838101062263596, + "grad_norm": 1.0164648294448853, + "learning_rate": 1.4402223161964597e-05, + "loss": 2.7311, + "step": 30727 + }, + { + "epoch": 2.783900704400806, + "grad_norm": 1.127086877822876, + "learning_rate": 1.4396181960973843e-05, + "loss": 2.9373, + "step": 30728 + }, + { + "epoch": 2.783991302575253, + "grad_norm": 1.170324444770813, + "learning_rate": 1.4390140759983084e-05, + "loss": 2.5443, + "step": 30729 + }, + { + "epoch": 2.7840819007496997, + "grad_norm": 1.0357412099838257, + "learning_rate": 1.4384099558992328e-05, + "loss": 2.5749, + "step": 30730 + }, + { + "epoch": 2.7841724989241468, + "grad_norm": 1.0590630769729614, + "learning_rate": 1.4378058358001572e-05, + "loss": 2.3436, + "step": 30731 + }, + { + "epoch": 2.7842630970985933, + "grad_norm": 1.0111666917800903, + "learning_rate": 1.4372017157010815e-05, + "loss": 2.7016, + "step": 30732 + }, + { + "epoch": 2.7843536952730403, + "grad_norm": 1.0888653993606567, + "learning_rate": 1.436597595602006e-05, + "loss": 2.419, + "step": 30733 + }, + { + "epoch": 2.784444293447487, + "grad_norm": 0.9671772718429565, + "learning_rate": 1.43599347550293e-05, + "loss": 2.5737, + "step": 30734 + }, + { + "epoch": 2.784534891621934, + "grad_norm": 0.9820460081100464, + "learning_rate": 1.4353893554038544e-05, + "loss": 2.5195, + "step": 30735 + }, + { + "epoch": 2.7846254897963805, + "grad_norm": 1.1001719236373901, + "learning_rate": 1.4347852353047787e-05, + "loss": 2.8497, + "step": 30736 + }, + { + "epoch": 2.7847160879708275, + "grad_norm": 0.9785716533660889, + "learning_rate": 1.4341811152057031e-05, + "loss": 2.5681, + "step": 30737 + }, + { + "epoch": 2.784806686145274, + "grad_norm": 1.0157867670059204, + "learning_rate": 1.4335769951066272e-05, + "loss": 2.7477, + "step": 30738 + }, + { + "epoch": 2.784897284319721, + "grad_norm": 1.026777744293213, + "learning_rate": 1.4329728750075516e-05, + "loss": 2.7578, + "step": 30739 + }, + { + "epoch": 2.7849878824941676, + "grad_norm": 1.0683174133300781, + "learning_rate": 1.4323687549084759e-05, + "loss": 2.8288, + "step": 30740 + }, + { + "epoch": 2.7850784806686146, + "grad_norm": 0.8872113823890686, + "learning_rate": 1.4317646348094003e-05, + "loss": 2.1649, + "step": 30741 + }, + { + "epoch": 2.785169078843061, + "grad_norm": 1.0257095098495483, + "learning_rate": 1.4311605147103244e-05, + "loss": 2.4977, + "step": 30742 + }, + { + "epoch": 2.785259677017508, + "grad_norm": 0.9986939430236816, + "learning_rate": 1.4305563946112488e-05, + "loss": 2.491, + "step": 30743 + }, + { + "epoch": 2.7853502751919548, + "grad_norm": 0.9033051133155823, + "learning_rate": 1.4299522745121729e-05, + "loss": 2.652, + "step": 30744 + }, + { + "epoch": 2.785440873366402, + "grad_norm": 0.9667850136756897, + "learning_rate": 1.4293481544130975e-05, + "loss": 2.6826, + "step": 30745 + }, + { + "epoch": 2.7855314715408483, + "grad_norm": 1.112388253211975, + "learning_rate": 1.4287440343140219e-05, + "loss": 2.5346, + "step": 30746 + }, + { + "epoch": 2.7856220697152954, + "grad_norm": 1.0080842971801758, + "learning_rate": 1.428139914214946e-05, + "loss": 2.5129, + "step": 30747 + }, + { + "epoch": 2.785712667889742, + "grad_norm": 1.1148977279663086, + "learning_rate": 1.4275357941158704e-05, + "loss": 2.3423, + "step": 30748 + }, + { + "epoch": 2.785803266064189, + "grad_norm": 1.0097569227218628, + "learning_rate": 1.4269316740167946e-05, + "loss": 2.4925, + "step": 30749 + }, + { + "epoch": 2.7858938642386355, + "grad_norm": 0.9956900477409363, + "learning_rate": 1.426327553917719e-05, + "loss": 2.56, + "step": 30750 + }, + { + "epoch": 2.7859844624130825, + "grad_norm": 1.2053780555725098, + "learning_rate": 1.4257234338186431e-05, + "loss": 2.6497, + "step": 30751 + }, + { + "epoch": 2.786075060587529, + "grad_norm": 1.0893861055374146, + "learning_rate": 1.4251193137195676e-05, + "loss": 2.5539, + "step": 30752 + }, + { + "epoch": 2.786165658761976, + "grad_norm": 1.0486897230148315, + "learning_rate": 1.4245151936204918e-05, + "loss": 2.7712, + "step": 30753 + }, + { + "epoch": 2.7862562569364226, + "grad_norm": 0.9747064113616943, + "learning_rate": 1.4239110735214162e-05, + "loss": 2.4365, + "step": 30754 + }, + { + "epoch": 2.7863468551108697, + "grad_norm": 0.8864904046058655, + "learning_rate": 1.4233069534223403e-05, + "loss": 1.182, + "step": 30755 + }, + { + "epoch": 2.7864374532853162, + "grad_norm": 1.1066170930862427, + "learning_rate": 1.4227028333232647e-05, + "loss": 2.8124, + "step": 30756 + }, + { + "epoch": 2.7865280514597632, + "grad_norm": 1.0575547218322754, + "learning_rate": 1.422098713224189e-05, + "loss": 2.7244, + "step": 30757 + }, + { + "epoch": 2.78661864963421, + "grad_norm": 0.908953845500946, + "learning_rate": 1.4214945931251134e-05, + "loss": 2.101, + "step": 30758 + }, + { + "epoch": 2.786709247808657, + "grad_norm": 1.0160874128341675, + "learning_rate": 1.4208904730260378e-05, + "loss": 2.7598, + "step": 30759 + }, + { + "epoch": 2.7867998459831034, + "grad_norm": 0.9334449768066406, + "learning_rate": 1.420286352926962e-05, + "loss": 2.0701, + "step": 30760 + }, + { + "epoch": 2.7868904441575504, + "grad_norm": 0.9068641066551208, + "learning_rate": 1.4196822328278863e-05, + "loss": 2.4861, + "step": 30761 + }, + { + "epoch": 2.786981042331997, + "grad_norm": 1.1102715730667114, + "learning_rate": 1.4190781127288106e-05, + "loss": 2.6371, + "step": 30762 + }, + { + "epoch": 2.787071640506444, + "grad_norm": 1.0387179851531982, + "learning_rate": 1.418473992629735e-05, + "loss": 2.6757, + "step": 30763 + }, + { + "epoch": 2.7871622386808905, + "grad_norm": 0.9950082302093506, + "learning_rate": 1.4178698725306591e-05, + "loss": 2.7764, + "step": 30764 + }, + { + "epoch": 2.7872528368553375, + "grad_norm": 1.0118186473846436, + "learning_rate": 1.4172657524315835e-05, + "loss": 2.2371, + "step": 30765 + }, + { + "epoch": 2.787343435029784, + "grad_norm": 0.9350147843360901, + "learning_rate": 1.4166616323325078e-05, + "loss": 2.5082, + "step": 30766 + }, + { + "epoch": 2.7874340332042307, + "grad_norm": 0.9628370404243469, + "learning_rate": 1.4160575122334322e-05, + "loss": 2.3304, + "step": 30767 + }, + { + "epoch": 2.7875246313786777, + "grad_norm": 1.0473510026931763, + "learning_rate": 1.4154533921343563e-05, + "loss": 2.7384, + "step": 30768 + }, + { + "epoch": 2.7876152295531247, + "grad_norm": 1.1023330688476562, + "learning_rate": 1.4148492720352807e-05, + "loss": 2.7113, + "step": 30769 + }, + { + "epoch": 2.7877058277275713, + "grad_norm": 1.0197821855545044, + "learning_rate": 1.414245151936205e-05, + "loss": 2.6692, + "step": 30770 + }, + { + "epoch": 2.787796425902018, + "grad_norm": 1.0553021430969238, + "learning_rate": 1.4136410318371294e-05, + "loss": 2.6007, + "step": 30771 + }, + { + "epoch": 2.787887024076465, + "grad_norm": 1.0016010999679565, + "learning_rate": 1.4130369117380535e-05, + "loss": 2.7199, + "step": 30772 + }, + { + "epoch": 2.787977622250912, + "grad_norm": 1.1271864175796509, + "learning_rate": 1.4124327916389779e-05, + "loss": 2.7944, + "step": 30773 + }, + { + "epoch": 2.7880682204253584, + "grad_norm": 1.0807664394378662, + "learning_rate": 1.4118286715399023e-05, + "loss": 2.507, + "step": 30774 + }, + { + "epoch": 2.788158818599805, + "grad_norm": 0.9409186244010925, + "learning_rate": 1.4112245514408266e-05, + "loss": 2.516, + "step": 30775 + }, + { + "epoch": 2.788249416774252, + "grad_norm": 0.9973142743110657, + "learning_rate": 1.410620431341751e-05, + "loss": 2.5686, + "step": 30776 + }, + { + "epoch": 2.788340014948699, + "grad_norm": 1.1418402194976807, + "learning_rate": 1.410016311242675e-05, + "loss": 2.424, + "step": 30777 + }, + { + "epoch": 2.7884306131231456, + "grad_norm": 1.0029990673065186, + "learning_rate": 1.4094121911435995e-05, + "loss": 2.5467, + "step": 30778 + }, + { + "epoch": 2.788521211297592, + "grad_norm": 1.0170352458953857, + "learning_rate": 1.4088080710445237e-05, + "loss": 2.6036, + "step": 30779 + }, + { + "epoch": 2.788611809472039, + "grad_norm": 1.0172967910766602, + "learning_rate": 1.4082039509454482e-05, + "loss": 2.6411, + "step": 30780 + }, + { + "epoch": 2.788702407646486, + "grad_norm": 0.9439435005187988, + "learning_rate": 1.4075998308463722e-05, + "loss": 2.6212, + "step": 30781 + }, + { + "epoch": 2.7887930058209327, + "grad_norm": 1.095384955406189, + "learning_rate": 1.4069957107472967e-05, + "loss": 2.9281, + "step": 30782 + }, + { + "epoch": 2.7888836039953793, + "grad_norm": 0.9804745316505432, + "learning_rate": 1.4063915906482209e-05, + "loss": 2.6132, + "step": 30783 + }, + { + "epoch": 2.7889742021698263, + "grad_norm": 1.074494481086731, + "learning_rate": 1.4057874705491453e-05, + "loss": 2.8153, + "step": 30784 + }, + { + "epoch": 2.7890648003442733, + "grad_norm": 0.9932643175125122, + "learning_rate": 1.4051833504500694e-05, + "loss": 2.8131, + "step": 30785 + }, + { + "epoch": 2.78915539851872, + "grad_norm": 0.9623073935508728, + "learning_rate": 1.4045792303509938e-05, + "loss": 2.7955, + "step": 30786 + }, + { + "epoch": 2.7892459966931664, + "grad_norm": 0.999864399433136, + "learning_rate": 1.4039751102519181e-05, + "loss": 2.5876, + "step": 30787 + }, + { + "epoch": 2.7893365948676134, + "grad_norm": 0.9863595366477966, + "learning_rate": 1.4033709901528425e-05, + "loss": 2.6039, + "step": 30788 + }, + { + "epoch": 2.7894271930420604, + "grad_norm": 1.0558935403823853, + "learning_rate": 1.402766870053767e-05, + "loss": 2.8091, + "step": 30789 + }, + { + "epoch": 2.789517791216507, + "grad_norm": 0.9749823808670044, + "learning_rate": 1.402162749954691e-05, + "loss": 2.5672, + "step": 30790 + }, + { + "epoch": 2.7896083893909536, + "grad_norm": 0.9968629479408264, + "learning_rate": 1.4015586298556154e-05, + "loss": 2.7405, + "step": 30791 + }, + { + "epoch": 2.7896989875654006, + "grad_norm": 1.1133661270141602, + "learning_rate": 1.4009545097565397e-05, + "loss": 2.7803, + "step": 30792 + }, + { + "epoch": 2.7897895857398476, + "grad_norm": 1.0600701570510864, + "learning_rate": 1.4003503896574641e-05, + "loss": 2.8109, + "step": 30793 + }, + { + "epoch": 2.789880183914294, + "grad_norm": 1.043332815170288, + "learning_rate": 1.3997462695583882e-05, + "loss": 2.9014, + "step": 30794 + }, + { + "epoch": 2.7899707820887407, + "grad_norm": 1.0404019355773926, + "learning_rate": 1.3991421494593126e-05, + "loss": 2.6563, + "step": 30795 + }, + { + "epoch": 2.7900613802631877, + "grad_norm": 0.97291100025177, + "learning_rate": 1.3985380293602369e-05, + "loss": 2.5321, + "step": 30796 + }, + { + "epoch": 2.7901519784376347, + "grad_norm": 1.0303796529769897, + "learning_rate": 1.3979339092611613e-05, + "loss": 2.6507, + "step": 30797 + }, + { + "epoch": 2.7902425766120813, + "grad_norm": 0.957626461982727, + "learning_rate": 1.3973297891620854e-05, + "loss": 2.5615, + "step": 30798 + }, + { + "epoch": 2.790333174786528, + "grad_norm": 0.9472256898880005, + "learning_rate": 1.3967256690630098e-05, + "loss": 2.6753, + "step": 30799 + }, + { + "epoch": 2.790423772960975, + "grad_norm": 1.035942792892456, + "learning_rate": 1.396121548963934e-05, + "loss": 2.6374, + "step": 30800 + }, + { + "epoch": 2.790514371135422, + "grad_norm": 0.9960569143295288, + "learning_rate": 1.3955174288648585e-05, + "loss": 2.4361, + "step": 30801 + }, + { + "epoch": 2.7906049693098685, + "grad_norm": 0.9664934277534485, + "learning_rate": 1.3949133087657826e-05, + "loss": 2.5234, + "step": 30802 + }, + { + "epoch": 2.790695567484315, + "grad_norm": 0.9578012228012085, + "learning_rate": 1.394309188666707e-05, + "loss": 2.533, + "step": 30803 + }, + { + "epoch": 2.790786165658762, + "grad_norm": 0.9447445273399353, + "learning_rate": 1.3937050685676314e-05, + "loss": 2.609, + "step": 30804 + }, + { + "epoch": 2.7908767638332086, + "grad_norm": 0.9677905440330505, + "learning_rate": 1.3931009484685556e-05, + "loss": 2.6345, + "step": 30805 + }, + { + "epoch": 2.7909673620076556, + "grad_norm": 1.065786600112915, + "learning_rate": 1.39249682836948e-05, + "loss": 2.7211, + "step": 30806 + }, + { + "epoch": 2.791057960182102, + "grad_norm": 1.0466426610946655, + "learning_rate": 1.3918927082704042e-05, + "loss": 2.6211, + "step": 30807 + }, + { + "epoch": 2.791148558356549, + "grad_norm": 0.8290188312530518, + "learning_rate": 1.3912885881713286e-05, + "loss": 2.0195, + "step": 30808 + }, + { + "epoch": 2.7912391565309957, + "grad_norm": 0.9981095790863037, + "learning_rate": 1.3906844680722528e-05, + "loss": 2.5413, + "step": 30809 + }, + { + "epoch": 2.7913297547054428, + "grad_norm": 0.8829295039176941, + "learning_rate": 1.3900803479731772e-05, + "loss": 1.8696, + "step": 30810 + }, + { + "epoch": 2.7914203528798893, + "grad_norm": 1.1920241117477417, + "learning_rate": 1.3894762278741013e-05, + "loss": 2.566, + "step": 30811 + }, + { + "epoch": 2.7915109510543363, + "grad_norm": 1.090047836303711, + "learning_rate": 1.3888721077750258e-05, + "loss": 2.3267, + "step": 30812 + }, + { + "epoch": 2.791601549228783, + "grad_norm": 1.1049144268035889, + "learning_rate": 1.38826798767595e-05, + "loss": 2.5169, + "step": 30813 + }, + { + "epoch": 2.79169214740323, + "grad_norm": 1.0275310277938843, + "learning_rate": 1.3876638675768744e-05, + "loss": 2.7457, + "step": 30814 + }, + { + "epoch": 2.7917827455776765, + "grad_norm": 1.0638506412506104, + "learning_rate": 1.3870597474777985e-05, + "loss": 2.2057, + "step": 30815 + }, + { + "epoch": 2.7918733437521235, + "grad_norm": 1.0216583013534546, + "learning_rate": 1.386455627378723e-05, + "loss": 2.4857, + "step": 30816 + }, + { + "epoch": 2.79196394192657, + "grad_norm": 0.9324525594711304, + "learning_rate": 1.3858515072796472e-05, + "loss": 1.9956, + "step": 30817 + }, + { + "epoch": 2.792054540101017, + "grad_norm": 0.9636699557304382, + "learning_rate": 1.3852473871805716e-05, + "loss": 2.3548, + "step": 30818 + }, + { + "epoch": 2.7921451382754636, + "grad_norm": 0.9583924412727356, + "learning_rate": 1.384643267081496e-05, + "loss": 2.2702, + "step": 30819 + }, + { + "epoch": 2.7922357364499106, + "grad_norm": 1.028756856918335, + "learning_rate": 1.3840391469824201e-05, + "loss": 2.7487, + "step": 30820 + }, + { + "epoch": 2.792326334624357, + "grad_norm": 0.9070185422897339, + "learning_rate": 1.3834350268833445e-05, + "loss": 1.9138, + "step": 30821 + }, + { + "epoch": 2.792416932798804, + "grad_norm": 0.977350652217865, + "learning_rate": 1.3828309067842688e-05, + "loss": 2.4717, + "step": 30822 + }, + { + "epoch": 2.7925075309732508, + "grad_norm": 1.0391440391540527, + "learning_rate": 1.3822267866851932e-05, + "loss": 2.6727, + "step": 30823 + }, + { + "epoch": 2.792598129147698, + "grad_norm": 1.0554827451705933, + "learning_rate": 1.3816226665861173e-05, + "loss": 2.5447, + "step": 30824 + }, + { + "epoch": 2.7926887273221443, + "grad_norm": 1.0118911266326904, + "learning_rate": 1.3810185464870417e-05, + "loss": 2.4901, + "step": 30825 + }, + { + "epoch": 2.7927793254965914, + "grad_norm": 0.9347872138023376, + "learning_rate": 1.380414426387966e-05, + "loss": 2.462, + "step": 30826 + }, + { + "epoch": 2.792869923671038, + "grad_norm": 1.1851314306259155, + "learning_rate": 1.3798103062888904e-05, + "loss": 2.5832, + "step": 30827 + }, + { + "epoch": 2.792960521845485, + "grad_norm": 1.0575586557388306, + "learning_rate": 1.3792061861898145e-05, + "loss": 2.5014, + "step": 30828 + }, + { + "epoch": 2.7930511200199315, + "grad_norm": 0.9712785482406616, + "learning_rate": 1.3786020660907389e-05, + "loss": 2.6129, + "step": 30829 + }, + { + "epoch": 2.7931417181943785, + "grad_norm": 1.1559849977493286, + "learning_rate": 1.3779979459916631e-05, + "loss": 2.3257, + "step": 30830 + }, + { + "epoch": 2.793232316368825, + "grad_norm": 1.0650089979171753, + "learning_rate": 1.3773938258925876e-05, + "loss": 2.5174, + "step": 30831 + }, + { + "epoch": 2.793322914543272, + "grad_norm": 1.0495073795318604, + "learning_rate": 1.3767897057935116e-05, + "loss": 2.8605, + "step": 30832 + }, + { + "epoch": 2.7934135127177186, + "grad_norm": 1.0235462188720703, + "learning_rate": 1.376185585694436e-05, + "loss": 2.5089, + "step": 30833 + }, + { + "epoch": 2.7935041108921657, + "grad_norm": 0.9912600517272949, + "learning_rate": 1.3755814655953605e-05, + "loss": 2.373, + "step": 30834 + }, + { + "epoch": 2.7935947090666122, + "grad_norm": 0.9487712979316711, + "learning_rate": 1.3749773454962847e-05, + "loss": 2.0355, + "step": 30835 + }, + { + "epoch": 2.7936853072410592, + "grad_norm": 0.9768937826156616, + "learning_rate": 1.3743732253972092e-05, + "loss": 2.6115, + "step": 30836 + }, + { + "epoch": 2.793775905415506, + "grad_norm": 0.8699538111686707, + "learning_rate": 1.3737691052981332e-05, + "loss": 1.9969, + "step": 30837 + }, + { + "epoch": 2.793866503589953, + "grad_norm": 0.9843391180038452, + "learning_rate": 1.3731649851990577e-05, + "loss": 2.8398, + "step": 30838 + }, + { + "epoch": 2.7939571017643994, + "grad_norm": 1.0643575191497803, + "learning_rate": 1.372560865099982e-05, + "loss": 2.5908, + "step": 30839 + }, + { + "epoch": 2.7940476999388464, + "grad_norm": 0.9358488321304321, + "learning_rate": 1.3719567450009063e-05, + "loss": 2.31, + "step": 30840 + }, + { + "epoch": 2.794138298113293, + "grad_norm": 0.9561992287635803, + "learning_rate": 1.3713526249018304e-05, + "loss": 2.5748, + "step": 30841 + }, + { + "epoch": 2.79422889628774, + "grad_norm": 0.9791283011436462, + "learning_rate": 1.3707485048027548e-05, + "loss": 2.3847, + "step": 30842 + }, + { + "epoch": 2.7943194944621865, + "grad_norm": 0.9508767127990723, + "learning_rate": 1.3701443847036791e-05, + "loss": 2.4975, + "step": 30843 + }, + { + "epoch": 2.7944100926366335, + "grad_norm": 0.9445919990539551, + "learning_rate": 1.3695402646046035e-05, + "loss": 2.681, + "step": 30844 + }, + { + "epoch": 2.79450069081108, + "grad_norm": 1.0852806568145752, + "learning_rate": 1.3689361445055276e-05, + "loss": 2.5183, + "step": 30845 + }, + { + "epoch": 2.7945912889855267, + "grad_norm": 0.9966341853141785, + "learning_rate": 1.368332024406452e-05, + "loss": 2.5015, + "step": 30846 + }, + { + "epoch": 2.7946818871599737, + "grad_norm": 1.0416219234466553, + "learning_rate": 1.3677279043073763e-05, + "loss": 2.4836, + "step": 30847 + }, + { + "epoch": 2.7947724853344207, + "grad_norm": 1.0074286460876465, + "learning_rate": 1.3671237842083007e-05, + "loss": 2.501, + "step": 30848 + }, + { + "epoch": 2.7948630835088673, + "grad_norm": 1.0100338459014893, + "learning_rate": 1.3665196641092251e-05, + "loss": 2.5167, + "step": 30849 + }, + { + "epoch": 2.794953681683314, + "grad_norm": 1.0425992012023926, + "learning_rate": 1.3659155440101492e-05, + "loss": 3.2949, + "step": 30850 + }, + { + "epoch": 2.795044279857761, + "grad_norm": 1.0658174753189087, + "learning_rate": 1.3653114239110736e-05, + "loss": 1.8495, + "step": 30851 + }, + { + "epoch": 2.795134878032208, + "grad_norm": 0.9523332715034485, + "learning_rate": 1.3647073038119979e-05, + "loss": 2.4166, + "step": 30852 + }, + { + "epoch": 2.7952254762066544, + "grad_norm": 1.0471153259277344, + "learning_rate": 1.3641031837129223e-05, + "loss": 3.1397, + "step": 30853 + }, + { + "epoch": 2.795316074381101, + "grad_norm": 1.0476757287979126, + "learning_rate": 1.3634990636138464e-05, + "loss": 2.6334, + "step": 30854 + }, + { + "epoch": 2.795406672555548, + "grad_norm": 1.0392541885375977, + "learning_rate": 1.3628949435147708e-05, + "loss": 2.5763, + "step": 30855 + }, + { + "epoch": 2.795497270729995, + "grad_norm": 1.0488042831420898, + "learning_rate": 1.362290823415695e-05, + "loss": 2.5776, + "step": 30856 + }, + { + "epoch": 2.7955878689044416, + "grad_norm": 1.0073401927947998, + "learning_rate": 1.3616867033166195e-05, + "loss": 2.6399, + "step": 30857 + }, + { + "epoch": 2.795678467078888, + "grad_norm": 1.0016754865646362, + "learning_rate": 1.3610825832175436e-05, + "loss": 2.597, + "step": 30858 + }, + { + "epoch": 2.795769065253335, + "grad_norm": 0.9636979699134827, + "learning_rate": 1.360478463118468e-05, + "loss": 2.5808, + "step": 30859 + }, + { + "epoch": 2.795859663427782, + "grad_norm": 0.990929901599884, + "learning_rate": 1.3598743430193922e-05, + "loss": 2.5508, + "step": 30860 + }, + { + "epoch": 2.7959502616022287, + "grad_norm": 1.011874794960022, + "learning_rate": 1.3592702229203167e-05, + "loss": 2.7051, + "step": 30861 + }, + { + "epoch": 2.7960408597766753, + "grad_norm": 1.0504757165908813, + "learning_rate": 1.3586661028212407e-05, + "loss": 2.3989, + "step": 30862 + }, + { + "epoch": 2.7961314579511223, + "grad_norm": 1.0797321796417236, + "learning_rate": 1.3580619827221652e-05, + "loss": 2.7541, + "step": 30863 + }, + { + "epoch": 2.7962220561255693, + "grad_norm": 0.9220901131629944, + "learning_rate": 1.3574578626230896e-05, + "loss": 1.944, + "step": 30864 + }, + { + "epoch": 2.796312654300016, + "grad_norm": 1.026207685470581, + "learning_rate": 1.3568537425240138e-05, + "loss": 2.3441, + "step": 30865 + }, + { + "epoch": 2.7964032524744624, + "grad_norm": 0.934788167476654, + "learning_rate": 1.3562496224249383e-05, + "loss": 2.5213, + "step": 30866 + }, + { + "epoch": 2.7964938506489094, + "grad_norm": 0.9719401001930237, + "learning_rate": 1.3556455023258623e-05, + "loss": 2.533, + "step": 30867 + }, + { + "epoch": 2.7965844488233564, + "grad_norm": 1.0342577695846558, + "learning_rate": 1.3550413822267868e-05, + "loss": 2.3994, + "step": 30868 + }, + { + "epoch": 2.796675046997803, + "grad_norm": 0.9233365058898926, + "learning_rate": 1.354437262127711e-05, + "loss": 2.3571, + "step": 30869 + }, + { + "epoch": 2.7967656451722496, + "grad_norm": 1.0607413053512573, + "learning_rate": 1.3538331420286354e-05, + "loss": 2.8091, + "step": 30870 + }, + { + "epoch": 2.7968562433466966, + "grad_norm": 0.6846426129341125, + "learning_rate": 1.3532290219295595e-05, + "loss": 1.3899, + "step": 30871 + }, + { + "epoch": 2.7969468415211436, + "grad_norm": 1.2227392196655273, + "learning_rate": 1.352624901830484e-05, + "loss": 2.6641, + "step": 30872 + }, + { + "epoch": 2.79703743969559, + "grad_norm": 0.9948081970214844, + "learning_rate": 1.3520207817314082e-05, + "loss": 2.7884, + "step": 30873 + }, + { + "epoch": 2.7971280378700367, + "grad_norm": 0.9729716777801514, + "learning_rate": 1.3514166616323326e-05, + "loss": 2.408, + "step": 30874 + }, + { + "epoch": 2.7972186360444837, + "grad_norm": 0.96587735414505, + "learning_rate": 1.3508125415332567e-05, + "loss": 2.5886, + "step": 30875 + }, + { + "epoch": 2.7973092342189307, + "grad_norm": 1.0319747924804688, + "learning_rate": 1.3502084214341811e-05, + "loss": 2.6593, + "step": 30876 + }, + { + "epoch": 2.7973998323933773, + "grad_norm": 0.9359975457191467, + "learning_rate": 1.3496043013351054e-05, + "loss": 2.7759, + "step": 30877 + }, + { + "epoch": 2.797490430567824, + "grad_norm": 0.8322081565856934, + "learning_rate": 1.3490001812360298e-05, + "loss": 1.9881, + "step": 30878 + }, + { + "epoch": 2.797581028742271, + "grad_norm": 0.9826815724372864, + "learning_rate": 1.3483960611369542e-05, + "loss": 2.6052, + "step": 30879 + }, + { + "epoch": 2.797671626916718, + "grad_norm": 1.0341540575027466, + "learning_rate": 1.3477919410378783e-05, + "loss": 2.4224, + "step": 30880 + }, + { + "epoch": 2.7977622250911645, + "grad_norm": 1.0286705493927002, + "learning_rate": 1.3471878209388027e-05, + "loss": 2.5873, + "step": 30881 + }, + { + "epoch": 2.797852823265611, + "grad_norm": 1.046077847480774, + "learning_rate": 1.346583700839727e-05, + "loss": 2.624, + "step": 30882 + }, + { + "epoch": 2.797943421440058, + "grad_norm": 0.9892807006835938, + "learning_rate": 1.3459795807406514e-05, + "loss": 2.737, + "step": 30883 + }, + { + "epoch": 2.798034019614505, + "grad_norm": 0.9961308836936951, + "learning_rate": 1.3453754606415755e-05, + "loss": 2.4033, + "step": 30884 + }, + { + "epoch": 2.7981246177889516, + "grad_norm": 1.0833815336227417, + "learning_rate": 1.3447713405424999e-05, + "loss": 2.6146, + "step": 30885 + }, + { + "epoch": 2.798215215963398, + "grad_norm": 1.0162289142608643, + "learning_rate": 1.3441672204434242e-05, + "loss": 2.7416, + "step": 30886 + }, + { + "epoch": 2.798305814137845, + "grad_norm": 1.0774177312850952, + "learning_rate": 1.3435631003443486e-05, + "loss": 2.4279, + "step": 30887 + }, + { + "epoch": 2.7983964123122917, + "grad_norm": 0.9665694832801819, + "learning_rate": 1.3429589802452727e-05, + "loss": 2.5734, + "step": 30888 + }, + { + "epoch": 2.7984870104867388, + "grad_norm": 1.073042392730713, + "learning_rate": 1.342354860146197e-05, + "loss": 2.4249, + "step": 30889 + }, + { + "epoch": 2.7985776086611853, + "grad_norm": 0.923038899898529, + "learning_rate": 1.3417507400471213e-05, + "loss": 1.8825, + "step": 30890 + }, + { + "epoch": 2.7986682068356323, + "grad_norm": 1.1308550834655762, + "learning_rate": 1.3411466199480458e-05, + "loss": 2.7323, + "step": 30891 + }, + { + "epoch": 2.798758805010079, + "grad_norm": 0.8979499936103821, + "learning_rate": 1.3405424998489698e-05, + "loss": 1.914, + "step": 30892 + }, + { + "epoch": 2.798849403184526, + "grad_norm": 1.0387803316116333, + "learning_rate": 1.3399383797498943e-05, + "loss": 2.8445, + "step": 30893 + }, + { + "epoch": 2.7989400013589725, + "grad_norm": 1.0120924711227417, + "learning_rate": 1.3393342596508189e-05, + "loss": 2.6452, + "step": 30894 + }, + { + "epoch": 2.7990305995334195, + "grad_norm": 1.0954551696777344, + "learning_rate": 1.338730139551743e-05, + "loss": 2.6703, + "step": 30895 + }, + { + "epoch": 2.799121197707866, + "grad_norm": 1.0239700078964233, + "learning_rate": 1.3381260194526674e-05, + "loss": 2.5202, + "step": 30896 + }, + { + "epoch": 2.799211795882313, + "grad_norm": 1.0569721460342407, + "learning_rate": 1.3375218993535914e-05, + "loss": 2.5769, + "step": 30897 + }, + { + "epoch": 2.7993023940567596, + "grad_norm": 1.0462419986724854, + "learning_rate": 1.3369177792545159e-05, + "loss": 2.8779, + "step": 30898 + }, + { + "epoch": 2.7993929922312066, + "grad_norm": 0.978541374206543, + "learning_rate": 1.3363136591554401e-05, + "loss": 2.4844, + "step": 30899 + }, + { + "epoch": 2.799483590405653, + "grad_norm": 1.0099364519119263, + "learning_rate": 1.3357095390563645e-05, + "loss": 2.2974, + "step": 30900 + }, + { + "epoch": 2.7995741885801, + "grad_norm": 0.9194703698158264, + "learning_rate": 1.3351054189572886e-05, + "loss": 2.5242, + "step": 30901 + }, + { + "epoch": 2.7996647867545468, + "grad_norm": 0.9442007541656494, + "learning_rate": 1.334501298858213e-05, + "loss": 2.7536, + "step": 30902 + }, + { + "epoch": 2.799755384928994, + "grad_norm": 1.0275778770446777, + "learning_rate": 1.3338971787591373e-05, + "loss": 2.8996, + "step": 30903 + }, + { + "epoch": 2.7998459831034403, + "grad_norm": 0.8933607935905457, + "learning_rate": 1.3332930586600617e-05, + "loss": 1.9693, + "step": 30904 + }, + { + "epoch": 2.7999365812778874, + "grad_norm": 0.9894649386405945, + "learning_rate": 1.3326889385609858e-05, + "loss": 2.6366, + "step": 30905 + }, + { + "epoch": 2.800027179452334, + "grad_norm": 0.9850828051567078, + "learning_rate": 1.3320848184619102e-05, + "loss": 2.6836, + "step": 30906 + }, + { + "epoch": 2.800117777626781, + "grad_norm": 1.0222136974334717, + "learning_rate": 1.3314806983628348e-05, + "loss": 2.5964, + "step": 30907 + }, + { + "epoch": 2.8002083758012275, + "grad_norm": 0.9246212840080261, + "learning_rate": 1.3308765782637589e-05, + "loss": 1.9284, + "step": 30908 + }, + { + "epoch": 2.8002989739756745, + "grad_norm": 1.0400147438049316, + "learning_rate": 1.3302724581646833e-05, + "loss": 2.7469, + "step": 30909 + }, + { + "epoch": 2.800389572150121, + "grad_norm": 0.9097446799278259, + "learning_rate": 1.3296683380656074e-05, + "loss": 2.2104, + "step": 30910 + }, + { + "epoch": 2.800480170324568, + "grad_norm": 1.082200288772583, + "learning_rate": 1.329064217966532e-05, + "loss": 2.4716, + "step": 30911 + }, + { + "epoch": 2.8005707684990147, + "grad_norm": 0.8477636575698853, + "learning_rate": 1.328460097867456e-05, + "loss": 2.1722, + "step": 30912 + }, + { + "epoch": 2.8006613666734617, + "grad_norm": 0.9544013738632202, + "learning_rate": 1.3278559777683805e-05, + "loss": 2.3009, + "step": 30913 + }, + { + "epoch": 2.8007519648479082, + "grad_norm": 1.0354926586151123, + "learning_rate": 1.3272518576693046e-05, + "loss": 2.512, + "step": 30914 + }, + { + "epoch": 2.8008425630223552, + "grad_norm": 0.9981171488761902, + "learning_rate": 1.3266477375702292e-05, + "loss": 2.5227, + "step": 30915 + }, + { + "epoch": 2.800933161196802, + "grad_norm": 0.9632083177566528, + "learning_rate": 1.3260436174711533e-05, + "loss": 2.4768, + "step": 30916 + }, + { + "epoch": 2.801023759371249, + "grad_norm": 0.9816056489944458, + "learning_rate": 1.3254394973720777e-05, + "loss": 2.6397, + "step": 30917 + }, + { + "epoch": 2.8011143575456954, + "grad_norm": 0.9585338830947876, + "learning_rate": 1.3248353772730018e-05, + "loss": 2.7085, + "step": 30918 + }, + { + "epoch": 2.8012049557201424, + "grad_norm": 0.9769014716148376, + "learning_rate": 1.3242312571739263e-05, + "loss": 2.9212, + "step": 30919 + }, + { + "epoch": 2.801295553894589, + "grad_norm": 0.8036783933639526, + "learning_rate": 1.3236271370748504e-05, + "loss": 1.936, + "step": 30920 + }, + { + "epoch": 2.801386152069036, + "grad_norm": 1.0324816703796387, + "learning_rate": 1.3230230169757749e-05, + "loss": 2.4411, + "step": 30921 + }, + { + "epoch": 2.8014767502434825, + "grad_norm": 0.9399741291999817, + "learning_rate": 1.3224188968766993e-05, + "loss": 2.5372, + "step": 30922 + }, + { + "epoch": 2.8015673484179295, + "grad_norm": 0.9355689287185669, + "learning_rate": 1.3218147767776235e-05, + "loss": 2.7201, + "step": 30923 + }, + { + "epoch": 2.801657946592376, + "grad_norm": 1.0372450351715088, + "learning_rate": 1.321210656678548e-05, + "loss": 2.8421, + "step": 30924 + }, + { + "epoch": 2.801748544766823, + "grad_norm": 1.0779399871826172, + "learning_rate": 1.320606536579472e-05, + "loss": 2.4535, + "step": 30925 + }, + { + "epoch": 2.8018391429412697, + "grad_norm": 1.0157132148742676, + "learning_rate": 1.3200024164803965e-05, + "loss": 2.8666, + "step": 30926 + }, + { + "epoch": 2.8019297411157167, + "grad_norm": 1.0103150606155396, + "learning_rate": 1.3193982963813205e-05, + "loss": 2.4068, + "step": 30927 + }, + { + "epoch": 2.8020203392901633, + "grad_norm": 1.1316423416137695, + "learning_rate": 1.3187941762822451e-05, + "loss": 2.4498, + "step": 30928 + }, + { + "epoch": 2.80211093746461, + "grad_norm": 0.9424120187759399, + "learning_rate": 1.3181900561831692e-05, + "loss": 2.6424, + "step": 30929 + }, + { + "epoch": 2.802201535639057, + "grad_norm": 1.0173654556274414, + "learning_rate": 1.3175859360840936e-05, + "loss": 2.6127, + "step": 30930 + }, + { + "epoch": 2.802292133813504, + "grad_norm": 1.0090124607086182, + "learning_rate": 1.3169818159850177e-05, + "loss": 2.6419, + "step": 30931 + }, + { + "epoch": 2.8023827319879504, + "grad_norm": 0.972963809967041, + "learning_rate": 1.3163776958859423e-05, + "loss": 2.8896, + "step": 30932 + }, + { + "epoch": 2.802473330162397, + "grad_norm": 1.0060875415802002, + "learning_rate": 1.3157735757868664e-05, + "loss": 2.7972, + "step": 30933 + }, + { + "epoch": 2.802563928336844, + "grad_norm": 1.1517040729522705, + "learning_rate": 1.3151694556877908e-05, + "loss": 2.6252, + "step": 30934 + }, + { + "epoch": 2.802654526511291, + "grad_norm": 0.9533393979072571, + "learning_rate": 1.3145653355887149e-05, + "loss": 2.524, + "step": 30935 + }, + { + "epoch": 2.8027451246857376, + "grad_norm": 0.9189777970314026, + "learning_rate": 1.3139612154896395e-05, + "loss": 2.2405, + "step": 30936 + }, + { + "epoch": 2.802835722860184, + "grad_norm": 1.1453222036361694, + "learning_rate": 1.3133570953905639e-05, + "loss": 2.4925, + "step": 30937 + }, + { + "epoch": 2.802926321034631, + "grad_norm": 0.9710053205490112, + "learning_rate": 1.312752975291488e-05, + "loss": 2.4788, + "step": 30938 + }, + { + "epoch": 2.803016919209078, + "grad_norm": 0.9608950018882751, + "learning_rate": 1.3121488551924124e-05, + "loss": 2.3331, + "step": 30939 + }, + { + "epoch": 2.8031075173835247, + "grad_norm": 0.999678909778595, + "learning_rate": 1.3115447350933367e-05, + "loss": 2.6281, + "step": 30940 + }, + { + "epoch": 2.8031981155579713, + "grad_norm": 1.007368564605713, + "learning_rate": 1.310940614994261e-05, + "loss": 2.8839, + "step": 30941 + }, + { + "epoch": 2.8032887137324183, + "grad_norm": 1.0308681726455688, + "learning_rate": 1.3103364948951852e-05, + "loss": 2.5598, + "step": 30942 + }, + { + "epoch": 2.8033793119068653, + "grad_norm": 1.062346339225769, + "learning_rate": 1.3097323747961096e-05, + "loss": 2.561, + "step": 30943 + }, + { + "epoch": 2.803469910081312, + "grad_norm": 1.030281901359558, + "learning_rate": 1.3091282546970338e-05, + "loss": 2.7884, + "step": 30944 + }, + { + "epoch": 2.8035605082557584, + "grad_norm": 1.0127462148666382, + "learning_rate": 1.3085241345979583e-05, + "loss": 2.5519, + "step": 30945 + }, + { + "epoch": 2.8036511064302054, + "grad_norm": 1.0937045812606812, + "learning_rate": 1.3079200144988823e-05, + "loss": 2.5444, + "step": 30946 + }, + { + "epoch": 2.8037417046046524, + "grad_norm": 1.0554345846176147, + "learning_rate": 1.3073158943998068e-05, + "loss": 2.6201, + "step": 30947 + }, + { + "epoch": 2.803832302779099, + "grad_norm": 1.0411651134490967, + "learning_rate": 1.306711774300731e-05, + "loss": 2.6166, + "step": 30948 + }, + { + "epoch": 2.8039229009535456, + "grad_norm": 1.004152536392212, + "learning_rate": 1.3061076542016554e-05, + "loss": 2.6675, + "step": 30949 + }, + { + "epoch": 2.8040134991279926, + "grad_norm": 0.9982783198356628, + "learning_rate": 1.3055035341025795e-05, + "loss": 2.6854, + "step": 30950 + }, + { + "epoch": 2.8041040973024396, + "grad_norm": 1.022516131401062, + "learning_rate": 1.304899414003504e-05, + "loss": 2.6951, + "step": 30951 + }, + { + "epoch": 2.804194695476886, + "grad_norm": 0.9675251245498657, + "learning_rate": 1.3042952939044284e-05, + "loss": 2.5894, + "step": 30952 + }, + { + "epoch": 2.8042852936513327, + "grad_norm": 1.059749722480774, + "learning_rate": 1.3036911738053526e-05, + "loss": 2.4109, + "step": 30953 + }, + { + "epoch": 2.8043758918257797, + "grad_norm": 0.9833964109420776, + "learning_rate": 1.303087053706277e-05, + "loss": 2.6214, + "step": 30954 + }, + { + "epoch": 2.8044664900002267, + "grad_norm": 1.043701171875, + "learning_rate": 1.3024829336072011e-05, + "loss": 2.4668, + "step": 30955 + }, + { + "epoch": 2.8045570881746733, + "grad_norm": 0.9492821097373962, + "learning_rate": 1.3018788135081255e-05, + "loss": 2.658, + "step": 30956 + }, + { + "epoch": 2.80464768634912, + "grad_norm": 0.9831705689430237, + "learning_rate": 1.3012746934090498e-05, + "loss": 2.6805, + "step": 30957 + }, + { + "epoch": 2.804738284523567, + "grad_norm": 0.9759055376052856, + "learning_rate": 1.3006705733099742e-05, + "loss": 2.6295, + "step": 30958 + }, + { + "epoch": 2.804828882698014, + "grad_norm": 1.0041722059249878, + "learning_rate": 1.3000664532108983e-05, + "loss": 2.5412, + "step": 30959 + }, + { + "epoch": 2.8049194808724605, + "grad_norm": 1.0500233173370361, + "learning_rate": 1.2994623331118227e-05, + "loss": 2.6025, + "step": 30960 + }, + { + "epoch": 2.805010079046907, + "grad_norm": 1.0398192405700684, + "learning_rate": 1.298858213012747e-05, + "loss": 2.6404, + "step": 30961 + }, + { + "epoch": 2.805100677221354, + "grad_norm": 0.8622657656669617, + "learning_rate": 1.2982540929136714e-05, + "loss": 2.211, + "step": 30962 + }, + { + "epoch": 2.805191275395801, + "grad_norm": 0.9313775300979614, + "learning_rate": 1.2976499728145955e-05, + "loss": 2.6653, + "step": 30963 + }, + { + "epoch": 2.8052818735702476, + "grad_norm": 1.0211174488067627, + "learning_rate": 1.2970458527155199e-05, + "loss": 2.6472, + "step": 30964 + }, + { + "epoch": 2.805372471744694, + "grad_norm": 0.974494218826294, + "learning_rate": 1.2964417326164442e-05, + "loss": 2.2964, + "step": 30965 + }, + { + "epoch": 2.805463069919141, + "grad_norm": 1.0375734567642212, + "learning_rate": 1.2958376125173686e-05, + "loss": 2.674, + "step": 30966 + }, + { + "epoch": 2.8055536680935877, + "grad_norm": 1.1140424013137817, + "learning_rate": 1.295233492418293e-05, + "loss": 2.4678, + "step": 30967 + }, + { + "epoch": 2.8056442662680348, + "grad_norm": 0.9693487882614136, + "learning_rate": 1.294629372319217e-05, + "loss": 2.7078, + "step": 30968 + }, + { + "epoch": 2.8057348644424813, + "grad_norm": 0.890444278717041, + "learning_rate": 1.2940252522201415e-05, + "loss": 2.0213, + "step": 30969 + }, + { + "epoch": 2.8058254626169283, + "grad_norm": 0.9800562858581543, + "learning_rate": 1.2934211321210658e-05, + "loss": 2.7259, + "step": 30970 + }, + { + "epoch": 2.805916060791375, + "grad_norm": 0.990399956703186, + "learning_rate": 1.2928170120219902e-05, + "loss": 2.8441, + "step": 30971 + }, + { + "epoch": 2.806006658965822, + "grad_norm": 1.2511135339736938, + "learning_rate": 1.2922128919229143e-05, + "loss": 2.5391, + "step": 30972 + }, + { + "epoch": 2.8060972571402685, + "grad_norm": 1.1019967794418335, + "learning_rate": 1.2916087718238387e-05, + "loss": 2.7204, + "step": 30973 + }, + { + "epoch": 2.8061878553147155, + "grad_norm": 1.0182892084121704, + "learning_rate": 1.291004651724763e-05, + "loss": 2.7318, + "step": 30974 + }, + { + "epoch": 2.806278453489162, + "grad_norm": 0.9858052730560303, + "learning_rate": 1.2904005316256874e-05, + "loss": 2.3507, + "step": 30975 + }, + { + "epoch": 2.806369051663609, + "grad_norm": 1.087180733680725, + "learning_rate": 1.2897964115266114e-05, + "loss": 2.5441, + "step": 30976 + }, + { + "epoch": 2.8064596498380556, + "grad_norm": 1.0514490604400635, + "learning_rate": 1.2891922914275359e-05, + "loss": 2.8434, + "step": 30977 + }, + { + "epoch": 2.8065502480125026, + "grad_norm": 1.089377999305725, + "learning_rate": 1.2885881713284601e-05, + "loss": 2.6934, + "step": 30978 + }, + { + "epoch": 2.806640846186949, + "grad_norm": 1.004798412322998, + "learning_rate": 1.2879840512293845e-05, + "loss": 2.4322, + "step": 30979 + }, + { + "epoch": 2.806731444361396, + "grad_norm": 0.8664743900299072, + "learning_rate": 1.2873799311303086e-05, + "loss": 2.1369, + "step": 30980 + }, + { + "epoch": 2.8068220425358428, + "grad_norm": 0.977684736251831, + "learning_rate": 1.286775811031233e-05, + "loss": 2.6426, + "step": 30981 + }, + { + "epoch": 2.80691264071029, + "grad_norm": 1.0226234197616577, + "learning_rate": 1.2861716909321575e-05, + "loss": 2.7028, + "step": 30982 + }, + { + "epoch": 2.8070032388847364, + "grad_norm": 0.8690728545188904, + "learning_rate": 1.2855675708330817e-05, + "loss": 2.1663, + "step": 30983 + }, + { + "epoch": 2.8070938370591834, + "grad_norm": 0.925317108631134, + "learning_rate": 1.2849634507340061e-05, + "loss": 2.5567, + "step": 30984 + }, + { + "epoch": 2.80718443523363, + "grad_norm": 1.0000420808792114, + "learning_rate": 1.2843593306349302e-05, + "loss": 2.6687, + "step": 30985 + }, + { + "epoch": 2.807275033408077, + "grad_norm": 1.1132333278656006, + "learning_rate": 1.2837552105358546e-05, + "loss": 2.5995, + "step": 30986 + }, + { + "epoch": 2.8073656315825235, + "grad_norm": 1.1104096174240112, + "learning_rate": 1.2831510904367789e-05, + "loss": 2.5153, + "step": 30987 + }, + { + "epoch": 2.8074562297569705, + "grad_norm": 0.894845724105835, + "learning_rate": 1.2825469703377033e-05, + "loss": 2.1009, + "step": 30988 + }, + { + "epoch": 2.807546827931417, + "grad_norm": 1.1398710012435913, + "learning_rate": 1.2819428502386274e-05, + "loss": 2.5705, + "step": 30989 + }, + { + "epoch": 2.807637426105864, + "grad_norm": 1.0551451444625854, + "learning_rate": 1.2813387301395518e-05, + "loss": 2.5378, + "step": 30990 + }, + { + "epoch": 2.8077280242803107, + "grad_norm": 1.1018009185791016, + "learning_rate": 1.280734610040476e-05, + "loss": 2.7802, + "step": 30991 + }, + { + "epoch": 2.8078186224547577, + "grad_norm": 1.0153237581253052, + "learning_rate": 1.2801304899414005e-05, + "loss": 2.867, + "step": 30992 + }, + { + "epoch": 2.8079092206292042, + "grad_norm": 1.057140827178955, + "learning_rate": 1.2795263698423246e-05, + "loss": 2.9905, + "step": 30993 + }, + { + "epoch": 2.8079998188036512, + "grad_norm": 1.0290697813034058, + "learning_rate": 1.278922249743249e-05, + "loss": 2.6465, + "step": 30994 + }, + { + "epoch": 2.808090416978098, + "grad_norm": 0.9905620217323303, + "learning_rate": 1.2783181296441733e-05, + "loss": 2.5054, + "step": 30995 + }, + { + "epoch": 2.808181015152545, + "grad_norm": 1.014592170715332, + "learning_rate": 1.2777140095450977e-05, + "loss": 2.6576, + "step": 30996 + }, + { + "epoch": 2.8082716133269914, + "grad_norm": 0.9613216519355774, + "learning_rate": 1.2771098894460221e-05, + "loss": 2.8802, + "step": 30997 + }, + { + "epoch": 2.8083622115014384, + "grad_norm": 0.9921423196792603, + "learning_rate": 1.2765057693469462e-05, + "loss": 2.7429, + "step": 30998 + }, + { + "epoch": 2.808452809675885, + "grad_norm": 0.9974862933158875, + "learning_rate": 1.2759016492478706e-05, + "loss": 2.4338, + "step": 30999 + }, + { + "epoch": 2.808543407850332, + "grad_norm": 1.0727850198745728, + "learning_rate": 1.2752975291487949e-05, + "loss": 2.5404, + "step": 31000 + }, + { + "epoch": 2.8086340060247785, + "grad_norm": 1.127467393875122, + "learning_rate": 1.2746934090497193e-05, + "loss": 2.5117, + "step": 31001 + }, + { + "epoch": 2.8087246041992255, + "grad_norm": 0.9818145036697388, + "learning_rate": 1.2740892889506434e-05, + "loss": 2.553, + "step": 31002 + }, + { + "epoch": 2.808815202373672, + "grad_norm": 1.060436487197876, + "learning_rate": 1.2734851688515678e-05, + "loss": 2.6309, + "step": 31003 + }, + { + "epoch": 2.808905800548119, + "grad_norm": 0.9540565609931946, + "learning_rate": 1.272881048752492e-05, + "loss": 2.439, + "step": 31004 + }, + { + "epoch": 2.8089963987225657, + "grad_norm": 0.9510678648948669, + "learning_rate": 1.2722769286534165e-05, + "loss": 2.6716, + "step": 31005 + }, + { + "epoch": 2.8090869968970127, + "grad_norm": 1.0284165143966675, + "learning_rate": 1.2716728085543405e-05, + "loss": 2.7319, + "step": 31006 + }, + { + "epoch": 2.8091775950714593, + "grad_norm": 0.9498370289802551, + "learning_rate": 1.271068688455265e-05, + "loss": 2.7334, + "step": 31007 + }, + { + "epoch": 2.809268193245906, + "grad_norm": 0.948878824710846, + "learning_rate": 1.2704645683561892e-05, + "loss": 2.321, + "step": 31008 + }, + { + "epoch": 2.809358791420353, + "grad_norm": 1.0227365493774414, + "learning_rate": 1.2698604482571136e-05, + "loss": 2.6291, + "step": 31009 + }, + { + "epoch": 2.8094493895948, + "grad_norm": 1.0139224529266357, + "learning_rate": 1.2692563281580377e-05, + "loss": 2.7915, + "step": 31010 + }, + { + "epoch": 2.8095399877692464, + "grad_norm": 0.9993240833282471, + "learning_rate": 1.2686522080589621e-05, + "loss": 2.4592, + "step": 31011 + }, + { + "epoch": 2.809630585943693, + "grad_norm": 1.1233280897140503, + "learning_rate": 1.2680480879598866e-05, + "loss": 2.5757, + "step": 31012 + }, + { + "epoch": 2.80972118411814, + "grad_norm": 0.9317694306373596, + "learning_rate": 1.2674439678608108e-05, + "loss": 1.9599, + "step": 31013 + }, + { + "epoch": 2.809811782292587, + "grad_norm": 0.9802522659301758, + "learning_rate": 1.2668398477617352e-05, + "loss": 2.7587, + "step": 31014 + }, + { + "epoch": 2.8099023804670336, + "grad_norm": 0.9076182842254639, + "learning_rate": 1.2662357276626593e-05, + "loss": 2.058, + "step": 31015 + }, + { + "epoch": 2.80999297864148, + "grad_norm": 1.051648497581482, + "learning_rate": 1.2656316075635837e-05, + "loss": 2.198, + "step": 31016 + }, + { + "epoch": 2.810083576815927, + "grad_norm": 1.0309232473373413, + "learning_rate": 1.265027487464508e-05, + "loss": 2.7225, + "step": 31017 + }, + { + "epoch": 2.810174174990374, + "grad_norm": 0.909176230430603, + "learning_rate": 1.2644233673654324e-05, + "loss": 2.8489, + "step": 31018 + }, + { + "epoch": 2.8102647731648207, + "grad_norm": 1.0136620998382568, + "learning_rate": 1.2638192472663565e-05, + "loss": 2.3282, + "step": 31019 + }, + { + "epoch": 2.8103553713392673, + "grad_norm": 0.9508874416351318, + "learning_rate": 1.263215127167281e-05, + "loss": 2.3496, + "step": 31020 + }, + { + "epoch": 2.8104459695137143, + "grad_norm": 0.972440779209137, + "learning_rate": 1.2626110070682052e-05, + "loss": 2.7394, + "step": 31021 + }, + { + "epoch": 2.8105365676881613, + "grad_norm": 1.0589380264282227, + "learning_rate": 1.2620068869691296e-05, + "loss": 2.7036, + "step": 31022 + }, + { + "epoch": 2.810627165862608, + "grad_norm": 0.9477066993713379, + "learning_rate": 1.2614027668700537e-05, + "loss": 2.4027, + "step": 31023 + }, + { + "epoch": 2.8107177640370544, + "grad_norm": 1.0205943584442139, + "learning_rate": 1.2607986467709781e-05, + "loss": 2.9218, + "step": 31024 + }, + { + "epoch": 2.8108083622115014, + "grad_norm": 0.9352644681930542, + "learning_rate": 1.2601945266719023e-05, + "loss": 2.515, + "step": 31025 + }, + { + "epoch": 2.8108989603859484, + "grad_norm": 1.0316073894500732, + "learning_rate": 1.2595904065728268e-05, + "loss": 2.7429, + "step": 31026 + }, + { + "epoch": 2.810989558560395, + "grad_norm": 1.0170012712478638, + "learning_rate": 1.2589862864737512e-05, + "loss": 2.7248, + "step": 31027 + }, + { + "epoch": 2.8110801567348416, + "grad_norm": 1.0977245569229126, + "learning_rate": 1.2583821663746753e-05, + "loss": 2.3581, + "step": 31028 + }, + { + "epoch": 2.8111707549092886, + "grad_norm": 1.0574005842208862, + "learning_rate": 1.2577780462755997e-05, + "loss": 3.0538, + "step": 31029 + }, + { + "epoch": 2.8112613530837356, + "grad_norm": 1.0356312990188599, + "learning_rate": 1.257173926176524e-05, + "loss": 2.5106, + "step": 31030 + }, + { + "epoch": 2.811351951258182, + "grad_norm": 1.059526801109314, + "learning_rate": 1.2565698060774484e-05, + "loss": 2.8802, + "step": 31031 + }, + { + "epoch": 2.8114425494326287, + "grad_norm": 1.0249099731445312, + "learning_rate": 1.2559656859783725e-05, + "loss": 2.6427, + "step": 31032 + }, + { + "epoch": 2.8115331476070757, + "grad_norm": 1.0308154821395874, + "learning_rate": 1.2553615658792969e-05, + "loss": 2.643, + "step": 31033 + }, + { + "epoch": 2.8116237457815227, + "grad_norm": 1.090084433555603, + "learning_rate": 1.2547574457802211e-05, + "loss": 2.6517, + "step": 31034 + }, + { + "epoch": 2.8117143439559693, + "grad_norm": 0.9753077626228333, + "learning_rate": 1.2541533256811455e-05, + "loss": 2.5447, + "step": 31035 + }, + { + "epoch": 2.811804942130416, + "grad_norm": 0.8519963026046753, + "learning_rate": 1.2535492055820696e-05, + "loss": 2.0258, + "step": 31036 + }, + { + "epoch": 2.811895540304863, + "grad_norm": 0.9894335269927979, + "learning_rate": 1.252945085482994e-05, + "loss": 2.5805, + "step": 31037 + }, + { + "epoch": 2.81198613847931, + "grad_norm": 0.988349437713623, + "learning_rate": 1.2523409653839183e-05, + "loss": 2.7518, + "step": 31038 + }, + { + "epoch": 2.8120767366537565, + "grad_norm": 0.9408320188522339, + "learning_rate": 1.2517368452848427e-05, + "loss": 1.6094, + "step": 31039 + }, + { + "epoch": 2.812167334828203, + "grad_norm": 1.2415804862976074, + "learning_rate": 1.2511327251857668e-05, + "loss": 2.566, + "step": 31040 + }, + { + "epoch": 2.81225793300265, + "grad_norm": 0.9808216691017151, + "learning_rate": 1.2505286050866912e-05, + "loss": 2.6308, + "step": 31041 + }, + { + "epoch": 2.812348531177097, + "grad_norm": 1.0342285633087158, + "learning_rate": 1.2499244849876157e-05, + "loss": 2.5202, + "step": 31042 + }, + { + "epoch": 2.8124391293515436, + "grad_norm": 1.018951416015625, + "learning_rate": 1.2493203648885399e-05, + "loss": 2.4671, + "step": 31043 + }, + { + "epoch": 2.81252972752599, + "grad_norm": 1.0641075372695923, + "learning_rate": 1.2487162447894642e-05, + "loss": 2.6175, + "step": 31044 + }, + { + "epoch": 2.812620325700437, + "grad_norm": 1.0525691509246826, + "learning_rate": 1.2481121246903884e-05, + "loss": 2.7764, + "step": 31045 + }, + { + "epoch": 2.812710923874884, + "grad_norm": 0.9756262898445129, + "learning_rate": 1.2475080045913128e-05, + "loss": 2.803, + "step": 31046 + }, + { + "epoch": 2.8128015220493308, + "grad_norm": 1.08327317237854, + "learning_rate": 1.2469038844922371e-05, + "loss": 2.5255, + "step": 31047 + }, + { + "epoch": 2.8128921202237773, + "grad_norm": 1.0058412551879883, + "learning_rate": 1.2462997643931615e-05, + "loss": 2.7499, + "step": 31048 + }, + { + "epoch": 2.8129827183982243, + "grad_norm": 1.0316951274871826, + "learning_rate": 1.2456956442940858e-05, + "loss": 2.7771, + "step": 31049 + }, + { + "epoch": 2.813073316572671, + "grad_norm": 1.0986905097961426, + "learning_rate": 1.24509152419501e-05, + "loss": 2.4554, + "step": 31050 + }, + { + "epoch": 2.813163914747118, + "grad_norm": 1.0138148069381714, + "learning_rate": 1.2444874040959344e-05, + "loss": 2.7482, + "step": 31051 + }, + { + "epoch": 2.8132545129215645, + "grad_norm": 0.9412965178489685, + "learning_rate": 1.2438832839968587e-05, + "loss": 2.293, + "step": 31052 + }, + { + "epoch": 2.8133451110960115, + "grad_norm": 1.0161534547805786, + "learning_rate": 1.243279163897783e-05, + "loss": 2.6317, + "step": 31053 + }, + { + "epoch": 2.813435709270458, + "grad_norm": 1.023476243019104, + "learning_rate": 1.2426750437987072e-05, + "loss": 2.5661, + "step": 31054 + }, + { + "epoch": 2.813526307444905, + "grad_norm": 1.056096076965332, + "learning_rate": 1.2420709236996316e-05, + "loss": 2.5166, + "step": 31055 + }, + { + "epoch": 2.8136169056193516, + "grad_norm": 1.0113203525543213, + "learning_rate": 1.2414668036005559e-05, + "loss": 2.4305, + "step": 31056 + }, + { + "epoch": 2.8137075037937986, + "grad_norm": 0.9837552905082703, + "learning_rate": 1.2408626835014801e-05, + "loss": 2.7975, + "step": 31057 + }, + { + "epoch": 2.813798101968245, + "grad_norm": 0.9378756284713745, + "learning_rate": 1.2402585634024044e-05, + "loss": 2.5385, + "step": 31058 + }, + { + "epoch": 2.813888700142692, + "grad_norm": 1.041793704032898, + "learning_rate": 1.2396544433033288e-05, + "loss": 2.8602, + "step": 31059 + }, + { + "epoch": 2.8139792983171388, + "grad_norm": 1.0285943746566772, + "learning_rate": 1.239050323204253e-05, + "loss": 2.8271, + "step": 31060 + }, + { + "epoch": 2.814069896491586, + "grad_norm": 1.144140601158142, + "learning_rate": 1.2384462031051773e-05, + "loss": 2.5118, + "step": 31061 + }, + { + "epoch": 2.8141604946660324, + "grad_norm": 0.9538089036941528, + "learning_rate": 1.2378420830061015e-05, + "loss": 2.3063, + "step": 31062 + }, + { + "epoch": 2.8142510928404794, + "grad_norm": 0.9993051886558533, + "learning_rate": 1.237237962907026e-05, + "loss": 2.6653, + "step": 31063 + }, + { + "epoch": 2.814341691014926, + "grad_norm": 0.9806793332099915, + "learning_rate": 1.2366338428079504e-05, + "loss": 2.5561, + "step": 31064 + }, + { + "epoch": 2.814432289189373, + "grad_norm": 1.0779821872711182, + "learning_rate": 1.2360297227088746e-05, + "loss": 2.6841, + "step": 31065 + }, + { + "epoch": 2.8145228873638195, + "grad_norm": 1.1726455688476562, + "learning_rate": 1.2354256026097989e-05, + "loss": 2.586, + "step": 31066 + }, + { + "epoch": 2.8146134855382665, + "grad_norm": 0.9700382351875305, + "learning_rate": 1.2348214825107231e-05, + "loss": 2.5846, + "step": 31067 + }, + { + "epoch": 2.814704083712713, + "grad_norm": 0.9081251621246338, + "learning_rate": 1.2342173624116476e-05, + "loss": 2.033, + "step": 31068 + }, + { + "epoch": 2.81479468188716, + "grad_norm": 1.0341994762420654, + "learning_rate": 1.2336132423125718e-05, + "loss": 2.6695, + "step": 31069 + }, + { + "epoch": 2.8148852800616067, + "grad_norm": 0.9589876532554626, + "learning_rate": 1.233009122213496e-05, + "loss": 2.5549, + "step": 31070 + }, + { + "epoch": 2.8149758782360537, + "grad_norm": 1.0548561811447144, + "learning_rate": 1.2324050021144203e-05, + "loss": 2.5463, + "step": 31071 + }, + { + "epoch": 2.8150664764105002, + "grad_norm": 0.9497911334037781, + "learning_rate": 1.2318008820153448e-05, + "loss": 1.74, + "step": 31072 + }, + { + "epoch": 2.8151570745849472, + "grad_norm": 1.0118929147720337, + "learning_rate": 1.231196761916269e-05, + "loss": 2.5075, + "step": 31073 + }, + { + "epoch": 2.815247672759394, + "grad_norm": 0.994458019733429, + "learning_rate": 1.2305926418171933e-05, + "loss": 2.768, + "step": 31074 + }, + { + "epoch": 2.815338270933841, + "grad_norm": 1.0023880004882812, + "learning_rate": 1.2299885217181175e-05, + "loss": 2.601, + "step": 31075 + }, + { + "epoch": 2.8154288691082874, + "grad_norm": 0.9117699265480042, + "learning_rate": 1.229384401619042e-05, + "loss": 2.4345, + "step": 31076 + }, + { + "epoch": 2.8155194672827344, + "grad_norm": 1.0274055004119873, + "learning_rate": 1.2287802815199662e-05, + "loss": 2.4084, + "step": 31077 + }, + { + "epoch": 2.815610065457181, + "grad_norm": 1.0265668630599976, + "learning_rate": 1.2281761614208906e-05, + "loss": 2.6002, + "step": 31078 + }, + { + "epoch": 2.815700663631628, + "grad_norm": 0.9542746543884277, + "learning_rate": 1.2275720413218149e-05, + "loss": 2.6179, + "step": 31079 + }, + { + "epoch": 2.8157912618060745, + "grad_norm": 1.0473061800003052, + "learning_rate": 1.2269679212227391e-05, + "loss": 2.8251, + "step": 31080 + }, + { + "epoch": 2.8158818599805215, + "grad_norm": 1.0082794427871704, + "learning_rate": 1.2263638011236635e-05, + "loss": 2.3697, + "step": 31081 + }, + { + "epoch": 2.815972458154968, + "grad_norm": 1.0447099208831787, + "learning_rate": 1.2257596810245878e-05, + "loss": 2.6378, + "step": 31082 + }, + { + "epoch": 2.816063056329415, + "grad_norm": 0.9941120147705078, + "learning_rate": 1.225155560925512e-05, + "loss": 2.4483, + "step": 31083 + }, + { + "epoch": 2.8161536545038617, + "grad_norm": 1.0087776184082031, + "learning_rate": 1.2245514408264363e-05, + "loss": 2.3601, + "step": 31084 + }, + { + "epoch": 2.8162442526783087, + "grad_norm": 0.948845624923706, + "learning_rate": 1.2239473207273607e-05, + "loss": 2.3512, + "step": 31085 + }, + { + "epoch": 2.8163348508527553, + "grad_norm": 0.9950467348098755, + "learning_rate": 1.223343200628285e-05, + "loss": 2.5607, + "step": 31086 + }, + { + "epoch": 2.8164254490272023, + "grad_norm": 1.0831451416015625, + "learning_rate": 1.2227390805292092e-05, + "loss": 2.8594, + "step": 31087 + }, + { + "epoch": 2.816516047201649, + "grad_norm": 1.1717230081558228, + "learning_rate": 1.2221349604301335e-05, + "loss": 2.52, + "step": 31088 + }, + { + "epoch": 2.816606645376096, + "grad_norm": 1.078391194343567, + "learning_rate": 1.2215308403310579e-05, + "loss": 2.5852, + "step": 31089 + }, + { + "epoch": 2.8166972435505424, + "grad_norm": 0.9689849615097046, + "learning_rate": 1.2209267202319821e-05, + "loss": 2.5196, + "step": 31090 + }, + { + "epoch": 2.816787841724989, + "grad_norm": 1.0076518058776855, + "learning_rate": 1.2203226001329064e-05, + "loss": 2.5407, + "step": 31091 + }, + { + "epoch": 2.816878439899436, + "grad_norm": 1.0350216627120972, + "learning_rate": 1.2197184800338308e-05, + "loss": 2.6621, + "step": 31092 + }, + { + "epoch": 2.816969038073883, + "grad_norm": 0.9659698009490967, + "learning_rate": 1.219114359934755e-05, + "loss": 2.5059, + "step": 31093 + }, + { + "epoch": 2.8170596362483296, + "grad_norm": 1.0643285512924194, + "learning_rate": 1.2185102398356795e-05, + "loss": 2.5951, + "step": 31094 + }, + { + "epoch": 2.817150234422776, + "grad_norm": 1.0075252056121826, + "learning_rate": 1.2179061197366037e-05, + "loss": 2.5467, + "step": 31095 + }, + { + "epoch": 2.817240832597223, + "grad_norm": 1.0852723121643066, + "learning_rate": 1.217301999637528e-05, + "loss": 2.6622, + "step": 31096 + }, + { + "epoch": 2.81733143077167, + "grad_norm": 0.9933376312255859, + "learning_rate": 1.2166978795384522e-05, + "loss": 2.5508, + "step": 31097 + }, + { + "epoch": 2.8174220289461167, + "grad_norm": 0.9989209175109863, + "learning_rate": 1.2160937594393767e-05, + "loss": 2.3917, + "step": 31098 + }, + { + "epoch": 2.8175126271205633, + "grad_norm": 1.0237836837768555, + "learning_rate": 1.215489639340301e-05, + "loss": 2.6574, + "step": 31099 + }, + { + "epoch": 2.8176032252950103, + "grad_norm": 1.0064345598220825, + "learning_rate": 1.2148855192412252e-05, + "loss": 2.6336, + "step": 31100 + }, + { + "epoch": 2.8176938234694573, + "grad_norm": 0.9839380979537964, + "learning_rate": 1.2142813991421494e-05, + "loss": 2.4392, + "step": 31101 + }, + { + "epoch": 2.817784421643904, + "grad_norm": 1.0651447772979736, + "learning_rate": 1.2136772790430738e-05, + "loss": 2.6697, + "step": 31102 + }, + { + "epoch": 2.8178750198183504, + "grad_norm": 0.9837102890014648, + "learning_rate": 1.2130731589439981e-05, + "loss": 2.6542, + "step": 31103 + }, + { + "epoch": 2.8179656179927974, + "grad_norm": 0.9887141585350037, + "learning_rate": 1.2124690388449224e-05, + "loss": 2.4475, + "step": 31104 + }, + { + "epoch": 2.8180562161672444, + "grad_norm": 0.9826048016548157, + "learning_rate": 1.2118649187458466e-05, + "loss": 2.5267, + "step": 31105 + }, + { + "epoch": 2.818146814341691, + "grad_norm": 1.0656474828720093, + "learning_rate": 1.211260798646771e-05, + "loss": 2.7253, + "step": 31106 + }, + { + "epoch": 2.8182374125161376, + "grad_norm": 1.0323792695999146, + "learning_rate": 1.2106566785476954e-05, + "loss": 2.5358, + "step": 31107 + }, + { + "epoch": 2.8183280106905846, + "grad_norm": 0.9884549975395203, + "learning_rate": 1.2100525584486197e-05, + "loss": 2.7676, + "step": 31108 + }, + { + "epoch": 2.8184186088650316, + "grad_norm": 1.046722650527954, + "learning_rate": 1.209448438349544e-05, + "loss": 2.8474, + "step": 31109 + }, + { + "epoch": 2.818509207039478, + "grad_norm": 1.1007561683654785, + "learning_rate": 1.2088443182504684e-05, + "loss": 2.4151, + "step": 31110 + }, + { + "epoch": 2.8185998052139247, + "grad_norm": 1.0944926738739014, + "learning_rate": 1.2082401981513926e-05, + "loss": 2.6836, + "step": 31111 + }, + { + "epoch": 2.8186904033883717, + "grad_norm": 1.0050063133239746, + "learning_rate": 1.2076360780523169e-05, + "loss": 2.7142, + "step": 31112 + }, + { + "epoch": 2.8187810015628187, + "grad_norm": 0.9755085706710815, + "learning_rate": 1.2070319579532411e-05, + "loss": 2.5334, + "step": 31113 + }, + { + "epoch": 2.8188715997372653, + "grad_norm": 0.9805720448493958, + "learning_rate": 1.2064278378541654e-05, + "loss": 2.7298, + "step": 31114 + }, + { + "epoch": 2.818962197911712, + "grad_norm": 1.0451364517211914, + "learning_rate": 1.2058237177550898e-05, + "loss": 2.6172, + "step": 31115 + }, + { + "epoch": 2.819052796086159, + "grad_norm": 1.1159710884094238, + "learning_rate": 1.205219597656014e-05, + "loss": 2.4272, + "step": 31116 + }, + { + "epoch": 2.819143394260606, + "grad_norm": 0.8808919191360474, + "learning_rate": 1.2046154775569383e-05, + "loss": 1.9089, + "step": 31117 + }, + { + "epoch": 2.8192339924350525, + "grad_norm": 0.9029552936553955, + "learning_rate": 1.2040113574578626e-05, + "loss": 2.0932, + "step": 31118 + }, + { + "epoch": 2.819324590609499, + "grad_norm": 0.9926722049713135, + "learning_rate": 1.203407237358787e-05, + "loss": 2.6916, + "step": 31119 + }, + { + "epoch": 2.819415188783946, + "grad_norm": 0.974651038646698, + "learning_rate": 1.2028031172597112e-05, + "loss": 2.5837, + "step": 31120 + }, + { + "epoch": 2.819505786958393, + "grad_norm": 0.926165759563446, + "learning_rate": 1.2021989971606355e-05, + "loss": 2.4459, + "step": 31121 + }, + { + "epoch": 2.8195963851328396, + "grad_norm": 1.0098609924316406, + "learning_rate": 1.2015948770615599e-05, + "loss": 2.7053, + "step": 31122 + }, + { + "epoch": 2.819686983307286, + "grad_norm": 0.962678849697113, + "learning_rate": 1.2009907569624843e-05, + "loss": 2.5381, + "step": 31123 + }, + { + "epoch": 2.819777581481733, + "grad_norm": 1.0702465772628784, + "learning_rate": 1.2003866368634086e-05, + "loss": 2.6455, + "step": 31124 + }, + { + "epoch": 2.81986817965618, + "grad_norm": 1.0961487293243408, + "learning_rate": 1.1997825167643328e-05, + "loss": 2.3989, + "step": 31125 + }, + { + "epoch": 2.8199587778306268, + "grad_norm": 1.0489648580551147, + "learning_rate": 1.1991783966652571e-05, + "loss": 2.8112, + "step": 31126 + }, + { + "epoch": 2.8200493760050733, + "grad_norm": 1.042965054512024, + "learning_rate": 1.1985742765661815e-05, + "loss": 2.4695, + "step": 31127 + }, + { + "epoch": 2.8201399741795203, + "grad_norm": 0.9994888305664062, + "learning_rate": 1.1979701564671058e-05, + "loss": 2.7809, + "step": 31128 + }, + { + "epoch": 2.820230572353967, + "grad_norm": 1.0549448728561401, + "learning_rate": 1.19736603636803e-05, + "loss": 2.7497, + "step": 31129 + }, + { + "epoch": 2.820321170528414, + "grad_norm": 0.9887829422950745, + "learning_rate": 1.1967619162689543e-05, + "loss": 2.5361, + "step": 31130 + }, + { + "epoch": 2.8204117687028605, + "grad_norm": 1.0214320421218872, + "learning_rate": 1.1961577961698787e-05, + "loss": 2.5343, + "step": 31131 + }, + { + "epoch": 2.8205023668773075, + "grad_norm": 0.9575126767158508, + "learning_rate": 1.195553676070803e-05, + "loss": 2.6087, + "step": 31132 + }, + { + "epoch": 2.820592965051754, + "grad_norm": 0.9969339966773987, + "learning_rate": 1.1949495559717272e-05, + "loss": 2.5031, + "step": 31133 + }, + { + "epoch": 2.820683563226201, + "grad_norm": 0.954775333404541, + "learning_rate": 1.1943454358726514e-05, + "loss": 2.5364, + "step": 31134 + }, + { + "epoch": 2.8207741614006476, + "grad_norm": 1.060811996459961, + "learning_rate": 1.1937413157735759e-05, + "loss": 2.5754, + "step": 31135 + }, + { + "epoch": 2.8208647595750946, + "grad_norm": 1.0329619646072388, + "learning_rate": 1.1931371956745001e-05, + "loss": 2.6894, + "step": 31136 + }, + { + "epoch": 2.820955357749541, + "grad_norm": 1.022718071937561, + "learning_rate": 1.1925330755754245e-05, + "loss": 2.7254, + "step": 31137 + }, + { + "epoch": 2.821045955923988, + "grad_norm": 0.9606385231018066, + "learning_rate": 1.1919289554763488e-05, + "loss": 2.3833, + "step": 31138 + }, + { + "epoch": 2.821136554098435, + "grad_norm": 1.13912034034729, + "learning_rate": 1.191324835377273e-05, + "loss": 2.654, + "step": 31139 + }, + { + "epoch": 2.821227152272882, + "grad_norm": 0.9706332683563232, + "learning_rate": 1.1907207152781975e-05, + "loss": 2.4891, + "step": 31140 + }, + { + "epoch": 2.8213177504473284, + "grad_norm": 1.0518465042114258, + "learning_rate": 1.1901165951791217e-05, + "loss": 2.5614, + "step": 31141 + }, + { + "epoch": 2.8214083486217754, + "grad_norm": 0.9904698133468628, + "learning_rate": 1.189512475080046e-05, + "loss": 2.7443, + "step": 31142 + }, + { + "epoch": 2.821498946796222, + "grad_norm": 0.920443594455719, + "learning_rate": 1.1889083549809702e-05, + "loss": 1.8101, + "step": 31143 + }, + { + "epoch": 2.821589544970669, + "grad_norm": 0.8770005702972412, + "learning_rate": 1.1883042348818946e-05, + "loss": 1.8967, + "step": 31144 + }, + { + "epoch": 2.8216801431451155, + "grad_norm": 1.0263038873672485, + "learning_rate": 1.1877001147828189e-05, + "loss": 2.7502, + "step": 31145 + }, + { + "epoch": 2.8217707413195625, + "grad_norm": 1.0329021215438843, + "learning_rate": 1.1870959946837432e-05, + "loss": 2.7952, + "step": 31146 + }, + { + "epoch": 2.821861339494009, + "grad_norm": 0.9347423315048218, + "learning_rate": 1.1864918745846674e-05, + "loss": 2.1478, + "step": 31147 + }, + { + "epoch": 2.821951937668456, + "grad_norm": 1.0103545188903809, + "learning_rate": 1.1858877544855918e-05, + "loss": 2.7981, + "step": 31148 + }, + { + "epoch": 2.8220425358429027, + "grad_norm": 1.0292915105819702, + "learning_rate": 1.185283634386516e-05, + "loss": 2.8391, + "step": 31149 + }, + { + "epoch": 2.8221331340173497, + "grad_norm": 1.0662602186203003, + "learning_rate": 1.1846795142874403e-05, + "loss": 2.5932, + "step": 31150 + }, + { + "epoch": 2.8222237321917962, + "grad_norm": 0.9149623513221741, + "learning_rate": 1.1840753941883648e-05, + "loss": 2.219, + "step": 31151 + }, + { + "epoch": 2.8223143303662432, + "grad_norm": 0.95064377784729, + "learning_rate": 1.183471274089289e-05, + "loss": 2.4491, + "step": 31152 + }, + { + "epoch": 2.82240492854069, + "grad_norm": 0.8755053281784058, + "learning_rate": 1.1828671539902134e-05, + "loss": 1.9699, + "step": 31153 + }, + { + "epoch": 2.822495526715137, + "grad_norm": 0.8507067561149597, + "learning_rate": 1.1822630338911377e-05, + "loss": 2.001, + "step": 31154 + }, + { + "epoch": 2.8225861248895834, + "grad_norm": 0.9594812989234924, + "learning_rate": 1.181658913792062e-05, + "loss": 2.3302, + "step": 31155 + }, + { + "epoch": 2.8226767230640304, + "grad_norm": 1.033610463142395, + "learning_rate": 1.1810547936929862e-05, + "loss": 2.5983, + "step": 31156 + }, + { + "epoch": 2.822767321238477, + "grad_norm": 1.0564582347869873, + "learning_rate": 1.1804506735939106e-05, + "loss": 2.5718, + "step": 31157 + }, + { + "epoch": 2.822857919412924, + "grad_norm": 1.1147247552871704, + "learning_rate": 1.1798465534948349e-05, + "loss": 2.4957, + "step": 31158 + }, + { + "epoch": 2.8229485175873705, + "grad_norm": 0.8850034475326538, + "learning_rate": 1.1792424333957591e-05, + "loss": 1.8952, + "step": 31159 + }, + { + "epoch": 2.8230391157618175, + "grad_norm": 1.0444610118865967, + "learning_rate": 1.1786383132966834e-05, + "loss": 2.7669, + "step": 31160 + }, + { + "epoch": 2.823129713936264, + "grad_norm": 1.025831699371338, + "learning_rate": 1.1780341931976078e-05, + "loss": 2.7103, + "step": 31161 + }, + { + "epoch": 2.823220312110711, + "grad_norm": 1.0059175491333008, + "learning_rate": 1.177430073098532e-05, + "loss": 2.6911, + "step": 31162 + }, + { + "epoch": 2.8233109102851577, + "grad_norm": 0.8790725469589233, + "learning_rate": 1.1768259529994563e-05, + "loss": 2.0523, + "step": 31163 + }, + { + "epoch": 2.8234015084596047, + "grad_norm": 1.0239139795303345, + "learning_rate": 1.1762218329003805e-05, + "loss": 2.4622, + "step": 31164 + }, + { + "epoch": 2.8234921066340513, + "grad_norm": 1.1428412199020386, + "learning_rate": 1.175617712801305e-05, + "loss": 2.4042, + "step": 31165 + }, + { + "epoch": 2.8235827048084983, + "grad_norm": 1.1474308967590332, + "learning_rate": 1.1750135927022294e-05, + "loss": 2.3213, + "step": 31166 + }, + { + "epoch": 2.823673302982945, + "grad_norm": 1.0095386505126953, + "learning_rate": 1.1744094726031536e-05, + "loss": 2.6131, + "step": 31167 + }, + { + "epoch": 2.823763901157392, + "grad_norm": 1.1292908191680908, + "learning_rate": 1.1738053525040779e-05, + "loss": 2.5206, + "step": 31168 + }, + { + "epoch": 2.8238544993318384, + "grad_norm": 1.0857905149459839, + "learning_rate": 1.1732012324050021e-05, + "loss": 2.7386, + "step": 31169 + }, + { + "epoch": 2.823945097506285, + "grad_norm": 0.9980790615081787, + "learning_rate": 1.1725971123059266e-05, + "loss": 2.4894, + "step": 31170 + }, + { + "epoch": 2.824035695680732, + "grad_norm": 1.0805058479309082, + "learning_rate": 1.1719929922068508e-05, + "loss": 2.6476, + "step": 31171 + }, + { + "epoch": 2.824126293855179, + "grad_norm": 0.9909175634384155, + "learning_rate": 1.171388872107775e-05, + "loss": 2.4793, + "step": 31172 + }, + { + "epoch": 2.8242168920296256, + "grad_norm": 0.9873234033584595, + "learning_rate": 1.1707847520086993e-05, + "loss": 2.5789, + "step": 31173 + }, + { + "epoch": 2.824307490204072, + "grad_norm": 0.9910760521888733, + "learning_rate": 1.1701806319096237e-05, + "loss": 2.5226, + "step": 31174 + }, + { + "epoch": 2.824398088378519, + "grad_norm": 1.0882470607757568, + "learning_rate": 1.169576511810548e-05, + "loss": 2.5436, + "step": 31175 + }, + { + "epoch": 2.824488686552966, + "grad_norm": 1.045484185218811, + "learning_rate": 1.1689723917114722e-05, + "loss": 2.8704, + "step": 31176 + }, + { + "epoch": 2.8245792847274127, + "grad_norm": 0.8931983709335327, + "learning_rate": 1.1683682716123965e-05, + "loss": 1.9015, + "step": 31177 + }, + { + "epoch": 2.8246698829018593, + "grad_norm": 1.0135208368301392, + "learning_rate": 1.167764151513321e-05, + "loss": 2.7017, + "step": 31178 + }, + { + "epoch": 2.8247604810763063, + "grad_norm": 0.9871640205383301, + "learning_rate": 1.1671600314142452e-05, + "loss": 2.451, + "step": 31179 + }, + { + "epoch": 2.8248510792507533, + "grad_norm": 0.968397855758667, + "learning_rate": 1.1665559113151694e-05, + "loss": 2.5995, + "step": 31180 + }, + { + "epoch": 2.8249416774252, + "grad_norm": 0.9704222679138184, + "learning_rate": 1.1659517912160938e-05, + "loss": 2.6255, + "step": 31181 + }, + { + "epoch": 2.8250322755996464, + "grad_norm": 1.1443558931350708, + "learning_rate": 1.1653476711170181e-05, + "loss": 2.6047, + "step": 31182 + }, + { + "epoch": 2.8251228737740934, + "grad_norm": 0.9982873201370239, + "learning_rate": 1.1647435510179425e-05, + "loss": 2.7017, + "step": 31183 + }, + { + "epoch": 2.8252134719485404, + "grad_norm": 0.9257809519767761, + "learning_rate": 1.1641394309188668e-05, + "loss": 2.0697, + "step": 31184 + }, + { + "epoch": 2.825304070122987, + "grad_norm": 1.0480397939682007, + "learning_rate": 1.163535310819791e-05, + "loss": 2.7631, + "step": 31185 + }, + { + "epoch": 2.8253946682974336, + "grad_norm": 1.0376722812652588, + "learning_rate": 1.1629311907207153e-05, + "loss": 2.8014, + "step": 31186 + }, + { + "epoch": 2.8254852664718806, + "grad_norm": 0.9095777273178101, + "learning_rate": 1.1623270706216397e-05, + "loss": 2.5018, + "step": 31187 + }, + { + "epoch": 2.8255758646463276, + "grad_norm": 0.9866828322410583, + "learning_rate": 1.161722950522564e-05, + "loss": 2.6942, + "step": 31188 + }, + { + "epoch": 2.825666462820774, + "grad_norm": 1.059330701828003, + "learning_rate": 1.1611188304234882e-05, + "loss": 2.5536, + "step": 31189 + }, + { + "epoch": 2.8257570609952207, + "grad_norm": 1.1350822448730469, + "learning_rate": 1.1605147103244125e-05, + "loss": 2.384, + "step": 31190 + }, + { + "epoch": 2.8258476591696677, + "grad_norm": 1.0974177122116089, + "learning_rate": 1.1599105902253369e-05, + "loss": 2.7766, + "step": 31191 + }, + { + "epoch": 2.8259382573441147, + "grad_norm": 1.0192230939865112, + "learning_rate": 1.1593064701262611e-05, + "loss": 2.4941, + "step": 31192 + }, + { + "epoch": 2.8260288555185613, + "grad_norm": 0.998214840888977, + "learning_rate": 1.1587023500271854e-05, + "loss": 2.7889, + "step": 31193 + }, + { + "epoch": 2.826119453693008, + "grad_norm": 0.9648220539093018, + "learning_rate": 1.1580982299281096e-05, + "loss": 1.9913, + "step": 31194 + }, + { + "epoch": 2.826210051867455, + "grad_norm": 0.996566116809845, + "learning_rate": 1.157494109829034e-05, + "loss": 2.5857, + "step": 31195 + }, + { + "epoch": 2.826300650041902, + "grad_norm": 1.0124382972717285, + "learning_rate": 1.1568899897299585e-05, + "loss": 2.4565, + "step": 31196 + }, + { + "epoch": 2.8263912482163485, + "grad_norm": 1.0048104524612427, + "learning_rate": 1.1562858696308827e-05, + "loss": 2.5452, + "step": 31197 + }, + { + "epoch": 2.826481846390795, + "grad_norm": 1.0023362636566162, + "learning_rate": 1.155681749531807e-05, + "loss": 2.7732, + "step": 31198 + }, + { + "epoch": 2.826572444565242, + "grad_norm": 1.0961898565292358, + "learning_rate": 1.1550776294327312e-05, + "loss": 2.4382, + "step": 31199 + }, + { + "epoch": 2.826663042739689, + "grad_norm": 1.172154426574707, + "learning_rate": 1.1544735093336557e-05, + "loss": 2.7052, + "step": 31200 + }, + { + "epoch": 2.8267536409141356, + "grad_norm": 0.9793151021003723, + "learning_rate": 1.1538693892345799e-05, + "loss": 2.8419, + "step": 31201 + }, + { + "epoch": 2.826844239088582, + "grad_norm": 0.9834226965904236, + "learning_rate": 1.1532652691355042e-05, + "loss": 2.4848, + "step": 31202 + }, + { + "epoch": 2.826934837263029, + "grad_norm": 0.9957412481307983, + "learning_rate": 1.1526611490364284e-05, + "loss": 2.9214, + "step": 31203 + }, + { + "epoch": 2.827025435437476, + "grad_norm": 0.9703569412231445, + "learning_rate": 1.1520570289373528e-05, + "loss": 2.6687, + "step": 31204 + }, + { + "epoch": 2.8271160336119228, + "grad_norm": 1.01206374168396, + "learning_rate": 1.1514529088382771e-05, + "loss": 2.793, + "step": 31205 + }, + { + "epoch": 2.8272066317863693, + "grad_norm": 1.0069152116775513, + "learning_rate": 1.1508487887392013e-05, + "loss": 2.8455, + "step": 31206 + }, + { + "epoch": 2.8272972299608163, + "grad_norm": 1.018327236175537, + "learning_rate": 1.1502446686401256e-05, + "loss": 2.8344, + "step": 31207 + }, + { + "epoch": 2.8273878281352633, + "grad_norm": 1.1130958795547485, + "learning_rate": 1.14964054854105e-05, + "loss": 2.6411, + "step": 31208 + }, + { + "epoch": 2.82747842630971, + "grad_norm": 0.9538321495056152, + "learning_rate": 1.1490364284419743e-05, + "loss": 2.7124, + "step": 31209 + }, + { + "epoch": 2.8275690244841565, + "grad_norm": 1.0393391847610474, + "learning_rate": 1.1484323083428985e-05, + "loss": 2.7496, + "step": 31210 + }, + { + "epoch": 2.8276596226586035, + "grad_norm": 0.9783493876457214, + "learning_rate": 1.147828188243823e-05, + "loss": 2.8233, + "step": 31211 + }, + { + "epoch": 2.82775022083305, + "grad_norm": 1.0631670951843262, + "learning_rate": 1.1472240681447474e-05, + "loss": 2.6265, + "step": 31212 + }, + { + "epoch": 2.827840819007497, + "grad_norm": 0.9514533877372742, + "learning_rate": 1.1466199480456716e-05, + "loss": 2.4527, + "step": 31213 + }, + { + "epoch": 2.8279314171819436, + "grad_norm": 0.838028609752655, + "learning_rate": 1.1460158279465959e-05, + "loss": 2.0631, + "step": 31214 + }, + { + "epoch": 2.8280220153563906, + "grad_norm": 1.0169965028762817, + "learning_rate": 1.1454117078475201e-05, + "loss": 2.6129, + "step": 31215 + }, + { + "epoch": 2.828112613530837, + "grad_norm": 0.9103288054466248, + "learning_rate": 1.1448075877484445e-05, + "loss": 1.9541, + "step": 31216 + }, + { + "epoch": 2.828203211705284, + "grad_norm": 1.0396251678466797, + "learning_rate": 1.1442034676493688e-05, + "loss": 2.616, + "step": 31217 + }, + { + "epoch": 2.828293809879731, + "grad_norm": 1.0436937808990479, + "learning_rate": 1.143599347550293e-05, + "loss": 2.5721, + "step": 31218 + }, + { + "epoch": 2.828384408054178, + "grad_norm": 1.003472089767456, + "learning_rate": 1.1429952274512173e-05, + "loss": 2.5666, + "step": 31219 + }, + { + "epoch": 2.8284750062286244, + "grad_norm": 1.0416890382766724, + "learning_rate": 1.1423911073521416e-05, + "loss": 2.9227, + "step": 31220 + }, + { + "epoch": 2.8285656044030714, + "grad_norm": 1.0711942911148071, + "learning_rate": 1.141786987253066e-05, + "loss": 2.6492, + "step": 31221 + }, + { + "epoch": 2.828656202577518, + "grad_norm": 0.9972506761550903, + "learning_rate": 1.1411828671539902e-05, + "loss": 2.7374, + "step": 31222 + }, + { + "epoch": 2.828746800751965, + "grad_norm": 0.9653927683830261, + "learning_rate": 1.1405787470549145e-05, + "loss": 2.3471, + "step": 31223 + }, + { + "epoch": 2.8288373989264115, + "grad_norm": 1.0315290689468384, + "learning_rate": 1.1399746269558387e-05, + "loss": 2.5582, + "step": 31224 + }, + { + "epoch": 2.8289279971008585, + "grad_norm": 1.0195844173431396, + "learning_rate": 1.1393705068567633e-05, + "loss": 2.7868, + "step": 31225 + }, + { + "epoch": 2.829018595275305, + "grad_norm": 1.0464460849761963, + "learning_rate": 1.1387663867576876e-05, + "loss": 2.7265, + "step": 31226 + }, + { + "epoch": 2.829109193449752, + "grad_norm": 1.074485182762146, + "learning_rate": 1.1381622666586118e-05, + "loss": 2.7925, + "step": 31227 + }, + { + "epoch": 2.8291997916241987, + "grad_norm": 0.8301473259925842, + "learning_rate": 1.137558146559536e-05, + "loss": 1.9716, + "step": 31228 + }, + { + "epoch": 2.8292903897986457, + "grad_norm": 0.9777604341506958, + "learning_rate": 1.1369540264604605e-05, + "loss": 2.6769, + "step": 31229 + }, + { + "epoch": 2.8293809879730922, + "grad_norm": 1.1273497343063354, + "learning_rate": 1.1363499063613848e-05, + "loss": 2.6747, + "step": 31230 + }, + { + "epoch": 2.8294715861475392, + "grad_norm": 1.1357016563415527, + "learning_rate": 1.135745786262309e-05, + "loss": 2.4117, + "step": 31231 + }, + { + "epoch": 2.829562184321986, + "grad_norm": 1.0259530544281006, + "learning_rate": 1.1351416661632333e-05, + "loss": 3.1313, + "step": 31232 + }, + { + "epoch": 2.829652782496433, + "grad_norm": 1.0040677785873413, + "learning_rate": 1.1345375460641577e-05, + "loss": 2.7189, + "step": 31233 + }, + { + "epoch": 2.8297433806708794, + "grad_norm": 1.0243226289749146, + "learning_rate": 1.133933425965082e-05, + "loss": 2.562, + "step": 31234 + }, + { + "epoch": 2.8298339788453264, + "grad_norm": 0.8617832064628601, + "learning_rate": 1.1333293058660062e-05, + "loss": 2.5097, + "step": 31235 + }, + { + "epoch": 2.829924577019773, + "grad_norm": 1.040449619293213, + "learning_rate": 1.1327251857669304e-05, + "loss": 2.4939, + "step": 31236 + }, + { + "epoch": 2.83001517519422, + "grad_norm": 0.9794113636016846, + "learning_rate": 1.1321210656678549e-05, + "loss": 2.5883, + "step": 31237 + }, + { + "epoch": 2.8301057733686665, + "grad_norm": 0.9362903833389282, + "learning_rate": 1.1315169455687791e-05, + "loss": 2.7079, + "step": 31238 + }, + { + "epoch": 2.8301963715431135, + "grad_norm": 0.992060124874115, + "learning_rate": 1.1309128254697034e-05, + "loss": 2.3656, + "step": 31239 + }, + { + "epoch": 2.83028696971756, + "grad_norm": 0.9912691712379456, + "learning_rate": 1.1303087053706278e-05, + "loss": 2.6097, + "step": 31240 + }, + { + "epoch": 2.830377567892007, + "grad_norm": 1.0740903615951538, + "learning_rate": 1.129704585271552e-05, + "loss": 2.4293, + "step": 31241 + }, + { + "epoch": 2.8304681660664537, + "grad_norm": 1.1576911211013794, + "learning_rate": 1.1291004651724765e-05, + "loss": 2.381, + "step": 31242 + }, + { + "epoch": 2.8305587642409007, + "grad_norm": 1.0106474161148071, + "learning_rate": 1.1284963450734007e-05, + "loss": 2.3766, + "step": 31243 + }, + { + "epoch": 2.8306493624153473, + "grad_norm": 1.0216108560562134, + "learning_rate": 1.127892224974325e-05, + "loss": 2.7954, + "step": 31244 + }, + { + "epoch": 2.8307399605897943, + "grad_norm": 0.9852975010871887, + "learning_rate": 1.1272881048752492e-05, + "loss": 2.828, + "step": 31245 + }, + { + "epoch": 2.830830558764241, + "grad_norm": 0.8273731470108032, + "learning_rate": 1.1266839847761736e-05, + "loss": 1.8893, + "step": 31246 + }, + { + "epoch": 2.830921156938688, + "grad_norm": 0.9966647624969482, + "learning_rate": 1.1260798646770979e-05, + "loss": 2.5787, + "step": 31247 + }, + { + "epoch": 2.8310117551131344, + "grad_norm": 0.9665254950523376, + "learning_rate": 1.1254757445780221e-05, + "loss": 2.6273, + "step": 31248 + }, + { + "epoch": 2.8311023532875814, + "grad_norm": 0.9610216021537781, + "learning_rate": 1.1248716244789464e-05, + "loss": 2.51, + "step": 31249 + }, + { + "epoch": 2.831192951462028, + "grad_norm": 0.8659108877182007, + "learning_rate": 1.1242675043798708e-05, + "loss": 1.8226, + "step": 31250 + }, + { + "epoch": 2.831283549636475, + "grad_norm": 1.05868661403656, + "learning_rate": 1.123663384280795e-05, + "loss": 2.6517, + "step": 31251 + }, + { + "epoch": 2.8313741478109216, + "grad_norm": 1.0116301774978638, + "learning_rate": 1.1230592641817193e-05, + "loss": 2.5378, + "step": 31252 + }, + { + "epoch": 2.831464745985368, + "grad_norm": 0.8799184560775757, + "learning_rate": 1.1224551440826436e-05, + "loss": 1.9933, + "step": 31253 + }, + { + "epoch": 2.831555344159815, + "grad_norm": 1.0906599760055542, + "learning_rate": 1.121851023983568e-05, + "loss": 2.4726, + "step": 31254 + }, + { + "epoch": 2.831645942334262, + "grad_norm": 1.0982211828231812, + "learning_rate": 1.1212469038844924e-05, + "loss": 2.738, + "step": 31255 + }, + { + "epoch": 2.8317365405087087, + "grad_norm": 0.9828441143035889, + "learning_rate": 1.1206427837854167e-05, + "loss": 2.7716, + "step": 31256 + }, + { + "epoch": 2.8318271386831553, + "grad_norm": 0.9668278098106384, + "learning_rate": 1.120038663686341e-05, + "loss": 2.5026, + "step": 31257 + }, + { + "epoch": 2.8319177368576023, + "grad_norm": 0.8824244737625122, + "learning_rate": 1.1194345435872652e-05, + "loss": 2.0709, + "step": 31258 + }, + { + "epoch": 2.8320083350320493, + "grad_norm": 1.0240271091461182, + "learning_rate": 1.1188304234881896e-05, + "loss": 2.7768, + "step": 31259 + }, + { + "epoch": 2.832098933206496, + "grad_norm": 1.064780354499817, + "learning_rate": 1.1182263033891138e-05, + "loss": 2.5985, + "step": 31260 + }, + { + "epoch": 2.8321895313809424, + "grad_norm": 1.1732405424118042, + "learning_rate": 1.1176221832900381e-05, + "loss": 2.4454, + "step": 31261 + }, + { + "epoch": 2.8322801295553894, + "grad_norm": 1.1047555208206177, + "learning_rate": 1.1170180631909624e-05, + "loss": 2.6851, + "step": 31262 + }, + { + "epoch": 2.8323707277298364, + "grad_norm": 0.9565321803092957, + "learning_rate": 1.1164139430918868e-05, + "loss": 2.5349, + "step": 31263 + }, + { + "epoch": 2.832461325904283, + "grad_norm": 0.9802930355072021, + "learning_rate": 1.115809822992811e-05, + "loss": 2.5105, + "step": 31264 + }, + { + "epoch": 2.8325519240787296, + "grad_norm": 0.8890196681022644, + "learning_rate": 1.1152057028937353e-05, + "loss": 1.992, + "step": 31265 + }, + { + "epoch": 2.8326425222531766, + "grad_norm": 0.9330021142959595, + "learning_rate": 1.1146015827946595e-05, + "loss": 1.9909, + "step": 31266 + }, + { + "epoch": 2.8327331204276236, + "grad_norm": 1.0586098432540894, + "learning_rate": 1.113997462695584e-05, + "loss": 2.2592, + "step": 31267 + }, + { + "epoch": 2.83282371860207, + "grad_norm": 0.9548074007034302, + "learning_rate": 1.1133933425965082e-05, + "loss": 2.6224, + "step": 31268 + }, + { + "epoch": 2.8329143167765167, + "grad_norm": 0.9395119547843933, + "learning_rate": 1.1127892224974325e-05, + "loss": 2.4617, + "step": 31269 + }, + { + "epoch": 2.8330049149509637, + "grad_norm": 1.01923668384552, + "learning_rate": 1.1121851023983569e-05, + "loss": 2.3937, + "step": 31270 + }, + { + "epoch": 2.8330955131254107, + "grad_norm": 0.9141213297843933, + "learning_rate": 1.1115809822992811e-05, + "loss": 1.8598, + "step": 31271 + }, + { + "epoch": 2.8331861112998573, + "grad_norm": 0.8518542647361755, + "learning_rate": 1.1109768622002056e-05, + "loss": 1.9237, + "step": 31272 + }, + { + "epoch": 2.833276709474304, + "grad_norm": 1.01419997215271, + "learning_rate": 1.1103727421011298e-05, + "loss": 2.7958, + "step": 31273 + }, + { + "epoch": 2.833367307648751, + "grad_norm": 0.8506998419761658, + "learning_rate": 1.109768622002054e-05, + "loss": 1.9528, + "step": 31274 + }, + { + "epoch": 2.833457905823198, + "grad_norm": 1.0146193504333496, + "learning_rate": 1.1091645019029783e-05, + "loss": 2.6497, + "step": 31275 + }, + { + "epoch": 2.8335485039976445, + "grad_norm": 0.9860795140266418, + "learning_rate": 1.1085603818039027e-05, + "loss": 2.6038, + "step": 31276 + }, + { + "epoch": 2.833639102172091, + "grad_norm": 0.9840720891952515, + "learning_rate": 1.107956261704827e-05, + "loss": 2.6609, + "step": 31277 + }, + { + "epoch": 2.833729700346538, + "grad_norm": 0.8418068289756775, + "learning_rate": 1.1073521416057512e-05, + "loss": 1.8806, + "step": 31278 + }, + { + "epoch": 2.833820298520985, + "grad_norm": 1.0706309080123901, + "learning_rate": 1.1067480215066755e-05, + "loss": 2.6374, + "step": 31279 + }, + { + "epoch": 2.8339108966954316, + "grad_norm": 1.0375237464904785, + "learning_rate": 1.1061439014075999e-05, + "loss": 2.6213, + "step": 31280 + }, + { + "epoch": 2.834001494869878, + "grad_norm": 1.1134908199310303, + "learning_rate": 1.1055397813085242e-05, + "loss": 2.5534, + "step": 31281 + }, + { + "epoch": 2.834092093044325, + "grad_norm": 1.0825954675674438, + "learning_rate": 1.1049356612094484e-05, + "loss": 2.5678, + "step": 31282 + }, + { + "epoch": 2.834182691218772, + "grad_norm": 1.0303676128387451, + "learning_rate": 1.1043315411103727e-05, + "loss": 2.8631, + "step": 31283 + }, + { + "epoch": 2.8342732893932188, + "grad_norm": 0.9489887952804565, + "learning_rate": 1.1037274210112971e-05, + "loss": 2.4062, + "step": 31284 + }, + { + "epoch": 2.8343638875676653, + "grad_norm": 0.924510657787323, + "learning_rate": 1.1031233009122215e-05, + "loss": 1.9075, + "step": 31285 + }, + { + "epoch": 2.8344544857421123, + "grad_norm": 1.0524877309799194, + "learning_rate": 1.1025191808131458e-05, + "loss": 2.5481, + "step": 31286 + }, + { + "epoch": 2.8345450839165593, + "grad_norm": 1.012725591659546, + "learning_rate": 1.10191506071407e-05, + "loss": 2.8092, + "step": 31287 + }, + { + "epoch": 2.834635682091006, + "grad_norm": 1.0286682844161987, + "learning_rate": 1.1013109406149943e-05, + "loss": 2.4078, + "step": 31288 + }, + { + "epoch": 2.8347262802654525, + "grad_norm": 1.023924469947815, + "learning_rate": 1.1007068205159187e-05, + "loss": 2.4827, + "step": 31289 + }, + { + "epoch": 2.8348168784398995, + "grad_norm": 1.0877046585083008, + "learning_rate": 1.100102700416843e-05, + "loss": 2.4751, + "step": 31290 + }, + { + "epoch": 2.834907476614346, + "grad_norm": 0.9562898278236389, + "learning_rate": 1.0994985803177672e-05, + "loss": 2.5824, + "step": 31291 + }, + { + "epoch": 2.834998074788793, + "grad_norm": 1.0811761617660522, + "learning_rate": 1.0988944602186914e-05, + "loss": 2.6789, + "step": 31292 + }, + { + "epoch": 2.8350886729632396, + "grad_norm": 1.015196681022644, + "learning_rate": 1.0982903401196159e-05, + "loss": 2.842, + "step": 31293 + }, + { + "epoch": 2.8351792711376866, + "grad_norm": 0.9752158522605896, + "learning_rate": 1.0976862200205401e-05, + "loss": 2.717, + "step": 31294 + }, + { + "epoch": 2.835269869312133, + "grad_norm": 1.1660575866699219, + "learning_rate": 1.0970820999214644e-05, + "loss": 2.4292, + "step": 31295 + }, + { + "epoch": 2.83536046748658, + "grad_norm": 1.011818528175354, + "learning_rate": 1.0964779798223886e-05, + "loss": 2.3748, + "step": 31296 + }, + { + "epoch": 2.835451065661027, + "grad_norm": 0.798447847366333, + "learning_rate": 1.095873859723313e-05, + "loss": 1.2672, + "step": 31297 + }, + { + "epoch": 2.835541663835474, + "grad_norm": 1.0260003805160522, + "learning_rate": 1.0952697396242373e-05, + "loss": 2.6743, + "step": 31298 + }, + { + "epoch": 2.8356322620099204, + "grad_norm": 1.1224770545959473, + "learning_rate": 1.0946656195251617e-05, + "loss": 2.7418, + "step": 31299 + }, + { + "epoch": 2.8357228601843674, + "grad_norm": 1.0138121843338013, + "learning_rate": 1.094061499426086e-05, + "loss": 2.8282, + "step": 31300 + }, + { + "epoch": 2.835813458358814, + "grad_norm": 1.0635583400726318, + "learning_rate": 1.0934573793270102e-05, + "loss": 2.5781, + "step": 31301 + }, + { + "epoch": 2.835904056533261, + "grad_norm": 1.0141741037368774, + "learning_rate": 1.0928532592279347e-05, + "loss": 2.5302, + "step": 31302 + }, + { + "epoch": 2.8359946547077075, + "grad_norm": 1.0929745435714722, + "learning_rate": 1.0922491391288589e-05, + "loss": 2.7518, + "step": 31303 + }, + { + "epoch": 2.8360852528821545, + "grad_norm": 0.9743183851242065, + "learning_rate": 1.0916450190297832e-05, + "loss": 2.8333, + "step": 31304 + }, + { + "epoch": 2.836175851056601, + "grad_norm": 0.9557403326034546, + "learning_rate": 1.0910408989307074e-05, + "loss": 2.4583, + "step": 31305 + }, + { + "epoch": 2.836266449231048, + "grad_norm": 1.0815370082855225, + "learning_rate": 1.0904367788316318e-05, + "loss": 2.74, + "step": 31306 + }, + { + "epoch": 2.8363570474054947, + "grad_norm": 0.9410581588745117, + "learning_rate": 1.089832658732556e-05, + "loss": 1.8097, + "step": 31307 + }, + { + "epoch": 2.8364476455799417, + "grad_norm": 1.0007195472717285, + "learning_rate": 1.0892285386334803e-05, + "loss": 2.601, + "step": 31308 + }, + { + "epoch": 2.8365382437543882, + "grad_norm": 1.0236287117004395, + "learning_rate": 1.0886244185344046e-05, + "loss": 2.4157, + "step": 31309 + }, + { + "epoch": 2.8366288419288352, + "grad_norm": 1.000339388847351, + "learning_rate": 1.088020298435329e-05, + "loss": 2.4395, + "step": 31310 + }, + { + "epoch": 2.836719440103282, + "grad_norm": 1.0320881605148315, + "learning_rate": 1.0874161783362533e-05, + "loss": 2.6842, + "step": 31311 + }, + { + "epoch": 2.836810038277729, + "grad_norm": 1.0119102001190186, + "learning_rate": 1.0868120582371775e-05, + "loss": 2.3727, + "step": 31312 + }, + { + "epoch": 2.8369006364521754, + "grad_norm": 0.8985795378684998, + "learning_rate": 1.0862079381381018e-05, + "loss": 2.1171, + "step": 31313 + }, + { + "epoch": 2.8369912346266224, + "grad_norm": 0.9692565202713013, + "learning_rate": 1.0856038180390264e-05, + "loss": 2.9703, + "step": 31314 + }, + { + "epoch": 2.837081832801069, + "grad_norm": 1.0615861415863037, + "learning_rate": 1.0849996979399506e-05, + "loss": 2.605, + "step": 31315 + }, + { + "epoch": 2.837172430975516, + "grad_norm": 0.9835899472236633, + "learning_rate": 1.0843955778408749e-05, + "loss": 2.6105, + "step": 31316 + }, + { + "epoch": 2.8372630291499625, + "grad_norm": 1.0150108337402344, + "learning_rate": 1.0837914577417991e-05, + "loss": 2.5091, + "step": 31317 + }, + { + "epoch": 2.8373536273244095, + "grad_norm": 1.1262949705123901, + "learning_rate": 1.0831873376427235e-05, + "loss": 2.4757, + "step": 31318 + }, + { + "epoch": 2.837444225498856, + "grad_norm": 0.9381189942359924, + "learning_rate": 1.0825832175436478e-05, + "loss": 2.3508, + "step": 31319 + }, + { + "epoch": 2.837534823673303, + "grad_norm": 0.9817870855331421, + "learning_rate": 1.081979097444572e-05, + "loss": 2.6844, + "step": 31320 + }, + { + "epoch": 2.8376254218477497, + "grad_norm": 1.0273516178131104, + "learning_rate": 1.0813749773454963e-05, + "loss": 2.4982, + "step": 31321 + }, + { + "epoch": 2.8377160200221967, + "grad_norm": 1.006219744682312, + "learning_rate": 1.0807708572464207e-05, + "loss": 2.5349, + "step": 31322 + }, + { + "epoch": 2.8378066181966433, + "grad_norm": 0.915152907371521, + "learning_rate": 1.080166737147345e-05, + "loss": 2.2652, + "step": 31323 + }, + { + "epoch": 2.8378972163710903, + "grad_norm": 1.0449670553207397, + "learning_rate": 1.0795626170482692e-05, + "loss": 2.713, + "step": 31324 + }, + { + "epoch": 2.837987814545537, + "grad_norm": 1.050264596939087, + "learning_rate": 1.0789584969491935e-05, + "loss": 2.6907, + "step": 31325 + }, + { + "epoch": 2.838078412719984, + "grad_norm": 1.0390386581420898, + "learning_rate": 1.0783543768501179e-05, + "loss": 2.5757, + "step": 31326 + }, + { + "epoch": 2.8381690108944304, + "grad_norm": 0.9993132948875427, + "learning_rate": 1.0777502567510421e-05, + "loss": 2.2977, + "step": 31327 + }, + { + "epoch": 2.8382596090688774, + "grad_norm": 0.9991595149040222, + "learning_rate": 1.0771461366519664e-05, + "loss": 2.8309, + "step": 31328 + }, + { + "epoch": 2.838350207243324, + "grad_norm": 1.0786094665527344, + "learning_rate": 1.0765420165528908e-05, + "loss": 2.5211, + "step": 31329 + }, + { + "epoch": 2.838440805417771, + "grad_norm": 0.9728854894638062, + "learning_rate": 1.075937896453815e-05, + "loss": 2.6426, + "step": 31330 + }, + { + "epoch": 2.8385314035922176, + "grad_norm": 1.0052121877670288, + "learning_rate": 1.0753337763547395e-05, + "loss": 2.6508, + "step": 31331 + }, + { + "epoch": 2.838622001766664, + "grad_norm": 1.0052618980407715, + "learning_rate": 1.0747296562556637e-05, + "loss": 2.4894, + "step": 31332 + }, + { + "epoch": 2.838712599941111, + "grad_norm": 0.9836001396179199, + "learning_rate": 1.074125536156588e-05, + "loss": 2.501, + "step": 31333 + }, + { + "epoch": 2.838803198115558, + "grad_norm": 1.1017048358917236, + "learning_rate": 1.0735214160575123e-05, + "loss": 2.4049, + "step": 31334 + }, + { + "epoch": 2.8388937962900047, + "grad_norm": 0.9315916895866394, + "learning_rate": 1.0729172959584367e-05, + "loss": 2.4773, + "step": 31335 + }, + { + "epoch": 2.8389843944644513, + "grad_norm": 1.0542798042297363, + "learning_rate": 1.072313175859361e-05, + "loss": 2.5957, + "step": 31336 + }, + { + "epoch": 2.8390749926388983, + "grad_norm": 1.1913542747497559, + "learning_rate": 1.0717090557602852e-05, + "loss": 2.6779, + "step": 31337 + }, + { + "epoch": 2.8391655908133453, + "grad_norm": 0.9535613059997559, + "learning_rate": 1.0711049356612094e-05, + "loss": 2.0059, + "step": 31338 + }, + { + "epoch": 2.839256188987792, + "grad_norm": 1.0495356321334839, + "learning_rate": 1.0705008155621339e-05, + "loss": 2.5417, + "step": 31339 + }, + { + "epoch": 2.8393467871622384, + "grad_norm": 0.994920015335083, + "learning_rate": 1.0698966954630581e-05, + "loss": 2.5513, + "step": 31340 + }, + { + "epoch": 2.8394373853366854, + "grad_norm": 0.9918729662895203, + "learning_rate": 1.0692925753639824e-05, + "loss": 2.328, + "step": 31341 + }, + { + "epoch": 2.8395279835111324, + "grad_norm": 1.0496437549591064, + "learning_rate": 1.0686884552649066e-05, + "loss": 2.7663, + "step": 31342 + }, + { + "epoch": 2.839618581685579, + "grad_norm": 0.8969364166259766, + "learning_rate": 1.068084335165831e-05, + "loss": 1.9461, + "step": 31343 + }, + { + "epoch": 2.8397091798600256, + "grad_norm": 0.9504483342170715, + "learning_rate": 1.0674802150667555e-05, + "loss": 2.5944, + "step": 31344 + }, + { + "epoch": 2.8397997780344726, + "grad_norm": 1.0187859535217285, + "learning_rate": 1.0668760949676797e-05, + "loss": 2.6275, + "step": 31345 + }, + { + "epoch": 2.8398903762089196, + "grad_norm": 1.0501689910888672, + "learning_rate": 1.066271974868604e-05, + "loss": 2.7937, + "step": 31346 + }, + { + "epoch": 2.839980974383366, + "grad_norm": 0.9647320508956909, + "learning_rate": 1.0656678547695282e-05, + "loss": 2.6425, + "step": 31347 + }, + { + "epoch": 2.8400715725578127, + "grad_norm": 0.9432615041732788, + "learning_rate": 1.0650637346704526e-05, + "loss": 2.4984, + "step": 31348 + }, + { + "epoch": 2.8401621707322597, + "grad_norm": 0.9975671768188477, + "learning_rate": 1.0644596145713769e-05, + "loss": 2.528, + "step": 31349 + }, + { + "epoch": 2.8402527689067067, + "grad_norm": 1.0646870136260986, + "learning_rate": 1.0638554944723011e-05, + "loss": 2.7576, + "step": 31350 + }, + { + "epoch": 2.8403433670811533, + "grad_norm": 1.0022006034851074, + "learning_rate": 1.0632513743732254e-05, + "loss": 2.7376, + "step": 31351 + }, + { + "epoch": 2.8404339652556, + "grad_norm": 0.9954942464828491, + "learning_rate": 1.0626472542741498e-05, + "loss": 2.5921, + "step": 31352 + }, + { + "epoch": 2.840524563430047, + "grad_norm": 1.0812103748321533, + "learning_rate": 1.062043134175074e-05, + "loss": 2.6514, + "step": 31353 + }, + { + "epoch": 2.840615161604494, + "grad_norm": 1.092882752418518, + "learning_rate": 1.0614390140759983e-05, + "loss": 2.7747, + "step": 31354 + }, + { + "epoch": 2.8407057597789405, + "grad_norm": 0.9073416590690613, + "learning_rate": 1.0608348939769226e-05, + "loss": 2.3231, + "step": 31355 + }, + { + "epoch": 2.840796357953387, + "grad_norm": 0.991098940372467, + "learning_rate": 1.060230773877847e-05, + "loss": 2.6729, + "step": 31356 + }, + { + "epoch": 2.840886956127834, + "grad_norm": 0.9773099422454834, + "learning_rate": 1.0596266537787712e-05, + "loss": 2.4666, + "step": 31357 + }, + { + "epoch": 2.840977554302281, + "grad_norm": 1.030515432357788, + "learning_rate": 1.0590225336796955e-05, + "loss": 2.5625, + "step": 31358 + }, + { + "epoch": 2.8410681524767276, + "grad_norm": 1.0510317087173462, + "learning_rate": 1.0584184135806199e-05, + "loss": 2.4765, + "step": 31359 + }, + { + "epoch": 2.841158750651174, + "grad_norm": 0.9445083737373352, + "learning_rate": 1.0578142934815442e-05, + "loss": 2.818, + "step": 31360 + }, + { + "epoch": 2.841249348825621, + "grad_norm": 1.0345674753189087, + "learning_rate": 1.0572101733824686e-05, + "loss": 2.6, + "step": 31361 + }, + { + "epoch": 2.841339947000068, + "grad_norm": 0.9946334958076477, + "learning_rate": 1.0566060532833928e-05, + "loss": 2.6849, + "step": 31362 + }, + { + "epoch": 2.8414305451745148, + "grad_norm": 1.0267406702041626, + "learning_rate": 1.0560019331843171e-05, + "loss": 2.8846, + "step": 31363 + }, + { + "epoch": 2.8415211433489613, + "grad_norm": 1.032577395439148, + "learning_rate": 1.0553978130852413e-05, + "loss": 2.4453, + "step": 31364 + }, + { + "epoch": 2.8416117415234083, + "grad_norm": 1.0113050937652588, + "learning_rate": 1.0547936929861658e-05, + "loss": 2.6978, + "step": 31365 + }, + { + "epoch": 2.8417023396978554, + "grad_norm": 1.112774133682251, + "learning_rate": 1.05418957288709e-05, + "loss": 2.6459, + "step": 31366 + }, + { + "epoch": 2.841792937872302, + "grad_norm": 0.8933820128440857, + "learning_rate": 1.0535854527880143e-05, + "loss": 1.9104, + "step": 31367 + }, + { + "epoch": 2.8418835360467485, + "grad_norm": 0.983649492263794, + "learning_rate": 1.0529813326889385e-05, + "loss": 2.6272, + "step": 31368 + }, + { + "epoch": 2.8419741342211955, + "grad_norm": 1.047202467918396, + "learning_rate": 1.052377212589863e-05, + "loss": 2.8509, + "step": 31369 + }, + { + "epoch": 2.8420647323956425, + "grad_norm": 0.9963644742965698, + "learning_rate": 1.0517730924907872e-05, + "loss": 2.6182, + "step": 31370 + }, + { + "epoch": 2.842155330570089, + "grad_norm": 0.957763135433197, + "learning_rate": 1.0511689723917115e-05, + "loss": 2.5239, + "step": 31371 + }, + { + "epoch": 2.8422459287445356, + "grad_norm": 1.0724949836730957, + "learning_rate": 1.0505648522926357e-05, + "loss": 2.6617, + "step": 31372 + }, + { + "epoch": 2.8423365269189826, + "grad_norm": 0.8909338712692261, + "learning_rate": 1.0499607321935601e-05, + "loss": 1.9711, + "step": 31373 + }, + { + "epoch": 2.842427125093429, + "grad_norm": 0.9197681546211243, + "learning_rate": 1.0493566120944845e-05, + "loss": 2.0315, + "step": 31374 + }, + { + "epoch": 2.842517723267876, + "grad_norm": 1.023857831954956, + "learning_rate": 1.0487524919954088e-05, + "loss": 2.5562, + "step": 31375 + }, + { + "epoch": 2.842608321442323, + "grad_norm": 0.959017276763916, + "learning_rate": 1.048148371896333e-05, + "loss": 2.5081, + "step": 31376 + }, + { + "epoch": 2.84269891961677, + "grad_norm": 1.0604673624038696, + "learning_rate": 1.0475442517972573e-05, + "loss": 2.7303, + "step": 31377 + }, + { + "epoch": 2.8427895177912164, + "grad_norm": 1.261610507965088, + "learning_rate": 1.0469401316981817e-05, + "loss": 2.304, + "step": 31378 + }, + { + "epoch": 2.8428801159656634, + "grad_norm": 1.0226296186447144, + "learning_rate": 1.046336011599106e-05, + "loss": 2.5468, + "step": 31379 + }, + { + "epoch": 2.84297071414011, + "grad_norm": 0.988439679145813, + "learning_rate": 1.0457318915000302e-05, + "loss": 2.5441, + "step": 31380 + }, + { + "epoch": 2.843061312314557, + "grad_norm": 1.1044055223464966, + "learning_rate": 1.0451277714009545e-05, + "loss": 2.5281, + "step": 31381 + }, + { + "epoch": 2.8431519104890035, + "grad_norm": 1.0278533697128296, + "learning_rate": 1.0445236513018789e-05, + "loss": 2.4632, + "step": 31382 + }, + { + "epoch": 2.8432425086634505, + "grad_norm": 1.0416327714920044, + "learning_rate": 1.0439195312028032e-05, + "loss": 2.7688, + "step": 31383 + }, + { + "epoch": 2.843333106837897, + "grad_norm": 1.031652808189392, + "learning_rate": 1.0433154111037274e-05, + "loss": 2.6788, + "step": 31384 + }, + { + "epoch": 2.843423705012344, + "grad_norm": 0.9097333550453186, + "learning_rate": 1.0427112910046517e-05, + "loss": 1.8658, + "step": 31385 + }, + { + "epoch": 2.8435143031867907, + "grad_norm": 0.9640071392059326, + "learning_rate": 1.042107170905576e-05, + "loss": 2.7498, + "step": 31386 + }, + { + "epoch": 2.8436049013612377, + "grad_norm": 0.9966464042663574, + "learning_rate": 1.0415030508065003e-05, + "loss": 2.53, + "step": 31387 + }, + { + "epoch": 2.8436954995356842, + "grad_norm": 0.9373243451118469, + "learning_rate": 1.0408989307074248e-05, + "loss": 2.7027, + "step": 31388 + }, + { + "epoch": 2.8437860977101312, + "grad_norm": 0.9964886903762817, + "learning_rate": 1.040294810608349e-05, + "loss": 2.4843, + "step": 31389 + }, + { + "epoch": 2.843876695884578, + "grad_norm": 0.895315408706665, + "learning_rate": 1.0396906905092733e-05, + "loss": 1.8578, + "step": 31390 + }, + { + "epoch": 2.843967294059025, + "grad_norm": 0.9459967613220215, + "learning_rate": 1.0390865704101977e-05, + "loss": 2.5925, + "step": 31391 + }, + { + "epoch": 2.8440578922334714, + "grad_norm": 1.154191017150879, + "learning_rate": 1.038482450311122e-05, + "loss": 2.7226, + "step": 31392 + }, + { + "epoch": 2.8441484904079184, + "grad_norm": 1.0284141302108765, + "learning_rate": 1.0378783302120462e-05, + "loss": 2.8037, + "step": 31393 + }, + { + "epoch": 2.844239088582365, + "grad_norm": 1.1016639471054077, + "learning_rate": 1.0372742101129704e-05, + "loss": 2.6083, + "step": 31394 + }, + { + "epoch": 2.844329686756812, + "grad_norm": 0.962601900100708, + "learning_rate": 1.0366700900138949e-05, + "loss": 2.5622, + "step": 31395 + }, + { + "epoch": 2.8444202849312585, + "grad_norm": 1.0624090433120728, + "learning_rate": 1.0360659699148191e-05, + "loss": 2.5963, + "step": 31396 + }, + { + "epoch": 2.8445108831057055, + "grad_norm": 1.1144920587539673, + "learning_rate": 1.0354618498157434e-05, + "loss": 2.632, + "step": 31397 + }, + { + "epoch": 2.844601481280152, + "grad_norm": 0.9517594575881958, + "learning_rate": 1.0348577297166676e-05, + "loss": 2.4571, + "step": 31398 + }, + { + "epoch": 2.844692079454599, + "grad_norm": 0.8455394506454468, + "learning_rate": 1.034253609617592e-05, + "loss": 1.876, + "step": 31399 + }, + { + "epoch": 2.8447826776290457, + "grad_norm": 1.0304383039474487, + "learning_rate": 1.0336494895185163e-05, + "loss": 2.7787, + "step": 31400 + }, + { + "epoch": 2.8448732758034927, + "grad_norm": 1.0053925514221191, + "learning_rate": 1.0330453694194405e-05, + "loss": 2.6205, + "step": 31401 + }, + { + "epoch": 2.8449638739779393, + "grad_norm": 1.0872046947479248, + "learning_rate": 1.0324412493203648e-05, + "loss": 2.6482, + "step": 31402 + }, + { + "epoch": 2.8450544721523863, + "grad_norm": 0.8524315357208252, + "learning_rate": 1.0318371292212892e-05, + "loss": 1.9767, + "step": 31403 + }, + { + "epoch": 2.845145070326833, + "grad_norm": 0.8749848008155823, + "learning_rate": 1.0312330091222136e-05, + "loss": 2.0918, + "step": 31404 + }, + { + "epoch": 2.84523566850128, + "grad_norm": 0.9450026750564575, + "learning_rate": 1.0306288890231379e-05, + "loss": 2.367, + "step": 31405 + }, + { + "epoch": 2.8453262666757264, + "grad_norm": 1.0679304599761963, + "learning_rate": 1.0300247689240621e-05, + "loss": 2.6159, + "step": 31406 + }, + { + "epoch": 2.8454168648501734, + "grad_norm": 1.0661801099777222, + "learning_rate": 1.0294206488249864e-05, + "loss": 2.9414, + "step": 31407 + }, + { + "epoch": 2.84550746302462, + "grad_norm": 1.0081813335418701, + "learning_rate": 1.0288165287259108e-05, + "loss": 2.6515, + "step": 31408 + }, + { + "epoch": 2.845598061199067, + "grad_norm": 1.0544474124908447, + "learning_rate": 1.028212408626835e-05, + "loss": 2.5588, + "step": 31409 + }, + { + "epoch": 2.8456886593735136, + "grad_norm": 0.969706654548645, + "learning_rate": 1.0276082885277593e-05, + "loss": 2.5668, + "step": 31410 + }, + { + "epoch": 2.8457792575479606, + "grad_norm": 1.0333185195922852, + "learning_rate": 1.0270041684286836e-05, + "loss": 2.7323, + "step": 31411 + }, + { + "epoch": 2.845869855722407, + "grad_norm": 1.0298986434936523, + "learning_rate": 1.026400048329608e-05, + "loss": 2.5996, + "step": 31412 + }, + { + "epoch": 2.845960453896854, + "grad_norm": 0.9518460035324097, + "learning_rate": 1.0257959282305323e-05, + "loss": 1.8559, + "step": 31413 + }, + { + "epoch": 2.8460510520713007, + "grad_norm": 1.0295982360839844, + "learning_rate": 1.0251918081314565e-05, + "loss": 2.8554, + "step": 31414 + }, + { + "epoch": 2.8461416502457473, + "grad_norm": 1.016176700592041, + "learning_rate": 1.0245876880323808e-05, + "loss": 2.5415, + "step": 31415 + }, + { + "epoch": 2.8462322484201943, + "grad_norm": 1.016828179359436, + "learning_rate": 1.0239835679333052e-05, + "loss": 2.4741, + "step": 31416 + }, + { + "epoch": 2.8463228465946413, + "grad_norm": 1.0478969812393188, + "learning_rate": 1.0233794478342294e-05, + "loss": 2.4898, + "step": 31417 + }, + { + "epoch": 2.846413444769088, + "grad_norm": 1.0653902292251587, + "learning_rate": 1.0227753277351539e-05, + "loss": 2.3199, + "step": 31418 + }, + { + "epoch": 2.8465040429435344, + "grad_norm": 0.9830330014228821, + "learning_rate": 1.0221712076360781e-05, + "loss": 2.4885, + "step": 31419 + }, + { + "epoch": 2.8465946411179814, + "grad_norm": 1.0309059619903564, + "learning_rate": 1.0215670875370025e-05, + "loss": 2.8307, + "step": 31420 + }, + { + "epoch": 2.8466852392924284, + "grad_norm": 1.128264307975769, + "learning_rate": 1.0209629674379268e-05, + "loss": 2.5432, + "step": 31421 + }, + { + "epoch": 2.846775837466875, + "grad_norm": 1.095266580581665, + "learning_rate": 1.020358847338851e-05, + "loss": 2.6783, + "step": 31422 + }, + { + "epoch": 2.8468664356413216, + "grad_norm": 0.979080080986023, + "learning_rate": 1.0197547272397753e-05, + "loss": 2.542, + "step": 31423 + }, + { + "epoch": 2.8469570338157686, + "grad_norm": 1.0957324504852295, + "learning_rate": 1.0191506071406997e-05, + "loss": 2.564, + "step": 31424 + }, + { + "epoch": 2.8470476319902156, + "grad_norm": 1.0048166513442993, + "learning_rate": 1.018546487041624e-05, + "loss": 2.5083, + "step": 31425 + }, + { + "epoch": 2.847138230164662, + "grad_norm": 1.1049745082855225, + "learning_rate": 1.0179423669425482e-05, + "loss": 2.7894, + "step": 31426 + }, + { + "epoch": 2.8472288283391087, + "grad_norm": 1.0955301523208618, + "learning_rate": 1.0173382468434725e-05, + "loss": 2.5965, + "step": 31427 + }, + { + "epoch": 2.8473194265135557, + "grad_norm": 0.9833470582962036, + "learning_rate": 1.0167341267443969e-05, + "loss": 2.8126, + "step": 31428 + }, + { + "epoch": 2.8474100246880027, + "grad_norm": 1.096684455871582, + "learning_rate": 1.0161300066453211e-05, + "loss": 2.7199, + "step": 31429 + }, + { + "epoch": 2.8475006228624493, + "grad_norm": 0.956697940826416, + "learning_rate": 1.0155258865462454e-05, + "loss": 2.8158, + "step": 31430 + }, + { + "epoch": 2.847591221036896, + "grad_norm": 1.0556142330169678, + "learning_rate": 1.0149217664471696e-05, + "loss": 2.7369, + "step": 31431 + }, + { + "epoch": 2.847681819211343, + "grad_norm": 1.0170912742614746, + "learning_rate": 1.014317646348094e-05, + "loss": 2.5875, + "step": 31432 + }, + { + "epoch": 2.84777241738579, + "grad_norm": 1.0416083335876465, + "learning_rate": 1.0137135262490185e-05, + "loss": 2.5523, + "step": 31433 + }, + { + "epoch": 2.8478630155602365, + "grad_norm": 1.1106654405593872, + "learning_rate": 1.0131094061499427e-05, + "loss": 2.6267, + "step": 31434 + }, + { + "epoch": 2.847953613734683, + "grad_norm": 0.97157222032547, + "learning_rate": 1.012505286050867e-05, + "loss": 2.4315, + "step": 31435 + }, + { + "epoch": 2.84804421190913, + "grad_norm": 1.058053970336914, + "learning_rate": 1.0119011659517912e-05, + "loss": 2.5647, + "step": 31436 + }, + { + "epoch": 2.848134810083577, + "grad_norm": 0.9738776087760925, + "learning_rate": 1.0112970458527157e-05, + "loss": 2.7614, + "step": 31437 + }, + { + "epoch": 2.8482254082580236, + "grad_norm": 1.1266913414001465, + "learning_rate": 1.01069292575364e-05, + "loss": 2.5666, + "step": 31438 + }, + { + "epoch": 2.84831600643247, + "grad_norm": 0.9640421271324158, + "learning_rate": 1.0100888056545642e-05, + "loss": 2.5775, + "step": 31439 + }, + { + "epoch": 2.848406604606917, + "grad_norm": 1.0704820156097412, + "learning_rate": 1.0094846855554884e-05, + "loss": 2.699, + "step": 31440 + }, + { + "epoch": 2.848497202781364, + "grad_norm": 0.9569973945617676, + "learning_rate": 1.0088805654564128e-05, + "loss": 2.6936, + "step": 31441 + }, + { + "epoch": 2.8485878009558108, + "grad_norm": 0.9451251029968262, + "learning_rate": 1.0082764453573371e-05, + "loss": 2.5629, + "step": 31442 + }, + { + "epoch": 2.8486783991302573, + "grad_norm": 0.9983781576156616, + "learning_rate": 1.0076723252582613e-05, + "loss": 2.7144, + "step": 31443 + }, + { + "epoch": 2.8487689973047043, + "grad_norm": 0.9805522561073303, + "learning_rate": 1.0070682051591856e-05, + "loss": 2.5397, + "step": 31444 + }, + { + "epoch": 2.8488595954791514, + "grad_norm": 1.0961523056030273, + "learning_rate": 1.00646408506011e-05, + "loss": 2.2955, + "step": 31445 + }, + { + "epoch": 2.848950193653598, + "grad_norm": 0.9797564148902893, + "learning_rate": 1.0058599649610343e-05, + "loss": 2.5031, + "step": 31446 + }, + { + "epoch": 2.8490407918280445, + "grad_norm": 1.0340745449066162, + "learning_rate": 1.0052558448619587e-05, + "loss": 2.5334, + "step": 31447 + }, + { + "epoch": 2.8491313900024915, + "grad_norm": 1.1137995719909668, + "learning_rate": 1.004651724762883e-05, + "loss": 2.5319, + "step": 31448 + }, + { + "epoch": 2.8492219881769385, + "grad_norm": 0.9407480955123901, + "learning_rate": 1.0040476046638072e-05, + "loss": 2.611, + "step": 31449 + }, + { + "epoch": 2.849312586351385, + "grad_norm": 1.0185608863830566, + "learning_rate": 1.0034434845647316e-05, + "loss": 2.7755, + "step": 31450 + }, + { + "epoch": 2.8494031845258316, + "grad_norm": 0.9961484670639038, + "learning_rate": 1.0028393644656559e-05, + "loss": 2.5299, + "step": 31451 + }, + { + "epoch": 2.8494937827002786, + "grad_norm": 0.9967892169952393, + "learning_rate": 1.0022352443665801e-05, + "loss": 1.7932, + "step": 31452 + }, + { + "epoch": 2.849584380874725, + "grad_norm": 1.1038575172424316, + "learning_rate": 1.0016311242675044e-05, + "loss": 2.6447, + "step": 31453 + }, + { + "epoch": 2.849674979049172, + "grad_norm": 0.9957420229911804, + "learning_rate": 1.0010270041684288e-05, + "loss": 2.863, + "step": 31454 + }, + { + "epoch": 2.849765577223619, + "grad_norm": 1.0893168449401855, + "learning_rate": 1.000422884069353e-05, + "loss": 2.7847, + "step": 31455 + }, + { + "epoch": 2.849856175398066, + "grad_norm": 1.0408533811569214, + "learning_rate": 9.998187639702773e-06, + "loss": 2.4809, + "step": 31456 + }, + { + "epoch": 2.8499467735725124, + "grad_norm": 0.9907143712043762, + "learning_rate": 9.992146438712016e-06, + "loss": 2.3681, + "step": 31457 + }, + { + "epoch": 2.8500373717469594, + "grad_norm": 0.9672520160675049, + "learning_rate": 9.98610523772126e-06, + "loss": 2.5661, + "step": 31458 + }, + { + "epoch": 2.850127969921406, + "grad_norm": 1.0040905475616455, + "learning_rate": 9.980064036730502e-06, + "loss": 2.5235, + "step": 31459 + }, + { + "epoch": 2.850218568095853, + "grad_norm": 0.9188793897628784, + "learning_rate": 9.974022835739745e-06, + "loss": 2.0059, + "step": 31460 + }, + { + "epoch": 2.8503091662702995, + "grad_norm": 1.0287574529647827, + "learning_rate": 9.967981634748987e-06, + "loss": 2.9569, + "step": 31461 + }, + { + "epoch": 2.8503997644447465, + "grad_norm": 1.0960756540298462, + "learning_rate": 9.961940433758232e-06, + "loss": 2.3916, + "step": 31462 + }, + { + "epoch": 2.850490362619193, + "grad_norm": 0.9536685347557068, + "learning_rate": 9.955899232767476e-06, + "loss": 2.5227, + "step": 31463 + }, + { + "epoch": 2.85058096079364, + "grad_norm": 1.0502700805664062, + "learning_rate": 9.949858031776718e-06, + "loss": 2.3068, + "step": 31464 + }, + { + "epoch": 2.8506715589680867, + "grad_norm": 1.0123670101165771, + "learning_rate": 9.943816830785961e-06, + "loss": 2.6225, + "step": 31465 + }, + { + "epoch": 2.8507621571425337, + "grad_norm": 0.8431705236434937, + "learning_rate": 9.937775629795203e-06, + "loss": 1.6298, + "step": 31466 + }, + { + "epoch": 2.8508527553169802, + "grad_norm": 0.863425076007843, + "learning_rate": 9.931734428804448e-06, + "loss": 1.9607, + "step": 31467 + }, + { + "epoch": 2.8509433534914272, + "grad_norm": 1.1591674089431763, + "learning_rate": 9.92569322781369e-06, + "loss": 2.7043, + "step": 31468 + }, + { + "epoch": 2.851033951665874, + "grad_norm": 1.0911785364151, + "learning_rate": 9.919652026822933e-06, + "loss": 2.362, + "step": 31469 + }, + { + "epoch": 2.851124549840321, + "grad_norm": 0.9152441620826721, + "learning_rate": 9.913610825832175e-06, + "loss": 2.5545, + "step": 31470 + }, + { + "epoch": 2.8512151480147674, + "grad_norm": 0.9869197607040405, + "learning_rate": 9.90756962484142e-06, + "loss": 2.5007, + "step": 31471 + }, + { + "epoch": 2.8513057461892144, + "grad_norm": 0.9613575339317322, + "learning_rate": 9.901528423850662e-06, + "loss": 2.3046, + "step": 31472 + }, + { + "epoch": 2.851396344363661, + "grad_norm": 0.970199465751648, + "learning_rate": 9.895487222859904e-06, + "loss": 2.2486, + "step": 31473 + }, + { + "epoch": 2.851486942538108, + "grad_norm": 0.9968077540397644, + "learning_rate": 9.889446021869147e-06, + "loss": 2.6559, + "step": 31474 + }, + { + "epoch": 2.8515775407125545, + "grad_norm": 0.8873865008354187, + "learning_rate": 9.883404820878391e-06, + "loss": 2.212, + "step": 31475 + }, + { + "epoch": 2.8516681388870015, + "grad_norm": 0.9625951647758484, + "learning_rate": 9.877363619887634e-06, + "loss": 2.4878, + "step": 31476 + }, + { + "epoch": 2.851758737061448, + "grad_norm": 0.9483526945114136, + "learning_rate": 9.871322418896878e-06, + "loss": 2.5982, + "step": 31477 + }, + { + "epoch": 2.851849335235895, + "grad_norm": 0.9956904649734497, + "learning_rate": 9.86528121790612e-06, + "loss": 2.4305, + "step": 31478 + }, + { + "epoch": 2.8519399334103417, + "grad_norm": 1.0445581674575806, + "learning_rate": 9.859240016915363e-06, + "loss": 2.6161, + "step": 31479 + }, + { + "epoch": 2.8520305315847887, + "grad_norm": 0.9671469330787659, + "learning_rate": 9.853198815924607e-06, + "loss": 2.4798, + "step": 31480 + }, + { + "epoch": 2.8521211297592353, + "grad_norm": 0.9791136980056763, + "learning_rate": 9.84715761493385e-06, + "loss": 2.3692, + "step": 31481 + }, + { + "epoch": 2.8522117279336823, + "grad_norm": 0.9823957085609436, + "learning_rate": 9.841116413943092e-06, + "loss": 2.5061, + "step": 31482 + }, + { + "epoch": 2.852302326108129, + "grad_norm": 1.0958309173583984, + "learning_rate": 9.835075212952335e-06, + "loss": 2.5446, + "step": 31483 + }, + { + "epoch": 2.852392924282576, + "grad_norm": 0.9348717927932739, + "learning_rate": 9.829034011961579e-06, + "loss": 2.5075, + "step": 31484 + }, + { + "epoch": 2.8524835224570224, + "grad_norm": 0.9989269375801086, + "learning_rate": 9.822992810970821e-06, + "loss": 2.3141, + "step": 31485 + }, + { + "epoch": 2.8525741206314694, + "grad_norm": 0.9970750212669373, + "learning_rate": 9.816951609980064e-06, + "loss": 2.4891, + "step": 31486 + }, + { + "epoch": 2.852664718805916, + "grad_norm": 0.883281946182251, + "learning_rate": 9.810910408989307e-06, + "loss": 1.769, + "step": 31487 + }, + { + "epoch": 2.852755316980363, + "grad_norm": 1.0475612878799438, + "learning_rate": 9.80486920799855e-06, + "loss": 2.8296, + "step": 31488 + }, + { + "epoch": 2.8528459151548096, + "grad_norm": 1.1054238080978394, + "learning_rate": 9.798828007007793e-06, + "loss": 2.517, + "step": 31489 + }, + { + "epoch": 2.8529365133292566, + "grad_norm": 1.154469609260559, + "learning_rate": 9.792786806017036e-06, + "loss": 2.4397, + "step": 31490 + }, + { + "epoch": 2.853027111503703, + "grad_norm": 0.8649902939796448, + "learning_rate": 9.786745605026278e-06, + "loss": 1.7623, + "step": 31491 + }, + { + "epoch": 2.85311770967815, + "grad_norm": 0.9767774939537048, + "learning_rate": 9.780704404035523e-06, + "loss": 2.5696, + "step": 31492 + }, + { + "epoch": 2.8532083078525967, + "grad_norm": 0.9415722489356995, + "learning_rate": 9.774663203044767e-06, + "loss": 2.7491, + "step": 31493 + }, + { + "epoch": 2.8532989060270433, + "grad_norm": 1.0184612274169922, + "learning_rate": 9.76862200205401e-06, + "loss": 2.7194, + "step": 31494 + }, + { + "epoch": 2.8533895042014903, + "grad_norm": 1.030782699584961, + "learning_rate": 9.762580801063252e-06, + "loss": 2.4757, + "step": 31495 + }, + { + "epoch": 2.8534801023759373, + "grad_norm": 0.9289295077323914, + "learning_rate": 9.756539600072494e-06, + "loss": 2.5027, + "step": 31496 + }, + { + "epoch": 2.853570700550384, + "grad_norm": 0.9901664853096008, + "learning_rate": 9.750498399081739e-06, + "loss": 2.5717, + "step": 31497 + }, + { + "epoch": 2.8536612987248304, + "grad_norm": 1.0184725522994995, + "learning_rate": 9.744457198090981e-06, + "loss": 2.5602, + "step": 31498 + }, + { + "epoch": 2.8537518968992774, + "grad_norm": 1.0533796548843384, + "learning_rate": 9.738415997100224e-06, + "loss": 2.4693, + "step": 31499 + }, + { + "epoch": 2.8538424950737245, + "grad_norm": 0.9884901642799377, + "learning_rate": 9.732374796109466e-06, + "loss": 2.6101, + "step": 31500 + } + ], + "logging_steps": 1, + "max_steps": 33111, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.822335635976327e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}