{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.079734219269103, "eval_steps": 500, "global_step": 650, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0016611295681063123, "grad_norm": 283.7961630862445, "learning_rate": 2.7027027027027028e-08, "loss": 1.0013, "step": 1 }, { "epoch": 0.0033222591362126247, "grad_norm": 403.0161196007378, "learning_rate": 5.4054054054054056e-08, "loss": 1.3486, "step": 2 }, { "epoch": 0.0049833887043189366, "grad_norm": 225.65685288846345, "learning_rate": 8.108108108108108e-08, "loss": 0.9601, "step": 3 }, { "epoch": 0.006644518272425249, "grad_norm": 309.07994612900893, "learning_rate": 1.0810810810810811e-07, "loss": 0.814, "step": 4 }, { "epoch": 0.008305647840531562, "grad_norm": 380.5732961356113, "learning_rate": 1.3513513513513515e-07, "loss": 0.9665, "step": 5 }, { "epoch": 0.009966777408637873, "grad_norm": 350.2306529649741, "learning_rate": 1.6216216216216215e-07, "loss": 0.9394, "step": 6 }, { "epoch": 0.011627906976744186, "grad_norm": 303.16533146413684, "learning_rate": 1.891891891891892e-07, "loss": 1.1157, "step": 7 }, { "epoch": 0.013289036544850499, "grad_norm": 330.3322177024234, "learning_rate": 2.1621621621621622e-07, "loss": 0.8766, "step": 8 }, { "epoch": 0.014950166112956811, "grad_norm": 232.1781635901737, "learning_rate": 2.4324324324324326e-07, "loss": 0.7901, "step": 9 }, { "epoch": 0.016611295681063124, "grad_norm": 303.49257454610745, "learning_rate": 2.702702702702703e-07, "loss": 0.875, "step": 10 }, { "epoch": 0.018272425249169437, "grad_norm": 191.62808346871842, "learning_rate": 2.972972972972973e-07, "loss": 0.6306, "step": 11 }, { "epoch": 0.019933554817275746, "grad_norm": 184.28128658857395, "learning_rate": 3.243243243243243e-07, "loss": 0.6915, "step": 12 }, { "epoch": 0.02159468438538206, "grad_norm": 466.93407676031165, "learning_rate": 3.5135135135135134e-07, "loss": 0.6333, "step": 13 }, { "epoch": 0.023255813953488372, "grad_norm": 146.99367069527102, "learning_rate": 3.783783783783784e-07, "loss": 0.4615, "step": 14 }, { "epoch": 0.024916943521594685, "grad_norm": 146.6363236745373, "learning_rate": 4.054054054054054e-07, "loss": 0.6328, "step": 15 }, { "epoch": 0.026578073089700997, "grad_norm": 196.3416905532426, "learning_rate": 4.3243243243243244e-07, "loss": 0.6118, "step": 16 }, { "epoch": 0.02823920265780731, "grad_norm": 302.44153365360745, "learning_rate": 4.594594594594595e-07, "loss": 0.5397, "step": 17 }, { "epoch": 0.029900332225913623, "grad_norm": 8753.401359110785, "learning_rate": 4.864864864864865e-07, "loss": 0.6609, "step": 18 }, { "epoch": 0.03156146179401993, "grad_norm": 139.31291333841494, "learning_rate": 5.135135135135134e-07, "loss": 0.4247, "step": 19 }, { "epoch": 0.03322259136212625, "grad_norm": 117.2303002306334, "learning_rate": 5.405405405405406e-07, "loss": 0.5025, "step": 20 }, { "epoch": 0.03488372093023256, "grad_norm": 159.0527667166082, "learning_rate": 5.675675675675675e-07, "loss": 0.4789, "step": 21 }, { "epoch": 0.036544850498338874, "grad_norm": 211.30553229145636, "learning_rate": 5.945945945945947e-07, "loss": 0.6005, "step": 22 }, { "epoch": 0.03820598006644518, "grad_norm": 74.75774440900648, "learning_rate": 6.216216216216216e-07, "loss": 0.4322, "step": 23 }, { "epoch": 0.03986710963455149, "grad_norm": 152.6630766770394, "learning_rate": 6.486486486486486e-07, "loss": 0.6084, "step": 24 }, { "epoch": 0.04152823920265781, "grad_norm": 68.59356856504525, "learning_rate": 6.756756756756756e-07, "loss": 0.3271, "step": 25 }, { "epoch": 0.04318936877076412, "grad_norm": 48.213900231519695, "learning_rate": 7.027027027027027e-07, "loss": 0.3455, "step": 26 }, { "epoch": 0.044850498338870434, "grad_norm": 50.0637038462766, "learning_rate": 7.297297297297297e-07, "loss": 0.3409, "step": 27 }, { "epoch": 0.046511627906976744, "grad_norm": 64.20573798423982, "learning_rate": 7.567567567567568e-07, "loss": 0.448, "step": 28 }, { "epoch": 0.04817275747508306, "grad_norm": 132.42160531061333, "learning_rate": 7.837837837837838e-07, "loss": 0.3915, "step": 29 }, { "epoch": 0.04983388704318937, "grad_norm": 101.32898633286817, "learning_rate": 8.108108108108108e-07, "loss": 0.3159, "step": 30 }, { "epoch": 0.05149501661129568, "grad_norm": 67.44357173798667, "learning_rate": 8.378378378378377e-07, "loss": 0.2786, "step": 31 }, { "epoch": 0.053156146179401995, "grad_norm": 94.87155550615857, "learning_rate": 8.648648648648649e-07, "loss": 0.4167, "step": 32 }, { "epoch": 0.054817275747508304, "grad_norm": 120.07807866412833, "learning_rate": 8.918918918918918e-07, "loss": 0.3996, "step": 33 }, { "epoch": 0.05647840531561462, "grad_norm": 53.547698067900235, "learning_rate": 9.18918918918919e-07, "loss": 0.3981, "step": 34 }, { "epoch": 0.05813953488372093, "grad_norm": 89.39620348618399, "learning_rate": 9.459459459459459e-07, "loss": 0.3523, "step": 35 }, { "epoch": 0.059800664451827246, "grad_norm": 125.63871272969014, "learning_rate": 9.72972972972973e-07, "loss": 0.343, "step": 36 }, { "epoch": 0.061461794019933555, "grad_norm": 85.85489825737757, "learning_rate": 1e-06, "loss": 0.3063, "step": 37 }, { "epoch": 0.06312292358803986, "grad_norm": 56.63108841849816, "learning_rate": 9.999981882520454e-07, "loss": 0.4455, "step": 38 }, { "epoch": 0.06478405315614617, "grad_norm": 40.40412484077134, "learning_rate": 9.999927530213112e-07, "loss": 0.3411, "step": 39 }, { "epoch": 0.0664451827242525, "grad_norm": 42.56638423746503, "learning_rate": 9.999836943471866e-07, "loss": 0.3422, "step": 40 }, { "epoch": 0.0681063122923588, "grad_norm": 149.52998978810905, "learning_rate": 9.999710122953198e-07, "loss": 0.3539, "step": 41 }, { "epoch": 0.06976744186046512, "grad_norm": 99.06075800406914, "learning_rate": 9.999547069576173e-07, "loss": 0.3705, "step": 42 }, { "epoch": 0.07142857142857142, "grad_norm": 28.75050949230183, "learning_rate": 9.99934778452244e-07, "loss": 0.2556, "step": 43 }, { "epoch": 0.07308970099667775, "grad_norm": 102.07599788982593, "learning_rate": 9.999112269236213e-07, "loss": 0.3375, "step": 44 }, { "epoch": 0.07475083056478406, "grad_norm": 94.98798632226429, "learning_rate": 9.99884052542427e-07, "loss": 0.325, "step": 45 }, { "epoch": 0.07641196013289037, "grad_norm": 45.49599727736315, "learning_rate": 9.99853255505594e-07, "loss": 0.3344, "step": 46 }, { "epoch": 0.07807308970099668, "grad_norm": 437.1776518783744, "learning_rate": 9.99818836036308e-07, "loss": 0.3195, "step": 47 }, { "epoch": 0.07973421926910298, "grad_norm": 40.386876297214194, "learning_rate": 9.997807943840063e-07, "loss": 0.2935, "step": 48 }, { "epoch": 0.08139534883720931, "grad_norm": 35.81499917192016, "learning_rate": 9.997391308243767e-07, "loss": 0.3221, "step": 49 }, { "epoch": 0.08305647840531562, "grad_norm": 135.69410164163278, "learning_rate": 9.996938456593547e-07, "loss": 0.3641, "step": 50 }, { "epoch": 0.08471760797342193, "grad_norm": 37.49737579141678, "learning_rate": 9.996449392171216e-07, "loss": 0.3116, "step": 51 }, { "epoch": 0.08637873754152824, "grad_norm": 53.54810233248991, "learning_rate": 9.995924118521016e-07, "loss": 0.2374, "step": 52 }, { "epoch": 0.08803986710963455, "grad_norm": 29.764129425896126, "learning_rate": 9.995362639449604e-07, "loss": 0.3214, "step": 53 }, { "epoch": 0.08970099667774087, "grad_norm": 86.36715802553283, "learning_rate": 9.994764959026014e-07, "loss": 0.2724, "step": 54 }, { "epoch": 0.09136212624584718, "grad_norm": 59.60483629138851, "learning_rate": 9.99413108158163e-07, "loss": 0.2682, "step": 55 }, { "epoch": 0.09302325581395349, "grad_norm": 1121.1809969504397, "learning_rate": 9.99346101171016e-07, "loss": 0.3457, "step": 56 }, { "epoch": 0.0946843853820598, "grad_norm": 36.4371245053598, "learning_rate": 9.99275475426759e-07, "loss": 0.3518, "step": 57 }, { "epoch": 0.09634551495016612, "grad_norm": 36.679857127723125, "learning_rate": 9.992012314372164e-07, "loss": 0.1912, "step": 58 }, { "epoch": 0.09800664451827243, "grad_norm": 168.93933955821868, "learning_rate": 9.991233697404337e-07, "loss": 0.2478, "step": 59 }, { "epoch": 0.09966777408637874, "grad_norm": 24.14622539640721, "learning_rate": 9.990418909006743e-07, "loss": 0.2118, "step": 60 }, { "epoch": 0.10132890365448505, "grad_norm": 43.18341622294434, "learning_rate": 9.989567955084143e-07, "loss": 0.3924, "step": 61 }, { "epoch": 0.10299003322259136, "grad_norm": 29.085536311011936, "learning_rate": 9.988680841803396e-07, "loss": 0.2878, "step": 62 }, { "epoch": 0.10465116279069768, "grad_norm": 24.284878773871643, "learning_rate": 9.987757575593402e-07, "loss": 0.1948, "step": 63 }, { "epoch": 0.10631229235880399, "grad_norm": 27.805278117108944, "learning_rate": 9.986798163145066e-07, "loss": 0.2563, "step": 64 }, { "epoch": 0.1079734219269103, "grad_norm": 22.56960935878, "learning_rate": 9.985802611411243e-07, "loss": 0.2298, "step": 65 }, { "epoch": 0.10963455149501661, "grad_norm": 37.482492845739586, "learning_rate": 9.984770927606686e-07, "loss": 0.2785, "step": 66 }, { "epoch": 0.11129568106312292, "grad_norm": 23.848350011819495, "learning_rate": 9.983703119207998e-07, "loss": 0.2113, "step": 67 }, { "epoch": 0.11295681063122924, "grad_norm": 30.649497045880594, "learning_rate": 9.98259919395358e-07, "loss": 0.255, "step": 68 }, { "epoch": 0.11461794019933555, "grad_norm": 23.006604632466246, "learning_rate": 9.98145915984357e-07, "loss": 0.224, "step": 69 }, { "epoch": 0.11627906976744186, "grad_norm": 21.882953775970815, "learning_rate": 9.98028302513978e-07, "loss": 0.2616, "step": 70 }, { "epoch": 0.11794019933554817, "grad_norm": 25.05081248063707, "learning_rate": 9.97907079836566e-07, "loss": 0.2128, "step": 71 }, { "epoch": 0.11960132890365449, "grad_norm": 31.598368685329934, "learning_rate": 9.977822488306195e-07, "loss": 0.3792, "step": 72 }, { "epoch": 0.1212624584717608, "grad_norm": 37.48759519077218, "learning_rate": 9.976538104007886e-07, "loss": 0.2736, "step": 73 }, { "epoch": 0.12292358803986711, "grad_norm": 19.638414416569248, "learning_rate": 9.975217654778651e-07, "loss": 0.1277, "step": 74 }, { "epoch": 0.12458471760797342, "grad_norm": 29.04555059332869, "learning_rate": 9.97386115018778e-07, "loss": 0.3141, "step": 75 }, { "epoch": 0.12624584717607973, "grad_norm": 21.430456032128323, "learning_rate": 9.972468600065845e-07, "loss": 0.2253, "step": 76 }, { "epoch": 0.12790697674418605, "grad_norm": 41.85901544277914, "learning_rate": 9.971040014504648e-07, "loss": 0.3621, "step": 77 }, { "epoch": 0.12956810631229235, "grad_norm": 14.223035223233115, "learning_rate": 9.969575403857135e-07, "loss": 0.1284, "step": 78 }, { "epoch": 0.13122923588039867, "grad_norm": 22.262799198485006, "learning_rate": 9.968074778737332e-07, "loss": 0.2524, "step": 79 }, { "epoch": 0.132890365448505, "grad_norm": 50.697373968189815, "learning_rate": 9.966538150020252e-07, "loss": 0.2189, "step": 80 }, { "epoch": 0.1345514950166113, "grad_norm": 21.86462698311471, "learning_rate": 9.964965528841833e-07, "loss": 0.2334, "step": 81 }, { "epoch": 0.1362126245847176, "grad_norm": 30.03903059222607, "learning_rate": 9.963356926598848e-07, "loss": 0.2619, "step": 82 }, { "epoch": 0.1378737541528239, "grad_norm": 44.39420516166669, "learning_rate": 9.961712354948822e-07, "loss": 0.3148, "step": 83 }, { "epoch": 0.13953488372093023, "grad_norm": 45.40184063467476, "learning_rate": 9.960031825809955e-07, "loss": 0.2719, "step": 84 }, { "epoch": 0.14119601328903655, "grad_norm": 41.30803088566475, "learning_rate": 9.95831535136103e-07, "loss": 0.2746, "step": 85 }, { "epoch": 0.14285714285714285, "grad_norm": 21.10123734223929, "learning_rate": 9.956562944041316e-07, "loss": 0.2082, "step": 86 }, { "epoch": 0.14451827242524917, "grad_norm": 38.37926428810434, "learning_rate": 9.954774616550499e-07, "loss": 0.221, "step": 87 }, { "epoch": 0.1461794019933555, "grad_norm": 21.615149648940918, "learning_rate": 9.952950381848576e-07, "loss": 0.1952, "step": 88 }, { "epoch": 0.1478405315614618, "grad_norm": 21.749889867769415, "learning_rate": 9.951090253155757e-07, "loss": 0.2139, "step": 89 }, { "epoch": 0.14950166112956811, "grad_norm": 16.24745315976571, "learning_rate": 9.949194243952382e-07, "loss": 0.1852, "step": 90 }, { "epoch": 0.1511627906976744, "grad_norm": 13.76793966078715, "learning_rate": 9.94726236797881e-07, "loss": 0.179, "step": 91 }, { "epoch": 0.15282392026578073, "grad_norm": 16.40584213825403, "learning_rate": 9.945294639235336e-07, "loss": 0.2484, "step": 92 }, { "epoch": 0.15448504983388706, "grad_norm": 19.45237506640772, "learning_rate": 9.943291071982072e-07, "loss": 0.2379, "step": 93 }, { "epoch": 0.15614617940199335, "grad_norm": 11.996023853804303, "learning_rate": 9.941251680738852e-07, "loss": 0.1372, "step": 94 }, { "epoch": 0.15780730897009967, "grad_norm": 16.66955200884909, "learning_rate": 9.939176480285128e-07, "loss": 0.1833, "step": 95 }, { "epoch": 0.15946843853820597, "grad_norm": 20.352150945455673, "learning_rate": 9.93706548565986e-07, "loss": 0.1878, "step": 96 }, { "epoch": 0.1611295681063123, "grad_norm": 30.86575710552868, "learning_rate": 9.934918712161414e-07, "loss": 0.2089, "step": 97 }, { "epoch": 0.16279069767441862, "grad_norm": 30.108048559073602, "learning_rate": 9.932736175347433e-07, "loss": 0.2334, "step": 98 }, { "epoch": 0.1644518272425249, "grad_norm": 16.137380650744188, "learning_rate": 9.930517891034748e-07, "loss": 0.1935, "step": 99 }, { "epoch": 0.16611295681063123, "grad_norm": 17.736914720083263, "learning_rate": 9.928263875299245e-07, "loss": 0.1772, "step": 100 }, { "epoch": 0.16777408637873753, "grad_norm": 11.972509757727426, "learning_rate": 9.92597414447576e-07, "loss": 0.1374, "step": 101 }, { "epoch": 0.16943521594684385, "grad_norm": 20.433799792275494, "learning_rate": 9.923648715157952e-07, "loss": 0.2198, "step": 102 }, { "epoch": 0.17109634551495018, "grad_norm": 19.404673471029323, "learning_rate": 9.921287604198185e-07, "loss": 0.152, "step": 103 }, { "epoch": 0.17275747508305647, "grad_norm": 18.73899376004668, "learning_rate": 9.918890828707416e-07, "loss": 0.2282, "step": 104 }, { "epoch": 0.1744186046511628, "grad_norm": 22.306238872760357, "learning_rate": 9.916458406055055e-07, "loss": 0.1895, "step": 105 }, { "epoch": 0.1760797342192691, "grad_norm": 19.021294987112594, "learning_rate": 9.91399035386885e-07, "loss": 0.2403, "step": 106 }, { "epoch": 0.1777408637873754, "grad_norm": 86.53948554137872, "learning_rate": 9.911486690034753e-07, "loss": 0.1723, "step": 107 }, { "epoch": 0.17940199335548174, "grad_norm": 18.7063649829218, "learning_rate": 9.908947432696798e-07, "loss": 0.2134, "step": 108 }, { "epoch": 0.18106312292358803, "grad_norm": 19.41271168055036, "learning_rate": 9.906372600256962e-07, "loss": 0.225, "step": 109 }, { "epoch": 0.18272425249169436, "grad_norm": 20.719404876758283, "learning_rate": 9.903762211375032e-07, "loss": 0.2158, "step": 110 }, { "epoch": 0.18438538205980065, "grad_norm": 21.610832850601305, "learning_rate": 9.901116284968478e-07, "loss": 0.2267, "step": 111 }, { "epoch": 0.18604651162790697, "grad_norm": 22.19654692004818, "learning_rate": 9.898434840212305e-07, "loss": 0.2376, "step": 112 }, { "epoch": 0.1877076411960133, "grad_norm": 14.838187787942, "learning_rate": 9.89571789653892e-07, "loss": 0.1743, "step": 113 }, { "epoch": 0.1893687707641196, "grad_norm": 17.480053388106516, "learning_rate": 9.892965473637992e-07, "loss": 0.239, "step": 114 }, { "epoch": 0.19102990033222592, "grad_norm": 18.66511685282266, "learning_rate": 9.890177591456311e-07, "loss": 0.2502, "step": 115 }, { "epoch": 0.19269102990033224, "grad_norm": 17.526724768165533, "learning_rate": 9.887354270197634e-07, "loss": 0.2557, "step": 116 }, { "epoch": 0.19435215946843853, "grad_norm": 13.92139359787026, "learning_rate": 9.884495530322548e-07, "loss": 0.2024, "step": 117 }, { "epoch": 0.19601328903654486, "grad_norm": 11.321726941564314, "learning_rate": 9.881601392548314e-07, "loss": 0.1411, "step": 118 }, { "epoch": 0.19767441860465115, "grad_norm": 13.326684770702352, "learning_rate": 9.878671877848728e-07, "loss": 0.1813, "step": 119 }, { "epoch": 0.19933554817275748, "grad_norm": 19.563367999650673, "learning_rate": 9.875707007453957e-07, "loss": 0.2395, "step": 120 }, { "epoch": 0.2009966777408638, "grad_norm": 19.412272467279955, "learning_rate": 9.872706802850395e-07, "loss": 0.1867, "step": 121 }, { "epoch": 0.2026578073089701, "grad_norm": 38.85770250068695, "learning_rate": 9.869671285780498e-07, "loss": 0.213, "step": 122 }, { "epoch": 0.20431893687707642, "grad_norm": 12.921432457309935, "learning_rate": 9.866600478242635e-07, "loss": 0.1208, "step": 123 }, { "epoch": 0.2059800664451827, "grad_norm": 23.926163027310174, "learning_rate": 9.863494402490922e-07, "loss": 0.2012, "step": 124 }, { "epoch": 0.20764119601328904, "grad_norm": 15.480133857657162, "learning_rate": 9.860353081035065e-07, "loss": 0.1231, "step": 125 }, { "epoch": 0.20930232558139536, "grad_norm": 22.15508037126096, "learning_rate": 9.857176536640195e-07, "loss": 0.2013, "step": 126 }, { "epoch": 0.21096345514950166, "grad_norm": 18.2661529166959, "learning_rate": 9.853964792326704e-07, "loss": 0.2317, "step": 127 }, { "epoch": 0.21262458471760798, "grad_norm": 16.111552087003037, "learning_rate": 9.850717871370073e-07, "loss": 0.1145, "step": 128 }, { "epoch": 0.21428571428571427, "grad_norm": 17.14067866125475, "learning_rate": 9.847435797300718e-07, "loss": 0.2102, "step": 129 }, { "epoch": 0.2159468438538206, "grad_norm": 15.60291480405653, "learning_rate": 9.844118593903797e-07, "loss": 0.1035, "step": 130 }, { "epoch": 0.21760797342192692, "grad_norm": 15.174859703173738, "learning_rate": 9.840766285219059e-07, "loss": 0.1183, "step": 131 }, { "epoch": 0.21926910299003322, "grad_norm": 25.54546397866724, "learning_rate": 9.837378895540655e-07, "loss": 0.2647, "step": 132 }, { "epoch": 0.22093023255813954, "grad_norm": 10.70010328252097, "learning_rate": 9.833956449416976e-07, "loss": 0.1388, "step": 133 }, { "epoch": 0.22259136212624583, "grad_norm": 13.281620643715373, "learning_rate": 9.830498971650454e-07, "loss": 0.1973, "step": 134 }, { "epoch": 0.22425249169435216, "grad_norm": 15.696504177542776, "learning_rate": 9.827006487297406e-07, "loss": 0.2341, "step": 135 }, { "epoch": 0.22591362126245848, "grad_norm": 11.52200533124044, "learning_rate": 9.823479021667838e-07, "loss": 0.1317, "step": 136 }, { "epoch": 0.22757475083056478, "grad_norm": 27.726328978283576, "learning_rate": 9.819916600325262e-07, "loss": 0.354, "step": 137 }, { "epoch": 0.2292358803986711, "grad_norm": 17.901230984984156, "learning_rate": 9.816319249086519e-07, "loss": 0.2298, "step": 138 }, { "epoch": 0.23089700996677742, "grad_norm": 14.384698945178693, "learning_rate": 9.812686994021582e-07, "loss": 0.1523, "step": 139 }, { "epoch": 0.23255813953488372, "grad_norm": 11.621939766839967, "learning_rate": 9.809019861453373e-07, "loss": 0.2313, "step": 140 }, { "epoch": 0.23421926910299004, "grad_norm": 12.969245765067623, "learning_rate": 9.805317877957576e-07, "loss": 0.2519, "step": 141 }, { "epoch": 0.23588039867109634, "grad_norm": 8.925285164861304, "learning_rate": 9.80158107036243e-07, "loss": 0.1042, "step": 142 }, { "epoch": 0.23754152823920266, "grad_norm": 13.756628040019768, "learning_rate": 9.797809465748553e-07, "loss": 0.1994, "step": 143 }, { "epoch": 0.23920265780730898, "grad_norm": 15.970803014352063, "learning_rate": 9.794003091448728e-07, "loss": 0.22, "step": 144 }, { "epoch": 0.24086378737541528, "grad_norm": 11.738958531546247, "learning_rate": 9.790161975047724e-07, "loss": 0.1279, "step": 145 }, { "epoch": 0.2425249169435216, "grad_norm": 13.897850888275721, "learning_rate": 9.786286144382077e-07, "loss": 0.1566, "step": 146 }, { "epoch": 0.2441860465116279, "grad_norm": 17.313264421559992, "learning_rate": 9.7823756275399e-07, "loss": 0.225, "step": 147 }, { "epoch": 0.24584717607973422, "grad_norm": 23.335482929522485, "learning_rate": 9.77843045286068e-07, "loss": 0.2193, "step": 148 }, { "epoch": 0.24750830564784054, "grad_norm": 13.474282765831358, "learning_rate": 9.774450648935062e-07, "loss": 0.1841, "step": 149 }, { "epoch": 0.24916943521594684, "grad_norm": 9.992948490443583, "learning_rate": 9.77043624460465e-07, "loss": 0.1319, "step": 150 }, { "epoch": 0.25083056478405313, "grad_norm": 12.069688114769054, "learning_rate": 9.766387268961807e-07, "loss": 0.2002, "step": 151 }, { "epoch": 0.25249169435215946, "grad_norm": 19.552574894536637, "learning_rate": 9.762303751349421e-07, "loss": 0.3202, "step": 152 }, { "epoch": 0.2541528239202658, "grad_norm": 15.59457607811674, "learning_rate": 9.758185721360713e-07, "loss": 0.134, "step": 153 }, { "epoch": 0.2558139534883721, "grad_norm": 14.504175942466917, "learning_rate": 9.754033208839009e-07, "loss": 0.1177, "step": 154 }, { "epoch": 0.2574750830564784, "grad_norm": 14.826282805181464, "learning_rate": 9.749846243877538e-07, "loss": 0.1866, "step": 155 }, { "epoch": 0.2591362126245847, "grad_norm": 10.496856702175336, "learning_rate": 9.745624856819197e-07, "loss": 0.1535, "step": 156 }, { "epoch": 0.260797342192691, "grad_norm": 17.600209366012557, "learning_rate": 9.741369078256344e-07, "loss": 0.1506, "step": 157 }, { "epoch": 0.26245847176079734, "grad_norm": 16.897725025587278, "learning_rate": 9.737078939030574e-07, "loss": 0.1118, "step": 158 }, { "epoch": 0.26411960132890366, "grad_norm": 14.824856178621472, "learning_rate": 9.73275447023249e-07, "loss": 0.1801, "step": 159 }, { "epoch": 0.26578073089701, "grad_norm": 10.323291152162106, "learning_rate": 9.728395703201482e-07, "loss": 0.1151, "step": 160 }, { "epoch": 0.26744186046511625, "grad_norm": 25.65889033205719, "learning_rate": 9.724002669525494e-07, "loss": 0.2601, "step": 161 }, { "epoch": 0.2691029900332226, "grad_norm": 15.331455485719754, "learning_rate": 9.719575401040814e-07, "loss": 0.2295, "step": 162 }, { "epoch": 0.2707641196013289, "grad_norm": 15.620975269391694, "learning_rate": 9.715113929831816e-07, "loss": 0.1661, "step": 163 }, { "epoch": 0.2724252491694352, "grad_norm": 21.27647184161525, "learning_rate": 9.710618288230743e-07, "loss": 0.1653, "step": 164 }, { "epoch": 0.27408637873754155, "grad_norm": 9.574800737047406, "learning_rate": 9.706088508817475e-07, "loss": 0.1149, "step": 165 }, { "epoch": 0.2757475083056478, "grad_norm": 10.020677235555565, "learning_rate": 9.701524624419288e-07, "loss": 0.114, "step": 166 }, { "epoch": 0.27740863787375414, "grad_norm": 26.463216054020872, "learning_rate": 9.696926668110612e-07, "loss": 0.2905, "step": 167 }, { "epoch": 0.27906976744186046, "grad_norm": 9.8357213518326, "learning_rate": 9.692294673212803e-07, "loss": 0.0852, "step": 168 }, { "epoch": 0.2807308970099668, "grad_norm": 11.574558850099985, "learning_rate": 9.687628673293887e-07, "loss": 0.2001, "step": 169 }, { "epoch": 0.2823920265780731, "grad_norm": 20.35819109924581, "learning_rate": 9.682928702168325e-07, "loss": 0.2113, "step": 170 }, { "epoch": 0.2840531561461794, "grad_norm": 10.618810483797844, "learning_rate": 9.678194793896772e-07, "loss": 0.157, "step": 171 }, { "epoch": 0.2857142857142857, "grad_norm": 12.604859535960964, "learning_rate": 9.673426982785825e-07, "loss": 0.1428, "step": 172 }, { "epoch": 0.287375415282392, "grad_norm": 14.237387799784342, "learning_rate": 9.668625303387768e-07, "loss": 0.1614, "step": 173 }, { "epoch": 0.28903654485049834, "grad_norm": 10.68448826627167, "learning_rate": 9.663789790500332e-07, "loss": 0.1228, "step": 174 }, { "epoch": 0.29069767441860467, "grad_norm": 14.01788767612421, "learning_rate": 9.658920479166444e-07, "loss": 0.1634, "step": 175 }, { "epoch": 0.292358803986711, "grad_norm": 12.150319124625579, "learning_rate": 9.65401740467396e-07, "loss": 0.1816, "step": 176 }, { "epoch": 0.29401993355481726, "grad_norm": 13.26166970387516, "learning_rate": 9.649080602555419e-07, "loss": 0.2212, "step": 177 }, { "epoch": 0.2956810631229236, "grad_norm": 10.456475792269867, "learning_rate": 9.644110108587791e-07, "loss": 0.162, "step": 178 }, { "epoch": 0.2973421926910299, "grad_norm": 12.01103083397816, "learning_rate": 9.6391059587922e-07, "loss": 0.1953, "step": 179 }, { "epoch": 0.29900332225913623, "grad_norm": 10.932107597417101, "learning_rate": 9.634068189433682e-07, "loss": 0.1792, "step": 180 }, { "epoch": 0.30066445182724255, "grad_norm": 9.912596130837807, "learning_rate": 9.628996837020907e-07, "loss": 0.171, "step": 181 }, { "epoch": 0.3023255813953488, "grad_norm": 9.446769528764404, "learning_rate": 9.623891938305928e-07, "loss": 0.1131, "step": 182 }, { "epoch": 0.30398671096345514, "grad_norm": 11.672641463794086, "learning_rate": 9.618753530283901e-07, "loss": 0.1384, "step": 183 }, { "epoch": 0.30564784053156147, "grad_norm": 10.856472785858744, "learning_rate": 9.613581650192831e-07, "loss": 0.1635, "step": 184 }, { "epoch": 0.3073089700996678, "grad_norm": 15.534434398535327, "learning_rate": 9.608376335513285e-07, "loss": 0.2019, "step": 185 }, { "epoch": 0.3089700996677741, "grad_norm": 9.808233191877529, "learning_rate": 9.60313762396814e-07, "loss": 0.0811, "step": 186 }, { "epoch": 0.3106312292358804, "grad_norm": 11.12760822568997, "learning_rate": 9.597865553522297e-07, "loss": 0.1293, "step": 187 }, { "epoch": 0.3122923588039867, "grad_norm": 16.610274735181868, "learning_rate": 9.592560162382403e-07, "loss": 0.1754, "step": 188 }, { "epoch": 0.313953488372093, "grad_norm": 31.037129088244438, "learning_rate": 9.587221488996586e-07, "loss": 0.3788, "step": 189 }, { "epoch": 0.31561461794019935, "grad_norm": 19.704030174639467, "learning_rate": 9.58184957205417e-07, "loss": 0.1908, "step": 190 }, { "epoch": 0.31727574750830567, "grad_norm": 11.615505292621943, "learning_rate": 9.576444450485391e-07, "loss": 0.1098, "step": 191 }, { "epoch": 0.31893687707641194, "grad_norm": 10.666388155171473, "learning_rate": 9.571006163461123e-07, "loss": 0.131, "step": 192 }, { "epoch": 0.32059800664451826, "grad_norm": 17.15366066076218, "learning_rate": 9.565534750392585e-07, "loss": 0.2124, "step": 193 }, { "epoch": 0.3222591362126246, "grad_norm": 12.172651785352677, "learning_rate": 9.560030250931064e-07, "loss": 0.1371, "step": 194 }, { "epoch": 0.3239202657807309, "grad_norm": 22.03825149725322, "learning_rate": 9.554492704967624e-07, "loss": 0.2334, "step": 195 }, { "epoch": 0.32558139534883723, "grad_norm": 13.245192413042442, "learning_rate": 9.548922152632811e-07, "loss": 0.1631, "step": 196 }, { "epoch": 0.3272425249169435, "grad_norm": 10.802335247206143, "learning_rate": 9.543318634296375e-07, "loss": 0.1568, "step": 197 }, { "epoch": 0.3289036544850498, "grad_norm": 12.17592621693887, "learning_rate": 9.53768219056697e-07, "loss": 0.1141, "step": 198 }, { "epoch": 0.33056478405315615, "grad_norm": 11.253478382590625, "learning_rate": 9.532012862291853e-07, "loss": 0.1163, "step": 199 }, { "epoch": 0.33222591362126247, "grad_norm": 21.815045663911672, "learning_rate": 9.526310690556605e-07, "loss": 0.1867, "step": 200 }, { "epoch": 0.3338870431893688, "grad_norm": 18.859610289112627, "learning_rate": 9.520575716684811e-07, "loss": 0.2125, "step": 201 }, { "epoch": 0.33554817275747506, "grad_norm": 11.205977833725816, "learning_rate": 9.514807982237785e-07, "loss": 0.1618, "step": 202 }, { "epoch": 0.3372093023255814, "grad_norm": 11.388470403679207, "learning_rate": 9.50900752901425e-07, "loss": 0.1184, "step": 203 }, { "epoch": 0.3388704318936877, "grad_norm": 10.075648460940602, "learning_rate": 9.503174399050043e-07, "loss": 0.1441, "step": 204 }, { "epoch": 0.34053156146179403, "grad_norm": 12.322759834617335, "learning_rate": 9.497308634617807e-07, "loss": 0.1244, "step": 205 }, { "epoch": 0.34219269102990035, "grad_norm": 14.94788430766923, "learning_rate": 9.491410278226692e-07, "loss": 0.2405, "step": 206 }, { "epoch": 0.3438538205980066, "grad_norm": 12.443492266603144, "learning_rate": 9.485479372622037e-07, "loss": 0.149, "step": 207 }, { "epoch": 0.34551495016611294, "grad_norm": 11.067400945275928, "learning_rate": 9.479515960785068e-07, "loss": 0.1404, "step": 208 }, { "epoch": 0.34717607973421927, "grad_norm": 11.525444371950433, "learning_rate": 9.473520085932579e-07, "loss": 0.1384, "step": 209 }, { "epoch": 0.3488372093023256, "grad_norm": 10.971783104668468, "learning_rate": 9.467491791516626e-07, "loss": 0.1349, "step": 210 }, { "epoch": 0.3504983388704319, "grad_norm": 12.635492557571897, "learning_rate": 9.461431121224214e-07, "loss": 0.1997, "step": 211 }, { "epoch": 0.3521594684385382, "grad_norm": 19.325558806797325, "learning_rate": 9.455338118976966e-07, "loss": 0.1585, "step": 212 }, { "epoch": 0.3538205980066445, "grad_norm": 8.353857534598658, "learning_rate": 9.449212828930822e-07, "loss": 0.1202, "step": 213 }, { "epoch": 0.3554817275747508, "grad_norm": 10.543973821466691, "learning_rate": 9.443055295475707e-07, "loss": 0.1858, "step": 214 }, { "epoch": 0.35714285714285715, "grad_norm": 9.95076123718523, "learning_rate": 9.436865563235217e-07, "loss": 0.179, "step": 215 }, { "epoch": 0.3588039867109635, "grad_norm": 13.713185473400454, "learning_rate": 9.430643677066291e-07, "loss": 0.1925, "step": 216 }, { "epoch": 0.36046511627906974, "grad_norm": 7.513881407573629, "learning_rate": 9.424389682058886e-07, "loss": 0.1222, "step": 217 }, { "epoch": 0.36212624584717606, "grad_norm": 11.36739084106459, "learning_rate": 9.418103623535653e-07, "loss": 0.1867, "step": 218 }, { "epoch": 0.3637873754152824, "grad_norm": 12.043749514925038, "learning_rate": 9.41178554705161e-07, "loss": 0.1931, "step": 219 }, { "epoch": 0.3654485049833887, "grad_norm": 6.233797219912333, "learning_rate": 9.405435498393799e-07, "loss": 0.0966, "step": 220 }, { "epoch": 0.36710963455149503, "grad_norm": 9.594573864750178, "learning_rate": 9.399053523580976e-07, "loss": 0.1386, "step": 221 }, { "epoch": 0.3687707641196013, "grad_norm": 11.240178079664732, "learning_rate": 9.392639668863258e-07, "loss": 0.1203, "step": 222 }, { "epoch": 0.3704318936877076, "grad_norm": 14.503931970774168, "learning_rate": 9.3861939807218e-07, "loss": 0.1463, "step": 223 }, { "epoch": 0.37209302325581395, "grad_norm": 10.257728339652989, "learning_rate": 9.379716505868447e-07, "loss": 0.1593, "step": 224 }, { "epoch": 0.37375415282392027, "grad_norm": 12.14682043932465, "learning_rate": 9.373207291245411e-07, "loss": 0.1257, "step": 225 }, { "epoch": 0.3754152823920266, "grad_norm": 9.99859322996326, "learning_rate": 9.366666384024913e-07, "loss": 0.1696, "step": 226 }, { "epoch": 0.3770764119601329, "grad_norm": 10.04625893529298, "learning_rate": 9.360093831608856e-07, "loss": 0.1625, "step": 227 }, { "epoch": 0.3787375415282392, "grad_norm": 16.19965207561594, "learning_rate": 9.353489681628475e-07, "loss": 0.1471, "step": 228 }, { "epoch": 0.3803986710963455, "grad_norm": 13.04710404485369, "learning_rate": 9.346853981943988e-07, "loss": 0.1499, "step": 229 }, { "epoch": 0.38205980066445183, "grad_norm": 11.06095587308583, "learning_rate": 9.340186780644259e-07, "loss": 0.0893, "step": 230 }, { "epoch": 0.38372093023255816, "grad_norm": 11.353482545061441, "learning_rate": 9.333488126046438e-07, "loss": 0.1214, "step": 231 }, { "epoch": 0.3853820598006645, "grad_norm": 17.31613403125687, "learning_rate": 9.326758066695624e-07, "loss": 0.1278, "step": 232 }, { "epoch": 0.38704318936877075, "grad_norm": 19.69366300321997, "learning_rate": 9.319996651364499e-07, "loss": 0.1722, "step": 233 }, { "epoch": 0.38870431893687707, "grad_norm": 13.58019477734271, "learning_rate": 9.313203929052986e-07, "loss": 0.1316, "step": 234 }, { "epoch": 0.3903654485049834, "grad_norm": 22.806246251767572, "learning_rate": 9.306379948987888e-07, "loss": 0.2574, "step": 235 }, { "epoch": 0.3920265780730897, "grad_norm": 16.293534036256254, "learning_rate": 9.299524760622533e-07, "loss": 0.1146, "step": 236 }, { "epoch": 0.39368770764119604, "grad_norm": 9.215117865221517, "learning_rate": 9.292638413636414e-07, "loss": 0.0652, "step": 237 }, { "epoch": 0.3953488372093023, "grad_norm": 19.243051206888683, "learning_rate": 9.285720957934831e-07, "loss": 0.2231, "step": 238 }, { "epoch": 0.39700996677740863, "grad_norm": 11.51135738383444, "learning_rate": 9.278772443648531e-07, "loss": 0.1822, "step": 239 }, { "epoch": 0.39867109634551495, "grad_norm": 12.885715889145898, "learning_rate": 9.271792921133337e-07, "loss": 0.1281, "step": 240 }, { "epoch": 0.4003322259136213, "grad_norm": 12.475318988797966, "learning_rate": 9.264782440969793e-07, "loss": 0.1822, "step": 241 }, { "epoch": 0.4019933554817276, "grad_norm": 9.33400580821295, "learning_rate": 9.257741053962794e-07, "loss": 0.1347, "step": 242 }, { "epoch": 0.40365448504983387, "grad_norm": 12.058606856026875, "learning_rate": 9.25066881114121e-07, "loss": 0.1706, "step": 243 }, { "epoch": 0.4053156146179402, "grad_norm": 10.824161359526528, "learning_rate": 9.243565763757529e-07, "loss": 0.1761, "step": 244 }, { "epoch": 0.4069767441860465, "grad_norm": 14.581271946544478, "learning_rate": 9.236431963287477e-07, "loss": 0.2583, "step": 245 }, { "epoch": 0.40863787375415284, "grad_norm": 6.398803338643679, "learning_rate": 9.229267461429647e-07, "loss": 0.1036, "step": 246 }, { "epoch": 0.41029900332225916, "grad_norm": 8.214273717131517, "learning_rate": 9.222072310105126e-07, "loss": 0.151, "step": 247 }, { "epoch": 0.4119601328903654, "grad_norm": 6.531248071881361, "learning_rate": 9.214846561457117e-07, "loss": 0.1343, "step": 248 }, { "epoch": 0.41362126245847175, "grad_norm": 7.946424836117755, "learning_rate": 9.207590267850562e-07, "loss": 0.1339, "step": 249 }, { "epoch": 0.4152823920265781, "grad_norm": 14.683229666391957, "learning_rate": 9.200303481871758e-07, "loss": 0.2346, "step": 250 }, { "epoch": 0.4169435215946844, "grad_norm": 6.7523422282754595, "learning_rate": 9.192986256327989e-07, "loss": 0.1082, "step": 251 }, { "epoch": 0.4186046511627907, "grad_norm": 12.807771031205872, "learning_rate": 9.185638644247122e-07, "loss": 0.172, "step": 252 }, { "epoch": 0.420265780730897, "grad_norm": 12.441325438265876, "learning_rate": 9.178260698877247e-07, "loss": 0.1524, "step": 253 }, { "epoch": 0.4219269102990033, "grad_norm": 11.924760374595467, "learning_rate": 9.170852473686272e-07, "loss": 0.145, "step": 254 }, { "epoch": 0.42358803986710963, "grad_norm": 15.295324115541575, "learning_rate": 9.163414022361542e-07, "loss": 0.2366, "step": 255 }, { "epoch": 0.42524916943521596, "grad_norm": 11.45834430193356, "learning_rate": 9.155945398809457e-07, "loss": 0.1714, "step": 256 }, { "epoch": 0.4269102990033223, "grad_norm": 11.860210597995017, "learning_rate": 9.148446657155069e-07, "loss": 0.1581, "step": 257 }, { "epoch": 0.42857142857142855, "grad_norm": 14.653302275353555, "learning_rate": 9.140917851741696e-07, "loss": 0.1782, "step": 258 }, { "epoch": 0.43023255813953487, "grad_norm": 11.340233977827461, "learning_rate": 9.13335903713053e-07, "loss": 0.135, "step": 259 }, { "epoch": 0.4318936877076412, "grad_norm": 12.33215015955027, "learning_rate": 9.125770268100241e-07, "loss": 0.1755, "step": 260 }, { "epoch": 0.4335548172757475, "grad_norm": 10.82579948342303, "learning_rate": 9.118151599646573e-07, "loss": 0.1775, "step": 261 }, { "epoch": 0.43521594684385384, "grad_norm": 8.893649702916877, "learning_rate": 9.110503086981955e-07, "loss": 0.134, "step": 262 }, { "epoch": 0.4368770764119601, "grad_norm": 11.851382147068934, "learning_rate": 9.102824785535096e-07, "loss": 0.248, "step": 263 }, { "epoch": 0.43853820598006643, "grad_norm": 8.085625713042226, "learning_rate": 9.095116750950583e-07, "loss": 0.1053, "step": 264 }, { "epoch": 0.44019933554817275, "grad_norm": 8.825530836149932, "learning_rate": 9.087379039088481e-07, "loss": 0.1699, "step": 265 }, { "epoch": 0.4418604651162791, "grad_norm": 8.37105680491955, "learning_rate": 9.079611706023925e-07, "loss": 0.1496, "step": 266 }, { "epoch": 0.4435215946843854, "grad_norm": 8.257776032135112, "learning_rate": 9.071814808046709e-07, "loss": 0.1492, "step": 267 }, { "epoch": 0.44518272425249167, "grad_norm": 9.848656586818922, "learning_rate": 9.063988401660895e-07, "loss": 0.1167, "step": 268 }, { "epoch": 0.446843853820598, "grad_norm": 13.15071986231609, "learning_rate": 9.056132543584385e-07, "loss": 0.2396, "step": 269 }, { "epoch": 0.4485049833887043, "grad_norm": 7.4908331870411065, "learning_rate": 9.048247290748516e-07, "loss": 0.1152, "step": 270 }, { "epoch": 0.45016611295681064, "grad_norm": 8.35585031303107, "learning_rate": 9.040332700297651e-07, "loss": 0.0845, "step": 271 }, { "epoch": 0.45182724252491696, "grad_norm": 18.299405153632886, "learning_rate": 9.032388829588764e-07, "loss": 0.1516, "step": 272 }, { "epoch": 0.45348837209302323, "grad_norm": 10.140601449856803, "learning_rate": 9.02441573619102e-07, "loss": 0.1274, "step": 273 }, { "epoch": 0.45514950166112955, "grad_norm": 12.172095204640925, "learning_rate": 9.01641347788536e-07, "loss": 0.182, "step": 274 }, { "epoch": 0.4568106312292359, "grad_norm": 9.563752668175168, "learning_rate": 9.008382112664088e-07, "loss": 0.0945, "step": 275 }, { "epoch": 0.4584717607973422, "grad_norm": 10.287796564887433, "learning_rate": 9.000321698730439e-07, "loss": 0.0976, "step": 276 }, { "epoch": 0.4601328903654485, "grad_norm": 13.189377107182274, "learning_rate": 8.992232294498169e-07, "loss": 0.1124, "step": 277 }, { "epoch": 0.46179401993355484, "grad_norm": 25.060289240642998, "learning_rate": 8.984113958591124e-07, "loss": 0.1806, "step": 278 }, { "epoch": 0.4634551495016611, "grad_norm": 14.670517988936247, "learning_rate": 8.975966749842816e-07, "loss": 0.1432, "step": 279 }, { "epoch": 0.46511627906976744, "grad_norm": 19.822372119676242, "learning_rate": 8.967790727296001e-07, "loss": 0.2261, "step": 280 }, { "epoch": 0.46677740863787376, "grad_norm": 5.631680487351567, "learning_rate": 8.959585950202248e-07, "loss": 0.0537, "step": 281 }, { "epoch": 0.4684385382059801, "grad_norm": 21.281978024222216, "learning_rate": 8.95135247802151e-07, "loss": 0.2133, "step": 282 }, { "epoch": 0.4700996677740864, "grad_norm": 16.409996226272277, "learning_rate": 8.943090370421691e-07, "loss": 0.1548, "step": 283 }, { "epoch": 0.4717607973421927, "grad_norm": 10.142872282405547, "learning_rate": 8.934799687278219e-07, "loss": 0.1067, "step": 284 }, { "epoch": 0.473421926910299, "grad_norm": 22.26876142574316, "learning_rate": 8.926480488673605e-07, "loss": 0.1667, "step": 285 }, { "epoch": 0.4750830564784053, "grad_norm": 9.876108544387835, "learning_rate": 8.918132834897015e-07, "loss": 0.1081, "step": 286 }, { "epoch": 0.47674418604651164, "grad_norm": 13.966378148098578, "learning_rate": 8.909756786443827e-07, "loss": 0.1993, "step": 287 }, { "epoch": 0.47840531561461797, "grad_norm": 14.484542043786044, "learning_rate": 8.901352404015194e-07, "loss": 0.1349, "step": 288 }, { "epoch": 0.48006644518272423, "grad_norm": 7.865469366519635, "learning_rate": 8.89291974851761e-07, "loss": 0.0748, "step": 289 }, { "epoch": 0.48172757475083056, "grad_norm": 12.981698322123203, "learning_rate": 8.884458881062457e-07, "loss": 0.1387, "step": 290 }, { "epoch": 0.4833887043189369, "grad_norm": 14.42068589553622, "learning_rate": 8.875969862965574e-07, "loss": 0.1887, "step": 291 }, { "epoch": 0.4850498338870432, "grad_norm": 19.32168616528694, "learning_rate": 8.867452755746805e-07, "loss": 0.1184, "step": 292 }, { "epoch": 0.4867109634551495, "grad_norm": 14.687720171684697, "learning_rate": 8.858907621129559e-07, "loss": 0.1596, "step": 293 }, { "epoch": 0.4883720930232558, "grad_norm": 10.32826671997461, "learning_rate": 8.850334521040352e-07, "loss": 0.1432, "step": 294 }, { "epoch": 0.4900332225913621, "grad_norm": 9.149170000100604, "learning_rate": 8.841733517608374e-07, "loss": 0.1725, "step": 295 }, { "epoch": 0.49169435215946844, "grad_norm": 7.682038024526175, "learning_rate": 8.833104673165024e-07, "loss": 0.1473, "step": 296 }, { "epoch": 0.49335548172757476, "grad_norm": 7.36387604217277, "learning_rate": 8.824448050243469e-07, "loss": 0.1065, "step": 297 }, { "epoch": 0.4950166112956811, "grad_norm": 10.351711544120024, "learning_rate": 8.815763711578183e-07, "loss": 0.1717, "step": 298 }, { "epoch": 0.49667774086378735, "grad_norm": 10.17527289922022, "learning_rate": 8.8070517201045e-07, "loss": 0.1498, "step": 299 }, { "epoch": 0.4983388704318937, "grad_norm": 17.205640014020535, "learning_rate": 8.798312138958146e-07, "loss": 0.1562, "step": 300 }, { "epoch": 0.5, "grad_norm": 13.741848852332474, "learning_rate": 8.789545031474799e-07, "loss": 0.1875, "step": 301 }, { "epoch": 0.5016611295681063, "grad_norm": 7.794673805015374, "learning_rate": 8.780750461189612e-07, "loss": 0.1141, "step": 302 }, { "epoch": 0.5033222591362126, "grad_norm": 11.818808426001134, "learning_rate": 8.771928491836764e-07, "loss": 0.1633, "step": 303 }, { "epoch": 0.5049833887043189, "grad_norm": 7.761679819092504, "learning_rate": 8.763079187348999e-07, "loss": 0.1248, "step": 304 }, { "epoch": 0.5066445182724253, "grad_norm": 8.882760685506408, "learning_rate": 8.754202611857149e-07, "loss": 0.1513, "step": 305 }, { "epoch": 0.5083056478405316, "grad_norm": 7.40135368569042, "learning_rate": 8.745298829689686e-07, "loss": 0.0891, "step": 306 }, { "epoch": 0.5099667774086378, "grad_norm": 10.925689760300747, "learning_rate": 8.736367905372246e-07, "loss": 0.1939, "step": 307 }, { "epoch": 0.5116279069767442, "grad_norm": 11.424412238662494, "learning_rate": 8.727409903627165e-07, "loss": 0.1181, "step": 308 }, { "epoch": 0.5132890365448505, "grad_norm": 11.026582956807562, "learning_rate": 8.71842488937301e-07, "loss": 0.1892, "step": 309 }, { "epoch": 0.5149501661129569, "grad_norm": 14.452286426871511, "learning_rate": 8.709412927724103e-07, "loss": 0.1648, "step": 310 }, { "epoch": 0.5166112956810631, "grad_norm": 8.879574876643009, "learning_rate": 8.700374083990057e-07, "loss": 0.1412, "step": 311 }, { "epoch": 0.5182724252491694, "grad_norm": 14.483071618844798, "learning_rate": 8.691308423675299e-07, "loss": 0.2708, "step": 312 }, { "epoch": 0.5199335548172758, "grad_norm": 10.234610377564792, "learning_rate": 8.682216012478596e-07, "loss": 0.1516, "step": 313 }, { "epoch": 0.521594684385382, "grad_norm": 17.446807434470152, "learning_rate": 8.673096916292576e-07, "loss": 0.1629, "step": 314 }, { "epoch": 0.5232558139534884, "grad_norm": 9.65924900517999, "learning_rate": 8.663951201203254e-07, "loss": 0.1413, "step": 315 }, { "epoch": 0.5249169435215947, "grad_norm": 9.522606938597889, "learning_rate": 8.654778933489556e-07, "loss": 0.1678, "step": 316 }, { "epoch": 0.526578073089701, "grad_norm": 10.287315185571302, "learning_rate": 8.645580179622828e-07, "loss": 0.1753, "step": 317 }, { "epoch": 0.5282392026578073, "grad_norm": 9.999270763772957, "learning_rate": 8.636355006266365e-07, "loss": 0.1578, "step": 318 }, { "epoch": 0.5299003322259136, "grad_norm": 9.32239950078714, "learning_rate": 8.627103480274921e-07, "loss": 0.1659, "step": 319 }, { "epoch": 0.53156146179402, "grad_norm": 6.923158524420186, "learning_rate": 8.617825668694232e-07, "loss": 0.1233, "step": 320 }, { "epoch": 0.5332225913621262, "grad_norm": 10.823618378270748, "learning_rate": 8.60852163876052e-07, "loss": 0.1538, "step": 321 }, { "epoch": 0.5348837209302325, "grad_norm": 11.047906207579194, "learning_rate": 8.599191457900016e-07, "loss": 0.1547, "step": 322 }, { "epoch": 0.5365448504983389, "grad_norm": 13.05735973202964, "learning_rate": 8.589835193728463e-07, "loss": 0.1444, "step": 323 }, { "epoch": 0.5382059800664452, "grad_norm": 9.104429009432321, "learning_rate": 8.580452914050631e-07, "loss": 0.1255, "step": 324 }, { "epoch": 0.5398671096345515, "grad_norm": 11.64252163137442, "learning_rate": 8.571044686859825e-07, "loss": 0.1912, "step": 325 }, { "epoch": 0.5415282392026578, "grad_norm": 14.944866982774602, "learning_rate": 8.561610580337391e-07, "loss": 0.1768, "step": 326 }, { "epoch": 0.5431893687707641, "grad_norm": 9.488326230185102, "learning_rate": 8.55215066285222e-07, "loss": 0.1118, "step": 327 }, { "epoch": 0.5448504983388704, "grad_norm": 9.85897825770774, "learning_rate": 8.542665002960257e-07, "loss": 0.1025, "step": 328 }, { "epoch": 0.5465116279069767, "grad_norm": 8.684946844670787, "learning_rate": 8.533153669404001e-07, "loss": 0.1264, "step": 329 }, { "epoch": 0.5481727574750831, "grad_norm": 10.062901574236582, "learning_rate": 8.523616731112011e-07, "loss": 0.1723, "step": 330 }, { "epoch": 0.5498338870431894, "grad_norm": 12.11357006155644, "learning_rate": 8.514054257198398e-07, "loss": 0.1531, "step": 331 }, { "epoch": 0.5514950166112956, "grad_norm": 11.839908028521032, "learning_rate": 8.504466316962336e-07, "loss": 0.1442, "step": 332 }, { "epoch": 0.553156146179402, "grad_norm": 11.844766776417758, "learning_rate": 8.494852979887544e-07, "loss": 0.1071, "step": 333 }, { "epoch": 0.5548172757475083, "grad_norm": 10.271319518938574, "learning_rate": 8.4852143156418e-07, "loss": 0.149, "step": 334 }, { "epoch": 0.5564784053156147, "grad_norm": 11.779914075239326, "learning_rate": 8.475550394076426e-07, "loss": 0.1389, "step": 335 }, { "epoch": 0.5581395348837209, "grad_norm": 10.435527692770954, "learning_rate": 8.465861285225781e-07, "loss": 0.149, "step": 336 }, { "epoch": 0.5598006644518272, "grad_norm": 9.38848130124771, "learning_rate": 8.456147059306757e-07, "loss": 0.0886, "step": 337 }, { "epoch": 0.5614617940199336, "grad_norm": 10.191781455117614, "learning_rate": 8.446407786718273e-07, "loss": 0.1092, "step": 338 }, { "epoch": 0.5631229235880398, "grad_norm": 10.76683247123338, "learning_rate": 8.436643538040753e-07, "loss": 0.1363, "step": 339 }, { "epoch": 0.5647840531561462, "grad_norm": 10.294295935493142, "learning_rate": 8.426854384035631e-07, "loss": 0.0882, "step": 340 }, { "epoch": 0.5664451827242525, "grad_norm": 16.910697465451545, "learning_rate": 8.417040395644825e-07, "loss": 0.1969, "step": 341 }, { "epoch": 0.5681063122923588, "grad_norm": 15.166734734046708, "learning_rate": 8.40720164399023e-07, "loss": 0.1724, "step": 342 }, { "epoch": 0.5697674418604651, "grad_norm": 11.95776400356682, "learning_rate": 8.397338200373194e-07, "loss": 0.1101, "step": 343 }, { "epoch": 0.5714285714285714, "grad_norm": 14.389186956170414, "learning_rate": 8.387450136274017e-07, "loss": 0.1589, "step": 344 }, { "epoch": 0.5730897009966778, "grad_norm": 14.678898341131756, "learning_rate": 8.377537523351417e-07, "loss": 0.1563, "step": 345 }, { "epoch": 0.574750830564784, "grad_norm": 10.867669749295963, "learning_rate": 8.367600433442018e-07, "loss": 0.1036, "step": 346 }, { "epoch": 0.5764119601328903, "grad_norm": 20.043379096999764, "learning_rate": 8.35763893855983e-07, "loss": 0.2066, "step": 347 }, { "epoch": 0.5780730897009967, "grad_norm": 13.690312486819662, "learning_rate": 8.347653110895725e-07, "loss": 0.156, "step": 348 }, { "epoch": 0.579734219269103, "grad_norm": 10.391292183316825, "learning_rate": 8.337643022816914e-07, "loss": 0.1022, "step": 349 }, { "epoch": 0.5813953488372093, "grad_norm": 9.001405280579482, "learning_rate": 8.327608746866423e-07, "loss": 0.101, "step": 350 }, { "epoch": 0.5830564784053156, "grad_norm": 8.24123642090196, "learning_rate": 8.31755035576257e-07, "loss": 0.0964, "step": 351 }, { "epoch": 0.584717607973422, "grad_norm": 9.492222500148317, "learning_rate": 8.307467922398432e-07, "loss": 0.1317, "step": 352 }, { "epoch": 0.5863787375415282, "grad_norm": 15.062860420034843, "learning_rate": 8.297361519841318e-07, "loss": 0.2075, "step": 353 }, { "epoch": 0.5880398671096345, "grad_norm": 14.016489579298407, "learning_rate": 8.28723122133225e-07, "loss": 0.2038, "step": 354 }, { "epoch": 0.5897009966777409, "grad_norm": 10.966278256892918, "learning_rate": 8.277077100285412e-07, "loss": 0.1182, "step": 355 }, { "epoch": 0.5913621262458472, "grad_norm": 10.72462410494972, "learning_rate": 8.266899230287642e-07, "loss": 0.1052, "step": 356 }, { "epoch": 0.5930232558139535, "grad_norm": 10.814041170004836, "learning_rate": 8.256697685097877e-07, "loss": 0.0989, "step": 357 }, { "epoch": 0.5946843853820598, "grad_norm": 15.163523649534604, "learning_rate": 8.246472538646634e-07, "loss": 0.1417, "step": 358 }, { "epoch": 0.5963455149501661, "grad_norm": 11.111867453534417, "learning_rate": 8.236223865035465e-07, "loss": 0.1706, "step": 359 }, { "epoch": 0.5980066445182725, "grad_norm": 10.54296261815159, "learning_rate": 8.225951738536423e-07, "loss": 0.1287, "step": 360 }, { "epoch": 0.5996677740863787, "grad_norm": 8.36851786714232, "learning_rate": 8.215656233591524e-07, "loss": 0.1091, "step": 361 }, { "epoch": 0.6013289036544851, "grad_norm": 14.223204073643483, "learning_rate": 8.205337424812208e-07, "loss": 0.1974, "step": 362 }, { "epoch": 0.6029900332225914, "grad_norm": 9.381500918095396, "learning_rate": 8.194995386978803e-07, "loss": 0.1167, "step": 363 }, { "epoch": 0.6046511627906976, "grad_norm": 13.697438543636968, "learning_rate": 8.184630195039965e-07, "loss": 0.1341, "step": 364 }, { "epoch": 0.606312292358804, "grad_norm": 11.51516985079088, "learning_rate": 8.17424192411216e-07, "loss": 0.1389, "step": 365 }, { "epoch": 0.6079734219269103, "grad_norm": 17.961936532222403, "learning_rate": 8.163830649479101e-07, "loss": 0.2059, "step": 366 }, { "epoch": 0.6096345514950167, "grad_norm": 11.255484209831073, "learning_rate": 8.15339644659121e-07, "loss": 0.1636, "step": 367 }, { "epoch": 0.6112956810631229, "grad_norm": 14.400184086382511, "learning_rate": 8.14293939106507e-07, "loss": 0.2286, "step": 368 }, { "epoch": 0.6129568106312292, "grad_norm": 11.730011163552305, "learning_rate": 8.132459558682878e-07, "loss": 0.1594, "step": 369 }, { "epoch": 0.6146179401993356, "grad_norm": 10.322545416211497, "learning_rate": 8.121957025391891e-07, "loss": 0.1497, "step": 370 }, { "epoch": 0.6162790697674418, "grad_norm": 10.109902353400317, "learning_rate": 8.111431867303884e-07, "loss": 0.1422, "step": 371 }, { "epoch": 0.6179401993355482, "grad_norm": 8.097191708903397, "learning_rate": 8.10088416069459e-07, "loss": 0.0915, "step": 372 }, { "epoch": 0.6196013289036545, "grad_norm": 15.757926168628456, "learning_rate": 8.090313982003155e-07, "loss": 0.2464, "step": 373 }, { "epoch": 0.6212624584717608, "grad_norm": 12.285328658201744, "learning_rate": 8.079721407831574e-07, "loss": 0.1759, "step": 374 }, { "epoch": 0.6229235880398671, "grad_norm": 9.06127827692409, "learning_rate": 8.06910651494415e-07, "loss": 0.1211, "step": 375 }, { "epoch": 0.6245847176079734, "grad_norm": 6.734374794139948, "learning_rate": 8.058469380266921e-07, "loss": 0.11, "step": 376 }, { "epoch": 0.6262458471760798, "grad_norm": 9.929774756393966, "learning_rate": 8.047810080887116e-07, "loss": 0.146, "step": 377 }, { "epoch": 0.627906976744186, "grad_norm": 16.089957357745977, "learning_rate": 8.037128694052588e-07, "loss": 0.2195, "step": 378 }, { "epoch": 0.6295681063122923, "grad_norm": 15.96907916727788, "learning_rate": 8.026425297171266e-07, "loss": 0.1866, "step": 379 }, { "epoch": 0.6312292358803987, "grad_norm": 10.24243255900198, "learning_rate": 8.015699967810576e-07, "loss": 0.1659, "step": 380 }, { "epoch": 0.632890365448505, "grad_norm": 8.298998658122965, "learning_rate": 8.004952783696891e-07, "loss": 0.1212, "step": 381 }, { "epoch": 0.6345514950166113, "grad_norm": 6.94281041135004, "learning_rate": 7.994183822714968e-07, "loss": 0.0888, "step": 382 }, { "epoch": 0.6362126245847176, "grad_norm": 10.305910457713361, "learning_rate": 7.983393162907379e-07, "loss": 0.1903, "step": 383 }, { "epoch": 0.6378737541528239, "grad_norm": 7.464775960157331, "learning_rate": 7.972580882473946e-07, "loss": 0.097, "step": 384 }, { "epoch": 0.6395348837209303, "grad_norm": 7.4746993771307055, "learning_rate": 7.961747059771179e-07, "loss": 0.1109, "step": 385 }, { "epoch": 0.6411960132890365, "grad_norm": 6.615341400751429, "learning_rate": 7.950891773311701e-07, "loss": 0.0779, "step": 386 }, { "epoch": 0.6428571428571429, "grad_norm": 17.60157058625115, "learning_rate": 7.940015101763684e-07, "loss": 0.2216, "step": 387 }, { "epoch": 0.6445182724252492, "grad_norm": 11.013583175188954, "learning_rate": 7.92911712395028e-07, "loss": 0.172, "step": 388 }, { "epoch": 0.6461794019933554, "grad_norm": 8.0676059944362, "learning_rate": 7.918197918849042e-07, "loss": 0.1122, "step": 389 }, { "epoch": 0.6478405315614618, "grad_norm": 10.8514146503401, "learning_rate": 7.907257565591362e-07, "loss": 0.082, "step": 390 }, { "epoch": 0.6495016611295681, "grad_norm": 12.550088112103355, "learning_rate": 7.896296143461889e-07, "loss": 0.1142, "step": 391 }, { "epoch": 0.6511627906976745, "grad_norm": 12.146722322785989, "learning_rate": 7.885313731897962e-07, "loss": 0.1843, "step": 392 }, { "epoch": 0.6528239202657807, "grad_norm": 11.011969414306924, "learning_rate": 7.874310410489027e-07, "loss": 0.1209, "step": 393 }, { "epoch": 0.654485049833887, "grad_norm": 6.810537387951736, "learning_rate": 7.863286258976061e-07, "loss": 0.0608, "step": 394 }, { "epoch": 0.6561461794019934, "grad_norm": 10.89167605967294, "learning_rate": 7.852241357251002e-07, "loss": 0.1189, "step": 395 }, { "epoch": 0.6578073089700996, "grad_norm": 14.859678250628704, "learning_rate": 7.841175785356165e-07, "loss": 0.1324, "step": 396 }, { "epoch": 0.659468438538206, "grad_norm": 14.58129989899506, "learning_rate": 7.830089623483656e-07, "loss": 0.1417, "step": 397 }, { "epoch": 0.6611295681063123, "grad_norm": 16.34306018101847, "learning_rate": 7.818982951974798e-07, "loss": 0.1263, "step": 398 }, { "epoch": 0.6627906976744186, "grad_norm": 9.974280701968183, "learning_rate": 7.807855851319554e-07, "loss": 0.1354, "step": 399 }, { "epoch": 0.6644518272425249, "grad_norm": 14.425725799753549, "learning_rate": 7.796708402155925e-07, "loss": 0.1874, "step": 400 }, { "epoch": 0.6661129568106312, "grad_norm": 10.094566765127704, "learning_rate": 7.785540685269388e-07, "loss": 0.147, "step": 401 }, { "epoch": 0.6677740863787376, "grad_norm": 15.97109522405515, "learning_rate": 7.774352781592295e-07, "loss": 0.1826, "step": 402 }, { "epoch": 0.6694352159468439, "grad_norm": 13.187995357816002, "learning_rate": 7.763144772203291e-07, "loss": 0.1317, "step": 403 }, { "epoch": 0.6710963455149501, "grad_norm": 15.41977600347847, "learning_rate": 7.751916738326732e-07, "loss": 0.1712, "step": 404 }, { "epoch": 0.6727574750830565, "grad_norm": 6.680838078645473, "learning_rate": 7.740668761332084e-07, "loss": 0.0829, "step": 405 }, { "epoch": 0.6744186046511628, "grad_norm": 16.868662883815464, "learning_rate": 7.729400922733345e-07, "loss": 0.1237, "step": 406 }, { "epoch": 0.6760797342192691, "grad_norm": 10.871555946818436, "learning_rate": 7.71811330418845e-07, "loss": 0.1275, "step": 407 }, { "epoch": 0.6777408637873754, "grad_norm": 14.978834275251021, "learning_rate": 7.706805987498677e-07, "loss": 0.1144, "step": 408 }, { "epoch": 0.6794019933554817, "grad_norm": 9.534606751566304, "learning_rate": 7.69547905460806e-07, "loss": 0.1177, "step": 409 }, { "epoch": 0.6810631229235881, "grad_norm": 9.950511975602048, "learning_rate": 7.684132587602786e-07, "loss": 0.1758, "step": 410 }, { "epoch": 0.6827242524916943, "grad_norm": 10.042396922065393, "learning_rate": 7.67276666871061e-07, "loss": 0.1446, "step": 411 }, { "epoch": 0.6843853820598007, "grad_norm": 10.048910173755388, "learning_rate": 7.661381380300253e-07, "loss": 0.163, "step": 412 }, { "epoch": 0.686046511627907, "grad_norm": 9.33358937868672, "learning_rate": 7.649976804880809e-07, "loss": 0.1048, "step": 413 }, { "epoch": 0.6877076411960132, "grad_norm": 10.583010872327236, "learning_rate": 7.63855302510114e-07, "loss": 0.1344, "step": 414 }, { "epoch": 0.6893687707641196, "grad_norm": 12.165380086835881, "learning_rate": 7.627110123749285e-07, "loss": 0.1494, "step": 415 }, { "epoch": 0.6910299003322259, "grad_norm": 10.99633419748417, "learning_rate": 7.615648183751857e-07, "loss": 0.1329, "step": 416 }, { "epoch": 0.6926910299003323, "grad_norm": 11.667588050351162, "learning_rate": 7.60416728817344e-07, "loss": 0.153, "step": 417 }, { "epoch": 0.6943521594684385, "grad_norm": 8.985742528680719, "learning_rate": 7.592667520215994e-07, "loss": 0.1267, "step": 418 }, { "epoch": 0.6960132890365448, "grad_norm": 9.25393580859672, "learning_rate": 7.581148963218241e-07, "loss": 0.1382, "step": 419 }, { "epoch": 0.6976744186046512, "grad_norm": 8.89447816694309, "learning_rate": 7.569611700655068e-07, "loss": 0.1189, "step": 420 }, { "epoch": 0.6993355481727574, "grad_norm": 11.740985525599102, "learning_rate": 7.558055816136924e-07, "loss": 0.1677, "step": 421 }, { "epoch": 0.7009966777408638, "grad_norm": 7.905842682651146, "learning_rate": 7.546481393409209e-07, "loss": 0.098, "step": 422 }, { "epoch": 0.7026578073089701, "grad_norm": 15.390816785264915, "learning_rate": 7.53488851635167e-07, "loss": 0.1973, "step": 423 }, { "epoch": 0.7043189368770764, "grad_norm": 10.964069409142427, "learning_rate": 7.523277268977792e-07, "loss": 0.1268, "step": 424 }, { "epoch": 0.7059800664451827, "grad_norm": 11.34599369651872, "learning_rate": 7.51164773543419e-07, "loss": 0.1469, "step": 425 }, { "epoch": 0.707641196013289, "grad_norm": 9.205102192725148, "learning_rate": 7.5e-07, "loss": 0.1199, "step": 426 }, { "epoch": 0.7093023255813954, "grad_norm": 8.670727684942934, "learning_rate": 7.488334147086263e-07, "loss": 0.1012, "step": 427 }, { "epoch": 0.7109634551495017, "grad_norm": 10.11416734093539, "learning_rate": 7.476650261235318e-07, "loss": 0.1354, "step": 428 }, { "epoch": 0.7126245847176079, "grad_norm": 6.003475116109574, "learning_rate": 7.464948427120197e-07, "loss": 0.0826, "step": 429 }, { "epoch": 0.7142857142857143, "grad_norm": 11.188990918400444, "learning_rate": 7.453228729543988e-07, "loss": 0.1512, "step": 430 }, { "epoch": 0.7159468438538206, "grad_norm": 19.722212570905874, "learning_rate": 7.441491253439249e-07, "loss": 0.0985, "step": 431 }, { "epoch": 0.717607973421927, "grad_norm": 9.220172905747987, "learning_rate": 7.429736083867371e-07, "loss": 0.1254, "step": 432 }, { "epoch": 0.7192691029900332, "grad_norm": 10.857365036669531, "learning_rate": 7.417963306017972e-07, "loss": 0.1556, "step": 433 }, { "epoch": 0.7209302325581395, "grad_norm": 9.12825370131944, "learning_rate": 7.406173005208277e-07, "loss": 0.109, "step": 434 }, { "epoch": 0.7225913621262459, "grad_norm": 11.058921726765327, "learning_rate": 7.394365266882501e-07, "loss": 0.1443, "step": 435 }, { "epoch": 0.7242524916943521, "grad_norm": 17.6871451943868, "learning_rate": 7.382540176611223e-07, "loss": 0.2528, "step": 436 }, { "epoch": 0.7259136212624585, "grad_norm": 9.523326447398212, "learning_rate": 7.370697820090778e-07, "loss": 0.0873, "step": 437 }, { "epoch": 0.7275747508305648, "grad_norm": 11.905512725983018, "learning_rate": 7.358838283142628e-07, "loss": 0.1576, "step": 438 }, { "epoch": 0.729235880398671, "grad_norm": 13.862231389544819, "learning_rate": 7.346961651712739e-07, "loss": 0.2174, "step": 439 }, { "epoch": 0.7308970099667774, "grad_norm": 20.52530465219098, "learning_rate": 7.335068011870962e-07, "loss": 0.2746, "step": 440 }, { "epoch": 0.7325581395348837, "grad_norm": 10.26969871122055, "learning_rate": 7.323157449810405e-07, "loss": 0.1119, "step": 441 }, { "epoch": 0.7342192691029901, "grad_norm": 11.415857653085869, "learning_rate": 7.311230051846819e-07, "loss": 0.138, "step": 442 }, { "epoch": 0.7358803986710963, "grad_norm": 11.650217063993916, "learning_rate": 7.299285904417955e-07, "loss": 0.1596, "step": 443 }, { "epoch": 0.7375415282392026, "grad_norm": 9.256723737467732, "learning_rate": 7.287325094082954e-07, "loss": 0.1267, "step": 444 }, { "epoch": 0.739202657807309, "grad_norm": 7.694604457260306, "learning_rate": 7.275347707521709e-07, "loss": 0.1038, "step": 445 }, { "epoch": 0.7408637873754153, "grad_norm": 7.191980021296017, "learning_rate": 7.263353831534244e-07, "loss": 0.109, "step": 446 }, { "epoch": 0.7425249169435216, "grad_norm": 10.691175165821143, "learning_rate": 7.25134355304008e-07, "loss": 0.1907, "step": 447 }, { "epoch": 0.7441860465116279, "grad_norm": 6.947238283602248, "learning_rate": 7.239316959077607e-07, "loss": 0.0847, "step": 448 }, { "epoch": 0.7458471760797342, "grad_norm": 13.005333968288324, "learning_rate": 7.227274136803452e-07, "loss": 0.2188, "step": 449 }, { "epoch": 0.7475083056478405, "grad_norm": 7.289755042527808, "learning_rate": 7.215215173491849e-07, "loss": 0.1152, "step": 450 }, { "epoch": 0.7491694352159468, "grad_norm": 8.425252309131805, "learning_rate": 7.203140156534009e-07, "loss": 0.1461, "step": 451 }, { "epoch": 0.7508305647840532, "grad_norm": 5.562081357930086, "learning_rate": 7.191049173437479e-07, "loss": 0.0852, "step": 452 }, { "epoch": 0.7524916943521595, "grad_norm": 12.352790950896518, "learning_rate": 7.178942311825516e-07, "loss": 0.155, "step": 453 }, { "epoch": 0.7541528239202658, "grad_norm": 9.171891575083187, "learning_rate": 7.166819659436445e-07, "loss": 0.1495, "step": 454 }, { "epoch": 0.7558139534883721, "grad_norm": 9.41209858612358, "learning_rate": 7.15468130412303e-07, "loss": 0.1169, "step": 455 }, { "epoch": 0.7574750830564784, "grad_norm": 10.230916740772512, "learning_rate": 7.142527333851833e-07, "loss": 0.2093, "step": 456 }, { "epoch": 0.7591362126245847, "grad_norm": 7.965188922429285, "learning_rate": 7.130357836702577e-07, "loss": 0.114, "step": 457 }, { "epoch": 0.760797342192691, "grad_norm": 6.820864692461988, "learning_rate": 7.118172900867508e-07, "loss": 0.1279, "step": 458 }, { "epoch": 0.7624584717607974, "grad_norm": 9.896346871574128, "learning_rate": 7.105972614650756e-07, "loss": 0.1915, "step": 459 }, { "epoch": 0.7641196013289037, "grad_norm": 13.402974322971838, "learning_rate": 7.093757066467696e-07, "loss": 0.1564, "step": 460 }, { "epoch": 0.7657807308970099, "grad_norm": 8.969620661819997, "learning_rate": 7.081526344844305e-07, "loss": 0.1348, "step": 461 }, { "epoch": 0.7674418604651163, "grad_norm": 7.901573755091921, "learning_rate": 7.069280538416524e-07, "loss": 0.117, "step": 462 }, { "epoch": 0.7691029900332226, "grad_norm": 8.676435505228378, "learning_rate": 7.05701973592961e-07, "loss": 0.1312, "step": 463 }, { "epoch": 0.770764119601329, "grad_norm": 8.312745149516928, "learning_rate": 7.044744026237499e-07, "loss": 0.1163, "step": 464 }, { "epoch": 0.7724252491694352, "grad_norm": 8.29093146988122, "learning_rate": 7.03245349830216e-07, "loss": 0.1253, "step": 465 }, { "epoch": 0.7740863787375415, "grad_norm": 10.991996181440621, "learning_rate": 7.020148241192945e-07, "loss": 0.1426, "step": 466 }, { "epoch": 0.7757475083056479, "grad_norm": 9.665978609673429, "learning_rate": 7.007828344085958e-07, "loss": 0.116, "step": 467 }, { "epoch": 0.7774086378737541, "grad_norm": 12.673357083226641, "learning_rate": 6.995493896263385e-07, "loss": 0.1128, "step": 468 }, { "epoch": 0.7790697674418605, "grad_norm": 16.485475962652067, "learning_rate": 6.983144987112875e-07, "loss": 0.2125, "step": 469 }, { "epoch": 0.7807308970099668, "grad_norm": 9.751397196349588, "learning_rate": 6.970781706126864e-07, "loss": 0.1438, "step": 470 }, { "epoch": 0.782392026578073, "grad_norm": 15.35224234314337, "learning_rate": 6.958404142901956e-07, "loss": 0.1653, "step": 471 }, { "epoch": 0.7840531561461794, "grad_norm": 12.390405950401401, "learning_rate": 6.946012387138247e-07, "loss": 0.1534, "step": 472 }, { "epoch": 0.7857142857142857, "grad_norm": 8.927360461817429, "learning_rate": 6.933606528638689e-07, "loss": 0.1109, "step": 473 }, { "epoch": 0.7873754152823921, "grad_norm": 13.299787573189752, "learning_rate": 6.921186657308439e-07, "loss": 0.179, "step": 474 }, { "epoch": 0.7890365448504983, "grad_norm": 8.06526614276188, "learning_rate": 6.9087528631542e-07, "loss": 0.1337, "step": 475 }, { "epoch": 0.7906976744186046, "grad_norm": 8.042198670013263, "learning_rate": 6.89630523628358e-07, "loss": 0.1081, "step": 476 }, { "epoch": 0.792358803986711, "grad_norm": 9.692237559854625, "learning_rate": 6.883843866904426e-07, "loss": 0.1177, "step": 477 }, { "epoch": 0.7940199335548173, "grad_norm": 9.344382237225672, "learning_rate": 6.87136884532418e-07, "loss": 0.1255, "step": 478 }, { "epoch": 0.7956810631229236, "grad_norm": 8.203203552366837, "learning_rate": 6.858880261949224e-07, "loss": 0.1308, "step": 479 }, { "epoch": 0.7973421926910299, "grad_norm": 7.069561317398428, "learning_rate": 6.84637820728422e-07, "loss": 0.1177, "step": 480 }, { "epoch": 0.7990033222591362, "grad_norm": 12.651184018556993, "learning_rate": 6.833862771931452e-07, "loss": 0.1717, "step": 481 }, { "epoch": 0.8006644518272426, "grad_norm": 7.53322640696838, "learning_rate": 6.82133404659018e-07, "loss": 0.132, "step": 482 }, { "epoch": 0.8023255813953488, "grad_norm": 8.74434892318003, "learning_rate": 6.808792122055973e-07, "loss": 0.144, "step": 483 }, { "epoch": 0.8039867109634552, "grad_norm": 10.42589485523522, "learning_rate": 6.796237089220057e-07, "loss": 0.1394, "step": 484 }, { "epoch": 0.8056478405315615, "grad_norm": 11.906093609013176, "learning_rate": 6.783669039068652e-07, "loss": 0.1599, "step": 485 }, { "epoch": 0.8073089700996677, "grad_norm": 11.119016103511091, "learning_rate": 6.771088062682312e-07, "loss": 0.1454, "step": 486 }, { "epoch": 0.8089700996677741, "grad_norm": 6.861287582124284, "learning_rate": 6.758494251235274e-07, "loss": 0.0874, "step": 487 }, { "epoch": 0.8106312292358804, "grad_norm": 22.336812147687517, "learning_rate": 6.745887695994783e-07, "loss": 0.2066, "step": 488 }, { "epoch": 0.8122923588039868, "grad_norm": 10.480117099044136, "learning_rate": 6.733268488320442e-07, "loss": 0.1989, "step": 489 }, { "epoch": 0.813953488372093, "grad_norm": 7.915417258802982, "learning_rate": 6.720636719663549e-07, "loss": 0.0994, "step": 490 }, { "epoch": 0.8156146179401993, "grad_norm": 6.0203241786289015, "learning_rate": 6.707992481566426e-07, "loss": 0.0882, "step": 491 }, { "epoch": 0.8172757475083057, "grad_norm": 10.519130887832892, "learning_rate": 6.695335865661763e-07, "loss": 0.1457, "step": 492 }, { "epoch": 0.8189368770764119, "grad_norm": 10.88189785815634, "learning_rate": 6.682666963671953e-07, "loss": 0.1381, "step": 493 }, { "epoch": 0.8205980066445183, "grad_norm": 8.410449806885296, "learning_rate": 6.669985867408421e-07, "loss": 0.1285, "step": 494 }, { "epoch": 0.8222591362126246, "grad_norm": 10.63153721731556, "learning_rate": 6.657292668770973e-07, "loss": 0.1344, "step": 495 }, { "epoch": 0.8239202657807309, "grad_norm": 7.858722492263171, "learning_rate": 6.644587459747113e-07, "loss": 0.1392, "step": 496 }, { "epoch": 0.8255813953488372, "grad_norm": 12.495414541553474, "learning_rate": 6.631870332411387e-07, "loss": 0.1249, "step": 497 }, { "epoch": 0.8272425249169435, "grad_norm": 8.69182573599952, "learning_rate": 6.619141378924714e-07, "loss": 0.1069, "step": 498 }, { "epoch": 0.8289036544850499, "grad_norm": 11.19610726093013, "learning_rate": 6.606400691533715e-07, "loss": 0.1561, "step": 499 }, { "epoch": 0.8305647840531561, "grad_norm": 11.60589510557093, "learning_rate": 6.593648362570045e-07, "loss": 0.1657, "step": 500 }, { "epoch": 0.8322259136212624, "grad_norm": 8.967078348213883, "learning_rate": 6.580884484449733e-07, "loss": 0.1476, "step": 501 }, { "epoch": 0.8338870431893688, "grad_norm": 8.046078745336114, "learning_rate": 6.568109149672496e-07, "loss": 0.1536, "step": 502 }, { "epoch": 0.8355481727574751, "grad_norm": 10.11126212003755, "learning_rate": 6.555322450821081e-07, "loss": 0.1772, "step": 503 }, { "epoch": 0.8372093023255814, "grad_norm": 22.00811087279718, "learning_rate": 6.542524480560588e-07, "loss": 0.196, "step": 504 }, { "epoch": 0.8388704318936877, "grad_norm": 9.729933636134202, "learning_rate": 6.529715331637804e-07, "loss": 0.1218, "step": 505 }, { "epoch": 0.840531561461794, "grad_norm": 20.546621479120287, "learning_rate": 6.516895096880529e-07, "loss": 0.1806, "step": 506 }, { "epoch": 0.8421926910299004, "grad_norm": 8.302645527868407, "learning_rate": 6.504063869196897e-07, "loss": 0.1164, "step": 507 }, { "epoch": 0.8438538205980066, "grad_norm": 18.63430523045363, "learning_rate": 6.491221741574711e-07, "loss": 0.2653, "step": 508 }, { "epoch": 0.845514950166113, "grad_norm": 5.795031890334442, "learning_rate": 6.478368807080763e-07, "loss": 0.0734, "step": 509 }, { "epoch": 0.8471760797342193, "grad_norm": 7.3352460848157754, "learning_rate": 6.465505158860165e-07, "loss": 0.1188, "step": 510 }, { "epoch": 0.8488372093023255, "grad_norm": 8.230664802958891, "learning_rate": 6.452630890135672e-07, "loss": 0.1376, "step": 511 }, { "epoch": 0.8504983388704319, "grad_norm": 10.861378955756326, "learning_rate": 6.439746094207004e-07, "loss": 0.1895, "step": 512 }, { "epoch": 0.8521594684385382, "grad_norm": 7.801899185147919, "learning_rate": 6.426850864450168e-07, "loss": 0.0992, "step": 513 }, { "epoch": 0.8538205980066446, "grad_norm": 8.21231391824538, "learning_rate": 6.413945294316794e-07, "loss": 0.1277, "step": 514 }, { "epoch": 0.8554817275747508, "grad_norm": 8.182824956563156, "learning_rate": 6.401029477333437e-07, "loss": 0.0903, "step": 515 }, { "epoch": 0.8571428571428571, "grad_norm": 8.908858082334962, "learning_rate": 6.388103507100922e-07, "loss": 0.1044, "step": 516 }, { "epoch": 0.8588039867109635, "grad_norm": 8.858069259669536, "learning_rate": 6.375167477293648e-07, "loss": 0.143, "step": 517 }, { "epoch": 0.8604651162790697, "grad_norm": 7.449561570396506, "learning_rate": 6.362221481658917e-07, "loss": 0.1143, "step": 518 }, { "epoch": 0.8621262458471761, "grad_norm": 10.420657379070615, "learning_rate": 6.349265614016254e-07, "loss": 0.0923, "step": 519 }, { "epoch": 0.8637873754152824, "grad_norm": 6.692843180236105, "learning_rate": 6.336299968256724e-07, "loss": 0.0929, "step": 520 }, { "epoch": 0.8654485049833887, "grad_norm": 13.432306584512084, "learning_rate": 6.323324638342257e-07, "loss": 0.1248, "step": 521 }, { "epoch": 0.867109634551495, "grad_norm": 13.80099877387905, "learning_rate": 6.310339718304965e-07, "loss": 0.1533, "step": 522 }, { "epoch": 0.8687707641196013, "grad_norm": 11.959387915414487, "learning_rate": 6.297345302246452e-07, "loss": 0.1385, "step": 523 }, { "epoch": 0.8704318936877077, "grad_norm": 15.092125999647005, "learning_rate": 6.28434148433715e-07, "loss": 0.2109, "step": 524 }, { "epoch": 0.872093023255814, "grad_norm": 10.776480947820303, "learning_rate": 6.271328358815618e-07, "loss": 0.171, "step": 525 }, { "epoch": 0.8737541528239202, "grad_norm": 8.238618005353674, "learning_rate": 6.258306019987871e-07, "loss": 0.1164, "step": 526 }, { "epoch": 0.8754152823920266, "grad_norm": 26.508653168688756, "learning_rate": 6.245274562226693e-07, "loss": 0.2546, "step": 527 }, { "epoch": 0.8770764119601329, "grad_norm": 18.913079012630988, "learning_rate": 6.232234079970949e-07, "loss": 0.1723, "step": 528 }, { "epoch": 0.8787375415282392, "grad_norm": 15.484934247028333, "learning_rate": 6.219184667724911e-07, "loss": 0.1934, "step": 529 }, { "epoch": 0.8803986710963455, "grad_norm": 11.50624636520952, "learning_rate": 6.20612642005756e-07, "loss": 0.153, "step": 530 }, { "epoch": 0.8820598006644518, "grad_norm": 9.765436764418478, "learning_rate": 6.193059431601909e-07, "loss": 0.1117, "step": 531 }, { "epoch": 0.8837209302325582, "grad_norm": 10.442516383917948, "learning_rate": 6.179983797054321e-07, "loss": 0.1138, "step": 532 }, { "epoch": 0.8853820598006644, "grad_norm": 9.29144855908764, "learning_rate": 6.166899611173808e-07, "loss": 0.1424, "step": 533 }, { "epoch": 0.8870431893687708, "grad_norm": 8.897928846693906, "learning_rate": 6.15380696878136e-07, "loss": 0.1231, "step": 534 }, { "epoch": 0.8887043189368771, "grad_norm": 6.957319586157739, "learning_rate": 6.14070596475925e-07, "loss": 0.1312, "step": 535 }, { "epoch": 0.8903654485049833, "grad_norm": 10.179421163075975, "learning_rate": 6.127596694050345e-07, "loss": 0.1678, "step": 536 }, { "epoch": 0.8920265780730897, "grad_norm": 7.020040317670267, "learning_rate": 6.114479251657425e-07, "loss": 0.0954, "step": 537 }, { "epoch": 0.893687707641196, "grad_norm": 8.473993860366981, "learning_rate": 6.101353732642485e-07, "loss": 0.1449, "step": 538 }, { "epoch": 0.8953488372093024, "grad_norm": 8.652250840430034, "learning_rate": 6.088220232126055e-07, "loss": 0.1063, "step": 539 }, { "epoch": 0.8970099667774086, "grad_norm": 9.534760389427388, "learning_rate": 6.075078845286509e-07, "loss": 0.1728, "step": 540 }, { "epoch": 0.8986710963455149, "grad_norm": 5.776374425659757, "learning_rate": 6.061929667359365e-07, "loss": 0.0742, "step": 541 }, { "epoch": 0.9003322259136213, "grad_norm": 11.015148721611304, "learning_rate": 6.04877279363661e-07, "loss": 0.1788, "step": 542 }, { "epoch": 0.9019933554817275, "grad_norm": 9.473756170184656, "learning_rate": 6.035608319466e-07, "loss": 0.1579, "step": 543 }, { "epoch": 0.9036544850498339, "grad_norm": 11.179037463866795, "learning_rate": 6.02243634025037e-07, "loss": 0.1533, "step": 544 }, { "epoch": 0.9053156146179402, "grad_norm": 7.8758415023645405, "learning_rate": 6.00925695144695e-07, "loss": 0.1146, "step": 545 }, { "epoch": 0.9069767441860465, "grad_norm": 15.665873822759634, "learning_rate": 5.99607024856666e-07, "loss": 0.1047, "step": 546 }, { "epoch": 0.9086378737541528, "grad_norm": 9.200391667118774, "learning_rate": 5.982876327173427e-07, "loss": 0.1272, "step": 547 }, { "epoch": 0.9102990033222591, "grad_norm": 8.77578098706721, "learning_rate": 5.969675282883493e-07, "loss": 0.1516, "step": 548 }, { "epoch": 0.9119601328903655, "grad_norm": 9.320250348637398, "learning_rate": 5.956467211364717e-07, "loss": 0.1387, "step": 549 }, { "epoch": 0.9136212624584718, "grad_norm": 9.952052907950817, "learning_rate": 5.943252208335884e-07, "loss": 0.1403, "step": 550 }, { "epoch": 0.915282392026578, "grad_norm": 5.2002513589385275, "learning_rate": 5.930030369566017e-07, "loss": 0.0565, "step": 551 }, { "epoch": 0.9169435215946844, "grad_norm": 15.488056016488622, "learning_rate": 5.916801790873669e-07, "loss": 0.1978, "step": 552 }, { "epoch": 0.9186046511627907, "grad_norm": 12.637431729375692, "learning_rate": 5.903566568126245e-07, "loss": 0.1326, "step": 553 }, { "epoch": 0.920265780730897, "grad_norm": 9.074527532983787, "learning_rate": 5.890324797239294e-07, "loss": 0.1423, "step": 554 }, { "epoch": 0.9219269102990033, "grad_norm": 9.091722076710525, "learning_rate": 5.877076574175819e-07, "loss": 0.1073, "step": 555 }, { "epoch": 0.9235880398671097, "grad_norm": 8.454862363294326, "learning_rate": 5.86382199494559e-07, "loss": 0.0991, "step": 556 }, { "epoch": 0.925249169435216, "grad_norm": 7.602236450189799, "learning_rate": 5.850561155604429e-07, "loss": 0.1149, "step": 557 }, { "epoch": 0.9269102990033222, "grad_norm": 10.529531993339845, "learning_rate": 5.837294152253533e-07, "loss": 0.1796, "step": 558 }, { "epoch": 0.9285714285714286, "grad_norm": 8.976880430709578, "learning_rate": 5.824021081038767e-07, "loss": 0.1138, "step": 559 }, { "epoch": 0.9302325581395349, "grad_norm": 14.33254707392304, "learning_rate": 5.810742038149966e-07, "loss": 0.1308, "step": 560 }, { "epoch": 0.9318936877076412, "grad_norm": 7.258067172250153, "learning_rate": 5.79745711982025e-07, "loss": 0.09, "step": 561 }, { "epoch": 0.9335548172757475, "grad_norm": 8.172499836134483, "learning_rate": 5.78416642232531e-07, "loss": 0.1044, "step": 562 }, { "epoch": 0.9352159468438538, "grad_norm": 6.594794551088574, "learning_rate": 5.770870041982722e-07, "loss": 0.1254, "step": 563 }, { "epoch": 0.9368770764119602, "grad_norm": 5.9566914133383255, "learning_rate": 5.757568075151249e-07, "loss": 0.0921, "step": 564 }, { "epoch": 0.9385382059800664, "grad_norm": 8.378099614968635, "learning_rate": 5.744260618230133e-07, "loss": 0.1151, "step": 565 }, { "epoch": 0.9401993355481728, "grad_norm": 7.594866350018622, "learning_rate": 5.730947767658404e-07, "loss": 0.0926, "step": 566 }, { "epoch": 0.9418604651162791, "grad_norm": 5.975799246097571, "learning_rate": 5.717629619914185e-07, "loss": 0.0634, "step": 567 }, { "epoch": 0.9435215946843853, "grad_norm": 7.98709361058564, "learning_rate": 5.704306271513981e-07, "loss": 0.0739, "step": 568 }, { "epoch": 0.9451827242524917, "grad_norm": 5.107038268623648, "learning_rate": 5.69097781901199e-07, "loss": 0.0742, "step": 569 }, { "epoch": 0.946843853820598, "grad_norm": 7.691173623087892, "learning_rate": 5.677644358999398e-07, "loss": 0.1137, "step": 570 }, { "epoch": 0.9485049833887044, "grad_norm": 15.079146650209767, "learning_rate": 5.664305988103678e-07, "loss": 0.1334, "step": 571 }, { "epoch": 0.9501661129568106, "grad_norm": 10.468995649492486, "learning_rate": 5.6509628029879e-07, "loss": 0.0933, "step": 572 }, { "epoch": 0.9518272425249169, "grad_norm": 12.307929483854858, "learning_rate": 5.637614900350014e-07, "loss": 0.1288, "step": 573 }, { "epoch": 0.9534883720930233, "grad_norm": 11.854140397945446, "learning_rate": 5.624262376922162e-07, "loss": 0.1043, "step": 574 }, { "epoch": 0.9551495016611296, "grad_norm": 10.655229778108161, "learning_rate": 5.610905329469973e-07, "loss": 0.0992, "step": 575 }, { "epoch": 0.9568106312292359, "grad_norm": 12.54430451998764, "learning_rate": 5.597543854791856e-07, "loss": 0.187, "step": 576 }, { "epoch": 0.9584717607973422, "grad_norm": 10.536342105186373, "learning_rate": 5.584178049718314e-07, "loss": 0.1524, "step": 577 }, { "epoch": 0.9601328903654485, "grad_norm": 13.313735426136686, "learning_rate": 5.570808011111226e-07, "loss": 0.1978, "step": 578 }, { "epoch": 0.9617940199335548, "grad_norm": 7.220442413364977, "learning_rate": 5.557433835863151e-07, "loss": 0.0943, "step": 579 }, { "epoch": 0.9634551495016611, "grad_norm": 23.093365160299122, "learning_rate": 5.544055620896629e-07, "loss": 0.1533, "step": 580 }, { "epoch": 0.9651162790697675, "grad_norm": 11.087199919198286, "learning_rate": 5.530673463163471e-07, "loss": 0.1455, "step": 581 }, { "epoch": 0.9667774086378738, "grad_norm": 10.58290498744938, "learning_rate": 5.517287459644069e-07, "loss": 0.1665, "step": 582 }, { "epoch": 0.96843853820598, "grad_norm": 7.202020662641881, "learning_rate": 5.50389770734668e-07, "loss": 0.0956, "step": 583 }, { "epoch": 0.9700996677740864, "grad_norm": 12.46472530295776, "learning_rate": 5.490504303306727e-07, "loss": 0.1617, "step": 584 }, { "epoch": 0.9717607973421927, "grad_norm": 16.76493326267004, "learning_rate": 5.477107344586101e-07, "loss": 0.1507, "step": 585 }, { "epoch": 0.973421926910299, "grad_norm": 9.237732865929152, "learning_rate": 5.463706928272453e-07, "loss": 0.1412, "step": 586 }, { "epoch": 0.9750830564784053, "grad_norm": 12.383497236276556, "learning_rate": 5.450303151478489e-07, "loss": 0.1493, "step": 587 }, { "epoch": 0.9767441860465116, "grad_norm": 11.151722325366409, "learning_rate": 5.43689611134127e-07, "loss": 0.1412, "step": 588 }, { "epoch": 0.978405315614618, "grad_norm": 7.21013726279382, "learning_rate": 5.423485905021507e-07, "loss": 0.1246, "step": 589 }, { "epoch": 0.9800664451827242, "grad_norm": 8.579739495792525, "learning_rate": 5.410072629702856e-07, "loss": 0.1234, "step": 590 }, { "epoch": 0.9817275747508306, "grad_norm": 7.602281673588693, "learning_rate": 5.396656382591213e-07, "loss": 0.1116, "step": 591 }, { "epoch": 0.9833887043189369, "grad_norm": 14.100145982497999, "learning_rate": 5.38323726091401e-07, "loss": 0.1388, "step": 592 }, { "epoch": 0.9850498338870431, "grad_norm": 7.492618519081229, "learning_rate": 5.369815361919511e-07, "loss": 0.0761, "step": 593 }, { "epoch": 0.9867109634551495, "grad_norm": 6.371586229880432, "learning_rate": 5.356390782876111e-07, "loss": 0.1078, "step": 594 }, { "epoch": 0.9883720930232558, "grad_norm": 9.371673513904842, "learning_rate": 5.342963621071623e-07, "loss": 0.1745, "step": 595 }, { "epoch": 0.9900332225913622, "grad_norm": 5.206481672409249, "learning_rate": 5.329533973812581e-07, "loss": 0.0683, "step": 596 }, { "epoch": 0.9916943521594684, "grad_norm": 14.826520082325914, "learning_rate": 5.316101938423524e-07, "loss": 0.1577, "step": 597 }, { "epoch": 0.9933554817275747, "grad_norm": 10.961715205667316, "learning_rate": 5.302667612246308e-07, "loss": 0.1665, "step": 598 }, { "epoch": 0.9950166112956811, "grad_norm": 14.397029548851924, "learning_rate": 5.28923109263938e-07, "loss": 0.1731, "step": 599 }, { "epoch": 0.9966777408637874, "grad_norm": 8.375566282055818, "learning_rate": 5.275792476977091e-07, "loss": 0.1293, "step": 600 }, { "epoch": 0.9983388704318937, "grad_norm": 11.268520674866318, "learning_rate": 5.262351862648978e-07, "loss": 0.1419, "step": 601 }, { "epoch": 1.0, "grad_norm": 6.784664762230538, "learning_rate": 5.248909347059061e-07, "loss": 0.075, "step": 602 }, { "epoch": 1.0016611295681064, "grad_norm": 4.877956718125895, "learning_rate": 5.235465027625146e-07, "loss": 0.0621, "step": 603 }, { "epoch": 1.0033222591362125, "grad_norm": 4.203005899244808, "learning_rate": 5.2220190017781e-07, "loss": 0.0457, "step": 604 }, { "epoch": 1.004983388704319, "grad_norm": 3.8123959854058103, "learning_rate": 5.208571366961165e-07, "loss": 0.0378, "step": 605 }, { "epoch": 1.0066445182724253, "grad_norm": 5.315954696600741, "learning_rate": 5.195122220629239e-07, "loss": 0.0723, "step": 606 }, { "epoch": 1.0083056478405317, "grad_norm": 3.7805372757538094, "learning_rate": 5.181671660248178e-07, "loss": 0.0298, "step": 607 }, { "epoch": 1.0099667774086378, "grad_norm": 4.0974547093630225, "learning_rate": 5.16821978329408e-07, "loss": 0.0396, "step": 608 }, { "epoch": 1.0116279069767442, "grad_norm": 4.948296436070784, "learning_rate": 5.154766687252591e-07, "loss": 0.0263, "step": 609 }, { "epoch": 1.0132890365448506, "grad_norm": 9.792467737742347, "learning_rate": 5.141312469618183e-07, "loss": 0.0942, "step": 610 }, { "epoch": 1.0149501661129567, "grad_norm": 7.834675461013419, "learning_rate": 5.127857227893465e-07, "loss": 0.0447, "step": 611 }, { "epoch": 1.0166112956810631, "grad_norm": 7.985753475492194, "learning_rate": 5.114401059588464e-07, "loss": 0.0646, "step": 612 }, { "epoch": 1.0182724252491695, "grad_norm": 10.331188978857707, "learning_rate": 5.100944062219917e-07, "loss": 0.0382, "step": 613 }, { "epoch": 1.0199335548172757, "grad_norm": 3.83376887835134, "learning_rate": 5.08748633331058e-07, "loss": 0.0345, "step": 614 }, { "epoch": 1.021594684385382, "grad_norm": 3.8639360019028355, "learning_rate": 5.074027970388499e-07, "loss": 0.0243, "step": 615 }, { "epoch": 1.0232558139534884, "grad_norm": 10.400286609821283, "learning_rate": 5.060569070986324e-07, "loss": 0.0734, "step": 616 }, { "epoch": 1.0249169435215948, "grad_norm": 5.442828090266754, "learning_rate": 5.047109732640586e-07, "loss": 0.0294, "step": 617 }, { "epoch": 1.026578073089701, "grad_norm": 7.870712026341651, "learning_rate": 5.033650052891001e-07, "loss": 0.0301, "step": 618 }, { "epoch": 1.0282392026578073, "grad_norm": 13.585823168789453, "learning_rate": 5.020190129279759e-07, "loss": 0.0988, "step": 619 }, { "epoch": 1.0299003322259137, "grad_norm": 8.796258559095701, "learning_rate": 5.006730059350815e-07, "loss": 0.0468, "step": 620 }, { "epoch": 1.0315614617940199, "grad_norm": 10.44576092302698, "learning_rate": 4.993269940649184e-07, "loss": 0.0714, "step": 621 }, { "epoch": 1.0332225913621262, "grad_norm": 12.352450302810853, "learning_rate": 4.979809870720242e-07, "loss": 0.0478, "step": 622 }, { "epoch": 1.0348837209302326, "grad_norm": 17.75211377150514, "learning_rate": 4.966349947108999e-07, "loss": 0.1147, "step": 623 }, { "epoch": 1.0365448504983388, "grad_norm": 6.156505538276604, "learning_rate": 4.952890267359412e-07, "loss": 0.0478, "step": 624 }, { "epoch": 1.0382059800664452, "grad_norm": 9.218532704370334, "learning_rate": 4.939430929013677e-07, "loss": 0.027, "step": 625 }, { "epoch": 1.0398671096345515, "grad_norm": 7.120527183593547, "learning_rate": 4.925972029611501e-07, "loss": 0.0544, "step": 626 }, { "epoch": 1.041528239202658, "grad_norm": 9.198383686246443, "learning_rate": 4.912513666689421e-07, "loss": 0.0323, "step": 627 }, { "epoch": 1.043189368770764, "grad_norm": 12.663439974096635, "learning_rate": 4.899055937780083e-07, "loss": 0.0445, "step": 628 }, { "epoch": 1.0448504983388704, "grad_norm": 24.085803314752685, "learning_rate": 4.885598940411536e-07, "loss": 0.0655, "step": 629 }, { "epoch": 1.0465116279069768, "grad_norm": 6.902292684354512, "learning_rate": 4.872142772106535e-07, "loss": 0.0326, "step": 630 }, { "epoch": 1.048172757475083, "grad_norm": 6.121287974432574, "learning_rate": 4.858687530381817e-07, "loss": 0.0369, "step": 631 }, { "epoch": 1.0498338870431894, "grad_norm": 8.86764220454998, "learning_rate": 4.845233312747411e-07, "loss": 0.0607, "step": 632 }, { "epoch": 1.0514950166112957, "grad_norm": 9.943461616121928, "learning_rate": 4.831780216705919e-07, "loss": 0.0529, "step": 633 }, { "epoch": 1.053156146179402, "grad_norm": 3.7501260677915313, "learning_rate": 4.818328339751823e-07, "loss": 0.0177, "step": 634 }, { "epoch": 1.0548172757475083, "grad_norm": 3.303220265814859, "learning_rate": 4.804877779370762e-07, "loss": 0.0139, "step": 635 }, { "epoch": 1.0564784053156147, "grad_norm": 7.432956410407962, "learning_rate": 4.791428633038835e-07, "loss": 0.0463, "step": 636 }, { "epoch": 1.058139534883721, "grad_norm": 8.712378765440459, "learning_rate": 4.777980998221901e-07, "loss": 0.0424, "step": 637 }, { "epoch": 1.0598006644518272, "grad_norm": 11.111774138122533, "learning_rate": 4.764534972374855e-07, "loss": 0.0522, "step": 638 }, { "epoch": 1.0614617940199336, "grad_norm": 4.983601758483826, "learning_rate": 4.751090652940938e-07, "loss": 0.0182, "step": 639 }, { "epoch": 1.06312292358804, "grad_norm": 9.89938771331774, "learning_rate": 4.7376481373510217e-07, "loss": 0.0418, "step": 640 }, { "epoch": 1.064784053156146, "grad_norm": 3.1865048946418626, "learning_rate": 4.7242075230229083e-07, "loss": 0.0155, "step": 641 }, { "epoch": 1.0664451827242525, "grad_norm": 10.924670279472195, "learning_rate": 4.71076890736062e-07, "loss": 0.0586, "step": 642 }, { "epoch": 1.0681063122923589, "grad_norm": 4.291225390159846, "learning_rate": 4.6973323877536925e-07, "loss": 0.0206, "step": 643 }, { "epoch": 1.069767441860465, "grad_norm": 7.729879396540719, "learning_rate": 4.6838980615764756e-07, "loss": 0.0442, "step": 644 }, { "epoch": 1.0714285714285714, "grad_norm": 10.4730457424228, "learning_rate": 4.6704660261874195e-07, "loss": 0.0297, "step": 645 }, { "epoch": 1.0730897009966778, "grad_norm": 2.9531809499425465, "learning_rate": 4.657036378928376e-07, "loss": 0.0126, "step": 646 }, { "epoch": 1.0747508305647842, "grad_norm": 6.896419676230295, "learning_rate": 4.643609217123888e-07, "loss": 0.024, "step": 647 }, { "epoch": 1.0764119601328903, "grad_norm": 2.4703995052788392, "learning_rate": 4.630184638080488e-07, "loss": 0.0102, "step": 648 }, { "epoch": 1.0780730897009967, "grad_norm": 10.728604189023233, "learning_rate": 4.616762739085992e-07, "loss": 0.0538, "step": 649 }, { "epoch": 1.079734219269103, "grad_norm": 12.945045897775847, "learning_rate": 4.603343617408787e-07, "loss": 0.0504, "step": 650 } ], "logging_steps": 1, "max_steps": 1204, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 689596271034368.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }