{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.000616903146206, "eval_steps": 203, "global_step": 811, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0012338062924120913, "grad_norm": 4560.17138671875, "learning_rate": 2.0000000000000003e-06, "loss": 56.0062, "step": 1 }, { "epoch": 0.0012338062924120913, "eval_loss": 35.97797393798828, "eval_runtime": 12.2941, "eval_samples_per_second": 111.029, "eval_steps_per_second": 13.909, "step": 1 }, { "epoch": 0.0024676125848241827, "grad_norm": 5592.865234375, "learning_rate": 4.000000000000001e-06, "loss": 33.8818, "step": 2 }, { "epoch": 0.003701418877236274, "grad_norm": 7475.13330078125, "learning_rate": 6e-06, "loss": 41.2805, "step": 3 }, { "epoch": 0.004935225169648365, "grad_norm": 3744.770263671875, "learning_rate": 8.000000000000001e-06, "loss": 16.7301, "step": 4 }, { "epoch": 0.006169031462060457, "grad_norm": 4991.17529296875, "learning_rate": 1e-05, "loss": 34.7618, "step": 5 }, { "epoch": 0.007402837754472548, "grad_norm": 2832.42138671875, "learning_rate": 1.2e-05, "loss": 35.808, "step": 6 }, { "epoch": 0.00863664404688464, "grad_norm": 6010.7431640625, "learning_rate": 1.4000000000000001e-05, "loss": 63.1808, "step": 7 }, { "epoch": 0.00987045033929673, "grad_norm": 4059.252685546875, "learning_rate": 1.6000000000000003e-05, "loss": 35.5838, "step": 8 }, { "epoch": 0.011104256631708822, "grad_norm": 10154.849609375, "learning_rate": 1.8e-05, "loss": 48.4638, "step": 9 }, { "epoch": 0.012338062924120914, "grad_norm": 2368.802978515625, "learning_rate": 2e-05, "loss": 17.3951, "step": 10 }, { "epoch": 0.013571869216533004, "grad_norm": 2871.435302734375, "learning_rate": 2.2000000000000003e-05, "loss": 4.7809, "step": 11 }, { "epoch": 0.014805675508945095, "grad_norm": 4488.845703125, "learning_rate": 2.4e-05, "loss": 28.9825, "step": 12 }, { "epoch": 0.016039481801357187, "grad_norm": 2447.83935546875, "learning_rate": 2.6000000000000002e-05, "loss": 35.9274, "step": 13 }, { "epoch": 0.01727328809376928, "grad_norm": 3905.12451171875, "learning_rate": 2.8000000000000003e-05, "loss": 33.2547, "step": 14 }, { "epoch": 0.01850709438618137, "grad_norm": 9355.744140625, "learning_rate": 3e-05, "loss": 30.8486, "step": 15 }, { "epoch": 0.01974090067859346, "grad_norm": 2112.064453125, "learning_rate": 3.2000000000000005e-05, "loss": 20.9453, "step": 16 }, { "epoch": 0.020974706971005553, "grad_norm": 2881.35693359375, "learning_rate": 3.4000000000000007e-05, "loss": 25.8971, "step": 17 }, { "epoch": 0.022208513263417645, "grad_norm": 3045.595947265625, "learning_rate": 3.6e-05, "loss": 29.2871, "step": 18 }, { "epoch": 0.023442319555829736, "grad_norm": 7254.01025390625, "learning_rate": 3.8e-05, "loss": 4.6358, "step": 19 }, { "epoch": 0.024676125848241828, "grad_norm": 3501.96435546875, "learning_rate": 4e-05, "loss": 26.5743, "step": 20 }, { "epoch": 0.025909932140653916, "grad_norm": 1327.111572265625, "learning_rate": 4.2e-05, "loss": 4.5889, "step": 21 }, { "epoch": 0.027143738433066007, "grad_norm": 1893.3087158203125, "learning_rate": 4.4000000000000006e-05, "loss": 14.7384, "step": 22 }, { "epoch": 0.0283775447254781, "grad_norm": 1726.1190185546875, "learning_rate": 4.600000000000001e-05, "loss": 9.0423, "step": 23 }, { "epoch": 0.02961135101789019, "grad_norm": 1904.666259765625, "learning_rate": 4.8e-05, "loss": 12.7261, "step": 24 }, { "epoch": 0.030845157310302282, "grad_norm": 2148.897705078125, "learning_rate": 5e-05, "loss": 9.9565, "step": 25 }, { "epoch": 0.032078963602714373, "grad_norm": 848.7877197265625, "learning_rate": 5.2000000000000004e-05, "loss": 9.052, "step": 26 }, { "epoch": 0.033312769895126465, "grad_norm": 2708.355224609375, "learning_rate": 5.4000000000000005e-05, "loss": 10.5259, "step": 27 }, { "epoch": 0.03454657618753856, "grad_norm": 1201.4300537109375, "learning_rate": 5.6000000000000006e-05, "loss": 5.9977, "step": 28 }, { "epoch": 0.03578038247995065, "grad_norm": 975.587890625, "learning_rate": 5.8e-05, "loss": 6.1963, "step": 29 }, { "epoch": 0.03701418877236274, "grad_norm": 1623.7308349609375, "learning_rate": 6e-05, "loss": 5.802, "step": 30 }, { "epoch": 0.03824799506477483, "grad_norm": 1643.6309814453125, "learning_rate": 6.2e-05, "loss": 7.1271, "step": 31 }, { "epoch": 0.03948180135718692, "grad_norm": 1294.16943359375, "learning_rate": 6.400000000000001e-05, "loss": 6.4117, "step": 32 }, { "epoch": 0.040715607649599014, "grad_norm": 1482.4114990234375, "learning_rate": 6.6e-05, "loss": 7.4928, "step": 33 }, { "epoch": 0.041949413942011106, "grad_norm": 3359.637451171875, "learning_rate": 6.800000000000001e-05, "loss": 10.1592, "step": 34 }, { "epoch": 0.0431832202344232, "grad_norm": 1437.229248046875, "learning_rate": 7e-05, "loss": 9.848, "step": 35 }, { "epoch": 0.04441702652683529, "grad_norm": 1842.515380859375, "learning_rate": 7.2e-05, "loss": 7.6048, "step": 36 }, { "epoch": 0.04565083281924738, "grad_norm": 1191.092529296875, "learning_rate": 7.4e-05, "loss": 7.0469, "step": 37 }, { "epoch": 0.04688463911165947, "grad_norm": 791.5805053710938, "learning_rate": 7.6e-05, "loss": 8.0601, "step": 38 }, { "epoch": 0.048118445404071564, "grad_norm": 3307.971923828125, "learning_rate": 7.800000000000001e-05, "loss": 8.5344, "step": 39 }, { "epoch": 0.049352251696483655, "grad_norm": 897.6090698242188, "learning_rate": 8e-05, "loss": 8.3129, "step": 40 }, { "epoch": 0.05058605798889575, "grad_norm": 1446.93310546875, "learning_rate": 8.2e-05, "loss": 8.1552, "step": 41 }, { "epoch": 0.05181986428130783, "grad_norm": 1532.832763671875, "learning_rate": 8.4e-05, "loss": 6.3817, "step": 42 }, { "epoch": 0.05305367057371992, "grad_norm": 1702.4017333984375, "learning_rate": 8.6e-05, "loss": 7.5822, "step": 43 }, { "epoch": 0.054287476866132015, "grad_norm": 1295.9134521484375, "learning_rate": 8.800000000000001e-05, "loss": 6.7747, "step": 44 }, { "epoch": 0.055521283158544106, "grad_norm": 1218.6241455078125, "learning_rate": 9e-05, "loss": 6.4545, "step": 45 }, { "epoch": 0.0567550894509562, "grad_norm": 3025.362060546875, "learning_rate": 9.200000000000001e-05, "loss": 4.7579, "step": 46 }, { "epoch": 0.05798889574336829, "grad_norm": 2290.4189453125, "learning_rate": 9.4e-05, "loss": 11.1638, "step": 47 }, { "epoch": 0.05922270203578038, "grad_norm": 1046.950927734375, "learning_rate": 9.6e-05, "loss": 8.4257, "step": 48 }, { "epoch": 0.06045650832819247, "grad_norm": 1667.9334716796875, "learning_rate": 9.8e-05, "loss": 6.9188, "step": 49 }, { "epoch": 0.061690314620604564, "grad_norm": 1744.4573974609375, "learning_rate": 0.0001, "loss": 6.8408, "step": 50 }, { "epoch": 0.06292412091301666, "grad_norm": 1064.3636474609375, "learning_rate": 9.99995739409215e-05, "loss": 6.5595, "step": 51 }, { "epoch": 0.06415792720542875, "grad_norm": 1446.072021484375, "learning_rate": 9.999829577094701e-05, "loss": 6.8713, "step": 52 }, { "epoch": 0.06539173349784085, "grad_norm": 895.4334106445312, "learning_rate": 9.999616551185958e-05, "loss": 5.9296, "step": 53 }, { "epoch": 0.06662553979025293, "grad_norm": 1768.35009765625, "learning_rate": 9.999318319996388e-05, "loss": 5.1248, "step": 54 }, { "epoch": 0.06785934608266501, "grad_norm": 1274.9132080078125, "learning_rate": 9.998934888608553e-05, "loss": 4.9814, "step": 55 }, { "epoch": 0.06909315237507711, "grad_norm": 4490.84326171875, "learning_rate": 9.998466263557031e-05, "loss": 6.0693, "step": 56 }, { "epoch": 0.0703269586674892, "grad_norm": 1595.7825927734375, "learning_rate": 9.9979124528283e-05, "loss": 5.5088, "step": 57 }, { "epoch": 0.0715607649599013, "grad_norm": 1278.6317138671875, "learning_rate": 9.997273465860601e-05, "loss": 5.2405, "step": 58 }, { "epoch": 0.07279457125231338, "grad_norm": 2358.891845703125, "learning_rate": 9.996549313543788e-05, "loss": 5.8857, "step": 59 }, { "epoch": 0.07402837754472548, "grad_norm": 1924.482666015625, "learning_rate": 9.99574000821912e-05, "loss": 5.3205, "step": 60 }, { "epoch": 0.07526218383713756, "grad_norm": 2458.82568359375, "learning_rate": 9.99484556367908e-05, "loss": 7.0119, "step": 61 }, { "epoch": 0.07649599012954966, "grad_norm": 2840.022216796875, "learning_rate": 9.993865995167112e-05, "loss": 6.2273, "step": 62 }, { "epoch": 0.07772979642196175, "grad_norm": 1267.5972900390625, "learning_rate": 9.992801319377379e-05, "loss": 5.461, "step": 63 }, { "epoch": 0.07896360271437385, "grad_norm": 3967.05029296875, "learning_rate": 9.991651554454473e-05, "loss": 5.3853, "step": 64 }, { "epoch": 0.08019740900678593, "grad_norm": 1956.1412353515625, "learning_rate": 9.990416719993105e-05, "loss": 6.6339, "step": 65 }, { "epoch": 0.08143121529919803, "grad_norm": 2096.103515625, "learning_rate": 9.989096837037772e-05, "loss": 6.3398, "step": 66 }, { "epoch": 0.08266502159161011, "grad_norm": 1360.98779296875, "learning_rate": 9.9876919280824e-05, "loss": 4.387, "step": 67 }, { "epoch": 0.08389882788402221, "grad_norm": 1470.181396484375, "learning_rate": 9.986202017069956e-05, "loss": 4.9476, "step": 68 }, { "epoch": 0.0851326341764343, "grad_norm": 1192.005615234375, "learning_rate": 9.984627129392044e-05, "loss": 4.6936, "step": 69 }, { "epoch": 0.0863664404688464, "grad_norm": 1485.0313720703125, "learning_rate": 9.982967291888474e-05, "loss": 7.158, "step": 70 }, { "epoch": 0.08760024676125848, "grad_norm": 1102.8905029296875, "learning_rate": 9.981222532846799e-05, "loss": 4.266, "step": 71 }, { "epoch": 0.08883405305367058, "grad_norm": 870.322021484375, "learning_rate": 9.979392882001835e-05, "loss": 5.5898, "step": 72 }, { "epoch": 0.09006785934608266, "grad_norm": 1322.6845703125, "learning_rate": 9.977478370535156e-05, "loss": 5.3145, "step": 73 }, { "epoch": 0.09130166563849476, "grad_norm": 2103.114990234375, "learning_rate": 9.975479031074563e-05, "loss": 6.9281, "step": 74 }, { "epoch": 0.09253547193090685, "grad_norm": 883.7548217773438, "learning_rate": 9.973394897693524e-05, "loss": 5.567, "step": 75 }, { "epoch": 0.09376927822331894, "grad_norm": 1145.715087890625, "learning_rate": 9.971226005910596e-05, "loss": 5.1996, "step": 76 }, { "epoch": 0.09500308451573103, "grad_norm": 1630.5809326171875, "learning_rate": 9.968972392688824e-05, "loss": 5.048, "step": 77 }, { "epoch": 0.09623689080814313, "grad_norm": 1148.510986328125, "learning_rate": 9.966634096435101e-05, "loss": 4.4982, "step": 78 }, { "epoch": 0.09747069710055521, "grad_norm": 881.5775756835938, "learning_rate": 9.964211156999519e-05, "loss": 5.1363, "step": 79 }, { "epoch": 0.09870450339296731, "grad_norm": 914.3575439453125, "learning_rate": 9.961703615674694e-05, "loss": 4.9005, "step": 80 }, { "epoch": 0.0999383096853794, "grad_norm": 1390.382080078125, "learning_rate": 9.959111515195055e-05, "loss": 6.2021, "step": 81 }, { "epoch": 0.1011721159777915, "grad_norm": 1237.8050537109375, "learning_rate": 9.956434899736119e-05, "loss": 5.9765, "step": 82 }, { "epoch": 0.10240592227020358, "grad_norm": 713.2557983398438, "learning_rate": 9.95367381491374e-05, "loss": 4.3175, "step": 83 }, { "epoch": 0.10363972856261566, "grad_norm": 1391.5262451171875, "learning_rate": 9.950828307783328e-05, "loss": 5.1451, "step": 84 }, { "epoch": 0.10487353485502776, "grad_norm": 879.743896484375, "learning_rate": 9.947898426839048e-05, "loss": 5.5813, "step": 85 }, { "epoch": 0.10610734114743985, "grad_norm": 675.2743530273438, "learning_rate": 9.944884222012994e-05, "loss": 4.1802, "step": 86 }, { "epoch": 0.10734114743985194, "grad_norm": 860.4559326171875, "learning_rate": 9.941785744674344e-05, "loss": 5.1935, "step": 87 }, { "epoch": 0.10857495373226403, "grad_norm": 1170.777099609375, "learning_rate": 9.938603047628468e-05, "loss": 4.4537, "step": 88 }, { "epoch": 0.10980876002467613, "grad_norm": 1625.8270263671875, "learning_rate": 9.935336185116048e-05, "loss": 4.6682, "step": 89 }, { "epoch": 0.11104256631708821, "grad_norm": 823.9100341796875, "learning_rate": 9.93198521281214e-05, "loss": 5.2064, "step": 90 }, { "epoch": 0.11227637260950031, "grad_norm": 1431.343994140625, "learning_rate": 9.928550187825234e-05, "loss": 5.0934, "step": 91 }, { "epoch": 0.1135101789019124, "grad_norm": 919.2091064453125, "learning_rate": 9.925031168696268e-05, "loss": 4.7349, "step": 92 }, { "epoch": 0.1147439851943245, "grad_norm": 1216.918701171875, "learning_rate": 9.92142821539765e-05, "loss": 4.8568, "step": 93 }, { "epoch": 0.11597779148673658, "grad_norm": 1565.6661376953125, "learning_rate": 9.917741389332212e-05, "loss": 5.2424, "step": 94 }, { "epoch": 0.11721159777914868, "grad_norm": 1420.35595703125, "learning_rate": 9.913970753332188e-05, "loss": 5.101, "step": 95 }, { "epoch": 0.11844540407156076, "grad_norm": 1778.9505615234375, "learning_rate": 9.910116371658122e-05, "loss": 4.8264, "step": 96 }, { "epoch": 0.11967921036397286, "grad_norm": 1399.1075439453125, "learning_rate": 9.90617830999779e-05, "loss": 5.3497, "step": 97 }, { "epoch": 0.12091301665638494, "grad_norm": 2120.199951171875, "learning_rate": 9.902156635465066e-05, "loss": 4.1041, "step": 98 }, { "epoch": 0.12214682294879704, "grad_norm": 803.6605224609375, "learning_rate": 9.89805141659879e-05, "loss": 4.1576, "step": 99 }, { "epoch": 0.12338062924120913, "grad_norm": 1080.4222412109375, "learning_rate": 9.893862723361588e-05, "loss": 4.4528, "step": 100 }, { "epoch": 0.12461443553362123, "grad_norm": 794.2058715820312, "learning_rate": 9.889590627138699e-05, "loss": 4.0517, "step": 101 }, { "epoch": 0.12584824182603332, "grad_norm": 769.0142822265625, "learning_rate": 9.885235200736731e-05, "loss": 4.4746, "step": 102 }, { "epoch": 0.1270820481184454, "grad_norm": 861.2747192382812, "learning_rate": 9.880796518382447e-05, "loss": 4.4587, "step": 103 }, { "epoch": 0.1283158544108575, "grad_norm": 850.0379638671875, "learning_rate": 9.87627465572148e-05, "loss": 4.8245, "step": 104 }, { "epoch": 0.1295496607032696, "grad_norm": 856.2617797851562, "learning_rate": 9.871669689817058e-05, "loss": 4.8164, "step": 105 }, { "epoch": 0.1307834669956817, "grad_norm": 861.1815795898438, "learning_rate": 9.866981699148682e-05, "loss": 4.1493, "step": 106 }, { "epoch": 0.13201727328809376, "grad_norm": 901.6146850585938, "learning_rate": 9.86221076361079e-05, "loss": 4.9544, "step": 107 }, { "epoch": 0.13325107958050586, "grad_norm": 1420.4744873046875, "learning_rate": 9.857356964511399e-05, "loss": 5.2153, "step": 108 }, { "epoch": 0.13448488587291796, "grad_norm": 941.40185546875, "learning_rate": 9.852420384570718e-05, "loss": 4.7888, "step": 109 }, { "epoch": 0.13571869216533003, "grad_norm": 1175.920166015625, "learning_rate": 9.84740110791973e-05, "loss": 4.6101, "step": 110 }, { "epoch": 0.13695249845774213, "grad_norm": 806.5234985351562, "learning_rate": 9.842299220098774e-05, "loss": 4.252, "step": 111 }, { "epoch": 0.13818630475015423, "grad_norm": 775.3392944335938, "learning_rate": 9.837114808056074e-05, "loss": 4.0434, "step": 112 }, { "epoch": 0.13942011104256632, "grad_norm": 732.814453125, "learning_rate": 9.831847960146263e-05, "loss": 4.6918, "step": 113 }, { "epoch": 0.1406539173349784, "grad_norm": 816.4066772460938, "learning_rate": 9.826498766128875e-05, "loss": 4.4245, "step": 114 }, { "epoch": 0.1418877236273905, "grad_norm": 1168.6640625, "learning_rate": 9.821067317166819e-05, "loss": 4.6703, "step": 115 }, { "epoch": 0.1431215299198026, "grad_norm": 1048.36328125, "learning_rate": 9.815553705824816e-05, "loss": 5.0383, "step": 116 }, { "epoch": 0.1443553362122147, "grad_norm": 1430.2000732421875, "learning_rate": 9.809958026067838e-05, "loss": 4.5511, "step": 117 }, { "epoch": 0.14558914250462676, "grad_norm": 950.8472900390625, "learning_rate": 9.804280373259489e-05, "loss": 4.6968, "step": 118 }, { "epoch": 0.14682294879703886, "grad_norm": 875.76904296875, "learning_rate": 9.79852084416039e-05, "loss": 4.1999, "step": 119 }, { "epoch": 0.14805675508945096, "grad_norm": 733.0322265625, "learning_rate": 9.792679536926525e-05, "loss": 4.2898, "step": 120 }, { "epoch": 0.14929056138186306, "grad_norm": 640.2374267578125, "learning_rate": 9.786756551107578e-05, "loss": 4.6612, "step": 121 }, { "epoch": 0.15052436767427513, "grad_norm": 1187.465576171875, "learning_rate": 9.780751987645222e-05, "loss": 4.9567, "step": 122 }, { "epoch": 0.15175817396668723, "grad_norm": 852.5482177734375, "learning_rate": 9.774665948871408e-05, "loss": 5.1155, "step": 123 }, { "epoch": 0.15299198025909932, "grad_norm": 697.5789794921875, "learning_rate": 9.768498538506618e-05, "loss": 4.1774, "step": 124 }, { "epoch": 0.15422578655151142, "grad_norm": 661.68115234375, "learning_rate": 9.762249861658099e-05, "loss": 4.068, "step": 125 }, { "epoch": 0.1554595928439235, "grad_norm": 1018.1265869140625, "learning_rate": 9.755920024818074e-05, "loss": 4.47, "step": 126 }, { "epoch": 0.1566933991363356, "grad_norm": 934.083251953125, "learning_rate": 9.749509135861917e-05, "loss": 4.2238, "step": 127 }, { "epoch": 0.1579272054287477, "grad_norm": 954.9738159179688, "learning_rate": 9.743017304046329e-05, "loss": 4.3781, "step": 128 }, { "epoch": 0.1591610117211598, "grad_norm": 660.1015014648438, "learning_rate": 9.736444640007462e-05, "loss": 4.4627, "step": 129 }, { "epoch": 0.16039481801357186, "grad_norm": 916.5843505859375, "learning_rate": 9.729791255759045e-05, "loss": 4.8595, "step": 130 }, { "epoch": 0.16162862430598396, "grad_norm": 613.2633666992188, "learning_rate": 9.72305726469047e-05, "loss": 4.3733, "step": 131 }, { "epoch": 0.16286243059839606, "grad_norm": 716.1300659179688, "learning_rate": 9.716242781564855e-05, "loss": 4.3151, "step": 132 }, { "epoch": 0.16409623689080816, "grad_norm": 787.4788818359375, "learning_rate": 9.7093479225171e-05, "loss": 4.3675, "step": 133 }, { "epoch": 0.16533004318322023, "grad_norm": 683.7646484375, "learning_rate": 9.702372805051893e-05, "loss": 4.3018, "step": 134 }, { "epoch": 0.16656384947563233, "grad_norm": 683.65380859375, "learning_rate": 9.69531754804172e-05, "loss": 4.1101, "step": 135 }, { "epoch": 0.16779765576804442, "grad_norm": 893.3405151367188, "learning_rate": 9.688182271724834e-05, "loss": 4.4741, "step": 136 }, { "epoch": 0.16903146206045652, "grad_norm": 2627.186767578125, "learning_rate": 9.680967097703205e-05, "loss": 4.4923, "step": 137 }, { "epoch": 0.1702652683528686, "grad_norm": 951.4346313476562, "learning_rate": 9.673672148940445e-05, "loss": 4.2935, "step": 138 }, { "epoch": 0.1714990746452807, "grad_norm": 1141.7415771484375, "learning_rate": 9.666297549759727e-05, "loss": 4.2769, "step": 139 }, { "epoch": 0.1727328809376928, "grad_norm": 706.809326171875, "learning_rate": 9.658843425841642e-05, "loss": 4.4514, "step": 140 }, { "epoch": 0.17396668723010486, "grad_norm": 763.8978881835938, "learning_rate": 9.651309904222078e-05, "loss": 4.3747, "step": 141 }, { "epoch": 0.17520049352251696, "grad_norm": 644.3199462890625, "learning_rate": 9.64369711329005e-05, "loss": 3.9941, "step": 142 }, { "epoch": 0.17643429981492906, "grad_norm": 1079.9039306640625, "learning_rate": 9.636005182785502e-05, "loss": 4.06, "step": 143 }, { "epoch": 0.17766810610734116, "grad_norm": 1368.89990234375, "learning_rate": 9.628234243797107e-05, "loss": 4.5469, "step": 144 }, { "epoch": 0.17890191239975323, "grad_norm": 713.904296875, "learning_rate": 9.62038442876003e-05, "loss": 4.872, "step": 145 }, { "epoch": 0.18013571869216533, "grad_norm": 670.1369018554688, "learning_rate": 9.612455871453669e-05, "loss": 4.5054, "step": 146 }, { "epoch": 0.18136952498457742, "grad_norm": 758.7835083007812, "learning_rate": 9.604448706999378e-05, "loss": 4.2458, "step": 147 }, { "epoch": 0.18260333127698952, "grad_norm": 1024.5286865234375, "learning_rate": 9.596363071858161e-05, "loss": 4.461, "step": 148 }, { "epoch": 0.1838371375694016, "grad_norm": 1269.671630859375, "learning_rate": 9.588199103828346e-05, "loss": 4.2194, "step": 149 }, { "epoch": 0.1850709438618137, "grad_norm": 1105.3748779296875, "learning_rate": 9.579956942043243e-05, "loss": 4.7544, "step": 150 }, { "epoch": 0.1863047501542258, "grad_norm": 760.70849609375, "learning_rate": 9.571636726968767e-05, "loss": 4.4033, "step": 151 }, { "epoch": 0.1875385564466379, "grad_norm": 964.02197265625, "learning_rate": 9.563238600401042e-05, "loss": 4.283, "step": 152 }, { "epoch": 0.18877236273904996, "grad_norm": 1978.085693359375, "learning_rate": 9.554762705463993e-05, "loss": 4.3468, "step": 153 }, { "epoch": 0.19000616903146206, "grad_norm": 715.2885131835938, "learning_rate": 9.546209186606898e-05, "loss": 4.0693, "step": 154 }, { "epoch": 0.19123997532387416, "grad_norm": 769.632080078125, "learning_rate": 9.537578189601934e-05, "loss": 4.4345, "step": 155 }, { "epoch": 0.19247378161628625, "grad_norm": 810.5945434570312, "learning_rate": 9.528869861541683e-05, "loss": 4.0155, "step": 156 }, { "epoch": 0.19370758790869833, "grad_norm": 892.734375, "learning_rate": 9.520084350836636e-05, "loss": 4.2127, "step": 157 }, { "epoch": 0.19494139420111042, "grad_norm": 1075.0047607421875, "learning_rate": 9.511221807212655e-05, "loss": 4.301, "step": 158 }, { "epoch": 0.19617520049352252, "grad_norm": 1251.20947265625, "learning_rate": 9.502282381708428e-05, "loss": 4.4876, "step": 159 }, { "epoch": 0.19740900678593462, "grad_norm": 806.9592895507812, "learning_rate": 9.493266226672892e-05, "loss": 4.3804, "step": 160 }, { "epoch": 0.1986428130783467, "grad_norm": 851.3005981445312, "learning_rate": 9.484173495762634e-05, "loss": 3.8664, "step": 161 }, { "epoch": 0.1998766193707588, "grad_norm": 660.2595825195312, "learning_rate": 9.475004343939276e-05, "loss": 3.9425, "step": 162 }, { "epoch": 0.2011104256631709, "grad_norm": 608.1460571289062, "learning_rate": 9.465758927466833e-05, "loss": 4.5709, "step": 163 }, { "epoch": 0.202344231955583, "grad_norm": 712.73681640625, "learning_rate": 9.45643740390905e-05, "loss": 4.0292, "step": 164 }, { "epoch": 0.20357803824799506, "grad_norm": 595.7338256835938, "learning_rate": 9.447039932126717e-05, "loss": 4.0951, "step": 165 }, { "epoch": 0.20481184454040716, "grad_norm": 711.393798828125, "learning_rate": 9.43756667227496e-05, "loss": 3.8687, "step": 166 }, { "epoch": 0.20604565083281925, "grad_norm": 748.5135498046875, "learning_rate": 9.428017785800514e-05, "loss": 4.5947, "step": 167 }, { "epoch": 0.20727945712523133, "grad_norm": 603.4214477539062, "learning_rate": 9.41839343543897e-05, "loss": 4.1573, "step": 168 }, { "epoch": 0.20851326341764342, "grad_norm": 793.157470703125, "learning_rate": 9.408693785212001e-05, "loss": 4.2098, "step": 169 }, { "epoch": 0.20974706971005552, "grad_norm": 743.0506591796875, "learning_rate": 9.398919000424569e-05, "loss": 4.0386, "step": 170 }, { "epoch": 0.21098087600246762, "grad_norm": 544.7698364257812, "learning_rate": 9.389069247662107e-05, "loss": 4.303, "step": 171 }, { "epoch": 0.2122146822948797, "grad_norm": 991.1251220703125, "learning_rate": 9.379144694787678e-05, "loss": 4.0436, "step": 172 }, { "epoch": 0.2134484885872918, "grad_norm": 1470.9853515625, "learning_rate": 9.369145510939114e-05, "loss": 4.2473, "step": 173 }, { "epoch": 0.2146822948797039, "grad_norm": 1087.509033203125, "learning_rate": 9.359071866526139e-05, "loss": 4.0844, "step": 174 }, { "epoch": 0.215916101172116, "grad_norm": 588.9415283203125, "learning_rate": 9.34892393322746e-05, "loss": 4.2084, "step": 175 }, { "epoch": 0.21714990746452806, "grad_norm": 744.3660278320312, "learning_rate": 9.33870188398784e-05, "loss": 4.4122, "step": 176 }, { "epoch": 0.21838371375694016, "grad_norm": 1330.01416015625, "learning_rate": 9.328405893015155e-05, "loss": 3.933, "step": 177 }, { "epoch": 0.21961752004935226, "grad_norm": 723.2584838867188, "learning_rate": 9.318036135777421e-05, "loss": 4.0077, "step": 178 }, { "epoch": 0.22085132634176435, "grad_norm": 833.8857421875, "learning_rate": 9.307592788999809e-05, "loss": 4.4596, "step": 179 }, { "epoch": 0.22208513263417642, "grad_norm": 825.7850952148438, "learning_rate": 9.297076030661622e-05, "loss": 4.1355, "step": 180 }, { "epoch": 0.22331893892658852, "grad_norm": 793.1968994140625, "learning_rate": 9.28648603999328e-05, "loss": 4.144, "step": 181 }, { "epoch": 0.22455274521900062, "grad_norm": 885.459716796875, "learning_rate": 9.275822997473249e-05, "loss": 4.1302, "step": 182 }, { "epoch": 0.22578655151141272, "grad_norm": 1035.5894775390625, "learning_rate": 9.265087084824969e-05, "loss": 4.2419, "step": 183 }, { "epoch": 0.2270203578038248, "grad_norm": 719.035888671875, "learning_rate": 9.254278485013765e-05, "loss": 4.0593, "step": 184 }, { "epoch": 0.2282541640962369, "grad_norm": 731.126953125, "learning_rate": 9.243397382243717e-05, "loss": 3.9855, "step": 185 }, { "epoch": 0.229487970388649, "grad_norm": 738.84375, "learning_rate": 9.232443961954531e-05, "loss": 4.1835, "step": 186 }, { "epoch": 0.23072177668106109, "grad_norm": 696.2467651367188, "learning_rate": 9.221418410818374e-05, "loss": 4.0606, "step": 187 }, { "epoch": 0.23195558297347316, "grad_norm": 612.947998046875, "learning_rate": 9.210320916736692e-05, "loss": 4.0575, "step": 188 }, { "epoch": 0.23318938926588526, "grad_norm": 808.7545776367188, "learning_rate": 9.199151668837009e-05, "loss": 3.9909, "step": 189 }, { "epoch": 0.23442319555829735, "grad_norm": 837.6028442382812, "learning_rate": 9.187910857469704e-05, "loss": 4.1157, "step": 190 }, { "epoch": 0.23565700185070945, "grad_norm": 1000.20751953125, "learning_rate": 9.176598674204766e-05, "loss": 4.1318, "step": 191 }, { "epoch": 0.23689080814312152, "grad_norm": 726.634765625, "learning_rate": 9.16521531182853e-05, "loss": 4.4594, "step": 192 }, { "epoch": 0.23812461443553362, "grad_norm": 1218.5831298828125, "learning_rate": 9.15376096434039e-05, "loss": 4.2829, "step": 193 }, { "epoch": 0.23935842072794572, "grad_norm": 1265.02392578125, "learning_rate": 9.142235826949497e-05, "loss": 4.127, "step": 194 }, { "epoch": 0.2405922270203578, "grad_norm": 1306.1739501953125, "learning_rate": 9.13064009607143e-05, "loss": 4.4841, "step": 195 }, { "epoch": 0.2418260333127699, "grad_norm": 995.2982177734375, "learning_rate": 9.11897396932484e-05, "loss": 4.6542, "step": 196 }, { "epoch": 0.243059839605182, "grad_norm": 2460.7783203125, "learning_rate": 9.1072376455281e-05, "loss": 4.328, "step": 197 }, { "epoch": 0.24429364589759409, "grad_norm": 846.62939453125, "learning_rate": 9.0954313246959e-05, "loss": 4.0734, "step": 198 }, { "epoch": 0.24552745219000616, "grad_norm": 878.358154296875, "learning_rate": 9.083555208035847e-05, "loss": 4.3734, "step": 199 }, { "epoch": 0.24676125848241826, "grad_norm": 868.6820678710938, "learning_rate": 9.071609497945036e-05, "loss": 4.3404, "step": 200 }, { "epoch": 0.24799506477483035, "grad_norm": 1059.5777587890625, "learning_rate": 9.059594398006593e-05, "loss": 4.4662, "step": 201 }, { "epoch": 0.24922887106724245, "grad_norm": 945.8322143554688, "learning_rate": 9.047510112986218e-05, "loss": 4.4829, "step": 202 }, { "epoch": 0.25046267735965455, "grad_norm": 1532.6656494140625, "learning_rate": 9.03535684882868e-05, "loss": 4.6308, "step": 203 }, { "epoch": 0.25046267735965455, "eval_loss": 4.406351566314697, "eval_runtime": 12.5602, "eval_samples_per_second": 108.677, "eval_steps_per_second": 13.614, "step": 203 }, { "epoch": 0.25169648365206665, "grad_norm": 1377.272705078125, "learning_rate": 9.023134812654324e-05, "loss": 4.2306, "step": 204 }, { "epoch": 0.2529302899444787, "grad_norm": 1295.0616455078125, "learning_rate": 9.010844212755528e-05, "loss": 4.3758, "step": 205 }, { "epoch": 0.2541640962368908, "grad_norm": 2435.465576171875, "learning_rate": 8.998485258593158e-05, "loss": 4.7894, "step": 206 }, { "epoch": 0.2553979025293029, "grad_norm": 1126.279296875, "learning_rate": 8.986058160792999e-05, "loss": 4.6111, "step": 207 }, { "epoch": 0.256631708821715, "grad_norm": 1509.06884765625, "learning_rate": 8.973563131142164e-05, "loss": 4.4207, "step": 208 }, { "epoch": 0.2578655151141271, "grad_norm": 1990.911865234375, "learning_rate": 8.961000382585487e-05, "loss": 4.4321, "step": 209 }, { "epoch": 0.2590993214065392, "grad_norm": 1448.5911865234375, "learning_rate": 8.94837012922189e-05, "loss": 4.4892, "step": 210 }, { "epoch": 0.2603331276989513, "grad_norm": 1054.68701171875, "learning_rate": 8.935672586300737e-05, "loss": 4.3771, "step": 211 }, { "epoch": 0.2615669339913634, "grad_norm": 928.865234375, "learning_rate": 8.922907970218167e-05, "loss": 4.2268, "step": 212 }, { "epoch": 0.2628007402837754, "grad_norm": 1341.111328125, "learning_rate": 8.910076498513402e-05, "loss": 4.1635, "step": 213 }, { "epoch": 0.2640345465761875, "grad_norm": 1158.7484130859375, "learning_rate": 8.897178389865042e-05, "loss": 4.2772, "step": 214 }, { "epoch": 0.2652683528685996, "grad_norm": 1444.398193359375, "learning_rate": 8.884213864087339e-05, "loss": 4.5948, "step": 215 }, { "epoch": 0.2665021591610117, "grad_norm": 1325.520263671875, "learning_rate": 8.871183142126446e-05, "loss": 4.3144, "step": 216 }, { "epoch": 0.2677359654534238, "grad_norm": 1347.3206787109375, "learning_rate": 8.858086446056664e-05, "loss": 4.3073, "step": 217 }, { "epoch": 0.2689697717458359, "grad_norm": 1085.1033935546875, "learning_rate": 8.84492399907664e-05, "loss": 4.1869, "step": 218 }, { "epoch": 0.270203578038248, "grad_norm": 1085.8934326171875, "learning_rate": 8.831696025505577e-05, "loss": 4.4337, "step": 219 }, { "epoch": 0.27143738433066006, "grad_norm": 894.6118774414062, "learning_rate": 8.818402750779402e-05, "loss": 4.249, "step": 220 }, { "epoch": 0.27267119062307216, "grad_norm": 959.304443359375, "learning_rate": 8.805044401446933e-05, "loss": 4.5136, "step": 221 }, { "epoch": 0.27390499691548426, "grad_norm": 983.3085327148438, "learning_rate": 8.791621205166008e-05, "loss": 4.3945, "step": 222 }, { "epoch": 0.27513880320789635, "grad_norm": 1240.808837890625, "learning_rate": 8.778133390699613e-05, "loss": 4.6812, "step": 223 }, { "epoch": 0.27637260950030845, "grad_norm": 1170.7327880859375, "learning_rate": 8.764581187911979e-05, "loss": 4.3752, "step": 224 }, { "epoch": 0.27760641579272055, "grad_norm": 860.9459228515625, "learning_rate": 8.750964827764671e-05, "loss": 4.3768, "step": 225 }, { "epoch": 0.27884022208513265, "grad_norm": 1130.246337890625, "learning_rate": 8.73728454231264e-05, "loss": 4.3994, "step": 226 }, { "epoch": 0.28007402837754475, "grad_norm": 1062.34619140625, "learning_rate": 8.72354056470028e-05, "loss": 4.5776, "step": 227 }, { "epoch": 0.2813078346699568, "grad_norm": 870.713623046875, "learning_rate": 8.709733129157449e-05, "loss": 4.5764, "step": 228 }, { "epoch": 0.2825416409623689, "grad_norm": 653.0759887695312, "learning_rate": 8.695862470995475e-05, "loss": 4.2891, "step": 229 }, { "epoch": 0.283775447254781, "grad_norm": 2089.304443359375, "learning_rate": 8.681928826603153e-05, "loss": 4.6807, "step": 230 }, { "epoch": 0.2850092535471931, "grad_norm": 776.1245727539062, "learning_rate": 8.667932433442711e-05, "loss": 4.5913, "step": 231 }, { "epoch": 0.2862430598396052, "grad_norm": 761.7719116210938, "learning_rate": 8.653873530045762e-05, "loss": 4.4249, "step": 232 }, { "epoch": 0.2874768661320173, "grad_norm": 1007.0621337890625, "learning_rate": 8.639752356009246e-05, "loss": 4.1843, "step": 233 }, { "epoch": 0.2887106724244294, "grad_norm": 1450.182373046875, "learning_rate": 8.625569151991337e-05, "loss": 4.5278, "step": 234 }, { "epoch": 0.2899444787168415, "grad_norm": 963.1392822265625, "learning_rate": 8.611324159707349e-05, "loss": 4.1958, "step": 235 }, { "epoch": 0.2911782850092535, "grad_norm": 1050.1671142578125, "learning_rate": 8.597017621925613e-05, "loss": 4.2233, "step": 236 }, { "epoch": 0.2924120913016656, "grad_norm": 883.4952392578125, "learning_rate": 8.58264978246334e-05, "loss": 4.5393, "step": 237 }, { "epoch": 0.2936458975940777, "grad_norm": 1134.7784423828125, "learning_rate": 8.568220886182471e-05, "loss": 4.4194, "step": 238 }, { "epoch": 0.2948797038864898, "grad_norm": 1003.4342651367188, "learning_rate": 8.553731178985494e-05, "loss": 4.3341, "step": 239 }, { "epoch": 0.2961135101789019, "grad_norm": 842.8468017578125, "learning_rate": 8.53918090781126e-05, "loss": 4.4834, "step": 240 }, { "epoch": 0.297347316471314, "grad_norm": 905.9857788085938, "learning_rate": 8.524570320630775e-05, "loss": 4.4486, "step": 241 }, { "epoch": 0.2985811227637261, "grad_norm": 1211.843017578125, "learning_rate": 8.509899666442972e-05, "loss": 4.4023, "step": 242 }, { "epoch": 0.2998149290561382, "grad_norm": 1189.910400390625, "learning_rate": 8.495169195270467e-05, "loss": 4.2451, "step": 243 }, { "epoch": 0.30104873534855026, "grad_norm": 919.16845703125, "learning_rate": 8.480379158155299e-05, "loss": 4.322, "step": 244 }, { "epoch": 0.30228254164096235, "grad_norm": 1101.24462890625, "learning_rate": 8.465529807154648e-05, "loss": 4.478, "step": 245 }, { "epoch": 0.30351634793337445, "grad_norm": 716.9551391601562, "learning_rate": 8.450621395336553e-05, "loss": 4.6653, "step": 246 }, { "epoch": 0.30475015422578655, "grad_norm": 940.1488647460938, "learning_rate": 8.435654176775577e-05, "loss": 4.2529, "step": 247 }, { "epoch": 0.30598396051819865, "grad_norm": 1030.68017578125, "learning_rate": 8.420628406548495e-05, "loss": 4.4303, "step": 248 }, { "epoch": 0.30721776681061075, "grad_norm": 681.9510498046875, "learning_rate": 8.405544340729938e-05, "loss": 4.3448, "step": 249 }, { "epoch": 0.30845157310302285, "grad_norm": 645.9469604492188, "learning_rate": 8.390402236388039e-05, "loss": 4.1763, "step": 250 }, { "epoch": 0.3096853793954349, "grad_norm": 806.1895751953125, "learning_rate": 8.375202351580032e-05, "loss": 4.608, "step": 251 }, { "epoch": 0.310919185687847, "grad_norm": 1020.5823974609375, "learning_rate": 8.359944945347879e-05, "loss": 4.2845, "step": 252 }, { "epoch": 0.3121529919802591, "grad_norm": 1052.7481689453125, "learning_rate": 8.344630277713834e-05, "loss": 4.3261, "step": 253 }, { "epoch": 0.3133867982726712, "grad_norm": 1070.864013671875, "learning_rate": 8.329258609676025e-05, "loss": 4.2009, "step": 254 }, { "epoch": 0.3146206045650833, "grad_norm": 658.0210571289062, "learning_rate": 8.313830203204e-05, "loss": 4.3171, "step": 255 }, { "epoch": 0.3158544108574954, "grad_norm": 827.125244140625, "learning_rate": 8.298345321234268e-05, "loss": 4.3799, "step": 256 }, { "epoch": 0.3170882171499075, "grad_norm": 1038.740966796875, "learning_rate": 8.28280422766581e-05, "loss": 4.3107, "step": 257 }, { "epoch": 0.3183220234423196, "grad_norm": 660.5693969726562, "learning_rate": 8.267207187355584e-05, "loss": 4.18, "step": 258 }, { "epoch": 0.3195558297347316, "grad_norm": 907.1415405273438, "learning_rate": 8.251554466114014e-05, "loss": 4.3424, "step": 259 }, { "epoch": 0.3207896360271437, "grad_norm": 1690.822998046875, "learning_rate": 8.235846330700463e-05, "loss": 4.2558, "step": 260 }, { "epoch": 0.3220234423195558, "grad_norm": 1129.4356689453125, "learning_rate": 8.220083048818676e-05, "loss": 4.4873, "step": 261 }, { "epoch": 0.3232572486119679, "grad_norm": 746.98974609375, "learning_rate": 8.20426488911223e-05, "loss": 4.3762, "step": 262 }, { "epoch": 0.32449105490438, "grad_norm": 949.0023193359375, "learning_rate": 8.188392121159945e-05, "loss": 4.187, "step": 263 }, { "epoch": 0.3257248611967921, "grad_norm": 847.7987060546875, "learning_rate": 8.172465015471295e-05, "loss": 4.3078, "step": 264 }, { "epoch": 0.3269586674892042, "grad_norm": 826.9012451171875, "learning_rate": 8.156483843481803e-05, "loss": 4.2838, "step": 265 }, { "epoch": 0.3281924737816163, "grad_norm": 956.7261352539062, "learning_rate": 8.140448877548402e-05, "loss": 4.4443, "step": 266 }, { "epoch": 0.32942628007402835, "grad_norm": 1921.76953125, "learning_rate": 8.124360390944805e-05, "loss": 4.1257, "step": 267 }, { "epoch": 0.33066008636644045, "grad_norm": 870.8641357421875, "learning_rate": 8.108218657856847e-05, "loss": 4.0566, "step": 268 }, { "epoch": 0.33189389265885255, "grad_norm": 732.7698974609375, "learning_rate": 8.092023953377799e-05, "loss": 4.0232, "step": 269 }, { "epoch": 0.33312769895126465, "grad_norm": 659.7510986328125, "learning_rate": 8.075776553503697e-05, "loss": 4.4442, "step": 270 }, { "epoch": 0.33436150524367675, "grad_norm": 1112.9576416015625, "learning_rate": 8.059476735128633e-05, "loss": 4.2613, "step": 271 }, { "epoch": 0.33559531153608885, "grad_norm": 1582.8585205078125, "learning_rate": 8.043124776040029e-05, "loss": 4.5872, "step": 272 }, { "epoch": 0.33682911782850095, "grad_norm": 1295.1976318359375, "learning_rate": 8.02672095491391e-05, "loss": 4.2045, "step": 273 }, { "epoch": 0.33806292412091304, "grad_norm": 872.4213256835938, "learning_rate": 8.010265551310152e-05, "loss": 4.0964, "step": 274 }, { "epoch": 0.3392967304133251, "grad_norm": 815.054443359375, "learning_rate": 7.99375884566772e-05, "loss": 4.1138, "step": 275 }, { "epoch": 0.3405305367057372, "grad_norm": 730.4610595703125, "learning_rate": 7.977201119299885e-05, "loss": 4.0316, "step": 276 }, { "epoch": 0.3417643429981493, "grad_norm": 920.5109252929688, "learning_rate": 7.960592654389431e-05, "loss": 4.3128, "step": 277 }, { "epoch": 0.3429981492905614, "grad_norm": 904.0980834960938, "learning_rate": 7.943933733983851e-05, "loss": 4.2074, "step": 278 }, { "epoch": 0.3442319555829735, "grad_norm": 1503.0118408203125, "learning_rate": 7.927224641990515e-05, "loss": 4.9589, "step": 279 }, { "epoch": 0.3454657618753856, "grad_norm": 896.9112548828125, "learning_rate": 7.910465663171835e-05, "loss": 4.2574, "step": 280 }, { "epoch": 0.3466995681677977, "grad_norm": 1025.028564453125, "learning_rate": 7.893657083140416e-05, "loss": 4.124, "step": 281 }, { "epoch": 0.3479333744602097, "grad_norm": 1109.3721923828125, "learning_rate": 7.876799188354183e-05, "loss": 4.1843, "step": 282 }, { "epoch": 0.3491671807526218, "grad_norm": 1240.8521728515625, "learning_rate": 7.8598922661115e-05, "loss": 4.1967, "step": 283 }, { "epoch": 0.3504009870450339, "grad_norm": 3984.364013671875, "learning_rate": 7.842936604546274e-05, "loss": 4.3211, "step": 284 }, { "epoch": 0.351634793337446, "grad_norm": 919.879150390625, "learning_rate": 7.825932492623048e-05, "loss": 4.3676, "step": 285 }, { "epoch": 0.3528685996298581, "grad_norm": 1509.087890625, "learning_rate": 7.808880220132071e-05, "loss": 4.4972, "step": 286 }, { "epoch": 0.3541024059222702, "grad_norm": 948.1290893554688, "learning_rate": 7.791780077684366e-05, "loss": 4.1502, "step": 287 }, { "epoch": 0.3553362122146823, "grad_norm": 801.6161499023438, "learning_rate": 7.774632356706768e-05, "loss": 4.0981, "step": 288 }, { "epoch": 0.3565700185070944, "grad_norm": 848.953125, "learning_rate": 7.757437349436964e-05, "loss": 4.5651, "step": 289 }, { "epoch": 0.35780382479950645, "grad_norm": 843.8176879882812, "learning_rate": 7.740195348918516e-05, "loss": 4.5426, "step": 290 }, { "epoch": 0.35903763109191855, "grad_norm": 1099.3475341796875, "learning_rate": 7.722906648995856e-05, "loss": 4.3209, "step": 291 }, { "epoch": 0.36027143738433065, "grad_norm": 970.3837890625, "learning_rate": 7.705571544309284e-05, "loss": 4.1784, "step": 292 }, { "epoch": 0.36150524367674275, "grad_norm": 1227.03125, "learning_rate": 7.688190330289953e-05, "loss": 4.4555, "step": 293 }, { "epoch": 0.36273904996915485, "grad_norm": 1042.2286376953125, "learning_rate": 7.670763303154822e-05, "loss": 4.4602, "step": 294 }, { "epoch": 0.36397285626156695, "grad_norm": 909.2962036132812, "learning_rate": 7.653290759901617e-05, "loss": 4.2845, "step": 295 }, { "epoch": 0.36520666255397904, "grad_norm": 2113.53662109375, "learning_rate": 7.635772998303763e-05, "loss": 4.3554, "step": 296 }, { "epoch": 0.36644046884639114, "grad_norm": 955.9302368164062, "learning_rate": 7.618210316905316e-05, "loss": 4.4607, "step": 297 }, { "epoch": 0.3676742751388032, "grad_norm": 1045.12744140625, "learning_rate": 7.60060301501587e-05, "loss": 4.2151, "step": 298 }, { "epoch": 0.3689080814312153, "grad_norm": 1148.5784912109375, "learning_rate": 7.58295139270546e-05, "loss": 4.4256, "step": 299 }, { "epoch": 0.3701418877236274, "grad_norm": 2229.29931640625, "learning_rate": 7.565255750799439e-05, "loss": 4.5698, "step": 300 }, { "epoch": 0.3713756940160395, "grad_norm": 720.8040161132812, "learning_rate": 7.547516390873366e-05, "loss": 4.0525, "step": 301 }, { "epoch": 0.3726095003084516, "grad_norm": 738.2750854492188, "learning_rate": 7.529733615247851e-05, "loss": 4.5477, "step": 302 }, { "epoch": 0.3738433066008637, "grad_norm": 871.5250854492188, "learning_rate": 7.511907726983418e-05, "loss": 4.1773, "step": 303 }, { "epoch": 0.3750771128932758, "grad_norm": 1024.720458984375, "learning_rate": 7.494039029875326e-05, "loss": 4.3616, "step": 304 }, { "epoch": 0.3763109191856878, "grad_norm": 661.3922119140625, "learning_rate": 7.476127828448401e-05, "loss": 4.1335, "step": 305 }, { "epoch": 0.3775447254780999, "grad_norm": 877.6995239257812, "learning_rate": 7.45817442795184e-05, "loss": 4.4129, "step": 306 }, { "epoch": 0.378778531770512, "grad_norm": 634.2882080078125, "learning_rate": 7.440179134354016e-05, "loss": 4.2423, "step": 307 }, { "epoch": 0.3800123380629241, "grad_norm": 727.6279296875, "learning_rate": 7.422142254337256e-05, "loss": 4.3042, "step": 308 }, { "epoch": 0.3812461443553362, "grad_norm": 606.58984375, "learning_rate": 7.40406409529262e-05, "loss": 4.33, "step": 309 }, { "epoch": 0.3824799506477483, "grad_norm": 711.3494873046875, "learning_rate": 7.385944965314658e-05, "loss": 4.0204, "step": 310 }, { "epoch": 0.3837137569401604, "grad_norm": 789.8071899414062, "learning_rate": 7.367785173196165e-05, "loss": 4.7292, "step": 311 }, { "epoch": 0.3849475632325725, "grad_norm": 869.54541015625, "learning_rate": 7.349585028422911e-05, "loss": 4.5218, "step": 312 }, { "epoch": 0.38618136952498455, "grad_norm": 695.4873657226562, "learning_rate": 7.331344841168373e-05, "loss": 4.2921, "step": 313 }, { "epoch": 0.38741517581739665, "grad_norm": 568.3463134765625, "learning_rate": 7.313064922288446e-05, "loss": 4.3687, "step": 314 }, { "epoch": 0.38864898210980875, "grad_norm": 1486.2598876953125, "learning_rate": 7.294745583316146e-05, "loss": 4.38, "step": 315 }, { "epoch": 0.38988278840222085, "grad_norm": 895.27001953125, "learning_rate": 7.2763871364563e-05, "loss": 4.3786, "step": 316 }, { "epoch": 0.39111659469463295, "grad_norm": 578.4692993164062, "learning_rate": 7.257989894580228e-05, "loss": 4.414, "step": 317 }, { "epoch": 0.39235040098704504, "grad_norm": 901.8697509765625, "learning_rate": 7.239554171220401e-05, "loss": 4.4638, "step": 318 }, { "epoch": 0.39358420727945714, "grad_norm": 444.4349060058594, "learning_rate": 7.22108028056512e-05, "loss": 4.3555, "step": 319 }, { "epoch": 0.39481801357186924, "grad_norm": 620.0184936523438, "learning_rate": 7.202568537453129e-05, "loss": 4.1884, "step": 320 }, { "epoch": 0.3960518198642813, "grad_norm": 841.135986328125, "learning_rate": 7.184019257368283e-05, "loss": 4.3909, "step": 321 }, { "epoch": 0.3972856261566934, "grad_norm": 668.8302001953125, "learning_rate": 7.165432756434146e-05, "loss": 4.9136, "step": 322 }, { "epoch": 0.3985194324491055, "grad_norm": 1110.5648193359375, "learning_rate": 7.146809351408619e-05, "loss": 4.2065, "step": 323 }, { "epoch": 0.3997532387415176, "grad_norm": 521.2616577148438, "learning_rate": 7.128149359678531e-05, "loss": 4.1338, "step": 324 }, { "epoch": 0.4009870450339297, "grad_norm": 663.8246459960938, "learning_rate": 7.10945309925424e-05, "loss": 4.1785, "step": 325 }, { "epoch": 0.4022208513263418, "grad_norm": 1459.381591796875, "learning_rate": 7.090720888764201e-05, "loss": 4.1546, "step": 326 }, { "epoch": 0.4034546576187539, "grad_norm": 756.01220703125, "learning_rate": 7.07195304744955e-05, "loss": 4.1689, "step": 327 }, { "epoch": 0.404688463911166, "grad_norm": 583.2752685546875, "learning_rate": 7.053149895158656e-05, "loss": 4.59, "step": 328 }, { "epoch": 0.405922270203578, "grad_norm": 632.4768676757812, "learning_rate": 7.034311752341667e-05, "loss": 4.301, "step": 329 }, { "epoch": 0.4071560764959901, "grad_norm": 643.6720581054688, "learning_rate": 7.015438940045052e-05, "loss": 4.3972, "step": 330 }, { "epoch": 0.4083898827884022, "grad_norm": 722.1488037109375, "learning_rate": 6.996531779906133e-05, "loss": 4.3077, "step": 331 }, { "epoch": 0.4096236890808143, "grad_norm": 690.2899780273438, "learning_rate": 6.977590594147601e-05, "loss": 4.0946, "step": 332 }, { "epoch": 0.4108574953732264, "grad_norm": 691.6495971679688, "learning_rate": 6.95861570557202e-05, "loss": 4.6194, "step": 333 }, { "epoch": 0.4120913016656385, "grad_norm": 507.4448547363281, "learning_rate": 6.939607437556332e-05, "loss": 4.2845, "step": 334 }, { "epoch": 0.4133251079580506, "grad_norm": 749.8709716796875, "learning_rate": 6.920566114046342e-05, "loss": 4.1089, "step": 335 }, { "epoch": 0.41455891425046265, "grad_norm": 596.8978271484375, "learning_rate": 6.901492059551201e-05, "loss": 4.2486, "step": 336 }, { "epoch": 0.41579272054287475, "grad_norm": 876.170654296875, "learning_rate": 6.882385599137873e-05, "loss": 4.2581, "step": 337 }, { "epoch": 0.41702652683528685, "grad_norm": 712.3971557617188, "learning_rate": 6.863247058425593e-05, "loss": 4.3449, "step": 338 }, { "epoch": 0.41826033312769895, "grad_norm": 652.3949584960938, "learning_rate": 6.844076763580325e-05, "loss": 4.4245, "step": 339 }, { "epoch": 0.41949413942011105, "grad_norm": 537.5595703125, "learning_rate": 6.824875041309194e-05, "loss": 4.2944, "step": 340 }, { "epoch": 0.42072794571252314, "grad_norm": 821.6701049804688, "learning_rate": 6.805642218854919e-05, "loss": 4.3504, "step": 341 }, { "epoch": 0.42196175200493524, "grad_norm": 965.2427368164062, "learning_rate": 6.78637862399025e-05, "loss": 4.4086, "step": 342 }, { "epoch": 0.42319555829734734, "grad_norm": 613.4368896484375, "learning_rate": 6.767084585012365e-05, "loss": 4.5594, "step": 343 }, { "epoch": 0.4244293645897594, "grad_norm": 998.8842163085938, "learning_rate": 6.747760430737282e-05, "loss": 4.4461, "step": 344 }, { "epoch": 0.4256631708821715, "grad_norm": 624.1119384765625, "learning_rate": 6.728406490494257e-05, "loss": 4.2759, "step": 345 }, { "epoch": 0.4268969771745836, "grad_norm": 736.1170654296875, "learning_rate": 6.709023094120164e-05, "loss": 4.1539, "step": 346 }, { "epoch": 0.4281307834669957, "grad_norm": 653.1454467773438, "learning_rate": 6.689610571953887e-05, "loss": 4.5321, "step": 347 }, { "epoch": 0.4293645897594078, "grad_norm": 732.800048828125, "learning_rate": 6.670169254830677e-05, "loss": 4.3018, "step": 348 }, { "epoch": 0.4305983960518199, "grad_norm": 626.2633056640625, "learning_rate": 6.65069947407652e-05, "loss": 4.3347, "step": 349 }, { "epoch": 0.431832202344232, "grad_norm": 572.1875, "learning_rate": 6.63120156150249e-05, "loss": 4.2569, "step": 350 }, { "epoch": 0.4330660086366441, "grad_norm": 621.7860107421875, "learning_rate": 6.611675849399092e-05, "loss": 4.2538, "step": 351 }, { "epoch": 0.4342998149290561, "grad_norm": 673.1295166015625, "learning_rate": 6.592122670530605e-05, "loss": 4.2466, "step": 352 }, { "epoch": 0.4355336212214682, "grad_norm": 789.9872436523438, "learning_rate": 6.572542358129402e-05, "loss": 4.2683, "step": 353 }, { "epoch": 0.4367674275138803, "grad_norm": 585.2059936523438, "learning_rate": 6.552935245890279e-05, "loss": 4.4181, "step": 354 }, { "epoch": 0.4380012338062924, "grad_norm": 746.8546752929688, "learning_rate": 6.53330166796476e-05, "loss": 4.2814, "step": 355 }, { "epoch": 0.4392350400987045, "grad_norm": 1194.3642578125, "learning_rate": 6.513641958955415e-05, "loss": 4.4727, "step": 356 }, { "epoch": 0.4404688463911166, "grad_norm": 1172.553466796875, "learning_rate": 6.493956453910138e-05, "loss": 4.3559, "step": 357 }, { "epoch": 0.4417026526835287, "grad_norm": 685.5620727539062, "learning_rate": 6.474245488316457e-05, "loss": 4.2717, "step": 358 }, { "epoch": 0.44293645897594075, "grad_norm": 1251.83935546875, "learning_rate": 6.454509398095808e-05, "loss": 4.4361, "step": 359 }, { "epoch": 0.44417026526835285, "grad_norm": 761.3630981445312, "learning_rate": 6.434748519597804e-05, "loss": 4.1469, "step": 360 }, { "epoch": 0.44540407156076495, "grad_norm": 1683.0096435546875, "learning_rate": 6.414963189594513e-05, "loss": 4.2239, "step": 361 }, { "epoch": 0.44663787785317705, "grad_norm": 4889.1904296875, "learning_rate": 6.395153745274715e-05, "loss": 4.4494, "step": 362 }, { "epoch": 0.44787168414558914, "grad_norm": 1073.2198486328125, "learning_rate": 6.375320524238155e-05, "loss": 4.1871, "step": 363 }, { "epoch": 0.44910549043800124, "grad_norm": 933.00390625, "learning_rate": 6.355463864489784e-05, "loss": 4.2717, "step": 364 }, { "epoch": 0.45033929673041334, "grad_norm": 849.5030517578125, "learning_rate": 6.335584104434011e-05, "loss": 4.1714, "step": 365 }, { "epoch": 0.45157310302282544, "grad_norm": 797.42724609375, "learning_rate": 6.315681582868926e-05, "loss": 4.4905, "step": 366 }, { "epoch": 0.4528069093152375, "grad_norm": 972.1139526367188, "learning_rate": 6.295756638980528e-05, "loss": 4.4919, "step": 367 }, { "epoch": 0.4540407156076496, "grad_norm": 592.1343994140625, "learning_rate": 6.275809612336948e-05, "loss": 4.3478, "step": 368 }, { "epoch": 0.4552745219000617, "grad_norm": 1665.978271484375, "learning_rate": 6.255840842882655e-05, "loss": 4.5419, "step": 369 }, { "epoch": 0.4565083281924738, "grad_norm": 4402.978515625, "learning_rate": 6.235850670932671e-05, "loss": 4.6138, "step": 370 }, { "epoch": 0.4577421344848859, "grad_norm": 1594.6617431640625, "learning_rate": 6.215839437166766e-05, "loss": 4.5357, "step": 371 }, { "epoch": 0.458975940777298, "grad_norm": 874.2888793945312, "learning_rate": 6.195807482623653e-05, "loss": 4.2039, "step": 372 }, { "epoch": 0.4602097470697101, "grad_norm": 1560.271240234375, "learning_rate": 6.175755148695174e-05, "loss": 4.2187, "step": 373 }, { "epoch": 0.46144355336212217, "grad_norm": 1246.7991943359375, "learning_rate": 6.155682777120486e-05, "loss": 4.2019, "step": 374 }, { "epoch": 0.4626773596545342, "grad_norm": 1500.3251953125, "learning_rate": 6.135590709980237e-05, "loss": 4.5292, "step": 375 }, { "epoch": 0.4639111659469463, "grad_norm": 1800.3328857421875, "learning_rate": 6.115479289690729e-05, "loss": 4.6747, "step": 376 }, { "epoch": 0.4651449722393584, "grad_norm": 862.0317993164062, "learning_rate": 6.095348858998089e-05, "loss": 4.4015, "step": 377 }, { "epoch": 0.4663787785317705, "grad_norm": 979.6875610351562, "learning_rate": 6.075199760972429e-05, "loss": 4.3292, "step": 378 }, { "epoch": 0.4676125848241826, "grad_norm": 1733.320556640625, "learning_rate": 6.0550323390019944e-05, "loss": 4.4314, "step": 379 }, { "epoch": 0.4688463911165947, "grad_norm": 1046.5047607421875, "learning_rate": 6.034846936787314e-05, "loss": 4.3921, "step": 380 }, { "epoch": 0.4700801974090068, "grad_norm": 1860.8629150390625, "learning_rate": 6.0146438983353415e-05, "loss": 4.3222, "step": 381 }, { "epoch": 0.4713140037014189, "grad_norm": 972.5175170898438, "learning_rate": 5.994423567953594e-05, "loss": 4.3193, "step": 382 }, { "epoch": 0.47254780999383095, "grad_norm": 634.0526733398438, "learning_rate": 5.974186290244287e-05, "loss": 4.1973, "step": 383 }, { "epoch": 0.47378161628624305, "grad_norm": 1021.6718139648438, "learning_rate": 5.953932410098455e-05, "loss": 4.2401, "step": 384 }, { "epoch": 0.47501542257865514, "grad_norm": 753.704345703125, "learning_rate": 5.933662272690079e-05, "loss": 4.2924, "step": 385 }, { "epoch": 0.47624922887106724, "grad_norm": 896.6467895507812, "learning_rate": 5.9133762234702004e-05, "loss": 4.1883, "step": 386 }, { "epoch": 0.47748303516347934, "grad_norm": 1033.7984619140625, "learning_rate": 5.893074608161039e-05, "loss": 4.3882, "step": 387 }, { "epoch": 0.47871684145589144, "grad_norm": 686.5562133789062, "learning_rate": 5.8727577727500925e-05, "loss": 4.1916, "step": 388 }, { "epoch": 0.47995064774830354, "grad_norm": 810.2001953125, "learning_rate": 5.8524260634842495e-05, "loss": 4.1881, "step": 389 }, { "epoch": 0.4811844540407156, "grad_norm": 621.872314453125, "learning_rate": 5.832079826863883e-05, "loss": 4.4162, "step": 390 }, { "epoch": 0.4824182603331277, "grad_norm": 748.2630615234375, "learning_rate": 5.811719409636943e-05, "loss": 4.3348, "step": 391 }, { "epoch": 0.4836520666255398, "grad_norm": 777.8478393554688, "learning_rate": 5.7913451587930576e-05, "loss": 4.2, "step": 392 }, { "epoch": 0.4848858729179519, "grad_norm": 738.7431640625, "learning_rate": 5.7709574215576054e-05, "loss": 4.3066, "step": 393 }, { "epoch": 0.486119679210364, "grad_norm": 969.1599731445312, "learning_rate": 5.750556545385809e-05, "loss": 4.3875, "step": 394 }, { "epoch": 0.4873534855027761, "grad_norm": 1434.59765625, "learning_rate": 5.73014287795681e-05, "loss": 4.2935, "step": 395 }, { "epoch": 0.48858729179518817, "grad_norm": 733.67236328125, "learning_rate": 5.709716767167741e-05, "loss": 4.2401, "step": 396 }, { "epoch": 0.48982109808760027, "grad_norm": 764.4368896484375, "learning_rate": 5.689278561127798e-05, "loss": 4.1596, "step": 397 }, { "epoch": 0.4910549043800123, "grad_norm": 697.0581665039062, "learning_rate": 5.66882860815231e-05, "loss": 4.2965, "step": 398 }, { "epoch": 0.4922887106724244, "grad_norm": 1043.562744140625, "learning_rate": 5.6483672567568046e-05, "loss": 4.1089, "step": 399 }, { "epoch": 0.4935225169648365, "grad_norm": 1163.54345703125, "learning_rate": 5.627894855651061e-05, "loss": 4.1813, "step": 400 }, { "epoch": 0.4947563232572486, "grad_norm": 746.6779174804688, "learning_rate": 5.607411753733173e-05, "loss": 4.4091, "step": 401 }, { "epoch": 0.4959901295496607, "grad_norm": 697.9642944335938, "learning_rate": 5.586918300083601e-05, "loss": 4.2014, "step": 402 }, { "epoch": 0.4972239358420728, "grad_norm": 774.064453125, "learning_rate": 5.566414843959228e-05, "loss": 4.8718, "step": 403 }, { "epoch": 0.4984577421344849, "grad_norm": 1240.5908203125, "learning_rate": 5.545901734787394e-05, "loss": 4.3829, "step": 404 }, { "epoch": 0.499691548426897, "grad_norm": 697.5282592773438, "learning_rate": 5.5253793221599584e-05, "loss": 4.0875, "step": 405 }, { "epoch": 0.5009253547193091, "grad_norm": 884.7564086914062, "learning_rate": 5.504847955827326e-05, "loss": 4.2248, "step": 406 }, { "epoch": 0.5009253547193091, "eval_loss": 4.2050323486328125, "eval_runtime": 9.7158, "eval_samples_per_second": 140.493, "eval_steps_per_second": 17.6, "step": 406 }, { "epoch": 0.5021591610117212, "grad_norm": 853.3882446289062, "learning_rate": 5.4843079856925e-05, "loss": 4.4044, "step": 407 }, { "epoch": 0.5033929673041333, "grad_norm": 647.4006958007812, "learning_rate": 5.4637597618051094e-05, "loss": 4.1797, "step": 408 }, { "epoch": 0.5046267735965454, "grad_norm": 822.2937622070312, "learning_rate": 5.443203634355449e-05, "loss": 4.1378, "step": 409 }, { "epoch": 0.5058605798889574, "grad_norm": 775.1270751953125, "learning_rate": 5.422639953668508e-05, "loss": 4.2344, "step": 410 }, { "epoch": 0.5070943861813695, "grad_norm": 1231.236328125, "learning_rate": 5.402069070197997e-05, "loss": 4.4194, "step": 411 }, { "epoch": 0.5083281924737816, "grad_norm": 1013.276123046875, "learning_rate": 5.3814913345203854e-05, "loss": 4.1588, "step": 412 }, { "epoch": 0.5095619987661937, "grad_norm": 690.4890747070312, "learning_rate": 5.360907097328916e-05, "loss": 4.1674, "step": 413 }, { "epoch": 0.5107958050586058, "grad_norm": 1136.9014892578125, "learning_rate": 5.340316709427633e-05, "loss": 4.5034, "step": 414 }, { "epoch": 0.5120296113510179, "grad_norm": 1397.9810791015625, "learning_rate": 5.3197205217254044e-05, "loss": 4.3418, "step": 415 }, { "epoch": 0.51326341764343, "grad_norm": 583.5597534179688, "learning_rate": 5.2991188852299435e-05, "loss": 4.1513, "step": 416 }, { "epoch": 0.5144972239358421, "grad_norm": 1105.35791015625, "learning_rate": 5.2785121510418166e-05, "loss": 4.2943, "step": 417 }, { "epoch": 0.5157310302282542, "grad_norm": 665.0667724609375, "learning_rate": 5.2579006703484724e-05, "loss": 4.2208, "step": 418 }, { "epoch": 0.5169648365206663, "grad_norm": 750.4703369140625, "learning_rate": 5.237284794418251e-05, "loss": 4.1352, "step": 419 }, { "epoch": 0.5181986428130784, "grad_norm": 1101.5928955078125, "learning_rate": 5.216664874594395e-05, "loss": 4.2828, "step": 420 }, { "epoch": 0.5194324491054905, "grad_norm": 736.1017456054688, "learning_rate": 5.196041262289067e-05, "loss": 4.1608, "step": 421 }, { "epoch": 0.5206662553979026, "grad_norm": 673.5686645507812, "learning_rate": 5.175414308977356e-05, "loss": 4.1789, "step": 422 }, { "epoch": 0.5219000616903147, "grad_norm": 917.0165405273438, "learning_rate": 5.1547843661912906e-05, "loss": 4.19, "step": 423 }, { "epoch": 0.5231338679827268, "grad_norm": 634.482666015625, "learning_rate": 5.134151785513848e-05, "loss": 4.0292, "step": 424 }, { "epoch": 0.5243676742751388, "grad_norm": 740.7830200195312, "learning_rate": 5.113516918572961e-05, "loss": 4.0322, "step": 425 }, { "epoch": 0.5256014805675508, "grad_norm": 2342.027099609375, "learning_rate": 5.092880117035527e-05, "loss": 4.1086, "step": 426 }, { "epoch": 0.526835286859963, "grad_norm": 841.03125, "learning_rate": 5.072241732601409e-05, "loss": 4.0484, "step": 427 }, { "epoch": 0.528069093152375, "grad_norm": 814.412353515625, "learning_rate": 5.0516021169974494e-05, "loss": 4.2691, "step": 428 }, { "epoch": 0.5293028994447871, "grad_norm": 1005.2704467773438, "learning_rate": 5.030961621971473e-05, "loss": 4.276, "step": 429 }, { "epoch": 0.5305367057371992, "grad_norm": 957.53369140625, "learning_rate": 5.010320599286291e-05, "loss": 4.1213, "step": 430 }, { "epoch": 0.5317705120296113, "grad_norm": 721.4414672851562, "learning_rate": 4.9896794007137096e-05, "loss": 4.1486, "step": 431 }, { "epoch": 0.5330043183220234, "grad_norm": 809.1273193359375, "learning_rate": 4.969038378028527e-05, "loss": 4.0889, "step": 432 }, { "epoch": 0.5342381246144355, "grad_norm": 1319.694580078125, "learning_rate": 4.948397883002552e-05, "loss": 4.1972, "step": 433 }, { "epoch": 0.5354719309068476, "grad_norm": 896.4898681640625, "learning_rate": 4.9277582673985934e-05, "loss": 4.1462, "step": 434 }, { "epoch": 0.5367057371992597, "grad_norm": 684.2529296875, "learning_rate": 4.907119882964474e-05, "loss": 4.2471, "step": 435 }, { "epoch": 0.5379395434916718, "grad_norm": 1085.970703125, "learning_rate": 4.886483081427039e-05, "loss": 4.1986, "step": 436 }, { "epoch": 0.5391733497840839, "grad_norm": 547.0230712890625, "learning_rate": 4.865848214486152e-05, "loss": 4.0881, "step": 437 }, { "epoch": 0.540407156076496, "grad_norm": 753.9632568359375, "learning_rate": 4.8452156338087105e-05, "loss": 4.016, "step": 438 }, { "epoch": 0.5416409623689081, "grad_norm": 692.2648315429688, "learning_rate": 4.824585691022647e-05, "loss": 4.2989, "step": 439 }, { "epoch": 0.5428747686613201, "grad_norm": 585.565673828125, "learning_rate": 4.803958737710934e-05, "loss": 4.0761, "step": 440 }, { "epoch": 0.5441085749537322, "grad_norm": 669.18408203125, "learning_rate": 4.783335125405606e-05, "loss": 4.1985, "step": 441 }, { "epoch": 0.5453423812461443, "grad_norm": 774.6920166015625, "learning_rate": 4.76271520558175e-05, "loss": 4.1696, "step": 442 }, { "epoch": 0.5465761875385564, "grad_norm": 711.0860595703125, "learning_rate": 4.742099329651529e-05, "loss": 4.1698, "step": 443 }, { "epoch": 0.5478099938309685, "grad_norm": 912.1954345703125, "learning_rate": 4.721487848958186e-05, "loss": 4.4007, "step": 444 }, { "epoch": 0.5490438001233806, "grad_norm": 536.2879028320312, "learning_rate": 4.700881114770058e-05, "loss": 4.1292, "step": 445 }, { "epoch": 0.5502776064157927, "grad_norm": 609.5086669921875, "learning_rate": 4.680279478274596e-05, "loss": 4.118, "step": 446 }, { "epoch": 0.5515114127082048, "grad_norm": 457.2904357910156, "learning_rate": 4.659683290572367e-05, "loss": 4.2144, "step": 447 }, { "epoch": 0.5527452190006169, "grad_norm": 765.5906982421875, "learning_rate": 4.6390929026710855e-05, "loss": 4.0711, "step": 448 }, { "epoch": 0.553979025293029, "grad_norm": 637.5826416015625, "learning_rate": 4.6185086654796165e-05, "loss": 4.2581, "step": 449 }, { "epoch": 0.5552128315854411, "grad_norm": 931.8234252929688, "learning_rate": 4.597930929802004e-05, "loss": 4.1231, "step": 450 }, { "epoch": 0.5564466378778532, "grad_norm": 569.963623046875, "learning_rate": 4.577360046331493e-05, "loss": 4.1986, "step": 451 }, { "epoch": 0.5576804441702653, "grad_norm": 554.6689453125, "learning_rate": 4.5567963656445506e-05, "loss": 3.9808, "step": 452 }, { "epoch": 0.5589142504626774, "grad_norm": 707.5774536132812, "learning_rate": 4.536240238194891e-05, "loss": 4.0964, "step": 453 }, { "epoch": 0.5601480567550895, "grad_norm": 680.7227783203125, "learning_rate": 4.515692014307502e-05, "loss": 4.2356, "step": 454 }, { "epoch": 0.5613818630475016, "grad_norm": 616.7816772460938, "learning_rate": 4.495152044172675e-05, "loss": 4.0758, "step": 455 }, { "epoch": 0.5626156693399136, "grad_norm": 533.3388061523438, "learning_rate": 4.474620677840045e-05, "loss": 4.2436, "step": 456 }, { "epoch": 0.5638494756323257, "grad_norm": 650.9319458007812, "learning_rate": 4.454098265212606e-05, "loss": 4.3357, "step": 457 }, { "epoch": 0.5650832819247378, "grad_norm": 539.3369140625, "learning_rate": 4.4335851560407734e-05, "loss": 4.1327, "step": 458 }, { "epoch": 0.5663170882171499, "grad_norm": 858.3012084960938, "learning_rate": 4.413081699916399e-05, "loss": 4.0637, "step": 459 }, { "epoch": 0.567550894509562, "grad_norm": 839.3760375976562, "learning_rate": 4.3925882462668284e-05, "loss": 4.2314, "step": 460 }, { "epoch": 0.5687847008019741, "grad_norm": 723.5487670898438, "learning_rate": 4.3721051443489405e-05, "loss": 4.199, "step": 461 }, { "epoch": 0.5700185070943862, "grad_norm": 778.8251342773438, "learning_rate": 4.351632743243196e-05, "loss": 4.2422, "step": 462 }, { "epoch": 0.5712523133867983, "grad_norm": 480.365478515625, "learning_rate": 4.3311713918476906e-05, "loss": 4.1606, "step": 463 }, { "epoch": 0.5724861196792104, "grad_norm": 854.7259521484375, "learning_rate": 4.3107214388722045e-05, "loss": 4.2883, "step": 464 }, { "epoch": 0.5737199259716225, "grad_norm": 659.643310546875, "learning_rate": 4.29028323283226e-05, "loss": 4.2344, "step": 465 }, { "epoch": 0.5749537322640346, "grad_norm": 732.3975219726562, "learning_rate": 4.269857122043191e-05, "loss": 4.0897, "step": 466 }, { "epoch": 0.5761875385564467, "grad_norm": 696.5797119140625, "learning_rate": 4.2494434546141905e-05, "loss": 4.3794, "step": 467 }, { "epoch": 0.5774213448488588, "grad_norm": 875.3156127929688, "learning_rate": 4.229042578442396e-05, "loss": 4.2172, "step": 468 }, { "epoch": 0.5786551511412709, "grad_norm": 596.0404663085938, "learning_rate": 4.208654841206945e-05, "loss": 4.3061, "step": 469 }, { "epoch": 0.579888957433683, "grad_norm": 563.7130126953125, "learning_rate": 4.1882805903630576e-05, "loss": 4.1122, "step": 470 }, { "epoch": 0.581122763726095, "grad_norm": 674.4610595703125, "learning_rate": 4.1679201731361195e-05, "loss": 4.2857, "step": 471 }, { "epoch": 0.582356570018507, "grad_norm": 864.395751953125, "learning_rate": 4.147573936515751e-05, "loss": 4.3274, "step": 472 }, { "epoch": 0.5835903763109191, "grad_norm": 588.88232421875, "learning_rate": 4.1272422272499086e-05, "loss": 4.0944, "step": 473 }, { "epoch": 0.5848241826033312, "grad_norm": 818.7593383789062, "learning_rate": 4.1069253918389627e-05, "loss": 4.2651, "step": 474 }, { "epoch": 0.5860579888957433, "grad_norm": 995.448974609375, "learning_rate": 4.0866237765298e-05, "loss": 4.0507, "step": 475 }, { "epoch": 0.5872917951881554, "grad_norm": 844.66748046875, "learning_rate": 4.066337727309923e-05, "loss": 4.2058, "step": 476 }, { "epoch": 0.5885256014805675, "grad_norm": 705.0908203125, "learning_rate": 4.0460675899015456e-05, "loss": 4.0508, "step": 477 }, { "epoch": 0.5897594077729796, "grad_norm": 617.9857177734375, "learning_rate": 4.025813709755714e-05, "loss": 4.1344, "step": 478 }, { "epoch": 0.5909932140653917, "grad_norm": 1111.494140625, "learning_rate": 4.005576432046406e-05, "loss": 4.1989, "step": 479 }, { "epoch": 0.5922270203578038, "grad_norm": 757.2418823242188, "learning_rate": 3.9853561016646604e-05, "loss": 4.0764, "step": 480 }, { "epoch": 0.5934608266502159, "grad_norm": 480.5588073730469, "learning_rate": 3.965153063212688e-05, "loss": 4.3756, "step": 481 }, { "epoch": 0.594694632942628, "grad_norm": 816.2938232421875, "learning_rate": 3.944967660998007e-05, "loss": 4.0795, "step": 482 }, { "epoch": 0.5959284392350401, "grad_norm": 634.9500122070312, "learning_rate": 3.924800239027572e-05, "loss": 4.0386, "step": 483 }, { "epoch": 0.5971622455274522, "grad_norm": 788.2621459960938, "learning_rate": 3.9046511410019115e-05, "loss": 4.1948, "step": 484 }, { "epoch": 0.5983960518198643, "grad_norm": 639.3242797851562, "learning_rate": 3.884520710309273e-05, "loss": 4.1536, "step": 485 }, { "epoch": 0.5996298581122764, "grad_norm": 783.134521484375, "learning_rate": 3.864409290019765e-05, "loss": 4.2878, "step": 486 }, { "epoch": 0.6008636644046884, "grad_norm": 725.840576171875, "learning_rate": 3.8443172228795134e-05, "loss": 4.2542, "step": 487 }, { "epoch": 0.6020974706971005, "grad_norm": 745.060546875, "learning_rate": 3.8242448513048266e-05, "loss": 4.0974, "step": 488 }, { "epoch": 0.6033312769895126, "grad_norm": 662.4865112304688, "learning_rate": 3.804192517376348e-05, "loss": 4.2055, "step": 489 }, { "epoch": 0.6045650832819247, "grad_norm": 853.4915161132812, "learning_rate": 3.784160562833235e-05, "loss": 4.07, "step": 490 }, { "epoch": 0.6057988895743368, "grad_norm": 843.7233276367188, "learning_rate": 3.764149329067329e-05, "loss": 3.9474, "step": 491 }, { "epoch": 0.6070326958667489, "grad_norm": 737.5382080078125, "learning_rate": 3.744159157117345e-05, "loss": 4.254, "step": 492 }, { "epoch": 0.608266502159161, "grad_norm": 753.4673461914062, "learning_rate": 3.7241903876630536e-05, "loss": 4.0535, "step": 493 }, { "epoch": 0.6095003084515731, "grad_norm": 554.43408203125, "learning_rate": 3.7042433610194715e-05, "loss": 4.2252, "step": 494 }, { "epoch": 0.6107341147439852, "grad_norm": 715.8684692382812, "learning_rate": 3.684318417131075e-05, "loss": 3.9666, "step": 495 }, { "epoch": 0.6119679210363973, "grad_norm": 645.7371826171875, "learning_rate": 3.6644158955659904e-05, "loss": 4.1646, "step": 496 }, { "epoch": 0.6132017273288094, "grad_norm": 650.3280639648438, "learning_rate": 3.644536135510217e-05, "loss": 4.3831, "step": 497 }, { "epoch": 0.6144355336212215, "grad_norm": 943.124267578125, "learning_rate": 3.6246794757618466e-05, "loss": 4.1393, "step": 498 }, { "epoch": 0.6156693399136336, "grad_norm": 955.5465087890625, "learning_rate": 3.604846254725285e-05, "loss": 4.1975, "step": 499 }, { "epoch": 0.6169031462060457, "grad_norm": 898.3872680664062, "learning_rate": 3.585036810405488e-05, "loss": 3.9472, "step": 500 }, { "epoch": 0.6181369524984578, "grad_norm": 571.25732421875, "learning_rate": 3.5652514804021986e-05, "loss": 4.1276, "step": 501 }, { "epoch": 0.6193707587908698, "grad_norm": 726.6759643554688, "learning_rate": 3.545490601904193e-05, "loss": 4.156, "step": 502 }, { "epoch": 0.6206045650832819, "grad_norm": 669.5355834960938, "learning_rate": 3.525754511683543e-05, "loss": 4.2966, "step": 503 }, { "epoch": 0.621838371375694, "grad_norm": 988.7280883789062, "learning_rate": 3.506043546089862e-05, "loss": 4.0182, "step": 504 }, { "epoch": 0.6230721776681061, "grad_norm": 995.143310546875, "learning_rate": 3.486358041044586e-05, "loss": 4.2714, "step": 505 }, { "epoch": 0.6243059839605182, "grad_norm": 1300.2379150390625, "learning_rate": 3.46669833203524e-05, "loss": 4.099, "step": 506 }, { "epoch": 0.6255397902529303, "grad_norm": 716.7886352539062, "learning_rate": 3.447064754109722e-05, "loss": 4.2156, "step": 507 }, { "epoch": 0.6267735965453424, "grad_norm": 842.786376953125, "learning_rate": 3.427457641870599e-05, "loss": 4.27, "step": 508 }, { "epoch": 0.6280074028377545, "grad_norm": 1042.2271728515625, "learning_rate": 3.407877329469395e-05, "loss": 4.2519, "step": 509 }, { "epoch": 0.6292412091301666, "grad_norm": 1337.3258056640625, "learning_rate": 3.388324150600909e-05, "loss": 4.2326, "step": 510 }, { "epoch": 0.6304750154225787, "grad_norm": 610.8500366210938, "learning_rate": 3.368798438497512e-05, "loss": 4.1575, "step": 511 }, { "epoch": 0.6317088217149908, "grad_norm": 2609.87353515625, "learning_rate": 3.3493005259234806e-05, "loss": 4.717, "step": 512 }, { "epoch": 0.6329426280074029, "grad_norm": 784.0038452148438, "learning_rate": 3.329830745169324e-05, "loss": 4.0873, "step": 513 }, { "epoch": 0.634176434299815, "grad_norm": 561.313720703125, "learning_rate": 3.310389428046114e-05, "loss": 4.2622, "step": 514 }, { "epoch": 0.6354102405922271, "grad_norm": 654.1542358398438, "learning_rate": 3.290976905879837e-05, "loss": 4.2932, "step": 515 }, { "epoch": 0.6366440468846392, "grad_norm": 916.87548828125, "learning_rate": 3.271593509505746e-05, "loss": 4.3277, "step": 516 }, { "epoch": 0.6378778531770513, "grad_norm": 2840.9462890625, "learning_rate": 3.252239569262718e-05, "loss": 4.2958, "step": 517 }, { "epoch": 0.6391116594694632, "grad_norm": 870.240478515625, "learning_rate": 3.2329154149876364e-05, "loss": 4.2606, "step": 518 }, { "epoch": 0.6403454657618753, "grad_norm": 724.269775390625, "learning_rate": 3.213621376009749e-05, "loss": 4.0055, "step": 519 }, { "epoch": 0.6415792720542874, "grad_norm": 856.3790893554688, "learning_rate": 3.1943577811450815e-05, "loss": 4.1413, "step": 520 }, { "epoch": 0.6428130783466995, "grad_norm": 689.23828125, "learning_rate": 3.1751249586908095e-05, "loss": 4.0491, "step": 521 }, { "epoch": 0.6440468846391116, "grad_norm": 902.5159912109375, "learning_rate": 3.155923236419675e-05, "loss": 4.2865, "step": 522 }, { "epoch": 0.6452806909315237, "grad_norm": 736.0128173828125, "learning_rate": 3.136752941574407e-05, "loss": 4.0431, "step": 523 }, { "epoch": 0.6465144972239358, "grad_norm": 926.1270751953125, "learning_rate": 3.1176144008621266e-05, "loss": 4.367, "step": 524 }, { "epoch": 0.6477483035163479, "grad_norm": 544.0671997070312, "learning_rate": 3.098507940448799e-05, "loss": 4.0743, "step": 525 }, { "epoch": 0.64898210980876, "grad_norm": 675.6790161132812, "learning_rate": 3.07943388595366e-05, "loss": 4.2285, "step": 526 }, { "epoch": 0.6502159161011721, "grad_norm": 744.640625, "learning_rate": 3.060392562443669e-05, "loss": 4.0717, "step": 527 }, { "epoch": 0.6514497223935842, "grad_norm": 767.6522827148438, "learning_rate": 3.0413842944279812e-05, "loss": 4.2368, "step": 528 }, { "epoch": 0.6526835286859963, "grad_norm": 843.551513671875, "learning_rate": 3.0224094058524e-05, "loss": 4.1211, "step": 529 }, { "epoch": 0.6539173349784084, "grad_norm": 883.8860473632812, "learning_rate": 3.0034682200938675e-05, "loss": 3.984, "step": 530 }, { "epoch": 0.6551511412708205, "grad_norm": 1104.385498046875, "learning_rate": 2.9845610599549502e-05, "loss": 4.1508, "step": 531 }, { "epoch": 0.6563849475632326, "grad_norm": 708.7241821289062, "learning_rate": 2.9656882476583347e-05, "loss": 4.2806, "step": 532 }, { "epoch": 0.6576187538556446, "grad_norm": 576.7755126953125, "learning_rate": 2.9468501048413456e-05, "loss": 4.072, "step": 533 }, { "epoch": 0.6588525601480567, "grad_norm": 723.578857421875, "learning_rate": 2.9280469525504496e-05, "loss": 4.0651, "step": 534 }, { "epoch": 0.6600863664404688, "grad_norm": 1123.490478515625, "learning_rate": 2.9092791112358015e-05, "loss": 4.2243, "step": 535 }, { "epoch": 0.6613201727328809, "grad_norm": 1284.28076171875, "learning_rate": 2.8905469007457642e-05, "loss": 4.3101, "step": 536 }, { "epoch": 0.662553979025293, "grad_norm": 789.3023071289062, "learning_rate": 2.8718506403214696e-05, "loss": 4.2006, "step": 537 }, { "epoch": 0.6637877853177051, "grad_norm": 648.8651123046875, "learning_rate": 2.853190648591383e-05, "loss": 4.3445, "step": 538 }, { "epoch": 0.6650215916101172, "grad_norm": 894.7275390625, "learning_rate": 2.8345672435658534e-05, "loss": 4.3914, "step": 539 }, { "epoch": 0.6662553979025293, "grad_norm": 840.4365844726562, "learning_rate": 2.815980742631718e-05, "loss": 4.0824, "step": 540 }, { "epoch": 0.6674892041949414, "grad_norm": 768.6077880859375, "learning_rate": 2.7974314625468724e-05, "loss": 4.1064, "step": 541 }, { "epoch": 0.6687230104873535, "grad_norm": 912.2119140625, "learning_rate": 2.7789197194348816e-05, "loss": 3.9527, "step": 542 }, { "epoch": 0.6699568167797656, "grad_norm": 833.4104614257812, "learning_rate": 2.7604458287795986e-05, "loss": 4.269, "step": 543 }, { "epoch": 0.6711906230721777, "grad_norm": 2589.731201171875, "learning_rate": 2.7420101054197733e-05, "loss": 4.4387, "step": 544 }, { "epoch": 0.6724244293645898, "grad_norm": 659.1605224609375, "learning_rate": 2.7236128635436997e-05, "loss": 4.1201, "step": 545 }, { "epoch": 0.6736582356570019, "grad_norm": 736.2332153320312, "learning_rate": 2.7052544166838544e-05, "loss": 4.2099, "step": 546 }, { "epoch": 0.674892041949414, "grad_norm": 859.8041381835938, "learning_rate": 2.686935077711553e-05, "loss": 4.1371, "step": 547 }, { "epoch": 0.6761258482418261, "grad_norm": 908.8090209960938, "learning_rate": 2.6686551588316277e-05, "loss": 4.1132, "step": 548 }, { "epoch": 0.6773596545342381, "grad_norm": 711.6289672851562, "learning_rate": 2.6504149715770904e-05, "loss": 4.3484, "step": 549 }, { "epoch": 0.6785934608266502, "grad_norm": 718.6626586914062, "learning_rate": 2.632214826803837e-05, "loss": 4.124, "step": 550 }, { "epoch": 0.6798272671190623, "grad_norm": 706.4327392578125, "learning_rate": 2.6140550346853444e-05, "loss": 4.2005, "step": 551 }, { "epoch": 0.6810610734114744, "grad_norm": 738.0886840820312, "learning_rate": 2.5959359047073817e-05, "loss": 4.1552, "step": 552 }, { "epoch": 0.6822948797038865, "grad_norm": 683.3991088867188, "learning_rate": 2.577857745662746e-05, "loss": 4.1813, "step": 553 }, { "epoch": 0.6835286859962986, "grad_norm": 817.3013305664062, "learning_rate": 2.5598208656459855e-05, "loss": 4.1163, "step": 554 }, { "epoch": 0.6847624922887107, "grad_norm": 875.6898193359375, "learning_rate": 2.5418255720481614e-05, "loss": 4.0033, "step": 555 }, { "epoch": 0.6859962985811228, "grad_norm": 709.0074462890625, "learning_rate": 2.5238721715516012e-05, "loss": 4.2382, "step": 556 }, { "epoch": 0.6872301048735349, "grad_norm": 1076.5789794921875, "learning_rate": 2.5059609701246743e-05, "loss": 4.1432, "step": 557 }, { "epoch": 0.688463911165947, "grad_norm": 1058.0235595703125, "learning_rate": 2.4880922730165834e-05, "loss": 4.1446, "step": 558 }, { "epoch": 0.6896977174583591, "grad_norm": 775.5261840820312, "learning_rate": 2.4702663847521484e-05, "loss": 4.044, "step": 559 }, { "epoch": 0.6909315237507712, "grad_norm": 908.505615234375, "learning_rate": 2.4524836091266358e-05, "loss": 4.0728, "step": 560 }, { "epoch": 0.6921653300431833, "grad_norm": 759.946044921875, "learning_rate": 2.4347442492005628e-05, "loss": 4.1196, "step": 561 }, { "epoch": 0.6933991363355954, "grad_norm": 763.1526489257812, "learning_rate": 2.4170486072945407e-05, "loss": 4.2458, "step": 562 }, { "epoch": 0.6946329426280075, "grad_norm": 868.8223266601562, "learning_rate": 2.39939698498413e-05, "loss": 4.3268, "step": 563 }, { "epoch": 0.6958667489204194, "grad_norm": 804.314453125, "learning_rate": 2.3817896830946833e-05, "loss": 4.1783, "step": 564 }, { "epoch": 0.6971005552128315, "grad_norm": 707.9354858398438, "learning_rate": 2.3642270016962377e-05, "loss": 4.3348, "step": 565 }, { "epoch": 0.6983343615052436, "grad_norm": 724.1066284179688, "learning_rate": 2.3467092400983848e-05, "loss": 4.0843, "step": 566 }, { "epoch": 0.6995681677976557, "grad_norm": 501.19415283203125, "learning_rate": 2.3292366968451794e-05, "loss": 4.2207, "step": 567 }, { "epoch": 0.7008019740900678, "grad_norm": 660.9345703125, "learning_rate": 2.311809669710049e-05, "loss": 4.0833, "step": 568 }, { "epoch": 0.7020357803824799, "grad_norm": 614.4649047851562, "learning_rate": 2.2944284556907164e-05, "loss": 4.0431, "step": 569 }, { "epoch": 0.703269586674892, "grad_norm": 809.9785766601562, "learning_rate": 2.2770933510041458e-05, "loss": 4.1038, "step": 570 }, { "epoch": 0.7045033929673041, "grad_norm": 827.5841674804688, "learning_rate": 2.259804651081486e-05, "loss": 4.1789, "step": 571 }, { "epoch": 0.7057371992597162, "grad_norm": 617.230712890625, "learning_rate": 2.242562650563036e-05, "loss": 4.1059, "step": 572 }, { "epoch": 0.7069710055521283, "grad_norm": 955.2245483398438, "learning_rate": 2.225367643293234e-05, "loss": 4.1901, "step": 573 }, { "epoch": 0.7082048118445404, "grad_norm": 931.9966430664062, "learning_rate": 2.2082199223156353e-05, "loss": 4.0366, "step": 574 }, { "epoch": 0.7094386181369525, "grad_norm": 610.5770874023438, "learning_rate": 2.1911197798679302e-05, "loss": 4.1104, "step": 575 }, { "epoch": 0.7106724244293646, "grad_norm": 1126.201171875, "learning_rate": 2.1740675073769527e-05, "loss": 4.3509, "step": 576 }, { "epoch": 0.7119062307217767, "grad_norm": 790.5230712890625, "learning_rate": 2.157063395453727e-05, "loss": 4.127, "step": 577 }, { "epoch": 0.7131400370141888, "grad_norm": 892.1445922851562, "learning_rate": 2.1401077338885022e-05, "loss": 4.1203, "step": 578 }, { "epoch": 0.7143738433066008, "grad_norm": 608.5709228515625, "learning_rate": 2.1232008116458168e-05, "loss": 4.0546, "step": 579 }, { "epoch": 0.7156076495990129, "grad_norm": 773.236083984375, "learning_rate": 2.1063429168595838e-05, "loss": 4.0265, "step": 580 }, { "epoch": 0.716841455891425, "grad_norm": 631.272216796875, "learning_rate": 2.0895343368281656e-05, "loss": 4.167, "step": 581 }, { "epoch": 0.7180752621838371, "grad_norm": 891.7882080078125, "learning_rate": 2.0727753580094867e-05, "loss": 4.1199, "step": 582 }, { "epoch": 0.7193090684762492, "grad_norm": 1047.534912109375, "learning_rate": 2.056066266016151e-05, "loss": 4.161, "step": 583 }, { "epoch": 0.7205428747686613, "grad_norm": 1438.3089599609375, "learning_rate": 2.0394073456105695e-05, "loss": 4.2972, "step": 584 }, { "epoch": 0.7217766810610734, "grad_norm": 932.1802978515625, "learning_rate": 2.0227988807001168e-05, "loss": 4.1325, "step": 585 }, { "epoch": 0.7230104873534855, "grad_norm": 827.4475708007812, "learning_rate": 2.0062411543322802e-05, "loss": 4.086, "step": 586 }, { "epoch": 0.7242442936458976, "grad_norm": 1037.58056640625, "learning_rate": 1.9897344486898482e-05, "loss": 4.0051, "step": 587 }, { "epoch": 0.7254780999383097, "grad_norm": 746.5398559570312, "learning_rate": 1.973279045086091e-05, "loss": 4.2181, "step": 588 }, { "epoch": 0.7267119062307218, "grad_norm": 1007.1620483398438, "learning_rate": 1.95687522395997e-05, "loss": 4.2684, "step": 589 }, { "epoch": 0.7279457125231339, "grad_norm": 1248.197509765625, "learning_rate": 1.9405232648713667e-05, "loss": 4.4022, "step": 590 }, { "epoch": 0.729179518815546, "grad_norm": 1171.357666015625, "learning_rate": 1.9242234464963015e-05, "loss": 4.0653, "step": 591 }, { "epoch": 0.7304133251079581, "grad_norm": 778.9668579101562, "learning_rate": 1.9079760466222025e-05, "loss": 4.2312, "step": 592 }, { "epoch": 0.7316471314003702, "grad_norm": 900.8779296875, "learning_rate": 1.8917813421431552e-05, "loss": 4.1372, "step": 593 }, { "epoch": 0.7328809376927823, "grad_norm": 933.5811157226562, "learning_rate": 1.8756396090551937e-05, "loss": 4.038, "step": 594 }, { "epoch": 0.7341147439851943, "grad_norm": 707.1832275390625, "learning_rate": 1.859551122451598e-05, "loss": 3.9995, "step": 595 }, { "epoch": 0.7353485502776064, "grad_norm": 787.6524658203125, "learning_rate": 1.8435161565181985e-05, "loss": 4.1945, "step": 596 }, { "epoch": 0.7365823565700185, "grad_norm": 672.23779296875, "learning_rate": 1.8275349845287065e-05, "loss": 4.0415, "step": 597 }, { "epoch": 0.7378161628624306, "grad_norm": 914.7695922851562, "learning_rate": 1.8116078788400565e-05, "loss": 4.2472, "step": 598 }, { "epoch": 0.7390499691548427, "grad_norm": 784.9636840820312, "learning_rate": 1.7957351108877718e-05, "loss": 4.1694, "step": 599 }, { "epoch": 0.7402837754472548, "grad_norm": 842.5552368164062, "learning_rate": 1.7799169511813257e-05, "loss": 4.22, "step": 600 }, { "epoch": 0.7415175817396669, "grad_norm": 869.786865234375, "learning_rate": 1.764153669299538e-05, "loss": 4.1254, "step": 601 }, { "epoch": 0.742751388032079, "grad_norm": 722.7039794921875, "learning_rate": 1.7484455338859873e-05, "loss": 4.1921, "step": 602 }, { "epoch": 0.7439851943244911, "grad_norm": 1220.6378173828125, "learning_rate": 1.7327928126444188e-05, "loss": 4.192, "step": 603 }, { "epoch": 0.7452190006169032, "grad_norm": 623.8390502929688, "learning_rate": 1.7171957723341915e-05, "loss": 4.2977, "step": 604 }, { "epoch": 0.7464528069093153, "grad_norm": 594.2061767578125, "learning_rate": 1.701654678765732e-05, "loss": 4.1827, "step": 605 }, { "epoch": 0.7476866132017274, "grad_norm": 930.7269897460938, "learning_rate": 1.686169796795999e-05, "loss": 4.0349, "step": 606 }, { "epoch": 0.7489204194941395, "grad_norm": 849.483154296875, "learning_rate": 1.670741390323976e-05, "loss": 4.0226, "step": 607 }, { "epoch": 0.7501542257865516, "grad_norm": 712.315185546875, "learning_rate": 1.655369722286168e-05, "loss": 4.2015, "step": 608 }, { "epoch": 0.7513880320789637, "grad_norm": 1144.7913818359375, "learning_rate": 1.640055054652122e-05, "loss": 4.0789, "step": 609 }, { "epoch": 0.7513880320789637, "eval_loss": 4.146423816680908, "eval_runtime": 9.7129, "eval_samples_per_second": 140.535, "eval_steps_per_second": 17.606, "step": 609 }, { "epoch": 0.7526218383713756, "grad_norm": 751.4150390625, "learning_rate": 1.6247976484199685e-05, "loss": 4.1387, "step": 610 }, { "epoch": 0.7538556446637877, "grad_norm": 654.2318725585938, "learning_rate": 1.6095977636119612e-05, "loss": 4.0025, "step": 611 }, { "epoch": 0.7550894509561998, "grad_norm": 1046.57666015625, "learning_rate": 1.594455659270061e-05, "loss": 4.1134, "step": 612 }, { "epoch": 0.7563232572486119, "grad_norm": 823.7333984375, "learning_rate": 1.5793715934515064e-05, "loss": 4.1532, "step": 613 }, { "epoch": 0.757557063541024, "grad_norm": 522.1107177734375, "learning_rate": 1.5643458232244252e-05, "loss": 4.1607, "step": 614 }, { "epoch": 0.7587908698334361, "grad_norm": 632.6012573242188, "learning_rate": 1.549378604663449e-05, "loss": 4.0663, "step": 615 }, { "epoch": 0.7600246761258482, "grad_norm": 1016.3590698242188, "learning_rate": 1.534470192845352e-05, "loss": 4.159, "step": 616 }, { "epoch": 0.7612584824182603, "grad_norm": 930.0181884765625, "learning_rate": 1.5196208418447033e-05, "loss": 4.2752, "step": 617 }, { "epoch": 0.7624922887106724, "grad_norm": 657.7666625976562, "learning_rate": 1.5048308047295357e-05, "loss": 4.2391, "step": 618 }, { "epoch": 0.7637260950030845, "grad_norm": 740.6392211914062, "learning_rate": 1.4901003335570291e-05, "loss": 4.0785, "step": 619 }, { "epoch": 0.7649599012954966, "grad_norm": 712.2428588867188, "learning_rate": 1.4754296793692263e-05, "loss": 4.003, "step": 620 }, { "epoch": 0.7661937075879087, "grad_norm": 638.2742309570312, "learning_rate": 1.4608190921887405e-05, "loss": 4.1222, "step": 621 }, { "epoch": 0.7674275138803208, "grad_norm": 771.7957763671875, "learning_rate": 1.4462688210145076e-05, "loss": 3.9993, "step": 622 }, { "epoch": 0.7686613201727329, "grad_norm": 1143.6319580078125, "learning_rate": 1.4317791138175301e-05, "loss": 4.1052, "step": 623 }, { "epoch": 0.769895126465145, "grad_norm": 733.2741088867188, "learning_rate": 1.4173502175366592e-05, "loss": 4.1681, "step": 624 }, { "epoch": 0.7711289327575571, "grad_norm": 580.5585327148438, "learning_rate": 1.4029823780743878e-05, "loss": 4.0505, "step": 625 }, { "epoch": 0.7723627390499691, "grad_norm": 667.1614990234375, "learning_rate": 1.3886758402926508e-05, "loss": 4.0162, "step": 626 }, { "epoch": 0.7735965453423812, "grad_norm": 686.6605834960938, "learning_rate": 1.3744308480086631e-05, "loss": 4.0104, "step": 627 }, { "epoch": 0.7748303516347933, "grad_norm": 779.1243286132812, "learning_rate": 1.3602476439907547e-05, "loss": 4.2397, "step": 628 }, { "epoch": 0.7760641579272054, "grad_norm": 605.6174926757812, "learning_rate": 1.3461264699542386e-05, "loss": 3.9689, "step": 629 }, { "epoch": 0.7772979642196175, "grad_norm": 620.8440551757812, "learning_rate": 1.3320675665572912e-05, "loss": 4.2737, "step": 630 }, { "epoch": 0.7785317705120296, "grad_norm": 1106.942138671875, "learning_rate": 1.3180711733968476e-05, "loss": 4.1882, "step": 631 }, { "epoch": 0.7797655768044417, "grad_norm": 1045.4193115234375, "learning_rate": 1.3041375290045266e-05, "loss": 4.0532, "step": 632 }, { "epoch": 0.7809993830968538, "grad_norm": 1204.981201171875, "learning_rate": 1.2902668708425531e-05, "loss": 4.1542, "step": 633 }, { "epoch": 0.7822331893892659, "grad_norm": 800.9042358398438, "learning_rate": 1.27645943529972e-05, "loss": 4.0571, "step": 634 }, { "epoch": 0.783466995681678, "grad_norm": 1350.43798828125, "learning_rate": 1.26271545768736e-05, "loss": 4.282, "step": 635 }, { "epoch": 0.7847008019740901, "grad_norm": 1843.5970458984375, "learning_rate": 1.2490351722353283e-05, "loss": 4.1206, "step": 636 }, { "epoch": 0.7859346082665022, "grad_norm": 736.8824462890625, "learning_rate": 1.2354188120880206e-05, "loss": 4.0556, "step": 637 }, { "epoch": 0.7871684145589143, "grad_norm": 566.03515625, "learning_rate": 1.2218666093003883e-05, "loss": 4.0476, "step": 638 }, { "epoch": 0.7884022208513264, "grad_norm": 803.45458984375, "learning_rate": 1.2083787948339925e-05, "loss": 4.1298, "step": 639 }, { "epoch": 0.7896360271437385, "grad_norm": 753.2470092773438, "learning_rate": 1.1949555985530681e-05, "loss": 4.1376, "step": 640 }, { "epoch": 0.7908698334361505, "grad_norm": 740.588134765625, "learning_rate": 1.1815972492205974e-05, "loss": 4.0778, "step": 641 }, { "epoch": 0.7921036397285626, "grad_norm": 733.8716430664062, "learning_rate": 1.1683039744944235e-05, "loss": 4.1101, "step": 642 }, { "epoch": 0.7933374460209747, "grad_norm": 557.7567138671875, "learning_rate": 1.1550760009233608e-05, "loss": 4.0159, "step": 643 }, { "epoch": 0.7945712523133868, "grad_norm": 897.4954833984375, "learning_rate": 1.1419135539433357e-05, "loss": 4.1767, "step": 644 }, { "epoch": 0.7958050586057989, "grad_norm": 827.8949584960938, "learning_rate": 1.1288168578735541e-05, "loss": 4.0123, "step": 645 }, { "epoch": 0.797038864898211, "grad_norm": 620.9117431640625, "learning_rate": 1.1157861359126637e-05, "loss": 4.0085, "step": 646 }, { "epoch": 0.7982726711906231, "grad_norm": 883.6416625976562, "learning_rate": 1.1028216101349604e-05, "loss": 4.2117, "step": 647 }, { "epoch": 0.7995064774830352, "grad_norm": 920.2574462890625, "learning_rate": 1.0899235014866006e-05, "loss": 4.211, "step": 648 }, { "epoch": 0.8007402837754473, "grad_norm": 1049.3328857421875, "learning_rate": 1.0770920297818338e-05, "loss": 4.0649, "step": 649 }, { "epoch": 0.8019740900678594, "grad_norm": 1121.3260498046875, "learning_rate": 1.0643274136992642e-05, "loss": 4.1647, "step": 650 }, { "epoch": 0.8032078963602715, "grad_norm": 774.3944702148438, "learning_rate": 1.0516298707781108e-05, "loss": 4.1825, "step": 651 }, { "epoch": 0.8044417026526836, "grad_norm": 625.9962768554688, "learning_rate": 1.0389996174145145e-05, "loss": 4.1092, "step": 652 }, { "epoch": 0.8056755089450957, "grad_norm": 648.0266723632812, "learning_rate": 1.0264368688578374e-05, "loss": 4.2745, "step": 653 }, { "epoch": 0.8069093152375078, "grad_norm": 954.5858764648438, "learning_rate": 1.0139418392070022e-05, "loss": 4.087, "step": 654 }, { "epoch": 0.8081431215299199, "grad_norm": 821.7733764648438, "learning_rate": 1.0015147414068431e-05, "loss": 4.168, "step": 655 }, { "epoch": 0.809376927822332, "grad_norm": 913.7914428710938, "learning_rate": 9.891557872444723e-06, "loss": 4.0597, "step": 656 }, { "epoch": 0.8106107341147439, "grad_norm": 826.9698486328125, "learning_rate": 9.768651873456763e-06, "loss": 4.1464, "step": 657 }, { "epoch": 0.811844540407156, "grad_norm": 804.5733642578125, "learning_rate": 9.646431511713206e-06, "loss": 4.3219, "step": 658 }, { "epoch": 0.8130783466995681, "grad_norm": 731.3887329101562, "learning_rate": 9.524898870137828e-06, "loss": 4.2377, "step": 659 }, { "epoch": 0.8143121529919802, "grad_norm": 867.5519409179688, "learning_rate": 9.404056019934072e-06, "loss": 4.3898, "step": 660 }, { "epoch": 0.8155459592843923, "grad_norm": 723.6217651367188, "learning_rate": 9.283905020549654e-06, "loss": 4.2068, "step": 661 }, { "epoch": 0.8167797655768044, "grad_norm": 836.3093872070312, "learning_rate": 9.164447919641538e-06, "loss": 4.2795, "step": 662 }, { "epoch": 0.8180135718692165, "grad_norm": 724.6507568359375, "learning_rate": 9.045686753041017e-06, "loss": 4.0401, "step": 663 }, { "epoch": 0.8192473781616286, "grad_norm": 701.7694091796875, "learning_rate": 8.92762354471901e-06, "loss": 4.1451, "step": 664 }, { "epoch": 0.8204811844540407, "grad_norm": 870.6682739257812, "learning_rate": 8.810260306751611e-06, "loss": 4.1369, "step": 665 }, { "epoch": 0.8217149907464528, "grad_norm": 810.0514526367188, "learning_rate": 8.693599039285717e-06, "loss": 4.1027, "step": 666 }, { "epoch": 0.8229487970388649, "grad_norm": 664.614013671875, "learning_rate": 8.577641730505032e-06, "loss": 4.2311, "step": 667 }, { "epoch": 0.824182603331277, "grad_norm": 878.1331787109375, "learning_rate": 8.462390356596117e-06, "loss": 4.0878, "step": 668 }, { "epoch": 0.8254164096236891, "grad_norm": 737.3565063476562, "learning_rate": 8.347846881714715e-06, "loss": 4.1636, "step": 669 }, { "epoch": 0.8266502159161012, "grad_norm": 569.6368408203125, "learning_rate": 8.234013257952356e-06, "loss": 4.1989, "step": 670 }, { "epoch": 0.8278840222085133, "grad_norm": 1296.9088134765625, "learning_rate": 8.120891425302962e-06, "loss": 4.153, "step": 671 }, { "epoch": 0.8291178285009253, "grad_norm": 723.2803344726562, "learning_rate": 8.008483311629911e-06, "loss": 4.0184, "step": 672 }, { "epoch": 0.8303516347933374, "grad_norm": 801.1158447265625, "learning_rate": 7.896790832633072e-06, "loss": 4.2169, "step": 673 }, { "epoch": 0.8315854410857495, "grad_norm": 622.2526245117188, "learning_rate": 7.785815891816255e-06, "loss": 4.1782, "step": 674 }, { "epoch": 0.8328192473781616, "grad_norm": 985.3092651367188, "learning_rate": 7.675560380454695e-06, "loss": 4.2106, "step": 675 }, { "epoch": 0.8340530536705737, "grad_norm": 885.2922973632812, "learning_rate": 7.566026177562846e-06, "loss": 4.2153, "step": 676 }, { "epoch": 0.8352868599629858, "grad_norm": 659.1461791992188, "learning_rate": 7.457215149862373e-06, "loss": 4.1667, "step": 677 }, { "epoch": 0.8365206662553979, "grad_norm": 686.8173217773438, "learning_rate": 7.349129151750311e-06, "loss": 4.0595, "step": 678 }, { "epoch": 0.83775447254781, "grad_norm": 724.762451171875, "learning_rate": 7.241770025267519e-06, "loss": 4.1332, "step": 679 }, { "epoch": 0.8389882788402221, "grad_norm": 991.0209350585938, "learning_rate": 7.135139600067203e-06, "loss": 4.319, "step": 680 }, { "epoch": 0.8402220851326342, "grad_norm": 626.606689453125, "learning_rate": 7.0292396933837765e-06, "loss": 4.0464, "step": 681 }, { "epoch": 0.8414558914250463, "grad_norm": 471.4643859863281, "learning_rate": 6.924072110001933e-06, "loss": 4.1906, "step": 682 }, { "epoch": 0.8426896977174584, "grad_norm": 807.5468139648438, "learning_rate": 6.819638642225795e-06, "loss": 4.0246, "step": 683 }, { "epoch": 0.8439235040098705, "grad_norm": 653.949951171875, "learning_rate": 6.715941069848458e-06, "loss": 4.1349, "step": 684 }, { "epoch": 0.8451573103022826, "grad_norm": 808.3042602539062, "learning_rate": 6.612981160121612e-06, "loss": 4.0238, "step": 685 }, { "epoch": 0.8463911165946947, "grad_norm": 913.7222900390625, "learning_rate": 6.510760667725408e-06, "loss": 4.1736, "step": 686 }, { "epoch": 0.8476249228871067, "grad_norm": 776.08642578125, "learning_rate": 6.409281334738615e-06, "loss": 4.1393, "step": 687 }, { "epoch": 0.8488587291795188, "grad_norm": 752.64599609375, "learning_rate": 6.308544890608864e-06, "loss": 4.0782, "step": 688 }, { "epoch": 0.8500925354719309, "grad_norm": 927.5094604492188, "learning_rate": 6.208553052123234e-06, "loss": 4.302, "step": 689 }, { "epoch": 0.851326341764343, "grad_norm": 1267.0023193359375, "learning_rate": 6.1093075233789375e-06, "loss": 4.0104, "step": 690 }, { "epoch": 0.8525601480567551, "grad_norm": 799.9869384765625, "learning_rate": 6.0108099957543075e-06, "loss": 4.2453, "step": 691 }, { "epoch": 0.8537939543491672, "grad_norm": 682.4727783203125, "learning_rate": 5.913062147879994e-06, "loss": 4.2432, "step": 692 }, { "epoch": 0.8550277606415793, "grad_norm": 799.929931640625, "learning_rate": 5.816065645610313e-06, "loss": 4.1794, "step": 693 }, { "epoch": 0.8562615669339914, "grad_norm": 678.6499633789062, "learning_rate": 5.719822141994874e-06, "loss": 4.1272, "step": 694 }, { "epoch": 0.8574953732264035, "grad_norm": 890.60986328125, "learning_rate": 5.624333277250415e-06, "loss": 4.0011, "step": 695 }, { "epoch": 0.8587291795188156, "grad_norm": 525.164306640625, "learning_rate": 5.529600678732843e-06, "loss": 4.358, "step": 696 }, { "epoch": 0.8599629858112277, "grad_norm": 634.1682739257812, "learning_rate": 5.435625960909513e-06, "loss": 4.168, "step": 697 }, { "epoch": 0.8611967921036398, "grad_norm": 1266.8399658203125, "learning_rate": 5.342410725331682e-06, "loss": 4.0225, "step": 698 }, { "epoch": 0.8624305983960519, "grad_norm": 951.68212890625, "learning_rate": 5.249956560607255e-06, "loss": 4.3119, "step": 699 }, { "epoch": 0.863664404688464, "grad_norm": 1137.700439453125, "learning_rate": 5.158265042373672e-06, "loss": 4.0688, "step": 700 }, { "epoch": 0.864898210980876, "grad_norm": 1199.1778564453125, "learning_rate": 5.067337733271083e-06, "loss": 4.1535, "step": 701 }, { "epoch": 0.8661320172732881, "grad_norm": 1050.1751708984375, "learning_rate": 4.977176182915727e-06, "loss": 4.1608, "step": 702 }, { "epoch": 0.8673658235657001, "grad_norm": 601.8255615234375, "learning_rate": 4.887781927873458e-06, "loss": 4.0056, "step": 703 }, { "epoch": 0.8685996298581122, "grad_norm": 781.65283203125, "learning_rate": 4.799156491633655e-06, "loss": 4.2278, "step": 704 }, { "epoch": 0.8698334361505243, "grad_norm": 953.7297973632812, "learning_rate": 4.711301384583183e-06, "loss": 3.9453, "step": 705 }, { "epoch": 0.8710672424429364, "grad_norm": 682.08837890625, "learning_rate": 4.624218103980665e-06, "loss": 3.9502, "step": 706 }, { "epoch": 0.8723010487353485, "grad_norm": 999.5166625976562, "learning_rate": 4.537908133931018e-06, "loss": 4.0734, "step": 707 }, { "epoch": 0.8735348550277606, "grad_norm": 1854.658935546875, "learning_rate": 4.452372945360072e-06, "loss": 4.0023, "step": 708 }, { "epoch": 0.8747686613201727, "grad_norm": 974.3369750976562, "learning_rate": 4.367613995989589e-06, "loss": 4.1085, "step": 709 }, { "epoch": 0.8760024676125848, "grad_norm": 929.5103759765625, "learning_rate": 4.283632730312348e-06, "loss": 4.2707, "step": 710 }, { "epoch": 0.8772362739049969, "grad_norm": 820.7687377929688, "learning_rate": 4.200430579567571e-06, "loss": 4.0151, "step": 711 }, { "epoch": 0.878470080197409, "grad_norm": 851.3623657226562, "learning_rate": 4.118008961716552e-06, "loss": 4.1227, "step": 712 }, { "epoch": 0.8797038864898211, "grad_norm": 1031.295166015625, "learning_rate": 4.0363692814184e-06, "loss": 4.0757, "step": 713 }, { "epoch": 0.8809376927822332, "grad_norm": 1569.51318359375, "learning_rate": 3.955512930006222e-06, "loss": 4.3649, "step": 714 }, { "epoch": 0.8821714990746453, "grad_norm": 1083.7933349609375, "learning_rate": 3.87544128546331e-06, "loss": 4.0284, "step": 715 }, { "epoch": 0.8834053053670574, "grad_norm": 761.168212890625, "learning_rate": 3.7961557123997018e-06, "loss": 4.2838, "step": 716 }, { "epoch": 0.8846391116594695, "grad_norm": 821.79443359375, "learning_rate": 3.7176575620289368e-06, "loss": 4.0374, "step": 717 }, { "epoch": 0.8858729179518815, "grad_norm": 877.775146484375, "learning_rate": 3.6399481721449857e-06, "loss": 4.174, "step": 718 }, { "epoch": 0.8871067242442936, "grad_norm": 701.419677734375, "learning_rate": 3.563028867099505e-06, "loss": 3.98, "step": 719 }, { "epoch": 0.8883405305367057, "grad_norm": 802.7510375976562, "learning_rate": 3.4869009577792157e-06, "loss": 4.0774, "step": 720 }, { "epoch": 0.8895743368291178, "grad_norm": 699.1093139648438, "learning_rate": 3.4115657415835835e-06, "loss": 4.0156, "step": 721 }, { "epoch": 0.8908081431215299, "grad_norm": 733.71484375, "learning_rate": 3.3370245024027414e-06, "loss": 4.2041, "step": 722 }, { "epoch": 0.892041949413942, "grad_norm": 527.6852416992188, "learning_rate": 3.2632785105955467e-06, "loss": 3.9713, "step": 723 }, { "epoch": 0.8932757557063541, "grad_norm": 798.1897583007812, "learning_rate": 3.190329022967975e-06, "loss": 4.0734, "step": 724 }, { "epoch": 0.8945095619987662, "grad_norm": 723.0835571289062, "learning_rate": 3.1181772827516664e-06, "loss": 4.0981, "step": 725 }, { "epoch": 0.8957433682911783, "grad_norm": 783.1177978515625, "learning_rate": 3.046824519582808e-06, "loss": 4.0234, "step": 726 }, { "epoch": 0.8969771745835904, "grad_norm": 666.199462890625, "learning_rate": 2.9762719494810855e-06, "loss": 3.9911, "step": 727 }, { "epoch": 0.8982109808760025, "grad_norm": 901.4703369140625, "learning_rate": 2.9065207748290134e-06, "loss": 4.3203, "step": 728 }, { "epoch": 0.8994447871684146, "grad_norm": 615.899169921875, "learning_rate": 2.83757218435145e-06, "loss": 4.1368, "step": 729 }, { "epoch": 0.9006785934608267, "grad_norm": 953.3482666015625, "learning_rate": 2.7694273530953163e-06, "loss": 4.2854, "step": 730 }, { "epoch": 0.9019123997532388, "grad_norm": 939.6257934570312, "learning_rate": 2.702087442409551e-06, "loss": 4.0877, "step": 731 }, { "epoch": 0.9031462060456509, "grad_norm": 666.4422607421875, "learning_rate": 2.6355535999253887e-06, "loss": 4.1066, "step": 732 }, { "epoch": 0.904380012338063, "grad_norm": 737.6424560546875, "learning_rate": 2.5698269595367254e-06, "loss": 4.1649, "step": 733 }, { "epoch": 0.905613818630475, "grad_norm": 659.7061157226562, "learning_rate": 2.5049086413808377e-06, "loss": 4.0905, "step": 734 }, { "epoch": 0.9068476249228871, "grad_norm": 725.164306640625, "learning_rate": 2.4407997518192728e-06, "loss": 3.9756, "step": 735 }, { "epoch": 0.9080814312152992, "grad_norm": 1040.10205078125, "learning_rate": 2.377501383419006e-06, "loss": 4.0973, "step": 736 }, { "epoch": 0.9093152375077113, "grad_norm": 933.7422485351562, "learning_rate": 2.3150146149338247e-06, "loss": 3.9678, "step": 737 }, { "epoch": 0.9105490438001234, "grad_norm": 729.0170288085938, "learning_rate": 2.253340511285923e-06, "loss": 4.0576, "step": 738 }, { "epoch": 0.9117828500925355, "grad_norm": 700.053955078125, "learning_rate": 2.1924801235477743e-06, "loss": 4.2267, "step": 739 }, { "epoch": 0.9130166563849476, "grad_norm": 834.5526733398438, "learning_rate": 2.132434488924212e-06, "loss": 3.9349, "step": 740 }, { "epoch": 0.9142504626773597, "grad_norm": 1009.9600830078125, "learning_rate": 2.073204630734743e-06, "loss": 4.2046, "step": 741 }, { "epoch": 0.9154842689697718, "grad_norm": 581.45849609375, "learning_rate": 2.0147915583961173e-06, "loss": 4.1017, "step": 742 }, { "epoch": 0.9167180752621839, "grad_norm": 612.1376953125, "learning_rate": 1.9571962674051204e-06, "loss": 4.2051, "step": 743 }, { "epoch": 0.917951881554596, "grad_norm": 963.0797729492188, "learning_rate": 1.9004197393216294e-06, "loss": 4.1168, "step": 744 }, { "epoch": 0.919185687847008, "grad_norm": 852.234130859375, "learning_rate": 1.84446294175184e-06, "loss": 4.0354, "step": 745 }, { "epoch": 0.9204194941394201, "grad_norm": 1199.9132080078125, "learning_rate": 1.7893268283318276e-06, "loss": 4.3468, "step": 746 }, { "epoch": 0.9216533004318322, "grad_norm": 724.0450439453125, "learning_rate": 1.735012338711256e-06, "loss": 4.0663, "step": 747 }, { "epoch": 0.9228871067242443, "grad_norm": 807.9916381835938, "learning_rate": 1.681520398537373e-06, "loss": 4.116, "step": 748 }, { "epoch": 0.9241209130166563, "grad_norm": 1655.5943603515625, "learning_rate": 1.6288519194392615e-06, "loss": 4.157, "step": 749 }, { "epoch": 0.9253547193090684, "grad_norm": 559.2360229492188, "learning_rate": 1.5770077990122645e-06, "loss": 3.9687, "step": 750 }, { "epoch": 0.9265885256014805, "grad_norm": 1023.9130249023438, "learning_rate": 1.5259889208027013e-06, "loss": 4.1494, "step": 751 }, { "epoch": 0.9278223318938926, "grad_norm": 682.001220703125, "learning_rate": 1.4757961542928355e-06, "loss": 4.1164, "step": 752 }, { "epoch": 0.9290561381863047, "grad_norm": 781.3024291992188, "learning_rate": 1.4264303548859992e-06, "loss": 4.0709, "step": 753 }, { "epoch": 0.9302899444787168, "grad_norm": 1117.8253173828125, "learning_rate": 1.3778923638920971e-06, "loss": 4.2305, "step": 754 }, { "epoch": 0.9315237507711289, "grad_norm": 982.4514770507812, "learning_rate": 1.330183008513186e-06, "loss": 4.0738, "step": 755 }, { "epoch": 0.932757557063541, "grad_norm": 663.4027709960938, "learning_rate": 1.283303101829425e-06, "loss": 4.2843, "step": 756 }, { "epoch": 0.9339913633559531, "grad_norm": 605.0052490234375, "learning_rate": 1.2372534427852079e-06, "loss": 4.1355, "step": 757 }, { "epoch": 0.9352251696483652, "grad_norm": 678.015625, "learning_rate": 1.1920348161755413e-06, "loss": 4.1688, "step": 758 }, { "epoch": 0.9364589759407773, "grad_norm": 1134.179443359375, "learning_rate": 1.1476479926326945e-06, "loss": 4.2093, "step": 759 }, { "epoch": 0.9376927822331894, "grad_norm": 772.4798583984375, "learning_rate": 1.104093728613026e-06, "loss": 4.058, "step": 760 }, { "epoch": 0.9389265885256015, "grad_norm": 913.7644653320312, "learning_rate": 1.061372766384111e-06, "loss": 4.0062, "step": 761 }, { "epoch": 0.9401603948180136, "grad_norm": 660.7054443359375, "learning_rate": 1.0194858340121183e-06, "loss": 4.0886, "step": 762 }, { "epoch": 0.9413942011104257, "grad_norm": 673.8242797851562, "learning_rate": 9.784336453493414e-07, "loss": 4.1026, "step": 763 }, { "epoch": 0.9426280074028378, "grad_norm": 671.2329711914062, "learning_rate": 9.382169000221041e-07, "loss": 4.1491, "step": 764 }, { "epoch": 0.9438618136952498, "grad_norm": 581.8679809570312, "learning_rate": 8.988362834187747e-07, "loss": 4.0206, "step": 765 }, { "epoch": 0.9450956199876619, "grad_norm": 1120.507568359375, "learning_rate": 8.602924666781254e-07, "loss": 4.0462, "step": 766 }, { "epoch": 0.946329426280074, "grad_norm": 666.160400390625, "learning_rate": 8.225861066778806e-07, "loss": 4.0883, "step": 767 }, { "epoch": 0.9475632325724861, "grad_norm": 679.1992797851562, "learning_rate": 7.857178460235148e-07, "loss": 3.9554, "step": 768 }, { "epoch": 0.9487970388648982, "grad_norm": 750.4222412109375, "learning_rate": 7.496883130373167e-07, "loss": 4.1345, "step": 769 }, { "epoch": 0.9500308451573103, "grad_norm": 989.0843505859375, "learning_rate": 7.144981217476754e-07, "loss": 4.0929, "step": 770 }, { "epoch": 0.9512646514497224, "grad_norm": 751.5528564453125, "learning_rate": 6.801478718785947e-07, "loss": 4.0241, "step": 771 }, { "epoch": 0.9524984577421345, "grad_norm": 999.92431640625, "learning_rate": 6.46638148839529e-07, "loss": 4.0454, "step": 772 }, { "epoch": 0.9537322640345466, "grad_norm": 859.3124389648438, "learning_rate": 6.139695237153298e-07, "loss": 4.1067, "step": 773 }, { "epoch": 0.9549660703269587, "grad_norm": 935.9942626953125, "learning_rate": 5.821425532565816e-07, "loss": 4.1005, "step": 774 }, { "epoch": 0.9561998766193708, "grad_norm": 935.444580078125, "learning_rate": 5.511577798700596e-07, "loss": 4.1124, "step": 775 }, { "epoch": 0.9574336829117829, "grad_norm": 812.61328125, "learning_rate": 5.21015731609531e-07, "loss": 4.0986, "step": 776 }, { "epoch": 0.958667489204195, "grad_norm": 719.1353149414062, "learning_rate": 4.91716922166735e-07, "loss": 4.0122, "step": 777 }, { "epoch": 0.9599012954966071, "grad_norm": 614.0670776367188, "learning_rate": 4.632618508626063e-07, "loss": 4.2284, "step": 778 }, { "epoch": 0.9611351017890192, "grad_norm": 732.4361572265625, "learning_rate": 4.35651002638815e-07, "loss": 4.0553, "step": 779 }, { "epoch": 0.9623689080814312, "grad_norm": 739.580322265625, "learning_rate": 4.088848480494567e-07, "loss": 4.1197, "step": 780 }, { "epoch": 0.9636027143738433, "grad_norm": 630.6943359375, "learning_rate": 3.829638432530702e-07, "loss": 4.1957, "step": 781 }, { "epoch": 0.9648365206662554, "grad_norm": 787.4408569335938, "learning_rate": 3.5788843000481576e-07, "loss": 4.0408, "step": 782 }, { "epoch": 0.9660703269586675, "grad_norm": 656.0836181640625, "learning_rate": 3.336590356489977e-07, "loss": 4.0707, "step": 783 }, { "epoch": 0.9673041332510796, "grad_norm": 1066.859130859375, "learning_rate": 3.102760731117593e-07, "loss": 4.08, "step": 784 }, { "epoch": 0.9685379395434917, "grad_norm": 636.3853149414062, "learning_rate": 2.8773994089402734e-07, "loss": 4.1094, "step": 785 }, { "epoch": 0.9697717458359038, "grad_norm": 833.309814453125, "learning_rate": 2.6605102306476724e-07, "loss": 4.2544, "step": 786 }, { "epoch": 0.9710055521283159, "grad_norm": 769.7086181640625, "learning_rate": 2.452096892543776e-07, "loss": 4.152, "step": 787 }, { "epoch": 0.972239358420728, "grad_norm": 737.0515747070312, "learning_rate": 2.2521629464844486e-07, "loss": 4.2431, "step": 788 }, { "epoch": 0.97347316471314, "grad_norm": 683.5414428710938, "learning_rate": 2.0607117998165947e-07, "loss": 4.2063, "step": 789 }, { "epoch": 0.9747069710055521, "grad_norm": 725.9776000976562, "learning_rate": 1.8777467153202054e-07, "loss": 4.2039, "step": 790 }, { "epoch": 0.9759407772979642, "grad_norm": 562.0029907226562, "learning_rate": 1.703270811152624e-07, "loss": 4.1486, "step": 791 }, { "epoch": 0.9771745835903763, "grad_norm": 756.2742309570312, "learning_rate": 1.5372870607956446e-07, "loss": 4.0233, "step": 792 }, { "epoch": 0.9784083898827884, "grad_norm": 777.6002197265625, "learning_rate": 1.3797982930044972e-07, "loss": 4.1716, "step": 793 }, { "epoch": 0.9796421961752005, "grad_norm": 1127.728759765625, "learning_rate": 1.2308071917601083e-07, "loss": 4.2959, "step": 794 }, { "epoch": 0.9808760024676126, "grad_norm": 827.7559814453125, "learning_rate": 1.0903162962228597e-07, "loss": 4.11, "step": 795 }, { "epoch": 0.9821098087600246, "grad_norm": 734.08984375, "learning_rate": 9.583280006895679e-08, "loss": 4.0029, "step": 796 }, { "epoch": 0.9833436150524367, "grad_norm": 1221.7593994140625, "learning_rate": 8.348445545527939e-08, "loss": 4.4357, "step": 797 }, { "epoch": 0.9845774213448488, "grad_norm": 632.0841674804688, "learning_rate": 7.198680622621523e-08, "loss": 4.0149, "step": 798 }, { "epoch": 0.9858112276372609, "grad_norm": 644.5614624023438, "learning_rate": 6.134004832888396e-08, "loss": 4.1734, "step": 799 }, { "epoch": 0.987045033929673, "grad_norm": 802.9095458984375, "learning_rate": 5.154436320919942e-08, "loss": 4.0969, "step": 800 }, { "epoch": 0.9882788402220851, "grad_norm": 657.191162109375, "learning_rate": 4.25999178087888e-08, "loss": 4.127, "step": 801 }, { "epoch": 0.9895126465144972, "grad_norm": 618.2482299804688, "learning_rate": 3.4506864562133815e-08, "loss": 3.99, "step": 802 }, { "epoch": 0.9907464528069093, "grad_norm": 729.18798828125, "learning_rate": 2.7265341393983845e-08, "loss": 4.0454, "step": 803 }, { "epoch": 0.9919802590993214, "grad_norm": 660.1466064453125, "learning_rate": 2.0875471717013427e-08, "loss": 4.1317, "step": 804 }, { "epoch": 0.9932140653917335, "grad_norm": 619.5026245117188, "learning_rate": 1.5337364429696132e-08, "loss": 3.9811, "step": 805 }, { "epoch": 0.9944478716841456, "grad_norm": 849.2465209960938, "learning_rate": 1.065111391447271e-08, "loss": 4.1834, "step": 806 }, { "epoch": 0.9956816779765577, "grad_norm": 727.19580078125, "learning_rate": 6.816800036124615e-09, "loss": 4.224, "step": 807 }, { "epoch": 0.9969154842689698, "grad_norm": 646.8365478515625, "learning_rate": 3.8344881404195345e-09, "loss": 3.9375, "step": 808 }, { "epoch": 0.9981492905613819, "grad_norm": 956.5570068359375, "learning_rate": 1.7042290529956095e-09, "loss": 4.0443, "step": 809 }, { "epoch": 0.999383096853794, "grad_norm": 1171.58642578125, "learning_rate": 4.260590785121199e-10, "loss": 4.0195, "step": 810 }, { "epoch": 1.000616903146206, "grad_norm": 823.234130859375, "learning_rate": 0.0, "loss": 4.1185, "step": 811 } ], "logging_steps": 1, "max_steps": 811, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5387190162948096.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }