{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 625, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0016, "grad_norm": 1.3979606628417969, "learning_rate": 7.936507936507937e-07, "loss": 1.7057, "step": 1 }, { "epoch": 0.0032, "grad_norm": 1.2405153512954712, "learning_rate": 1.5873015873015873e-06, "loss": 1.631, "step": 2 }, { "epoch": 0.0048, "grad_norm": 1.4436768293380737, "learning_rate": 2.3809523809523808e-06, "loss": 1.7883, "step": 3 }, { "epoch": 0.0064, "grad_norm": 1.3493705987930298, "learning_rate": 3.1746031746031746e-06, "loss": 1.6246, "step": 4 }, { "epoch": 0.008, "grad_norm": 1.3904924392700195, "learning_rate": 3.968253968253968e-06, "loss": 1.663, "step": 5 }, { "epoch": 0.0096, "grad_norm": 1.1314210891723633, "learning_rate": 4.7619047619047615e-06, "loss": 1.7002, "step": 6 }, { "epoch": 0.0112, "grad_norm": 1.205215334892273, "learning_rate": 5.555555555555556e-06, "loss": 1.8956, "step": 7 }, { "epoch": 0.0128, "grad_norm": 1.3392200469970703, "learning_rate": 6.349206349206349e-06, "loss": 1.8131, "step": 8 }, { "epoch": 0.0144, "grad_norm": 1.283113718032837, "learning_rate": 7.142857142857143e-06, "loss": 1.872, "step": 9 }, { "epoch": 0.016, "grad_norm": 1.3169431686401367, "learning_rate": 7.936507936507936e-06, "loss": 1.6827, "step": 10 }, { "epoch": 0.0176, "grad_norm": 1.2517348527908325, "learning_rate": 8.73015873015873e-06, "loss": 1.5772, "step": 11 }, { "epoch": 0.0192, "grad_norm": 1.195190668106079, "learning_rate": 9.523809523809523e-06, "loss": 1.626, "step": 12 }, { "epoch": 0.0208, "grad_norm": 1.2285528182983398, "learning_rate": 1.0317460317460318e-05, "loss": 1.7562, "step": 13 }, { "epoch": 0.0224, "grad_norm": 1.204359531402588, "learning_rate": 1.1111111111111112e-05, "loss": 1.6901, "step": 14 }, { "epoch": 0.024, "grad_norm": 1.1920750141143799, "learning_rate": 1.1904761904761905e-05, "loss": 1.6725, "step": 15 }, { "epoch": 0.0256, "grad_norm": 1.456931471824646, "learning_rate": 1.2698412698412699e-05, "loss": 1.7348, "step": 16 }, { "epoch": 0.0272, "grad_norm": 1.165505051612854, "learning_rate": 1.3492063492063492e-05, "loss": 1.7561, "step": 17 }, { "epoch": 0.0288, "grad_norm": 1.3036893606185913, "learning_rate": 1.4285714285714285e-05, "loss": 1.7304, "step": 18 }, { "epoch": 0.0304, "grad_norm": 1.069793462753296, "learning_rate": 1.5079365079365079e-05, "loss": 1.5716, "step": 19 }, { "epoch": 0.032, "grad_norm": 1.0666621923446655, "learning_rate": 1.5873015873015872e-05, "loss": 1.6092, "step": 20 }, { "epoch": 0.0336, "grad_norm": 1.1366040706634521, "learning_rate": 1.6666666666666667e-05, "loss": 1.6673, "step": 21 }, { "epoch": 0.0352, "grad_norm": 0.9361292719841003, "learning_rate": 1.746031746031746e-05, "loss": 1.5994, "step": 22 }, { "epoch": 0.0368, "grad_norm": 1.1505255699157715, "learning_rate": 1.8253968253968254e-05, "loss": 1.5517, "step": 23 }, { "epoch": 0.0384, "grad_norm": 0.9984336495399475, "learning_rate": 1.9047619047619046e-05, "loss": 1.6453, "step": 24 }, { "epoch": 0.04, "grad_norm": 1.2266405820846558, "learning_rate": 1.984126984126984e-05, "loss": 1.5835, "step": 25 }, { "epoch": 0.0416, "grad_norm": 0.9882978200912476, "learning_rate": 2.0634920634920636e-05, "loss": 1.4833, "step": 26 }, { "epoch": 0.0432, "grad_norm": 1.2320302724838257, "learning_rate": 2.1428571428571428e-05, "loss": 1.5261, "step": 27 }, { "epoch": 0.0448, "grad_norm": 1.0345029830932617, "learning_rate": 2.2222222222222223e-05, "loss": 1.4773, "step": 28 }, { "epoch": 0.0464, "grad_norm": 0.9714513421058655, "learning_rate": 2.3015873015873015e-05, "loss": 1.4822, "step": 29 }, { "epoch": 0.048, "grad_norm": 0.8878258466720581, "learning_rate": 2.380952380952381e-05, "loss": 1.5331, "step": 30 }, { "epoch": 0.0496, "grad_norm": 0.9958455562591553, "learning_rate": 2.4603174603174602e-05, "loss": 1.5817, "step": 31 }, { "epoch": 0.0512, "grad_norm": 0.8074731230735779, "learning_rate": 2.5396825396825397e-05, "loss": 1.4562, "step": 32 }, { "epoch": 0.0528, "grad_norm": 0.9326385855674744, "learning_rate": 2.6190476190476192e-05, "loss": 1.4886, "step": 33 }, { "epoch": 0.0544, "grad_norm": 0.7399032115936279, "learning_rate": 2.6984126984126984e-05, "loss": 1.4769, "step": 34 }, { "epoch": 0.056, "grad_norm": 0.7977785468101501, "learning_rate": 2.777777777777778e-05, "loss": 1.5056, "step": 35 }, { "epoch": 0.0576, "grad_norm": 0.8815913796424866, "learning_rate": 2.857142857142857e-05, "loss": 1.5069, "step": 36 }, { "epoch": 0.0592, "grad_norm": 0.7610746622085571, "learning_rate": 2.9365079365079366e-05, "loss": 1.5323, "step": 37 }, { "epoch": 0.0608, "grad_norm": 0.9050503969192505, "learning_rate": 3.0158730158730158e-05, "loss": 1.395, "step": 38 }, { "epoch": 0.0624, "grad_norm": 0.9405601024627686, "learning_rate": 3.095238095238095e-05, "loss": 1.5173, "step": 39 }, { "epoch": 0.064, "grad_norm": 0.8589337468147278, "learning_rate": 3.1746031746031745e-05, "loss": 1.5219, "step": 40 }, { "epoch": 0.0656, "grad_norm": 0.8924518823623657, "learning_rate": 3.253968253968254e-05, "loss": 1.5336, "step": 41 }, { "epoch": 0.0672, "grad_norm": 0.9373279809951782, "learning_rate": 3.3333333333333335e-05, "loss": 1.4083, "step": 42 }, { "epoch": 0.0688, "grad_norm": 0.8667979836463928, "learning_rate": 3.412698412698413e-05, "loss": 1.4857, "step": 43 }, { "epoch": 0.0704, "grad_norm": 0.7958736419677734, "learning_rate": 3.492063492063492e-05, "loss": 1.3467, "step": 44 }, { "epoch": 0.072, "grad_norm": 0.8489401936531067, "learning_rate": 3.571428571428572e-05, "loss": 1.4741, "step": 45 }, { "epoch": 0.0736, "grad_norm": 0.7129983305931091, "learning_rate": 3.650793650793651e-05, "loss": 1.4114, "step": 46 }, { "epoch": 0.0752, "grad_norm": 0.9399805068969727, "learning_rate": 3.730158730158731e-05, "loss": 1.4851, "step": 47 }, { "epoch": 0.0768, "grad_norm": 0.7960166931152344, "learning_rate": 3.809523809523809e-05, "loss": 1.3434, "step": 48 }, { "epoch": 0.0784, "grad_norm": 0.8377113342285156, "learning_rate": 3.888888888888889e-05, "loss": 1.488, "step": 49 }, { "epoch": 0.08, "grad_norm": 0.7329150438308716, "learning_rate": 3.968253968253968e-05, "loss": 1.3705, "step": 50 }, { "epoch": 0.0816, "grad_norm": 0.8006243109703064, "learning_rate": 4.047619047619048e-05, "loss": 1.3481, "step": 51 }, { "epoch": 0.0832, "grad_norm": 0.8081865906715393, "learning_rate": 4.126984126984127e-05, "loss": 1.216, "step": 52 }, { "epoch": 0.0848, "grad_norm": 0.8263102173805237, "learning_rate": 4.2063492063492065e-05, "loss": 1.4479, "step": 53 }, { "epoch": 0.0864, "grad_norm": 0.793979823589325, "learning_rate": 4.2857142857142856e-05, "loss": 1.4209, "step": 54 }, { "epoch": 0.088, "grad_norm": 0.8208668828010559, "learning_rate": 4.3650793650793655e-05, "loss": 1.4577, "step": 55 }, { "epoch": 0.0896, "grad_norm": 0.7821940779685974, "learning_rate": 4.4444444444444447e-05, "loss": 1.4105, "step": 56 }, { "epoch": 0.0912, "grad_norm": 0.739564836025238, "learning_rate": 4.523809523809524e-05, "loss": 1.4654, "step": 57 }, { "epoch": 0.0928, "grad_norm": 0.6928958892822266, "learning_rate": 4.603174603174603e-05, "loss": 1.3649, "step": 58 }, { "epoch": 0.0944, "grad_norm": 0.7304552793502808, "learning_rate": 4.682539682539683e-05, "loss": 1.3974, "step": 59 }, { "epoch": 0.096, "grad_norm": 0.7225795984268188, "learning_rate": 4.761904761904762e-05, "loss": 1.3688, "step": 60 }, { "epoch": 0.0976, "grad_norm": 0.827202320098877, "learning_rate": 4.841269841269841e-05, "loss": 1.341, "step": 61 }, { "epoch": 0.0992, "grad_norm": 0.710622251033783, "learning_rate": 4.9206349206349204e-05, "loss": 1.3971, "step": 62 }, { "epoch": 0.1008, "grad_norm": 0.753088653087616, "learning_rate": 5e-05, "loss": 1.3397, "step": 63 }, { "epoch": 0.1024, "grad_norm": 0.671766996383667, "learning_rate": 4.999960939662063e-05, "loss": 1.2192, "step": 64 }, { "epoch": 0.104, "grad_norm": 0.7635954022407532, "learning_rate": 4.999843759868819e-05, "loss": 1.3366, "step": 65 }, { "epoch": 0.1056, "grad_norm": 0.7389415502548218, "learning_rate": 4.999648464281934e-05, "loss": 1.1964, "step": 66 }, { "epoch": 0.1072, "grad_norm": 0.7409834265708923, "learning_rate": 4.9993750590040575e-05, "loss": 1.2422, "step": 67 }, { "epoch": 0.1088, "grad_norm": 0.7984970808029175, "learning_rate": 4.999023552578632e-05, "loss": 1.4257, "step": 68 }, { "epoch": 0.1104, "grad_norm": 0.7510572671890259, "learning_rate": 4.998593955989626e-05, "loss": 1.3295, "step": 69 }, { "epoch": 0.112, "grad_norm": 0.9146924018859863, "learning_rate": 4.9980862826611875e-05, "loss": 1.3841, "step": 70 }, { "epoch": 0.1136, "grad_norm": 0.750268280506134, "learning_rate": 4.9975005484572305e-05, "loss": 1.2717, "step": 71 }, { "epoch": 0.1152, "grad_norm": 0.7634686231613159, "learning_rate": 4.9968367716809374e-05, "loss": 1.3544, "step": 72 }, { "epoch": 0.1168, "grad_norm": 0.7704282402992249, "learning_rate": 4.996094973074183e-05, "loss": 1.3958, "step": 73 }, { "epoch": 0.1184, "grad_norm": 0.7203567028045654, "learning_rate": 4.995275175816891e-05, "loss": 1.2506, "step": 74 }, { "epoch": 0.12, "grad_norm": 0.8578824400901794, "learning_rate": 4.994377405526308e-05, "loss": 1.5482, "step": 75 }, { "epoch": 0.1216, "grad_norm": 0.6955505013465881, "learning_rate": 4.993401690256203e-05, "loss": 1.3166, "step": 76 }, { "epoch": 0.1232, "grad_norm": 0.7726685404777527, "learning_rate": 4.992348060495989e-05, "loss": 1.3358, "step": 77 }, { "epoch": 0.1248, "grad_norm": 0.8043551445007324, "learning_rate": 4.991216549169776e-05, "loss": 1.3415, "step": 78 }, { "epoch": 0.1264, "grad_norm": 0.6525835394859314, "learning_rate": 4.990007191635334e-05, "loss": 1.2401, "step": 79 }, { "epoch": 0.128, "grad_norm": 0.703302264213562, "learning_rate": 4.988720025682995e-05, "loss": 1.3796, "step": 80 }, { "epoch": 0.1296, "grad_norm": 0.7054821252822876, "learning_rate": 4.987355091534468e-05, "loss": 1.3721, "step": 81 }, { "epoch": 0.1312, "grad_norm": 0.7959026098251343, "learning_rate": 4.985912431841584e-05, "loss": 1.3189, "step": 82 }, { "epoch": 0.1328, "grad_norm": 0.8068015575408936, "learning_rate": 4.9843920916849645e-05, "loss": 1.4042, "step": 83 }, { "epoch": 0.1344, "grad_norm": 0.7899581789970398, "learning_rate": 4.982794118572609e-05, "loss": 1.3431, "step": 84 }, { "epoch": 0.136, "grad_norm": 0.8032404184341431, "learning_rate": 4.981118562438414e-05, "loss": 1.2106, "step": 85 }, { "epoch": 0.1376, "grad_norm": 0.7540927529335022, "learning_rate": 4.9793654756406085e-05, "loss": 1.3154, "step": 86 }, { "epoch": 0.1392, "grad_norm": 0.8364540934562683, "learning_rate": 4.9775349129601243e-05, "loss": 1.2985, "step": 87 }, { "epoch": 0.1408, "grad_norm": 0.7959004640579224, "learning_rate": 4.9756269315988804e-05, "loss": 1.2578, "step": 88 }, { "epoch": 0.1424, "grad_norm": 0.759797215461731, "learning_rate": 4.973641591177991e-05, "loss": 1.2787, "step": 89 }, { "epoch": 0.144, "grad_norm": 0.9131535887718201, "learning_rate": 4.971578953735912e-05, "loss": 1.2582, "step": 90 }, { "epoch": 0.1456, "grad_norm": 0.8275206685066223, "learning_rate": 4.969439083726496e-05, "loss": 1.3592, "step": 91 }, { "epoch": 0.1472, "grad_norm": 0.6681392788887024, "learning_rate": 4.967222048016979e-05, "loss": 1.2484, "step": 92 }, { "epoch": 0.1488, "grad_norm": 0.7666839957237244, "learning_rate": 4.964927915885893e-05, "loss": 1.2378, "step": 93 }, { "epoch": 0.1504, "grad_norm": 0.7831534743309021, "learning_rate": 4.962556759020898e-05, "loss": 1.4016, "step": 94 }, { "epoch": 0.152, "grad_norm": 0.883674144744873, "learning_rate": 4.960108651516545e-05, "loss": 1.5277, "step": 95 }, { "epoch": 0.1536, "grad_norm": 0.669151246547699, "learning_rate": 4.9575836698719605e-05, "loss": 1.2147, "step": 96 }, { "epoch": 0.1552, "grad_norm": 0.7544010877609253, "learning_rate": 4.954981892988451e-05, "loss": 1.2514, "step": 97 }, { "epoch": 0.1568, "grad_norm": 0.7225381731987, "learning_rate": 4.952303402167047e-05, "loss": 1.3334, "step": 98 }, { "epoch": 0.1584, "grad_norm": 0.8228552937507629, "learning_rate": 4.949548281105951e-05, "loss": 1.4532, "step": 99 }, { "epoch": 0.16, "grad_norm": 0.666982889175415, "learning_rate": 4.946716615897932e-05, "loss": 1.2596, "step": 100 }, { "epoch": 0.1616, "grad_norm": 0.7933996319770813, "learning_rate": 4.943808495027631e-05, "loss": 1.2552, "step": 101 }, { "epoch": 0.1632, "grad_norm": 0.8225094676017761, "learning_rate": 4.940824009368793e-05, "loss": 1.2595, "step": 102 }, { "epoch": 0.1648, "grad_norm": 0.7698407173156738, "learning_rate": 4.937763252181434e-05, "loss": 1.4083, "step": 103 }, { "epoch": 0.1664, "grad_norm": 0.8420011401176453, "learning_rate": 4.934626319108923e-05, "loss": 1.4834, "step": 104 }, { "epoch": 0.168, "grad_norm": 0.8101248741149902, "learning_rate": 4.93141330817499e-05, "loss": 1.4045, "step": 105 }, { "epoch": 0.1696, "grad_norm": 0.773597776889801, "learning_rate": 4.9281243197806726e-05, "loss": 1.2369, "step": 106 }, { "epoch": 0.1712, "grad_norm": 0.8300926685333252, "learning_rate": 4.924759456701167e-05, "loss": 1.3797, "step": 107 }, { "epoch": 0.1728, "grad_norm": 0.7435926795005798, "learning_rate": 4.9213188240826245e-05, "loss": 1.279, "step": 108 }, { "epoch": 0.1744, "grad_norm": 0.8630765676498413, "learning_rate": 4.917802529438864e-05, "loss": 1.3583, "step": 109 }, { "epoch": 0.176, "grad_norm": 0.7833401560783386, "learning_rate": 4.9142106826480114e-05, "loss": 1.2915, "step": 110 }, { "epoch": 0.1776, "grad_norm": 0.7859653234481812, "learning_rate": 4.910543395949067e-05, "loss": 1.4477, "step": 111 }, { "epoch": 0.1792, "grad_norm": 0.8431854248046875, "learning_rate": 4.9068007839383946e-05, "loss": 1.2933, "step": 112 }, { "epoch": 0.1808, "grad_norm": 0.8416605591773987, "learning_rate": 4.9029829635661475e-05, "loss": 1.2209, "step": 113 }, { "epoch": 0.1824, "grad_norm": 0.7700342535972595, "learning_rate": 4.899090054132609e-05, "loss": 1.2887, "step": 114 }, { "epoch": 0.184, "grad_norm": 0.7672936916351318, "learning_rate": 4.895122177284465e-05, "loss": 1.3119, "step": 115 }, { "epoch": 0.1856, "grad_norm": 0.807087779045105, "learning_rate": 4.891079457011005e-05, "loss": 1.3326, "step": 116 }, { "epoch": 0.1872, "grad_norm": 0.7031504511833191, "learning_rate": 4.8869620196402436e-05, "loss": 1.1882, "step": 117 }, { "epoch": 0.1888, "grad_norm": 0.7604570984840393, "learning_rate": 4.882769993834978e-05, "loss": 1.3986, "step": 118 }, { "epoch": 0.1904, "grad_norm": 0.7414752840995789, "learning_rate": 4.878503510588765e-05, "loss": 1.3335, "step": 119 }, { "epoch": 0.192, "grad_norm": 0.815904438495636, "learning_rate": 4.874162703221823e-05, "loss": 1.3663, "step": 120 }, { "epoch": 0.1936, "grad_norm": 0.8176221251487732, "learning_rate": 4.8697477073768766e-05, "loss": 1.4764, "step": 121 }, { "epoch": 0.1952, "grad_norm": 0.7878517508506775, "learning_rate": 4.8652586610149095e-05, "loss": 1.2385, "step": 122 }, { "epoch": 0.1968, "grad_norm": 0.8291516900062561, "learning_rate": 4.8606957044108556e-05, "loss": 1.3696, "step": 123 }, { "epoch": 0.1984, "grad_norm": 0.7808008790016174, "learning_rate": 4.856058980149216e-05, "loss": 1.308, "step": 124 }, { "epoch": 0.2, "grad_norm": 0.8151692152023315, "learning_rate": 4.851348633119606e-05, "loss": 1.3201, "step": 125 }, { "epoch": 0.2016, "grad_norm": 0.7533981204032898, "learning_rate": 4.84656481051222e-05, "loss": 1.3023, "step": 126 }, { "epoch": 0.2032, "grad_norm": 0.7771658301353455, "learning_rate": 4.8417076618132426e-05, "loss": 1.3661, "step": 127 }, { "epoch": 0.2048, "grad_norm": 0.7094460725784302, "learning_rate": 4.836777338800168e-05, "loss": 1.2355, "step": 128 }, { "epoch": 0.2064, "grad_norm": 0.8727604150772095, "learning_rate": 4.8317739955370636e-05, "loss": 1.3832, "step": 129 }, { "epoch": 0.208, "grad_norm": 0.8162310123443604, "learning_rate": 4.8266977883697515e-05, "loss": 1.3186, "step": 130 }, { "epoch": 0.2096, "grad_norm": 0.8144053816795349, "learning_rate": 4.821548875920927e-05, "loss": 1.3084, "step": 131 }, { "epoch": 0.2112, "grad_norm": 0.744472861289978, "learning_rate": 4.816327419085196e-05, "loss": 1.3201, "step": 132 }, { "epoch": 0.2128, "grad_norm": 0.7600781917572021, "learning_rate": 4.811033581024056e-05, "loss": 1.2617, "step": 133 }, { "epoch": 0.2144, "grad_norm": 0.8421154022216797, "learning_rate": 4.805667527160788e-05, "loss": 1.3822, "step": 134 }, { "epoch": 0.216, "grad_norm": 0.7833184003829956, "learning_rate": 4.800229425175294e-05, "loss": 1.3177, "step": 135 }, { "epoch": 0.2176, "grad_norm": 0.7761088013648987, "learning_rate": 4.7947194449988555e-05, "loss": 1.2761, "step": 136 }, { "epoch": 0.2192, "grad_norm": 0.8261726498603821, "learning_rate": 4.7891377588088223e-05, "loss": 1.3894, "step": 137 }, { "epoch": 0.2208, "grad_norm": 0.7743844985961914, "learning_rate": 4.7834845410232356e-05, "loss": 1.3965, "step": 138 }, { "epoch": 0.2224, "grad_norm": 0.8164190053939819, "learning_rate": 4.777759968295369e-05, "loss": 1.2547, "step": 139 }, { "epoch": 0.224, "grad_norm": 0.7959739565849304, "learning_rate": 4.771964219508222e-05, "loss": 1.3903, "step": 140 }, { "epoch": 0.2256, "grad_norm": 0.7821384072303772, "learning_rate": 4.766097475768919e-05, "loss": 1.1245, "step": 141 }, { "epoch": 0.2272, "grad_norm": 0.7910454869270325, "learning_rate": 4.7601599204030544e-05, "loss": 1.3451, "step": 142 }, { "epoch": 0.2288, "grad_norm": 0.862091064453125, "learning_rate": 4.754151738948962e-05, "loss": 1.1133, "step": 143 }, { "epoch": 0.2304, "grad_norm": 0.7979600429534912, "learning_rate": 4.7480731191519224e-05, "loss": 1.2931, "step": 144 }, { "epoch": 0.232, "grad_norm": 0.8062856197357178, "learning_rate": 4.741924250958289e-05, "loss": 1.2375, "step": 145 }, { "epoch": 0.2336, "grad_norm": 0.8622507452964783, "learning_rate": 4.7357053265095575e-05, "loss": 1.3672, "step": 146 }, { "epoch": 0.2352, "grad_norm": 1.3723289966583252, "learning_rate": 4.729416540136361e-05, "loss": 1.3603, "step": 147 }, { "epoch": 0.2368, "grad_norm": 0.8904662132263184, "learning_rate": 4.723058088352395e-05, "loss": 1.5258, "step": 148 }, { "epoch": 0.2384, "grad_norm": 0.7454637289047241, "learning_rate": 4.7166301698482815e-05, "loss": 1.3166, "step": 149 }, { "epoch": 0.24, "grad_norm": 0.8734514713287354, "learning_rate": 4.710132985485355e-05, "loss": 1.239, "step": 150 }, { "epoch": 0.2416, "grad_norm": 0.9273560047149658, "learning_rate": 4.703566738289389e-05, "loss": 1.2568, "step": 151 }, { "epoch": 0.2432, "grad_norm": 1.1991658210754395, "learning_rate": 4.696931633444251e-05, "loss": 1.2657, "step": 152 }, { "epoch": 0.2448, "grad_norm": 0.7914251089096069, "learning_rate": 4.69022787828549e-05, "loss": 1.2721, "step": 153 }, { "epoch": 0.2464, "grad_norm": 0.8522552847862244, "learning_rate": 4.683455682293863e-05, "loss": 1.1422, "step": 154 }, { "epoch": 0.248, "grad_norm": 0.865207314491272, "learning_rate": 4.676615257088776e-05, "loss": 1.2764, "step": 155 }, { "epoch": 0.2496, "grad_norm": 0.916280210018158, "learning_rate": 4.6697068164216896e-05, "loss": 1.3771, "step": 156 }, { "epoch": 0.2512, "grad_norm": 0.8297073245048523, "learning_rate": 4.662730576169423e-05, "loss": 1.2498, "step": 157 }, { "epoch": 0.2528, "grad_norm": 0.8184478282928467, "learning_rate": 4.6556867543274184e-05, "loss": 1.2888, "step": 158 }, { "epoch": 0.2544, "grad_norm": 0.7587262392044067, "learning_rate": 4.6485755710029256e-05, "loss": 1.251, "step": 159 }, { "epoch": 0.256, "grad_norm": 0.859453022480011, "learning_rate": 4.6413972484081216e-05, "loss": 1.3827, "step": 160 }, { "epoch": 0.2576, "grad_norm": 0.7794456481933594, "learning_rate": 4.6341520108531746e-05, "loss": 1.4008, "step": 161 }, { "epoch": 0.2592, "grad_norm": 0.7838786840438843, "learning_rate": 4.626840084739224e-05, "loss": 1.2819, "step": 162 }, { "epoch": 0.2608, "grad_norm": 0.7927438020706177, "learning_rate": 4.619461698551315e-05, "loss": 1.2204, "step": 163 }, { "epoch": 0.2624, "grad_norm": 0.8057637214660645, "learning_rate": 4.612017082851253e-05, "loss": 1.3231, "step": 164 }, { "epoch": 0.264, "grad_norm": 0.7881826162338257, "learning_rate": 4.604506470270403e-05, "loss": 1.3593, "step": 165 }, { "epoch": 0.2656, "grad_norm": 0.7817838788032532, "learning_rate": 4.5969300955024167e-05, "loss": 1.1901, "step": 166 }, { "epoch": 0.2672, "grad_norm": 0.8130877017974854, "learning_rate": 4.589288195295901e-05, "loss": 1.2503, "step": 167 }, { "epoch": 0.2688, "grad_norm": 0.8405358791351318, "learning_rate": 4.58158100844702e-05, "loss": 1.3533, "step": 168 }, { "epoch": 0.2704, "grad_norm": 0.8096474409103394, "learning_rate": 4.573808775792033e-05, "loss": 1.1982, "step": 169 }, { "epoch": 0.272, "grad_norm": 0.8719704747200012, "learning_rate": 4.5659717401997655e-05, "loss": 1.3916, "step": 170 }, { "epoch": 0.2736, "grad_norm": 1.2298812866210938, "learning_rate": 4.5580701465640254e-05, "loss": 1.514, "step": 171 }, { "epoch": 0.2752, "grad_norm": 0.8150945901870728, "learning_rate": 4.550104241795946e-05, "loss": 1.1563, "step": 172 }, { "epoch": 0.2768, "grad_norm": 0.811395525932312, "learning_rate": 4.5420742748162734e-05, "loss": 1.1962, "step": 173 }, { "epoch": 0.2784, "grad_norm": 0.8785234093666077, "learning_rate": 4.5339804965475875e-05, "loss": 1.4656, "step": 174 }, { "epoch": 0.28, "grad_norm": 0.9174700975418091, "learning_rate": 4.525823159906459e-05, "loss": 1.443, "step": 175 }, { "epoch": 0.2816, "grad_norm": 0.7986875772476196, "learning_rate": 4.5176025197955494e-05, "loss": 1.4816, "step": 176 }, { "epoch": 0.2832, "grad_norm": 0.8526942729949951, "learning_rate": 4.509318833095642e-05, "loss": 1.2801, "step": 177 }, { "epoch": 0.2848, "grad_norm": 0.8496647477149963, "learning_rate": 4.500972358657618e-05, "loss": 1.249, "step": 178 }, { "epoch": 0.2864, "grad_norm": 0.7902190685272217, "learning_rate": 4.492563357294369e-05, "loss": 1.2485, "step": 179 }, { "epoch": 0.288, "grad_norm": 0.800926923751831, "learning_rate": 4.4840920917726426e-05, "loss": 1.2694, "step": 180 }, { "epoch": 0.2896, "grad_norm": 0.8827731609344482, "learning_rate": 4.475558826804833e-05, "loss": 1.2383, "step": 181 }, { "epoch": 0.2912, "grad_norm": 0.9456571340560913, "learning_rate": 4.466963829040712e-05, "loss": 1.4372, "step": 182 }, { "epoch": 0.2928, "grad_norm": 0.9060493111610413, "learning_rate": 4.458307367059092e-05, "loss": 1.4349, "step": 183 }, { "epoch": 0.2944, "grad_norm": 0.8998486995697021, "learning_rate": 4.449589711359438e-05, "loss": 1.1828, "step": 184 }, { "epoch": 0.296, "grad_norm": 0.8456140160560608, "learning_rate": 4.440811134353412e-05, "loss": 1.348, "step": 185 }, { "epoch": 0.2976, "grad_norm": 0.8395023941993713, "learning_rate": 4.431971910356363e-05, "loss": 1.2582, "step": 186 }, { "epoch": 0.2992, "grad_norm": 0.9601055979728699, "learning_rate": 4.42307231557875e-05, "loss": 1.3007, "step": 187 }, { "epoch": 0.3008, "grad_norm": 0.8439981341362, "learning_rate": 4.414112628117517e-05, "loss": 1.2307, "step": 188 }, { "epoch": 0.3024, "grad_norm": 0.8234330415725708, "learning_rate": 4.4050931279474015e-05, "loss": 1.1441, "step": 189 }, { "epoch": 0.304, "grad_norm": 0.8155338168144226, "learning_rate": 4.396014096912182e-05, "loss": 1.2538, "step": 190 }, { "epoch": 0.3056, "grad_norm": 0.8123296499252319, "learning_rate": 4.386875818715874e-05, "loss": 1.2473, "step": 191 }, { "epoch": 0.3072, "grad_norm": 0.8073933720588684, "learning_rate": 4.3776785789138675e-05, "loss": 1.1306, "step": 192 }, { "epoch": 0.3088, "grad_norm": 0.8189268708229065, "learning_rate": 4.368422664903997e-05, "loss": 1.3952, "step": 193 }, { "epoch": 0.3104, "grad_norm": 0.854719877243042, "learning_rate": 4.359108365917565e-05, "loss": 1.128, "step": 194 }, { "epoch": 0.312, "grad_norm": 0.8306546211242676, "learning_rate": 4.349735973010305e-05, "loss": 1.2598, "step": 195 }, { "epoch": 0.3136, "grad_norm": 0.8042142987251282, "learning_rate": 4.3403057790532855e-05, "loss": 1.1879, "step": 196 }, { "epoch": 0.3152, "grad_norm": 0.7727879881858826, "learning_rate": 4.330818078723755e-05, "loss": 1.1834, "step": 197 }, { "epoch": 0.3168, "grad_norm": 0.9326703548431396, "learning_rate": 4.32127316849594e-05, "loss": 1.3223, "step": 198 }, { "epoch": 0.3184, "grad_norm": 0.8537107110023499, "learning_rate": 4.311671346631774e-05, "loss": 1.2677, "step": 199 }, { "epoch": 0.32, "grad_norm": 0.7908600568771362, "learning_rate": 4.302012913171584e-05, "loss": 1.2312, "step": 200 }, { "epoch": 0.3216, "grad_norm": 0.8733341693878174, "learning_rate": 4.292298169924709e-05, "loss": 1.4151, "step": 201 }, { "epoch": 0.3232, "grad_norm": 0.8206008672714233, "learning_rate": 4.282527420460072e-05, "loss": 1.3119, "step": 202 }, { "epoch": 0.3248, "grad_norm": 0.8491325974464417, "learning_rate": 4.272700970096696e-05, "loss": 1.2861, "step": 203 }, { "epoch": 0.3264, "grad_norm": 0.8207558393478394, "learning_rate": 4.262819125894156e-05, "loss": 1.3848, "step": 204 }, { "epoch": 0.328, "grad_norm": 0.816182553768158, "learning_rate": 4.252882196642992e-05, "loss": 1.4901, "step": 205 }, { "epoch": 0.3296, "grad_norm": 0.821904182434082, "learning_rate": 4.242890492855056e-05, "loss": 1.2896, "step": 206 }, { "epoch": 0.3312, "grad_norm": 0.8308921456336975, "learning_rate": 4.23284432675381e-05, "loss": 1.3969, "step": 207 }, { "epoch": 0.3328, "grad_norm": 0.7922840118408203, "learning_rate": 4.222744012264566e-05, "loss": 1.2397, "step": 208 }, { "epoch": 0.3344, "grad_norm": 0.7680162191390991, "learning_rate": 4.212589865004684e-05, "loss": 1.3328, "step": 209 }, { "epoch": 0.336, "grad_norm": 1.5495011806488037, "learning_rate": 4.2023822022737016e-05, "loss": 1.3894, "step": 210 }, { "epoch": 0.3376, "grad_norm": 0.8258798718452454, "learning_rate": 4.192121343043424e-05, "loss": 1.2391, "step": 211 }, { "epoch": 0.3392, "grad_norm": 0.8727698922157288, "learning_rate": 4.181807607947954e-05, "loss": 1.4235, "step": 212 }, { "epoch": 0.3408, "grad_norm": 1.7937464714050293, "learning_rate": 4.1714413192736754e-05, "loss": 1.1742, "step": 213 }, { "epoch": 0.3424, "grad_norm": 0.9736344218254089, "learning_rate": 4.161022800949177e-05, "loss": 1.2128, "step": 214 }, { "epoch": 0.344, "grad_norm": 0.8412801623344421, "learning_rate": 4.150552378535137e-05, "loss": 1.3655, "step": 215 }, { "epoch": 0.3456, "grad_norm": 0.9748556017875671, "learning_rate": 4.140030379214147e-05, "loss": 1.2987, "step": 216 }, { "epoch": 0.3472, "grad_norm": 0.8029381036758423, "learning_rate": 4.1294571317804854e-05, "loss": 1.2375, "step": 217 }, { "epoch": 0.3488, "grad_norm": 0.8932775259017944, "learning_rate": 4.1188329666298464e-05, "loss": 1.2456, "step": 218 }, { "epoch": 0.3504, "grad_norm": 0.8745604753494263, "learning_rate": 4.108158215749014e-05, "loss": 1.4166, "step": 219 }, { "epoch": 0.352, "grad_norm": 0.858933687210083, "learning_rate": 4.0974332127054914e-05, "loss": 1.2006, "step": 220 }, { "epoch": 0.3536, "grad_norm": 0.7766256928443909, "learning_rate": 4.0866582926370725e-05, "loss": 1.2719, "step": 221 }, { "epoch": 0.3552, "grad_norm": 0.8245936036109924, "learning_rate": 4.0758337922413716e-05, "loss": 1.3486, "step": 222 }, { "epoch": 0.3568, "grad_norm": 0.8210758566856384, "learning_rate": 4.064960049765304e-05, "loss": 1.3053, "step": 223 }, { "epoch": 0.3584, "grad_norm": 0.8996108174324036, "learning_rate": 4.054037404994516e-05, "loss": 1.2087, "step": 224 }, { "epoch": 0.36, "grad_norm": 0.8846112489700317, "learning_rate": 4.043066199242762e-05, "loss": 1.2087, "step": 225 }, { "epoch": 0.3616, "grad_norm": 0.8157996535301208, "learning_rate": 4.032046775341247e-05, "loss": 1.2156, "step": 226 }, { "epoch": 0.3632, "grad_norm": 0.820773720741272, "learning_rate": 4.020979477627907e-05, "loss": 1.139, "step": 227 }, { "epoch": 0.3648, "grad_norm": 0.8938034176826477, "learning_rate": 4.0098646519366534e-05, "loss": 1.2798, "step": 228 }, { "epoch": 0.3664, "grad_norm": 0.8679342269897461, "learning_rate": 3.998702645586565e-05, "loss": 1.2645, "step": 229 }, { "epoch": 0.368, "grad_norm": 0.8442770838737488, "learning_rate": 3.9874938073710336e-05, "loss": 1.2631, "step": 230 }, { "epoch": 0.3696, "grad_norm": 1.3499926328659058, "learning_rate": 3.976238487546864e-05, "loss": 1.2127, "step": 231 }, { "epoch": 0.3712, "grad_norm": 0.8420690894126892, "learning_rate": 3.9649370378233365e-05, "loss": 1.2012, "step": 232 }, { "epoch": 0.3728, "grad_norm": 0.7775508165359497, "learning_rate": 3.953589811351204e-05, "loss": 1.3022, "step": 233 }, { "epoch": 0.3744, "grad_norm": 0.904107928276062, "learning_rate": 3.94219716271167e-05, "loss": 1.3107, "step": 234 }, { "epoch": 0.376, "grad_norm": 0.803371250629425, "learning_rate": 3.930759447905298e-05, "loss": 1.3826, "step": 235 }, { "epoch": 0.3776, "grad_norm": 0.8575745820999146, "learning_rate": 3.919277024340891e-05, "loss": 1.2177, "step": 236 }, { "epoch": 0.3792, "grad_norm": 0.8359964489936829, "learning_rate": 3.907750250824327e-05, "loss": 1.2003, "step": 237 }, { "epoch": 0.3808, "grad_norm": 0.8195081949234009, "learning_rate": 3.8961794875473394e-05, "loss": 1.0953, "step": 238 }, { "epoch": 0.3824, "grad_norm": 0.8791641592979431, "learning_rate": 3.884565096076269e-05, "loss": 1.4065, "step": 239 }, { "epoch": 0.384, "grad_norm": 0.9039677381515503, "learning_rate": 3.872907439340758e-05, "loss": 1.2867, "step": 240 }, { "epoch": 0.3856, "grad_norm": 0.8739061951637268, "learning_rate": 3.861206881622419e-05, "loss": 1.2602, "step": 241 }, { "epoch": 0.3872, "grad_norm": 0.8759358525276184, "learning_rate": 3.8494637885434396e-05, "loss": 1.138, "step": 242 }, { "epoch": 0.3888, "grad_norm": 0.8547525405883789, "learning_rate": 3.837678527055168e-05, "loss": 1.3375, "step": 243 }, { "epoch": 0.3904, "grad_norm": 0.9128666520118713, "learning_rate": 3.8258514654266434e-05, "loss": 1.2696, "step": 244 }, { "epoch": 0.392, "grad_norm": 0.827133059501648, "learning_rate": 3.813982973233083e-05, "loss": 1.1281, "step": 245 }, { "epoch": 0.3936, "grad_norm": 0.7862581014633179, "learning_rate": 3.802073421344339e-05, "loss": 1.1071, "step": 246 }, { "epoch": 0.3952, "grad_norm": 0.787131130695343, "learning_rate": 3.7901231819133105e-05, "loss": 1.3001, "step": 247 }, { "epoch": 0.3968, "grad_norm": 0.8636060357093811, "learning_rate": 3.7781326283643085e-05, "loss": 1.3212, "step": 248 }, { "epoch": 0.3984, "grad_norm": 0.8448414206504822, "learning_rate": 3.766102135381393e-05, "loss": 1.2134, "step": 249 }, { "epoch": 0.4, "grad_norm": 0.8881953358650208, "learning_rate": 3.75403207889666e-05, "loss": 1.3329, "step": 250 }, { "epoch": 0.4016, "grad_norm": 0.8011172413825989, "learning_rate": 3.741922836078499e-05, "loss": 1.2185, "step": 251 }, { "epoch": 0.4032, "grad_norm": 0.8902686238288879, "learning_rate": 3.729774785319801e-05, "loss": 1.1815, "step": 252 }, { "epoch": 0.4048, "grad_norm": 0.8770828247070312, "learning_rate": 3.717588306226143e-05, "loss": 1.3711, "step": 253 }, { "epoch": 0.4064, "grad_norm": 0.7881701588630676, "learning_rate": 3.705363779603917e-05, "loss": 1.141, "step": 254 }, { "epoch": 0.408, "grad_norm": 0.7844118475914001, "learning_rate": 3.693101587448436e-05, "loss": 1.0943, "step": 255 }, { "epoch": 0.4096, "grad_norm": 0.9868535995483398, "learning_rate": 3.680802112931996e-05, "loss": 1.2847, "step": 256 }, { "epoch": 0.4112, "grad_norm": 0.8272721767425537, "learning_rate": 3.6684657403919005e-05, "loss": 1.1939, "step": 257 }, { "epoch": 0.4128, "grad_norm": 0.8536967039108276, "learning_rate": 3.6560928553184554e-05, "loss": 1.3056, "step": 258 }, { "epoch": 0.4144, "grad_norm": 0.8277765512466431, "learning_rate": 3.6436838443429175e-05, "loss": 1.2853, "step": 259 }, { "epoch": 0.416, "grad_norm": 1.1763432025909424, "learning_rate": 3.631239095225417e-05, "loss": 1.3572, "step": 260 }, { "epoch": 0.4176, "grad_norm": 0.8517109155654907, "learning_rate": 3.618758996842839e-05, "loss": 1.3001, "step": 261 }, { "epoch": 0.4192, "grad_norm": 0.9327878355979919, "learning_rate": 3.60624393917667e-05, "loss": 1.3529, "step": 262 }, { "epoch": 0.4208, "grad_norm": 0.8235234022140503, "learning_rate": 3.5936943133008183e-05, "loss": 1.347, "step": 263 }, { "epoch": 0.4224, "grad_norm": 0.8215406537055969, "learning_rate": 3.581110511369384e-05, "loss": 1.1976, "step": 264 }, { "epoch": 0.424, "grad_norm": 0.8907118439674377, "learning_rate": 3.568492926604412e-05, "loss": 1.2235, "step": 265 }, { "epoch": 0.4256, "grad_norm": 0.8126318454742432, "learning_rate": 3.555841953283603e-05, "loss": 1.266, "step": 266 }, { "epoch": 0.4272, "grad_norm": 0.8192747831344604, "learning_rate": 3.5431579867279905e-05, "loss": 1.284, "step": 267 }, { "epoch": 0.4288, "grad_norm": 0.8461147546768188, "learning_rate": 3.530441423289591e-05, "loss": 1.196, "step": 268 }, { "epoch": 0.4304, "grad_norm": 0.8671585917472839, "learning_rate": 3.517692660339018e-05, "loss": 1.3197, "step": 269 }, { "epoch": 0.432, "grad_norm": 0.8324633836746216, "learning_rate": 3.504912096253061e-05, "loss": 1.2526, "step": 270 }, { "epoch": 0.4336, "grad_norm": 0.887322187423706, "learning_rate": 3.492100130402242e-05, "loss": 1.258, "step": 271 }, { "epoch": 0.4352, "grad_norm": 0.9236389398574829, "learning_rate": 3.479257163138334e-05, "loss": 1.3906, "step": 272 }, { "epoch": 0.4368, "grad_norm": 0.8896394968032837, "learning_rate": 3.4663835957818515e-05, "loss": 1.3444, "step": 273 }, { "epoch": 0.4384, "grad_norm": 0.9045920372009277, "learning_rate": 3.453479830609505e-05, "loss": 1.3313, "step": 274 }, { "epoch": 0.44, "grad_norm": 0.8787500262260437, "learning_rate": 3.440546270841639e-05, "loss": 1.315, "step": 275 }, { "epoch": 0.4416, "grad_norm": 0.9180662631988525, "learning_rate": 3.427583320629626e-05, "loss": 1.1742, "step": 276 }, { "epoch": 0.4432, "grad_norm": 0.8734263181686401, "learning_rate": 3.414591385043237e-05, "loss": 1.2295, "step": 277 }, { "epoch": 0.4448, "grad_norm": 0.895437479019165, "learning_rate": 3.401570870057989e-05, "loss": 1.4899, "step": 278 }, { "epoch": 0.4464, "grad_norm": 0.8832334280014038, "learning_rate": 3.3885221825424537e-05, "loss": 1.1037, "step": 279 }, { "epoch": 0.448, "grad_norm": 0.8790562152862549, "learning_rate": 3.375445730245546e-05, "loss": 1.3911, "step": 280 }, { "epoch": 0.4496, "grad_norm": 0.8506147861480713, "learning_rate": 3.362341921783784e-05, "loss": 1.3045, "step": 281 }, { "epoch": 0.4512, "grad_norm": 0.8560390472412109, "learning_rate": 3.349211166628515e-05, "loss": 1.3448, "step": 282 }, { "epoch": 0.4528, "grad_norm": 0.9639808535575867, "learning_rate": 3.336053875093128e-05, "loss": 1.1907, "step": 283 }, { "epoch": 0.4544, "grad_norm": 0.9146746397018433, "learning_rate": 3.322870458320224e-05, "loss": 1.3424, "step": 284 }, { "epoch": 0.456, "grad_norm": 0.8635088801383972, "learning_rate": 3.309661328268776e-05, "loss": 1.2731, "step": 285 }, { "epoch": 0.4576, "grad_norm": 0.9176720976829529, "learning_rate": 3.296426897701251e-05, "loss": 1.1164, "step": 286 }, { "epoch": 0.4592, "grad_norm": 0.9176136255264282, "learning_rate": 3.283167580170712e-05, "loss": 1.3009, "step": 287 }, { "epoch": 0.4608, "grad_norm": 0.8293492794036865, "learning_rate": 3.2698837900078996e-05, "loss": 1.1148, "step": 288 }, { "epoch": 0.4624, "grad_norm": 0.8872430324554443, "learning_rate": 3.256575942308278e-05, "loss": 1.4119, "step": 289 }, { "epoch": 0.464, "grad_norm": 1.0525835752487183, "learning_rate": 3.243244452919072e-05, "loss": 1.261, "step": 290 }, { "epoch": 0.4656, "grad_norm": 1.0282267332077026, "learning_rate": 3.229889738426264e-05, "loss": 1.3091, "step": 291 }, { "epoch": 0.4672, "grad_norm": 0.9044435620307922, "learning_rate": 3.2165122161415845e-05, "loss": 1.189, "step": 292 }, { "epoch": 0.4688, "grad_norm": 0.8462666273117065, "learning_rate": 3.203112304089466e-05, "loss": 1.2666, "step": 293 }, { "epoch": 0.4704, "grad_norm": 0.8603621125221252, "learning_rate": 3.189690420993983e-05, "loss": 1.2621, "step": 294 }, { "epoch": 0.472, "grad_norm": 0.8999144434928894, "learning_rate": 3.176246986265767e-05, "loss": 1.1922, "step": 295 }, { "epoch": 0.4736, "grad_norm": 0.8386927843093872, "learning_rate": 3.162782419988901e-05, "loss": 1.3682, "step": 296 }, { "epoch": 0.4752, "grad_norm": 0.830860435962677, "learning_rate": 3.149297142907792e-05, "loss": 1.3108, "step": 297 }, { "epoch": 0.4768, "grad_norm": 0.8755518198013306, "learning_rate": 3.1357915764140244e-05, "loss": 1.2627, "step": 298 }, { "epoch": 0.4784, "grad_norm": 0.9460086822509766, "learning_rate": 3.122266142533191e-05, "loss": 1.1392, "step": 299 }, { "epoch": 0.48, "grad_norm": 0.8781261444091797, "learning_rate": 3.108721263911706e-05, "loss": 1.3146, "step": 300 }, { "epoch": 0.4816, "grad_norm": 0.9098759293556213, "learning_rate": 3.095157363803598e-05, "loss": 1.2657, "step": 301 }, { "epoch": 0.4832, "grad_norm": 0.8645206689834595, "learning_rate": 3.0815748660572855e-05, "loss": 1.1338, "step": 302 }, { "epoch": 0.4848, "grad_norm": 0.8320742845535278, "learning_rate": 3.06797419510233e-05, "loss": 1.2843, "step": 303 }, { "epoch": 0.4864, "grad_norm": 0.8513315320014954, "learning_rate": 3.0543557759361736e-05, "loss": 1.2161, "step": 304 }, { "epoch": 0.488, "grad_norm": 0.9879288077354431, "learning_rate": 3.0407200341108617e-05, "loss": 1.2837, "step": 305 }, { "epoch": 0.4896, "grad_norm": 0.8944355845451355, "learning_rate": 3.0270673957197393e-05, "loss": 1.2138, "step": 306 }, { "epoch": 0.4912, "grad_norm": 0.8395674824714661, "learning_rate": 3.013398287384144e-05, "loss": 1.2877, "step": 307 }, { "epoch": 0.4928, "grad_norm": 0.8452143669128418, "learning_rate": 2.9997131362400664e-05, "loss": 1.2491, "step": 308 }, { "epoch": 0.4944, "grad_norm": 0.8737632036209106, "learning_rate": 2.986012369924811e-05, "loss": 1.2385, "step": 309 }, { "epoch": 0.496, "grad_norm": 0.9431871771812439, "learning_rate": 2.9722964165636264e-05, "loss": 1.2495, "step": 310 }, { "epoch": 0.4976, "grad_norm": 0.9051371216773987, "learning_rate": 2.9585657047563315e-05, "loss": 1.2781, "step": 311 }, { "epoch": 0.4992, "grad_norm": 0.9298401474952698, "learning_rate": 2.9448206635639213e-05, "loss": 1.2223, "step": 312 }, { "epoch": 0.5008, "grad_norm": 0.9193928241729736, "learning_rate": 2.931061722495159e-05, "loss": 1.4039, "step": 313 }, { "epoch": 0.5024, "grad_norm": 0.8385506868362427, "learning_rate": 2.917289311493155e-05, "loss": 1.2201, "step": 314 }, { "epoch": 0.504, "grad_norm": 0.9346365928649902, "learning_rate": 2.9035038609219306e-05, "loss": 1.2969, "step": 315 }, { "epoch": 0.5056, "grad_norm": 0.8425478935241699, "learning_rate": 2.8897058015529732e-05, "loss": 1.1402, "step": 316 }, { "epoch": 0.5072, "grad_norm": 0.8805160522460938, "learning_rate": 2.875895564551772e-05, "loss": 1.201, "step": 317 }, { "epoch": 0.5088, "grad_norm": 0.9152933955192566, "learning_rate": 2.862073581464347e-05, "loss": 1.2362, "step": 318 }, { "epoch": 0.5104, "grad_norm": 0.9266682267189026, "learning_rate": 2.8482402842037614e-05, "loss": 1.2357, "step": 319 }, { "epoch": 0.512, "grad_norm": 0.9117448329925537, "learning_rate": 2.8343961050366275e-05, "loss": 1.2525, "step": 320 }, { "epoch": 0.5136, "grad_norm": 0.9225302934646606, "learning_rate": 2.8205414765696003e-05, "loss": 1.243, "step": 321 }, { "epoch": 0.5152, "grad_norm": 0.8879780769348145, "learning_rate": 2.806676831735855e-05, "loss": 1.1874, "step": 322 }, { "epoch": 0.5168, "grad_norm": 0.9359835386276245, "learning_rate": 2.792802603781562e-05, "loss": 1.2459, "step": 323 }, { "epoch": 0.5184, "grad_norm": 0.8781741261482239, "learning_rate": 2.7789192262523462e-05, "loss": 1.197, "step": 324 }, { "epoch": 0.52, "grad_norm": 0.8473330736160278, "learning_rate": 2.7650271329797427e-05, "loss": 1.4236, "step": 325 }, { "epoch": 0.5216, "grad_norm": 0.8732668161392212, "learning_rate": 2.751126758067638e-05, "loss": 1.3417, "step": 326 }, { "epoch": 0.5232, "grad_norm": 0.8333480358123779, "learning_rate": 2.737218535878705e-05, "loss": 1.15, "step": 327 }, { "epoch": 0.5248, "grad_norm": 0.9040815234184265, "learning_rate": 2.723302901020831e-05, "loss": 1.2865, "step": 328 }, { "epoch": 0.5264, "grad_norm": 0.9136947393417358, "learning_rate": 2.7093802883335357e-05, "loss": 1.2927, "step": 329 }, { "epoch": 0.528, "grad_norm": 0.8324840068817139, "learning_rate": 2.695451132874385e-05, "loss": 1.1393, "step": 330 }, { "epoch": 0.5296, "grad_norm": 0.9386698007583618, "learning_rate": 2.6815158699053932e-05, "loss": 1.307, "step": 331 }, { "epoch": 0.5312, "grad_norm": 0.9148058891296387, "learning_rate": 2.667574934879427e-05, "loss": 1.2586, "step": 332 }, { "epoch": 0.5328, "grad_norm": 0.9264355301856995, "learning_rate": 2.6536287634265918e-05, "loss": 1.2721, "step": 333 }, { "epoch": 0.5344, "grad_norm": 0.8968346118927002, "learning_rate": 2.639677791340623e-05, "loss": 1.2775, "step": 334 }, { "epoch": 0.536, "grad_norm": 0.9153205752372742, "learning_rate": 2.6257224545652688e-05, "loss": 1.4344, "step": 335 }, { "epoch": 0.5376, "grad_norm": 0.8399821519851685, "learning_rate": 2.611763189180665e-05, "loss": 1.2267, "step": 336 }, { "epoch": 0.5392, "grad_norm": 0.8458365797996521, "learning_rate": 2.5978004313897104e-05, "loss": 1.1581, "step": 337 }, { "epoch": 0.5408, "grad_norm": 0.8758581876754761, "learning_rate": 2.5838346175044355e-05, "loss": 1.1361, "step": 338 }, { "epoch": 0.5424, "grad_norm": 0.9009513258934021, "learning_rate": 2.569866183932368e-05, "loss": 1.2588, "step": 339 }, { "epoch": 0.544, "grad_norm": 0.9333202838897705, "learning_rate": 2.5558955671628965e-05, "loss": 1.2239, "step": 340 }, { "epoch": 0.5456, "grad_norm": 0.9238294363021851, "learning_rate": 2.5419232037536316e-05, "loss": 1.216, "step": 341 }, { "epoch": 0.5472, "grad_norm": 0.9511209726333618, "learning_rate": 2.5279495303167617e-05, "loss": 1.3145, "step": 342 }, { "epoch": 0.5488, "grad_norm": 0.8405779004096985, "learning_rate": 2.5139749835054123e-05, "loss": 1.2512, "step": 343 }, { "epoch": 0.5504, "grad_norm": 0.9548525810241699, "learning_rate": 2.5e-05, "loss": 1.227, "step": 344 }, { "epoch": 0.552, "grad_norm": 0.9383575916290283, "learning_rate": 2.4860250164945876e-05, "loss": 1.1814, "step": 345 }, { "epoch": 0.5536, "grad_norm": 0.863120436668396, "learning_rate": 2.472050469683239e-05, "loss": 1.3096, "step": 346 }, { "epoch": 0.5552, "grad_norm": 0.8800427317619324, "learning_rate": 2.4580767962463687e-05, "loss": 1.3052, "step": 347 }, { "epoch": 0.5568, "grad_norm": 0.8648183345794678, "learning_rate": 2.444104432837104e-05, "loss": 1.3242, "step": 348 }, { "epoch": 0.5584, "grad_norm": 0.9128614664077759, "learning_rate": 2.4301338160676324e-05, "loss": 1.3155, "step": 349 }, { "epoch": 0.56, "grad_norm": 0.8887414336204529, "learning_rate": 2.416165382495565e-05, "loss": 1.3212, "step": 350 }, { "epoch": 0.5616, "grad_norm": 0.9355538487434387, "learning_rate": 2.40219956861029e-05, "loss": 1.1966, "step": 351 }, { "epoch": 0.5632, "grad_norm": 0.9037690758705139, "learning_rate": 2.388236810819336e-05, "loss": 1.2639, "step": 352 }, { "epoch": 0.5648, "grad_norm": 0.8876987099647522, "learning_rate": 2.374277545434732e-05, "loss": 1.355, "step": 353 }, { "epoch": 0.5664, "grad_norm": 0.9076970219612122, "learning_rate": 2.3603222086593772e-05, "loss": 1.245, "step": 354 }, { "epoch": 0.568, "grad_norm": 0.8609734773635864, "learning_rate": 2.346371236573409e-05, "loss": 1.3408, "step": 355 }, { "epoch": 0.5696, "grad_norm": 0.8374832272529602, "learning_rate": 2.3324250651205733e-05, "loss": 1.2781, "step": 356 }, { "epoch": 0.5712, "grad_norm": 0.9215436577796936, "learning_rate": 2.318484130094607e-05, "loss": 1.2115, "step": 357 }, { "epoch": 0.5728, "grad_norm": 0.906322717666626, "learning_rate": 2.3045488671256154e-05, "loss": 1.143, "step": 358 }, { "epoch": 0.5744, "grad_norm": 0.8621953725814819, "learning_rate": 2.2906197116664653e-05, "loss": 1.3032, "step": 359 }, { "epoch": 0.576, "grad_norm": 0.8630958199501038, "learning_rate": 2.2766970989791696e-05, "loss": 1.2356, "step": 360 }, { "epoch": 0.5776, "grad_norm": 0.8555357456207275, "learning_rate": 2.262781464121296e-05, "loss": 1.3405, "step": 361 }, { "epoch": 0.5792, "grad_norm": 0.8668087720870972, "learning_rate": 2.2488732419323625e-05, "loss": 1.2857, "step": 362 }, { "epoch": 0.5808, "grad_norm": 0.8873060345649719, "learning_rate": 2.2349728670202582e-05, "loss": 1.2315, "step": 363 }, { "epoch": 0.5824, "grad_norm": 0.9091164469718933, "learning_rate": 2.2210807737476544e-05, "loss": 1.2532, "step": 364 }, { "epoch": 0.584, "grad_norm": 0.840869128704071, "learning_rate": 2.2071973962184384e-05, "loss": 1.2422, "step": 365 }, { "epoch": 0.5856, "grad_norm": 0.847135603427887, "learning_rate": 2.1933231682641454e-05, "loss": 1.2189, "step": 366 }, { "epoch": 0.5872, "grad_norm": 0.9141340851783752, "learning_rate": 2.1794585234303993e-05, "loss": 1.3002, "step": 367 }, { "epoch": 0.5888, "grad_norm": 0.9812307953834534, "learning_rate": 2.1656038949633728e-05, "loss": 1.2359, "step": 368 }, { "epoch": 0.5904, "grad_norm": 0.9033782482147217, "learning_rate": 2.1517597157962392e-05, "loss": 1.264, "step": 369 }, { "epoch": 0.592, "grad_norm": 0.9706579446792603, "learning_rate": 2.1379264185356544e-05, "loss": 1.1521, "step": 370 }, { "epoch": 0.5936, "grad_norm": 0.846195638179779, "learning_rate": 2.124104435448228e-05, "loss": 1.3544, "step": 371 }, { "epoch": 0.5952, "grad_norm": 0.8992451429367065, "learning_rate": 2.1102941984470273e-05, "loss": 1.3336, "step": 372 }, { "epoch": 0.5968, "grad_norm": 1.0022822618484497, "learning_rate": 2.0964961390780703e-05, "loss": 1.3492, "step": 373 }, { "epoch": 0.5984, "grad_norm": 0.92171710729599, "learning_rate": 2.0827106885068456e-05, "loss": 1.2152, "step": 374 }, { "epoch": 0.6, "grad_norm": 0.901383101940155, "learning_rate": 2.0689382775048418e-05, "loss": 1.1426, "step": 375 }, { "epoch": 0.6016, "grad_norm": 0.9033606648445129, "learning_rate": 2.055179336436079e-05, "loss": 1.2107, "step": 376 }, { "epoch": 0.6032, "grad_norm": 0.8900898694992065, "learning_rate": 2.0414342952436694e-05, "loss": 1.2645, "step": 377 }, { "epoch": 0.6048, "grad_norm": 0.9835701584815979, "learning_rate": 2.027703583436374e-05, "loss": 1.2762, "step": 378 }, { "epoch": 0.6064, "grad_norm": 0.8980581760406494, "learning_rate": 2.0139876300751904e-05, "loss": 1.211, "step": 379 }, { "epoch": 0.608, "grad_norm": 0.8751140832901001, "learning_rate": 2.000286863759934e-05, "loss": 1.2483, "step": 380 }, { "epoch": 0.6096, "grad_norm": 0.9279865026473999, "learning_rate": 1.9866017126158574e-05, "loss": 1.276, "step": 381 }, { "epoch": 0.6112, "grad_norm": 1.4325151443481445, "learning_rate": 1.972932604280261e-05, "loss": 1.2897, "step": 382 }, { "epoch": 0.6128, "grad_norm": 0.8680592775344849, "learning_rate": 1.9592799658891385e-05, "loss": 1.2971, "step": 383 }, { "epoch": 0.6144, "grad_norm": 0.8638734221458435, "learning_rate": 1.9456442240638266e-05, "loss": 1.1354, "step": 384 }, { "epoch": 0.616, "grad_norm": 0.8918828964233398, "learning_rate": 1.9320258048976702e-05, "loss": 1.146, "step": 385 }, { "epoch": 0.6176, "grad_norm": 0.8967251181602478, "learning_rate": 1.9184251339427147e-05, "loss": 1.2925, "step": 386 }, { "epoch": 0.6192, "grad_norm": 0.8948070406913757, "learning_rate": 1.904842636196402e-05, "loss": 1.3244, "step": 387 }, { "epoch": 0.6208, "grad_norm": 0.8988600969314575, "learning_rate": 1.8912787360882948e-05, "loss": 1.3059, "step": 388 }, { "epoch": 0.6224, "grad_norm": 0.900235652923584, "learning_rate": 1.8777338574668095e-05, "loss": 1.2904, "step": 389 }, { "epoch": 0.624, "grad_norm": 0.8608076572418213, "learning_rate": 1.8642084235859765e-05, "loss": 1.0421, "step": 390 }, { "epoch": 0.6256, "grad_norm": 0.8658925890922546, "learning_rate": 1.850702857092208e-05, "loss": 1.2413, "step": 391 }, { "epoch": 0.6272, "grad_norm": 0.9178957939147949, "learning_rate": 1.8372175800110996e-05, "loss": 1.1171, "step": 392 }, { "epoch": 0.6288, "grad_norm": 0.9782877564430237, "learning_rate": 1.8237530137342335e-05, "loss": 1.2704, "step": 393 }, { "epoch": 0.6304, "grad_norm": 0.9919098615646362, "learning_rate": 1.8103095790060172e-05, "loss": 1.3529, "step": 394 }, { "epoch": 0.632, "grad_norm": 0.9238556027412415, "learning_rate": 1.796887695910535e-05, "loss": 1.311, "step": 395 }, { "epoch": 0.6336, "grad_norm": 0.8913836479187012, "learning_rate": 1.7834877838584164e-05, "loss": 1.2863, "step": 396 }, { "epoch": 0.6352, "grad_norm": 0.90035080909729, "learning_rate": 1.7701102615737368e-05, "loss": 1.1532, "step": 397 }, { "epoch": 0.6368, "grad_norm": 1.577120065689087, "learning_rate": 1.756755547080929e-05, "loss": 1.2913, "step": 398 }, { "epoch": 0.6384, "grad_norm": 0.9123008847236633, "learning_rate": 1.7434240576917226e-05, "loss": 1.1595, "step": 399 }, { "epoch": 0.64, "grad_norm": 0.8944430351257324, "learning_rate": 1.7301162099921013e-05, "loss": 1.3178, "step": 400 }, { "epoch": 0.6416, "grad_norm": 0.8984083533287048, "learning_rate": 1.7168324198292888e-05, "loss": 1.1768, "step": 401 }, { "epoch": 0.6432, "grad_norm": 0.859035074710846, "learning_rate": 1.7035731022987493e-05, "loss": 1.1931, "step": 402 }, { "epoch": 0.6448, "grad_norm": 0.8194819092750549, "learning_rate": 1.6903386717312236e-05, "loss": 1.2926, "step": 403 }, { "epoch": 0.6464, "grad_norm": 0.9348705410957336, "learning_rate": 1.6771295416797767e-05, "loss": 1.3801, "step": 404 }, { "epoch": 0.648, "grad_norm": 0.8895663619041443, "learning_rate": 1.6639461249068726e-05, "loss": 1.2372, "step": 405 }, { "epoch": 0.6496, "grad_norm": 0.9557973146438599, "learning_rate": 1.650788833371486e-05, "loss": 1.2975, "step": 406 }, { "epoch": 0.6512, "grad_norm": 0.8750960826873779, "learning_rate": 1.637658078216217e-05, "loss": 1.2258, "step": 407 }, { "epoch": 0.6528, "grad_norm": 0.9003758430480957, "learning_rate": 1.6245542697544545e-05, "loss": 1.1715, "step": 408 }, { "epoch": 0.6544, "grad_norm": 0.935192346572876, "learning_rate": 1.6114778174575473e-05, "loss": 1.1167, "step": 409 }, { "epoch": 0.656, "grad_norm": 0.9298659563064575, "learning_rate": 1.5984291299420117e-05, "loss": 1.2462, "step": 410 }, { "epoch": 0.6576, "grad_norm": 0.8432703614234924, "learning_rate": 1.585408614956763e-05, "loss": 1.164, "step": 411 }, { "epoch": 0.6592, "grad_norm": 0.8721514940261841, "learning_rate": 1.5724166793703744e-05, "loss": 1.3714, "step": 412 }, { "epoch": 0.6608, "grad_norm": 0.8628910779953003, "learning_rate": 1.559453729158361e-05, "loss": 1.271, "step": 413 }, { "epoch": 0.6624, "grad_norm": 1.0193123817443848, "learning_rate": 1.5465201693904947e-05, "loss": 1.2537, "step": 414 }, { "epoch": 0.664, "grad_norm": 0.8773367404937744, "learning_rate": 1.5336164042181494e-05, "loss": 1.3883, "step": 415 }, { "epoch": 0.6656, "grad_norm": 0.8970101475715637, "learning_rate": 1.5207428368616656e-05, "loss": 1.1668, "step": 416 }, { "epoch": 0.6672, "grad_norm": 0.8884343504905701, "learning_rate": 1.5078998695977586e-05, "loss": 1.3026, "step": 417 }, { "epoch": 0.6688, "grad_norm": 0.9405607581138611, "learning_rate": 1.4950879037469397e-05, "loss": 1.2942, "step": 418 }, { "epoch": 0.6704, "grad_norm": 0.8537874221801758, "learning_rate": 1.482307339660983e-05, "loss": 1.2056, "step": 419 }, { "epoch": 0.672, "grad_norm": 0.9218015074729919, "learning_rate": 1.4695585767104092e-05, "loss": 1.3394, "step": 420 }, { "epoch": 0.6736, "grad_norm": 0.8756780028343201, "learning_rate": 1.4568420132720106e-05, "loss": 1.1279, "step": 421 }, { "epoch": 0.6752, "grad_norm": 0.916559100151062, "learning_rate": 1.4441580467163984e-05, "loss": 1.3637, "step": 422 }, { "epoch": 0.6768, "grad_norm": 0.8504108786582947, "learning_rate": 1.4315070733955888e-05, "loss": 1.3262, "step": 423 }, { "epoch": 0.6784, "grad_norm": 0.8963852524757385, "learning_rate": 1.4188894886306176e-05, "loss": 1.2574, "step": 424 }, { "epoch": 0.68, "grad_norm": 0.9186688661575317, "learning_rate": 1.4063056866991826e-05, "loss": 1.24, "step": 425 }, { "epoch": 0.6816, "grad_norm": 0.877158522605896, "learning_rate": 1.3937560608233296e-05, "loss": 1.1461, "step": 426 }, { "epoch": 0.6832, "grad_norm": 0.9318328499794006, "learning_rate": 1.381241003157162e-05, "loss": 1.3115, "step": 427 }, { "epoch": 0.6848, "grad_norm": 0.8685953617095947, "learning_rate": 1.3687609047745833e-05, "loss": 1.2278, "step": 428 }, { "epoch": 0.6864, "grad_norm": 1.0183967351913452, "learning_rate": 1.3563161556570826e-05, "loss": 1.1676, "step": 429 }, { "epoch": 0.688, "grad_norm": 0.9044201970100403, "learning_rate": 1.3439071446815452e-05, "loss": 1.1449, "step": 430 }, { "epoch": 0.6896, "grad_norm": 0.908244788646698, "learning_rate": 1.3315342596080996e-05, "loss": 1.1701, "step": 431 }, { "epoch": 0.6912, "grad_norm": 0.8455114960670471, "learning_rate": 1.3191978870680044e-05, "loss": 1.1382, "step": 432 }, { "epoch": 0.6928, "grad_norm": 0.9015253782272339, "learning_rate": 1.3068984125515644e-05, "loss": 1.2991, "step": 433 }, { "epoch": 0.6944, "grad_norm": 0.946810781955719, "learning_rate": 1.2946362203960832e-05, "loss": 1.1658, "step": 434 }, { "epoch": 0.696, "grad_norm": 0.9045064449310303, "learning_rate": 1.2824116937738579e-05, "loss": 1.3547, "step": 435 }, { "epoch": 0.6976, "grad_norm": 0.9437087178230286, "learning_rate": 1.2702252146801991e-05, "loss": 1.2769, "step": 436 }, { "epoch": 0.6992, "grad_norm": 0.92092365026474, "learning_rate": 1.2580771639215027e-05, "loss": 1.3412, "step": 437 }, { "epoch": 0.7008, "grad_norm": 0.8945919871330261, "learning_rate": 1.2459679211033407e-05, "loss": 1.2753, "step": 438 }, { "epoch": 0.7024, "grad_norm": 0.8865060806274414, "learning_rate": 1.2338978646186084e-05, "loss": 1.2111, "step": 439 }, { "epoch": 0.704, "grad_norm": 1.0016142129898071, "learning_rate": 1.2218673716356919e-05, "loss": 1.1812, "step": 440 }, { "epoch": 0.7056, "grad_norm": 0.9112198948860168, "learning_rate": 1.2098768180866895e-05, "loss": 1.1137, "step": 441 }, { "epoch": 0.7072, "grad_norm": 0.8970918655395508, "learning_rate": 1.1979265786556612e-05, "loss": 1.2726, "step": 442 }, { "epoch": 0.7088, "grad_norm": 0.9069077372550964, "learning_rate": 1.1860170267669174e-05, "loss": 1.1958, "step": 443 }, { "epoch": 0.7104, "grad_norm": 0.9349508881568909, "learning_rate": 1.1741485345733574e-05, "loss": 1.1879, "step": 444 }, { "epoch": 0.712, "grad_norm": 0.8500936031341553, "learning_rate": 1.1623214729448317e-05, "loss": 1.2384, "step": 445 }, { "epoch": 0.7136, "grad_norm": 0.8837436437606812, "learning_rate": 1.1505362114565615e-05, "loss": 1.145, "step": 446 }, { "epoch": 0.7152, "grad_norm": 0.8582147359848022, "learning_rate": 1.1387931183775822e-05, "loss": 1.2495, "step": 447 }, { "epoch": 0.7168, "grad_norm": 0.9009386301040649, "learning_rate": 1.1270925606592419e-05, "loss": 1.2958, "step": 448 }, { "epoch": 0.7184, "grad_norm": 0.8078173398971558, "learning_rate": 1.1154349039237322e-05, "loss": 1.2039, "step": 449 }, { "epoch": 0.72, "grad_norm": 0.8258360028266907, "learning_rate": 1.103820512452661e-05, "loss": 1.2223, "step": 450 }, { "epoch": 0.7216, "grad_norm": 0.8698141574859619, "learning_rate": 1.0922497491756734e-05, "loss": 1.2238, "step": 451 }, { "epoch": 0.7232, "grad_norm": 0.8935551643371582, "learning_rate": 1.0807229756591087e-05, "loss": 1.2719, "step": 452 }, { "epoch": 0.7248, "grad_norm": 0.9344638586044312, "learning_rate": 1.0692405520947028e-05, "loss": 1.2822, "step": 453 }, { "epoch": 0.7264, "grad_norm": 0.8965893387794495, "learning_rate": 1.0578028372883298e-05, "loss": 1.225, "step": 454 }, { "epoch": 0.728, "grad_norm": 0.9360785484313965, "learning_rate": 1.0464101886487958e-05, "loss": 1.29, "step": 455 }, { "epoch": 0.7296, "grad_norm": 0.9349333047866821, "learning_rate": 1.0350629621766639e-05, "loss": 1.334, "step": 456 }, { "epoch": 0.7312, "grad_norm": 0.8738869428634644, "learning_rate": 1.0237615124531363e-05, "loss": 1.228, "step": 457 }, { "epoch": 0.7328, "grad_norm": 0.9665687680244446, "learning_rate": 1.0125061926289672e-05, "loss": 1.2557, "step": 458 }, { "epoch": 0.7344, "grad_norm": 0.9515901803970337, "learning_rate": 1.0012973544134358e-05, "loss": 1.2816, "step": 459 }, { "epoch": 0.736, "grad_norm": 0.8777677416801453, "learning_rate": 9.901353480633468e-06, "loss": 1.354, "step": 460 }, { "epoch": 0.7376, "grad_norm": 0.9317053556442261, "learning_rate": 9.79020522372093e-06, "loss": 1.2328, "step": 461 }, { "epoch": 0.7392, "grad_norm": 0.9092256426811218, "learning_rate": 9.679532246587539e-06, "loss": 1.3953, "step": 462 }, { "epoch": 0.7408, "grad_norm": 0.85760498046875, "learning_rate": 9.569338007572382e-06, "loss": 1.2227, "step": 463 }, { "epoch": 0.7424, "grad_norm": 0.9638258814811707, "learning_rate": 9.459625950054849e-06, "loss": 1.2759, "step": 464 }, { "epoch": 0.744, "grad_norm": 1.0319563150405884, "learning_rate": 9.35039950234696e-06, "loss": 1.2376, "step": 465 }, { "epoch": 0.7456, "grad_norm": 0.9452919960021973, "learning_rate": 9.241662077586296e-06, "loss": 1.2416, "step": 466 }, { "epoch": 0.7472, "grad_norm": 0.8888805508613586, "learning_rate": 9.133417073629289e-06, "loss": 1.2008, "step": 467 }, { "epoch": 0.7488, "grad_norm": 1.2721936702728271, "learning_rate": 9.025667872945098e-06, "loss": 1.2616, "step": 468 }, { "epoch": 0.7504, "grad_norm": 0.8869225382804871, "learning_rate": 8.918417842509867e-06, "loss": 1.1939, "step": 469 }, { "epoch": 0.752, "grad_norm": 0.9706895351409912, "learning_rate": 8.811670333701544e-06, "loss": 1.2002, "step": 470 }, { "epoch": 0.7536, "grad_norm": 0.9659148454666138, "learning_rate": 8.705428682195155e-06, "loss": 1.3556, "step": 471 }, { "epoch": 0.7552, "grad_norm": 0.8844773173332214, "learning_rate": 8.599696207858532e-06, "loss": 1.3547, "step": 472 }, { "epoch": 0.7568, "grad_norm": 0.9633551239967346, "learning_rate": 8.494476214648626e-06, "loss": 1.306, "step": 473 }, { "epoch": 0.7584, "grad_norm": 0.8471859097480774, "learning_rate": 8.38977199050824e-06, "loss": 1.2395, "step": 474 }, { "epoch": 0.76, "grad_norm": 0.8387380242347717, "learning_rate": 8.285586807263254e-06, "loss": 1.1725, "step": 475 }, { "epoch": 0.7616, "grad_norm": 0.843757688999176, "learning_rate": 8.181923920520457e-06, "loss": 1.2939, "step": 476 }, { "epoch": 0.7632, "grad_norm": 0.9425132274627686, "learning_rate": 8.078786569565763e-06, "loss": 1.232, "step": 477 }, { "epoch": 0.7648, "grad_norm": 1.041238784790039, "learning_rate": 7.976177977262983e-06, "loss": 1.15, "step": 478 }, { "epoch": 0.7664, "grad_norm": 0.9261163473129272, "learning_rate": 7.874101349953167e-06, "loss": 1.2376, "step": 479 }, { "epoch": 0.768, "grad_norm": 0.8806859850883484, "learning_rate": 7.77255987735434e-06, "loss": 1.1752, "step": 480 }, { "epoch": 0.7696, "grad_norm": 0.980810821056366, "learning_rate": 7.671556732461905e-06, "loss": 1.2751, "step": 481 }, { "epoch": 0.7712, "grad_norm": 0.8694765567779541, "learning_rate": 7.5710950714494414e-06, "loss": 1.2739, "step": 482 }, { "epoch": 0.7728, "grad_norm": 0.9100877046585083, "learning_rate": 7.471178033570081e-06, "loss": 1.164, "step": 483 }, { "epoch": 0.7744, "grad_norm": 0.953989565372467, "learning_rate": 7.3718087410584475e-06, "loss": 1.1608, "step": 484 }, { "epoch": 0.776, "grad_norm": 0.9771277904510498, "learning_rate": 7.272990299033045e-06, "loss": 1.2991, "step": 485 }, { "epoch": 0.7776, "grad_norm": 0.8695554137229919, "learning_rate": 7.174725795399282e-06, "loss": 1.1256, "step": 486 }, { "epoch": 0.7792, "grad_norm": 0.9248853921890259, "learning_rate": 7.077018300752916e-06, "loss": 1.359, "step": 487 }, { "epoch": 0.7808, "grad_norm": 0.8975909352302551, "learning_rate": 6.979870868284169e-06, "loss": 1.1869, "step": 488 }, { "epoch": 0.7824, "grad_norm": 0.9052600860595703, "learning_rate": 6.883286533682265e-06, "loss": 1.1567, "step": 489 }, { "epoch": 0.784, "grad_norm": 0.8917181491851807, "learning_rate": 6.787268315040604e-06, "loss": 1.2932, "step": 490 }, { "epoch": 0.7856, "grad_norm": 0.9377654194831848, "learning_rate": 6.691819212762454e-06, "loss": 1.0563, "step": 491 }, { "epoch": 0.7872, "grad_norm": 0.9259486198425293, "learning_rate": 6.596942209467149e-06, "loss": 1.1416, "step": 492 }, { "epoch": 0.7888, "grad_norm": 0.9459463357925415, "learning_rate": 6.502640269896953e-06, "loss": 1.222, "step": 493 }, { "epoch": 0.7904, "grad_norm": 0.9838451743125916, "learning_rate": 6.4089163408243555e-06, "loss": 1.2166, "step": 494 }, { "epoch": 0.792, "grad_norm": 0.8990271687507629, "learning_rate": 6.3157733509600355e-06, "loss": 1.3592, "step": 495 }, { "epoch": 0.7936, "grad_norm": 0.927118718624115, "learning_rate": 6.2232142108613305e-06, "loss": 1.1831, "step": 496 }, { "epoch": 0.7952, "grad_norm": 0.9889640808105469, "learning_rate": 6.1312418128412565e-06, "loss": 1.3906, "step": 497 }, { "epoch": 0.7968, "grad_norm": 0.9518388509750366, "learning_rate": 6.039859030878184e-06, "loss": 1.3208, "step": 498 }, { "epoch": 0.7984, "grad_norm": 0.9496850371360779, "learning_rate": 5.949068720525991e-06, "loss": 1.1709, "step": 499 }, { "epoch": 0.8, "grad_norm": 0.8898599147796631, "learning_rate": 5.8588737188248285e-06, "loss": 1.177, "step": 500 }, { "epoch": 0.8016, "grad_norm": 0.9107718467712402, "learning_rate": 5.769276844212501e-06, "loss": 1.2313, "step": 501 }, { "epoch": 0.8032, "grad_norm": 0.9735508561134338, "learning_rate": 5.680280896436377e-06, "loss": 1.1961, "step": 502 }, { "epoch": 0.8048, "grad_norm": 0.8979254364967346, "learning_rate": 5.591888656465874e-06, "loss": 1.2766, "step": 503 }, { "epoch": 0.8064, "grad_norm": 0.8667820692062378, "learning_rate": 5.504102886405624e-06, "loss": 1.2446, "step": 504 }, { "epoch": 0.808, "grad_norm": 0.9055896401405334, "learning_rate": 5.416926329409083e-06, "loss": 1.3687, "step": 505 }, { "epoch": 0.8096, "grad_norm": 0.9357609152793884, "learning_rate": 5.330361709592887e-06, "loss": 1.1901, "step": 506 }, { "epoch": 0.8112, "grad_norm": 0.9340748190879822, "learning_rate": 5.244411731951671e-06, "loss": 1.2224, "step": 507 }, { "epoch": 0.8128, "grad_norm": 0.9045987725257874, "learning_rate": 5.159079082273579e-06, "loss": 1.4607, "step": 508 }, { "epoch": 0.8144, "grad_norm": 0.9520361423492432, "learning_rate": 5.074366427056309e-06, "loss": 1.211, "step": 509 }, { "epoch": 0.816, "grad_norm": 0.8779955506324768, "learning_rate": 4.9902764134238165e-06, "loss": 1.2394, "step": 510 }, { "epoch": 0.8176, "grad_norm": 0.8060851097106934, "learning_rate": 4.90681166904359e-06, "loss": 1.2658, "step": 511 }, { "epoch": 0.8192, "grad_norm": 1.2345097064971924, "learning_rate": 4.823974802044515e-06, "loss": 1.3345, "step": 512 }, { "epoch": 0.8208, "grad_norm": 0.8822219967842102, "learning_rate": 4.741768400935417e-06, "loss": 1.1296, "step": 513 }, { "epoch": 0.8224, "grad_norm": 0.8812503814697266, "learning_rate": 4.660195034524128e-06, "loss": 1.2769, "step": 514 }, { "epoch": 0.824, "grad_norm": 0.8823491930961609, "learning_rate": 4.579257251837271e-06, "loss": 1.3311, "step": 515 }, { "epoch": 0.8256, "grad_norm": 0.916392982006073, "learning_rate": 4.498957582040548e-06, "loss": 1.0722, "step": 516 }, { "epoch": 0.8272, "grad_norm": 0.8662752509117126, "learning_rate": 4.419298534359759e-06, "loss": 1.2124, "step": 517 }, { "epoch": 0.8288, "grad_norm": 0.9208042621612549, "learning_rate": 4.340282598002352e-06, "loss": 1.2589, "step": 518 }, { "epoch": 0.8304, "grad_norm": 0.9230892658233643, "learning_rate": 4.261912242079674e-06, "loss": 1.3368, "step": 519 }, { "epoch": 0.832, "grad_norm": 0.9611358046531677, "learning_rate": 4.184189915529796e-06, "loss": 1.283, "step": 520 }, { "epoch": 0.8336, "grad_norm": 0.9364386796951294, "learning_rate": 4.107118047040995e-06, "loss": 1.2327, "step": 521 }, { "epoch": 0.8352, "grad_norm": 0.9322593212127686, "learning_rate": 4.030699044975838e-06, "loss": 1.2396, "step": 522 }, { "epoch": 0.8368, "grad_norm": 0.9932008385658264, "learning_rate": 3.954935297295975e-06, "loss": 1.2866, "step": 523 }, { "epoch": 0.8384, "grad_norm": 0.8891022205352783, "learning_rate": 3.879829171487476e-06, "loss": 1.3127, "step": 524 }, { "epoch": 0.84, "grad_norm": 0.8962449431419373, "learning_rate": 3.8053830144868547e-06, "loss": 1.329, "step": 525 }, { "epoch": 0.8416, "grad_norm": 0.9037047028541565, "learning_rate": 3.7315991526077646e-06, "loss": 1.2925, "step": 526 }, { "epoch": 0.8432, "grad_norm": 0.9338326454162598, "learning_rate": 3.6584798914682582e-06, "loss": 1.3899, "step": 527 }, { "epoch": 0.8448, "grad_norm": 0.9139680862426758, "learning_rate": 3.5860275159187856e-06, "loss": 1.3525, "step": 528 }, { "epoch": 0.8464, "grad_norm": 1.027836561203003, "learning_rate": 3.514244289970753e-06, "loss": 1.3165, "step": 529 }, { "epoch": 0.848, "grad_norm": 0.8581384420394897, "learning_rate": 3.443132456725817e-06, "loss": 1.2509, "step": 530 }, { "epoch": 0.8496, "grad_norm": 0.9147748947143555, "learning_rate": 3.3726942383057763e-06, "loss": 1.4141, "step": 531 }, { "epoch": 0.8512, "grad_norm": 0.8491597175598145, "learning_rate": 3.30293183578311e-06, "loss": 1.1774, "step": 532 }, { "epoch": 0.8528, "grad_norm": 0.9394559860229492, "learning_rate": 3.233847429112244e-06, "loss": 1.1447, "step": 533 }, { "epoch": 0.8544, "grad_norm": 0.926699161529541, "learning_rate": 3.1654431770613837e-06, "loss": 1.2269, "step": 534 }, { "epoch": 0.856, "grad_norm": 0.9347027540206909, "learning_rate": 3.0977212171451e-06, "loss": 1.2411, "step": 535 }, { "epoch": 0.8576, "grad_norm": 0.9272406697273254, "learning_rate": 3.030683665557496e-06, "loss": 1.3289, "step": 536 }, { "epoch": 0.8592, "grad_norm": 0.9312652349472046, "learning_rate": 2.9643326171061165e-06, "loss": 1.2996, "step": 537 }, { "epoch": 0.8608, "grad_norm": 0.9261339902877808, "learning_rate": 2.8986701451464536e-06, "loss": 1.174, "step": 538 }, { "epoch": 0.8624, "grad_norm": 0.9149643182754517, "learning_rate": 2.833698301517185e-06, "loss": 1.2275, "step": 539 }, { "epoch": 0.864, "grad_norm": 0.9295749068260193, "learning_rate": 2.769419116476052e-06, "loss": 1.2272, "step": 540 }, { "epoch": 0.8656, "grad_norm": 0.956770658493042, "learning_rate": 2.7058345986363974e-06, "loss": 1.3632, "step": 541 }, { "epoch": 0.8672, "grad_norm": 0.8755472302436829, "learning_rate": 2.642946734904428e-06, "loss": 1.1477, "step": 542 }, { "epoch": 0.8688, "grad_norm": 0.9452895522117615, "learning_rate": 2.5807574904171155e-06, "loss": 1.1826, "step": 543 }, { "epoch": 0.8704, "grad_norm": 0.929875910282135, "learning_rate": 2.519268808480779e-06, "loss": 1.3345, "step": 544 }, { "epoch": 0.872, "grad_norm": 0.8939977288246155, "learning_rate": 2.4584826105103764e-06, "loss": 1.0852, "step": 545 }, { "epoch": 0.8736, "grad_norm": 0.9564840197563171, "learning_rate": 2.3984007959694603e-06, "loss": 1.3785, "step": 546 }, { "epoch": 0.8752, "grad_norm": 0.9138489961624146, "learning_rate": 2.3390252423108076e-06, "loss": 1.1976, "step": 547 }, { "epoch": 0.8768, "grad_norm": 0.9345712661743164, "learning_rate": 2.280357804917774e-06, "loss": 1.3183, "step": 548 }, { "epoch": 0.8784, "grad_norm": 0.8825331330299377, "learning_rate": 2.222400317046308e-06, "loss": 1.142, "step": 549 }, { "epoch": 0.88, "grad_norm": 0.9116278290748596, "learning_rate": 2.165154589767651e-06, "loss": 1.2633, "step": 550 }, { "epoch": 0.8816, "grad_norm": 0.9420577883720398, "learning_rate": 2.108622411911773e-06, "loss": 1.4175, "step": 551 }, { "epoch": 0.8832, "grad_norm": 0.8935698866844177, "learning_rate": 2.052805550011447e-06, "loss": 1.1246, "step": 552 }, { "epoch": 0.8848, "grad_norm": 0.90406733751297, "learning_rate": 1.997705748247067e-06, "loss": 1.1702, "step": 553 }, { "epoch": 0.8864, "grad_norm": 0.8734138011932373, "learning_rate": 1.9433247283921263e-06, "loss": 1.4027, "step": 554 }, { "epoch": 0.888, "grad_norm": 0.9232995510101318, "learning_rate": 1.8896641897594492e-06, "loss": 1.2564, "step": 555 }, { "epoch": 0.8896, "grad_norm": 0.9804056286811829, "learning_rate": 1.8367258091480404e-06, "loss": 1.3026, "step": 556 }, { "epoch": 0.8912, "grad_norm": 0.896550714969635, "learning_rate": 1.78451124079074e-06, "loss": 1.207, "step": 557 }, { "epoch": 0.8928, "grad_norm": 0.8786153197288513, "learning_rate": 1.7330221163024851e-06, "loss": 1.1658, "step": 558 }, { "epoch": 0.8944, "grad_norm": 1.0269036293029785, "learning_rate": 1.6822600446293636e-06, "loss": 1.2223, "step": 559 }, { "epoch": 0.896, "grad_norm": 0.9815586805343628, "learning_rate": 1.632226611998322e-06, "loss": 1.2982, "step": 560 }, { "epoch": 0.8976, "grad_norm": 0.9357401132583618, "learning_rate": 1.5829233818675766e-06, "loss": 1.1579, "step": 561 }, { "epoch": 0.8992, "grad_norm": 0.9296241998672485, "learning_rate": 1.534351894877803e-06, "loss": 1.275, "step": 562 }, { "epoch": 0.9008, "grad_norm": 0.9087421894073486, "learning_rate": 1.486513668803946e-06, "loss": 1.2282, "step": 563 }, { "epoch": 0.9024, "grad_norm": 0.8636383414268494, "learning_rate": 1.4394101985078402e-06, "loss": 1.2651, "step": 564 }, { "epoch": 0.904, "grad_norm": 0.9391440153121948, "learning_rate": 1.3930429558914494e-06, "loss": 1.3123, "step": 565 }, { "epoch": 0.9056, "grad_norm": 0.8945949077606201, "learning_rate": 1.3474133898509073e-06, "loss": 1.253, "step": 566 }, { "epoch": 0.9072, "grad_norm": 0.8939377069473267, "learning_rate": 1.3025229262312366e-06, "loss": 1.2218, "step": 567 }, { "epoch": 0.9088, "grad_norm": 0.9494385719299316, "learning_rate": 1.2583729677817762e-06, "loss": 1.2736, "step": 568 }, { "epoch": 0.9104, "grad_norm": 0.9959173202514648, "learning_rate": 1.214964894112361e-06, "loss": 1.3017, "step": 569 }, { "epoch": 0.912, "grad_norm": 1.0381131172180176, "learning_rate": 1.1723000616502167e-06, "loss": 1.3057, "step": 570 }, { "epoch": 0.9136, "grad_norm": 0.9565362930297852, "learning_rate": 1.1303798035975643e-06, "loss": 1.2157, "step": 571 }, { "epoch": 0.9152, "grad_norm": 1.2417234182357788, "learning_rate": 1.089205429889953e-06, "loss": 1.1841, "step": 572 }, { "epoch": 0.9168, "grad_norm": 0.8463436961174011, "learning_rate": 1.0487782271553504e-06, "loss": 1.1661, "step": 573 }, { "epoch": 0.9184, "grad_norm": 0.9578458666801453, "learning_rate": 1.009099458673915e-06, "loss": 1.1491, "step": 574 }, { "epoch": 0.92, "grad_norm": 1.0012203454971313, "learning_rate": 9.701703643385295e-07, "loss": 1.3727, "step": 575 }, { "epoch": 0.9216, "grad_norm": 0.9493686556816101, "learning_rate": 9.319921606160603e-07, "loss": 1.3262, "step": 576 }, { "epoch": 0.9232, "grad_norm": 0.9732685685157776, "learning_rate": 8.94566040509337e-07, "loss": 1.3848, "step": 577 }, { "epoch": 0.9248, "grad_norm": 1.003488302230835, "learning_rate": 8.578931735198858e-07, "loss": 1.253, "step": 578 }, { "epoch": 0.9264, "grad_norm": 0.9489741921424866, "learning_rate": 8.219747056113586e-07, "loss": 1.3633, "step": 579 }, { "epoch": 0.928, "grad_norm": 0.9205242991447449, "learning_rate": 7.868117591737583e-07, "loss": 1.4024, "step": 580 }, { "epoch": 0.9296, "grad_norm": 0.9339300394058228, "learning_rate": 7.524054329883346e-07, "loss": 1.1532, "step": 581 }, { "epoch": 0.9312, "grad_norm": 0.9014155864715576, "learning_rate": 7.187568021932795e-07, "loss": 1.2517, "step": 582 }, { "epoch": 0.9328, "grad_norm": 0.9373380541801453, "learning_rate": 6.858669182500971e-07, "loss": 1.212, "step": 583 }, { "epoch": 0.9344, "grad_norm": 0.8957499265670776, "learning_rate": 6.537368089107765e-07, "loss": 1.3616, "step": 584 }, { "epoch": 0.936, "grad_norm": 1.0080842971801758, "learning_rate": 6.223674781856592e-07, "loss": 1.2811, "step": 585 }, { "epoch": 0.9376, "grad_norm": 0.9520625472068787, "learning_rate": 5.917599063120754e-07, "loss": 1.3469, "step": 586 }, { "epoch": 0.9392, "grad_norm": 0.8778144121170044, "learning_rate": 5.619150497236992e-07, "loss": 1.2631, "step": 587 }, { "epoch": 0.9408, "grad_norm": 0.9478060603141785, "learning_rate": 5.328338410206801e-07, "loss": 1.1698, "step": 588 }, { "epoch": 0.9424, "grad_norm": 0.9025092124938965, "learning_rate": 5.045171889404954e-07, "loss": 1.1005, "step": 589 }, { "epoch": 0.944, "grad_norm": 1.0931105613708496, "learning_rate": 4.769659783295383e-07, "loss": 1.2853, "step": 590 }, { "epoch": 0.9456, "grad_norm": 0.9500147700309753, "learning_rate": 4.501810701154907e-07, "loss": 1.1083, "step": 591 }, { "epoch": 0.9472, "grad_norm": 0.9342660307884216, "learning_rate": 4.24163301280403e-07, "loss": 1.2244, "step": 592 }, { "epoch": 0.9488, "grad_norm": 0.9654433727264404, "learning_rate": 3.98913484834551e-07, "loss": 1.2992, "step": 593 }, { "epoch": 0.9504, "grad_norm": 0.8733147382736206, "learning_rate": 3.7443240979102565e-07, "loss": 1.167, "step": 594 }, { "epoch": 0.952, "grad_norm": 0.9046812057495117, "learning_rate": 3.507208411410778e-07, "loss": 1.2584, "step": 595 }, { "epoch": 0.9536, "grad_norm": 0.9616255760192871, "learning_rate": 3.277795198302125e-07, "loss": 1.2654, "step": 596 }, { "epoch": 0.9552, "grad_norm": 0.9657197594642639, "learning_rate": 3.0560916273504325e-07, "loss": 1.2626, "step": 597 }, { "epoch": 0.9568, "grad_norm": 1.1410290002822876, "learning_rate": 2.8421046264088256e-07, "loss": 1.3203, "step": 598 }, { "epoch": 0.9584, "grad_norm": 0.8977676033973694, "learning_rate": 2.635840882200924e-07, "loss": 1.2601, "step": 599 }, { "epoch": 0.96, "grad_norm": 0.8826071619987488, "learning_rate": 2.4373068401120356e-07, "loss": 1.2444, "step": 600 }, { "epoch": 0.9616, "grad_norm": 0.9761996865272522, "learning_rate": 2.246508703987543e-07, "loss": 1.3422, "step": 601 }, { "epoch": 0.9632, "grad_norm": 0.8568698763847351, "learning_rate": 2.063452435939167e-07, "loss": 1.2023, "step": 602 }, { "epoch": 0.9648, "grad_norm": 0.9191575646400452, "learning_rate": 1.8881437561586722e-07, "loss": 1.3778, "step": 603 }, { "epoch": 0.9664, "grad_norm": 0.95060133934021, "learning_rate": 1.7205881427391212e-07, "loss": 1.1938, "step": 604 }, { "epoch": 0.968, "grad_norm": 0.8598877787590027, "learning_rate": 1.5607908315035667e-07, "loss": 1.2414, "step": 605 }, { "epoch": 0.9696, "grad_norm": 0.9289694428443909, "learning_rate": 1.4087568158415998e-07, "loss": 1.283, "step": 606 }, { "epoch": 0.9712, "grad_norm": 0.9522616267204285, "learning_rate": 1.264490846553279e-07, "loss": 1.1906, "step": 607 }, { "epoch": 0.9728, "grad_norm": 0.884294867515564, "learning_rate": 1.1279974317005837e-07, "loss": 1.3421, "step": 608 }, { "epoch": 0.9744, "grad_norm": 0.8290904760360718, "learning_rate": 9.992808364666373e-08, "loss": 1.2514, "step": 609 }, { "epoch": 0.976, "grad_norm": 0.922666072845459, "learning_rate": 8.783450830224249e-08, "loss": 1.3292, "step": 610 }, { "epoch": 0.9776, "grad_norm": 0.9178884029388428, "learning_rate": 7.651939504010885e-08, "loss": 1.1438, "step": 611 }, { "epoch": 0.9792, "grad_norm": 0.9293354153633118, "learning_rate": 6.598309743797437e-08, "loss": 1.2618, "step": 612 }, { "epoch": 0.9808, "grad_norm": 0.9244751930236816, "learning_rate": 5.622594473692067e-08, "loss": 1.2053, "step": 613 }, { "epoch": 0.9824, "grad_norm": 0.8976507782936096, "learning_rate": 4.724824183108822e-08, "loss": 1.3884, "step": 614 }, { "epoch": 0.984, "grad_norm": 0.9788684844970703, "learning_rate": 3.90502692581729e-08, "loss": 1.2931, "step": 615 }, { "epoch": 0.9856, "grad_norm": 0.8698328137397766, "learning_rate": 3.163228319062739e-08, "loss": 1.1533, "step": 616 }, { "epoch": 0.9872, "grad_norm": 0.9049240350723267, "learning_rate": 2.4994515427695374e-08, "loss": 1.2941, "step": 617 }, { "epoch": 0.9888, "grad_norm": 0.846856415271759, "learning_rate": 1.913717338813126e-08, "loss": 1.2586, "step": 618 }, { "epoch": 0.9904, "grad_norm": 0.9132593870162964, "learning_rate": 1.4060440103746964e-08, "loss": 1.1571, "step": 619 }, { "epoch": 0.992, "grad_norm": 0.8558741211891174, "learning_rate": 9.764474213677654e-09, "loss": 1.2113, "step": 620 }, { "epoch": 0.9936, "grad_norm": 0.8717495203018188, "learning_rate": 6.249409959421803e-09, "loss": 1.2505, "step": 621 }, { "epoch": 0.9952, "grad_norm": 0.8598109483718872, "learning_rate": 3.5153571806584296e-09, "loss": 1.1295, "step": 622 }, { "epoch": 0.9968, "grad_norm": 0.8682437539100647, "learning_rate": 1.5624013118137326e-09, "loss": 1.2072, "step": 623 }, { "epoch": 0.9984, "grad_norm": 0.9119482636451721, "learning_rate": 3.9060337937435464e-10, "loss": 1.1789, "step": 624 }, { "epoch": 1.0, "grad_norm": 0.9116878509521484, "learning_rate": 0.0, "loss": 1.1926, "step": 625 } ], "logging_steps": 1, "max_steps": 625, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.156461828037018e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }