{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 315, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.015873015873015872, "grad_norm": 3.3795859813690186, "learning_rate": 6.25e-07, "loss": 0.4037, "step": 1 }, { "epoch": 0.031746031746031744, "grad_norm": 4.176924228668213, "learning_rate": 1.25e-06, "loss": 0.6101, "step": 2 }, { "epoch": 0.047619047619047616, "grad_norm": 3.2537522315979004, "learning_rate": 1.8750000000000003e-06, "loss": 0.5335, "step": 3 }, { "epoch": 0.06349206349206349, "grad_norm": 3.8701913356781006, "learning_rate": 2.5e-06, "loss": 0.5156, "step": 4 }, { "epoch": 0.07936507936507936, "grad_norm": 3.2381300926208496, "learning_rate": 3.125e-06, "loss": 0.5435, "step": 5 }, { "epoch": 0.09523809523809523, "grad_norm": 2.6569769382476807, "learning_rate": 3.7500000000000005e-06, "loss": 0.4349, "step": 6 }, { "epoch": 0.1111111111111111, "grad_norm": 2.641036033630371, "learning_rate": 4.3750000000000005e-06, "loss": 0.4657, "step": 7 }, { "epoch": 0.12698412698412698, "grad_norm": 2.500192880630493, "learning_rate": 5e-06, "loss": 0.387, "step": 8 }, { "epoch": 0.14285714285714285, "grad_norm": 2.925560474395752, "learning_rate": 5.625e-06, "loss": 0.5708, "step": 9 }, { "epoch": 0.15873015873015872, "grad_norm": 1.740651249885559, "learning_rate": 6.25e-06, "loss": 0.4166, "step": 10 }, { "epoch": 0.1746031746031746, "grad_norm": 1.8194835186004639, "learning_rate": 6.875e-06, "loss": 0.4392, "step": 11 }, { "epoch": 0.19047619047619047, "grad_norm": 1.2873449325561523, "learning_rate": 7.500000000000001e-06, "loss": 0.3224, "step": 12 }, { "epoch": 0.20634920634920634, "grad_norm": 1.4529649019241333, "learning_rate": 8.125000000000001e-06, "loss": 0.3407, "step": 13 }, { "epoch": 0.2222222222222222, "grad_norm": 1.3093940019607544, "learning_rate": 8.750000000000001e-06, "loss": 0.3677, "step": 14 }, { "epoch": 0.23809523809523808, "grad_norm": 1.4765881299972534, "learning_rate": 9.375000000000001e-06, "loss": 0.3792, "step": 15 }, { "epoch": 0.25396825396825395, "grad_norm": 5.271663665771484, "learning_rate": 1e-05, "loss": 0.6346, "step": 16 }, { "epoch": 0.2698412698412698, "grad_norm": 1.3494993448257446, "learning_rate": 9.999724009977419e-06, "loss": 0.4433, "step": 17 }, { "epoch": 0.2857142857142857, "grad_norm": 1.115939736366272, "learning_rate": 9.998896070377873e-06, "loss": 0.2864, "step": 18 }, { "epoch": 0.30158730158730157, "grad_norm": 1.5331807136535645, "learning_rate": 9.99751627260259e-06, "loss": 0.3423, "step": 19 }, { "epoch": 0.31746031746031744, "grad_norm": 1.5603549480438232, "learning_rate": 9.995584768975735e-06, "loss": 0.3799, "step": 20 }, { "epoch": 0.3333333333333333, "grad_norm": 1.4752039909362793, "learning_rate": 9.993101772727602e-06, "loss": 0.3565, "step": 21 }, { "epoch": 0.3492063492063492, "grad_norm": 9.61251449584961, "learning_rate": 9.990067557971068e-06, "loss": 0.7244, "step": 22 }, { "epoch": 0.36507936507936506, "grad_norm": 1.3895500898361206, "learning_rate": 9.986482459671332e-06, "loss": 0.3444, "step": 23 }, { "epoch": 0.38095238095238093, "grad_norm": 1.231410026550293, "learning_rate": 9.982346873608936e-06, "loss": 0.3221, "step": 24 }, { "epoch": 0.3968253968253968, "grad_norm": 1.9909180402755737, "learning_rate": 9.977661256336081e-06, "loss": 0.3724, "step": 25 }, { "epoch": 0.4126984126984127, "grad_norm": 1.2321062088012695, "learning_rate": 9.972426125126208e-06, "loss": 0.3391, "step": 26 }, { "epoch": 0.42857142857142855, "grad_norm": 5.4099321365356445, "learning_rate": 9.966642057916915e-06, "loss": 0.5806, "step": 27 }, { "epoch": 0.4444444444444444, "grad_norm": 1.0613089799880981, "learning_rate": 9.960309693246135e-06, "loss": 0.2237, "step": 28 }, { "epoch": 0.4603174603174603, "grad_norm": 1.1249053478240967, "learning_rate": 9.953429730181653e-06, "loss": 0.3618, "step": 29 }, { "epoch": 0.47619047619047616, "grad_norm": 1.0634913444519043, "learning_rate": 9.94600292824394e-06, "loss": 0.3177, "step": 30 }, { "epoch": 0.49206349206349204, "grad_norm": 1.1851016283035278, "learning_rate": 9.938030107322284e-06, "loss": 0.2799, "step": 31 }, { "epoch": 0.5079365079365079, "grad_norm": 1.2439275979995728, "learning_rate": 9.929512147584297e-06, "loss": 0.3415, "step": 32 }, { "epoch": 0.5238095238095238, "grad_norm": 1.1060391664505005, "learning_rate": 9.920449989378741e-06, "loss": 0.2888, "step": 33 }, { "epoch": 0.5396825396825397, "grad_norm": 1.4267876148223877, "learning_rate": 9.910844633131712e-06, "loss": 0.3724, "step": 34 }, { "epoch": 0.5555555555555556, "grad_norm": 1.1967415809631348, "learning_rate": 9.90069713923621e-06, "loss": 0.3362, "step": 35 }, { "epoch": 0.5714285714285714, "grad_norm": 0.9548850059509277, "learning_rate": 9.890008627935057e-06, "loss": 0.2776, "step": 36 }, { "epoch": 0.5873015873015873, "grad_norm": 1.1262502670288086, "learning_rate": 9.878780279197246e-06, "loss": 0.3215, "step": 37 }, { "epoch": 0.6031746031746031, "grad_norm": 0.9589006304740906, "learning_rate": 9.867013332587667e-06, "loss": 0.2871, "step": 38 }, { "epoch": 0.6190476190476191, "grad_norm": 1.2175616025924683, "learning_rate": 9.854709087130261e-06, "loss": 0.2672, "step": 39 }, { "epoch": 0.6349206349206349, "grad_norm": 1.1927433013916016, "learning_rate": 9.841868901164621e-06, "loss": 0.3167, "step": 40 }, { "epoch": 0.6507936507936508, "grad_norm": 1.1496555805206299, "learning_rate": 9.828494192196037e-06, "loss": 0.331, "step": 41 }, { "epoch": 0.6666666666666666, "grad_norm": 2.9185266494750977, "learning_rate": 9.814586436738998e-06, "loss": 0.4792, "step": 42 }, { "epoch": 0.6825396825396826, "grad_norm": 4.013699054718018, "learning_rate": 9.8001471701542e-06, "loss": 0.3316, "step": 43 }, { "epoch": 0.6984126984126984, "grad_norm": 1.1543551683425903, "learning_rate": 9.785177986479049e-06, "loss": 0.2954, "step": 44 }, { "epoch": 0.7142857142857143, "grad_norm": 0.9748827219009399, "learning_rate": 9.76968053825168e-06, "loss": 0.2267, "step": 45 }, { "epoch": 0.7301587301587301, "grad_norm": 1.0915194749832153, "learning_rate": 9.753656536328529e-06, "loss": 0.3005, "step": 46 }, { "epoch": 0.746031746031746, "grad_norm": 2.678406238555908, "learning_rate": 9.737107749695456e-06, "loss": 0.2828, "step": 47 }, { "epoch": 0.7619047619047619, "grad_norm": 0.9532033801078796, "learning_rate": 9.72003600527246e-06, "loss": 0.2658, "step": 48 }, { "epoch": 0.7777777777777778, "grad_norm": 1.1294094324111938, "learning_rate": 9.702443187711991e-06, "loss": 0.2978, "step": 49 }, { "epoch": 0.7936507936507936, "grad_norm": 1.067834734916687, "learning_rate": 9.6843312391909e-06, "loss": 0.2941, "step": 50 }, { "epoch": 0.8095238095238095, "grad_norm": 1.0761404037475586, "learning_rate": 9.665702159196014e-06, "loss": 0.3056, "step": 51 }, { "epoch": 0.8253968253968254, "grad_norm": 4.403201103210449, "learning_rate": 9.646558004303419e-06, "loss": 0.3728, "step": 52 }, { "epoch": 0.8412698412698413, "grad_norm": 1.0028859376907349, "learning_rate": 9.62690088795141e-06, "loss": 0.2765, "step": 53 }, { "epoch": 0.8571428571428571, "grad_norm": 1.0655666589736938, "learning_rate": 9.606732980207186e-06, "loss": 0.2912, "step": 54 }, { "epoch": 0.873015873015873, "grad_norm": 8.566058158874512, "learning_rate": 9.586056507527266e-06, "loss": 0.5695, "step": 55 }, { "epoch": 0.8888888888888888, "grad_norm": 1.487505555152893, "learning_rate": 9.564873752511719e-06, "loss": 0.2906, "step": 56 }, { "epoch": 0.9047619047619048, "grad_norm": 2.458684206008911, "learning_rate": 9.543187053652156e-06, "loss": 0.4637, "step": 57 }, { "epoch": 0.9206349206349206, "grad_norm": 1.1918795108795166, "learning_rate": 9.520998805073583e-06, "loss": 0.347, "step": 58 }, { "epoch": 0.9365079365079365, "grad_norm": 1.9880917072296143, "learning_rate": 9.498311456270091e-06, "loss": 0.3115, "step": 59 }, { "epoch": 0.9523809523809523, "grad_norm": 1.1867977380752563, "learning_rate": 9.475127511834438e-06, "loss": 0.3172, "step": 60 }, { "epoch": 0.9682539682539683, "grad_norm": 1.2388020753860474, "learning_rate": 9.451449531181571e-06, "loss": 0.252, "step": 61 }, { "epoch": 0.9841269841269841, "grad_norm": 9.597868919372559, "learning_rate": 9.427280128266049e-06, "loss": 0.2668, "step": 62 }, { "epoch": 1.0, "grad_norm": 0.9280756711959839, "learning_rate": 9.4026219712935e-06, "loss": 0.2107, "step": 63 }, { "epoch": 1.0158730158730158, "grad_norm": 0.9878024458885193, "learning_rate": 9.377477782426041e-06, "loss": 0.2008, "step": 64 }, { "epoch": 1.0317460317460316, "grad_norm": 2.081068992614746, "learning_rate": 9.351850337481774e-06, "loss": 0.1998, "step": 65 }, { "epoch": 1.0476190476190477, "grad_norm": 0.9242327213287354, "learning_rate": 9.325742465628342e-06, "loss": 0.1791, "step": 66 }, { "epoch": 1.0634920634920635, "grad_norm": 2.085343599319458, "learning_rate": 9.299157049070604e-06, "loss": 0.1787, "step": 67 }, { "epoch": 1.0793650793650793, "grad_norm": 0.815376877784729, "learning_rate": 9.272097022732444e-06, "loss": 0.1678, "step": 68 }, { "epoch": 1.0952380952380953, "grad_norm": 0.8281353116035461, "learning_rate": 9.244565373932775e-06, "loss": 0.1646, "step": 69 }, { "epoch": 1.1111111111111112, "grad_norm": 0.828342616558075, "learning_rate": 9.216565142055745e-06, "loss": 0.155, "step": 70 }, { "epoch": 1.126984126984127, "grad_norm": 0.77997225522995, "learning_rate": 9.188099418215208e-06, "loss": 0.1176, "step": 71 }, { "epoch": 1.1428571428571428, "grad_norm": 0.9490588903427124, "learning_rate": 9.159171344913469e-06, "loss": 0.1316, "step": 72 }, { "epoch": 1.1587301587301586, "grad_norm": 1.061501145362854, "learning_rate": 9.129784115694368e-06, "loss": 0.1759, "step": 73 }, { "epoch": 1.1746031746031746, "grad_norm": 1.077555537223816, "learning_rate": 9.09994097479073e-06, "loss": 0.1607, "step": 74 }, { "epoch": 1.1904761904761905, "grad_norm": 0.8213773965835571, "learning_rate": 9.069645216766207e-06, "loss": 0.1123, "step": 75 }, { "epoch": 1.2063492063492063, "grad_norm": 0.8540403246879578, "learning_rate": 9.038900186151574e-06, "loss": 0.1306, "step": 76 }, { "epoch": 1.2222222222222223, "grad_norm": 1.0110580921173096, "learning_rate": 9.007709277075512e-06, "loss": 0.1644, "step": 77 }, { "epoch": 1.2380952380952381, "grad_norm": 1.3765249252319336, "learning_rate": 8.976075932889896e-06, "loss": 0.2027, "step": 78 }, { "epoch": 1.253968253968254, "grad_norm": 1.0460267066955566, "learning_rate": 8.944003645789678e-06, "loss": 0.1635, "step": 79 }, { "epoch": 1.2698412698412698, "grad_norm": 0.8925233483314514, "learning_rate": 8.911495956427358e-06, "loss": 0.1538, "step": 80 }, { "epoch": 1.2857142857142856, "grad_norm": 1.1790034770965576, "learning_rate": 8.8785564535221e-06, "loss": 0.164, "step": 81 }, { "epoch": 1.3015873015873016, "grad_norm": 1.1329326629638672, "learning_rate": 8.845188773463567e-06, "loss": 0.1449, "step": 82 }, { "epoch": 1.3174603174603174, "grad_norm": 4.996532440185547, "learning_rate": 8.811396599910467e-06, "loss": 0.247, "step": 83 }, { "epoch": 1.3333333333333333, "grad_norm": 0.74964839220047, "learning_rate": 8.777183663383897e-06, "loss": 0.1259, "step": 84 }, { "epoch": 1.3492063492063493, "grad_norm": 1.8605432510375977, "learning_rate": 8.742553740855507e-06, "loss": 0.1935, "step": 85 }, { "epoch": 1.3650793650793651, "grad_norm": 0.9615695476531982, "learning_rate": 8.707510655330536e-06, "loss": 0.1772, "step": 86 }, { "epoch": 1.380952380952381, "grad_norm": 1.0028129816055298, "learning_rate": 8.672058275425773e-06, "loss": 0.159, "step": 87 }, { "epoch": 1.3968253968253967, "grad_norm": 0.780433177947998, "learning_rate": 8.636200514942466e-06, "loss": 0.1374, "step": 88 }, { "epoch": 1.4126984126984126, "grad_norm": 2.116007089614868, "learning_rate": 8.59994133243427e-06, "loss": 0.1951, "step": 89 }, { "epoch": 1.4285714285714286, "grad_norm": 0.8867003917694092, "learning_rate": 8.563284730770222e-06, "loss": 0.147, "step": 90 }, { "epoch": 1.4444444444444444, "grad_norm": 0.8089895248413086, "learning_rate": 8.52623475669285e-06, "loss": 0.1109, "step": 91 }, { "epoch": 1.4603174603174602, "grad_norm": 1.093917727470398, "learning_rate": 8.488795500371427e-06, "loss": 0.1328, "step": 92 }, { "epoch": 1.4761904761904763, "grad_norm": 1.0122655630111694, "learning_rate": 8.450971094950433e-06, "loss": 0.1998, "step": 93 }, { "epoch": 1.492063492063492, "grad_norm": 3.2237043380737305, "learning_rate": 8.412765716093273e-06, "loss": 0.1949, "step": 94 }, { "epoch": 1.507936507936508, "grad_norm": 0.8921442031860352, "learning_rate": 8.374183581521288e-06, "loss": 0.1659, "step": 95 }, { "epoch": 1.5238095238095237, "grad_norm": 0.9087255001068115, "learning_rate": 8.335228950548164e-06, "loss": 0.1381, "step": 96 }, { "epoch": 1.5396825396825395, "grad_norm": 0.8095484972000122, "learning_rate": 8.29590612360969e-06, "loss": 0.1418, "step": 97 }, { "epoch": 1.5555555555555556, "grad_norm": 0.9046949744224548, "learning_rate": 8.256219441789023e-06, "loss": 0.1362, "step": 98 }, { "epoch": 1.5714285714285714, "grad_norm": 1.0229334831237793, "learning_rate": 8.216173286337449e-06, "loss": 0.1548, "step": 99 }, { "epoch": 1.5873015873015874, "grad_norm": 0.7955384254455566, "learning_rate": 8.175772078190706e-06, "loss": 0.0964, "step": 100 }, { "epoch": 1.6031746031746033, "grad_norm": 0.8258505463600159, "learning_rate": 8.135020277480933e-06, "loss": 0.1436, "step": 101 }, { "epoch": 1.619047619047619, "grad_norm": 0.9820398688316345, "learning_rate": 8.093922383044293e-06, "loss": 0.1875, "step": 102 }, { "epoch": 1.6349206349206349, "grad_norm": 0.8524972796440125, "learning_rate": 8.052482931924307e-06, "loss": 0.1381, "step": 103 }, { "epoch": 1.6507936507936507, "grad_norm": 1.515363097190857, "learning_rate": 8.010706498870997e-06, "loss": 0.1783, "step": 104 }, { "epoch": 1.6666666666666665, "grad_norm": 0.9057204127311707, "learning_rate": 7.968597695835845e-06, "loss": 0.1491, "step": 105 }, { "epoch": 1.6825396825396826, "grad_norm": 0.73180091381073, "learning_rate": 7.926161171462647e-06, "loss": 0.1216, "step": 106 }, { "epoch": 1.6984126984126984, "grad_norm": 1.321354627609253, "learning_rate": 7.883401610574338e-06, "loss": 0.1496, "step": 107 }, { "epoch": 1.7142857142857144, "grad_norm": 0.9403553009033203, "learning_rate": 7.84032373365578e-06, "loss": 0.1509, "step": 108 }, { "epoch": 1.7301587301587302, "grad_norm": 0.9378424286842346, "learning_rate": 7.796932296332666e-06, "loss": 0.16, "step": 109 }, { "epoch": 1.746031746031746, "grad_norm": 0.8329445123672485, "learning_rate": 7.753232088846505e-06, "loss": 0.0972, "step": 110 }, { "epoch": 1.7619047619047619, "grad_norm": 1.2565405368804932, "learning_rate": 7.709227935525796e-06, "loss": 0.1837, "step": 111 }, { "epoch": 1.7777777777777777, "grad_norm": 0.9211390614509583, "learning_rate": 7.664924694253444e-06, "loss": 0.1472, "step": 112 }, { "epoch": 1.7936507936507935, "grad_norm": 0.8232191801071167, "learning_rate": 7.620327255930475e-06, "loss": 0.123, "step": 113 }, { "epoch": 1.8095238095238095, "grad_norm": 0.8965691328048706, "learning_rate": 7.575440543936092e-06, "loss": 0.1329, "step": 114 }, { "epoch": 1.8253968253968254, "grad_norm": 0.9043343663215637, "learning_rate": 7.530269513584158e-06, "loss": 0.1268, "step": 115 }, { "epoch": 1.8412698412698414, "grad_norm": 0.822960376739502, "learning_rate": 7.484819151576148e-06, "loss": 0.128, "step": 116 }, { "epoch": 1.8571428571428572, "grad_norm": 1.0179648399353027, "learning_rate": 7.439094475450638e-06, "loss": 0.1732, "step": 117 }, { "epoch": 1.873015873015873, "grad_norm": 0.888350784778595, "learning_rate": 7.393100533029383e-06, "loss": 0.1608, "step": 118 }, { "epoch": 1.8888888888888888, "grad_norm": 0.8138338923454285, "learning_rate": 7.346842401860069e-06, "loss": 0.1534, "step": 119 }, { "epoch": 1.9047619047619047, "grad_norm": 1.0096592903137207, "learning_rate": 7.300325188655762e-06, "loss": 0.1705, "step": 120 }, { "epoch": 1.9206349206349205, "grad_norm": 0.8139647841453552, "learning_rate": 7.253554028731149e-06, "loss": 0.1453, "step": 121 }, { "epoch": 1.9365079365079365, "grad_norm": 1.51543128490448, "learning_rate": 7.206534085435626e-06, "loss": 0.2008, "step": 122 }, { "epoch": 1.9523809523809523, "grad_norm": 0.944955587387085, "learning_rate": 7.159270549583278e-06, "loss": 0.1793, "step": 123 }, { "epoch": 1.9682539682539684, "grad_norm": 0.943600594997406, "learning_rate": 7.111768638879834e-06, "loss": 0.1783, "step": 124 }, { "epoch": 1.9841269841269842, "grad_norm": 0.8988723158836365, "learning_rate": 7.064033597346658e-06, "loss": 0.1472, "step": 125 }, { "epoch": 2.0, "grad_norm": 0.8375394940376282, "learning_rate": 7.016070694741824e-06, "loss": 0.0907, "step": 126 }, { "epoch": 2.015873015873016, "grad_norm": 0.5929964780807495, "learning_rate": 6.967885225978366e-06, "loss": 0.0541, "step": 127 }, { "epoch": 2.0317460317460316, "grad_norm": 0.7823465466499329, "learning_rate": 6.919482510539723e-06, "loss": 0.0553, "step": 128 }, { "epoch": 2.0476190476190474, "grad_norm": 0.6394754648208618, "learning_rate": 6.870867891892511e-06, "loss": 0.0539, "step": 129 }, { "epoch": 2.0634920634920633, "grad_norm": 0.6169945001602173, "learning_rate": 6.822046736896607e-06, "loss": 0.0446, "step": 130 }, { "epoch": 2.0793650793650795, "grad_norm": 0.6922060251235962, "learning_rate": 6.773024435212678e-06, "loss": 0.0474, "step": 131 }, { "epoch": 2.0952380952380953, "grad_norm": 0.6317225098609924, "learning_rate": 6.723806398707186e-06, "loss": 0.0472, "step": 132 }, { "epoch": 2.111111111111111, "grad_norm": 0.6741769909858704, "learning_rate": 6.674398060854931e-06, "loss": 0.042, "step": 133 }, { "epoch": 2.126984126984127, "grad_norm": 0.9402458071708679, "learning_rate": 6.624804876139227e-06, "loss": 0.0647, "step": 134 }, { "epoch": 2.142857142857143, "grad_norm": 0.8518017530441284, "learning_rate": 6.57503231944974e-06, "loss": 0.0559, "step": 135 }, { "epoch": 2.1587301587301586, "grad_norm": 0.9338895678520203, "learning_rate": 6.525085885478088e-06, "loss": 0.0482, "step": 136 }, { "epoch": 2.1746031746031744, "grad_norm": 0.8100844025611877, "learning_rate": 6.4749710881112485e-06, "loss": 0.0599, "step": 137 }, { "epoch": 2.1904761904761907, "grad_norm": 1.0090680122375488, "learning_rate": 6.424693459822843e-06, "loss": 0.0642, "step": 138 }, { "epoch": 2.2063492063492065, "grad_norm": 0.6279119253158569, "learning_rate": 6.374258551062377e-06, "loss": 0.0514, "step": 139 }, { "epoch": 2.2222222222222223, "grad_norm": 0.6508628129959106, "learning_rate": 6.3236719296424985e-06, "loss": 0.0464, "step": 140 }, { "epoch": 2.238095238095238, "grad_norm": 0.586854100227356, "learning_rate": 6.272939180124316e-06, "loss": 0.0407, "step": 141 }, { "epoch": 2.253968253968254, "grad_norm": 0.6627777814865112, "learning_rate": 6.222065903200909e-06, "loss": 0.0487, "step": 142 }, { "epoch": 2.2698412698412698, "grad_norm": 0.7252594232559204, "learning_rate": 6.171057715079012e-06, "loss": 0.0568, "step": 143 }, { "epoch": 2.2857142857142856, "grad_norm": 0.5209100246429443, "learning_rate": 6.119920246859025e-06, "loss": 0.0363, "step": 144 }, { "epoch": 2.3015873015873014, "grad_norm": 0.5737756490707397, "learning_rate": 6.068659143913349e-06, "loss": 0.0303, "step": 145 }, { "epoch": 2.317460317460317, "grad_norm": 1.3126976490020752, "learning_rate": 6.0172800652631706e-06, "loss": 0.074, "step": 146 }, { "epoch": 2.3333333333333335, "grad_norm": 0.6515079140663147, "learning_rate": 5.965788682953717e-06, "loss": 0.047, "step": 147 }, { "epoch": 2.3492063492063493, "grad_norm": 0.6837308406829834, "learning_rate": 5.914190681428098e-06, "loss": 0.0354, "step": 148 }, { "epoch": 2.365079365079365, "grad_norm": 1.8536609411239624, "learning_rate": 5.862491756899753e-06, "loss": 0.0605, "step": 149 }, { "epoch": 2.380952380952381, "grad_norm": 0.7980743646621704, "learning_rate": 5.8106976167236236e-06, "loss": 0.0443, "step": 150 }, { "epoch": 2.3968253968253967, "grad_norm": 0.7513721585273743, "learning_rate": 5.758813978766077e-06, "loss": 0.0554, "step": 151 }, { "epoch": 2.4126984126984126, "grad_norm": 0.9976074695587158, "learning_rate": 5.706846570773677e-06, "loss": 0.0629, "step": 152 }, { "epoch": 2.4285714285714284, "grad_norm": 0.6811009049415588, "learning_rate": 5.654801129740863e-06, "loss": 0.0395, "step": 153 }, { "epoch": 2.4444444444444446, "grad_norm": 0.8673996329307556, "learning_rate": 5.6026834012766155e-06, "loss": 0.0517, "step": 154 }, { "epoch": 2.4603174603174605, "grad_norm": 0.6963090300559998, "learning_rate": 5.550499138970158e-06, "loss": 0.0416, "step": 155 }, { "epoch": 2.4761904761904763, "grad_norm": 0.6987242102622986, "learning_rate": 5.4982541037557825e-06, "loss": 0.0416, "step": 156 }, { "epoch": 2.492063492063492, "grad_norm": 1.312720775604248, "learning_rate": 5.44595406327687e-06, "loss": 0.0499, "step": 157 }, { "epoch": 2.507936507936508, "grad_norm": 0.7223400473594666, "learning_rate": 5.393604791249158e-06, "loss": 0.0617, "step": 158 }, { "epoch": 2.5238095238095237, "grad_norm": 0.5593965649604797, "learning_rate": 5.341212066823356e-06, "loss": 0.0341, "step": 159 }, { "epoch": 2.5396825396825395, "grad_norm": 0.80050128698349, "learning_rate": 5.288781673947143e-06, "loss": 0.0818, "step": 160 }, { "epoch": 2.5555555555555554, "grad_norm": 0.9423879981040955, "learning_rate": 5.2363194007266435e-06, "loss": 0.0533, "step": 161 }, { "epoch": 2.571428571428571, "grad_norm": 0.5583639740943909, "learning_rate": 5.183831038787449e-06, "loss": 0.0308, "step": 162 }, { "epoch": 2.5873015873015874, "grad_norm": 0.9041386842727661, "learning_rate": 5.131322382635236e-06, "loss": 0.0393, "step": 163 }, { "epoch": 2.6031746031746033, "grad_norm": 0.7514568567276001, "learning_rate": 5.078799229016083e-06, "loss": 0.0379, "step": 164 }, { "epoch": 2.619047619047619, "grad_norm": 0.6620778441429138, "learning_rate": 5.0262673762765316e-06, "loss": 0.042, "step": 165 }, { "epoch": 2.634920634920635, "grad_norm": 0.7500037550926208, "learning_rate": 4.973732623723471e-06, "loss": 0.0494, "step": 166 }, { "epoch": 2.6507936507936507, "grad_norm": 0.6531999707221985, "learning_rate": 4.921200770983919e-06, "loss": 0.0364, "step": 167 }, { "epoch": 2.6666666666666665, "grad_norm": 1.0730338096618652, "learning_rate": 4.8686776173647655e-06, "loss": 0.0554, "step": 168 }, { "epoch": 2.682539682539683, "grad_norm": 0.6723483204841614, "learning_rate": 4.816168961212553e-06, "loss": 0.0532, "step": 169 }, { "epoch": 2.6984126984126986, "grad_norm": 0.7021626234054565, "learning_rate": 4.763680599273357e-06, "loss": 0.0611, "step": 170 }, { "epoch": 2.7142857142857144, "grad_norm": 0.7522814273834229, "learning_rate": 4.711218326052859e-06, "loss": 0.0426, "step": 171 }, { "epoch": 2.7301587301587302, "grad_norm": 0.6586665511131287, "learning_rate": 4.6587879331766465e-06, "loss": 0.0368, "step": 172 }, { "epoch": 2.746031746031746, "grad_norm": 1.2219913005828857, "learning_rate": 4.606395208750844e-06, "loss": 0.0852, "step": 173 }, { "epoch": 2.761904761904762, "grad_norm": 0.6554266810417175, "learning_rate": 4.554045936723132e-06, "loss": 0.0516, "step": 174 }, { "epoch": 2.7777777777777777, "grad_norm": 0.5427403450012207, "learning_rate": 4.501745896244219e-06, "loss": 0.0388, "step": 175 }, { "epoch": 2.7936507936507935, "grad_norm": 0.5815466046333313, "learning_rate": 4.4495008610298435e-06, "loss": 0.0452, "step": 176 }, { "epoch": 2.8095238095238093, "grad_norm": 0.957595944404602, "learning_rate": 4.397316598723385e-06, "loss": 0.0584, "step": 177 }, { "epoch": 2.825396825396825, "grad_norm": 0.6840035915374756, "learning_rate": 4.345198870259139e-06, "loss": 0.0503, "step": 178 }, { "epoch": 2.8412698412698414, "grad_norm": 0.7481380105018616, "learning_rate": 4.2931534292263265e-06, "loss": 0.0561, "step": 179 }, { "epoch": 2.857142857142857, "grad_norm": 0.5073666572570801, "learning_rate": 4.241186021233925e-06, "loss": 0.0339, "step": 180 }, { "epoch": 2.873015873015873, "grad_norm": 0.6581488251686096, "learning_rate": 4.189302383276378e-06, "loss": 0.0629, "step": 181 }, { "epoch": 2.888888888888889, "grad_norm": 0.623148500919342, "learning_rate": 4.137508243100249e-06, "loss": 0.0521, "step": 182 }, { "epoch": 2.9047619047619047, "grad_norm": 0.621598482131958, "learning_rate": 4.085809318571905e-06, "loss": 0.0458, "step": 183 }, { "epoch": 2.9206349206349205, "grad_norm": 0.6772786974906921, "learning_rate": 4.034211317046285e-06, "loss": 0.044, "step": 184 }, { "epoch": 2.9365079365079367, "grad_norm": 1.262326955795288, "learning_rate": 3.982719934736832e-06, "loss": 0.046, "step": 185 }, { "epoch": 2.9523809523809526, "grad_norm": 0.6345784068107605, "learning_rate": 3.931340856086652e-06, "loss": 0.042, "step": 186 }, { "epoch": 2.9682539682539684, "grad_norm": 0.5799025297164917, "learning_rate": 3.880079753140978e-06, "loss": 0.0467, "step": 187 }, { "epoch": 2.984126984126984, "grad_norm": 0.6458066701889038, "learning_rate": 3.82894228492099e-06, "loss": 0.047, "step": 188 }, { "epoch": 3.0, "grad_norm": 0.5344202518463135, "learning_rate": 3.777934096799094e-06, "loss": 0.0418, "step": 189 }, { "epoch": 3.015873015873016, "grad_norm": 0.3208453953266144, "learning_rate": 3.7270608198756852e-06, "loss": 0.0124, "step": 190 }, { "epoch": 3.0317460317460316, "grad_norm": 0.48605024814605713, "learning_rate": 3.676328070357503e-06, "loss": 0.0332, "step": 191 }, { "epoch": 3.0476190476190474, "grad_norm": 0.5282842516899109, "learning_rate": 3.6257414489376217e-06, "loss": 0.0155, "step": 192 }, { "epoch": 3.0634920634920633, "grad_norm": 0.3682861030101776, "learning_rate": 3.5753065401771577e-06, "loss": 0.017, "step": 193 }, { "epoch": 3.0793650793650795, "grad_norm": 0.4284667372703552, "learning_rate": 3.5250289118887515e-06, "loss": 0.0107, "step": 194 }, { "epoch": 3.0952380952380953, "grad_norm": 0.36055704951286316, "learning_rate": 3.4749141145219118e-06, "loss": 0.0116, "step": 195 }, { "epoch": 3.111111111111111, "grad_norm": 0.42703211307525635, "learning_rate": 3.424967680550261e-06, "loss": 0.0089, "step": 196 }, { "epoch": 3.126984126984127, "grad_norm": 0.4422130882740021, "learning_rate": 3.3751951238607745e-06, "loss": 0.015, "step": 197 }, { "epoch": 3.142857142857143, "grad_norm": 1.8617546558380127, "learning_rate": 3.3256019391450696e-06, "loss": 0.044, "step": 198 }, { "epoch": 3.1587301587301586, "grad_norm": 0.4572526216506958, "learning_rate": 3.2761936012928147e-06, "loss": 0.0092, "step": 199 }, { "epoch": 3.1746031746031744, "grad_norm": 0.606041431427002, "learning_rate": 3.226975564787322e-06, "loss": 0.013, "step": 200 }, { "epoch": 3.1904761904761907, "grad_norm": 0.6772161722183228, "learning_rate": 3.177953263103394e-06, "loss": 0.0233, "step": 201 }, { "epoch": 3.2063492063492065, "grad_norm": 0.4099721908569336, "learning_rate": 3.1291321081074887e-06, "loss": 0.0074, "step": 202 }, { "epoch": 3.2222222222222223, "grad_norm": 0.34896424412727356, "learning_rate": 3.0805174894602775e-06, "loss": 0.0081, "step": 203 }, { "epoch": 3.238095238095238, "grad_norm": 0.5158839821815491, "learning_rate": 3.032114774021636e-06, "loss": 0.0174, "step": 204 }, { "epoch": 3.253968253968254, "grad_norm": 0.6538414359092712, "learning_rate": 2.9839293052581767e-06, "loss": 0.0188, "step": 205 }, { "epoch": 3.2698412698412698, "grad_norm": 0.4532378911972046, "learning_rate": 2.9359664026533443e-06, "loss": 0.0123, "step": 206 }, { "epoch": 3.2857142857142856, "grad_norm": 0.4588467478752136, "learning_rate": 2.8882313611201684e-06, "loss": 0.0154, "step": 207 }, { "epoch": 3.3015873015873014, "grad_norm": 0.6380066871643066, "learning_rate": 2.8407294504167238e-06, "loss": 0.0168, "step": 208 }, { "epoch": 3.317460317460317, "grad_norm": 0.3832227289676666, "learning_rate": 2.793465914564375e-06, "loss": 0.0118, "step": 209 }, { "epoch": 3.3333333333333335, "grad_norm": 0.4849812090396881, "learning_rate": 2.7464459712688517e-06, "loss": 0.0156, "step": 210 }, { "epoch": 3.3492063492063493, "grad_norm": 0.5382815003395081, "learning_rate": 2.6996748113442397e-06, "loss": 0.0162, "step": 211 }, { "epoch": 3.365079365079365, "grad_norm": 0.4217240810394287, "learning_rate": 2.653157598139932e-06, "loss": 0.0144, "step": 212 }, { "epoch": 3.380952380952381, "grad_norm": 0.3669329881668091, "learning_rate": 2.6068994669706184e-06, "loss": 0.0068, "step": 213 }, { "epoch": 3.3968253968253967, "grad_norm": 0.3910522162914276, "learning_rate": 2.560905524549364e-06, "loss": 0.0134, "step": 214 }, { "epoch": 3.4126984126984126, "grad_norm": 0.3835786283016205, "learning_rate": 2.515180848423853e-06, "loss": 0.009, "step": 215 }, { "epoch": 3.4285714285714284, "grad_norm": 0.39538145065307617, "learning_rate": 2.469730486415842e-06, "loss": 0.0154, "step": 216 }, { "epoch": 3.4444444444444446, "grad_norm": 0.48559364676475525, "learning_rate": 2.4245594560639086e-06, "loss": 0.0159, "step": 217 }, { "epoch": 3.4603174603174605, "grad_norm": 0.6111780405044556, "learning_rate": 2.379672744069527e-06, "loss": 0.0216, "step": 218 }, { "epoch": 3.4761904761904763, "grad_norm": 0.48673927783966064, "learning_rate": 2.335075305746558e-06, "loss": 0.0124, "step": 219 }, { "epoch": 3.492063492063492, "grad_norm": 0.4308861792087555, "learning_rate": 2.2907720644742064e-06, "loss": 0.0112, "step": 220 }, { "epoch": 3.507936507936508, "grad_norm": 0.39692750573158264, "learning_rate": 2.2467679111534963e-06, "loss": 0.0162, "step": 221 }, { "epoch": 3.5238095238095237, "grad_norm": 0.4090782701969147, "learning_rate": 2.2030677036673345e-06, "loss": 0.0214, "step": 222 }, { "epoch": 3.5396825396825395, "grad_norm": 0.4135613441467285, "learning_rate": 2.159676266344222e-06, "loss": 0.0157, "step": 223 }, { "epoch": 3.5555555555555554, "grad_norm": 0.8922034502029419, "learning_rate": 2.1165983894256647e-06, "loss": 0.0355, "step": 224 }, { "epoch": 3.571428571428571, "grad_norm": 0.25958430767059326, "learning_rate": 2.0738388285373532e-06, "loss": 0.0057, "step": 225 }, { "epoch": 3.5873015873015874, "grad_norm": 0.3267192244529724, "learning_rate": 2.0314023041641567e-06, "loss": 0.0089, "step": 226 }, { "epoch": 3.6031746031746033, "grad_norm": 0.4624428451061249, "learning_rate": 1.9892935011290037e-06, "loss": 0.0159, "step": 227 }, { "epoch": 3.619047619047619, "grad_norm": 0.2781091630458832, "learning_rate": 1.947517068075694e-06, "loss": 0.0083, "step": 228 }, { "epoch": 3.634920634920635, "grad_norm": 0.31553471088409424, "learning_rate": 1.9060776169557083e-06, "loss": 0.0137, "step": 229 }, { "epoch": 3.6507936507936507, "grad_norm": 0.5683518052101135, "learning_rate": 1.864979722519068e-06, "loss": 0.0298, "step": 230 }, { "epoch": 3.6666666666666665, "grad_norm": 0.3412047028541565, "learning_rate": 1.8242279218092968e-06, "loss": 0.0137, "step": 231 }, { "epoch": 3.682539682539683, "grad_norm": 0.5104885101318359, "learning_rate": 1.7838267136625536e-06, "loss": 0.0114, "step": 232 }, { "epoch": 3.6984126984126986, "grad_norm": 0.4502926170825958, "learning_rate": 1.743780558210979e-06, "loss": 0.013, "step": 233 }, { "epoch": 3.7142857142857144, "grad_norm": 0.48987045884132385, "learning_rate": 1.704093876390312e-06, "loss": 0.0217, "step": 234 }, { "epoch": 3.7301587301587302, "grad_norm": 0.3420720398426056, "learning_rate": 1.664771049451837e-06, "loss": 0.0089, "step": 235 }, { "epoch": 3.746031746031746, "grad_norm": 0.3107622563838959, "learning_rate": 1.6258164184787123e-06, "loss": 0.0085, "step": 236 }, { "epoch": 3.761904761904762, "grad_norm": 0.5032781958580017, "learning_rate": 1.5872342839067305e-06, "loss": 0.0075, "step": 237 }, { "epoch": 3.7777777777777777, "grad_norm": 0.3874753415584564, "learning_rate": 1.5490289050495678e-06, "loss": 0.0108, "step": 238 }, { "epoch": 3.7936507936507935, "grad_norm": 0.30904141068458557, "learning_rate": 1.511204499628574e-06, "loss": 0.0086, "step": 239 }, { "epoch": 3.8095238095238093, "grad_norm": 0.39750581979751587, "learning_rate": 1.4737652433071515e-06, "loss": 0.0135, "step": 240 }, { "epoch": 3.825396825396825, "grad_norm": 0.5290718674659729, "learning_rate": 1.4367152692297799e-06, "loss": 0.0187, "step": 241 }, { "epoch": 3.8412698412698414, "grad_norm": 0.4600606858730316, "learning_rate": 1.4000586675657312e-06, "loss": 0.0111, "step": 242 }, { "epoch": 3.857142857142857, "grad_norm": 0.461280882358551, "learning_rate": 1.3637994850575342e-06, "loss": 0.0147, "step": 243 }, { "epoch": 3.873015873015873, "grad_norm": 0.5423941016197205, "learning_rate": 1.3279417245742288e-06, "loss": 0.0236, "step": 244 }, { "epoch": 3.888888888888889, "grad_norm": 0.39030736684799194, "learning_rate": 1.2924893446694648e-06, "loss": 0.006, "step": 245 }, { "epoch": 3.9047619047619047, "grad_norm": 0.5754550099372864, "learning_rate": 1.257446259144494e-06, "loss": 0.0286, "step": 246 }, { "epoch": 3.9206349206349205, "grad_norm": 0.35015997290611267, "learning_rate": 1.222816336616104e-06, "loss": 0.0143, "step": 247 }, { "epoch": 3.9365079365079367, "grad_norm": 0.4731229245662689, "learning_rate": 1.1886034000895341e-06, "loss": 0.0177, "step": 248 }, { "epoch": 3.9523809523809526, "grad_norm": 0.3958592116832733, "learning_rate": 1.1548112265364336e-06, "loss": 0.0132, "step": 249 }, { "epoch": 3.9682539682539684, "grad_norm": 0.4273618161678314, "learning_rate": 1.1214435464779006e-06, "loss": 0.0206, "step": 250 }, { "epoch": 3.984126984126984, "grad_norm": 0.429865300655365, "learning_rate": 1.088504043572643e-06, "loss": 0.0131, "step": 251 }, { "epoch": 4.0, "grad_norm": 0.28878334164619446, "learning_rate": 1.055996354210323e-06, "loss": 0.0079, "step": 252 }, { "epoch": 4.015873015873016, "grad_norm": 0.12734107673168182, "learning_rate": 1.0239240671101065e-06, "loss": 0.0027, "step": 253 }, { "epoch": 4.031746031746032, "grad_norm": 0.24097059667110443, "learning_rate": 9.922907229244905e-07, "loss": 0.0063, "step": 254 }, { "epoch": 4.0476190476190474, "grad_norm": 0.18659502267837524, "learning_rate": 9.610998138484262e-07, "loss": 0.0032, "step": 255 }, { "epoch": 4.063492063492063, "grad_norm": 0.16850638389587402, "learning_rate": 9.303547832337934e-07, "loss": 0.0036, "step": 256 }, { "epoch": 4.079365079365079, "grad_norm": 0.13990715146064758, "learning_rate": 9.000590252092701e-07, "loss": 0.0034, "step": 257 }, { "epoch": 4.095238095238095, "grad_norm": 0.16486185789108276, "learning_rate": 8.702158843056319e-07, "loss": 0.0029, "step": 258 }, { "epoch": 4.111111111111111, "grad_norm": 0.13398580253124237, "learning_rate": 8.408286550865319e-07, "loss": 0.0041, "step": 259 }, { "epoch": 4.1269841269841265, "grad_norm": 0.14831550419330597, "learning_rate": 8.119005817847924e-07, "loss": 0.0039, "step": 260 }, { "epoch": 4.142857142857143, "grad_norm": 0.1601114124059677, "learning_rate": 7.834348579442552e-07, "loss": 0.0034, "step": 261 }, { "epoch": 4.158730158730159, "grad_norm": 0.2705823481082916, "learning_rate": 7.554346260672263e-07, "loss": 0.0054, "step": 262 }, { "epoch": 4.174603174603175, "grad_norm": 0.17291611433029175, "learning_rate": 7.279029772675572e-07, "loss": 0.0025, "step": 263 }, { "epoch": 4.190476190476191, "grad_norm": 0.11752714961767197, "learning_rate": 7.008429509293979e-07, "loss": 0.002, "step": 264 }, { "epoch": 4.2063492063492065, "grad_norm": 0.2471189945936203, "learning_rate": 6.742575343716584e-07, "loss": 0.0053, "step": 265 }, { "epoch": 4.222222222222222, "grad_norm": 0.2652353048324585, "learning_rate": 6.481496625182271e-07, "loss": 0.0035, "step": 266 }, { "epoch": 4.238095238095238, "grad_norm": 0.5520457029342651, "learning_rate": 6.225222175739598e-07, "loss": 0.0094, "step": 267 }, { "epoch": 4.253968253968254, "grad_norm": 0.1428154557943344, "learning_rate": 5.973780287065007e-07, "loss": 0.002, "step": 268 }, { "epoch": 4.26984126984127, "grad_norm": 0.38164809346199036, "learning_rate": 5.727198717339511e-07, "loss": 0.0185, "step": 269 }, { "epoch": 4.285714285714286, "grad_norm": 0.2093685418367386, "learning_rate": 5.485504688184307e-07, "loss": 0.0016, "step": 270 }, { "epoch": 4.301587301587301, "grad_norm": 0.15652765333652496, "learning_rate": 5.24872488165562e-07, "loss": 0.0029, "step": 271 }, { "epoch": 4.317460317460317, "grad_norm": 0.17758126556873322, "learning_rate": 5.016885437299113e-07, "loss": 0.003, "step": 272 }, { "epoch": 4.333333333333333, "grad_norm": 0.18992979824543, "learning_rate": 4.790011949264173e-07, "loss": 0.0064, "step": 273 }, { "epoch": 4.349206349206349, "grad_norm": 0.10369639843702316, "learning_rate": 4.5681294634784437e-07, "loss": 0.0014, "step": 274 }, { "epoch": 4.365079365079365, "grad_norm": 0.09521545469760895, "learning_rate": 4.3512624748828225e-07, "loss": 0.0012, "step": 275 }, { "epoch": 4.380952380952381, "grad_norm": 0.310982346534729, "learning_rate": 4.139434924727359e-07, "loss": 0.0059, "step": 276 }, { "epoch": 4.396825396825397, "grad_norm": 0.3128473162651062, "learning_rate": 3.9326701979281623e-07, "loss": 0.0059, "step": 277 }, { "epoch": 4.412698412698413, "grad_norm": 0.33223262429237366, "learning_rate": 3.7309911204858997e-07, "loss": 0.0031, "step": 278 }, { "epoch": 4.428571428571429, "grad_norm": 0.13230553269386292, "learning_rate": 3.534419956965823e-07, "loss": 0.0024, "step": 279 }, { "epoch": 4.444444444444445, "grad_norm": 0.20840996503829956, "learning_rate": 3.3429784080398765e-07, "loss": 0.005, "step": 280 }, { "epoch": 4.4603174603174605, "grad_norm": 0.21250075101852417, "learning_rate": 3.1566876080910193e-07, "loss": 0.0042, "step": 281 }, { "epoch": 4.476190476190476, "grad_norm": 0.14549191296100616, "learning_rate": 2.9755681228800904e-07, "loss": 0.0023, "step": 282 }, { "epoch": 4.492063492063492, "grad_norm": 0.27009841799736023, "learning_rate": 2.799639947275412e-07, "loss": 0.0037, "step": 283 }, { "epoch": 4.507936507936508, "grad_norm": 0.18486513197422028, "learning_rate": 2.6289225030454556e-07, "loss": 0.0025, "step": 284 }, { "epoch": 4.523809523809524, "grad_norm": 0.2685333490371704, "learning_rate": 2.4634346367147233e-07, "loss": 0.0038, "step": 285 }, { "epoch": 4.5396825396825395, "grad_norm": 0.22350570559501648, "learning_rate": 2.303194617483212e-07, "loss": 0.0052, "step": 286 }, { "epoch": 4.555555555555555, "grad_norm": 0.1270400732755661, "learning_rate": 2.1482201352095277e-07, "loss": 0.0037, "step": 287 }, { "epoch": 4.571428571428571, "grad_norm": 0.14703014492988586, "learning_rate": 1.998528298458019e-07, "loss": 0.0026, "step": 288 }, { "epoch": 4.587301587301587, "grad_norm": 0.05500807240605354, "learning_rate": 1.8541356326100436e-07, "loss": 0.0006, "step": 289 }, { "epoch": 4.603174603174603, "grad_norm": 0.23130561411380768, "learning_rate": 1.7150580780396385e-07, "loss": 0.0041, "step": 290 }, { "epoch": 4.619047619047619, "grad_norm": 0.14613750576972961, "learning_rate": 1.5813109883537792e-07, "loss": 0.0022, "step": 291 }, { "epoch": 4.634920634920634, "grad_norm": 0.19623324275016785, "learning_rate": 1.4529091286973994e-07, "loss": 0.0029, "step": 292 }, { "epoch": 4.650793650793651, "grad_norm": 0.2580835819244385, "learning_rate": 1.3298666741233424e-07, "loss": 0.0047, "step": 293 }, { "epoch": 4.666666666666667, "grad_norm": 0.19105015695095062, "learning_rate": 1.2121972080275378e-07, "loss": 0.0041, "step": 294 }, { "epoch": 4.682539682539683, "grad_norm": 0.18912175297737122, "learning_rate": 1.0999137206494315e-07, "loss": 0.0045, "step": 295 }, { "epoch": 4.698412698412699, "grad_norm": 0.2492668479681015, "learning_rate": 9.93028607637908e-08, "loss": 0.0067, "step": 296 }, { "epoch": 4.714285714285714, "grad_norm": 0.1554499715566635, "learning_rate": 8.915536686828764e-08, "loss": 0.0085, "step": 297 }, { "epoch": 4.73015873015873, "grad_norm": 0.16742238402366638, "learning_rate": 7.955001062125989e-08, "loss": 0.0036, "step": 298 }, { "epoch": 4.746031746031746, "grad_norm": 0.1783057302236557, "learning_rate": 7.048785241570321e-08, "loss": 0.0028, "step": 299 }, { "epoch": 4.761904761904762, "grad_norm": 0.1566859483718872, "learning_rate": 6.19698926777168e-08, "loss": 0.0033, "step": 300 }, { "epoch": 4.777777777777778, "grad_norm": 0.1521420031785965, "learning_rate": 5.399707175606117e-08, "loss": 0.0016, "step": 301 }, { "epoch": 4.7936507936507935, "grad_norm": 0.17157761752605438, "learning_rate": 4.657026981834623e-08, "loss": 0.002, "step": 302 }, { "epoch": 4.809523809523809, "grad_norm": 0.1927778720855713, "learning_rate": 3.9690306753866204e-08, "loss": 0.0029, "step": 303 }, { "epoch": 4.825396825396825, "grad_norm": 0.15908092260360718, "learning_rate": 3.3357942083085404e-08, "loss": 0.0016, "step": 304 }, { "epoch": 4.841269841269841, "grad_norm": 0.175292506814003, "learning_rate": 2.7573874873791372e-08, "loss": 0.0043, "step": 305 }, { "epoch": 4.857142857142857, "grad_norm": 0.121407650411129, "learning_rate": 2.233874366391997e-08, "loss": 0.001, "step": 306 }, { "epoch": 4.8730158730158735, "grad_norm": 0.2875802218914032, "learning_rate": 1.7653126391063425e-08, "loss": 0.005, "step": 307 }, { "epoch": 4.888888888888889, "grad_norm": 0.18822738528251648, "learning_rate": 1.3517540328669143e-08, "loss": 0.0025, "step": 308 }, { "epoch": 4.904761904761905, "grad_norm": 0.15360276401042938, "learning_rate": 9.93244202893262e-09, "loss": 0.0023, "step": 309 }, { "epoch": 4.920634920634921, "grad_norm": 0.29353615641593933, "learning_rate": 6.898227272398306e-09, "loss": 0.0065, "step": 310 }, { "epoch": 4.936507936507937, "grad_norm": 0.5181974172592163, "learning_rate": 4.415231024265665e-09, "loss": 0.0108, "step": 311 }, { "epoch": 4.9523809523809526, "grad_norm": 0.15942662954330444, "learning_rate": 2.4837273974115393e-09, "loss": 0.0026, "step": 312 }, { "epoch": 4.968253968253968, "grad_norm": 0.1265093982219696, "learning_rate": 1.1039296221276863e-09, "loss": 0.0116, "step": 313 }, { "epoch": 4.984126984126984, "grad_norm": 0.29910922050476074, "learning_rate": 2.7599002258127395e-10, "loss": 0.0022, "step": 314 }, { "epoch": 5.0, "grad_norm": 0.09251413494348526, "learning_rate": 0.0, "loss": 0.0015, "step": 315 }, { "epoch": 5.0, "step": 315, "total_flos": 32588508020736.0, "train_loss": 0.11869781007546754, "train_runtime": 5355.6377, "train_samples_per_second": 0.934, "train_steps_per_second": 0.059 } ], "logging_steps": 1.0, "max_steps": 315, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 32588508020736.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }