diff --git "a/trainer_state.json" "b/trainer_state.json"
new file mode 100644
--- /dev/null
+++ "b/trainer_state.json"
@@ -0,0 +1,81817 @@
+{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.9879760807595859, + "eval_steps": 700, + "global_step": 11664, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0001714640889898622, + "grad_norm": 223.0, + "learning_rate": 2.0000000000000002e-07, + "loss": 1.7472, + "step": 1 + }, + { + "epoch": 0.0001714640889898622, + "eval_loss": 2.7519192695617676, + "eval_runtime": 836.2413, + "eval_samples_per_second": 2.988, + "eval_steps_per_second": 2.988, + "step": 1 + }, + { + "epoch": 0.0003429281779797244, + "grad_norm": 127.0, + "learning_rate": 4.0000000000000003e-07, + "loss": 1.6937, + "step": 2 + }, + { + "epoch": 0.0005143922669695865, + "grad_norm": 237.0, + "learning_rate": 6.000000000000001e-07, + "loss": 1.661, + "step": 3 + }, + { + "epoch": 0.0006858563559594488, + "grad_norm": 118.5, + "learning_rate": 8.000000000000001e-07, + "loss": 1.5902, + "step": 4 + }, + { + "epoch": 0.000857320444949311, + "grad_norm": 156.0, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.6023, + "step": 5 + }, + { + "epoch": 0.001028784533939173, + "grad_norm": 155.0, + "learning_rate": 1.2000000000000002e-06, + "loss": 1.7616, + "step": 6 + }, + { + "epoch": 0.0012002486229290353, + "grad_norm": 94.5, + "learning_rate": 1.4000000000000001e-06, + "loss": 1.7018, + "step": 7 + }, + { + "epoch": 0.0013717127119188975, + "grad_norm": 96.5, + "learning_rate": 1.6000000000000001e-06, + "loss": 1.6484, + "step": 8 + }, + { + "epoch": 0.0015431768009087597, + "grad_norm": 290.0, + "learning_rate": 1.8000000000000001e-06, + "loss": 1.5879, + "step": 9 + }, + { + "epoch": 0.001714640889898622, + "grad_norm": 154.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.5615, + "step": 10 + }, + { + "epoch": 0.001886104978888484, + "grad_norm": 68.0, + "learning_rate": 2.2e-06, + "loss": 1.5076, + "step": 11 + }, + { + "epoch": 0.002057569067878346, + "grad_norm": 70.5, + "learning_rate": 2.4000000000000003e-06, + "loss": 1.5792, + "step": 12 + }, + { + "epoch": 0.0022290331568682086, + "grad_norm": 112.5, + "learning_rate": 2.6e-06, + "loss": 1.4464, + "step": 13 + }, + { + "epoch": 0.0024004972458580706, + "grad_norm": 47.5, + "learning_rate": 2.8000000000000003e-06, + "loss": 1.4436, + "step": 14 + }, + { + "epoch": 0.0025719613348479326, + "grad_norm": 33.75, + "learning_rate": 3e-06, + "loss": 1.4214, + "step": 15 + }, + { + "epoch": 0.002743425423837795, + "grad_norm": 46.0, + "learning_rate": 3.2000000000000003e-06, + "loss": 1.3857, + "step": 16 + }, + { + "epoch": 0.002914889512827657, + "grad_norm": 59.75, + "learning_rate": 3.4000000000000005e-06, + "loss": 1.477, + "step": 17 + }, + { + "epoch": 0.0030863536018175194, + "grad_norm": 23.75, + "learning_rate": 3.6000000000000003e-06, + "loss": 1.4501, + "step": 18 + }, + { + "epoch": 0.0032578176908073814, + "grad_norm": 19.625, + "learning_rate": 3.8000000000000005e-06, + "loss": 1.5123, + "step": 19 + }, + { + "epoch": 0.003429281779797244, + "grad_norm": 12.75, + "learning_rate": 4.000000000000001e-06, + "loss": 1.3971, + "step": 20 + }, + { + "epoch": 0.003600745868787106, + "grad_norm": 31.625, + "learning_rate": 4.2000000000000004e-06, + "loss": 1.2835, + "step": 21 + }, + { + "epoch": 0.003772209957776968, + "grad_norm": 56.75, + "learning_rate": 4.4e-06, + "loss": 1.3467, + "step": 22 + }, + { +
"epoch": 0.00394367404676683, + "grad_norm": 5.09375, + "learning_rate": 4.600000000000001e-06, + "loss": 1.2356, + "step": 23 + }, + { + "epoch": 0.004115138135756692, + "grad_norm": 83.5, + "learning_rate": 4.800000000000001e-06, + "loss": 1.3952, + "step": 24 + }, + { + "epoch": 0.004286602224746554, + "grad_norm": 1004.0, + "learning_rate": 5e-06, + "loss": 1.3461, + "step": 25 + }, + { + "epoch": 0.004458066313736417, + "grad_norm": 18.875, + "learning_rate": 5.2e-06, + "loss": 1.4225, + "step": 26 + }, + { + "epoch": 0.004629530402726279, + "grad_norm": 430.0, + "learning_rate": 5.400000000000001e-06, + "loss": 1.356, + "step": 27 + }, + { + "epoch": 0.004800994491716141, + "grad_norm": 62.25, + "learning_rate": 5.600000000000001e-06, + "loss": 1.4085, + "step": 28 + }, + { + "epoch": 0.004972458580706003, + "grad_norm": 8.8125, + "learning_rate": 5.8e-06, + "loss": 1.2939, + "step": 29 + }, + { + "epoch": 0.005143922669695865, + "grad_norm": 131.0, + "learning_rate": 6e-06, + "loss": 1.3616, + "step": 30 + }, + { + "epoch": 0.005315386758685728, + "grad_norm": 43.0, + "learning_rate": 6.200000000000001e-06, + "loss": 1.4088, + "step": 31 + }, + { + "epoch": 0.00548685084767559, + "grad_norm": 28.0, + "learning_rate": 6.4000000000000006e-06, + "loss": 1.3492, + "step": 32 + }, + { + "epoch": 0.005658314936665452, + "grad_norm": 18.5, + "learning_rate": 6.600000000000001e-06, + "loss": 1.3562, + "step": 33 + }, + { + "epoch": 0.005829779025655314, + "grad_norm": 19.375, + "learning_rate": 6.800000000000001e-06, + "loss": 1.2559, + "step": 34 + }, + { + "epoch": 0.006001243114645177, + "grad_norm": 23.0, + "learning_rate": 7e-06, + "loss": 1.3713, + "step": 35 + }, + { + "epoch": 0.006172707203635039, + "grad_norm": 211.0, + "learning_rate": 7.2000000000000005e-06, + "loss": 1.2363, + "step": 36 + }, + { + "epoch": 0.006344171292624901, + "grad_norm": 34.5, + "learning_rate": 7.4e-06, + "loss": 1.371, + "step": 37 + }, + { + "epoch": 0.006515635381614763, + "grad_norm": 7.78125, + "learning_rate": 7.600000000000001e-06, + "loss": 1.3426, + "step": 38 + }, + { + "epoch": 0.006687099470604625, + "grad_norm": 21.0, + "learning_rate": 7.800000000000002e-06, + "loss": 1.2491, + "step": 39 + }, + { + "epoch": 0.006858563559594488, + "grad_norm": 2.859375, + "learning_rate": 8.000000000000001e-06, + "loss": 1.2641, + "step": 40 + }, + { + "epoch": 0.00703002764858435, + "grad_norm": 11.9375, + "learning_rate": 8.2e-06, + "loss": 1.3252, + "step": 41 + }, + { + "epoch": 0.007201491737574212, + "grad_norm": 6.75, + "learning_rate": 8.400000000000001e-06, + "loss": 1.242, + "step": 42 + }, + { + "epoch": 0.007372955826564074, + "grad_norm": 14.3125, + "learning_rate": 8.6e-06, + "loss": 1.3359, + "step": 43 + }, + { + "epoch": 0.007544419915553936, + "grad_norm": 23.75, + "learning_rate": 8.8e-06, + "loss": 1.3308, + "step": 44 + }, + { + "epoch": 0.007715884004543799, + "grad_norm": 17.25, + "learning_rate": 9e-06, + "loss": 1.2513, + "step": 45 + }, + { + "epoch": 0.00788734809353366, + "grad_norm": 4.65625, + "learning_rate": 9.200000000000002e-06, + "loss": 1.2823, + "step": 46 + }, + { + "epoch": 0.008058812182523523, + "grad_norm": 4.46875, + "learning_rate": 9.4e-06, + "loss": 1.3031, + "step": 47 + }, + { + "epoch": 0.008230276271513385, + "grad_norm": 4.25, + "learning_rate": 9.600000000000001e-06, + "loss": 1.2793, + "step": 48 + }, + { + "epoch": 0.008401740360503247, + "grad_norm": 4.3125, + "learning_rate": 9.800000000000001e-06, + "loss": 1.3314, + "step": 49 + }, + { + "epoch": 
0.008573204449493109, + "grad_norm": 9.1875, + "learning_rate": 1e-05, + "loss": 1.3205, + "step": 50 + }, + { + "epoch": 0.008744668538482972, + "grad_norm": 8.625, + "learning_rate": 1.02e-05, + "loss": 1.2554, + "step": 51 + }, + { + "epoch": 0.008916132627472834, + "grad_norm": 2.734375, + "learning_rate": 1.04e-05, + "loss": 1.2897, + "step": 52 + }, + { + "epoch": 0.009087596716462696, + "grad_norm": 3.609375, + "learning_rate": 1.0600000000000002e-05, + "loss": 1.3327, + "step": 53 + }, + { + "epoch": 0.009259060805452558, + "grad_norm": 8.9375, + "learning_rate": 1.0800000000000002e-05, + "loss": 1.2707, + "step": 54 + }, + { + "epoch": 0.00943052489444242, + "grad_norm": 9.4375, + "learning_rate": 1.1000000000000001e-05, + "loss": 1.2724, + "step": 55 + }, + { + "epoch": 0.009601988983432282, + "grad_norm": 3.890625, + "learning_rate": 1.1200000000000001e-05, + "loss": 1.2567, + "step": 56 + }, + { + "epoch": 0.009773453072422144, + "grad_norm": 3.796875, + "learning_rate": 1.14e-05, + "loss": 1.3487, + "step": 57 + }, + { + "epoch": 0.009944917161412006, + "grad_norm": 2.65625, + "learning_rate": 1.16e-05, + "loss": 1.2577, + "step": 58 + }, + { + "epoch": 0.010116381250401868, + "grad_norm": 2.71875, + "learning_rate": 1.18e-05, + "loss": 1.2358, + "step": 59 + }, + { + "epoch": 0.01028784533939173, + "grad_norm": 2.546875, + "learning_rate": 1.2e-05, + "loss": 1.2995, + "step": 60 + }, + { + "epoch": 0.010459309428381594, + "grad_norm": 2.984375, + "learning_rate": 1.22e-05, + "loss": 1.2554, + "step": 61 + }, + { + "epoch": 0.010630773517371456, + "grad_norm": 11.75, + "learning_rate": 1.2400000000000002e-05, + "loss": 1.2732, + "step": 62 + }, + { + "epoch": 0.010802237606361318, + "grad_norm": 2.671875, + "learning_rate": 1.2600000000000001e-05, + "loss": 1.3157, + "step": 63 + }, + { + "epoch": 0.01097370169535118, + "grad_norm": 5.78125, + "learning_rate": 1.2800000000000001e-05, + "loss": 1.1681, + "step": 64 + }, + { + "epoch": 0.011145165784341042, + "grad_norm": 2.71875, + "learning_rate": 1.3000000000000001e-05, + "loss": 1.3121, + "step": 65 + }, + { + "epoch": 0.011316629873330904, + "grad_norm": 2.4375, + "learning_rate": 1.3200000000000002e-05, + "loss": 1.2333, + "step": 66 + }, + { + "epoch": 0.011488093962320766, + "grad_norm": 2.296875, + "learning_rate": 1.3400000000000002e-05, + "loss": 1.1963, + "step": 67 + }, + { + "epoch": 0.011659558051310628, + "grad_norm": 2.6875, + "learning_rate": 1.3600000000000002e-05, + "loss": 1.1645, + "step": 68 + }, + { + "epoch": 0.01183102214030049, + "grad_norm": 3.125, + "learning_rate": 1.38e-05, + "loss": 1.183, + "step": 69 + }, + { + "epoch": 0.012002486229290354, + "grad_norm": 2.265625, + "learning_rate": 1.4e-05, + "loss": 1.2707, + "step": 70 + }, + { + "epoch": 0.012173950318280216, + "grad_norm": 2.953125, + "learning_rate": 1.4200000000000001e-05, + "loss": 1.2276, + "step": 71 + }, + { + "epoch": 0.012345414407270078, + "grad_norm": 2.78125, + "learning_rate": 1.4400000000000001e-05, + "loss": 1.2386, + "step": 72 + }, + { + "epoch": 0.01251687849625994, + "grad_norm": 2.421875, + "learning_rate": 1.46e-05, + "loss": 1.1404, + "step": 73 + }, + { + "epoch": 0.012688342585249802, + "grad_norm": 3.796875, + "learning_rate": 1.48e-05, + "loss": 1.2231, + "step": 74 + }, + { + "epoch": 0.012859806674239664, + "grad_norm": 2.46875, + "learning_rate": 1.5000000000000002e-05, + "loss": 1.1835, + "step": 75 + }, + { + "epoch": 0.013031270763229526, + "grad_norm": 2.140625, + "learning_rate": 1.5200000000000002e-05, + 
"loss": 1.2235, + "step": 76 + }, + { + "epoch": 0.013202734852219388, + "grad_norm": 2.203125, + "learning_rate": 1.54e-05, + "loss": 1.1566, + "step": 77 + }, + { + "epoch": 0.01337419894120925, + "grad_norm": 2.0, + "learning_rate": 1.5600000000000003e-05, + "loss": 1.1565, + "step": 78 + }, + { + "epoch": 0.013545663030199113, + "grad_norm": 2.296875, + "learning_rate": 1.58e-05, + "loss": 1.2313, + "step": 79 + }, + { + "epoch": 0.013717127119188975, + "grad_norm": 2.046875, + "learning_rate": 1.6000000000000003e-05, + "loss": 1.1824, + "step": 80 + }, + { + "epoch": 0.013888591208178837, + "grad_norm": 1.96875, + "learning_rate": 1.62e-05, + "loss": 1.1746, + "step": 81 + }, + { + "epoch": 0.0140600552971687, + "grad_norm": 2.359375, + "learning_rate": 1.64e-05, + "loss": 1.2601, + "step": 82 + }, + { + "epoch": 0.014231519386158561, + "grad_norm": 2.359375, + "learning_rate": 1.66e-05, + "loss": 1.1281, + "step": 83 + }, + { + "epoch": 0.014402983475148423, + "grad_norm": 2.0625, + "learning_rate": 1.6800000000000002e-05, + "loss": 1.1808, + "step": 84 + }, + { + "epoch": 0.014574447564138285, + "grad_norm": 2.953125, + "learning_rate": 1.7e-05, + "loss": 1.1618, + "step": 85 + }, + { + "epoch": 0.014745911653128147, + "grad_norm": 2.25, + "learning_rate": 1.72e-05, + "loss": 1.1581, + "step": 86 + }, + { + "epoch": 0.01491737574211801, + "grad_norm": 2.265625, + "learning_rate": 1.7400000000000003e-05, + "loss": 1.2232, + "step": 87 + }, + { + "epoch": 0.015088839831107871, + "grad_norm": 3.390625, + "learning_rate": 1.76e-05, + "loss": 1.2678, + "step": 88 + }, + { + "epoch": 0.015260303920097735, + "grad_norm": 2.3125, + "learning_rate": 1.7800000000000002e-05, + "loss": 1.0981, + "step": 89 + }, + { + "epoch": 0.015431768009087597, + "grad_norm": 2.171875, + "learning_rate": 1.8e-05, + "loss": 1.1916, + "step": 90 + }, + { + "epoch": 0.01560323209807746, + "grad_norm": 2.015625, + "learning_rate": 1.8200000000000002e-05, + "loss": 1.1261, + "step": 91 + }, + { + "epoch": 0.01577469618706732, + "grad_norm": 2.09375, + "learning_rate": 1.8400000000000003e-05, + "loss": 1.1955, + "step": 92 + }, + { + "epoch": 0.015946160276057185, + "grad_norm": 10.6875, + "learning_rate": 1.86e-05, + "loss": 1.1729, + "step": 93 + }, + { + "epoch": 0.016117624365047045, + "grad_norm": 2.15625, + "learning_rate": 1.88e-05, + "loss": 1.1203, + "step": 94 + }, + { + "epoch": 0.01628908845403691, + "grad_norm": 2.171875, + "learning_rate": 1.9e-05, + "loss": 1.2245, + "step": 95 + }, + { + "epoch": 0.01646055254302677, + "grad_norm": 2.046875, + "learning_rate": 1.9200000000000003e-05, + "loss": 1.191, + "step": 96 + }, + { + "epoch": 0.016632016632016633, + "grad_norm": 2.34375, + "learning_rate": 1.94e-05, + "loss": 1.276, + "step": 97 + }, + { + "epoch": 0.016803480721006493, + "grad_norm": 2.453125, + "learning_rate": 1.9600000000000002e-05, + "loss": 1.2306, + "step": 98 + }, + { + "epoch": 0.016974944809996357, + "grad_norm": 2.0, + "learning_rate": 1.98e-05, + "loss": 1.2018, + "step": 99 + }, + { + "epoch": 0.017146408898986217, + "grad_norm": 2.875, + "learning_rate": 2e-05, + "loss": 1.2462, + "step": 100 + }, + { + "epoch": 0.01731787298797608, + "grad_norm": 2.0625, + "learning_rate": 1.9999999836931174e-05, + "loss": 1.1722, + "step": 101 + }, + { + "epoch": 0.017489337076965945, + "grad_norm": 1.8046875, + "learning_rate": 1.9999999347724693e-05, + "loss": 1.1661, + "step": 102 + }, + { + "epoch": 0.017660801165955805, + "grad_norm": 1.984375, + "learning_rate": 1.999999853238058e-05, + 
"loss": 1.1579, + "step": 103 + }, + { + "epoch": 0.01783226525494567, + "grad_norm": 2.578125, + "learning_rate": 1.9999997390898854e-05, + "loss": 1.244, + "step": 104 + }, + { + "epoch": 0.01800372934393553, + "grad_norm": 2.328125, + "learning_rate": 1.999999592327956e-05, + "loss": 1.1997, + "step": 105 + }, + { + "epoch": 0.018175193432925393, + "grad_norm": 2.28125, + "learning_rate": 1.999999412952274e-05, + "loss": 1.1714, + "step": 106 + }, + { + "epoch": 0.018346657521915253, + "grad_norm": 2.03125, + "learning_rate": 1.999999200962846e-05, + "loss": 1.1926, + "step": 107 + }, + { + "epoch": 0.018518121610905117, + "grad_norm": 6.46875, + "learning_rate": 1.999998956359678e-05, + "loss": 1.1984, + "step": 108 + }, + { + "epoch": 0.018689585699894977, + "grad_norm": 13.5625, + "learning_rate": 1.9999986791427782e-05, + "loss": 1.1753, + "step": 109 + }, + { + "epoch": 0.01886104978888484, + "grad_norm": 2.125, + "learning_rate": 1.9999983693121564e-05, + "loss": 1.1055, + "step": 110 + }, + { + "epoch": 0.0190325138778747, + "grad_norm": 2.046875, + "learning_rate": 1.9999980268678218e-05, + "loss": 1.1672, + "step": 111 + }, + { + "epoch": 0.019203977966864565, + "grad_norm": 2.09375, + "learning_rate": 1.999997651809786e-05, + "loss": 1.2352, + "step": 112 + }, + { + "epoch": 0.01937544205585443, + "grad_norm": 2.234375, + "learning_rate": 1.999997244138061e-05, + "loss": 1.095, + "step": 113 + }, + { + "epoch": 0.01954690614484429, + "grad_norm": 2.0625, + "learning_rate": 1.99999680385266e-05, + "loss": 1.2065, + "step": 114 + }, + { + "epoch": 0.019718370233834152, + "grad_norm": 2.296875, + "learning_rate": 1.999996330953598e-05, + "loss": 1.1259, + "step": 115 + }, + { + "epoch": 0.019889834322824013, + "grad_norm": 3.796875, + "learning_rate": 1.9999958254408897e-05, + "loss": 1.2381, + "step": 116 + }, + { + "epoch": 0.020061298411813876, + "grad_norm": 3.203125, + "learning_rate": 1.9999952873145523e-05, + "loss": 1.234, + "step": 117 + }, + { + "epoch": 0.020232762500803737, + "grad_norm": 2.390625, + "learning_rate": 1.9999947165746028e-05, + "loss": 1.2862, + "step": 118 + }, + { + "epoch": 0.0204042265897936, + "grad_norm": 2.25, + "learning_rate": 1.9999941132210598e-05, + "loss": 1.1938, + "step": 119 + }, + { + "epoch": 0.02057569067878346, + "grad_norm": 1.9765625, + "learning_rate": 1.999993477253943e-05, + "loss": 1.2301, + "step": 120 + }, + { + "epoch": 0.020747154767773324, + "grad_norm": 2.109375, + "learning_rate": 1.9999928086732736e-05, + "loss": 1.2761, + "step": 121 + }, + { + "epoch": 0.020918618856763188, + "grad_norm": 1.828125, + "learning_rate": 1.999992107479073e-05, + "loss": 1.1392, + "step": 122 + }, + { + "epoch": 0.02109008294575305, + "grad_norm": 1.859375, + "learning_rate": 1.9999913736713642e-05, + "loss": 1.0334, + "step": 123 + }, + { + "epoch": 0.021261547034742912, + "grad_norm": 1.84375, + "learning_rate": 1.9999906072501712e-05, + "loss": 1.1569, + "step": 124 + }, + { + "epoch": 0.021433011123732772, + "grad_norm": 1.8984375, + "learning_rate": 1.9999898082155185e-05, + "loss": 1.1254, + "step": 125 + }, + { + "epoch": 0.021604475212722636, + "grad_norm": 2.40625, + "learning_rate": 1.9999889765674326e-05, + "loss": 1.1018, + "step": 126 + }, + { + "epoch": 0.021775939301712496, + "grad_norm": 1.8125, + "learning_rate": 1.999988112305941e-05, + "loss": 1.1592, + "step": 127 + }, + { + "epoch": 0.02194740339070236, + "grad_norm": 1.9296875, + "learning_rate": 1.999987215431071e-05, + "loss": 1.1072, + "step": 128 + }, + { + 
"epoch": 0.02211886747969222, + "grad_norm": 2.21875, + "learning_rate": 1.9999862859428526e-05, + "loss": 1.1306, + "step": 129 + }, + { + "epoch": 0.022290331568682084, + "grad_norm": 2.125, + "learning_rate": 1.9999853238413154e-05, + "loss": 1.2421, + "step": 130 + }, + { + "epoch": 0.022461795657671948, + "grad_norm": 6.09375, + "learning_rate": 1.9999843291264915e-05, + "loss": 1.2552, + "step": 131 + }, + { + "epoch": 0.022633259746661808, + "grad_norm": 5.21875, + "learning_rate": 1.999983301798413e-05, + "loss": 1.2062, + "step": 132 + }, + { + "epoch": 0.022804723835651672, + "grad_norm": 2.03125, + "learning_rate": 1.9999822418571134e-05, + "loss": 1.1919, + "step": 133 + }, + { + "epoch": 0.022976187924641532, + "grad_norm": 1.9296875, + "learning_rate": 1.9999811493026275e-05, + "loss": 1.1638, + "step": 134 + }, + { + "epoch": 0.023147652013631396, + "grad_norm": 1.9609375, + "learning_rate": 1.9999800241349903e-05, + "loss": 1.2391, + "step": 135 + }, + { + "epoch": 0.023319116102621256, + "grad_norm": 1.84375, + "learning_rate": 1.9999788663542397e-05, + "loss": 1.1979, + "step": 136 + }, + { + "epoch": 0.02349058019161112, + "grad_norm": 2.0, + "learning_rate": 1.9999776759604123e-05, + "loss": 1.1564, + "step": 137 + }, + { + "epoch": 0.02366204428060098, + "grad_norm": 2.109375, + "learning_rate": 1.999976452953547e-05, + "loss": 1.1106, + "step": 138 + }, + { + "epoch": 0.023833508369590844, + "grad_norm": 2.5625, + "learning_rate": 1.9999751973336843e-05, + "loss": 1.1715, + "step": 139 + }, + { + "epoch": 0.024004972458580708, + "grad_norm": 1.90625, + "learning_rate": 1.9999739091008646e-05, + "loss": 1.0636, + "step": 140 + }, + { + "epoch": 0.024176436547570568, + "grad_norm": 1.8671875, + "learning_rate": 1.9999725882551305e-05, + "loss": 1.1546, + "step": 141 + }, + { + "epoch": 0.02434790063656043, + "grad_norm": 1.90625, + "learning_rate": 1.9999712347965245e-05, + "loss": 1.248, + "step": 142 + }, + { + "epoch": 0.024519364725550292, + "grad_norm": 1.8671875, + "learning_rate": 1.9999698487250914e-05, + "loss": 1.1635, + "step": 143 + }, + { + "epoch": 0.024690828814540156, + "grad_norm": 1.9140625, + "learning_rate": 1.9999684300408756e-05, + "loss": 1.2565, + "step": 144 + }, + { + "epoch": 0.024862292903530016, + "grad_norm": 1.9453125, + "learning_rate": 1.999966978743924e-05, + "loss": 1.1027, + "step": 145 + }, + { + "epoch": 0.02503375699251988, + "grad_norm": 2.09375, + "learning_rate": 1.9999654948342836e-05, + "loss": 1.1535, + "step": 146 + }, + { + "epoch": 0.02520522108150974, + "grad_norm": 2.265625, + "learning_rate": 1.999963978312003e-05, + "loss": 1.17, + "step": 147 + }, + { + "epoch": 0.025376685170499604, + "grad_norm": 1.9609375, + "learning_rate": 1.999962429177131e-05, + "loss": 1.1554, + "step": 148 + }, + { + "epoch": 0.025548149259489467, + "grad_norm": 2.046875, + "learning_rate": 1.9999608474297192e-05, + "loss": 1.2368, + "step": 149 + }, + { + "epoch": 0.025719613348479328, + "grad_norm": 2.28125, + "learning_rate": 1.9999592330698185e-05, + "loss": 1.1703, + "step": 150 + }, + { + "epoch": 0.02589107743746919, + "grad_norm": 1.875, + "learning_rate": 1.9999575860974817e-05, + "loss": 1.2139, + "step": 151 + }, + { + "epoch": 0.02606254152645905, + "grad_norm": 2.0625, + "learning_rate": 1.9999559065127627e-05, + "loss": 1.208, + "step": 152 + }, + { + "epoch": 0.026234005615448915, + "grad_norm": 1.9765625, + "learning_rate": 1.999954194315716e-05, + "loss": 1.1737, + "step": 153 + }, + { + "epoch": 0.026405469704438776, + 
"grad_norm": 1.890625, + "learning_rate": 1.9999524495063974e-05, + "loss": 1.1485, + "step": 154 + }, + { + "epoch": 0.02657693379342864, + "grad_norm": 1.9921875, + "learning_rate": 1.999950672084864e-05, + "loss": 1.1551, + "step": 155 + }, + { + "epoch": 0.0267483978824185, + "grad_norm": 1.8984375, + "learning_rate": 1.999948862051174e-05, + "loss": 1.1953, + "step": 156 + }, + { + "epoch": 0.026919861971408363, + "grad_norm": 1.90625, + "learning_rate": 1.999947019405386e-05, + "loss": 1.1708, + "step": 157 + }, + { + "epoch": 0.027091326060398227, + "grad_norm": 1.859375, + "learning_rate": 1.99994514414756e-05, + "loss": 1.2106, + "step": 158 + }, + { + "epoch": 0.027262790149388087, + "grad_norm": 2.28125, + "learning_rate": 1.999943236277758e-05, + "loss": 1.2831, + "step": 159 + }, + { + "epoch": 0.02743425423837795, + "grad_norm": 2.125, + "learning_rate": 1.999941295796041e-05, + "loss": 1.2245, + "step": 160 + }, + { + "epoch": 0.02760571832736781, + "grad_norm": 1.953125, + "learning_rate": 1.9999393227024733e-05, + "loss": 1.0303, + "step": 161 + }, + { + "epoch": 0.027777182416357675, + "grad_norm": 3.296875, + "learning_rate": 1.9999373169971184e-05, + "loss": 1.1286, + "step": 162 + }, + { + "epoch": 0.027948646505347535, + "grad_norm": 2.90625, + "learning_rate": 1.9999352786800427e-05, + "loss": 1.1177, + "step": 163 + }, + { + "epoch": 0.0281201105943374, + "grad_norm": 2.109375, + "learning_rate": 1.9999332077513118e-05, + "loss": 1.1579, + "step": 164 + }, + { + "epoch": 0.02829157468332726, + "grad_norm": 6.3125, + "learning_rate": 1.9999311042109938e-05, + "loss": 1.1761, + "step": 165 + }, + { + "epoch": 0.028463038772317123, + "grad_norm": 4.5, + "learning_rate": 1.9999289680591573e-05, + "loss": 1.1855, + "step": 166 + }, + { + "epoch": 0.028634502861306987, + "grad_norm": 3.03125, + "learning_rate": 1.9999267992958713e-05, + "loss": 1.1755, + "step": 167 + }, + { + "epoch": 0.028805966950296847, + "grad_norm": 2.453125, + "learning_rate": 1.999924597921207e-05, + "loss": 1.1019, + "step": 168 + }, + { + "epoch": 0.02897743103928671, + "grad_norm": 1.8515625, + "learning_rate": 1.9999223639352364e-05, + "loss": 1.1054, + "step": 169 + }, + { + "epoch": 0.02914889512827657, + "grad_norm": 1.9609375, + "learning_rate": 1.9999200973380325e-05, + "loss": 1.0905, + "step": 170 + }, + { + "epoch": 0.029320359217266435, + "grad_norm": 2.046875, + "learning_rate": 1.9999177981296682e-05, + "loss": 1.1277, + "step": 171 + }, + { + "epoch": 0.029491823306256295, + "grad_norm": 1.8046875, + "learning_rate": 1.9999154663102196e-05, + "loss": 1.1312, + "step": 172 + }, + { + "epoch": 0.02966328739524616, + "grad_norm": 2.015625, + "learning_rate": 1.999913101879762e-05, + "loss": 1.1578, + "step": 173 + }, + { + "epoch": 0.02983475148423602, + "grad_norm": 2.0, + "learning_rate": 1.999910704838373e-05, + "loss": 1.1742, + "step": 174 + }, + { + "epoch": 0.030006215573225883, + "grad_norm": 1.8671875, + "learning_rate": 1.9999082751861308e-05, + "loss": 1.1161, + "step": 175 + }, + { + "epoch": 0.030177679662215743, + "grad_norm": 1.9609375, + "learning_rate": 1.9999058129231144e-05, + "loss": 1.1534, + "step": 176 + }, + { + "epoch": 0.030349143751205607, + "grad_norm": 1.96875, + "learning_rate": 1.9999033180494037e-05, + "loss": 1.2287, + "step": 177 + }, + { + "epoch": 0.03052060784019547, + "grad_norm": 1.9453125, + "learning_rate": 1.999900790565081e-05, + "loss": 1.1124, + "step": 178 + }, + { + "epoch": 0.03069207192918533, + "grad_norm": 1.75, + "learning_rate": 
1.9998982304702278e-05, + "loss": 1.1163, + "step": 179 + }, + { + "epoch": 0.030863536018175194, + "grad_norm": 2.140625, + "learning_rate": 1.9998956377649286e-05, + "loss": 1.1342, + "step": 180 + }, + { + "epoch": 0.031035000107165055, + "grad_norm": 1.9609375, + "learning_rate": 1.999893012449267e-05, + "loss": 1.2135, + "step": 181 + }, + { + "epoch": 0.03120646419615492, + "grad_norm": 1.7265625, + "learning_rate": 1.9998903545233293e-05, + "loss": 1.053, + "step": 182 + }, + { + "epoch": 0.03137792828514478, + "grad_norm": 1.9765625, + "learning_rate": 1.9998876639872016e-05, + "loss": 1.1699, + "step": 183 + }, + { + "epoch": 0.03154939237413464, + "grad_norm": 1.9921875, + "learning_rate": 1.999884940840972e-05, + "loss": 1.1576, + "step": 184 + }, + { + "epoch": 0.0317208564631245, + "grad_norm": 1.8828125, + "learning_rate": 1.9998821850847296e-05, + "loss": 1.1212, + "step": 185 + }, + { + "epoch": 0.03189232055211437, + "grad_norm": 2.046875, + "learning_rate": 1.9998793967185635e-05, + "loss": 1.2487, + "step": 186 + }, + { + "epoch": 0.03206378464110423, + "grad_norm": 1.953125, + "learning_rate": 1.999876575742565e-05, + "loss": 1.1613, + "step": 187 + }, + { + "epoch": 0.03223524873009409, + "grad_norm": 1.84375, + "learning_rate": 1.9998737221568264e-05, + "loss": 1.1357, + "step": 188 + }, + { + "epoch": 0.03240671281908395, + "grad_norm": 1.875, + "learning_rate": 1.9998708359614408e-05, + "loss": 1.0308, + "step": 189 + }, + { + "epoch": 0.03257817690807382, + "grad_norm": 1.9375, + "learning_rate": 1.9998679171565017e-05, + "loss": 1.1871, + "step": 190 + }, + { + "epoch": 0.03274964099706368, + "grad_norm": 1.9765625, + "learning_rate": 1.9998649657421047e-05, + "loss": 1.1016, + "step": 191 + }, + { + "epoch": 0.03292110508605354, + "grad_norm": 1.84375, + "learning_rate": 1.999861981718346e-05, + "loss": 1.1855, + "step": 192 + }, + { + "epoch": 0.0330925691750434, + "grad_norm": 2.765625, + "learning_rate": 1.999858965085323e-05, + "loss": 1.2059, + "step": 193 + }, + { + "epoch": 0.033264033264033266, + "grad_norm": 2.0625, + "learning_rate": 1.999855915843134e-05, + "loss": 1.2393, + "step": 194 + }, + { + "epoch": 0.033435497353023126, + "grad_norm": 1.90625, + "learning_rate": 1.999852833991879e-05, + "loss": 1.2064, + "step": 195 + }, + { + "epoch": 0.033606961442012986, + "grad_norm": 1.84375, + "learning_rate": 1.9998497195316572e-05, + "loss": 1.0907, + "step": 196 + }, + { + "epoch": 0.033778425531002854, + "grad_norm": 1.921875, + "learning_rate": 1.9998465724625715e-05, + "loss": 1.2311, + "step": 197 + }, + { + "epoch": 0.033949889619992714, + "grad_norm": 1.8125, + "learning_rate": 1.9998433927847238e-05, + "loss": 1.1369, + "step": 198 + }, + { + "epoch": 0.034121353708982574, + "grad_norm": 1.78125, + "learning_rate": 1.999840180498218e-05, + "loss": 1.1819, + "step": 199 + }, + { + "epoch": 0.034292817797972434, + "grad_norm": 1.8046875, + "learning_rate": 1.9998369356031587e-05, + "loss": 1.1348, + "step": 200 + }, + { + "epoch": 0.0344642818869623, + "grad_norm": 1.75, + "learning_rate": 1.9998336580996524e-05, + "loss": 1.141, + "step": 201 + }, + { + "epoch": 0.03463574597595216, + "grad_norm": 1.8359375, + "learning_rate": 1.999830347987805e-05, + "loss": 1.1466, + "step": 202 + }, + { + "epoch": 0.03480721006494202, + "grad_norm": 1.796875, + "learning_rate": 1.999827005267725e-05, + "loss": 1.2022, + "step": 203 + }, + { + "epoch": 0.03497867415393189, + "grad_norm": 1.8125, + "learning_rate": 1.9998236299395216e-05, + "loss": 1.2304, + 
"step": 204 + }, + { + "epoch": 0.03515013824292175, + "grad_norm": 1.875, + "learning_rate": 1.9998202220033044e-05, + "loss": 1.1918, + "step": 205 + }, + { + "epoch": 0.03532160233191161, + "grad_norm": 1.8203125, + "learning_rate": 1.9998167814591847e-05, + "loss": 1.1763, + "step": 206 + }, + { + "epoch": 0.03549306642090147, + "grad_norm": 1.8828125, + "learning_rate": 1.999813308307275e-05, + "loss": 1.1952, + "step": 207 + }, + { + "epoch": 0.03566453050989134, + "grad_norm": 1.9375, + "learning_rate": 1.9998098025476883e-05, + "loss": 1.0583, + "step": 208 + }, + { + "epoch": 0.0358359945988812, + "grad_norm": 4.21875, + "learning_rate": 1.9998062641805392e-05, + "loss": 1.1122, + "step": 209 + }, + { + "epoch": 0.03600745868787106, + "grad_norm": 1.890625, + "learning_rate": 1.9998026932059427e-05, + "loss": 1.0917, + "step": 210 + }, + { + "epoch": 0.03617892277686092, + "grad_norm": 1.8125, + "learning_rate": 1.9997990896240154e-05, + "loss": 1.1909, + "step": 211 + }, + { + "epoch": 0.036350386865850785, + "grad_norm": 1.796875, + "learning_rate": 1.999795453434875e-05, + "loss": 1.1432, + "step": 212 + }, + { + "epoch": 0.036521850954840646, + "grad_norm": 1.78125, + "learning_rate": 1.99979178463864e-05, + "loss": 1.0841, + "step": 213 + }, + { + "epoch": 0.036693315043830506, + "grad_norm": 1.796875, + "learning_rate": 1.9997880832354302e-05, + "loss": 1.2121, + "step": 214 + }, + { + "epoch": 0.03686477913282037, + "grad_norm": 1.8671875, + "learning_rate": 1.999784349225366e-05, + "loss": 1.0489, + "step": 215 + }, + { + "epoch": 0.03703624322181023, + "grad_norm": 1.8046875, + "learning_rate": 1.999780582608569e-05, + "loss": 1.1155, + "step": 216 + }, + { + "epoch": 0.037207707310800094, + "grad_norm": 2.234375, + "learning_rate": 1.999776783385163e-05, + "loss": 1.1808, + "step": 217 + }, + { + "epoch": 0.037379171399789954, + "grad_norm": 2.125, + "learning_rate": 1.9997729515552708e-05, + "loss": 1.2913, + "step": 218 + }, + { + "epoch": 0.03755063548877982, + "grad_norm": 1.875, + "learning_rate": 1.999769087119018e-05, + "loss": 1.1422, + "step": 219 + }, + { + "epoch": 0.03772209957776968, + "grad_norm": 1.9765625, + "learning_rate": 1.9997651900765308e-05, + "loss": 1.1079, + "step": 220 + }, + { + "epoch": 0.03789356366675954, + "grad_norm": 1.7890625, + "learning_rate": 1.999761260427935e-05, + "loss": 1.1709, + "step": 221 + }, + { + "epoch": 0.0380650277557494, + "grad_norm": 2.1875, + "learning_rate": 1.999757298173361e-05, + "loss": 1.2323, + "step": 222 + }, + { + "epoch": 0.03823649184473927, + "grad_norm": 1.953125, + "learning_rate": 1.999753303312936e-05, + "loss": 1.1526, + "step": 223 + }, + { + "epoch": 0.03840795593372913, + "grad_norm": 1.921875, + "learning_rate": 1.9997492758467915e-05, + "loss": 1.0609, + "step": 224 + }, + { + "epoch": 0.03857942002271899, + "grad_norm": 1.7890625, + "learning_rate": 1.9997452157750577e-05, + "loss": 1.1155, + "step": 225 + }, + { + "epoch": 0.03875088411170886, + "grad_norm": 1.8125, + "learning_rate": 1.9997411230978684e-05, + "loss": 1.1335, + "step": 226 + }, + { + "epoch": 0.03892234820069872, + "grad_norm": 1.9296875, + "learning_rate": 1.9997369978153564e-05, + "loss": 1.1145, + "step": 227 + }, + { + "epoch": 0.03909381228968858, + "grad_norm": 1.96875, + "learning_rate": 1.9997328399276558e-05, + "loss": 1.0612, + "step": 228 + }, + { + "epoch": 0.03926527637867844, + "grad_norm": 1.7890625, + "learning_rate": 1.9997286494349032e-05, + "loss": 1.1189, + "step": 229 + }, + { + "epoch": 
0.039436740467668305, + "grad_norm": 1.84375, + "learning_rate": 1.999724426337234e-05, + "loss": 1.1008, + "step": 230 + }, + { + "epoch": 0.039608204556658165, + "grad_norm": 2.359375, + "learning_rate": 1.9997201706347875e-05, + "loss": 1.1649, + "step": 231 + }, + { + "epoch": 0.039779668645648025, + "grad_norm": 2.015625, + "learning_rate": 1.999715882327701e-05, + "loss": 1.1094, + "step": 232 + }, + { + "epoch": 0.03995113273463789, + "grad_norm": 1.84375, + "learning_rate": 1.999711561416115e-05, + "loss": 1.1228, + "step": 233 + }, + { + "epoch": 0.04012259682362775, + "grad_norm": 1.8125, + "learning_rate": 1.9997072079001705e-05, + "loss": 1.1056, + "step": 234 + }, + { + "epoch": 0.04029406091261761, + "grad_norm": 1.9296875, + "learning_rate": 1.9997028217800097e-05, + "loss": 1.2507, + "step": 235 + }, + { + "epoch": 0.04046552500160747, + "grad_norm": 1.796875, + "learning_rate": 1.999698403055775e-05, + "loss": 1.0986, + "step": 236 + }, + { + "epoch": 0.04063698909059734, + "grad_norm": 2.59375, + "learning_rate": 1.9996939517276107e-05, + "loss": 1.1712, + "step": 237 + }, + { + "epoch": 0.0408084531795872, + "grad_norm": 1.765625, + "learning_rate": 1.9996894677956628e-05, + "loss": 1.197, + "step": 238 + }, + { + "epoch": 0.04097991726857706, + "grad_norm": 1.953125, + "learning_rate": 1.9996849512600764e-05, + "loss": 1.0771, + "step": 239 + }, + { + "epoch": 0.04115138135756692, + "grad_norm": 1.8359375, + "learning_rate": 1.9996804021209995e-05, + "loss": 1.191, + "step": 240 + }, + { + "epoch": 0.04132284544655679, + "grad_norm": 1.8515625, + "learning_rate": 1.9996758203785797e-05, + "loss": 1.1842, + "step": 241 + }, + { + "epoch": 0.04149430953554665, + "grad_norm": 1.8359375, + "learning_rate": 1.9996712060329675e-05, + "loss": 1.1033, + "step": 242 + }, + { + "epoch": 0.04166577362453651, + "grad_norm": 1.734375, + "learning_rate": 1.999666559084313e-05, + "loss": 1.0938, + "step": 243 + }, + { + "epoch": 0.041837237713526376, + "grad_norm": 1.859375, + "learning_rate": 1.999661879532767e-05, + "loss": 1.2002, + "step": 244 + }, + { + "epoch": 0.042008701802516236, + "grad_norm": 1.7578125, + "learning_rate": 1.999657167378483e-05, + "loss": 1.195, + "step": 245 + }, + { + "epoch": 0.0421801658915061, + "grad_norm": 1.75, + "learning_rate": 1.9996524226216147e-05, + "loss": 1.1571, + "step": 246 + }, + { + "epoch": 0.04235162998049596, + "grad_norm": 1.921875, + "learning_rate": 1.9996476452623163e-05, + "loss": 1.0698, + "step": 247 + }, + { + "epoch": 0.042523094069485824, + "grad_norm": 1.8203125, + "learning_rate": 1.999642835300744e-05, + "loss": 1.1737, + "step": 248 + }, + { + "epoch": 0.042694558158475684, + "grad_norm": 1.90625, + "learning_rate": 1.9996379927370542e-05, + "loss": 1.1447, + "step": 249 + }, + { + "epoch": 0.042866022247465545, + "grad_norm": 1.8515625, + "learning_rate": 1.9996331175714056e-05, + "loss": 1.2027, + "step": 250 + }, + { + "epoch": 0.04303748633645541, + "grad_norm": 1.8515625, + "learning_rate": 1.9996282098039565e-05, + "loss": 1.109, + "step": 251 + }, + { + "epoch": 0.04320895042544527, + "grad_norm": 1.8203125, + "learning_rate": 1.9996232694348673e-05, + "loss": 1.1268, + "step": 252 + }, + { + "epoch": 0.04338041451443513, + "grad_norm": 1.7578125, + "learning_rate": 1.9996182964642992e-05, + "loss": 1.119, + "step": 253 + }, + { + "epoch": 0.04355187860342499, + "grad_norm": 1.7109375, + "learning_rate": 1.999613290892414e-05, + "loss": 1.139, + "step": 254 + }, + { + "epoch": 0.04372334269241486, + "grad_norm": 
1.7421875, + "learning_rate": 1.999608252719375e-05, + "loss": 1.1325, + "step": 255 + }, + { + "epoch": 0.04389480678140472, + "grad_norm": 1.84375, + "learning_rate": 1.9996031819453474e-05, + "loss": 1.1633, + "step": 256 + }, + { + "epoch": 0.04406627087039458, + "grad_norm": 1.8828125, + "learning_rate": 1.9995980785704955e-05, + "loss": 1.2083, + "step": 257 + }, + { + "epoch": 0.04423773495938444, + "grad_norm": 1.71875, + "learning_rate": 1.999592942594986e-05, + "loss": 1.043, + "step": 258 + }, + { + "epoch": 0.04440919904837431, + "grad_norm": 1.90625, + "learning_rate": 1.9995877740189868e-05, + "loss": 1.153, + "step": 259 + }, + { + "epoch": 0.04458066313736417, + "grad_norm": 2.015625, + "learning_rate": 1.9995825728426655e-05, + "loss": 1.1146, + "step": 260 + }, + { + "epoch": 0.04475212722635403, + "grad_norm": 1.96875, + "learning_rate": 1.999577339066193e-05, + "loss": 1.1924, + "step": 261 + }, + { + "epoch": 0.044923591315343896, + "grad_norm": 1.859375, + "learning_rate": 1.9995720726897394e-05, + "loss": 1.1448, + "step": 262 + }, + { + "epoch": 0.045095055404333756, + "grad_norm": 1.9375, + "learning_rate": 1.9995667737134765e-05, + "loss": 1.1947, + "step": 263 + }, + { + "epoch": 0.045266519493323616, + "grad_norm": 1.8359375, + "learning_rate": 1.9995614421375768e-05, + "loss": 1.1596, + "step": 264 + }, + { + "epoch": 0.045437983582313476, + "grad_norm": 1.765625, + "learning_rate": 1.9995560779622148e-05, + "loss": 1.104, + "step": 265 + }, + { + "epoch": 0.045609447671303344, + "grad_norm": 1.9375, + "learning_rate": 1.999550681187565e-05, + "loss": 1.1444, + "step": 266 + }, + { + "epoch": 0.045780911760293204, + "grad_norm": 2.0, + "learning_rate": 1.9995452518138037e-05, + "loss": 1.1601, + "step": 267 + }, + { + "epoch": 0.045952375849283064, + "grad_norm": 1.875, + "learning_rate": 1.9995397898411073e-05, + "loss": 1.1729, + "step": 268 + }, + { + "epoch": 0.04612383993827293, + "grad_norm": 1.7734375, + "learning_rate": 1.999534295269655e-05, + "loss": 1.0968, + "step": 269 + }, + { + "epoch": 0.04629530402726279, + "grad_norm": 1.75, + "learning_rate": 1.999528768099625e-05, + "loss": 1.1298, + "step": 270 + }, + { + "epoch": 0.04646676811625265, + "grad_norm": 2.0, + "learning_rate": 1.999523208331198e-05, + "loss": 1.0809, + "step": 271 + }, + { + "epoch": 0.04663823220524251, + "grad_norm": 1.765625, + "learning_rate": 1.9995176159645557e-05, + "loss": 1.1636, + "step": 272 + }, + { + "epoch": 0.04680969629423238, + "grad_norm": 1.796875, + "learning_rate": 1.9995119909998798e-05, + "loss": 1.1, + "step": 273 + }, + { + "epoch": 0.04698116038322224, + "grad_norm": 4.6875, + "learning_rate": 1.9995063334373544e-05, + "loss": 1.1314, + "step": 274 + }, + { + "epoch": 0.0471526244722121, + "grad_norm": 1.859375, + "learning_rate": 1.9995006432771634e-05, + "loss": 1.1535, + "step": 275 + }, + { + "epoch": 0.04732408856120196, + "grad_norm": 1.734375, + "learning_rate": 1.9994949205194925e-05, + "loss": 1.0962, + "step": 276 + }, + { + "epoch": 0.04749555265019183, + "grad_norm": 1.875, + "learning_rate": 1.999489165164529e-05, + "loss": 1.2084, + "step": 277 + }, + { + "epoch": 0.04766701673918169, + "grad_norm": 1.765625, + "learning_rate": 1.9994833772124597e-05, + "loss": 1.1022, + "step": 278 + }, + { + "epoch": 0.04783848082817155, + "grad_norm": 1.8359375, + "learning_rate": 1.9994775566634737e-05, + "loss": 1.1553, + "step": 279 + }, + { + "epoch": 0.048009944917161415, + "grad_norm": 1.796875, + "learning_rate": 1.999471703517761e-05, + "loss": 
1.1098, + "step": 280 + }, + { + "epoch": 0.048181409006151275, + "grad_norm": 1.8046875, + "learning_rate": 1.9994658177755124e-05, + "loss": 1.1211, + "step": 281 + }, + { + "epoch": 0.048352873095141136, + "grad_norm": 1.9296875, + "learning_rate": 1.99945989943692e-05, + "loss": 1.1674, + "step": 282 + }, + { + "epoch": 0.048524337184130996, + "grad_norm": 1.734375, + "learning_rate": 1.9994539485021765e-05, + "loss": 1.086, + "step": 283 + }, + { + "epoch": 0.04869580127312086, + "grad_norm": 1.921875, + "learning_rate": 1.999447964971476e-05, + "loss": 1.2183, + "step": 284 + }, + { + "epoch": 0.04886726536211072, + "grad_norm": 1.796875, + "learning_rate": 1.999441948845014e-05, + "loss": 1.1132, + "step": 285 + }, + { + "epoch": 0.049038729451100584, + "grad_norm": 1.765625, + "learning_rate": 1.9994359001229865e-05, + "loss": 1.0739, + "step": 286 + }, + { + "epoch": 0.049210193540090444, + "grad_norm": 1.8671875, + "learning_rate": 1.9994298188055907e-05, + "loss": 1.1649, + "step": 287 + }, + { + "epoch": 0.04938165762908031, + "grad_norm": 1.8828125, + "learning_rate": 1.999423704893025e-05, + "loss": 1.1438, + "step": 288 + }, + { + "epoch": 0.04955312171807017, + "grad_norm": 2.015625, + "learning_rate": 1.999417558385489e-05, + "loss": 1.166, + "step": 289 + }, + { + "epoch": 0.04972458580706003, + "grad_norm": 2.015625, + "learning_rate": 1.9994113792831825e-05, + "loss": 1.14, + "step": 290 + }, + { + "epoch": 0.0498960498960499, + "grad_norm": 1.8984375, + "learning_rate": 1.999405167586308e-05, + "loss": 1.186, + "step": 291 + }, + { + "epoch": 0.05006751398503976, + "grad_norm": 1.96875, + "learning_rate": 1.999398923295067e-05, + "loss": 1.1538, + "step": 292 + }, + { + "epoch": 0.05023897807402962, + "grad_norm": 1.84375, + "learning_rate": 1.9993926464096646e-05, + "loss": 1.1432, + "step": 293 + }, + { + "epoch": 0.05041044216301948, + "grad_norm": 1.78125, + "learning_rate": 1.999386336930304e-05, + "loss": 1.0595, + "step": 294 + }, + { + "epoch": 0.05058190625200935, + "grad_norm": 1.8203125, + "learning_rate": 1.999379994857192e-05, + "loss": 1.128, + "step": 295 + }, + { + "epoch": 0.05075337034099921, + "grad_norm": 1.78125, + "learning_rate": 1.9993736201905343e-05, + "loss": 1.1186, + "step": 296 + }, + { + "epoch": 0.05092483442998907, + "grad_norm": 1.71875, + "learning_rate": 1.9993672129305398e-05, + "loss": 1.086, + "step": 297 + }, + { + "epoch": 0.051096298518978935, + "grad_norm": 1.8125, + "learning_rate": 1.9993607730774176e-05, + "loss": 1.1423, + "step": 298 + }, + { + "epoch": 0.051267762607968795, + "grad_norm": 1.75, + "learning_rate": 1.9993543006313772e-05, + "loss": 1.1484, + "step": 299 + }, + { + "epoch": 0.051439226696958655, + "grad_norm": 1.7421875, + "learning_rate": 1.9993477955926298e-05, + "loss": 1.1604, + "step": 300 + }, + { + "epoch": 0.051610690785948515, + "grad_norm": 1.9375, + "learning_rate": 1.999341257961388e-05, + "loss": 1.1192, + "step": 301 + }, + { + "epoch": 0.05178215487493838, + "grad_norm": 1.90625, + "learning_rate": 1.9993346877378637e-05, + "loss": 1.0958, + "step": 302 + }, + { + "epoch": 0.05195361896392824, + "grad_norm": 1.828125, + "learning_rate": 1.9993280849222726e-05, + "loss": 1.0935, + "step": 303 + }, + { + "epoch": 0.0521250830529181, + "grad_norm": 1.8125, + "learning_rate": 1.9993214495148297e-05, + "loss": 1.1584, + "step": 304 + }, + { + "epoch": 0.05229654714190796, + "grad_norm": 1.7578125, + "learning_rate": 1.999314781515751e-05, + "loss": 1.0625, + "step": 305 + }, + { + "epoch": 
0.05246801123089783, + "grad_norm": 1.75, + "learning_rate": 1.9993080809252542e-05, + "loss": 1.1468, + "step": 306 + }, + { + "epoch": 0.05263947531988769, + "grad_norm": 1.7421875, + "learning_rate": 1.999301347743558e-05, + "loss": 1.1285, + "step": 307 + }, + { + "epoch": 0.05281093940887755, + "grad_norm": 2.0, + "learning_rate": 1.999294581970882e-05, + "loss": 1.2316, + "step": 308 + }, + { + "epoch": 0.05298240349786742, + "grad_norm": 1.8515625, + "learning_rate": 1.9992877836074465e-05, + "loss": 1.1048, + "step": 309 + }, + { + "epoch": 0.05315386758685728, + "grad_norm": 1.8046875, + "learning_rate": 1.9992809526534732e-05, + "loss": 1.0039, + "step": 310 + }, + { + "epoch": 0.05332533167584714, + "grad_norm": 1.734375, + "learning_rate": 1.9992740891091852e-05, + "loss": 1.0773, + "step": 311 + }, + { + "epoch": 0.053496795764837, + "grad_norm": 1.7421875, + "learning_rate": 1.9992671929748062e-05, + "loss": 1.1283, + "step": 312 + }, + { + "epoch": 0.053668259853826866, + "grad_norm": 1.8671875, + "learning_rate": 1.999260264250561e-05, + "loss": 1.1078, + "step": 313 + }, + { + "epoch": 0.053839723942816727, + "grad_norm": 1.8046875, + "learning_rate": 1.9992533029366763e-05, + "loss": 1.1397, + "step": 314 + }, + { + "epoch": 0.05401118803180659, + "grad_norm": 1.6640625, + "learning_rate": 1.999246309033378e-05, + "loss": 1.0898, + "step": 315 + }, + { + "epoch": 0.054182652120796454, + "grad_norm": 1.6796875, + "learning_rate": 1.999239282540895e-05, + "loss": 1.066, + "step": 316 + }, + { + "epoch": 0.054354116209786314, + "grad_norm": 1.8203125, + "learning_rate": 1.9992322234594562e-05, + "loss": 1.2013, + "step": 317 + }, + { + "epoch": 0.054525580298776175, + "grad_norm": 1.8046875, + "learning_rate": 1.9992251317892916e-05, + "loss": 1.1571, + "step": 318 + }, + { + "epoch": 0.054697044387766035, + "grad_norm": 1.6953125, + "learning_rate": 1.999218007530633e-05, + "loss": 1.1173, + "step": 319 + }, + { + "epoch": 0.0548685084767559, + "grad_norm": 1.7109375, + "learning_rate": 1.9992108506837122e-05, + "loss": 1.099, + "step": 320 + }, + { + "epoch": 0.05503997256574576, + "grad_norm": 1.78125, + "learning_rate": 1.999203661248763e-05, + "loss": 1.1084, + "step": 321 + }, + { + "epoch": 0.05521143665473562, + "grad_norm": 1.765625, + "learning_rate": 1.9991964392260198e-05, + "loss": 1.0651, + "step": 322 + }, + { + "epoch": 0.05538290074372548, + "grad_norm": 1.7890625, + "learning_rate": 1.9991891846157182e-05, + "loss": 1.1868, + "step": 323 + }, + { + "epoch": 0.05555436483271535, + "grad_norm": 3.6875, + "learning_rate": 1.9991818974180944e-05, + "loss": 1.189, + "step": 324 + }, + { + "epoch": 0.05572582892170521, + "grad_norm": 1.734375, + "learning_rate": 1.9991745776333865e-05, + "loss": 1.1314, + "step": 325 + }, + { + "epoch": 0.05589729301069507, + "grad_norm": 2.234375, + "learning_rate": 1.9991672252618334e-05, + "loss": 1.1119, + "step": 326 + }, + { + "epoch": 0.05606875709968494, + "grad_norm": 2.015625, + "learning_rate": 1.999159840303674e-05, + "loss": 1.103, + "step": 327 + }, + { + "epoch": 0.0562402211886748, + "grad_norm": 1.7109375, + "learning_rate": 1.99915242275915e-05, + "loss": 1.1243, + "step": 328 + }, + { + "epoch": 0.05641168527766466, + "grad_norm": 1.96875, + "learning_rate": 1.9991449726285033e-05, + "loss": 1.1458, + "step": 329 + }, + { + "epoch": 0.05658314936665452, + "grad_norm": 1.8359375, + "learning_rate": 1.9991374899119763e-05, + "loss": 1.1669, + "step": 330 + }, + { + "epoch": 0.056754613455644386, + "grad_norm": 
1.7265625, + "learning_rate": 1.9991299746098133e-05, + "loss": 1.1497, + "step": 331 + }, + { + "epoch": 0.056926077544634246, + "grad_norm": 1.828125, + "learning_rate": 1.9991224267222596e-05, + "loss": 1.0866, + "step": 332 + }, + { + "epoch": 0.057097541633624106, + "grad_norm": 1.890625, + "learning_rate": 1.9991148462495612e-05, + "loss": 1.1331, + "step": 333 + }, + { + "epoch": 0.05726900572261397, + "grad_norm": 1.7578125, + "learning_rate": 1.999107233191965e-05, + "loss": 1.1717, + "step": 334 + }, + { + "epoch": 0.057440469811603834, + "grad_norm": 1.734375, + "learning_rate": 1.9990995875497203e-05, + "loss": 1.1216, + "step": 335 + }, + { + "epoch": 0.057611933900593694, + "grad_norm": 1.671875, + "learning_rate": 1.9990919093230752e-05, + "loss": 1.0493, + "step": 336 + }, + { + "epoch": 0.057783397989583554, + "grad_norm": 1.8828125, + "learning_rate": 1.999084198512281e-05, + "loss": 1.0627, + "step": 337 + }, + { + "epoch": 0.05795486207857342, + "grad_norm": 1.9453125, + "learning_rate": 1.999076455117589e-05, + "loss": 1.0835, + "step": 338 + }, + { + "epoch": 0.05812632616756328, + "grad_norm": 1.765625, + "learning_rate": 1.999068679139251e-05, + "loss": 1.1594, + "step": 339 + }, + { + "epoch": 0.05829779025655314, + "grad_norm": 1.75, + "learning_rate": 1.9990608705775217e-05, + "loss": 1.1199, + "step": 340 + }, + { + "epoch": 0.058469254345543, + "grad_norm": 1.7578125, + "learning_rate": 1.9990530294326554e-05, + "loss": 1.1978, + "step": 341 + }, + { + "epoch": 0.05864071843453287, + "grad_norm": 1.8671875, + "learning_rate": 1.9990451557049077e-05, + "loss": 1.1584, + "step": 342 + }, + { + "epoch": 0.05881218252352273, + "grad_norm": 1.7578125, + "learning_rate": 1.9990372493945353e-05, + "loss": 1.1209, + "step": 343 + }, + { + "epoch": 0.05898364661251259, + "grad_norm": 1.796875, + "learning_rate": 1.9990293105017962e-05, + "loss": 1.1281, + "step": 344 + }, + { + "epoch": 0.05915511070150246, + "grad_norm": 1.734375, + "learning_rate": 1.999021339026949e-05, + "loss": 1.0771, + "step": 345 + }, + { + "epoch": 0.05932657479049232, + "grad_norm": 1.6796875, + "learning_rate": 1.9990133349702544e-05, + "loss": 1.1337, + "step": 346 + }, + { + "epoch": 0.05949803887948218, + "grad_norm": 1.8203125, + "learning_rate": 1.999005298331973e-05, + "loss": 1.1041, + "step": 347 + }, + { + "epoch": 0.05966950296847204, + "grad_norm": 1.8203125, + "learning_rate": 1.9989972291123666e-05, + "loss": 1.1493, + "step": 348 + }, + { + "epoch": 0.059840967057461905, + "grad_norm": 1.984375, + "learning_rate": 1.998989127311699e-05, + "loss": 1.1732, + "step": 349 + }, + { + "epoch": 0.060012431146451765, + "grad_norm": 1.8046875, + "learning_rate": 1.998980992930234e-05, + "loss": 1.0674, + "step": 350 + }, + { + "epoch": 0.060183895235441626, + "grad_norm": 3.609375, + "learning_rate": 1.9989728259682368e-05, + "loss": 1.23, + "step": 351 + }, + { + "epoch": 0.060355359324431486, + "grad_norm": 4.4375, + "learning_rate": 1.9989646264259743e-05, + "loss": 1.1521, + "step": 352 + }, + { + "epoch": 0.06052682341342135, + "grad_norm": 1.7734375, + "learning_rate": 1.9989563943037133e-05, + "loss": 1.1788, + "step": 353 + }, + { + "epoch": 0.06069828750241121, + "grad_norm": 1.7265625, + "learning_rate": 1.998948129601723e-05, + "loss": 1.1802, + "step": 354 + }, + { + "epoch": 0.060869751591401074, + "grad_norm": 1.6953125, + "learning_rate": 1.998939832320272e-05, + "loss": 1.073, + "step": 355 + }, + { + "epoch": 0.06104121568039094, + "grad_norm": 1.8515625, + 
"learning_rate": 1.9989315024596315e-05, + "loss": 1.1932, + "step": 356 + }, + { + "epoch": 0.0612126797693808, + "grad_norm": 1.8203125, + "learning_rate": 1.998923140020073e-05, + "loss": 1.0352, + "step": 357 + }, + { + "epoch": 0.06138414385837066, + "grad_norm": 1.7890625, + "learning_rate": 1.9989147450018698e-05, + "loss": 1.1364, + "step": 358 + }, + { + "epoch": 0.06155560794736052, + "grad_norm": 1.6328125, + "learning_rate": 1.9989063174052948e-05, + "loss": 1.0597, + "step": 359 + }, + { + "epoch": 0.06172707203635039, + "grad_norm": 1.78125, + "learning_rate": 1.998897857230623e-05, + "loss": 1.1841, + "step": 360 + }, + { + "epoch": 0.06189853612534025, + "grad_norm": 1.7265625, + "learning_rate": 1.9988893644781312e-05, + "loss": 1.0917, + "step": 361 + }, + { + "epoch": 0.06207000021433011, + "grad_norm": 1.65625, + "learning_rate": 1.9988808391480955e-05, + "loss": 1.1106, + "step": 362 + }, + { + "epoch": 0.06224146430331998, + "grad_norm": 1.7734375, + "learning_rate": 1.998872281240794e-05, + "loss": 1.0291, + "step": 363 + }, + { + "epoch": 0.06241292839230984, + "grad_norm": 1.6328125, + "learning_rate": 1.998863690756506e-05, + "loss": 1.1161, + "step": 364 + }, + { + "epoch": 0.0625843924812997, + "grad_norm": 1.671875, + "learning_rate": 1.998855067695512e-05, + "loss": 1.1159, + "step": 365 + }, + { + "epoch": 0.06275585657028956, + "grad_norm": 1.859375, + "learning_rate": 1.9988464120580925e-05, + "loss": 1.098, + "step": 366 + }, + { + "epoch": 0.06292732065927942, + "grad_norm": 1.7734375, + "learning_rate": 1.9988377238445308e-05, + "loss": 1.1378, + "step": 367 + }, + { + "epoch": 0.06309878474826928, + "grad_norm": 1.6953125, + "learning_rate": 1.9988290030551088e-05, + "loss": 1.0319, + "step": 368 + }, + { + "epoch": 0.06327024883725915, + "grad_norm": 1.734375, + "learning_rate": 1.9988202496901126e-05, + "loss": 1.0653, + "step": 369 + }, + { + "epoch": 0.063441712926249, + "grad_norm": 1.703125, + "learning_rate": 1.9988114637498264e-05, + "loss": 1.0302, + "step": 370 + }, + { + "epoch": 0.06361317701523887, + "grad_norm": 1.7421875, + "learning_rate": 1.9988026452345376e-05, + "loss": 1.0393, + "step": 371 + }, + { + "epoch": 0.06378464110422874, + "grad_norm": 1.7109375, + "learning_rate": 1.9987937941445327e-05, + "loss": 1.0318, + "step": 372 + }, + { + "epoch": 0.0639561051932186, + "grad_norm": 1.7265625, + "learning_rate": 1.9987849104801018e-05, + "loss": 1.1677, + "step": 373 + }, + { + "epoch": 0.06412756928220846, + "grad_norm": 1.765625, + "learning_rate": 1.9987759942415335e-05, + "loss": 1.1635, + "step": 374 + }, + { + "epoch": 0.06429903337119831, + "grad_norm": 1.7421875, + "learning_rate": 1.998767045429119e-05, + "loss": 1.1291, + "step": 375 + }, + { + "epoch": 0.06447049746018818, + "grad_norm": 1.78125, + "learning_rate": 1.99875806404315e-05, + "loss": 1.1369, + "step": 376 + }, + { + "epoch": 0.06464196154917805, + "grad_norm": 1.90625, + "learning_rate": 1.9987490500839198e-05, + "loss": 1.1804, + "step": 377 + }, + { + "epoch": 0.0648134256381679, + "grad_norm": 1.71875, + "learning_rate": 1.998740003551722e-05, + "loss": 1.1491, + "step": 378 + }, + { + "epoch": 0.06498488972715777, + "grad_norm": 1.7109375, + "learning_rate": 1.9987309244468517e-05, + "loss": 1.147, + "step": 379 + }, + { + "epoch": 0.06515635381614764, + "grad_norm": 1.8125, + "learning_rate": 1.9987218127696054e-05, + "loss": 1.0988, + "step": 380 + }, + { + "epoch": 0.06532781790513749, + "grad_norm": 2.078125, + "learning_rate": 1.9987126685202795e-05, 
+ "loss": 1.0034, + "step": 381 + }, + { + "epoch": 0.06549928199412736, + "grad_norm": 1.7421875, + "learning_rate": 1.998703491699173e-05, + "loss": 1.1178, + "step": 382 + }, + { + "epoch": 0.06567074608311722, + "grad_norm": 1.671875, + "learning_rate": 1.998694282306585e-05, + "loss": 1.0409, + "step": 383 + }, + { + "epoch": 0.06584221017210708, + "grad_norm": 1.7421875, + "learning_rate": 1.9986850403428155e-05, + "loss": 1.1258, + "step": 384 + }, + { + "epoch": 0.06601367426109694, + "grad_norm": 1.75, + "learning_rate": 1.998675765808166e-05, + "loss": 1.1269, + "step": 385 + }, + { + "epoch": 0.0661851383500868, + "grad_norm": 1.8203125, + "learning_rate": 1.9986664587029395e-05, + "loss": 1.142, + "step": 386 + }, + { + "epoch": 0.06635660243907666, + "grad_norm": 1.609375, + "learning_rate": 1.9986571190274388e-05, + "loss": 1.1271, + "step": 387 + }, + { + "epoch": 0.06652806652806653, + "grad_norm": 1.78125, + "learning_rate": 1.9986477467819688e-05, + "loss": 1.0659, + "step": 388 + }, + { + "epoch": 0.06669953061705639, + "grad_norm": 1.7578125, + "learning_rate": 1.9986383419668355e-05, + "loss": 1.0759, + "step": 389 + }, + { + "epoch": 0.06687099470604625, + "grad_norm": 1.75, + "learning_rate": 1.998628904582345e-05, + "loss": 1.0522, + "step": 390 + }, + { + "epoch": 0.06704245879503612, + "grad_norm": 1.671875, + "learning_rate": 1.998619434628806e-05, + "loss": 1.1373, + "step": 391 + }, + { + "epoch": 0.06721392288402597, + "grad_norm": 1.6796875, + "learning_rate": 1.9986099321065266e-05, + "loss": 1.1222, + "step": 392 + }, + { + "epoch": 0.06738538697301584, + "grad_norm": 1.65625, + "learning_rate": 1.9986003970158164e-05, + "loss": 1.1167, + "step": 393 + }, + { + "epoch": 0.06755685106200571, + "grad_norm": 1.734375, + "learning_rate": 1.9985908293569873e-05, + "loss": 1.1531, + "step": 394 + }, + { + "epoch": 0.06772831515099556, + "grad_norm": 1.890625, + "learning_rate": 1.998581229130351e-05, + "loss": 1.1829, + "step": 395 + }, + { + "epoch": 0.06789977923998543, + "grad_norm": 1.765625, + "learning_rate": 1.99857159633622e-05, + "loss": 1.201, + "step": 396 + }, + { + "epoch": 0.06807124332897528, + "grad_norm": 1.7109375, + "learning_rate": 1.9985619309749096e-05, + "loss": 1.1298, + "step": 397 + }, + { + "epoch": 0.06824270741796515, + "grad_norm": 1.71875, + "learning_rate": 1.9985522330467343e-05, + "loss": 1.07, + "step": 398 + }, + { + "epoch": 0.06841417150695502, + "grad_norm": 1.6171875, + "learning_rate": 1.9985425025520098e-05, + "loss": 1.0821, + "step": 399 + }, + { + "epoch": 0.06858563559594487, + "grad_norm": 1.7578125, + "learning_rate": 1.998532739491055e-05, + "loss": 1.1798, + "step": 400 + }, + { + "epoch": 0.06875709968493474, + "grad_norm": 1.75, + "learning_rate": 1.998522943864187e-05, + "loss": 1.1651, + "step": 401 + }, + { + "epoch": 0.0689285637739246, + "grad_norm": 1.640625, + "learning_rate": 1.9985131156717258e-05, + "loss": 1.0264, + "step": 402 + }, + { + "epoch": 0.06910002786291446, + "grad_norm": 1.7109375, + "learning_rate": 1.9985032549139917e-05, + "loss": 1.153, + "step": 403 + }, + { + "epoch": 0.06927149195190432, + "grad_norm": 1.703125, + "learning_rate": 1.9984933615913068e-05, + "loss": 1.0631, + "step": 404 + }, + { + "epoch": 0.06944295604089419, + "grad_norm": 1.6796875, + "learning_rate": 1.9984834357039927e-05, + "loss": 1.1154, + "step": 405 + }, + { + "epoch": 0.06961442012988404, + "grad_norm": 1.75, + "learning_rate": 1.9984734772523747e-05, + "loss": 1.1756, + "step": 406 + }, + { + "epoch": 
0.06978588421887391, + "grad_norm": 1.7890625, + "learning_rate": 1.998463486236776e-05, + "loss": 1.126, + "step": 407 + }, + { + "epoch": 0.06995734830786378, + "grad_norm": 1.6953125, + "learning_rate": 1.9984534626575236e-05, + "loss": 1.1288, + "step": 408 + }, + { + "epoch": 0.07012881239685363, + "grad_norm": 1.734375, + "learning_rate": 1.9984434065149435e-05, + "loss": 1.1455, + "step": 409 + }, + { + "epoch": 0.0703002764858435, + "grad_norm": 1.734375, + "learning_rate": 1.9984333178093646e-05, + "loss": 1.0953, + "step": 410 + }, + { + "epoch": 0.07047174057483335, + "grad_norm": 1.7890625, + "learning_rate": 1.9984231965411154e-05, + "loss": 1.1731, + "step": 411 + }, + { + "epoch": 0.07064320466382322, + "grad_norm": 1.828125, + "learning_rate": 1.9984130427105257e-05, + "loss": 1.0778, + "step": 412 + }, + { + "epoch": 0.07081466875281309, + "grad_norm": 1.7890625, + "learning_rate": 1.9984028563179274e-05, + "loss": 1.1916, + "step": 413 + }, + { + "epoch": 0.07098613284180294, + "grad_norm": 1.8359375, + "learning_rate": 1.998392637363652e-05, + "loss": 1.1299, + "step": 414 + }, + { + "epoch": 0.07115759693079281, + "grad_norm": 1.6953125, + "learning_rate": 1.9983823858480333e-05, + "loss": 1.0373, + "step": 415 + }, + { + "epoch": 0.07132906101978267, + "grad_norm": 1.90625, + "learning_rate": 1.9983721017714055e-05, + "loss": 1.1585, + "step": 416 + }, + { + "epoch": 0.07150052510877253, + "grad_norm": 1.71875, + "learning_rate": 1.9983617851341038e-05, + "loss": 1.1074, + "step": 417 + }, + { + "epoch": 0.0716719891977624, + "grad_norm": 1.9375, + "learning_rate": 1.998351435936465e-05, + "loss": 1.2491, + "step": 418 + }, + { + "epoch": 0.07184345328675226, + "grad_norm": 1.671875, + "learning_rate": 1.998341054178826e-05, + "loss": 1.006, + "step": 419 + }, + { + "epoch": 0.07201491737574212, + "grad_norm": 1.8046875, + "learning_rate": 1.998330639861526e-05, + "loss": 1.1695, + "step": 420 + }, + { + "epoch": 0.07218638146473198, + "grad_norm": 1.6953125, + "learning_rate": 1.9983201929849044e-05, + "loss": 1.1085, + "step": 421 + }, + { + "epoch": 0.07235784555372184, + "grad_norm": 1.671875, + "learning_rate": 1.9983097135493024e-05, + "loss": 1.1592, + "step": 422 + }, + { + "epoch": 0.0725293096427117, + "grad_norm": 1.78125, + "learning_rate": 1.998299201555061e-05, + "loss": 1.0885, + "step": 423 + }, + { + "epoch": 0.07270077373170157, + "grad_norm": 1.6953125, + "learning_rate": 1.998288657002523e-05, + "loss": 1.078, + "step": 424 + }, + { + "epoch": 0.07287223782069142, + "grad_norm": 1.75, + "learning_rate": 1.9982780798920337e-05, + "loss": 1.0714, + "step": 425 + }, + { + "epoch": 0.07304370190968129, + "grad_norm": 1.71875, + "learning_rate": 1.9982674702239363e-05, + "loss": 1.1147, + "step": 426 + }, + { + "epoch": 0.07321516599867116, + "grad_norm": 1.71875, + "learning_rate": 1.998256827998578e-05, + "loss": 1.1031, + "step": 427 + }, + { + "epoch": 0.07338663008766101, + "grad_norm": 1.765625, + "learning_rate": 1.9982461532163052e-05, + "loss": 1.0766, + "step": 428 + }, + { + "epoch": 0.07355809417665088, + "grad_norm": 1.90625, + "learning_rate": 1.9982354458774658e-05, + "loss": 1.0823, + "step": 429 + }, + { + "epoch": 0.07372955826564075, + "grad_norm": 1.7109375, + "learning_rate": 1.9982247059824103e-05, + "loss": 1.0892, + "step": 430 + }, + { + "epoch": 0.0739010223546306, + "grad_norm": 1.7578125, + "learning_rate": 1.9982139335314878e-05, + "loss": 1.081, + "step": 431 + }, + { + "epoch": 0.07407248644362047, + "grad_norm": 1.65625, + 
"learning_rate": 1.99820312852505e-05, + "loss": 1.0797, + "step": 432 + }, + { + "epoch": 0.07424395053261032, + "grad_norm": 1.9921875, + "learning_rate": 1.9981922909634495e-05, + "loss": 1.1905, + "step": 433 + }, + { + "epoch": 0.07441541462160019, + "grad_norm": 1.6796875, + "learning_rate": 1.9981814208470394e-05, + "loss": 1.0763, + "step": 434 + }, + { + "epoch": 0.07458687871059005, + "grad_norm": 1.7109375, + "learning_rate": 1.9981705181761747e-05, + "loss": 1.1211, + "step": 435 + }, + { + "epoch": 0.07475834279957991, + "grad_norm": 1.71875, + "learning_rate": 1.99815958295121e-05, + "loss": 1.1659, + "step": 436 + }, + { + "epoch": 0.07492980688856977, + "grad_norm": 1.7265625, + "learning_rate": 1.9981486151725027e-05, + "loss": 1.075, + "step": 437 + }, + { + "epoch": 0.07510127097755964, + "grad_norm": 1.6796875, + "learning_rate": 1.9981376148404112e-05, + "loss": 1.0265, + "step": 438 + }, + { + "epoch": 0.0752727350665495, + "grad_norm": 1.796875, + "learning_rate": 1.9981265819552927e-05, + "loss": 1.1912, + "step": 439 + }, + { + "epoch": 0.07544419915553936, + "grad_norm": 1.7578125, + "learning_rate": 1.998115516517508e-05, + "loss": 1.1765, + "step": 440 + }, + { + "epoch": 0.07561566324452923, + "grad_norm": 1.734375, + "learning_rate": 1.9981044185274174e-05, + "loss": 1.0593, + "step": 441 + }, + { + "epoch": 0.07578712733351908, + "grad_norm": 1.796875, + "learning_rate": 1.9980932879853835e-05, + "loss": 0.98, + "step": 442 + }, + { + "epoch": 0.07595859142250895, + "grad_norm": 1.75, + "learning_rate": 1.998082124891769e-05, + "loss": 1.1933, + "step": 443 + }, + { + "epoch": 0.0761300555114988, + "grad_norm": 1.828125, + "learning_rate": 1.998070929246938e-05, + "loss": 1.1674, + "step": 444 + }, + { + "epoch": 0.07630151960048867, + "grad_norm": 1.734375, + "learning_rate": 1.998059701051255e-05, + "loss": 1.1136, + "step": 445 + }, + { + "epoch": 0.07647298368947854, + "grad_norm": 1.8046875, + "learning_rate": 1.9980484403050876e-05, + "loss": 1.1286, + "step": 446 + }, + { + "epoch": 0.07664444777846839, + "grad_norm": 1.8515625, + "learning_rate": 1.998037147008802e-05, + "loss": 1.1737, + "step": 447 + }, + { + "epoch": 0.07681591186745826, + "grad_norm": 1.7421875, + "learning_rate": 1.998025821162767e-05, + "loss": 1.1155, + "step": 448 + }, + { + "epoch": 0.07698737595644813, + "grad_norm": 1.6171875, + "learning_rate": 1.9980144627673514e-05, + "loss": 1.1302, + "step": 449 + }, + { + "epoch": 0.07715884004543798, + "grad_norm": 1.828125, + "learning_rate": 1.9980030718229262e-05, + "loss": 1.1465, + "step": 450 + }, + { + "epoch": 0.07733030413442785, + "grad_norm": 1.640625, + "learning_rate": 1.9979916483298625e-05, + "loss": 1.0567, + "step": 451 + }, + { + "epoch": 0.07750176822341771, + "grad_norm": 3.21875, + "learning_rate": 1.997980192288533e-05, + "loss": 1.1799, + "step": 452 + }, + { + "epoch": 0.07767323231240757, + "grad_norm": 1.75, + "learning_rate": 1.9979687036993116e-05, + "loss": 1.0321, + "step": 453 + }, + { + "epoch": 0.07784469640139743, + "grad_norm": 1.6484375, + "learning_rate": 1.9979571825625726e-05, + "loss": 1.0242, + "step": 454 + }, + { + "epoch": 0.0780161604903873, + "grad_norm": 1.6640625, + "learning_rate": 1.9979456288786926e-05, + "loss": 1.0746, + "step": 455 + }, + { + "epoch": 0.07818762457937715, + "grad_norm": 1.7265625, + "learning_rate": 1.997934042648047e-05, + "loss": 1.0803, + "step": 456 + }, + { + "epoch": 0.07835908866836702, + "grad_norm": 2.234375, + "learning_rate": 1.9979224238710143e-05, + 
"loss": 1.094, + "step": 457 + }, + { + "epoch": 0.07853055275735688, + "grad_norm": 1.953125, + "learning_rate": 1.997910772547974e-05, + "loss": 1.1616, + "step": 458 + }, + { + "epoch": 0.07870201684634674, + "grad_norm": 6.75, + "learning_rate": 1.9978990886793056e-05, + "loss": 1.0693, + "step": 459 + }, + { + "epoch": 0.07887348093533661, + "grad_norm": 5.75, + "learning_rate": 1.9978873722653896e-05, + "loss": 1.1452, + "step": 460 + }, + { + "epoch": 0.07904494502432646, + "grad_norm": 3.59375, + "learning_rate": 1.997875623306609e-05, + "loss": 1.1542, + "step": 461 + }, + { + "epoch": 0.07921640911331633, + "grad_norm": 1.7265625, + "learning_rate": 1.9978638418033473e-05, + "loss": 1.1215, + "step": 462 + }, + { + "epoch": 0.0793878732023062, + "grad_norm": 1.7109375, + "learning_rate": 1.9978520277559873e-05, + "loss": 1.0942, + "step": 463 + }, + { + "epoch": 0.07955933729129605, + "grad_norm": 1.7265625, + "learning_rate": 1.9978401811649157e-05, + "loss": 1.0332, + "step": 464 + }, + { + "epoch": 0.07973080138028592, + "grad_norm": 1.8828125, + "learning_rate": 1.9978283020305177e-05, + "loss": 1.1336, + "step": 465 + }, + { + "epoch": 0.07990226546927578, + "grad_norm": 1.8359375, + "learning_rate": 1.9978163903531817e-05, + "loss": 1.0954, + "step": 466 + }, + { + "epoch": 0.08007372955826564, + "grad_norm": 1.6796875, + "learning_rate": 1.997804446133296e-05, + "loss": 1.0685, + "step": 467 + }, + { + "epoch": 0.0802451936472555, + "grad_norm": 1.59375, + "learning_rate": 1.9977924693712492e-05, + "loss": 1.1116, + "step": 468 + }, + { + "epoch": 0.08041665773624536, + "grad_norm": 1.7265625, + "learning_rate": 1.9977804600674327e-05, + "loss": 1.082, + "step": 469 + }, + { + "epoch": 0.08058812182523523, + "grad_norm": 1.6796875, + "learning_rate": 1.9977684182222387e-05, + "loss": 1.035, + "step": 470 + }, + { + "epoch": 0.0807595859142251, + "grad_norm": 1.734375, + "learning_rate": 1.9977563438360593e-05, + "loss": 1.1146, + "step": 471 + }, + { + "epoch": 0.08093105000321495, + "grad_norm": 1.78125, + "learning_rate": 1.997744236909288e-05, + "loss": 1.04, + "step": 472 + }, + { + "epoch": 0.08110251409220481, + "grad_norm": 1.7734375, + "learning_rate": 1.9977320974423196e-05, + "loss": 1.1159, + "step": 473 + }, + { + "epoch": 0.08127397818119468, + "grad_norm": 1.7734375, + "learning_rate": 1.997719925435551e-05, + "loss": 1.1449, + "step": 474 + }, + { + "epoch": 0.08144544227018453, + "grad_norm": 1.78125, + "learning_rate": 1.9977077208893788e-05, + "loss": 1.1145, + "step": 475 + }, + { + "epoch": 0.0816169063591744, + "grad_norm": 1.765625, + "learning_rate": 1.9976954838042e-05, + "loss": 1.1232, + "step": 476 + }, + { + "epoch": 0.08178837044816427, + "grad_norm": 1.8515625, + "learning_rate": 1.9976832141804153e-05, + "loss": 1.2183, + "step": 477 + }, + { + "epoch": 0.08195983453715412, + "grad_norm": 1.65625, + "learning_rate": 1.9976709120184234e-05, + "loss": 1.0607, + "step": 478 + }, + { + "epoch": 0.08213129862614399, + "grad_norm": 1.8046875, + "learning_rate": 1.9976585773186267e-05, + "loss": 1.1443, + "step": 479 + }, + { + "epoch": 0.08230276271513384, + "grad_norm": 1.8359375, + "learning_rate": 1.9976462100814268e-05, + "loss": 1.0947, + "step": 480 + }, + { + "epoch": 0.08247422680412371, + "grad_norm": 1.8046875, + "learning_rate": 1.997633810307227e-05, + "loss": 1.065, + "step": 481 + }, + { + "epoch": 0.08264569089311358, + "grad_norm": 1.7578125, + "learning_rate": 1.997621377996432e-05, + "loss": 1.0127, + "step": 482 + }, + { + 
"epoch": 0.08281715498210343, + "grad_norm": 1.953125, + "learning_rate": 1.997608913149447e-05, + "loss": 1.1413, + "step": 483 + }, + { + "epoch": 0.0829886190710933, + "grad_norm": 1.640625, + "learning_rate": 1.997596415766679e-05, + "loss": 1.1008, + "step": 484 + }, + { + "epoch": 0.08316008316008316, + "grad_norm": 1.6875, + "learning_rate": 1.997583885848535e-05, + "loss": 1.1314, + "step": 485 + }, + { + "epoch": 0.08333154724907302, + "grad_norm": 1.7265625, + "learning_rate": 1.9975713233954242e-05, + "loss": 1.1378, + "step": 486 + }, + { + "epoch": 0.08350301133806289, + "grad_norm": 1.890625, + "learning_rate": 1.9975587284077558e-05, + "loss": 1.1202, + "step": 487 + }, + { + "epoch": 0.08367447542705275, + "grad_norm": 1.765625, + "learning_rate": 1.9975461008859408e-05, + "loss": 1.1505, + "step": 488 + }, + { + "epoch": 0.0838459395160426, + "grad_norm": 1.6171875, + "learning_rate": 1.9975334408303916e-05, + "loss": 1.072, + "step": 489 + }, + { + "epoch": 0.08401740360503247, + "grad_norm": 1.8515625, + "learning_rate": 1.9975207482415198e-05, + "loss": 1.1842, + "step": 490 + }, + { + "epoch": 0.08418886769402233, + "grad_norm": 1.7109375, + "learning_rate": 1.9975080231197406e-05, + "loss": 1.1735, + "step": 491 + }, + { + "epoch": 0.0843603317830122, + "grad_norm": 1.6953125, + "learning_rate": 1.997495265465468e-05, + "loss": 1.0909, + "step": 492 + }, + { + "epoch": 0.08453179587200206, + "grad_norm": 1.7421875, + "learning_rate": 1.997482475279119e-05, + "loss": 1.0719, + "step": 493 + }, + { + "epoch": 0.08470325996099191, + "grad_norm": 1.8046875, + "learning_rate": 1.9974696525611102e-05, + "loss": 1.0718, + "step": 494 + }, + { + "epoch": 0.08487472404998178, + "grad_norm": 1.6796875, + "learning_rate": 1.99745679731186e-05, + "loss": 1.1493, + "step": 495 + }, + { + "epoch": 0.08504618813897165, + "grad_norm": 2.515625, + "learning_rate": 1.9974439095317874e-05, + "loss": 1.2006, + "step": 496 + }, + { + "epoch": 0.0852176522279615, + "grad_norm": 1.7109375, + "learning_rate": 1.997430989221313e-05, + "loss": 1.1398, + "step": 497 + }, + { + "epoch": 0.08538911631695137, + "grad_norm": 1.9140625, + "learning_rate": 1.9974180363808577e-05, + "loss": 1.1164, + "step": 498 + }, + { + "epoch": 0.08556058040594124, + "grad_norm": 1.6640625, + "learning_rate": 1.9974050510108447e-05, + "loss": 1.0891, + "step": 499 + }, + { + "epoch": 0.08573204449493109, + "grad_norm": 1.765625, + "learning_rate": 1.9973920331116973e-05, + "loss": 1.1879, + "step": 500 + }, + { + "epoch": 0.08590350858392096, + "grad_norm": 1.796875, + "learning_rate": 1.9973789826838393e-05, + "loss": 1.1017, + "step": 501 + }, + { + "epoch": 0.08607497267291082, + "grad_norm": 1.75, + "learning_rate": 1.997365899727697e-05, + "loss": 1.1061, + "step": 502 + }, + { + "epoch": 0.08624643676190068, + "grad_norm": 1.6796875, + "learning_rate": 1.9973527842436975e-05, + "loss": 1.1116, + "step": 503 + }, + { + "epoch": 0.08641790085089054, + "grad_norm": 1.9375, + "learning_rate": 1.9973396362322677e-05, + "loss": 1.1648, + "step": 504 + }, + { + "epoch": 0.0865893649398804, + "grad_norm": 1.7890625, + "learning_rate": 1.9973264556938365e-05, + "loss": 1.1449, + "step": 505 + }, + { + "epoch": 0.08676082902887026, + "grad_norm": 1.796875, + "learning_rate": 1.9973132426288343e-05, + "loss": 1.1397, + "step": 506 + }, + { + "epoch": 0.08693229311786013, + "grad_norm": 1.7265625, + "learning_rate": 1.9972999970376917e-05, + "loss": 1.0692, + "step": 507 + }, + { + "epoch": 0.08710375720684999, + 
"grad_norm": 1.8125, + "learning_rate": 1.9972867189208403e-05, + "loss": 1.0507, + "step": 508 + }, + { + "epoch": 0.08727522129583985, + "grad_norm": 1.8359375, + "learning_rate": 1.997273408278714e-05, + "loss": 1.1395, + "step": 509 + }, + { + "epoch": 0.08744668538482972, + "grad_norm": 1.734375, + "learning_rate": 1.9972600651117465e-05, + "loss": 1.0305, + "step": 510 + }, + { + "epoch": 0.08761814947381957, + "grad_norm": 1.7578125, + "learning_rate": 1.9972466894203727e-05, + "loss": 1.1262, + "step": 511 + }, + { + "epoch": 0.08778961356280944, + "grad_norm": 1.796875, + "learning_rate": 1.9972332812050293e-05, + "loss": 1.1948, + "step": 512 + }, + { + "epoch": 0.08796107765179931, + "grad_norm": 1.6953125, + "learning_rate": 1.997219840466153e-05, + "loss": 0.9924, + "step": 513 + }, + { + "epoch": 0.08813254174078916, + "grad_norm": 1.78125, + "learning_rate": 1.997206367204183e-05, + "loss": 0.992, + "step": 514 + }, + { + "epoch": 0.08830400582977903, + "grad_norm": 1.71875, + "learning_rate": 1.9971928614195578e-05, + "loss": 1.1893, + "step": 515 + }, + { + "epoch": 0.08847546991876888, + "grad_norm": 1.6796875, + "learning_rate": 1.9971793231127185e-05, + "loss": 1.1084, + "step": 516 + }, + { + "epoch": 0.08864693400775875, + "grad_norm": 1.8125, + "learning_rate": 1.9971657522841064e-05, + "loss": 1.0878, + "step": 517 + }, + { + "epoch": 0.08881839809674862, + "grad_norm": 1.671875, + "learning_rate": 1.9971521489341644e-05, + "loss": 1.0084, + "step": 518 + }, + { + "epoch": 0.08898986218573847, + "grad_norm": 1.6953125, + "learning_rate": 1.9971385130633357e-05, + "loss": 1.0803, + "step": 519 + }, + { + "epoch": 0.08916132627472834, + "grad_norm": 1.609375, + "learning_rate": 1.9971248446720654e-05, + "loss": 1.1219, + "step": 520 + }, + { + "epoch": 0.0893327903637182, + "grad_norm": 1.6484375, + "learning_rate": 1.9971111437607988e-05, + "loss": 1.0364, + "step": 521 + }, + { + "epoch": 0.08950425445270806, + "grad_norm": 1.6484375, + "learning_rate": 1.9970974103299833e-05, + "loss": 1.1015, + "step": 522 + }, + { + "epoch": 0.08967571854169792, + "grad_norm": 1.7421875, + "learning_rate": 1.997083644380066e-05, + "loss": 1.1518, + "step": 523 + }, + { + "epoch": 0.08984718263068779, + "grad_norm": 1.734375, + "learning_rate": 1.9970698459114967e-05, + "loss": 1.1541, + "step": 524 + }, + { + "epoch": 0.09001864671967764, + "grad_norm": 1.671875, + "learning_rate": 1.9970560149247255e-05, + "loss": 0.9705, + "step": 525 + }, + { + "epoch": 0.09019011080866751, + "grad_norm": 1.703125, + "learning_rate": 1.9970421514202025e-05, + "loss": 1.1515, + "step": 526 + }, + { + "epoch": 0.09036157489765737, + "grad_norm": 1.71875, + "learning_rate": 1.9970282553983805e-05, + "loss": 1.1664, + "step": 527 + }, + { + "epoch": 0.09053303898664723, + "grad_norm": 1.671875, + "learning_rate": 1.9970143268597127e-05, + "loss": 1.08, + "step": 528 + }, + { + "epoch": 0.0907045030756371, + "grad_norm": 1.65625, + "learning_rate": 1.9970003658046536e-05, + "loss": 1.129, + "step": 529 + }, + { + "epoch": 0.09087596716462695, + "grad_norm": 1.6328125, + "learning_rate": 1.996986372233658e-05, + "loss": 1.0173, + "step": 530 + }, + { + "epoch": 0.09104743125361682, + "grad_norm": 1.59375, + "learning_rate": 1.9969723461471826e-05, + "loss": 1.0457, + "step": 531 + }, + { + "epoch": 0.09121889534260669, + "grad_norm": 1.6796875, + "learning_rate": 1.9969582875456846e-05, + "loss": 1.1081, + "step": 532 + }, + { + "epoch": 0.09139035943159654, + "grad_norm": 1.734375, + 
"learning_rate": 1.9969441964296227e-05, + "loss": 1.0597, + "step": 533 + }, + { + "epoch": 0.09156182352058641, + "grad_norm": 1.640625, + "learning_rate": 1.9969300727994564e-05, + "loss": 1.0248, + "step": 534 + }, + { + "epoch": 0.09173328760957628, + "grad_norm": 1.7734375, + "learning_rate": 1.9969159166556462e-05, + "loss": 1.1426, + "step": 535 + }, + { + "epoch": 0.09190475169856613, + "grad_norm": 1.7265625, + "learning_rate": 1.9969017279986542e-05, + "loss": 1.1681, + "step": 536 + }, + { + "epoch": 0.092076215787556, + "grad_norm": 1.8203125, + "learning_rate": 1.9968875068289425e-05, + "loss": 1.09, + "step": 537 + }, + { + "epoch": 0.09224767987654586, + "grad_norm": 1.828125, + "learning_rate": 1.9968732531469757e-05, + "loss": 1.1262, + "step": 538 + }, + { + "epoch": 0.09241914396553572, + "grad_norm": 1.65625, + "learning_rate": 1.9968589669532183e-05, + "loss": 1.0702, + "step": 539 + }, + { + "epoch": 0.09259060805452558, + "grad_norm": 1.6875, + "learning_rate": 1.9968446482481357e-05, + "loss": 1.108, + "step": 540 + }, + { + "epoch": 0.09276207214351544, + "grad_norm": 1.7265625, + "learning_rate": 1.9968302970321956e-05, + "loss": 1.1241, + "step": 541 + }, + { + "epoch": 0.0929335362325053, + "grad_norm": 1.7734375, + "learning_rate": 1.9968159133058655e-05, + "loss": 1.1355, + "step": 542 + }, + { + "epoch": 0.09310500032149517, + "grad_norm": 1.7265625, + "learning_rate": 1.996801497069615e-05, + "loss": 1.1341, + "step": 543 + }, + { + "epoch": 0.09327646441048502, + "grad_norm": 1.78125, + "learning_rate": 1.996787048323914e-05, + "loss": 1.1449, + "step": 544 + }, + { + "epoch": 0.09344792849947489, + "grad_norm": 1.671875, + "learning_rate": 1.9967725670692343e-05, + "loss": 1.084, + "step": 545 + }, + { + "epoch": 0.09361939258846476, + "grad_norm": 1.890625, + "learning_rate": 1.996758053306047e-05, + "loss": 1.1289, + "step": 546 + }, + { + "epoch": 0.09379085667745461, + "grad_norm": 1.6171875, + "learning_rate": 1.9967435070348265e-05, + "loss": 1.0199, + "step": 547 + }, + { + "epoch": 0.09396232076644448, + "grad_norm": 1.7578125, + "learning_rate": 1.9967289282560468e-05, + "loss": 1.1458, + "step": 548 + }, + { + "epoch": 0.09413378485543435, + "grad_norm": 1.609375, + "learning_rate": 1.9967143169701836e-05, + "loss": 0.9749, + "step": 549 + }, + { + "epoch": 0.0943052489444242, + "grad_norm": 1.8125, + "learning_rate": 1.996699673177713e-05, + "loss": 1.0688, + "step": 550 + }, + { + "epoch": 0.09447671303341407, + "grad_norm": 1.703125, + "learning_rate": 1.996684996879113e-05, + "loss": 1.0636, + "step": 551 + }, + { + "epoch": 0.09464817712240392, + "grad_norm": 1.7109375, + "learning_rate": 1.9966702880748618e-05, + "loss": 1.1019, + "step": 552 + }, + { + "epoch": 0.09481964121139379, + "grad_norm": 1.6875, + "learning_rate": 1.9966555467654395e-05, + "loss": 1.1146, + "step": 553 + }, + { + "epoch": 0.09499110530038365, + "grad_norm": 1.890625, + "learning_rate": 1.996640772951327e-05, + "loss": 1.0833, + "step": 554 + }, + { + "epoch": 0.09516256938937351, + "grad_norm": 1.6640625, + "learning_rate": 1.9966259666330055e-05, + "loss": 1.0641, + "step": 555 + }, + { + "epoch": 0.09533403347836338, + "grad_norm": 1.65625, + "learning_rate": 1.9966111278109586e-05, + "loss": 1.016, + "step": 556 + }, + { + "epoch": 0.09550549756735324, + "grad_norm": 1.7890625, + "learning_rate": 1.99659625648567e-05, + "loss": 1.1166, + "step": 557 + }, + { + "epoch": 0.0956769616563431, + "grad_norm": 1.671875, + "learning_rate": 1.9965813526576246e-05, + 
"loss": 1.0452, + "step": 558 + }, + { + "epoch": 0.09584842574533296, + "grad_norm": 1.7734375, + "learning_rate": 1.996566416327308e-05, + "loss": 1.0721, + "step": 559 + }, + { + "epoch": 0.09601988983432283, + "grad_norm": 1.65625, + "learning_rate": 1.9965514474952084e-05, + "loss": 1.0086, + "step": 560 + }, + { + "epoch": 0.09619135392331268, + "grad_norm": 1.71875, + "learning_rate": 1.996536446161813e-05, + "loss": 1.0513, + "step": 561 + }, + { + "epoch": 0.09636281801230255, + "grad_norm": 1.828125, + "learning_rate": 1.996521412327612e-05, + "loss": 1.1228, + "step": 562 + }, + { + "epoch": 0.0965342821012924, + "grad_norm": 1.6796875, + "learning_rate": 1.996506345993095e-05, + "loss": 1.1084, + "step": 563 + }, + { + "epoch": 0.09670574619028227, + "grad_norm": 1.8984375, + "learning_rate": 1.9964912471587533e-05, + "loss": 1.1142, + "step": 564 + }, + { + "epoch": 0.09687721027927214, + "grad_norm": 1.6875, + "learning_rate": 1.99647611582508e-05, + "loss": 1.05, + "step": 565 + }, + { + "epoch": 0.09704867436826199, + "grad_norm": 1.8671875, + "learning_rate": 1.996460951992568e-05, + "loss": 1.1963, + "step": 566 + }, + { + "epoch": 0.09722013845725186, + "grad_norm": 1.6484375, + "learning_rate": 1.996445755661712e-05, + "loss": 1.0478, + "step": 567 + }, + { + "epoch": 0.09739160254624173, + "grad_norm": 1.75, + "learning_rate": 1.9964305268330075e-05, + "loss": 1.0692, + "step": 568 + }, + { + "epoch": 0.09756306663523158, + "grad_norm": 1.7421875, + "learning_rate": 1.9964152655069514e-05, + "loss": 1.0776, + "step": 569 + }, + { + "epoch": 0.09773453072422145, + "grad_norm": 1.8125, + "learning_rate": 1.9963999716840417e-05, + "loss": 1.0419, + "step": 570 + }, + { + "epoch": 0.09790599481321131, + "grad_norm": 1.6953125, + "learning_rate": 1.9963846453647763e-05, + "loss": 1.1413, + "step": 571 + }, + { + "epoch": 0.09807745890220117, + "grad_norm": 1.8203125, + "learning_rate": 1.996369286549656e-05, + "loss": 1.073, + "step": 572 + }, + { + "epoch": 0.09824892299119103, + "grad_norm": 1.7421875, + "learning_rate": 1.9963538952391808e-05, + "loss": 1.1669, + "step": 573 + }, + { + "epoch": 0.09842038708018089, + "grad_norm": 1.7578125, + "learning_rate": 1.9963384714338537e-05, + "loss": 1.1771, + "step": 574 + }, + { + "epoch": 0.09859185116917075, + "grad_norm": 1.859375, + "learning_rate": 1.996323015134177e-05, + "loss": 1.1514, + "step": 575 + }, + { + "epoch": 0.09876331525816062, + "grad_norm": 1.7734375, + "learning_rate": 1.9963075263406545e-05, + "loss": 1.163, + "step": 576 + }, + { + "epoch": 0.09893477934715048, + "grad_norm": 1.6875, + "learning_rate": 1.9962920050537922e-05, + "loss": 1.1313, + "step": 577 + }, + { + "epoch": 0.09910624343614034, + "grad_norm": 1.9453125, + "learning_rate": 1.996276451274096e-05, + "loss": 1.0454, + "step": 578 + }, + { + "epoch": 0.09927770752513021, + "grad_norm": 1.75, + "learning_rate": 1.996260865002073e-05, + "loss": 1.2105, + "step": 579 + }, + { + "epoch": 0.09944917161412006, + "grad_norm": 1.6484375, + "learning_rate": 1.996245246238232e-05, + "loss": 1.1686, + "step": 580 + }, + { + "epoch": 0.09962063570310993, + "grad_norm": 1.6953125, + "learning_rate": 1.9962295949830814e-05, + "loss": 1.1004, + "step": 581 + }, + { + "epoch": 0.0997920997920998, + "grad_norm": 1.765625, + "learning_rate": 1.9962139112371324e-05, + "loss": 1.2155, + "step": 582 + }, + { + "epoch": 0.09996356388108965, + "grad_norm": 1.703125, + "learning_rate": 1.9961981950008966e-05, + "loss": 1.1185, + "step": 583 + }, + { + "epoch": 
0.10013502797007952, + "grad_norm": 1.65625, + "learning_rate": 1.9961824462748863e-05, + "loss": 1.0739, + "step": 584 + }, + { + "epoch": 0.10030649205906939, + "grad_norm": 1.65625, + "learning_rate": 1.996166665059615e-05, + "loss": 1.1047, + "step": 585 + }, + { + "epoch": 0.10047795614805924, + "grad_norm": 1.6484375, + "learning_rate": 1.9961508513555975e-05, + "loss": 1.1484, + "step": 586 + }, + { + "epoch": 0.1006494202370491, + "grad_norm": 1.671875, + "learning_rate": 1.9961350051633497e-05, + "loss": 1.0129, + "step": 587 + }, + { + "epoch": 0.10082088432603896, + "grad_norm": 1.734375, + "learning_rate": 1.9961191264833884e-05, + "loss": 1.0761, + "step": 588 + }, + { + "epoch": 0.10099234841502883, + "grad_norm": 1.859375, + "learning_rate": 1.9961032153162312e-05, + "loss": 1.1838, + "step": 589 + }, + { + "epoch": 0.1011638125040187, + "grad_norm": 1.6953125, + "learning_rate": 1.9960872716623972e-05, + "loss": 1.0953, + "step": 590 + }, + { + "epoch": 0.10133527659300855, + "grad_norm": 1.71875, + "learning_rate": 1.9960712955224065e-05, + "loss": 1.0843, + "step": 591 + }, + { + "epoch": 0.10150674068199841, + "grad_norm": 1.6953125, + "learning_rate": 1.9960552868967796e-05, + "loss": 1.2278, + "step": 592 + }, + { + "epoch": 0.10167820477098828, + "grad_norm": 1.6953125, + "learning_rate": 1.9960392457860394e-05, + "loss": 1.1326, + "step": 593 + }, + { + "epoch": 0.10184966885997813, + "grad_norm": 1.96875, + "learning_rate": 1.9960231721907083e-05, + "loss": 1.1644, + "step": 594 + }, + { + "epoch": 0.102021132948968, + "grad_norm": 1.6875, + "learning_rate": 1.996007066111311e-05, + "loss": 1.1278, + "step": 595 + }, + { + "epoch": 0.10219259703795787, + "grad_norm": 1.7890625, + "learning_rate": 1.995990927548373e-05, + "loss": 1.1009, + "step": 596 + }, + { + "epoch": 0.10236406112694772, + "grad_norm": 1.7421875, + "learning_rate": 1.99597475650242e-05, + "loss": 1.1035, + "step": 597 + }, + { + "epoch": 0.10253552521593759, + "grad_norm": 1.6953125, + "learning_rate": 1.9959585529739796e-05, + "loss": 1.1012, + "step": 598 + }, + { + "epoch": 0.10270698930492744, + "grad_norm": 1.65625, + "learning_rate": 1.9959423169635804e-05, + "loss": 1.1065, + "step": 599 + }, + { + "epoch": 0.10287845339391731, + "grad_norm": 1.6875, + "learning_rate": 1.995926048471752e-05, + "loss": 1.1969, + "step": 600 + }, + { + "epoch": 0.10304991748290718, + "grad_norm": 1.7734375, + "learning_rate": 1.9959097474990248e-05, + "loss": 1.163, + "step": 601 + }, + { + "epoch": 0.10322138157189703, + "grad_norm": 1.8046875, + "learning_rate": 1.9958934140459302e-05, + "loss": 1.1103, + "step": 602 + }, + { + "epoch": 0.1033928456608869, + "grad_norm": 1.6953125, + "learning_rate": 1.9958770481130016e-05, + "loss": 1.1603, + "step": 603 + }, + { + "epoch": 0.10356430974987677, + "grad_norm": 1.765625, + "learning_rate": 1.995860649700772e-05, + "loss": 1.1266, + "step": 604 + }, + { + "epoch": 0.10373577383886662, + "grad_norm": 1.8515625, + "learning_rate": 1.995844218809777e-05, + "loss": 1.1011, + "step": 605 + }, + { + "epoch": 0.10390723792785649, + "grad_norm": 1.78125, + "learning_rate": 1.9958277554405516e-05, + "loss": 1.0317, + "step": 606 + }, + { + "epoch": 0.10407870201684635, + "grad_norm": 1.8359375, + "learning_rate": 1.9958112595936333e-05, + "loss": 1.0968, + "step": 607 + }, + { + "epoch": 0.1042501661058362, + "grad_norm": 1.6640625, + "learning_rate": 1.9957947312695602e-05, + "loss": 1.0747, + "step": 608 + }, + { + "epoch": 0.10442163019482607, + "grad_norm": 
1.7109375, + "learning_rate": 1.9957781704688707e-05, + "loss": 1.0624, + "step": 609 + }, + { + "epoch": 0.10459309428381593, + "grad_norm": 1.8125, + "learning_rate": 1.9957615771921055e-05, + "loss": 1.0525, + "step": 610 + }, + { + "epoch": 0.1047645583728058, + "grad_norm": 2.078125, + "learning_rate": 1.9957449514398054e-05, + "loss": 1.1324, + "step": 611 + }, + { + "epoch": 0.10493602246179566, + "grad_norm": 1.609375, + "learning_rate": 1.9957282932125132e-05, + "loss": 1.0306, + "step": 612 + }, + { + "epoch": 0.10510748655078551, + "grad_norm": 1.671875, + "learning_rate": 1.9957116025107713e-05, + "loss": 1.1239, + "step": 613 + }, + { + "epoch": 0.10527895063977538, + "grad_norm": 1.8828125, + "learning_rate": 1.9956948793351248e-05, + "loss": 1.0913, + "step": 614 + }, + { + "epoch": 0.10545041472876525, + "grad_norm": 1.7109375, + "learning_rate": 1.995678123686119e-05, + "loss": 1.0655, + "step": 615 + }, + { + "epoch": 0.1056218788177551, + "grad_norm": 1.6953125, + "learning_rate": 1.9956613355643e-05, + "loss": 1.0825, + "step": 616 + }, + { + "epoch": 0.10579334290674497, + "grad_norm": 1.90625, + "learning_rate": 1.9956445149702154e-05, + "loss": 1.1483, + "step": 617 + }, + { + "epoch": 0.10596480699573484, + "grad_norm": 2.34375, + "learning_rate": 1.9956276619044145e-05, + "loss": 1.1081, + "step": 618 + }, + { + "epoch": 0.10613627108472469, + "grad_norm": 1.734375, + "learning_rate": 1.9956107763674456e-05, + "loss": 1.1174, + "step": 619 + }, + { + "epoch": 0.10630773517371456, + "grad_norm": 1.6796875, + "learning_rate": 1.9955938583598606e-05, + "loss": 1.1238, + "step": 620 + }, + { + "epoch": 0.10647919926270442, + "grad_norm": 1.7890625, + "learning_rate": 1.9955769078822106e-05, + "loss": 1.1835, + "step": 621 + }, + { + "epoch": 0.10665066335169428, + "grad_norm": 1.8046875, + "learning_rate": 1.9955599249350487e-05, + "loss": 1.1702, + "step": 622 + }, + { + "epoch": 0.10682212744068414, + "grad_norm": 1.7421875, + "learning_rate": 1.995542909518929e-05, + "loss": 1.0848, + "step": 623 + }, + { + "epoch": 0.106993591529674, + "grad_norm": 1.7734375, + "learning_rate": 1.995525861634406e-05, + "loss": 1.106, + "step": 624 + }, + { + "epoch": 0.10716505561866387, + "grad_norm": 1.6875, + "learning_rate": 1.9955087812820357e-05, + "loss": 1.0085, + "step": 625 + }, + { + "epoch": 0.10733651970765373, + "grad_norm": 1.7578125, + "learning_rate": 1.9954916684623753e-05, + "loss": 1.083, + "step": 626 + }, + { + "epoch": 0.10750798379664359, + "grad_norm": 1.7578125, + "learning_rate": 1.995474523175983e-05, + "loss": 1.1413, + "step": 627 + }, + { + "epoch": 0.10767944788563345, + "grad_norm": 1.578125, + "learning_rate": 1.9954573454234177e-05, + "loss": 1.0141, + "step": 628 + }, + { + "epoch": 0.10785091197462332, + "grad_norm": 10.1875, + "learning_rate": 1.99544013520524e-05, + "loss": 1.1188, + "step": 629 + }, + { + "epoch": 0.10802237606361317, + "grad_norm": 1.8125, + "learning_rate": 1.9954228925220112e-05, + "loss": 1.0341, + "step": 630 + }, + { + "epoch": 0.10819384015260304, + "grad_norm": 1.78125, + "learning_rate": 1.9954056173742933e-05, + "loss": 1.1826, + "step": 631 + }, + { + "epoch": 0.10836530424159291, + "grad_norm": 1.609375, + "learning_rate": 1.99538830976265e-05, + "loss": 1.0859, + "step": 632 + }, + { + "epoch": 0.10853676833058276, + "grad_norm": 1.78125, + "learning_rate": 1.9953709696876452e-05, + "loss": 1.0958, + "step": 633 + }, + { + "epoch": 0.10870823241957263, + "grad_norm": 1.8125, + "learning_rate": 
1.9953535971498452e-05, + "loss": 1.1274, + "step": 634 + }, + { + "epoch": 0.10887969650856248, + "grad_norm": 1.71875, + "learning_rate": 1.9953361921498163e-05, + "loss": 1.186, + "step": 635 + }, + { + "epoch": 0.10905116059755235, + "grad_norm": 1.734375, + "learning_rate": 1.9953187546881257e-05, + "loss": 1.0017, + "step": 636 + }, + { + "epoch": 0.10922262468654222, + "grad_norm": 1.71875, + "learning_rate": 1.995301284765343e-05, + "loss": 1.1105, + "step": 637 + }, + { + "epoch": 0.10939408877553207, + "grad_norm": 1.7109375, + "learning_rate": 1.9952837823820373e-05, + "loss": 1.0799, + "step": 638 + }, + { + "epoch": 0.10956555286452194, + "grad_norm": 1.734375, + "learning_rate": 1.9952662475387794e-05, + "loss": 1.0469, + "step": 639 + }, + { + "epoch": 0.1097370169535118, + "grad_norm": 1.625, + "learning_rate": 1.9952486802361416e-05, + "loss": 1.0985, + "step": 640 + }, + { + "epoch": 0.10990848104250166, + "grad_norm": 1.828125, + "learning_rate": 1.995231080474696e-05, + "loss": 1.1092, + "step": 641 + }, + { + "epoch": 0.11007994513149152, + "grad_norm": 1.640625, + "learning_rate": 1.9952134482550175e-05, + "loss": 1.0038, + "step": 642 + }, + { + "epoch": 0.11025140922048139, + "grad_norm": 1.703125, + "learning_rate": 1.9951957835776808e-05, + "loss": 1.1089, + "step": 643 + }, + { + "epoch": 0.11042287330947125, + "grad_norm": 1.6875, + "learning_rate": 1.9951780864432623e-05, + "loss": 1.0895, + "step": 644 + }, + { + "epoch": 0.11059433739846111, + "grad_norm": 1.65625, + "learning_rate": 1.9951603568523387e-05, + "loss": 1.0533, + "step": 645 + }, + { + "epoch": 0.11076580148745097, + "grad_norm": 1.8046875, + "learning_rate": 1.9951425948054886e-05, + "loss": 1.1723, + "step": 646 + }, + { + "epoch": 0.11093726557644083, + "grad_norm": 1.8203125, + "learning_rate": 1.995124800303291e-05, + "loss": 1.2228, + "step": 647 + }, + { + "epoch": 0.1111087296654307, + "grad_norm": 1.8359375, + "learning_rate": 1.9951069733463262e-05, + "loss": 1.1278, + "step": 648 + }, + { + "epoch": 0.11128019375442055, + "grad_norm": 1.8203125, + "learning_rate": 1.995089113935176e-05, + "loss": 1.1156, + "step": 649 + }, + { + "epoch": 0.11145165784341042, + "grad_norm": 1.7265625, + "learning_rate": 1.9950712220704224e-05, + "loss": 1.1134, + "step": 650 + }, + { + "epoch": 0.11162312193240029, + "grad_norm": 1.6953125, + "learning_rate": 1.9950532977526493e-05, + "loss": 1.1028, + "step": 651 + }, + { + "epoch": 0.11179458602139014, + "grad_norm": 1.828125, + "learning_rate": 1.9950353409824412e-05, + "loss": 1.0454, + "step": 652 + }, + { + "epoch": 0.11196605011038001, + "grad_norm": 1.859375, + "learning_rate": 1.995017351760384e-05, + "loss": 1.0748, + "step": 653 + }, + { + "epoch": 0.11213751419936988, + "grad_norm": 1.8359375, + "learning_rate": 1.9949993300870637e-05, + "loss": 1.1675, + "step": 654 + }, + { + "epoch": 0.11230897828835973, + "grad_norm": 1.7890625, + "learning_rate": 1.9949812759630683e-05, + "loss": 1.1367, + "step": 655 + }, + { + "epoch": 0.1124804423773496, + "grad_norm": 1.75, + "learning_rate": 1.9949631893889866e-05, + "loss": 1.0374, + "step": 656 + }, + { + "epoch": 0.11265190646633945, + "grad_norm": 1.8125, + "learning_rate": 1.994945070365409e-05, + "loss": 1.1405, + "step": 657 + }, + { + "epoch": 0.11282337055532932, + "grad_norm": 1.78125, + "learning_rate": 1.994926918892926e-05, + "loss": 1.1399, + "step": 658 + }, + { + "epoch": 0.11299483464431918, + "grad_norm": 2.0625, + "learning_rate": 1.9949087349721296e-05, + "loss": 1.1436, + 
"step": 659 + }, + { + "epoch": 0.11316629873330904, + "grad_norm": 1.7265625, + "learning_rate": 1.9948905186036128e-05, + "loss": 1.0613, + "step": 660 + }, + { + "epoch": 0.1133377628222989, + "grad_norm": 1.6484375, + "learning_rate": 1.9948722697879696e-05, + "loss": 1.1771, + "step": 661 + }, + { + "epoch": 0.11350922691128877, + "grad_norm": 1.8359375, + "learning_rate": 1.9948539885257956e-05, + "loss": 1.1123, + "step": 662 + }, + { + "epoch": 0.11368069100027862, + "grad_norm": 1.6953125, + "learning_rate": 1.9948356748176867e-05, + "loss": 1.0366, + "step": 663 + }, + { + "epoch": 0.11385215508926849, + "grad_norm": 1.6875, + "learning_rate": 1.9948173286642403e-05, + "loss": 1.0478, + "step": 664 + }, + { + "epoch": 0.11402361917825836, + "grad_norm": 1.734375, + "learning_rate": 1.9947989500660544e-05, + "loss": 1.0948, + "step": 665 + }, + { + "epoch": 0.11419508326724821, + "grad_norm": 1.6484375, + "learning_rate": 1.994780539023729e-05, + "loss": 1.0645, + "step": 666 + }, + { + "epoch": 0.11436654735623808, + "grad_norm": 1.7734375, + "learning_rate": 1.9947620955378642e-05, + "loss": 1.0534, + "step": 667 + }, + { + "epoch": 0.11453801144522795, + "grad_norm": 1.7421875, + "learning_rate": 1.9947436196090614e-05, + "loss": 1.077, + "step": 668 + }, + { + "epoch": 0.1147094755342178, + "grad_norm": 1.7890625, + "learning_rate": 1.9947251112379233e-05, + "loss": 1.0538, + "step": 669 + }, + { + "epoch": 0.11488093962320767, + "grad_norm": 1.6796875, + "learning_rate": 1.9947065704250533e-05, + "loss": 1.0747, + "step": 670 + }, + { + "epoch": 0.11505240371219752, + "grad_norm": 1.75, + "learning_rate": 1.994687997171057e-05, + "loss": 1.0738, + "step": 671 + }, + { + "epoch": 0.11522386780118739, + "grad_norm": 1.6953125, + "learning_rate": 1.994669391476539e-05, + "loss": 1.109, + "step": 672 + }, + { + "epoch": 0.11539533189017726, + "grad_norm": 1.5703125, + "learning_rate": 1.9946507533421067e-05, + "loss": 1.0449, + "step": 673 + }, + { + "epoch": 0.11556679597916711, + "grad_norm": 1.7109375, + "learning_rate": 1.9946320827683676e-05, + "loss": 1.046, + "step": 674 + }, + { + "epoch": 0.11573826006815698, + "grad_norm": 1.6796875, + "learning_rate": 1.994613379755931e-05, + "loss": 1.0857, + "step": 675 + }, + { + "epoch": 0.11590972415714684, + "grad_norm": 1.625, + "learning_rate": 1.994594644305407e-05, + "loss": 1.0279, + "step": 676 + }, + { + "epoch": 0.1160811882461367, + "grad_norm": 1.7421875, + "learning_rate": 1.9945758764174056e-05, + "loss": 1.1194, + "step": 677 + }, + { + "epoch": 0.11625265233512656, + "grad_norm": 1.7109375, + "learning_rate": 1.9945570760925403e-05, + "loss": 1.0465, + "step": 678 + }, + { + "epoch": 0.11642411642411643, + "grad_norm": 1.875, + "learning_rate": 1.9945382433314234e-05, + "loss": 1.042, + "step": 679 + }, + { + "epoch": 0.11659558051310628, + "grad_norm": 1.7734375, + "learning_rate": 1.9945193781346695e-05, + "loss": 1.0926, + "step": 680 + }, + { + "epoch": 0.11676704460209615, + "grad_norm": 1.7421875, + "learning_rate": 1.9945004805028933e-05, + "loss": 1.0987, + "step": 681 + }, + { + "epoch": 0.116938508691086, + "grad_norm": 1.7265625, + "learning_rate": 1.9944815504367117e-05, + "loss": 1.1226, + "step": 682 + }, + { + "epoch": 0.11710997278007587, + "grad_norm": 1.6953125, + "learning_rate": 1.9944625879367417e-05, + "loss": 1.0064, + "step": 683 + }, + { + "epoch": 0.11728143686906574, + "grad_norm": 1.7890625, + "learning_rate": 1.994443593003602e-05, + "loss": 1.1373, + "step": 684 + }, + { + "epoch": 
0.11745290095805559, + "grad_norm": 1.734375, + "learning_rate": 1.9944245656379123e-05, + "loss": 1.1338, + "step": 685 + }, + { + "epoch": 0.11762436504704546, + "grad_norm": 1.7734375, + "learning_rate": 1.9944055058402923e-05, + "loss": 1.0695, + "step": 686 + }, + { + "epoch": 0.11779582913603533, + "grad_norm": 1.75, + "learning_rate": 1.9943864136113647e-05, + "loss": 1.1445, + "step": 687 + }, + { + "epoch": 0.11796729322502518, + "grad_norm": 1.765625, + "learning_rate": 1.9943672889517515e-05, + "loss": 1.1899, + "step": 688 + }, + { + "epoch": 0.11813875731401505, + "grad_norm": 1.71875, + "learning_rate": 1.9943481318620765e-05, + "loss": 1.0404, + "step": 689 + }, + { + "epoch": 0.11831022140300491, + "grad_norm": 1.8046875, + "learning_rate": 1.9943289423429645e-05, + "loss": 1.0936, + "step": 690 + }, + { + "epoch": 0.11848168549199477, + "grad_norm": 1.7578125, + "learning_rate": 1.9943097203950413e-05, + "loss": 1.0992, + "step": 691 + }, + { + "epoch": 0.11865314958098463, + "grad_norm": 1.7265625, + "learning_rate": 1.994290466018934e-05, + "loss": 1.0978, + "step": 692 + }, + { + "epoch": 0.11882461366997449, + "grad_norm": 1.7890625, + "learning_rate": 1.9942711792152708e-05, + "loss": 1.1341, + "step": 693 + }, + { + "epoch": 0.11899607775896436, + "grad_norm": 7.25, + "learning_rate": 1.99425185998468e-05, + "loss": 1.1517, + "step": 694 + }, + { + "epoch": 0.11916754184795422, + "grad_norm": 1.890625, + "learning_rate": 1.9942325083277917e-05, + "loss": 1.155, + "step": 695 + }, + { + "epoch": 0.11933900593694408, + "grad_norm": 1.6953125, + "learning_rate": 1.994213124245238e-05, + "loss": 1.0425, + "step": 696 + }, + { + "epoch": 0.11951047002593394, + "grad_norm": 1.7890625, + "learning_rate": 1.99419370773765e-05, + "loss": 1.0609, + "step": 697 + }, + { + "epoch": 0.11968193411492381, + "grad_norm": 1.6875, + "learning_rate": 1.9941742588056616e-05, + "loss": 1.0257, + "step": 698 + }, + { + "epoch": 0.11985339820391366, + "grad_norm": 1.7578125, + "learning_rate": 1.994154777449907e-05, + "loss": 1.1413, + "step": 699 + }, + { + "epoch": 0.12002486229290353, + "grad_norm": 1.7890625, + "learning_rate": 1.9941352636710215e-05, + "loss": 1.1103, + "step": 700 + }, + { + "epoch": 0.12002486229290353, + "eval_loss": 0.9447146058082581, + "eval_runtime": 837.1484, + "eval_samples_per_second": 2.985, + "eval_steps_per_second": 2.985, + "step": 700 + }, + { + "epoch": 0.1201963263818934, + "grad_norm": 1.6171875, + "learning_rate": 1.9941157174696412e-05, + "loss": 1.1096, + "step": 701 + }, + { + "epoch": 0.12036779047088325, + "grad_norm": 1.765625, + "learning_rate": 1.9940961388464042e-05, + "loss": 1.0937, + "step": 702 + }, + { + "epoch": 0.12053925455987312, + "grad_norm": 1.7109375, + "learning_rate": 1.9940765278019484e-05, + "loss": 1.0675, + "step": 703 + }, + { + "epoch": 0.12071071864886297, + "grad_norm": 1.75, + "learning_rate": 1.994056884336914e-05, + "loss": 1.0785, + "step": 704 + }, + { + "epoch": 0.12088218273785284, + "grad_norm": 1.7109375, + "learning_rate": 1.994037208451941e-05, + "loss": 1.1388, + "step": 705 + }, + { + "epoch": 0.1210536468268427, + "grad_norm": 1.7421875, + "learning_rate": 1.9940175001476715e-05, + "loss": 1.1303, + "step": 706 + }, + { + "epoch": 0.12122511091583256, + "grad_norm": 1.6484375, + "learning_rate": 1.9939977594247483e-05, + "loss": 1.0755, + "step": 707 + }, + { + "epoch": 0.12139657500482243, + "grad_norm": 1.6328125, + "learning_rate": 1.9939779862838153e-05, + "loss": 1.1185, + "step": 708 + }, + { + 
"epoch": 0.1215680390938123, + "grad_norm": 1.8828125, + "learning_rate": 1.9939581807255168e-05, + "loss": 1.1227, + "step": 709 + }, + { + "epoch": 0.12173950318280215, + "grad_norm": 1.828125, + "learning_rate": 1.9939383427504994e-05, + "loss": 1.0672, + "step": 710 + }, + { + "epoch": 0.12191096727179201, + "grad_norm": 1.6875, + "learning_rate": 1.9939184723594097e-05, + "loss": 1.076, + "step": 711 + }, + { + "epoch": 0.12208243136078188, + "grad_norm": 1.78125, + "learning_rate": 1.9938985695528957e-05, + "loss": 1.137, + "step": 712 + }, + { + "epoch": 0.12225389544977174, + "grad_norm": 1.78125, + "learning_rate": 1.993878634331607e-05, + "loss": 1.1944, + "step": 713 + }, + { + "epoch": 0.1224253595387616, + "grad_norm": 1.7578125, + "learning_rate": 1.9938586666961933e-05, + "loss": 1.1928, + "step": 714 + }, + { + "epoch": 0.12259682362775147, + "grad_norm": 1.6953125, + "learning_rate": 1.993838666647306e-05, + "loss": 1.0469, + "step": 715 + }, + { + "epoch": 0.12276828771674132, + "grad_norm": 1.734375, + "learning_rate": 1.9938186341855972e-05, + "loss": 1.0558, + "step": 716 + }, + { + "epoch": 0.12293975180573119, + "grad_norm": 1.7734375, + "learning_rate": 1.9937985693117205e-05, + "loss": 1.101, + "step": 717 + }, + { + "epoch": 0.12311121589472104, + "grad_norm": 1.921875, + "learning_rate": 1.99377847202633e-05, + "loss": 1.1684, + "step": 718 + }, + { + "epoch": 0.12328267998371091, + "grad_norm": 1.65625, + "learning_rate": 1.9937583423300816e-05, + "loss": 0.9685, + "step": 719 + }, + { + "epoch": 0.12345414407270078, + "grad_norm": 1.734375, + "learning_rate": 1.993738180223631e-05, + "loss": 1.1437, + "step": 720 + }, + { + "epoch": 0.12362560816169063, + "grad_norm": 1.71875, + "learning_rate": 1.9937179857076368e-05, + "loss": 1.0985, + "step": 721 + }, + { + "epoch": 0.1237970722506805, + "grad_norm": 1.6953125, + "learning_rate": 1.993697758782757e-05, + "loss": 1.028, + "step": 722 + }, + { + "epoch": 0.12396853633967037, + "grad_norm": 1.7265625, + "learning_rate": 1.9936774994496512e-05, + "loss": 1.1075, + "step": 723 + }, + { + "epoch": 0.12414000042866022, + "grad_norm": 1.828125, + "learning_rate": 1.99365720770898e-05, + "loss": 1.0201, + "step": 724 + }, + { + "epoch": 0.12431146451765009, + "grad_norm": 1.7578125, + "learning_rate": 1.993636883561406e-05, + "loss": 1.1393, + "step": 725 + }, + { + "epoch": 0.12448292860663995, + "grad_norm": 1.7265625, + "learning_rate": 1.9936165270075915e-05, + "loss": 1.0818, + "step": 726 + }, + { + "epoch": 0.1246543926956298, + "grad_norm": 1.7421875, + "learning_rate": 1.9935961380482003e-05, + "loss": 1.0204, + "step": 727 + }, + { + "epoch": 0.12482585678461967, + "grad_norm": 1.65625, + "learning_rate": 1.9935757166838974e-05, + "loss": 1.1329, + "step": 728 + }, + { + "epoch": 0.12499732087360953, + "grad_norm": 1.7421875, + "learning_rate": 1.993555262915349e-05, + "loss": 1.158, + "step": 729 + }, + { + "epoch": 0.1251687849625994, + "grad_norm": 1.671875, + "learning_rate": 1.9935347767432224e-05, + "loss": 1.151, + "step": 730 + }, + { + "epoch": 0.12534024905158925, + "grad_norm": 1.6328125, + "learning_rate": 1.993514258168185e-05, + "loss": 1.1295, + "step": 731 + }, + { + "epoch": 0.12551171314057913, + "grad_norm": 1.6328125, + "learning_rate": 1.9934937071909064e-05, + "loss": 1.1672, + "step": 732 + }, + { + "epoch": 0.12568317722956898, + "grad_norm": 1.7578125, + "learning_rate": 1.993473123812057e-05, + "loss": 1.0816, + "step": 733 + }, + { + "epoch": 0.12585464131855884, + "grad_norm": 
1.71875, + "learning_rate": 1.993452508032308e-05, + "loss": 1.0609, + "step": 734 + }, + { + "epoch": 0.12602610540754872, + "grad_norm": 1.6015625, + "learning_rate": 1.9934318598523315e-05, + "loss": 1.0721, + "step": 735 + }, + { + "epoch": 0.12619756949653857, + "grad_norm": 1.7265625, + "learning_rate": 1.9934111792728015e-05, + "loss": 1.1286, + "step": 736 + }, + { + "epoch": 0.12636903358552842, + "grad_norm": 1.78125, + "learning_rate": 1.993390466294392e-05, + "loss": 1.134, + "step": 737 + }, + { + "epoch": 0.1265404976745183, + "grad_norm": 1.6875, + "learning_rate": 1.9933697209177785e-05, + "loss": 1.1024, + "step": 738 + }, + { + "epoch": 0.12671196176350816, + "grad_norm": 1.6171875, + "learning_rate": 1.9933489431436375e-05, + "loss": 1.0655, + "step": 739 + }, + { + "epoch": 0.126883425852498, + "grad_norm": 1.703125, + "learning_rate": 1.9933281329726473e-05, + "loss": 1.0384, + "step": 740 + }, + { + "epoch": 0.1270548899414879, + "grad_norm": 1.6328125, + "learning_rate": 1.993307290405486e-05, + "loss": 1.0531, + "step": 741 + }, + { + "epoch": 0.12722635403047775, + "grad_norm": 1.703125, + "learning_rate": 1.9932864154428335e-05, + "loss": 1.0878, + "step": 742 + }, + { + "epoch": 0.1273978181194676, + "grad_norm": 1.71875, + "learning_rate": 1.9932655080853706e-05, + "loss": 1.1547, + "step": 743 + }, + { + "epoch": 0.12756928220845748, + "grad_norm": 1.8046875, + "learning_rate": 1.9932445683337795e-05, + "loss": 1.1422, + "step": 744 + }, + { + "epoch": 0.12774074629744733, + "grad_norm": 1.765625, + "learning_rate": 1.9932235961887424e-05, + "loss": 1.1325, + "step": 745 + }, + { + "epoch": 0.1279122103864372, + "grad_norm": 1.78125, + "learning_rate": 1.9932025916509436e-05, + "loss": 1.122, + "step": 746 + }, + { + "epoch": 0.12808367447542704, + "grad_norm": 1.6640625, + "learning_rate": 1.9931815547210686e-05, + "loss": 1.1181, + "step": 747 + }, + { + "epoch": 0.12825513856441692, + "grad_norm": 1.640625, + "learning_rate": 1.993160485399803e-05, + "loss": 1.0483, + "step": 748 + }, + { + "epoch": 0.12842660265340677, + "grad_norm": 1.796875, + "learning_rate": 1.9931393836878338e-05, + "loss": 1.1126, + "step": 749 + }, + { + "epoch": 0.12859806674239663, + "grad_norm": 1.6484375, + "learning_rate": 1.99311824958585e-05, + "loss": 1.0719, + "step": 750 + }, + { + "epoch": 0.1287695308313865, + "grad_norm": 1.7578125, + "learning_rate": 1.99309708309454e-05, + "loss": 1.1756, + "step": 751 + }, + { + "epoch": 0.12894099492037636, + "grad_norm": 1.7265625, + "learning_rate": 1.9930758842145947e-05, + "loss": 1.1677, + "step": 752 + }, + { + "epoch": 0.12911245900936622, + "grad_norm": 1.6328125, + "learning_rate": 1.993054652946705e-05, + "loss": 1.1622, + "step": 753 + }, + { + "epoch": 0.1292839230983561, + "grad_norm": 1.71875, + "learning_rate": 1.9930333892915636e-05, + "loss": 1.0601, + "step": 754 + }, + { + "epoch": 0.12945538718734595, + "grad_norm": 1.5859375, + "learning_rate": 1.993012093249864e-05, + "loss": 1.022, + "step": 755 + }, + { + "epoch": 0.1296268512763358, + "grad_norm": 1.6796875, + "learning_rate": 1.9929907648223004e-05, + "loss": 1.122, + "step": 756 + }, + { + "epoch": 0.12979831536532568, + "grad_norm": 1.6171875, + "learning_rate": 1.9929694040095694e-05, + "loss": 1.0447, + "step": 757 + }, + { + "epoch": 0.12996977945431554, + "grad_norm": 1.6796875, + "learning_rate": 1.9929480108123666e-05, + "loss": 1.1163, + "step": 758 + }, + { + "epoch": 0.1301412435433054, + "grad_norm": 1.6875, + "learning_rate": 
1.99292658523139e-05, + "loss": 1.136, + "step": 759 + }, + { + "epoch": 0.13031270763229527, + "grad_norm": 1.7265625, + "learning_rate": 1.9929051272673385e-05, + "loss": 1.0643, + "step": 760 + }, + { + "epoch": 0.13048417172128512, + "grad_norm": 1.7421875, + "learning_rate": 1.9928836369209115e-05, + "loss": 1.0715, + "step": 761 + }, + { + "epoch": 0.13065563581027498, + "grad_norm": 1.75, + "learning_rate": 1.9928621141928107e-05, + "loss": 1.0668, + "step": 762 + }, + { + "epoch": 0.13082709989926486, + "grad_norm": 1.796875, + "learning_rate": 1.9928405590837377e-05, + "loss": 1.2258, + "step": 763 + }, + { + "epoch": 0.1309985639882547, + "grad_norm": 1.7109375, + "learning_rate": 1.992818971594395e-05, + "loss": 1.0404, + "step": 764 + }, + { + "epoch": 0.13117002807724457, + "grad_norm": 1.6484375, + "learning_rate": 1.992797351725487e-05, + "loss": 1.0869, + "step": 765 + }, + { + "epoch": 0.13134149216623445, + "grad_norm": 1.5859375, + "learning_rate": 1.9927756994777193e-05, + "loss": 1.0574, + "step": 766 + }, + { + "epoch": 0.1315129562552243, + "grad_norm": 1.8515625, + "learning_rate": 1.9927540148517976e-05, + "loss": 1.1286, + "step": 767 + }, + { + "epoch": 0.13168442034421415, + "grad_norm": 1.734375, + "learning_rate": 1.9927322978484283e-05, + "loss": 1.129, + "step": 768 + }, + { + "epoch": 0.131855884433204, + "grad_norm": 1.703125, + "learning_rate": 1.9927105484683214e-05, + "loss": 1.0774, + "step": 769 + }, + { + "epoch": 0.1320273485221939, + "grad_norm": 1.7578125, + "learning_rate": 1.9926887667121847e-05, + "loss": 1.0588, + "step": 770 + }, + { + "epoch": 0.13219881261118374, + "grad_norm": 1.625, + "learning_rate": 1.9926669525807295e-05, + "loss": 1.1937, + "step": 771 + }, + { + "epoch": 0.1323702767001736, + "grad_norm": 1.6328125, + "learning_rate": 1.9926451060746668e-05, + "loss": 1.1166, + "step": 772 + }, + { + "epoch": 0.13254174078916348, + "grad_norm": 1.6796875, + "learning_rate": 1.9926232271947094e-05, + "loss": 1.1121, + "step": 773 + }, + { + "epoch": 0.13271320487815333, + "grad_norm": 1.796875, + "learning_rate": 1.9926013159415705e-05, + "loss": 1.1396, + "step": 774 + }, + { + "epoch": 0.13288466896714318, + "grad_norm": 1.7265625, + "learning_rate": 1.992579372315965e-05, + "loss": 1.0838, + "step": 775 + }, + { + "epoch": 0.13305613305613306, + "grad_norm": 1.7734375, + "learning_rate": 1.992557396318608e-05, + "loss": 1.1048, + "step": 776 + }, + { + "epoch": 0.13322759714512292, + "grad_norm": 1.6328125, + "learning_rate": 1.9925353879502175e-05, + "loss": 1.047, + "step": 777 + }, + { + "epoch": 0.13339906123411277, + "grad_norm": 1.6875, + "learning_rate": 1.9925133472115098e-05, + "loss": 1.0915, + "step": 778 + }, + { + "epoch": 0.13357052532310265, + "grad_norm": 1.796875, + "learning_rate": 1.9924912741032048e-05, + "loss": 1.1477, + "step": 779 + }, + { + "epoch": 0.1337419894120925, + "grad_norm": 1.59375, + "learning_rate": 1.9924691686260217e-05, + "loss": 1.0514, + "step": 780 + }, + { + "epoch": 0.13391345350108236, + "grad_norm": 1.5859375, + "learning_rate": 1.992447030780682e-05, + "loss": 1.0808, + "step": 781 + }, + { + "epoch": 0.13408491759007224, + "grad_norm": 1.75, + "learning_rate": 1.992424860567907e-05, + "loss": 1.165, + "step": 782 + }, + { + "epoch": 0.1342563816790621, + "grad_norm": 1.75, + "learning_rate": 1.9924026579884206e-05, + "loss": 1.0896, + "step": 783 + }, + { + "epoch": 0.13442784576805195, + "grad_norm": 1.671875, + "learning_rate": 1.9923804230429464e-05, + "loss": 1.1519, + "step": 
784 + }, + { + "epoch": 0.13459930985704183, + "grad_norm": 1.6328125, + "learning_rate": 1.9923581557322094e-05, + "loss": 1.0796, + "step": 785 + }, + { + "epoch": 0.13477077394603168, + "grad_norm": 1.7265625, + "learning_rate": 1.9923358560569364e-05, + "loss": 1.1155, + "step": 786 + }, + { + "epoch": 0.13494223803502153, + "grad_norm": 1.6796875, + "learning_rate": 1.992313524017854e-05, + "loss": 1.0285, + "step": 787 + }, + { + "epoch": 0.13511370212401141, + "grad_norm": 1.7265625, + "learning_rate": 1.992291159615691e-05, + "loss": 1.1588, + "step": 788 + }, + { + "epoch": 0.13528516621300127, + "grad_norm": 1.7890625, + "learning_rate": 1.9922687628511768e-05, + "loss": 1.181, + "step": 789 + }, + { + "epoch": 0.13545663030199112, + "grad_norm": 1.703125, + "learning_rate": 1.9922463337250417e-05, + "loss": 1.0962, + "step": 790 + }, + { + "epoch": 0.135628094390981, + "grad_norm": 1.796875, + "learning_rate": 1.992223872238017e-05, + "loss": 1.1206, + "step": 791 + }, + { + "epoch": 0.13579955847997086, + "grad_norm": 1.703125, + "learning_rate": 1.9922013783908358e-05, + "loss": 1.117, + "step": 792 + }, + { + "epoch": 0.1359710225689607, + "grad_norm": 1.7265625, + "learning_rate": 1.9921788521842308e-05, + "loss": 1.0403, + "step": 793 + }, + { + "epoch": 0.13614248665795056, + "grad_norm": 1.8359375, + "learning_rate": 1.9921562936189377e-05, + "loss": 1.0875, + "step": 794 + }, + { + "epoch": 0.13631395074694044, + "grad_norm": 1.6796875, + "learning_rate": 1.9921337026956918e-05, + "loss": 1.1067, + "step": 795 + }, + { + "epoch": 0.1364854148359303, + "grad_norm": 1.5390625, + "learning_rate": 1.9921110794152296e-05, + "loss": 1.0446, + "step": 796 + }, + { + "epoch": 0.13665687892492015, + "grad_norm": 1.671875, + "learning_rate": 1.992088423778289e-05, + "loss": 0.9725, + "step": 797 + }, + { + "epoch": 0.13682834301391003, + "grad_norm": 1.8359375, + "learning_rate": 1.9920657357856092e-05, + "loss": 1.113, + "step": 798 + }, + { + "epoch": 0.13699980710289988, + "grad_norm": 1.9375, + "learning_rate": 1.9920430154379302e-05, + "loss": 1.1265, + "step": 799 + }, + { + "epoch": 0.13717127119188974, + "grad_norm": 1.6875, + "learning_rate": 1.9920202627359924e-05, + "loss": 1.0608, + "step": 800 + }, + { + "epoch": 0.13734273528087962, + "grad_norm": 1.734375, + "learning_rate": 1.9919974776805386e-05, + "loss": 1.0655, + "step": 801 + }, + { + "epoch": 0.13751419936986947, + "grad_norm": 1.703125, + "learning_rate": 1.9919746602723113e-05, + "loss": 1.1335, + "step": 802 + }, + { + "epoch": 0.13768566345885933, + "grad_norm": 1.7265625, + "learning_rate": 1.991951810512055e-05, + "loss": 1.1218, + "step": 803 + }, + { + "epoch": 0.1378571275478492, + "grad_norm": 1.6328125, + "learning_rate": 1.991928928400515e-05, + "loss": 1.0527, + "step": 804 + }, + { + "epoch": 0.13802859163683906, + "grad_norm": 1.609375, + "learning_rate": 1.9919060139384368e-05, + "loss": 1.0453, + "step": 805 + }, + { + "epoch": 0.1382000557258289, + "grad_norm": 1.75, + "learning_rate": 1.991883067126569e-05, + "loss": 1.1147, + "step": 806 + }, + { + "epoch": 0.1383715198148188, + "grad_norm": 1.75, + "learning_rate": 1.991860087965659e-05, + "loss": 1.1992, + "step": 807 + }, + { + "epoch": 0.13854298390380865, + "grad_norm": 1.6171875, + "learning_rate": 1.9918370764564563e-05, + "loss": 1.0232, + "step": 808 + }, + { + "epoch": 0.1387144479927985, + "grad_norm": 1.578125, + "learning_rate": 1.9918140325997117e-05, + "loss": 1.0054, + "step": 809 + }, + { + "epoch": 0.13888591208178838, + 
"grad_norm": 1.6953125, + "learning_rate": 1.991790956396177e-05, + "loss": 1.0625, + "step": 810 + }, + { + "epoch": 0.13905737617077824, + "grad_norm": 1.671875, + "learning_rate": 1.9917678478466043e-05, + "loss": 1.0785, + "step": 811 + }, + { + "epoch": 0.1392288402597681, + "grad_norm": 1.671875, + "learning_rate": 1.9917447069517473e-05, + "loss": 1.1155, + "step": 812 + }, + { + "epoch": 0.13940030434875797, + "grad_norm": 1.78125, + "learning_rate": 1.991721533712361e-05, + "loss": 1.0851, + "step": 813 + }, + { + "epoch": 0.13957176843774782, + "grad_norm": 1.8046875, + "learning_rate": 1.9916983281292008e-05, + "loss": 1.1355, + "step": 814 + }, + { + "epoch": 0.13974323252673768, + "grad_norm": 1.703125, + "learning_rate": 1.9916750902030243e-05, + "loss": 1.1464, + "step": 815 + }, + { + "epoch": 0.13991469661572756, + "grad_norm": 1.75, + "learning_rate": 1.991651819934588e-05, + "loss": 1.0878, + "step": 816 + }, + { + "epoch": 0.1400861607047174, + "grad_norm": 1.7578125, + "learning_rate": 1.9916285173246522e-05, + "loss": 1.0585, + "step": 817 + }, + { + "epoch": 0.14025762479370726, + "grad_norm": 1.6796875, + "learning_rate": 1.9916051823739758e-05, + "loss": 1.0578, + "step": 818 + }, + { + "epoch": 0.14042908888269712, + "grad_norm": 1.734375, + "learning_rate": 1.991581815083321e-05, + "loss": 1.1667, + "step": 819 + }, + { + "epoch": 0.140600552971687, + "grad_norm": 1.6796875, + "learning_rate": 1.9915584154534492e-05, + "loss": 0.9936, + "step": 820 + }, + { + "epoch": 0.14077201706067685, + "grad_norm": 1.59375, + "learning_rate": 1.991534983485123e-05, + "loss": 0.9491, + "step": 821 + }, + { + "epoch": 0.1409434811496667, + "grad_norm": 1.75, + "learning_rate": 1.9915115191791078e-05, + "loss": 1.15, + "step": 822 + }, + { + "epoch": 0.1411149452386566, + "grad_norm": 1.6328125, + "learning_rate": 1.991488022536168e-05, + "loss": 1.0197, + "step": 823 + }, + { + "epoch": 0.14128640932764644, + "grad_norm": 1.7265625, + "learning_rate": 1.99146449355707e-05, + "loss": 1.0575, + "step": 824 + }, + { + "epoch": 0.1414578734166363, + "grad_norm": 1.6953125, + "learning_rate": 1.9914409322425817e-05, + "loss": 1.0816, + "step": 825 + }, + { + "epoch": 0.14162933750562617, + "grad_norm": 1.7421875, + "learning_rate": 1.9914173385934715e-05, + "loss": 1.1971, + "step": 826 + }, + { + "epoch": 0.14180080159461603, + "grad_norm": 1.875, + "learning_rate": 1.991393712610508e-05, + "loss": 1.1115, + "step": 827 + }, + { + "epoch": 0.14197226568360588, + "grad_norm": 1.6875, + "learning_rate": 1.991370054294462e-05, + "loss": 1.0964, + "step": 828 + }, + { + "epoch": 0.14214372977259576, + "grad_norm": 1.65625, + "learning_rate": 1.9913463636461062e-05, + "loss": 1.0427, + "step": 829 + }, + { + "epoch": 0.14231519386158561, + "grad_norm": 1.7109375, + "learning_rate": 1.991322640666212e-05, + "loss": 1.1705, + "step": 830 + }, + { + "epoch": 0.14248665795057547, + "grad_norm": 1.6484375, + "learning_rate": 1.9912988853555536e-05, + "loss": 1.0714, + "step": 831 + }, + { + "epoch": 0.14265812203956535, + "grad_norm": 1.6328125, + "learning_rate": 1.9912750977149056e-05, + "loss": 1.0938, + "step": 832 + }, + { + "epoch": 0.1428295861285552, + "grad_norm": 1.625, + "learning_rate": 1.9912512777450436e-05, + "loss": 1.0464, + "step": 833 + }, + { + "epoch": 0.14300105021754506, + "grad_norm": 1.6015625, + "learning_rate": 1.9912274254467452e-05, + "loss": 1.0499, + "step": 834 + }, + { + "epoch": 0.14317251430653494, + "grad_norm": 1.796875, + "learning_rate": 
1.9912035408207875e-05, + "loss": 1.1257, + "step": 835 + }, + { + "epoch": 0.1433439783955248, + "grad_norm": 1.640625, + "learning_rate": 1.9911796238679502e-05, + "loss": 1.0548, + "step": 836 + }, + { + "epoch": 0.14351544248451464, + "grad_norm": 1.765625, + "learning_rate": 1.9911556745890123e-05, + "loss": 1.0836, + "step": 837 + }, + { + "epoch": 0.14368690657350452, + "grad_norm": 1.671875, + "learning_rate": 1.9911316929847563e-05, + "loss": 1.1414, + "step": 838 + }, + { + "epoch": 0.14385837066249438, + "grad_norm": 1.6484375, + "learning_rate": 1.991107679055963e-05, + "loss": 1.0649, + "step": 839 + }, + { + "epoch": 0.14402983475148423, + "grad_norm": 2.0, + "learning_rate": 1.991083632803416e-05, + "loss": 1.0143, + "step": 840 + }, + { + "epoch": 0.14420129884047408, + "grad_norm": 1.7421875, + "learning_rate": 1.9910595542279002e-05, + "loss": 1.1205, + "step": 841 + }, + { + "epoch": 0.14437276292946397, + "grad_norm": 1.625, + "learning_rate": 1.9910354433302e-05, + "loss": 0.9626, + "step": 842 + }, + { + "epoch": 0.14454422701845382, + "grad_norm": 1.6796875, + "learning_rate": 1.991011300111102e-05, + "loss": 0.9339, + "step": 843 + }, + { + "epoch": 0.14471569110744367, + "grad_norm": 1.7109375, + "learning_rate": 1.990987124571394e-05, + "loss": 1.0628, + "step": 844 + }, + { + "epoch": 0.14488715519643355, + "grad_norm": 1.71875, + "learning_rate": 1.9909629167118638e-05, + "loss": 1.082, + "step": 845 + }, + { + "epoch": 0.1450586192854234, + "grad_norm": 1.9375, + "learning_rate": 1.9909386765333016e-05, + "loss": 1.1171, + "step": 846 + }, + { + "epoch": 0.14523008337441326, + "grad_norm": 1.71875, + "learning_rate": 1.9909144040364975e-05, + "loss": 1.0785, + "step": 847 + }, + { + "epoch": 0.14540154746340314, + "grad_norm": 3.125, + "learning_rate": 1.990890099222243e-05, + "loss": 1.2007, + "step": 848 + }, + { + "epoch": 0.145573011552393, + "grad_norm": 1.734375, + "learning_rate": 1.9908657620913315e-05, + "loss": 1.1942, + "step": 849 + }, + { + "epoch": 0.14574447564138285, + "grad_norm": 5.53125, + "learning_rate": 1.990841392644556e-05, + "loss": 1.1591, + "step": 850 + }, + { + "epoch": 0.14591593973037273, + "grad_norm": 1.7578125, + "learning_rate": 1.9908169908827113e-05, + "loss": 1.1318, + "step": 851 + }, + { + "epoch": 0.14608740381936258, + "grad_norm": 1.625, + "learning_rate": 1.9907925568065937e-05, + "loss": 0.9784, + "step": 852 + }, + { + "epoch": 0.14625886790835244, + "grad_norm": 1.6953125, + "learning_rate": 1.9907680904169996e-05, + "loss": 1.1982, + "step": 853 + }, + { + "epoch": 0.14643033199734232, + "grad_norm": 1.703125, + "learning_rate": 1.9907435917147276e-05, + "loss": 1.0986, + "step": 854 + }, + { + "epoch": 0.14660179608633217, + "grad_norm": 1.6484375, + "learning_rate": 1.9907190607005762e-05, + "loss": 1.0898, + "step": 855 + }, + { + "epoch": 0.14677326017532202, + "grad_norm": 1.7109375, + "learning_rate": 1.990694497375345e-05, + "loss": 1.0914, + "step": 856 + }, + { + "epoch": 0.1469447242643119, + "grad_norm": 1.9375, + "learning_rate": 1.9906699017398363e-05, + "loss": 1.1664, + "step": 857 + }, + { + "epoch": 0.14711618835330176, + "grad_norm": 1.7578125, + "learning_rate": 1.9906452737948512e-05, + "loss": 1.0166, + "step": 858 + }, + { + "epoch": 0.1472876524422916, + "grad_norm": 1.71875, + "learning_rate": 1.9906206135411934e-05, + "loss": 1.0844, + "step": 859 + }, + { + "epoch": 0.1474591165312815, + "grad_norm": 1.6484375, + "learning_rate": 1.990595920979667e-05, + "loss": 1.135, + "step": 860 + }, 
+ { + "epoch": 0.14763058062027135, + "grad_norm": 1.78125, + "learning_rate": 1.9905711961110777e-05, + "loss": 1.0987, + "step": 861 + }, + { + "epoch": 0.1478020447092612, + "grad_norm": 1.65625, + "learning_rate": 1.9905464389362312e-05, + "loss": 1.0714, + "step": 862 + }, + { + "epoch": 0.14797350879825108, + "grad_norm": 1.7578125, + "learning_rate": 1.9905216494559354e-05, + "loss": 1.1026, + "step": 863 + }, + { + "epoch": 0.14814497288724093, + "grad_norm": 1.6640625, + "learning_rate": 1.9904968276709986e-05, + "loss": 1.0989, + "step": 864 + }, + { + "epoch": 0.1483164369762308, + "grad_norm": 1.65625, + "learning_rate": 1.9904719735822303e-05, + "loss": 1.0343, + "step": 865 + }, + { + "epoch": 0.14848790106522064, + "grad_norm": 1.625, + "learning_rate": 1.9904470871904413e-05, + "loss": 1.0357, + "step": 866 + }, + { + "epoch": 0.14865936515421052, + "grad_norm": 1.6796875, + "learning_rate": 1.9904221684964434e-05, + "loss": 1.0805, + "step": 867 + }, + { + "epoch": 0.14883082924320037, + "grad_norm": 1.65625, + "learning_rate": 1.9903972175010486e-05, + "loss": 1.1604, + "step": 868 + }, + { + "epoch": 0.14900229333219023, + "grad_norm": 1.765625, + "learning_rate": 1.9903722342050712e-05, + "loss": 1.0845, + "step": 869 + }, + { + "epoch": 0.1491737574211801, + "grad_norm": 1.609375, + "learning_rate": 1.9903472186093257e-05, + "loss": 1.022, + "step": 870 + }, + { + "epoch": 0.14934522151016996, + "grad_norm": 1.703125, + "learning_rate": 1.9903221707146283e-05, + "loss": 1.097, + "step": 871 + }, + { + "epoch": 0.14951668559915982, + "grad_norm": 1.796875, + "learning_rate": 1.9902970905217955e-05, + "loss": 1.1724, + "step": 872 + }, + { + "epoch": 0.1496881496881497, + "grad_norm": 1.5703125, + "learning_rate": 1.9902719780316455e-05, + "loss": 1.1537, + "step": 873 + }, + { + "epoch": 0.14985961377713955, + "grad_norm": 1.703125, + "learning_rate": 1.9902468332449973e-05, + "loss": 1.1606, + "step": 874 + }, + { + "epoch": 0.1500310778661294, + "grad_norm": 1.7734375, + "learning_rate": 1.990221656162671e-05, + "loss": 1.0925, + "step": 875 + }, + { + "epoch": 0.15020254195511928, + "grad_norm": 1.71875, + "learning_rate": 1.9901964467854876e-05, + "loss": 1.048, + "step": 876 + }, + { + "epoch": 0.15037400604410914, + "grad_norm": 1.65625, + "learning_rate": 1.9901712051142693e-05, + "loss": 1.1162, + "step": 877 + }, + { + "epoch": 0.150545470133099, + "grad_norm": 1.6015625, + "learning_rate": 1.9901459311498396e-05, + "loss": 1.0051, + "step": 878 + }, + { + "epoch": 0.15071693422208887, + "grad_norm": 1.7578125, + "learning_rate": 1.990120624893022e-05, + "loss": 1.1115, + "step": 879 + }, + { + "epoch": 0.15088839831107873, + "grad_norm": 1.6640625, + "learning_rate": 1.990095286344643e-05, + "loss": 1.0596, + "step": 880 + }, + { + "epoch": 0.15105986240006858, + "grad_norm": 1.734375, + "learning_rate": 1.990069915505528e-05, + "loss": 1.083, + "step": 881 + }, + { + "epoch": 0.15123132648905846, + "grad_norm": 1.7734375, + "learning_rate": 1.9900445123765052e-05, + "loss": 1.0373, + "step": 882 + }, + { + "epoch": 0.1514027905780483, + "grad_norm": 1.625, + "learning_rate": 1.990019076958402e-05, + "loss": 1.0076, + "step": 883 + }, + { + "epoch": 0.15157425466703817, + "grad_norm": 1.8359375, + "learning_rate": 1.9899936092520495e-05, + "loss": 1.1757, + "step": 884 + }, + { + "epoch": 0.15174571875602805, + "grad_norm": 1.703125, + "learning_rate": 1.9899681092582768e-05, + "loss": 1.1041, + "step": 885 + }, + { + "epoch": 0.1519171828450179, + 
"grad_norm": 1.6796875, + "learning_rate": 1.9899425769779165e-05, + "loss": 1.1192, + "step": 886 + }, + { + "epoch": 0.15208864693400775, + "grad_norm": 1.6328125, + "learning_rate": 1.9899170124118007e-05, + "loss": 1.1226, + "step": 887 + }, + { + "epoch": 0.1522601110229976, + "grad_norm": 1.65625, + "learning_rate": 1.9898914155607635e-05, + "loss": 1.0934, + "step": 888 + }, + { + "epoch": 0.1524315751119875, + "grad_norm": 1.7265625, + "learning_rate": 1.98986578642564e-05, + "loss": 1.1332, + "step": 889 + }, + { + "epoch": 0.15260303920097734, + "grad_norm": 1.6328125, + "learning_rate": 1.9898401250072653e-05, + "loss": 1.1179, + "step": 890 + }, + { + "epoch": 0.1527745032899672, + "grad_norm": 1.6640625, + "learning_rate": 1.9898144313064765e-05, + "loss": 1.1094, + "step": 891 + }, + { + "epoch": 0.15294596737895708, + "grad_norm": 1.671875, + "learning_rate": 1.9897887053241126e-05, + "loss": 1.0668, + "step": 892 + }, + { + "epoch": 0.15311743146794693, + "grad_norm": 1.59375, + "learning_rate": 1.9897629470610113e-05, + "loss": 1.104, + "step": 893 + }, + { + "epoch": 0.15328889555693678, + "grad_norm": 1.671875, + "learning_rate": 1.9897371565180133e-05, + "loss": 1.0576, + "step": 894 + }, + { + "epoch": 0.15346035964592666, + "grad_norm": 1.7109375, + "learning_rate": 1.9897113336959595e-05, + "loss": 1.1034, + "step": 895 + }, + { + "epoch": 0.15363182373491652, + "grad_norm": 1.8046875, + "learning_rate": 1.9896854785956922e-05, + "loss": 1.0618, + "step": 896 + }, + { + "epoch": 0.15380328782390637, + "grad_norm": 1.6640625, + "learning_rate": 1.9896595912180547e-05, + "loss": 1.1598, + "step": 897 + }, + { + "epoch": 0.15397475191289625, + "grad_norm": 1.59375, + "learning_rate": 1.9896336715638913e-05, + "loss": 1.0393, + "step": 898 + }, + { + "epoch": 0.1541462160018861, + "grad_norm": 1.6875, + "learning_rate": 1.989607719634047e-05, + "loss": 1.1174, + "step": 899 + }, + { + "epoch": 0.15431768009087596, + "grad_norm": 1.671875, + "learning_rate": 1.989581735429369e-05, + "loss": 1.1053, + "step": 900 + }, + { + "epoch": 0.15448914417986584, + "grad_norm": 1.6796875, + "learning_rate": 1.989555718950704e-05, + "loss": 1.072, + "step": 901 + }, + { + "epoch": 0.1546606082688557, + "grad_norm": 1.7734375, + "learning_rate": 1.9895296701989002e-05, + "loss": 1.095, + "step": 902 + }, + { + "epoch": 0.15483207235784555, + "grad_norm": 1.65625, + "learning_rate": 1.989503589174808e-05, + "loss": 1.1022, + "step": 903 + }, + { + "epoch": 0.15500353644683543, + "grad_norm": 1.65625, + "learning_rate": 1.9894774758792775e-05, + "loss": 0.9856, + "step": 904 + }, + { + "epoch": 0.15517500053582528, + "grad_norm": 1.7265625, + "learning_rate": 1.9894513303131607e-05, + "loss": 1.0835, + "step": 905 + }, + { + "epoch": 0.15534646462481513, + "grad_norm": 1.6796875, + "learning_rate": 1.98942515247731e-05, + "loss": 1.0362, + "step": 906 + }, + { + "epoch": 0.15551792871380501, + "grad_norm": 1.6875, + "learning_rate": 1.9893989423725792e-05, + "loss": 1.1525, + "step": 907 + }, + { + "epoch": 0.15568939280279487, + "grad_norm": 1.6171875, + "learning_rate": 1.989372699999823e-05, + "loss": 1.0246, + "step": 908 + }, + { + "epoch": 0.15586085689178472, + "grad_norm": 1.6953125, + "learning_rate": 1.9893464253598974e-05, + "loss": 1.0513, + "step": 909 + }, + { + "epoch": 0.1560323209807746, + "grad_norm": 1.6875, + "learning_rate": 1.9893201184536598e-05, + "loss": 1.055, + "step": 910 + }, + { + "epoch": 0.15620378506976446, + "grad_norm": 1.8046875, + "learning_rate": 
1.9892937792819676e-05, + "loss": 1.182, + "step": 911 + }, + { + "epoch": 0.1563752491587543, + "grad_norm": 1.609375, + "learning_rate": 1.9892674078456795e-05, + "loss": 1.0651, + "step": 912 + }, + { + "epoch": 0.15654671324774416, + "grad_norm": 1.7578125, + "learning_rate": 1.989241004145656e-05, + "loss": 1.1454, + "step": 913 + }, + { + "epoch": 0.15671817733673404, + "grad_norm": 1.703125, + "learning_rate": 1.989214568182759e-05, + "loss": 1.1123, + "step": 914 + }, + { + "epoch": 0.1568896414257239, + "grad_norm": 1.625, + "learning_rate": 1.9891880999578492e-05, + "loss": 1.0728, + "step": 915 + }, + { + "epoch": 0.15706110551471375, + "grad_norm": 1.8515625, + "learning_rate": 1.9891615994717904e-05, + "loss": 1.0941, + "step": 916 + }, + { + "epoch": 0.15723256960370363, + "grad_norm": 1.6796875, + "learning_rate": 1.9891350667254474e-05, + "loss": 1.1077, + "step": 917 + }, + { + "epoch": 0.15740403369269348, + "grad_norm": 1.6171875, + "learning_rate": 1.9891085017196848e-05, + "loss": 1.0342, + "step": 918 + }, + { + "epoch": 0.15757549778168334, + "grad_norm": 1.7421875, + "learning_rate": 1.98908190445537e-05, + "loss": 1.0765, + "step": 919 + }, + { + "epoch": 0.15774696187067322, + "grad_norm": 1.65625, + "learning_rate": 1.989055274933369e-05, + "loss": 1.1752, + "step": 920 + }, + { + "epoch": 0.15791842595966307, + "grad_norm": 1.6953125, + "learning_rate": 1.9890286131545514e-05, + "loss": 1.1564, + "step": 921 + }, + { + "epoch": 0.15808989004865293, + "grad_norm": 1.6875, + "learning_rate": 1.9890019191197863e-05, + "loss": 1.0157, + "step": 922 + }, + { + "epoch": 0.1582613541376428, + "grad_norm": 1.6875, + "learning_rate": 1.9889751928299446e-05, + "loss": 0.9963, + "step": 923 + }, + { + "epoch": 0.15843281822663266, + "grad_norm": 1.6640625, + "learning_rate": 1.9889484342858974e-05, + "loss": 1.0637, + "step": 924 + }, + { + "epoch": 0.1586042823156225, + "grad_norm": 1.7421875, + "learning_rate": 1.988921643488518e-05, + "loss": 1.1039, + "step": 925 + }, + { + "epoch": 0.1587757464046124, + "grad_norm": 1.8046875, + "learning_rate": 1.9888948204386797e-05, + "loss": 1.1995, + "step": 926 + }, + { + "epoch": 0.15894721049360225, + "grad_norm": 1.640625, + "learning_rate": 1.9888679651372572e-05, + "loss": 1.0425, + "step": 927 + }, + { + "epoch": 0.1591186745825921, + "grad_norm": 1.75, + "learning_rate": 1.988841077585127e-05, + "loss": 1.0697, + "step": 928 + }, + { + "epoch": 0.15929013867158198, + "grad_norm": 1.6875, + "learning_rate": 1.9888141577831656e-05, + "loss": 1.0453, + "step": 929 + }, + { + "epoch": 0.15946160276057184, + "grad_norm": 1.7109375, + "learning_rate": 1.988787205732251e-05, + "loss": 1.1458, + "step": 930 + }, + { + "epoch": 0.1596330668495617, + "grad_norm": 1.6796875, + "learning_rate": 1.9887602214332622e-05, + "loss": 1.1275, + "step": 931 + }, + { + "epoch": 0.15980453093855157, + "grad_norm": 1.671875, + "learning_rate": 1.9887332048870792e-05, + "loss": 1.0878, + "step": 932 + }, + { + "epoch": 0.15997599502754142, + "grad_norm": 1.6484375, + "learning_rate": 1.988706156094583e-05, + "loss": 1.0844, + "step": 933 + }, + { + "epoch": 0.16014745911653128, + "grad_norm": 1.6875, + "learning_rate": 1.988679075056656e-05, + "loss": 1.0174, + "step": 934 + }, + { + "epoch": 0.16031892320552113, + "grad_norm": 1.6875, + "learning_rate": 1.988651961774182e-05, + "loss": 1.0657, + "step": 935 + }, + { + "epoch": 0.160490387294511, + "grad_norm": 1.640625, + "learning_rate": 1.9886248162480436e-05, + "loss": 1.0754, + "step": 936 
+ }, + { + "epoch": 0.16066185138350086, + "grad_norm": 1.734375, + "learning_rate": 1.9885976384791276e-05, + "loss": 1.1581, + "step": 937 + }, + { + "epoch": 0.16083331547249072, + "grad_norm": 1.6484375, + "learning_rate": 1.98857042846832e-05, + "loss": 1.0489, + "step": 938 + }, + { + "epoch": 0.1610047795614806, + "grad_norm": 1.6328125, + "learning_rate": 1.9885431862165078e-05, + "loss": 1.1013, + "step": 939 + }, + { + "epoch": 0.16117624365047045, + "grad_norm": 1.8046875, + "learning_rate": 1.98851591172458e-05, + "loss": 1.019, + "step": 940 + }, + { + "epoch": 0.1613477077394603, + "grad_norm": 1.5703125, + "learning_rate": 1.9884886049934258e-05, + "loss": 0.9853, + "step": 941 + }, + { + "epoch": 0.1615191718284502, + "grad_norm": 1.6875, + "learning_rate": 1.988461266023936e-05, + "loss": 1.0605, + "step": 942 + }, + { + "epoch": 0.16169063591744004, + "grad_norm": 1.7421875, + "learning_rate": 1.9884338948170022e-05, + "loss": 1.0944, + "step": 943 + }, + { + "epoch": 0.1618621000064299, + "grad_norm": 1.7109375, + "learning_rate": 1.9884064913735165e-05, + "loss": 1.0564, + "step": 944 + }, + { + "epoch": 0.16203356409541977, + "grad_norm": 1.7109375, + "learning_rate": 1.9883790556943736e-05, + "loss": 1.1367, + "step": 945 + }, + { + "epoch": 0.16220502818440963, + "grad_norm": 1.65625, + "learning_rate": 1.9883515877804676e-05, + "loss": 1.095, + "step": 946 + }, + { + "epoch": 0.16237649227339948, + "grad_norm": 1.65625, + "learning_rate": 1.9883240876326947e-05, + "loss": 1.0305, + "step": 947 + }, + { + "epoch": 0.16254795636238936, + "grad_norm": 1.6875, + "learning_rate": 1.9882965552519517e-05, + "loss": 1.0605, + "step": 948 + }, + { + "epoch": 0.16271942045137922, + "grad_norm": 1.6171875, + "learning_rate": 1.988268990639136e-05, + "loss": 1.0784, + "step": 949 + }, + { + "epoch": 0.16289088454036907, + "grad_norm": 1.6796875, + "learning_rate": 1.9882413937951475e-05, + "loss": 1.1448, + "step": 950 + }, + { + "epoch": 0.16306234862935895, + "grad_norm": 1.6875, + "learning_rate": 1.9882137647208858e-05, + "loss": 1.0633, + "step": 951 + }, + { + "epoch": 0.1632338127183488, + "grad_norm": 1.6875, + "learning_rate": 1.9881861034172514e-05, + "loss": 1.0318, + "step": 952 + }, + { + "epoch": 0.16340527680733866, + "grad_norm": 1.65625, + "learning_rate": 1.9881584098851475e-05, + "loss": 0.9796, + "step": 953 + }, + { + "epoch": 0.16357674089632854, + "grad_norm": 1.6796875, + "learning_rate": 1.9881306841254768e-05, + "loss": 1.1173, + "step": 954 + }, + { + "epoch": 0.1637482049853184, + "grad_norm": 1.7109375, + "learning_rate": 1.9881029261391434e-05, + "loss": 1.0393, + "step": 955 + }, + { + "epoch": 0.16391966907430824, + "grad_norm": 1.8125, + "learning_rate": 1.9880751359270526e-05, + "loss": 1.1808, + "step": 956 + }, + { + "epoch": 0.16409113316329813, + "grad_norm": 1.703125, + "learning_rate": 1.988047313490111e-05, + "loss": 1.1231, + "step": 957 + }, + { + "epoch": 0.16426259725228798, + "grad_norm": 1.6171875, + "learning_rate": 1.988019458829226e-05, + "loss": 1.0345, + "step": 958 + }, + { + "epoch": 0.16443406134127783, + "grad_norm": 1.7421875, + "learning_rate": 1.9879915719453055e-05, + "loss": 1.0765, + "step": 959 + }, + { + "epoch": 0.16460552543026769, + "grad_norm": 1.7578125, + "learning_rate": 1.9879636528392597e-05, + "loss": 1.1324, + "step": 960 + }, + { + "epoch": 0.16477698951925757, + "grad_norm": 1.609375, + "learning_rate": 1.9879357015119993e-05, + "loss": 1.0949, + "step": 961 + }, + { + "epoch": 0.16494845360824742, + 
"grad_norm": 1.53125, + "learning_rate": 1.9879077179644346e-05, + "loss": 1.0602, + "step": 962 + }, + { + "epoch": 0.16511991769723727, + "grad_norm": 1.640625, + "learning_rate": 1.9878797021974795e-05, + "loss": 1.1427, + "step": 963 + }, + { + "epoch": 0.16529138178622715, + "grad_norm": 1.765625, + "learning_rate": 1.9878516542120473e-05, + "loss": 1.0229, + "step": 964 + }, + { + "epoch": 0.165462845875217, + "grad_norm": 1.59375, + "learning_rate": 1.987823574009053e-05, + "loss": 1.0367, + "step": 965 + }, + { + "epoch": 0.16563430996420686, + "grad_norm": 1.7265625, + "learning_rate": 1.9877954615894118e-05, + "loss": 1.0842, + "step": 966 + }, + { + "epoch": 0.16580577405319674, + "grad_norm": 1.6796875, + "learning_rate": 1.9877673169540408e-05, + "loss": 1.0605, + "step": 967 + }, + { + "epoch": 0.1659772381421866, + "grad_norm": 1.6875, + "learning_rate": 1.9877391401038583e-05, + "loss": 1.1082, + "step": 968 + }, + { + "epoch": 0.16614870223117645, + "grad_norm": 1.7265625, + "learning_rate": 1.987710931039783e-05, + "loss": 1.171, + "step": 969 + }, + { + "epoch": 0.16632016632016633, + "grad_norm": 1.7109375, + "learning_rate": 1.9876826897627347e-05, + "loss": 1.0277, + "step": 970 + }, + { + "epoch": 0.16649163040915618, + "grad_norm": 1.75, + "learning_rate": 1.9876544162736343e-05, + "loss": 1.0761, + "step": 971 + }, + { + "epoch": 0.16666309449814604, + "grad_norm": 1.640625, + "learning_rate": 1.987626110573405e-05, + "loss": 1.0559, + "step": 972 + }, + { + "epoch": 0.16683455858713592, + "grad_norm": 1.609375, + "learning_rate": 1.9875977726629685e-05, + "loss": 1.0628, + "step": 973 + }, + { + "epoch": 0.16700602267612577, + "grad_norm": 1.65625, + "learning_rate": 1.98756940254325e-05, + "loss": 1.0941, + "step": 974 + }, + { + "epoch": 0.16717748676511562, + "grad_norm": 1.6953125, + "learning_rate": 1.9875410002151745e-05, + "loss": 1.0969, + "step": 975 + }, + { + "epoch": 0.1673489508541055, + "grad_norm": 1.640625, + "learning_rate": 1.987512565679668e-05, + "loss": 1.0515, + "step": 976 + }, + { + "epoch": 0.16752041494309536, + "grad_norm": 1.6953125, + "learning_rate": 1.987484098937658e-05, + "loss": 1.0269, + "step": 977 + }, + { + "epoch": 0.1676918790320852, + "grad_norm": 1.6015625, + "learning_rate": 1.9874555999900735e-05, + "loss": 1.0805, + "step": 978 + }, + { + "epoch": 0.1678633431210751, + "grad_norm": 1.6953125, + "learning_rate": 1.9874270688378433e-05, + "loss": 1.1372, + "step": 979 + }, + { + "epoch": 0.16803480721006495, + "grad_norm": 1.6953125, + "learning_rate": 1.987398505481898e-05, + "loss": 1.1989, + "step": 980 + }, + { + "epoch": 0.1682062712990548, + "grad_norm": 1.5859375, + "learning_rate": 1.987369909923169e-05, + "loss": 1.0114, + "step": 981 + }, + { + "epoch": 0.16837773538804465, + "grad_norm": 1.640625, + "learning_rate": 1.9873412821625894e-05, + "loss": 1.0541, + "step": 982 + }, + { + "epoch": 0.16854919947703453, + "grad_norm": 1.6171875, + "learning_rate": 1.987312622201093e-05, + "loss": 1.0933, + "step": 983 + }, + { + "epoch": 0.1687206635660244, + "grad_norm": 1.671875, + "learning_rate": 1.9872839300396132e-05, + "loss": 1.0502, + "step": 984 + }, + { + "epoch": 0.16889212765501424, + "grad_norm": 1.6640625, + "learning_rate": 1.987255205679087e-05, + "loss": 1.0388, + "step": 985 + }, + { + "epoch": 0.16906359174400412, + "grad_norm": 1.640625, + "learning_rate": 1.9872264491204514e-05, + "loss": 1.0497, + "step": 986 + }, + { + "epoch": 0.16923505583299397, + "grad_norm": 1.7421875, + "learning_rate": 
1.9871976603646432e-05, + "loss": 1.0919, + "step": 987 + }, + { + "epoch": 0.16940651992198383, + "grad_norm": 1.734375, + "learning_rate": 1.987168839412602e-05, + "loss": 1.0666, + "step": 988 + }, + { + "epoch": 0.1695779840109737, + "grad_norm": 1.6953125, + "learning_rate": 1.9871399862652677e-05, + "loss": 1.0808, + "step": 989 + }, + { + "epoch": 0.16974944809996356, + "grad_norm": 1.59375, + "learning_rate": 1.9871111009235814e-05, + "loss": 1.0397, + "step": 990 + }, + { + "epoch": 0.16992091218895342, + "grad_norm": 1.578125, + "learning_rate": 1.9870821833884845e-05, + "loss": 0.9993, + "step": 991 + }, + { + "epoch": 0.1700923762779433, + "grad_norm": 1.75, + "learning_rate": 1.987053233660921e-05, + "loss": 1.2037, + "step": 992 + }, + { + "epoch": 0.17026384036693315, + "grad_norm": 1.578125, + "learning_rate": 1.9870242517418346e-05, + "loss": 1.0779, + "step": 993 + }, + { + "epoch": 0.170435304455923, + "grad_norm": 1.6953125, + "learning_rate": 1.9869952376321705e-05, + "loss": 1.1726, + "step": 994 + }, + { + "epoch": 0.17060676854491288, + "grad_norm": 1.671875, + "learning_rate": 1.986966191332875e-05, + "loss": 1.0898, + "step": 995 + }, + { + "epoch": 0.17077823263390274, + "grad_norm": 1.6796875, + "learning_rate": 1.9869371128448952e-05, + "loss": 1.0367, + "step": 996 + }, + { + "epoch": 0.1709496967228926, + "grad_norm": 1.6796875, + "learning_rate": 1.98690800216918e-05, + "loss": 1.063, + "step": 997 + }, + { + "epoch": 0.17112116081188247, + "grad_norm": 1.953125, + "learning_rate": 1.9868788593066783e-05, + "loss": 1.0421, + "step": 998 + }, + { + "epoch": 0.17129262490087233, + "grad_norm": 1.640625, + "learning_rate": 1.9868496842583412e-05, + "loss": 1.0655, + "step": 999 + }, + { + "epoch": 0.17146408898986218, + "grad_norm": 1.6875, + "learning_rate": 1.9868204770251194e-05, + "loss": 1.1434, + "step": 1000 + }, + { + "epoch": 0.17163555307885206, + "grad_norm": 1.6640625, + "learning_rate": 1.9867912376079657e-05, + "loss": 1.1156, + "step": 1001 + }, + { + "epoch": 0.1718070171678419, + "grad_norm": 1.6328125, + "learning_rate": 1.986761966007834e-05, + "loss": 1.0472, + "step": 1002 + }, + { + "epoch": 0.17197848125683177, + "grad_norm": 1.75, + "learning_rate": 1.986732662225679e-05, + "loss": 1.1131, + "step": 1003 + }, + { + "epoch": 0.17214994534582165, + "grad_norm": 1.640625, + "learning_rate": 1.986703326262456e-05, + "loss": 1.0208, + "step": 1004 + }, + { + "epoch": 0.1723214094348115, + "grad_norm": 1.609375, + "learning_rate": 1.9866739581191217e-05, + "loss": 1.0863, + "step": 1005 + }, + { + "epoch": 0.17249287352380135, + "grad_norm": 1.6796875, + "learning_rate": 1.9866445577966345e-05, + "loss": 1.095, + "step": 1006 + }, + { + "epoch": 0.1726643376127912, + "grad_norm": 1.7421875, + "learning_rate": 1.9866151252959524e-05, + "loss": 1.1355, + "step": 1007 + }, + { + "epoch": 0.1728358017017811, + "grad_norm": 1.6796875, + "learning_rate": 1.9865856606180364e-05, + "loss": 1.0531, + "step": 1008 + }, + { + "epoch": 0.17300726579077094, + "grad_norm": 1.703125, + "learning_rate": 1.9865561637638466e-05, + "loss": 1.0611, + "step": 1009 + }, + { + "epoch": 0.1731787298797608, + "grad_norm": 1.6953125, + "learning_rate": 1.9865266347343453e-05, + "loss": 1.0575, + "step": 1010 + }, + { + "epoch": 0.17335019396875068, + "grad_norm": 1.6171875, + "learning_rate": 1.9864970735304956e-05, + "loss": 0.9826, + "step": 1011 + }, + { + "epoch": 0.17352165805774053, + "grad_norm": 1.6171875, + "learning_rate": 1.9864674801532616e-05, + "loss": 
1.0855, + "step": 1012 + }, + { + "epoch": 0.17369312214673038, + "grad_norm": 1.7109375, + "learning_rate": 1.9864378546036082e-05, + "loss": 1.1052, + "step": 1013 + }, + { + "epoch": 0.17386458623572026, + "grad_norm": 1.7734375, + "learning_rate": 1.986408196882502e-05, + "loss": 1.0988, + "step": 1014 + }, + { + "epoch": 0.17403605032471012, + "grad_norm": 1.6328125, + "learning_rate": 1.9863785069909095e-05, + "loss": 1.0497, + "step": 1015 + }, + { + "epoch": 0.17420751441369997, + "grad_norm": 1.71875, + "learning_rate": 1.9863487849298e-05, + "loss": 1.0612, + "step": 1016 + }, + { + "epoch": 0.17437897850268985, + "grad_norm": 1.78125, + "learning_rate": 1.9863190307001426e-05, + "loss": 1.0183, + "step": 1017 + }, + { + "epoch": 0.1745504425916797, + "grad_norm": 1.7734375, + "learning_rate": 1.986289244302907e-05, + "loss": 1.077, + "step": 1018 + }, + { + "epoch": 0.17472190668066956, + "grad_norm": 1.625, + "learning_rate": 1.986259425739065e-05, + "loss": 1.0255, + "step": 1019 + }, + { + "epoch": 0.17489337076965944, + "grad_norm": 1.6640625, + "learning_rate": 1.9862295750095896e-05, + "loss": 1.0715, + "step": 1020 + }, + { + "epoch": 0.1750648348586493, + "grad_norm": 1.6640625, + "learning_rate": 1.986199692115454e-05, + "loss": 1.1179, + "step": 1021 + }, + { + "epoch": 0.17523629894763915, + "grad_norm": 1.6171875, + "learning_rate": 1.9861697770576326e-05, + "loss": 1.0499, + "step": 1022 + }, + { + "epoch": 0.17540776303662903, + "grad_norm": 1.7265625, + "learning_rate": 1.9861398298371012e-05, + "loss": 1.1127, + "step": 1023 + }, + { + "epoch": 0.17557922712561888, + "grad_norm": 1.7578125, + "learning_rate": 1.9861098504548365e-05, + "loss": 1.0414, + "step": 1024 + }, + { + "epoch": 0.17575069121460873, + "grad_norm": 1.7265625, + "learning_rate": 1.9860798389118163e-05, + "loss": 1.0307, + "step": 1025 + }, + { + "epoch": 0.17592215530359862, + "grad_norm": 1.6328125, + "learning_rate": 1.9860497952090192e-05, + "loss": 1.0727, + "step": 1026 + }, + { + "epoch": 0.17609361939258847, + "grad_norm": 1.5859375, + "learning_rate": 1.9860197193474253e-05, + "loss": 1.0412, + "step": 1027 + }, + { + "epoch": 0.17626508348157832, + "grad_norm": 1.7109375, + "learning_rate": 1.9859896113280152e-05, + "loss": 1.0912, + "step": 1028 + }, + { + "epoch": 0.1764365475705682, + "grad_norm": 1.609375, + "learning_rate": 1.985959471151771e-05, + "loss": 1.1073, + "step": 1029 + }, + { + "epoch": 0.17660801165955806, + "grad_norm": 1.6171875, + "learning_rate": 1.9859292988196757e-05, + "loss": 1.1186, + "step": 1030 + }, + { + "epoch": 0.1767794757485479, + "grad_norm": 1.5703125, + "learning_rate": 1.9858990943327136e-05, + "loss": 1.0762, + "step": 1031 + }, + { + "epoch": 0.17695093983753776, + "grad_norm": 1.7890625, + "learning_rate": 1.985868857691869e-05, + "loss": 1.1374, + "step": 1032 + }, + { + "epoch": 0.17712240392652764, + "grad_norm": 1.703125, + "learning_rate": 1.985838588898129e-05, + "loss": 1.1087, + "step": 1033 + }, + { + "epoch": 0.1772938680155175, + "grad_norm": 1.6953125, + "learning_rate": 1.98580828795248e-05, + "loss": 1.0683, + "step": 1034 + }, + { + "epoch": 0.17746533210450735, + "grad_norm": 1.6484375, + "learning_rate": 1.9857779548559103e-05, + "loss": 1.1012, + "step": 1035 + }, + { + "epoch": 0.17763679619349723, + "grad_norm": 1.75, + "learning_rate": 1.9857475896094094e-05, + "loss": 1.1002, + "step": 1036 + }, + { + "epoch": 0.17780826028248709, + "grad_norm": 1.6875, + "learning_rate": 1.9857171922139678e-05, + "loss": 1.0632, + 
"step": 1037 + }, + { + "epoch": 0.17797972437147694, + "grad_norm": 1.5546875, + "learning_rate": 1.985686762670577e-05, + "loss": 0.9738, + "step": 1038 + }, + { + "epoch": 0.17815118846046682, + "grad_norm": 1.90625, + "learning_rate": 1.9856563009802286e-05, + "loss": 0.9626, + "step": 1039 + }, + { + "epoch": 0.17832265254945667, + "grad_norm": 1.65625, + "learning_rate": 1.985625807143917e-05, + "loss": 1.1021, + "step": 1040 + }, + { + "epoch": 0.17849411663844653, + "grad_norm": 1.71875, + "learning_rate": 1.985595281162636e-05, + "loss": 1.0798, + "step": 1041 + }, + { + "epoch": 0.1786655807274364, + "grad_norm": 1.7421875, + "learning_rate": 1.9855647230373817e-05, + "loss": 1.1084, + "step": 1042 + }, + { + "epoch": 0.17883704481642626, + "grad_norm": 1.7578125, + "learning_rate": 1.9855341327691506e-05, + "loss": 1.0181, + "step": 1043 + }, + { + "epoch": 0.1790085089054161, + "grad_norm": 1.65625, + "learning_rate": 1.9855035103589396e-05, + "loss": 1.0188, + "step": 1044 + }, + { + "epoch": 0.179179972994406, + "grad_norm": 1.640625, + "learning_rate": 1.9854728558077488e-05, + "loss": 1.1623, + "step": 1045 + }, + { + "epoch": 0.17935143708339585, + "grad_norm": 1.7109375, + "learning_rate": 1.9854421691165768e-05, + "loss": 1.0942, + "step": 1046 + }, + { + "epoch": 0.1795229011723857, + "grad_norm": 1.6015625, + "learning_rate": 1.9854114502864252e-05, + "loss": 0.9914, + "step": 1047 + }, + { + "epoch": 0.17969436526137558, + "grad_norm": 1.7421875, + "learning_rate": 1.9853806993182952e-05, + "loss": 1.1027, + "step": 1048 + }, + { + "epoch": 0.17986582935036544, + "grad_norm": 1.6640625, + "learning_rate": 1.98534991621319e-05, + "loss": 1.0728, + "step": 1049 + }, + { + "epoch": 0.1800372934393553, + "grad_norm": 1.6640625, + "learning_rate": 1.9853191009721137e-05, + "loss": 1.1005, + "step": 1050 + }, + { + "epoch": 0.18020875752834517, + "grad_norm": 1.59375, + "learning_rate": 1.9852882535960712e-05, + "loss": 1.0844, + "step": 1051 + }, + { + "epoch": 0.18038022161733502, + "grad_norm": 1.6953125, + "learning_rate": 1.985257374086068e-05, + "loss": 1.0699, + "step": 1052 + }, + { + "epoch": 0.18055168570632488, + "grad_norm": 1.6171875, + "learning_rate": 1.985226462443112e-05, + "loss": 1.0853, + "step": 1053 + }, + { + "epoch": 0.18072314979531473, + "grad_norm": 1.6171875, + "learning_rate": 1.9851955186682114e-05, + "loss": 1.0661, + "step": 1054 + }, + { + "epoch": 0.1808946138843046, + "grad_norm": 1.7265625, + "learning_rate": 1.9851645427623744e-05, + "loss": 1.11, + "step": 1055 + }, + { + "epoch": 0.18106607797329446, + "grad_norm": 1.7421875, + "learning_rate": 1.9851335347266124e-05, + "loss": 1.0096, + "step": 1056 + }, + { + "epoch": 0.18123754206228432, + "grad_norm": 1.7734375, + "learning_rate": 1.985102494561936e-05, + "loss": 1.0221, + "step": 1057 + }, + { + "epoch": 0.1814090061512742, + "grad_norm": 1.7578125, + "learning_rate": 1.9850714222693576e-05, + "loss": 1.0572, + "step": 1058 + }, + { + "epoch": 0.18158047024026405, + "grad_norm": 1.65625, + "learning_rate": 1.9850403178498906e-05, + "loss": 1.1359, + "step": 1059 + }, + { + "epoch": 0.1817519343292539, + "grad_norm": 1.6953125, + "learning_rate": 1.9850091813045496e-05, + "loss": 1.0709, + "step": 1060 + }, + { + "epoch": 0.1819233984182438, + "grad_norm": 1.875, + "learning_rate": 1.9849780126343502e-05, + "loss": 1.0282, + "step": 1061 + }, + { + "epoch": 0.18209486250723364, + "grad_norm": 1.765625, + "learning_rate": 1.984946811840309e-05, + "loss": 1.0842, + "step": 1062 + }, 
+ { + "epoch": 0.1822663265962235, + "grad_norm": 1.71875, + "learning_rate": 1.9849155789234427e-05, + "loss": 1.0752, + "step": 1063 + }, + { + "epoch": 0.18243779068521337, + "grad_norm": 1.6875, + "learning_rate": 1.9848843138847704e-05, + "loss": 0.9987, + "step": 1064 + }, + { + "epoch": 0.18260925477420323, + "grad_norm": 1.671875, + "learning_rate": 1.984853016725312e-05, + "loss": 1.1689, + "step": 1065 + }, + { + "epoch": 0.18278071886319308, + "grad_norm": 1.640625, + "learning_rate": 1.9848216874460885e-05, + "loss": 1.0585, + "step": 1066 + }, + { + "epoch": 0.18295218295218296, + "grad_norm": 1.59375, + "learning_rate": 1.984790326048121e-05, + "loss": 1.1052, + "step": 1067 + }, + { + "epoch": 0.18312364704117282, + "grad_norm": 1.71875, + "learning_rate": 1.984758932532433e-05, + "loss": 1.0769, + "step": 1068 + }, + { + "epoch": 0.18329511113016267, + "grad_norm": 1.5625, + "learning_rate": 1.9847275069000473e-05, + "loss": 1.0894, + "step": 1069 + }, + { + "epoch": 0.18346657521915255, + "grad_norm": 1.7265625, + "learning_rate": 1.98469604915199e-05, + "loss": 1.1557, + "step": 1070 + }, + { + "epoch": 0.1836380393081424, + "grad_norm": 1.6484375, + "learning_rate": 1.9846645592892862e-05, + "loss": 1.0458, + "step": 1071 + }, + { + "epoch": 0.18380950339713226, + "grad_norm": 1.7421875, + "learning_rate": 1.9846330373129633e-05, + "loss": 1.1307, + "step": 1072 + }, + { + "epoch": 0.18398096748612214, + "grad_norm": 1.71875, + "learning_rate": 1.9846014832240494e-05, + "loss": 1.004, + "step": 1073 + }, + { + "epoch": 0.184152431575112, + "grad_norm": 1.7265625, + "learning_rate": 1.9845698970235733e-05, + "loss": 1.0424, + "step": 1074 + }, + { + "epoch": 0.18432389566410184, + "grad_norm": 1.7421875, + "learning_rate": 1.9845382787125653e-05, + "loss": 1.0541, + "step": 1075 + }, + { + "epoch": 0.18449535975309173, + "grad_norm": 1.65625, + "learning_rate": 1.984506628292057e-05, + "loss": 1.0967, + "step": 1076 + }, + { + "epoch": 0.18466682384208158, + "grad_norm": 1.6328125, + "learning_rate": 1.9844749457630797e-05, + "loss": 1.1174, + "step": 1077 + }, + { + "epoch": 0.18483828793107143, + "grad_norm": 1.65625, + "learning_rate": 1.9844432311266675e-05, + "loss": 1.0977, + "step": 1078 + }, + { + "epoch": 0.18500975202006129, + "grad_norm": 1.640625, + "learning_rate": 1.9844114843838542e-05, + "loss": 1.1425, + "step": 1079 + }, + { + "epoch": 0.18518121610905117, + "grad_norm": 1.75, + "learning_rate": 1.984379705535676e-05, + "loss": 1.0752, + "step": 1080 + }, + { + "epoch": 0.18535268019804102, + "grad_norm": 1.8046875, + "learning_rate": 1.9843478945831684e-05, + "loss": 1.1111, + "step": 1081 + }, + { + "epoch": 0.18552414428703087, + "grad_norm": 1.625, + "learning_rate": 1.984316051527369e-05, + "loss": 1.0563, + "step": 1082 + }, + { + "epoch": 0.18569560837602075, + "grad_norm": 1.765625, + "learning_rate": 1.9842841763693167e-05, + "loss": 1.0428, + "step": 1083 + }, + { + "epoch": 0.1858670724650106, + "grad_norm": 1.5703125, + "learning_rate": 1.9842522691100513e-05, + "loss": 1.0658, + "step": 1084 + }, + { + "epoch": 0.18603853655400046, + "grad_norm": 1.7421875, + "learning_rate": 1.984220329750613e-05, + "loss": 1.1422, + "step": 1085 + }, + { + "epoch": 0.18621000064299034, + "grad_norm": 1.6328125, + "learning_rate": 1.984188358292043e-05, + "loss": 1.1716, + "step": 1086 + }, + { + "epoch": 0.1863814647319802, + "grad_norm": 1.53125, + "learning_rate": 1.984156354735385e-05, + "loss": 1.0068, + "step": 1087 + }, + { + "epoch": 
0.18655292882097005, + "grad_norm": 1.6796875, + "learning_rate": 1.984124319081682e-05, + "loss": 1.0618, + "step": 1088 + }, + { + "epoch": 0.18672439290995993, + "grad_norm": 1.6484375, + "learning_rate": 1.9840922513319793e-05, + "loss": 0.9207, + "step": 1089 + }, + { + "epoch": 0.18689585699894978, + "grad_norm": 1.796875, + "learning_rate": 1.9840601514873226e-05, + "loss": 1.0818, + "step": 1090 + }, + { + "epoch": 0.18706732108793964, + "grad_norm": 1.6875, + "learning_rate": 1.9840280195487587e-05, + "loss": 1.0598, + "step": 1091 + }, + { + "epoch": 0.18723878517692952, + "grad_norm": 1.7265625, + "learning_rate": 1.9839958555173354e-05, + "loss": 1.1416, + "step": 1092 + }, + { + "epoch": 0.18741024926591937, + "grad_norm": 1.5859375, + "learning_rate": 1.9839636593941018e-05, + "loss": 1.0442, + "step": 1093 + }, + { + "epoch": 0.18758171335490922, + "grad_norm": 1.7109375, + "learning_rate": 1.9839314311801085e-05, + "loss": 1.0928, + "step": 1094 + }, + { + "epoch": 0.1877531774438991, + "grad_norm": 1.71875, + "learning_rate": 1.9838991708764054e-05, + "loss": 1.0887, + "step": 1095 + }, + { + "epoch": 0.18792464153288896, + "grad_norm": 1.5703125, + "learning_rate": 1.9838668784840457e-05, + "loss": 1.0573, + "step": 1096 + }, + { + "epoch": 0.1880961056218788, + "grad_norm": 1.6015625, + "learning_rate": 1.9838345540040823e-05, + "loss": 1.0727, + "step": 1097 + }, + { + "epoch": 0.1882675697108687, + "grad_norm": 1.703125, + "learning_rate": 1.983802197437569e-05, + "loss": 1.0522, + "step": 1098 + }, + { + "epoch": 0.18843903379985855, + "grad_norm": 1.6328125, + "learning_rate": 1.9837698087855615e-05, + "loss": 1.0368, + "step": 1099 + }, + { + "epoch": 0.1886104978888484, + "grad_norm": 1.640625, + "learning_rate": 1.9837373880491162e-05, + "loss": 1.1018, + "step": 1100 + }, + { + "epoch": 0.18878196197783825, + "grad_norm": 1.6015625, + "learning_rate": 1.98370493522929e-05, + "loss": 1.0202, + "step": 1101 + }, + { + "epoch": 0.18895342606682813, + "grad_norm": 1.6484375, + "learning_rate": 1.9836724503271417e-05, + "loss": 1.1532, + "step": 1102 + }, + { + "epoch": 0.189124890155818, + "grad_norm": 1.7109375, + "learning_rate": 1.9836399333437307e-05, + "loss": 1.0213, + "step": 1103 + }, + { + "epoch": 0.18929635424480784, + "grad_norm": 1.703125, + "learning_rate": 1.9836073842801175e-05, + "loss": 1.0724, + "step": 1104 + }, + { + "epoch": 0.18946781833379772, + "grad_norm": 1.7265625, + "learning_rate": 1.983574803137363e-05, + "loss": 1.1454, + "step": 1105 + }, + { + "epoch": 0.18963928242278758, + "grad_norm": 1.65625, + "learning_rate": 1.9835421899165312e-05, + "loss": 1.0038, + "step": 1106 + }, + { + "epoch": 0.18981074651177743, + "grad_norm": 1.6875, + "learning_rate": 1.9835095446186842e-05, + "loss": 1.0624, + "step": 1107 + }, + { + "epoch": 0.1899822106007673, + "grad_norm": 1.625, + "learning_rate": 1.9834768672448877e-05, + "loss": 1.1086, + "step": 1108 + }, + { + "epoch": 0.19015367468975716, + "grad_norm": 1.75, + "learning_rate": 1.9834441577962072e-05, + "loss": 1.1179, + "step": 1109 + }, + { + "epoch": 0.19032513877874702, + "grad_norm": 1.765625, + "learning_rate": 1.983411416273709e-05, + "loss": 1.2127, + "step": 1110 + }, + { + "epoch": 0.1904966028677369, + "grad_norm": 1.734375, + "learning_rate": 1.983378642678462e-05, + "loss": 1.0469, + "step": 1111 + }, + { + "epoch": 0.19066806695672675, + "grad_norm": 1.703125, + "learning_rate": 1.983345837011534e-05, + "loss": 1.1232, + "step": 1112 + }, + { + "epoch": 0.1908395310457166, + 
"grad_norm": 1.6640625, + "learning_rate": 1.9833129992739956e-05, + "loss": 1.1014, + "step": 1113 + }, + { + "epoch": 0.19101099513470648, + "grad_norm": 1.671875, + "learning_rate": 1.983280129466917e-05, + "loss": 1.0703, + "step": 1114 + }, + { + "epoch": 0.19118245922369634, + "grad_norm": 1.6484375, + "learning_rate": 1.983247227591371e-05, + "loss": 1.0507, + "step": 1115 + }, + { + "epoch": 0.1913539233126862, + "grad_norm": 1.671875, + "learning_rate": 1.9832142936484303e-05, + "loss": 1.0769, + "step": 1116 + }, + { + "epoch": 0.19152538740167607, + "grad_norm": 1.7890625, + "learning_rate": 1.983181327639169e-05, + "loss": 1.071, + "step": 1117 + }, + { + "epoch": 0.19169685149066593, + "grad_norm": 1.6484375, + "learning_rate": 1.9831483295646623e-05, + "loss": 1.1299, + "step": 1118 + }, + { + "epoch": 0.19186831557965578, + "grad_norm": 1.640625, + "learning_rate": 1.9831152994259863e-05, + "loss": 0.9908, + "step": 1119 + }, + { + "epoch": 0.19203977966864566, + "grad_norm": 1.5859375, + "learning_rate": 1.9830822372242185e-05, + "loss": 1.1028, + "step": 1120 + }, + { + "epoch": 0.1922112437576355, + "grad_norm": 1.5703125, + "learning_rate": 1.983049142960437e-05, + "loss": 0.9642, + "step": 1121 + }, + { + "epoch": 0.19238270784662537, + "grad_norm": 1.7265625, + "learning_rate": 1.983016016635721e-05, + "loss": 1.0726, + "step": 1122 + }, + { + "epoch": 0.19255417193561525, + "grad_norm": 1.71875, + "learning_rate": 1.982982858251151e-05, + "loss": 1.0486, + "step": 1123 + }, + { + "epoch": 0.1927256360246051, + "grad_norm": 1.6953125, + "learning_rate": 1.9829496678078083e-05, + "loss": 1.0696, + "step": 1124 + }, + { + "epoch": 0.19289710011359495, + "grad_norm": 1.6171875, + "learning_rate": 1.9829164453067754e-05, + "loss": 1.0225, + "step": 1125 + }, + { + "epoch": 0.1930685642025848, + "grad_norm": 1.7265625, + "learning_rate": 1.9828831907491364e-05, + "loss": 1.1109, + "step": 1126 + }, + { + "epoch": 0.1932400282915747, + "grad_norm": 1.6796875, + "learning_rate": 1.982849904135975e-05, + "loss": 1.0883, + "step": 1127 + }, + { + "epoch": 0.19341149238056454, + "grad_norm": 1.7421875, + "learning_rate": 1.9828165854683774e-05, + "loss": 1.1209, + "step": 1128 + }, + { + "epoch": 0.1935829564695544, + "grad_norm": 1.6796875, + "learning_rate": 1.98278323474743e-05, + "loss": 1.1511, + "step": 1129 + }, + { + "epoch": 0.19375442055854428, + "grad_norm": 1.6484375, + "learning_rate": 1.9827498519742202e-05, + "loss": 1.1082, + "step": 1130 + }, + { + "epoch": 0.19392588464753413, + "grad_norm": 1.6640625, + "learning_rate": 1.982716437149837e-05, + "loss": 1.0247, + "step": 1131 + }, + { + "epoch": 0.19409734873652398, + "grad_norm": 1.7734375, + "learning_rate": 1.9826829902753706e-05, + "loss": 1.1547, + "step": 1132 + }, + { + "epoch": 0.19426881282551386, + "grad_norm": 1.671875, + "learning_rate": 1.9826495113519112e-05, + "loss": 1.0645, + "step": 1133 + }, + { + "epoch": 0.19444027691450372, + "grad_norm": 1.6171875, + "learning_rate": 1.982616000380551e-05, + "loss": 1.0384, + "step": 1134 + }, + { + "epoch": 0.19461174100349357, + "grad_norm": 1.71875, + "learning_rate": 1.9825824573623825e-05, + "loss": 1.086, + "step": 1135 + }, + { + "epoch": 0.19478320509248345, + "grad_norm": 1.7109375, + "learning_rate": 1.9825488822985007e-05, + "loss": 1.1152, + "step": 1136 + }, + { + "epoch": 0.1949546691814733, + "grad_norm": 1.5625, + "learning_rate": 1.9825152751899993e-05, + "loss": 0.9781, + "step": 1137 + }, + { + "epoch": 0.19512613327046316, + 
"grad_norm": 1.5859375, + "learning_rate": 1.9824816360379753e-05, + "loss": 1.0205, + "step": 1138 + }, + { + "epoch": 0.19529759735945304, + "grad_norm": 1.640625, + "learning_rate": 1.9824479648435253e-05, + "loss": 1.0696, + "step": 1139 + }, + { + "epoch": 0.1954690614484429, + "grad_norm": 1.65625, + "learning_rate": 1.9824142616077475e-05, + "loss": 1.0341, + "step": 1140 + }, + { + "epoch": 0.19564052553743275, + "grad_norm": 1.6484375, + "learning_rate": 1.9823805263317415e-05, + "loss": 0.989, + "step": 1141 + }, + { + "epoch": 0.19581198962642263, + "grad_norm": 1.6484375, + "learning_rate": 1.9823467590166073e-05, + "loss": 1.0009, + "step": 1142 + }, + { + "epoch": 0.19598345371541248, + "grad_norm": 1.7734375, + "learning_rate": 1.9823129596634456e-05, + "loss": 1.1526, + "step": 1143 + }, + { + "epoch": 0.19615491780440233, + "grad_norm": 1.84375, + "learning_rate": 1.98227912827336e-05, + "loss": 1.047, + "step": 1144 + }, + { + "epoch": 0.19632638189339222, + "grad_norm": 1.7265625, + "learning_rate": 1.9822452648474524e-05, + "loss": 1.0929, + "step": 1145 + }, + { + "epoch": 0.19649784598238207, + "grad_norm": 1.734375, + "learning_rate": 1.9822113693868282e-05, + "loss": 1.0357, + "step": 1146 + }, + { + "epoch": 0.19666931007137192, + "grad_norm": 1.6640625, + "learning_rate": 1.9821774418925924e-05, + "loss": 1.0461, + "step": 1147 + }, + { + "epoch": 0.19684077416036178, + "grad_norm": 1.6171875, + "learning_rate": 1.982143482365852e-05, + "loss": 1.0513, + "step": 1148 + }, + { + "epoch": 0.19701223824935166, + "grad_norm": 1.7734375, + "learning_rate": 1.982109490807714e-05, + "loss": 1.05, + "step": 1149 + }, + { + "epoch": 0.1971837023383415, + "grad_norm": 1.6640625, + "learning_rate": 1.982075467219287e-05, + "loss": 1.1052, + "step": 1150 + }, + { + "epoch": 0.19735516642733136, + "grad_norm": 1.7265625, + "learning_rate": 1.9820414116016814e-05, + "loss": 1.1074, + "step": 1151 + }, + { + "epoch": 0.19752663051632124, + "grad_norm": 1.734375, + "learning_rate": 1.982007323956007e-05, + "loss": 1.0711, + "step": 1152 + }, + { + "epoch": 0.1976980946053111, + "grad_norm": 1.6796875, + "learning_rate": 1.9819732042833762e-05, + "loss": 1.0514, + "step": 1153 + }, + { + "epoch": 0.19786955869430095, + "grad_norm": 1.6953125, + "learning_rate": 1.981939052584901e-05, + "loss": 1.0599, + "step": 1154 + }, + { + "epoch": 0.19804102278329083, + "grad_norm": 1.703125, + "learning_rate": 1.981904868861696e-05, + "loss": 1.1223, + "step": 1155 + }, + { + "epoch": 0.19821248687228069, + "grad_norm": 1.7734375, + "learning_rate": 1.9818706531148754e-05, + "loss": 1.1717, + "step": 1156 + }, + { + "epoch": 0.19838395096127054, + "grad_norm": 1.765625, + "learning_rate": 1.9818364053455553e-05, + "loss": 1.1644, + "step": 1157 + }, + { + "epoch": 0.19855541505026042, + "grad_norm": 1.65625, + "learning_rate": 1.9818021255548536e-05, + "loss": 1.064, + "step": 1158 + }, + { + "epoch": 0.19872687913925027, + "grad_norm": 1.8359375, + "learning_rate": 1.9817678137438866e-05, + "loss": 1.1106, + "step": 1159 + }, + { + "epoch": 0.19889834322824013, + "grad_norm": 1.7265625, + "learning_rate": 1.9817334699137746e-05, + "loss": 1.0919, + "step": 1160 + }, + { + "epoch": 0.19906980731723, + "grad_norm": 1.5546875, + "learning_rate": 1.9816990940656373e-05, + "loss": 1.0914, + "step": 1161 + }, + { + "epoch": 0.19924127140621986, + "grad_norm": 1.734375, + "learning_rate": 1.9816646862005954e-05, + "loss": 1.1322, + "step": 1162 + }, + { + "epoch": 0.19941273549520971, + 
"grad_norm": 1.6328125, + "learning_rate": 1.9816302463197718e-05, + "loss": 1.0757, + "step": 1163 + }, + { + "epoch": 0.1995841995841996, + "grad_norm": 1.6328125, + "learning_rate": 1.9815957744242894e-05, + "loss": 1.0408, + "step": 1164 + }, + { + "epoch": 0.19975566367318945, + "grad_norm": 1.703125, + "learning_rate": 1.9815612705152726e-05, + "loss": 1.1011, + "step": 1165 + }, + { + "epoch": 0.1999271277621793, + "grad_norm": 1.65625, + "learning_rate": 1.9815267345938464e-05, + "loss": 1.052, + "step": 1166 + }, + { + "epoch": 0.20009859185116918, + "grad_norm": 1.6953125, + "learning_rate": 1.9814921666611372e-05, + "loss": 0.9697, + "step": 1167 + }, + { + "epoch": 0.20027005594015904, + "grad_norm": 1.5859375, + "learning_rate": 1.9814575667182723e-05, + "loss": 1.0396, + "step": 1168 + }, + { + "epoch": 0.2004415200291489, + "grad_norm": 1.640625, + "learning_rate": 1.9814229347663806e-05, + "loss": 1.1019, + "step": 1169 + }, + { + "epoch": 0.20061298411813877, + "grad_norm": 1.625, + "learning_rate": 1.9813882708065914e-05, + "loss": 1.03, + "step": 1170 + }, + { + "epoch": 0.20078444820712862, + "grad_norm": 1.671875, + "learning_rate": 1.981353574840035e-05, + "loss": 1.0963, + "step": 1171 + }, + { + "epoch": 0.20095591229611848, + "grad_norm": 1.6953125, + "learning_rate": 1.981318846867843e-05, + "loss": 1.0078, + "step": 1172 + }, + { + "epoch": 0.20112737638510833, + "grad_norm": 1.6875, + "learning_rate": 1.9812840868911484e-05, + "loss": 1.1007, + "step": 1173 + }, + { + "epoch": 0.2012988404740982, + "grad_norm": 1.7421875, + "learning_rate": 1.981249294911084e-05, + "loss": 1.1998, + "step": 1174 + }, + { + "epoch": 0.20147030456308807, + "grad_norm": 1.671875, + "learning_rate": 1.9812144709287856e-05, + "loss": 1.0579, + "step": 1175 + }, + { + "epoch": 0.20164176865207792, + "grad_norm": 1.7109375, + "learning_rate": 1.9811796149453883e-05, + "loss": 1.1642, + "step": 1176 + }, + { + "epoch": 0.2018132327410678, + "grad_norm": 1.6484375, + "learning_rate": 1.9811447269620284e-05, + "loss": 1.1081, + "step": 1177 + }, + { + "epoch": 0.20198469683005765, + "grad_norm": 1.640625, + "learning_rate": 1.9811098069798448e-05, + "loss": 1.024, + "step": 1178 + }, + { + "epoch": 0.2021561609190475, + "grad_norm": 1.5625, + "learning_rate": 1.981074854999976e-05, + "loss": 1.077, + "step": 1179 + }, + { + "epoch": 0.2023276250080374, + "grad_norm": 2.1875, + "learning_rate": 1.9810398710235616e-05, + "loss": 1.0823, + "step": 1180 + }, + { + "epoch": 0.20249908909702724, + "grad_norm": 1.7578125, + "learning_rate": 1.981004855051743e-05, + "loss": 1.0865, + "step": 1181 + }, + { + "epoch": 0.2026705531860171, + "grad_norm": 1.734375, + "learning_rate": 1.9809698070856616e-05, + "loss": 1.1196, + "step": 1182 + }, + { + "epoch": 0.20284201727500697, + "grad_norm": 1.7734375, + "learning_rate": 1.980934727126461e-05, + "loss": 1.1146, + "step": 1183 + }, + { + "epoch": 0.20301348136399683, + "grad_norm": 1.703125, + "learning_rate": 1.980899615175285e-05, + "loss": 1.082, + "step": 1184 + }, + { + "epoch": 0.20318494545298668, + "grad_norm": 1.6875, + "learning_rate": 1.9808644712332786e-05, + "loss": 1.08, + "step": 1185 + }, + { + "epoch": 0.20335640954197656, + "grad_norm": 1.6796875, + "learning_rate": 1.9808292953015888e-05, + "loss": 1.0736, + "step": 1186 + }, + { + "epoch": 0.20352787363096642, + "grad_norm": 1.703125, + "learning_rate": 1.980794087381362e-05, + "loss": 1.0743, + "step": 1187 + }, + { + "epoch": 0.20369933771995627, + "grad_norm": 1.6796875, + 
"learning_rate": 1.980758847473746e-05, + "loss": 1.1012, + "step": 1188 + }, + { + "epoch": 0.20387080180894615, + "grad_norm": 1.609375, + "learning_rate": 1.9807235755798918e-05, + "loss": 0.9476, + "step": 1189 + }, + { + "epoch": 0.204042265897936, + "grad_norm": 1.65625, + "learning_rate": 1.980688271700948e-05, + "loss": 1.003, + "step": 1190 + }, + { + "epoch": 0.20421372998692586, + "grad_norm": 1.734375, + "learning_rate": 1.980652935838067e-05, + "loss": 1.0686, + "step": 1191 + }, + { + "epoch": 0.20438519407591574, + "grad_norm": 39.5, + "learning_rate": 1.9806175679924008e-05, + "loss": 1.0877, + "step": 1192 + }, + { + "epoch": 0.2045566581649056, + "grad_norm": 1.6015625, + "learning_rate": 1.9805821681651035e-05, + "loss": 1.0505, + "step": 1193 + }, + { + "epoch": 0.20472812225389544, + "grad_norm": 1.859375, + "learning_rate": 1.980546736357329e-05, + "loss": 1.117, + "step": 1194 + }, + { + "epoch": 0.2048995863428853, + "grad_norm": 1.703125, + "learning_rate": 1.9805112725702326e-05, + "loss": 1.07, + "step": 1195 + }, + { + "epoch": 0.20507105043187518, + "grad_norm": 1.703125, + "learning_rate": 1.980475776804972e-05, + "loss": 1.0231, + "step": 1196 + }, + { + "epoch": 0.20524251452086503, + "grad_norm": 1.6796875, + "learning_rate": 1.9804402490627038e-05, + "loss": 1.0791, + "step": 1197 + }, + { + "epoch": 0.20541397860985489, + "grad_norm": 1.7421875, + "learning_rate": 1.9804046893445868e-05, + "loss": 1.1112, + "step": 1198 + }, + { + "epoch": 0.20558544269884477, + "grad_norm": 1.6875, + "learning_rate": 1.9803690976517814e-05, + "loss": 1.0347, + "step": 1199 + }, + { + "epoch": 0.20575690678783462, + "grad_norm": 1.6640625, + "learning_rate": 1.9803334739854477e-05, + "loss": 1.1235, + "step": 1200 + }, + { + "epoch": 0.20592837087682447, + "grad_norm": 1.5546875, + "learning_rate": 1.9802978183467482e-05, + "loss": 1.0379, + "step": 1201 + }, + { + "epoch": 0.20609983496581435, + "grad_norm": 1.6953125, + "learning_rate": 1.9802621307368453e-05, + "loss": 1.0426, + "step": 1202 + }, + { + "epoch": 0.2062712990548042, + "grad_norm": 1.6640625, + "learning_rate": 1.980226411156903e-05, + "loss": 1.1002, + "step": 1203 + }, + { + "epoch": 0.20644276314379406, + "grad_norm": 1.53125, + "learning_rate": 1.980190659608086e-05, + "loss": 1.0196, + "step": 1204 + }, + { + "epoch": 0.20661422723278394, + "grad_norm": 1.7109375, + "learning_rate": 1.9801548760915607e-05, + "loss": 1.0715, + "step": 1205 + }, + { + "epoch": 0.2067856913217738, + "grad_norm": 1.671875, + "learning_rate": 1.980119060608494e-05, + "loss": 0.9744, + "step": 1206 + }, + { + "epoch": 0.20695715541076365, + "grad_norm": 1.6875, + "learning_rate": 1.9800832131600537e-05, + "loss": 1.1004, + "step": 1207 + }, + { + "epoch": 0.20712861949975353, + "grad_norm": 1.6484375, + "learning_rate": 1.98004733374741e-05, + "loss": 1.0266, + "step": 1208 + }, + { + "epoch": 0.20730008358874338, + "grad_norm": 1.671875, + "learning_rate": 1.9800114223717314e-05, + "loss": 0.9948, + "step": 1209 + }, + { + "epoch": 0.20747154767773324, + "grad_norm": 1.625, + "learning_rate": 1.97997547903419e-05, + "loss": 1.0021, + "step": 1210 + }, + { + "epoch": 0.20764301176672312, + "grad_norm": 1.640625, + "learning_rate": 1.9799395037359583e-05, + "loss": 1.0087, + "step": 1211 + }, + { + "epoch": 0.20781447585571297, + "grad_norm": 1.6953125, + "learning_rate": 1.9799034964782088e-05, + "loss": 1.0903, + "step": 1212 + }, + { + "epoch": 0.20798593994470282, + "grad_norm": 1.703125, + "learning_rate": 
1.9798674572621167e-05, + "loss": 1.1095, + "step": 1213 + }, + { + "epoch": 0.2081574040336927, + "grad_norm": 1.671875, + "learning_rate": 1.979831386088857e-05, + "loss": 1.077, + "step": 1214 + }, + { + "epoch": 0.20832886812268256, + "grad_norm": 1.6640625, + "learning_rate": 1.979795282959606e-05, + "loss": 1.008, + "step": 1215 + }, + { + "epoch": 0.2085003322116724, + "grad_norm": 1.6875, + "learning_rate": 1.9797591478755413e-05, + "loss": 1.0306, + "step": 1216 + }, + { + "epoch": 0.2086717963006623, + "grad_norm": 1.578125, + "learning_rate": 1.9797229808378412e-05, + "loss": 1.0404, + "step": 1217 + }, + { + "epoch": 0.20884326038965215, + "grad_norm": 1.5546875, + "learning_rate": 1.9796867818476854e-05, + "loss": 1.049, + "step": 1218 + }, + { + "epoch": 0.209014724478642, + "grad_norm": 1.6484375, + "learning_rate": 1.9796505509062545e-05, + "loss": 0.9834, + "step": 1219 + }, + { + "epoch": 0.20918618856763185, + "grad_norm": 1.6640625, + "learning_rate": 1.97961428801473e-05, + "loss": 1.0725, + "step": 1220 + }, + { + "epoch": 0.20935765265662173, + "grad_norm": 1.75, + "learning_rate": 1.979577993174295e-05, + "loss": 1.0489, + "step": 1221 + }, + { + "epoch": 0.2095291167456116, + "grad_norm": 1.6796875, + "learning_rate": 1.9795416663861328e-05, + "loss": 1.0873, + "step": 1222 + }, + { + "epoch": 0.20970058083460144, + "grad_norm": 1.6328125, + "learning_rate": 1.9795053076514283e-05, + "loss": 1.008, + "step": 1223 + }, + { + "epoch": 0.20987204492359132, + "grad_norm": 3.4375, + "learning_rate": 1.979468916971367e-05, + "loss": 1.0834, + "step": 1224 + }, + { + "epoch": 0.21004350901258118, + "grad_norm": 1.7265625, + "learning_rate": 1.9794324943471365e-05, + "loss": 1.0316, + "step": 1225 + }, + { + "epoch": 0.21021497310157103, + "grad_norm": 1.7578125, + "learning_rate": 1.9793960397799233e-05, + "loss": 1.0937, + "step": 1226 + }, + { + "epoch": 0.2103864371905609, + "grad_norm": 1.65625, + "learning_rate": 1.979359553270918e-05, + "loss": 1.0939, + "step": 1227 + }, + { + "epoch": 0.21055790127955076, + "grad_norm": 1.671875, + "learning_rate": 1.9793230348213094e-05, + "loss": 1.0984, + "step": 1228 + }, + { + "epoch": 0.21072936536854062, + "grad_norm": 1.640625, + "learning_rate": 1.979286484432289e-05, + "loss": 1.0379, + "step": 1229 + }, + { + "epoch": 0.2109008294575305, + "grad_norm": 1.6953125, + "learning_rate": 1.9792499021050484e-05, + "loss": 1.1062, + "step": 1230 + }, + { + "epoch": 0.21107229354652035, + "grad_norm": 1.7109375, + "learning_rate": 1.979213287840781e-05, + "loss": 0.9739, + "step": 1231 + }, + { + "epoch": 0.2112437576355102, + "grad_norm": 1.71875, + "learning_rate": 1.979176641640681e-05, + "loss": 1.1277, + "step": 1232 + }, + { + "epoch": 0.21141522172450009, + "grad_norm": 1.7265625, + "learning_rate": 1.9791399635059438e-05, + "loss": 0.9814, + "step": 1233 + }, + { + "epoch": 0.21158668581348994, + "grad_norm": 1.6953125, + "learning_rate": 1.979103253437765e-05, + "loss": 1.0308, + "step": 1234 + }, + { + "epoch": 0.2117581499024798, + "grad_norm": 1.71875, + "learning_rate": 1.979066511437342e-05, + "loss": 1.0897, + "step": 1235 + }, + { + "epoch": 0.21192961399146967, + "grad_norm": 1.7890625, + "learning_rate": 1.9790297375058733e-05, + "loss": 1.0596, + "step": 1236 + }, + { + "epoch": 0.21210107808045953, + "grad_norm": 1.65625, + "learning_rate": 1.9789929316445584e-05, + "loss": 1.0471, + "step": 1237 + }, + { + "epoch": 0.21227254216944938, + "grad_norm": 1.609375, + "learning_rate": 1.9789560938545972e-05, + 
"loss": 1.0457, + "step": 1238 + }, + { + "epoch": 0.21244400625843926, + "grad_norm": 1.6953125, + "learning_rate": 1.9789192241371916e-05, + "loss": 1.0937, + "step": 1239 + }, + { + "epoch": 0.21261547034742911, + "grad_norm": 1.703125, + "learning_rate": 1.9788823224935436e-05, + "loss": 1.082, + "step": 1240 + }, + { + "epoch": 0.21278693443641897, + "grad_norm": 1.671875, + "learning_rate": 1.978845388924857e-05, + "loss": 1.0213, + "step": 1241 + }, + { + "epoch": 0.21295839852540885, + "grad_norm": 1.6796875, + "learning_rate": 1.9788084234323365e-05, + "loss": 1.0233, + "step": 1242 + }, + { + "epoch": 0.2131298626143987, + "grad_norm": 1.6484375, + "learning_rate": 1.978771426017187e-05, + "loss": 1.0576, + "step": 1243 + }, + { + "epoch": 0.21330132670338856, + "grad_norm": 1.6875, + "learning_rate": 1.978734396680616e-05, + "loss": 1.0684, + "step": 1244 + }, + { + "epoch": 0.2134727907923784, + "grad_norm": 1.7734375, + "learning_rate": 1.9786973354238304e-05, + "loss": 1.0791, + "step": 1245 + }, + { + "epoch": 0.2136442548813683, + "grad_norm": 1.6171875, + "learning_rate": 1.9786602422480397e-05, + "loss": 1.1244, + "step": 1246 + }, + { + "epoch": 0.21381571897035814, + "grad_norm": 1.671875, + "learning_rate": 1.9786231171544533e-05, + "loss": 1.0968, + "step": 1247 + }, + { + "epoch": 0.213987183059348, + "grad_norm": 1.6484375, + "learning_rate": 1.9785859601442813e-05, + "loss": 1.12, + "step": 1248 + }, + { + "epoch": 0.21415864714833788, + "grad_norm": 1.59375, + "learning_rate": 1.9785487712187365e-05, + "loss": 1.0423, + "step": 1249 + }, + { + "epoch": 0.21433011123732773, + "grad_norm": 1.625, + "learning_rate": 1.9785115503790314e-05, + "loss": 1.0428, + "step": 1250 + }, + { + "epoch": 0.21450157532631758, + "grad_norm": 1.671875, + "learning_rate": 1.9784742976263797e-05, + "loss": 1.134, + "step": 1251 + }, + { + "epoch": 0.21467303941530747, + "grad_norm": 1.75, + "learning_rate": 1.9784370129619966e-05, + "loss": 1.1417, + "step": 1252 + }, + { + "epoch": 0.21484450350429732, + "grad_norm": 1.7734375, + "learning_rate": 1.9783996963870983e-05, + "loss": 1.1021, + "step": 1253 + }, + { + "epoch": 0.21501596759328717, + "grad_norm": 1.7890625, + "learning_rate": 1.978362347902901e-05, + "loss": 1.0492, + "step": 1254 + }, + { + "epoch": 0.21518743168227705, + "grad_norm": 1.7265625, + "learning_rate": 1.978324967510624e-05, + "loss": 1.1161, + "step": 1255 + }, + { + "epoch": 0.2153588957712669, + "grad_norm": 1.7109375, + "learning_rate": 1.9782875552114857e-05, + "loss": 1.159, + "step": 1256 + }, + { + "epoch": 0.21553035986025676, + "grad_norm": 1.796875, + "learning_rate": 1.978250111006706e-05, + "loss": 1.1507, + "step": 1257 + }, + { + "epoch": 0.21570182394924664, + "grad_norm": 1.90625, + "learning_rate": 1.978212634897507e-05, + "loss": 1.0697, + "step": 1258 + }, + { + "epoch": 0.2158732880382365, + "grad_norm": 1.6875, + "learning_rate": 1.97817512688511e-05, + "loss": 1.0872, + "step": 1259 + }, + { + "epoch": 0.21604475212722635, + "grad_norm": 1.5859375, + "learning_rate": 1.9781375869707384e-05, + "loss": 1.0489, + "step": 1260 + }, + { + "epoch": 0.21621621621621623, + "grad_norm": 1.7265625, + "learning_rate": 1.978100015155617e-05, + "loss": 1.0415, + "step": 1261 + }, + { + "epoch": 0.21638768030520608, + "grad_norm": 1.7890625, + "learning_rate": 1.978062411440971e-05, + "loss": 1.0646, + "step": 1262 + }, + { + "epoch": 0.21655914439419593, + "grad_norm": 1.7109375, + "learning_rate": 1.9780247758280265e-05, + "loss": 1.0606, + "step": 
1263 + }, + { + "epoch": 0.21673060848318582, + "grad_norm": 1.6484375, + "learning_rate": 1.9779871083180113e-05, + "loss": 1.0649, + "step": 1264 + }, + { + "epoch": 0.21690207257217567, + "grad_norm": 1.546875, + "learning_rate": 1.9779494089121537e-05, + "loss": 1.0249, + "step": 1265 + }, + { + "epoch": 0.21707353666116552, + "grad_norm": 1.6484375, + "learning_rate": 1.9779116776116837e-05, + "loss": 1.0685, + "step": 1266 + }, + { + "epoch": 0.21724500075015538, + "grad_norm": 1.671875, + "learning_rate": 1.9778739144178307e-05, + "loss": 1.011, + "step": 1267 + }, + { + "epoch": 0.21741646483914526, + "grad_norm": 1.78125, + "learning_rate": 1.9778361193318276e-05, + "loss": 1.118, + "step": 1268 + }, + { + "epoch": 0.2175879289281351, + "grad_norm": 1.7109375, + "learning_rate": 1.9777982923549062e-05, + "loss": 1.1274, + "step": 1269 + }, + { + "epoch": 0.21775939301712496, + "grad_norm": 1.5703125, + "learning_rate": 1.9777604334883003e-05, + "loss": 0.9581, + "step": 1270 + }, + { + "epoch": 0.21793085710611484, + "grad_norm": 1.640625, + "learning_rate": 1.9777225427332448e-05, + "loss": 1.049, + "step": 1271 + }, + { + "epoch": 0.2181023211951047, + "grad_norm": 1.6328125, + "learning_rate": 1.9776846200909757e-05, + "loss": 1.038, + "step": 1272 + }, + { + "epoch": 0.21827378528409455, + "grad_norm": 1.625, + "learning_rate": 1.9776466655627292e-05, + "loss": 1.004, + "step": 1273 + }, + { + "epoch": 0.21844524937308443, + "grad_norm": 1.7109375, + "learning_rate": 1.977608679149744e-05, + "loss": 1.0698, + "step": 1274 + }, + { + "epoch": 0.21861671346207429, + "grad_norm": 1.6015625, + "learning_rate": 1.9775706608532577e-05, + "loss": 1.025, + "step": 1275 + }, + { + "epoch": 0.21878817755106414, + "grad_norm": 1.609375, + "learning_rate": 1.9775326106745113e-05, + "loss": 0.9535, + "step": 1276 + }, + { + "epoch": 0.21895964164005402, + "grad_norm": 1.625, + "learning_rate": 1.9774945286147454e-05, + "loss": 1.0276, + "step": 1277 + }, + { + "epoch": 0.21913110572904387, + "grad_norm": 1.71875, + "learning_rate": 1.977456414675202e-05, + "loss": 1.0779, + "step": 1278 + }, + { + "epoch": 0.21930256981803373, + "grad_norm": 1.7265625, + "learning_rate": 1.9774182688571245e-05, + "loss": 1.1006, + "step": 1279 + }, + { + "epoch": 0.2194740339070236, + "grad_norm": 1.625, + "learning_rate": 1.9773800911617563e-05, + "loss": 1.1209, + "step": 1280 + }, + { + "epoch": 0.21964549799601346, + "grad_norm": 1.703125, + "learning_rate": 1.9773418815903428e-05, + "loss": 0.9508, + "step": 1281 + }, + { + "epoch": 0.21981696208500331, + "grad_norm": 1.6875, + "learning_rate": 1.97730364014413e-05, + "loss": 1.0266, + "step": 1282 + }, + { + "epoch": 0.2199884261739932, + "grad_norm": 1.6875, + "learning_rate": 1.9772653668243658e-05, + "loss": 1.0921, + "step": 1283 + }, + { + "epoch": 0.22015989026298305, + "grad_norm": 1.6484375, + "learning_rate": 1.9772270616322977e-05, + "loss": 1.0855, + "step": 1284 + }, + { + "epoch": 0.2203313543519729, + "grad_norm": 1.6484375, + "learning_rate": 1.9771887245691753e-05, + "loss": 1.025, + "step": 1285 + }, + { + "epoch": 0.22050281844096278, + "grad_norm": 1.6796875, + "learning_rate": 1.977150355636249e-05, + "loss": 1.0946, + "step": 1286 + }, + { + "epoch": 0.22067428252995264, + "grad_norm": 1.609375, + "learning_rate": 1.9771119548347693e-05, + "loss": 1.0725, + "step": 1287 + }, + { + "epoch": 0.2208457466189425, + "grad_norm": 1.734375, + "learning_rate": 1.97707352216599e-05, + "loss": 1.0588, + "step": 1288 + }, + { + "epoch": 
0.22101721070793237, + "grad_norm": 1.6328125, + "learning_rate": 1.977035057631164e-05, + "loss": 0.9982, + "step": 1289 + }, + { + "epoch": 0.22118867479692222, + "grad_norm": 1.7421875, + "learning_rate": 1.976996561231545e-05, + "loss": 1.0788, + "step": 1290 + }, + { + "epoch": 0.22136013888591208, + "grad_norm": 1.7109375, + "learning_rate": 1.976958032968389e-05, + "loss": 1.0912, + "step": 1291 + }, + { + "epoch": 0.22153160297490193, + "grad_norm": 1.734375, + "learning_rate": 1.976919472842953e-05, + "loss": 1.1632, + "step": 1292 + }, + { + "epoch": 0.2217030670638918, + "grad_norm": 1.6953125, + "learning_rate": 1.9768808808564944e-05, + "loss": 1.0549, + "step": 1293 + }, + { + "epoch": 0.22187453115288167, + "grad_norm": 1.7578125, + "learning_rate": 1.9768422570102717e-05, + "loss": 1.0129, + "step": 1294 + }, + { + "epoch": 0.22204599524187152, + "grad_norm": 1.7890625, + "learning_rate": 1.9768036013055444e-05, + "loss": 1.0461, + "step": 1295 + }, + { + "epoch": 0.2222174593308614, + "grad_norm": 1.578125, + "learning_rate": 1.976764913743573e-05, + "loss": 1.0885, + "step": 1296 + }, + { + "epoch": 0.22238892341985125, + "grad_norm": 1.578125, + "learning_rate": 1.97672619432562e-05, + "loss": 1.068, + "step": 1297 + }, + { + "epoch": 0.2225603875088411, + "grad_norm": 1.609375, + "learning_rate": 1.976687443052948e-05, + "loss": 1.1145, + "step": 1298 + }, + { + "epoch": 0.222731851597831, + "grad_norm": 1.5390625, + "learning_rate": 1.9766486599268203e-05, + "loss": 1.0323, + "step": 1299 + }, + { + "epoch": 0.22290331568682084, + "grad_norm": 1.640625, + "learning_rate": 1.976609844948502e-05, + "loss": 1.0156, + "step": 1300 + }, + { + "epoch": 0.2230747797758107, + "grad_norm": 1.65625, + "learning_rate": 1.9765709981192592e-05, + "loss": 1.0087, + "step": 1301 + }, + { + "epoch": 0.22324624386480058, + "grad_norm": 1.65625, + "learning_rate": 1.9765321194403588e-05, + "loss": 0.998, + "step": 1302 + }, + { + "epoch": 0.22341770795379043, + "grad_norm": 1.6796875, + "learning_rate": 1.9764932089130687e-05, + "loss": 1.0416, + "step": 1303 + }, + { + "epoch": 0.22358917204278028, + "grad_norm": 1.6796875, + "learning_rate": 1.9764542665386578e-05, + "loss": 1.0588, + "step": 1304 + }, + { + "epoch": 0.22376063613177016, + "grad_norm": 1.75, + "learning_rate": 1.9764152923183965e-05, + "loss": 0.9994, + "step": 1305 + }, + { + "epoch": 0.22393210022076002, + "grad_norm": 1.640625, + "learning_rate": 1.9763762862535556e-05, + "loss": 1.1422, + "step": 1306 + }, + { + "epoch": 0.22410356430974987, + "grad_norm": 1.7890625, + "learning_rate": 1.976337248345407e-05, + "loss": 1.0702, + "step": 1307 + }, + { + "epoch": 0.22427502839873975, + "grad_norm": 1.5703125, + "learning_rate": 1.9762981785952248e-05, + "loss": 1.0187, + "step": 1308 + }, + { + "epoch": 0.2244464924877296, + "grad_norm": 1.6796875, + "learning_rate": 1.976259077004282e-05, + "loss": 1.0987, + "step": 1309 + }, + { + "epoch": 0.22461795657671946, + "grad_norm": 1.65625, + "learning_rate": 1.976219943573855e-05, + "loss": 1.0949, + "step": 1310 + }, + { + "epoch": 0.22478942066570934, + "grad_norm": 1.7265625, + "learning_rate": 1.976180778305219e-05, + "loss": 1.0813, + "step": 1311 + }, + { + "epoch": 0.2249608847546992, + "grad_norm": 1.671875, + "learning_rate": 1.976141581199652e-05, + "loss": 1.1172, + "step": 1312 + }, + { + "epoch": 0.22513234884368905, + "grad_norm": 1.65625, + "learning_rate": 1.9761023522584325e-05, + "loss": 1.125, + "step": 1313 + }, + { + "epoch": 0.2253038129326789, + 
"grad_norm": 1.7421875, + "learning_rate": 1.976063091482839e-05, + "loss": 1.0168, + "step": 1314 + }, + { + "epoch": 0.22547527702166878, + "grad_norm": 1.6875, + "learning_rate": 1.9760237988741534e-05, + "loss": 1.0785, + "step": 1315 + }, + { + "epoch": 0.22564674111065863, + "grad_norm": 1.640625, + "learning_rate": 1.975984474433656e-05, + "loss": 1.0187, + "step": 1316 + }, + { + "epoch": 0.2258182051996485, + "grad_norm": 1.6953125, + "learning_rate": 1.9759451181626295e-05, + "loss": 1.0427, + "step": 1317 + }, + { + "epoch": 0.22598966928863837, + "grad_norm": 1.7890625, + "learning_rate": 1.9759057300623578e-05, + "loss": 1.1146, + "step": 1318 + }, + { + "epoch": 0.22616113337762822, + "grad_norm": 1.6796875, + "learning_rate": 1.975866310134125e-05, + "loss": 1.0689, + "step": 1319 + }, + { + "epoch": 0.22633259746661807, + "grad_norm": 1.65625, + "learning_rate": 1.9758268583792176e-05, + "loss": 1.0353, + "step": 1320 + }, + { + "epoch": 0.22650406155560796, + "grad_norm": 1.6171875, + "learning_rate": 1.9757873747989214e-05, + "loss": 0.9958, + "step": 1321 + }, + { + "epoch": 0.2266755256445978, + "grad_norm": 1.6015625, + "learning_rate": 1.9757478593945244e-05, + "loss": 1.0269, + "step": 1322 + }, + { + "epoch": 0.22684698973358766, + "grad_norm": 1.7734375, + "learning_rate": 1.9757083121673157e-05, + "loss": 1.1019, + "step": 1323 + }, + { + "epoch": 0.22701845382257754, + "grad_norm": 1.6953125, + "learning_rate": 1.9756687331185845e-05, + "loss": 1.0571, + "step": 1324 + }, + { + "epoch": 0.2271899179115674, + "grad_norm": 1.84375, + "learning_rate": 1.9756291222496217e-05, + "loss": 1.163, + "step": 1325 + }, + { + "epoch": 0.22736138200055725, + "grad_norm": 1.59375, + "learning_rate": 1.9755894795617196e-05, + "loss": 1.0597, + "step": 1326 + }, + { + "epoch": 0.22753284608954713, + "grad_norm": 1.609375, + "learning_rate": 1.975549805056171e-05, + "loss": 0.9859, + "step": 1327 + }, + { + "epoch": 0.22770431017853698, + "grad_norm": 1.6015625, + "learning_rate": 1.9755100987342695e-05, + "loss": 0.979, + "step": 1328 + }, + { + "epoch": 0.22787577426752684, + "grad_norm": 1.6875, + "learning_rate": 1.9754703605973104e-05, + "loss": 1.0554, + "step": 1329 + }, + { + "epoch": 0.22804723835651672, + "grad_norm": 1.6640625, + "learning_rate": 1.975430590646589e-05, + "loss": 1.0303, + "step": 1330 + }, + { + "epoch": 0.22821870244550657, + "grad_norm": 1.65625, + "learning_rate": 1.9753907888834037e-05, + "loss": 1.1333, + "step": 1331 + }, + { + "epoch": 0.22839016653449642, + "grad_norm": 1.6484375, + "learning_rate": 1.9753509553090513e-05, + "loss": 1.0818, + "step": 1332 + }, + { + "epoch": 0.2285616306234863, + "grad_norm": 1.6796875, + "learning_rate": 1.9753110899248313e-05, + "loss": 1.1285, + "step": 1333 + }, + { + "epoch": 0.22873309471247616, + "grad_norm": 1.65625, + "learning_rate": 1.9752711927320444e-05, + "loss": 1.1243, + "step": 1334 + }, + { + "epoch": 0.228904558801466, + "grad_norm": 1.625, + "learning_rate": 1.975231263731991e-05, + "loss": 1.0094, + "step": 1335 + }, + { + "epoch": 0.2290760228904559, + "grad_norm": 1.6796875, + "learning_rate": 1.975191302925974e-05, + "loss": 1.1451, + "step": 1336 + }, + { + "epoch": 0.22924748697944575, + "grad_norm": 1.5703125, + "learning_rate": 1.975151310315296e-05, + "loss": 1.0273, + "step": 1337 + }, + { + "epoch": 0.2294189510684356, + "grad_norm": 1.5703125, + "learning_rate": 1.975111285901262e-05, + "loss": 1.1098, + "step": 1338 + }, + { + "epoch": 0.22959041515742545, + "grad_norm": 
1.65625, + "learning_rate": 1.975071229685177e-05, + "loss": 1.0838, + "step": 1339 + }, + { + "epoch": 0.22976187924641533, + "grad_norm": 1.640625, + "learning_rate": 1.9750311416683475e-05, + "loss": 1.0626, + "step": 1340 + }, + { + "epoch": 0.2299333433354052, + "grad_norm": 1.6328125, + "learning_rate": 1.974991021852081e-05, + "loss": 0.9985, + "step": 1341 + }, + { + "epoch": 0.23010480742439504, + "grad_norm": 1.640625, + "learning_rate": 1.9749508702376853e-05, + "loss": 1.0117, + "step": 1342 + }, + { + "epoch": 0.23027627151338492, + "grad_norm": 1.734375, + "learning_rate": 1.9749106868264706e-05, + "loss": 1.1324, + "step": 1343 + }, + { + "epoch": 0.23044773560237478, + "grad_norm": 1.6484375, + "learning_rate": 1.9748704716197474e-05, + "loss": 1.0518, + "step": 1344 + }, + { + "epoch": 0.23061919969136463, + "grad_norm": 1.609375, + "learning_rate": 1.9748302246188267e-05, + "loss": 1.0752, + "step": 1345 + }, + { + "epoch": 0.2307906637803545, + "grad_norm": 1.671875, + "learning_rate": 1.974789945825022e-05, + "loss": 1.0666, + "step": 1346 + }, + { + "epoch": 0.23096212786934436, + "grad_norm": 1.703125, + "learning_rate": 1.974749635239646e-05, + "loss": 1.1034, + "step": 1347 + }, + { + "epoch": 0.23113359195833422, + "grad_norm": 1.6875, + "learning_rate": 1.974709292864014e-05, + "loss": 1.0834, + "step": 1348 + }, + { + "epoch": 0.2313050560473241, + "grad_norm": 1.7109375, + "learning_rate": 1.9746689186994417e-05, + "loss": 1.1175, + "step": 1349 + }, + { + "epoch": 0.23147652013631395, + "grad_norm": 1.703125, + "learning_rate": 1.9746285127472458e-05, + "loss": 1.0594, + "step": 1350 + }, + { + "epoch": 0.2316479842253038, + "grad_norm": 1.671875, + "learning_rate": 1.9745880750087437e-05, + "loss": 1.083, + "step": 1351 + }, + { + "epoch": 0.23181944831429369, + "grad_norm": 1.625, + "learning_rate": 1.9745476054852543e-05, + "loss": 1.0639, + "step": 1352 + }, + { + "epoch": 0.23199091240328354, + "grad_norm": 1.65625, + "learning_rate": 1.9745071041780983e-05, + "loss": 1.0399, + "step": 1353 + }, + { + "epoch": 0.2321623764922734, + "grad_norm": 1.625, + "learning_rate": 1.9744665710885955e-05, + "loss": 1.1361, + "step": 1354 + }, + { + "epoch": 0.23233384058126327, + "grad_norm": 1.625, + "learning_rate": 1.9744260062180686e-05, + "loss": 0.987, + "step": 1355 + }, + { + "epoch": 0.23250530467025313, + "grad_norm": 1.640625, + "learning_rate": 1.9743854095678402e-05, + "loss": 1.2077, + "step": 1356 + }, + { + "epoch": 0.23267676875924298, + "grad_norm": 1.671875, + "learning_rate": 1.9743447811392342e-05, + "loss": 1.1149, + "step": 1357 + }, + { + "epoch": 0.23284823284823286, + "grad_norm": 1.703125, + "learning_rate": 1.974304120933576e-05, + "loss": 1.0483, + "step": 1358 + }, + { + "epoch": 0.23301969693722271, + "grad_norm": 1.625, + "learning_rate": 1.9742634289521915e-05, + "loss": 1.032, + "step": 1359 + }, + { + "epoch": 0.23319116102621257, + "grad_norm": 1.671875, + "learning_rate": 1.9742227051964078e-05, + "loss": 1.0213, + "step": 1360 + }, + { + "epoch": 0.23336262511520242, + "grad_norm": 1.8125, + "learning_rate": 1.9741819496675533e-05, + "loss": 1.0281, + "step": 1361 + }, + { + "epoch": 0.2335340892041923, + "grad_norm": 1.6015625, + "learning_rate": 1.974141162366957e-05, + "loss": 0.9483, + "step": 1362 + }, + { + "epoch": 0.23370555329318216, + "grad_norm": 1.765625, + "learning_rate": 1.9741003432959486e-05, + "loss": 1.1563, + "step": 1363 + }, + { + "epoch": 0.233877017382172, + "grad_norm": 1.6484375, + "learning_rate": 
1.9740594924558606e-05, + "loss": 1.1034, + "step": 1364 + }, + { + "epoch": 0.2340484814711619, + "grad_norm": 1.53125, + "learning_rate": 1.974018609848024e-05, + "loss": 0.9837, + "step": 1365 + }, + { + "epoch": 0.23421994556015174, + "grad_norm": 1.59375, + "learning_rate": 1.973977695473773e-05, + "loss": 1.0679, + "step": 1366 + }, + { + "epoch": 0.2343914096491416, + "grad_norm": 1.8125, + "learning_rate": 1.9739367493344415e-05, + "loss": 1.1194, + "step": 1367 + }, + { + "epoch": 0.23456287373813148, + "grad_norm": 1.640625, + "learning_rate": 1.9738957714313653e-05, + "loss": 1.0915, + "step": 1368 + }, + { + "epoch": 0.23473433782712133, + "grad_norm": 1.703125, + "learning_rate": 1.9738547617658806e-05, + "loss": 1.076, + "step": 1369 + }, + { + "epoch": 0.23490580191611118, + "grad_norm": 1.6640625, + "learning_rate": 1.973813720339325e-05, + "loss": 0.9973, + "step": 1370 + }, + { + "epoch": 0.23507726600510107, + "grad_norm": 1.671875, + "learning_rate": 1.973772647153037e-05, + "loss": 1.224, + "step": 1371 + }, + { + "epoch": 0.23524873009409092, + "grad_norm": 1.5859375, + "learning_rate": 1.9737315422083557e-05, + "loss": 1.1084, + "step": 1372 + }, + { + "epoch": 0.23542019418308077, + "grad_norm": 1.703125, + "learning_rate": 1.9736904055066227e-05, + "loss": 1.1346, + "step": 1373 + }, + { + "epoch": 0.23559165827207065, + "grad_norm": 1.7109375, + "learning_rate": 1.9736492370491786e-05, + "loss": 1.0141, + "step": 1374 + }, + { + "epoch": 0.2357631223610605, + "grad_norm": 1.6171875, + "learning_rate": 1.973608036837366e-05, + "loss": 1.0714, + "step": 1375 + }, + { + "epoch": 0.23593458645005036, + "grad_norm": 1.671875, + "learning_rate": 1.9735668048725296e-05, + "loss": 1.131, + "step": 1376 + }, + { + "epoch": 0.23610605053904024, + "grad_norm": 1.5546875, + "learning_rate": 1.9735255411560137e-05, + "loss": 1.0559, + "step": 1377 + }, + { + "epoch": 0.2362775146280301, + "grad_norm": 1.6484375, + "learning_rate": 1.973484245689164e-05, + "loss": 1.0569, + "step": 1378 + }, + { + "epoch": 0.23644897871701995, + "grad_norm": 1.671875, + "learning_rate": 1.973442918473327e-05, + "loss": 1.0454, + "step": 1379 + }, + { + "epoch": 0.23662044280600983, + "grad_norm": 1.71875, + "learning_rate": 1.9734015595098507e-05, + "loss": 0.9595, + "step": 1380 + }, + { + "epoch": 0.23679190689499968, + "grad_norm": 1.6328125, + "learning_rate": 1.9733601688000837e-05, + "loss": 1.1188, + "step": 1381 + }, + { + "epoch": 0.23696337098398954, + "grad_norm": 1.6015625, + "learning_rate": 1.973318746345377e-05, + "loss": 0.972, + "step": 1382 + }, + { + "epoch": 0.23713483507297942, + "grad_norm": 1.6484375, + "learning_rate": 1.9732772921470804e-05, + "loss": 0.9974, + "step": 1383 + }, + { + "epoch": 0.23730629916196927, + "grad_norm": 1.7265625, + "learning_rate": 1.9732358062065465e-05, + "loss": 1.0464, + "step": 1384 + }, + { + "epoch": 0.23747776325095912, + "grad_norm": 1.640625, + "learning_rate": 1.973194288525128e-05, + "loss": 1.1304, + "step": 1385 + }, + { + "epoch": 0.23764922733994898, + "grad_norm": 1.6328125, + "learning_rate": 1.9731527391041786e-05, + "loss": 1.1416, + "step": 1386 + }, + { + "epoch": 0.23782069142893886, + "grad_norm": 1.734375, + "learning_rate": 1.9731111579450545e-05, + "loss": 1.1684, + "step": 1387 + }, + { + "epoch": 0.2379921555179287, + "grad_norm": 1.640625, + "learning_rate": 1.9730695450491106e-05, + "loss": 1.0167, + "step": 1388 + }, + { + "epoch": 0.23816361960691856, + "grad_norm": 1.546875, + "learning_rate": 
1.973027900417705e-05, + "loss": 0.9737, + "step": 1389 + }, + { + "epoch": 0.23833508369590845, + "grad_norm": 1.6953125, + "learning_rate": 1.9729862240521953e-05, + "loss": 1.0621, + "step": 1390 + }, + { + "epoch": 0.2385065477848983, + "grad_norm": 1.78125, + "learning_rate": 1.9729445159539412e-05, + "loss": 1.0712, + "step": 1391 + }, + { + "epoch": 0.23867801187388815, + "grad_norm": 1.703125, + "learning_rate": 1.9729027761243022e-05, + "loss": 1.0961, + "step": 1392 + }, + { + "epoch": 0.23884947596287803, + "grad_norm": 1.546875, + "learning_rate": 1.9728610045646402e-05, + "loss": 1.0431, + "step": 1393 + }, + { + "epoch": 0.2390209400518679, + "grad_norm": 1.5625, + "learning_rate": 1.9728192012763174e-05, + "loss": 1.0822, + "step": 1394 + }, + { + "epoch": 0.23919240414085774, + "grad_norm": 1.6875, + "learning_rate": 1.9727773662606973e-05, + "loss": 1.1067, + "step": 1395 + }, + { + "epoch": 0.23936386822984762, + "grad_norm": 1.7109375, + "learning_rate": 1.9727354995191442e-05, + "loss": 1.099, + "step": 1396 + }, + { + "epoch": 0.23953533231883747, + "grad_norm": 1.640625, + "learning_rate": 1.972693601053023e-05, + "loss": 1.1294, + "step": 1397 + }, + { + "epoch": 0.23970679640782733, + "grad_norm": 1.7578125, + "learning_rate": 1.9726516708637012e-05, + "loss": 1.1383, + "step": 1398 + }, + { + "epoch": 0.2398782604968172, + "grad_norm": 1.578125, + "learning_rate": 1.9726097089525456e-05, + "loss": 1.0686, + "step": 1399 + }, + { + "epoch": 0.24004972458580706, + "grad_norm": 1.671875, + "learning_rate": 1.9725677153209246e-05, + "loss": 1.1246, + "step": 1400 + }, + { + "epoch": 0.24004972458580706, + "eval_loss": 0.9177221655845642, + "eval_runtime": 837.168, + "eval_samples_per_second": 2.985, + "eval_steps_per_second": 2.985, + "step": 1400 + }, + { + "epoch": 0.24022118867479691, + "grad_norm": 1.7578125, + "learning_rate": 1.9725256899702085e-05, + "loss": 1.0207, + "step": 1401 + }, + { + "epoch": 0.2403926527637868, + "grad_norm": 1.6015625, + "learning_rate": 1.9724836329017673e-05, + "loss": 1.0681, + "step": 1402 + }, + { + "epoch": 0.24056411685277665, + "grad_norm": 1.6875, + "learning_rate": 1.9724415441169723e-05, + "loss": 1.0937, + "step": 1403 + }, + { + "epoch": 0.2407355809417665, + "grad_norm": 1.5625, + "learning_rate": 1.9723994236171972e-05, + "loss": 1.1032, + "step": 1404 + }, + { + "epoch": 0.24090704503075638, + "grad_norm": 1.6484375, + "learning_rate": 1.972357271403815e-05, + "loss": 1.0918, + "step": 1405 + }, + { + "epoch": 0.24107850911974624, + "grad_norm": 1.8828125, + "learning_rate": 1.972315087478201e-05, + "loss": 1.1631, + "step": 1406 + }, + { + "epoch": 0.2412499732087361, + "grad_norm": 1.6484375, + "learning_rate": 1.97227287184173e-05, + "loss": 1.0473, + "step": 1407 + }, + { + "epoch": 0.24142143729772594, + "grad_norm": 1.625, + "learning_rate": 1.9722306244957797e-05, + "loss": 1.0959, + "step": 1408 + }, + { + "epoch": 0.24159290138671582, + "grad_norm": 1.578125, + "learning_rate": 1.9721883454417278e-05, + "loss": 0.9669, + "step": 1409 + }, + { + "epoch": 0.24176436547570568, + "grad_norm": 1.5546875, + "learning_rate": 1.972146034680953e-05, + "loss": 1.083, + "step": 1410 + }, + { + "epoch": 0.24193582956469553, + "grad_norm": 1.609375, + "learning_rate": 1.9721036922148352e-05, + "loss": 1.0486, + "step": 1411 + }, + { + "epoch": 0.2421072936536854, + "grad_norm": 1.625, + "learning_rate": 1.9720613180447553e-05, + "loss": 1.0244, + "step": 1412 + }, + { + "epoch": 0.24227875774267527, + "grad_norm": 
1.8203125, + "learning_rate": 1.9720189121720953e-05, + "loss": 1.1736, + "step": 1413 + }, + { + "epoch": 0.24245022183166512, + "grad_norm": 1.765625, + "learning_rate": 1.9719764745982383e-05, + "loss": 1.0677, + "step": 1414 + }, + { + "epoch": 0.242621685920655, + "grad_norm": 1.65625, + "learning_rate": 1.9719340053245682e-05, + "loss": 1.0646, + "step": 1415 + }, + { + "epoch": 0.24279315000964485, + "grad_norm": 1.6796875, + "learning_rate": 1.9718915043524703e-05, + "loss": 1.0764, + "step": 1416 + }, + { + "epoch": 0.2429646140986347, + "grad_norm": 1.6328125, + "learning_rate": 1.9718489716833308e-05, + "loss": 1.0921, + "step": 1417 + }, + { + "epoch": 0.2431360781876246, + "grad_norm": 1.59375, + "learning_rate": 1.971806407318537e-05, + "loss": 1.0519, + "step": 1418 + }, + { + "epoch": 0.24330754227661444, + "grad_norm": 1.703125, + "learning_rate": 1.9717638112594765e-05, + "loss": 1.0683, + "step": 1419 + }, + { + "epoch": 0.2434790063656043, + "grad_norm": 1.640625, + "learning_rate": 1.9717211835075388e-05, + "loss": 1.0414, + "step": 1420 + }, + { + "epoch": 0.24365047045459418, + "grad_norm": 1.734375, + "learning_rate": 1.971678524064114e-05, + "loss": 1.0836, + "step": 1421 + }, + { + "epoch": 0.24382193454358403, + "grad_norm": 1.71875, + "learning_rate": 1.9716358329305937e-05, + "loss": 1.1452, + "step": 1422 + }, + { + "epoch": 0.24399339863257388, + "grad_norm": 1.6015625, + "learning_rate": 1.9715931101083696e-05, + "loss": 1.0282, + "step": 1423 + }, + { + "epoch": 0.24416486272156376, + "grad_norm": 1.7421875, + "learning_rate": 1.971550355598836e-05, + "loss": 1.0765, + "step": 1424 + }, + { + "epoch": 0.24433632681055362, + "grad_norm": 1.7109375, + "learning_rate": 1.9715075694033866e-05, + "loss": 1.0627, + "step": 1425 + }, + { + "epoch": 0.24450779089954347, + "grad_norm": 1.671875, + "learning_rate": 1.971464751523417e-05, + "loss": 1.0239, + "step": 1426 + }, + { + "epoch": 0.24467925498853335, + "grad_norm": 1.6953125, + "learning_rate": 1.9714219019603236e-05, + "loss": 1.1714, + "step": 1427 + }, + { + "epoch": 0.2448507190775232, + "grad_norm": 1.5703125, + "learning_rate": 1.971379020715504e-05, + "loss": 1.015, + "step": 1428 + }, + { + "epoch": 0.24502218316651306, + "grad_norm": 1.75, + "learning_rate": 1.971336107790357e-05, + "loss": 1.0271, + "step": 1429 + }, + { + "epoch": 0.24519364725550294, + "grad_norm": 1.59375, + "learning_rate": 1.971293163186281e-05, + "loss": 1.103, + "step": 1430 + }, + { + "epoch": 0.2453651113444928, + "grad_norm": 1.71875, + "learning_rate": 1.9712501869046782e-05, + "loss": 1.0114, + "step": 1431 + }, + { + "epoch": 0.24553657543348265, + "grad_norm": 1.8125, + "learning_rate": 1.971207178946949e-05, + "loss": 1.1274, + "step": 1432 + }, + { + "epoch": 0.2457080395224725, + "grad_norm": 1.671875, + "learning_rate": 1.9711641393144967e-05, + "loss": 1.0402, + "step": 1433 + }, + { + "epoch": 0.24587950361146238, + "grad_norm": 1.59375, + "learning_rate": 1.971121068008725e-05, + "loss": 1.1112, + "step": 1434 + }, + { + "epoch": 0.24605096770045223, + "grad_norm": 1.7265625, + "learning_rate": 1.9710779650310376e-05, + "loss": 1.1078, + "step": 1435 + }, + { + "epoch": 0.2462224317894421, + "grad_norm": 1.6171875, + "learning_rate": 1.9710348303828412e-05, + "loss": 1.0754, + "step": 1436 + }, + { + "epoch": 0.24639389587843197, + "grad_norm": 1.671875, + "learning_rate": 1.970991664065543e-05, + "loss": 0.9915, + "step": 1437 + }, + { + "epoch": 0.24656535996742182, + "grad_norm": 1.703125, + 
"learning_rate": 1.9709484660805498e-05, + "loss": 1.0486, + "step": 1438 + }, + { + "epoch": 0.24673682405641167, + "grad_norm": 1.6875, + "learning_rate": 1.9709052364292705e-05, + "loss": 1.1207, + "step": 1439 + }, + { + "epoch": 0.24690828814540156, + "grad_norm": 1.5390625, + "learning_rate": 1.970861975113116e-05, + "loss": 0.9616, + "step": 1440 + }, + { + "epoch": 0.2470797522343914, + "grad_norm": 1.5234375, + "learning_rate": 1.9708186821334964e-05, + "loss": 1.0366, + "step": 1441 + }, + { + "epoch": 0.24725121632338126, + "grad_norm": 1.65625, + "learning_rate": 1.9707753574918235e-05, + "loss": 1.0376, + "step": 1442 + }, + { + "epoch": 0.24742268041237114, + "grad_norm": 1.640625, + "learning_rate": 1.970732001189511e-05, + "loss": 1.0982, + "step": 1443 + }, + { + "epoch": 0.247594144501361, + "grad_norm": 1.734375, + "learning_rate": 1.9706886132279724e-05, + "loss": 1.1314, + "step": 1444 + }, + { + "epoch": 0.24776560859035085, + "grad_norm": 2.546875, + "learning_rate": 1.9706451936086228e-05, + "loss": 0.9716, + "step": 1445 + }, + { + "epoch": 0.24793707267934073, + "grad_norm": 1.578125, + "learning_rate": 1.9706017423328782e-05, + "loss": 0.9978, + "step": 1446 + }, + { + "epoch": 0.24810853676833058, + "grad_norm": 1.703125, + "learning_rate": 1.970558259402156e-05, + "loss": 1.0639, + "step": 1447 + }, + { + "epoch": 0.24828000085732044, + "grad_norm": 1.7890625, + "learning_rate": 1.970514744817874e-05, + "loss": 1.0706, + "step": 1448 + }, + { + "epoch": 0.24845146494631032, + "grad_norm": 1.703125, + "learning_rate": 1.9704711985814516e-05, + "loss": 1.1416, + "step": 1449 + }, + { + "epoch": 0.24862292903530017, + "grad_norm": 1.59375, + "learning_rate": 1.970427620694309e-05, + "loss": 1.155, + "step": 1450 + }, + { + "epoch": 0.24879439312429003, + "grad_norm": 1.5625, + "learning_rate": 1.9703840111578673e-05, + "loss": 1.0571, + "step": 1451 + }, + { + "epoch": 0.2489658572132799, + "grad_norm": 1.65625, + "learning_rate": 1.970340369973549e-05, + "loss": 1.0132, + "step": 1452 + }, + { + "epoch": 0.24913732130226976, + "grad_norm": 1.6328125, + "learning_rate": 1.9702966971427773e-05, + "loss": 1.0421, + "step": 1453 + }, + { + "epoch": 0.2493087853912596, + "grad_norm": 1.5625, + "learning_rate": 1.9702529926669768e-05, + "loss": 1.0326, + "step": 1454 + }, + { + "epoch": 0.2494802494802495, + "grad_norm": 1.6796875, + "learning_rate": 1.970209256547572e-05, + "loss": 1.0629, + "step": 1455 + }, + { + "epoch": 0.24965171356923935, + "grad_norm": 1.6875, + "learning_rate": 1.9701654887859904e-05, + "loss": 1.0527, + "step": 1456 + }, + { + "epoch": 0.2498231776582292, + "grad_norm": 1.6875, + "learning_rate": 1.9701216893836584e-05, + "loss": 1.0612, + "step": 1457 + }, + { + "epoch": 0.24999464174721905, + "grad_norm": 1.578125, + "learning_rate": 1.970077858342005e-05, + "loss": 0.9602, + "step": 1458 + }, + { + "epoch": 0.2501661058362089, + "grad_norm": 1.59375, + "learning_rate": 1.9700339956624603e-05, + "loss": 0.9985, + "step": 1459 + }, + { + "epoch": 0.2503375699251988, + "grad_norm": 1.5625, + "learning_rate": 1.9699901013464534e-05, + "loss": 0.9584, + "step": 1460 + }, + { + "epoch": 0.25050903401418867, + "grad_norm": 1.625, + "learning_rate": 1.969946175395417e-05, + "loss": 0.9766, + "step": 1461 + }, + { + "epoch": 0.2506804981031785, + "grad_norm": 1.6875, + "learning_rate": 1.9699022178107833e-05, + "loss": 1.0457, + "step": 1462 + }, + { + "epoch": 0.2508519621921684, + "grad_norm": 1.625, + "learning_rate": 1.969858228593986e-05, + 
"loss": 1.0036, + "step": 1463 + }, + { + "epoch": 0.25102342628115826, + "grad_norm": 1.6484375, + "learning_rate": 1.9698142077464597e-05, + "loss": 1.0776, + "step": 1464 + }, + { + "epoch": 0.2511948903701481, + "grad_norm": 1.6484375, + "learning_rate": 1.96977015526964e-05, + "loss": 1.1579, + "step": 1465 + }, + { + "epoch": 0.25136635445913796, + "grad_norm": 1.7109375, + "learning_rate": 1.969726071164964e-05, + "loss": 1.0759, + "step": 1466 + }, + { + "epoch": 0.25153781854812785, + "grad_norm": 1.6015625, + "learning_rate": 1.9696819554338693e-05, + "loss": 1.0797, + "step": 1467 + }, + { + "epoch": 0.25170928263711767, + "grad_norm": 1.5859375, + "learning_rate": 1.9696378080777937e-05, + "loss": 1.0332, + "step": 1468 + }, + { + "epoch": 0.25188074672610755, + "grad_norm": 1.6328125, + "learning_rate": 1.9695936290981788e-05, + "loss": 1.0922, + "step": 1469 + }, + { + "epoch": 0.25205221081509743, + "grad_norm": 1.703125, + "learning_rate": 1.969549418496464e-05, + "loss": 1.0826, + "step": 1470 + }, + { + "epoch": 0.25222367490408726, + "grad_norm": 1.7421875, + "learning_rate": 1.9695051762740917e-05, + "loss": 1.1504, + "step": 1471 + }, + { + "epoch": 0.25239513899307714, + "grad_norm": 1.6796875, + "learning_rate": 1.969460902432505e-05, + "loss": 1.048, + "step": 1472 + }, + { + "epoch": 0.252566603082067, + "grad_norm": 1.625, + "learning_rate": 1.9694165969731473e-05, + "loss": 1.0838, + "step": 1473 + }, + { + "epoch": 0.25273806717105685, + "grad_norm": 1.671875, + "learning_rate": 1.969372259897464e-05, + "loss": 1.0886, + "step": 1474 + }, + { + "epoch": 0.2529095312600467, + "grad_norm": 1.6796875, + "learning_rate": 1.969327891206901e-05, + "loss": 1.0252, + "step": 1475 + }, + { + "epoch": 0.2530809953490366, + "grad_norm": 1.6796875, + "learning_rate": 1.9692834909029056e-05, + "loss": 1.0893, + "step": 1476 + }, + { + "epoch": 0.25325245943802643, + "grad_norm": 1.8671875, + "learning_rate": 1.9692390589869256e-05, + "loss": 1.0419, + "step": 1477 + }, + { + "epoch": 0.2534239235270163, + "grad_norm": 1.6796875, + "learning_rate": 1.9691945954604095e-05, + "loss": 1.0386, + "step": 1478 + }, + { + "epoch": 0.2535953876160062, + "grad_norm": 1.6171875, + "learning_rate": 1.9691501003248086e-05, + "loss": 0.976, + "step": 1479 + }, + { + "epoch": 0.253766851704996, + "grad_norm": 1.7109375, + "learning_rate": 1.969105573581573e-05, + "loss": 1.0139, + "step": 1480 + }, + { + "epoch": 0.2539383157939859, + "grad_norm": 1.6640625, + "learning_rate": 1.9690610152321557e-05, + "loss": 0.9874, + "step": 1481 + }, + { + "epoch": 0.2541097798829758, + "grad_norm": 1.6640625, + "learning_rate": 1.969016425278009e-05, + "loss": 1.0118, + "step": 1482 + }, + { + "epoch": 0.2542812439719656, + "grad_norm": 1.640625, + "learning_rate": 1.9689718037205883e-05, + "loss": 1.0416, + "step": 1483 + }, + { + "epoch": 0.2544527080609555, + "grad_norm": 1.65625, + "learning_rate": 1.9689271505613477e-05, + "loss": 1.0712, + "step": 1484 + }, + { + "epoch": 0.25462417214994537, + "grad_norm": 1.65625, + "learning_rate": 1.9688824658017446e-05, + "loss": 1.0663, + "step": 1485 + }, + { + "epoch": 0.2547956362389352, + "grad_norm": 1.734375, + "learning_rate": 1.968837749443236e-05, + "loss": 1.1637, + "step": 1486 + }, + { + "epoch": 0.2549671003279251, + "grad_norm": 1.6640625, + "learning_rate": 1.9687930014872794e-05, + "loss": 1.0739, + "step": 1487 + }, + { + "epoch": 0.25513856441691496, + "grad_norm": 1.765625, + "learning_rate": 1.9687482219353352e-05, + "loss": 1.0067, + 
"step": 1488 + }, + { + "epoch": 0.2553100285059048, + "grad_norm": 1.578125, + "learning_rate": 1.9687034107888636e-05, + "loss": 1.0853, + "step": 1489 + }, + { + "epoch": 0.25548149259489467, + "grad_norm": 1.6484375, + "learning_rate": 1.9686585680493257e-05, + "loss": 1.1097, + "step": 1490 + }, + { + "epoch": 0.2556529566838845, + "grad_norm": 1.609375, + "learning_rate": 1.9686136937181847e-05, + "loss": 1.0814, + "step": 1491 + }, + { + "epoch": 0.2558244207728744, + "grad_norm": 1.6171875, + "learning_rate": 1.9685687877969035e-05, + "loss": 1.0519, + "step": 1492 + }, + { + "epoch": 0.25599588486186425, + "grad_norm": 1.609375, + "learning_rate": 1.9685238502869468e-05, + "loss": 1.1121, + "step": 1493 + }, + { + "epoch": 0.2561673489508541, + "grad_norm": 1.609375, + "learning_rate": 1.96847888118978e-05, + "loss": 1.0927, + "step": 1494 + }, + { + "epoch": 0.25633881303984396, + "grad_norm": 1.6796875, + "learning_rate": 1.96843388050687e-05, + "loss": 1.0372, + "step": 1495 + }, + { + "epoch": 0.25651027712883384, + "grad_norm": 1.6328125, + "learning_rate": 1.9683888482396844e-05, + "loss": 1.0362, + "step": 1496 + }, + { + "epoch": 0.25668174121782367, + "grad_norm": 1.578125, + "learning_rate": 1.968343784389692e-05, + "loss": 1.0311, + "step": 1497 + }, + { + "epoch": 0.25685320530681355, + "grad_norm": 1.5859375, + "learning_rate": 1.9682986889583623e-05, + "loss": 1.1184, + "step": 1498 + }, + { + "epoch": 0.25702466939580343, + "grad_norm": 1.71875, + "learning_rate": 1.968253561947166e-05, + "loss": 1.1175, + "step": 1499 + }, + { + "epoch": 0.25719613348479325, + "grad_norm": 1.6640625, + "learning_rate": 1.968208403357575e-05, + "loss": 1.0388, + "step": 1500 + }, + { + "epoch": 0.25736759757378314, + "grad_norm": 1.5, + "learning_rate": 1.968163213191062e-05, + "loss": 0.9964, + "step": 1501 + }, + { + "epoch": 0.257539061662773, + "grad_norm": 1.65625, + "learning_rate": 1.968117991449101e-05, + "loss": 1.1443, + "step": 1502 + }, + { + "epoch": 0.25771052575176284, + "grad_norm": 1.671875, + "learning_rate": 1.9680727381331665e-05, + "loss": 1.04, + "step": 1503 + }, + { + "epoch": 0.2578819898407527, + "grad_norm": 1.6328125, + "learning_rate": 1.9680274532447344e-05, + "loss": 1.0192, + "step": 1504 + }, + { + "epoch": 0.2580534539297426, + "grad_norm": 1.609375, + "learning_rate": 1.9679821367852824e-05, + "loss": 1.1258, + "step": 1505 + }, + { + "epoch": 0.25822491801873243, + "grad_norm": 1.5859375, + "learning_rate": 1.9679367887562874e-05, + "loss": 1.0358, + "step": 1506 + }, + { + "epoch": 0.2583963821077223, + "grad_norm": 1.7109375, + "learning_rate": 1.967891409159229e-05, + "loss": 1.1577, + "step": 1507 + }, + { + "epoch": 0.2585678461967122, + "grad_norm": 1.65625, + "learning_rate": 1.967845997995587e-05, + "loss": 0.9771, + "step": 1508 + }, + { + "epoch": 0.258739310285702, + "grad_norm": 1.8984375, + "learning_rate": 1.9678005552668423e-05, + "loss": 1.1342, + "step": 1509 + }, + { + "epoch": 0.2589107743746919, + "grad_norm": 1.671875, + "learning_rate": 1.967755080974477e-05, + "loss": 1.0732, + "step": 1510 + }, + { + "epoch": 0.2590822384636818, + "grad_norm": 1.734375, + "learning_rate": 1.9677095751199746e-05, + "loss": 1.1263, + "step": 1511 + }, + { + "epoch": 0.2592537025526716, + "grad_norm": 1.7109375, + "learning_rate": 1.9676640377048185e-05, + "loss": 1.0274, + "step": 1512 + }, + { + "epoch": 0.2594251666416615, + "grad_norm": 1.6328125, + "learning_rate": 1.9676184687304945e-05, + "loss": 0.998, + "step": 1513 + }, + { + 
"epoch": 0.25959663073065137, + "grad_norm": 1.578125, + "learning_rate": 1.9675728681984886e-05, + "loss": 1.0682, + "step": 1514 + }, + { + "epoch": 0.2597680948196412, + "grad_norm": 1.703125, + "learning_rate": 1.9675272361102876e-05, + "loss": 1.1237, + "step": 1515 + }, + { + "epoch": 0.2599395589086311, + "grad_norm": 1.7578125, + "learning_rate": 1.9674815724673804e-05, + "loss": 1.0046, + "step": 1516 + }, + { + "epoch": 0.26011102299762096, + "grad_norm": 1.6875, + "learning_rate": 1.967435877271256e-05, + "loss": 1.142, + "step": 1517 + }, + { + "epoch": 0.2602824870866108, + "grad_norm": 1.5859375, + "learning_rate": 1.9673901505234042e-05, + "loss": 0.9946, + "step": 1518 + }, + { + "epoch": 0.26045395117560066, + "grad_norm": 1.671875, + "learning_rate": 1.967344392225317e-05, + "loss": 1.0523, + "step": 1519 + }, + { + "epoch": 0.26062541526459054, + "grad_norm": 1.765625, + "learning_rate": 1.9672986023784863e-05, + "loss": 1.1062, + "step": 1520 + }, + { + "epoch": 0.26079687935358037, + "grad_norm": 1.59375, + "learning_rate": 1.967252780984406e-05, + "loss": 1.051, + "step": 1521 + }, + { + "epoch": 0.26096834344257025, + "grad_norm": 1.6953125, + "learning_rate": 1.9672069280445696e-05, + "loss": 1.0351, + "step": 1522 + }, + { + "epoch": 0.26113980753156013, + "grad_norm": 1.7578125, + "learning_rate": 1.967161043560474e-05, + "loss": 1.0894, + "step": 1523 + }, + { + "epoch": 0.26131127162054996, + "grad_norm": 1.6484375, + "learning_rate": 1.967115127533614e-05, + "loss": 1.0919, + "step": 1524 + }, + { + "epoch": 0.26148273570953984, + "grad_norm": 1.7421875, + "learning_rate": 1.967069179965488e-05, + "loss": 1.0772, + "step": 1525 + }, + { + "epoch": 0.2616541997985297, + "grad_norm": 1.609375, + "learning_rate": 1.9670232008575945e-05, + "loss": 0.9477, + "step": 1526 + }, + { + "epoch": 0.26182566388751954, + "grad_norm": 1.6484375, + "learning_rate": 1.966977190211433e-05, + "loss": 1.0974, + "step": 1527 + }, + { + "epoch": 0.2619971279765094, + "grad_norm": 1.625, + "learning_rate": 1.966931148028504e-05, + "loss": 1.0565, + "step": 1528 + }, + { + "epoch": 0.2621685920654993, + "grad_norm": 1.671875, + "learning_rate": 1.9668850743103093e-05, + "loss": 1.0793, + "step": 1529 + }, + { + "epoch": 0.26234005615448913, + "grad_norm": 1.8359375, + "learning_rate": 1.9668389690583512e-05, + "loss": 1.0805, + "step": 1530 + }, + { + "epoch": 0.262511520243479, + "grad_norm": 1.546875, + "learning_rate": 1.9667928322741337e-05, + "loss": 1.0825, + "step": 1531 + }, + { + "epoch": 0.2626829843324689, + "grad_norm": 1.59375, + "learning_rate": 1.9667466639591612e-05, + "loss": 1.052, + "step": 1532 + }, + { + "epoch": 0.2628544484214587, + "grad_norm": 1.6328125, + "learning_rate": 1.9667004641149393e-05, + "loss": 1.095, + "step": 1533 + }, + { + "epoch": 0.2630259125104486, + "grad_norm": 1.671875, + "learning_rate": 1.9666542327429754e-05, + "loss": 0.9686, + "step": 1534 + }, + { + "epoch": 0.2631973765994385, + "grad_norm": 1.6171875, + "learning_rate": 1.966607969844777e-05, + "loss": 1.0208, + "step": 1535 + }, + { + "epoch": 0.2633688406884283, + "grad_norm": 1.671875, + "learning_rate": 1.9665616754218523e-05, + "loss": 1.0695, + "step": 1536 + }, + { + "epoch": 0.2635403047774182, + "grad_norm": 1.5625, + "learning_rate": 1.966515349475712e-05, + "loss": 1.0443, + "step": 1537 + }, + { + "epoch": 0.263711768866408, + "grad_norm": 1.59375, + "learning_rate": 1.9664689920078665e-05, + "loss": 1.0577, + "step": 1538 + }, + { + "epoch": 0.2638832329553979, + 
"grad_norm": 1.640625, + "learning_rate": 1.9664226030198278e-05, + "loss": 1.0162, + "step": 1539 + }, + { + "epoch": 0.2640546970443878, + "grad_norm": 1.6875, + "learning_rate": 1.9663761825131087e-05, + "loss": 1.0553, + "step": 1540 + }, + { + "epoch": 0.2642261611333776, + "grad_norm": 1.6171875, + "learning_rate": 1.966329730489223e-05, + "loss": 1.0822, + "step": 1541 + }, + { + "epoch": 0.2643976252223675, + "grad_norm": 1.6328125, + "learning_rate": 1.9662832469496863e-05, + "loss": 0.9951, + "step": 1542 + }, + { + "epoch": 0.26456908931135736, + "grad_norm": 1.609375, + "learning_rate": 1.9662367318960143e-05, + "loss": 1.0579, + "step": 1543 + }, + { + "epoch": 0.2647405534003472, + "grad_norm": 1.6015625, + "learning_rate": 1.966190185329724e-05, + "loss": 1.0021, + "step": 1544 + }, + { + "epoch": 0.26491201748933707, + "grad_norm": 1.75, + "learning_rate": 1.9661436072523333e-05, + "loss": 1.0497, + "step": 1545 + }, + { + "epoch": 0.26508348157832695, + "grad_norm": 1.640625, + "learning_rate": 1.966096997665361e-05, + "loss": 1.0591, + "step": 1546 + }, + { + "epoch": 0.2652549456673168, + "grad_norm": 1.796875, + "learning_rate": 1.966050356570328e-05, + "loss": 1.128, + "step": 1547 + }, + { + "epoch": 0.26542640975630666, + "grad_norm": 1.96875, + "learning_rate": 1.9660036839687552e-05, + "loss": 0.9565, + "step": 1548 + }, + { + "epoch": 0.26559787384529654, + "grad_norm": 1.65625, + "learning_rate": 1.9659569798621642e-05, + "loss": 0.9964, + "step": 1549 + }, + { + "epoch": 0.26576933793428636, + "grad_norm": 14.0625, + "learning_rate": 1.965910244252079e-05, + "loss": 1.0035, + "step": 1550 + }, + { + "epoch": 0.26594080202327625, + "grad_norm": 1.921875, + "learning_rate": 1.9658634771400235e-05, + "loss": 1.0745, + "step": 1551 + }, + { + "epoch": 0.2661122661122661, + "grad_norm": 1.8984375, + "learning_rate": 1.9658166785275227e-05, + "loss": 1.1087, + "step": 1552 + }, + { + "epoch": 0.26628373020125595, + "grad_norm": 1.5859375, + "learning_rate": 1.9657698484161032e-05, + "loss": 0.9657, + "step": 1553 + }, + { + "epoch": 0.26645519429024583, + "grad_norm": 1.6484375, + "learning_rate": 1.965722986807292e-05, + "loss": 1.0533, + "step": 1554 + }, + { + "epoch": 0.2666266583792357, + "grad_norm": 1.75, + "learning_rate": 1.965676093702618e-05, + "loss": 1.0809, + "step": 1555 + }, + { + "epoch": 0.26679812246822554, + "grad_norm": 1.734375, + "learning_rate": 1.9656291691036098e-05, + "loss": 1.0149, + "step": 1556 + }, + { + "epoch": 0.2669695865572154, + "grad_norm": 1.640625, + "learning_rate": 1.9655822130117985e-05, + "loss": 1.1726, + "step": 1557 + }, + { + "epoch": 0.2671410506462053, + "grad_norm": 1.703125, + "learning_rate": 1.965535225428715e-05, + "loss": 1.0171, + "step": 1558 + }, + { + "epoch": 0.26731251473519513, + "grad_norm": 1.6015625, + "learning_rate": 1.9654882063558918e-05, + "loss": 1.0369, + "step": 1559 + }, + { + "epoch": 0.267483978824185, + "grad_norm": 1.625, + "learning_rate": 1.965441155794863e-05, + "loss": 1.0901, + "step": 1560 + }, + { + "epoch": 0.2676554429131749, + "grad_norm": 1.578125, + "learning_rate": 1.965394073747162e-05, + "loss": 1.0468, + "step": 1561 + }, + { + "epoch": 0.2678269070021647, + "grad_norm": 1.625, + "learning_rate": 1.965346960214325e-05, + "loss": 1.0866, + "step": 1562 + }, + { + "epoch": 0.2679983710911546, + "grad_norm": 1.6640625, + "learning_rate": 1.9652998151978887e-05, + "loss": 1.0789, + "step": 1563 + }, + { + "epoch": 0.2681698351801445, + "grad_norm": 1.703125, + "learning_rate": 
1.9652526386993903e-05, + "loss": 1.0895, + "step": 1564 + }, + { + "epoch": 0.2683412992691343, + "grad_norm": 1.6640625, + "learning_rate": 1.9652054307203687e-05, + "loss": 1.0682, + "step": 1565 + }, + { + "epoch": 0.2685127633581242, + "grad_norm": 1.6796875, + "learning_rate": 1.9651581912623633e-05, + "loss": 1.0731, + "step": 1566 + }, + { + "epoch": 0.26868422744711407, + "grad_norm": 1.796875, + "learning_rate": 1.9651109203269147e-05, + "loss": 1.0308, + "step": 1567 + }, + { + "epoch": 0.2688556915361039, + "grad_norm": 1.6171875, + "learning_rate": 1.965063617915565e-05, + "loss": 0.9955, + "step": 1568 + }, + { + "epoch": 0.2690271556250938, + "grad_norm": 1.625, + "learning_rate": 1.9650162840298564e-05, + "loss": 1.0101, + "step": 1569 + }, + { + "epoch": 0.26919861971408365, + "grad_norm": 1.7890625, + "learning_rate": 1.964968918671333e-05, + "loss": 0.9961, + "step": 1570 + }, + { + "epoch": 0.2693700838030735, + "grad_norm": 1.6875, + "learning_rate": 1.9649215218415393e-05, + "loss": 1.0691, + "step": 1571 + }, + { + "epoch": 0.26954154789206336, + "grad_norm": 1.6171875, + "learning_rate": 1.9648740935420212e-05, + "loss": 1.0502, + "step": 1572 + }, + { + "epoch": 0.26971301198105324, + "grad_norm": 1.6015625, + "learning_rate": 1.9648266337743254e-05, + "loss": 1.0375, + "step": 1573 + }, + { + "epoch": 0.26988447607004307, + "grad_norm": 1.5625, + "learning_rate": 1.96477914254e-05, + "loss": 1.0687, + "step": 1574 + }, + { + "epoch": 0.27005594015903295, + "grad_norm": 1.7421875, + "learning_rate": 1.9647316198405943e-05, + "loss": 1.1145, + "step": 1575 + }, + { + "epoch": 0.27022740424802283, + "grad_norm": 1.59375, + "learning_rate": 1.964684065677657e-05, + "loss": 0.9817, + "step": 1576 + }, + { + "epoch": 0.27039886833701265, + "grad_norm": 1.7890625, + "learning_rate": 1.9646364800527396e-05, + "loss": 1.0822, + "step": 1577 + }, + { + "epoch": 0.27057033242600254, + "grad_norm": 1.6328125, + "learning_rate": 1.9645888629673944e-05, + "loss": 1.0437, + "step": 1578 + }, + { + "epoch": 0.2707417965149924, + "grad_norm": 1.6171875, + "learning_rate": 1.964541214423174e-05, + "loss": 1.09, + "step": 1579 + }, + { + "epoch": 0.27091326060398224, + "grad_norm": 1.75, + "learning_rate": 1.9644935344216325e-05, + "loss": 1.0711, + "step": 1580 + }, + { + "epoch": 0.2710847246929721, + "grad_norm": 1.7265625, + "learning_rate": 1.964445822964325e-05, + "loss": 1.0824, + "step": 1581 + }, + { + "epoch": 0.271256188781962, + "grad_norm": 1.75, + "learning_rate": 1.964398080052807e-05, + "loss": 1.0594, + "step": 1582 + }, + { + "epoch": 0.27142765287095183, + "grad_norm": 1.6796875, + "learning_rate": 1.9643503056886364e-05, + "loss": 1.1174, + "step": 1583 + }, + { + "epoch": 0.2715991169599417, + "grad_norm": 1.6015625, + "learning_rate": 1.964302499873371e-05, + "loss": 1.0093, + "step": 1584 + }, + { + "epoch": 0.2717705810489316, + "grad_norm": 1.6953125, + "learning_rate": 1.9642546626085693e-05, + "loss": 1.0025, + "step": 1585 + }, + { + "epoch": 0.2719420451379214, + "grad_norm": 1.6171875, + "learning_rate": 1.9642067938957926e-05, + "loss": 1.1346, + "step": 1586 + }, + { + "epoch": 0.2721135092269113, + "grad_norm": 1.640625, + "learning_rate": 1.964158893736601e-05, + "loss": 1.0351, + "step": 1587 + }, + { + "epoch": 0.2722849733159011, + "grad_norm": 1.6171875, + "learning_rate": 1.9641109621325577e-05, + "loss": 1.1297, + "step": 1588 + }, + { + "epoch": 0.272456437404891, + "grad_norm": 1.6640625, + "learning_rate": 1.9640629990852253e-05, + "loss": 
1.028, + "step": 1589 + }, + { + "epoch": 0.2726279014938809, + "grad_norm": 1.59375, + "learning_rate": 1.964015004596168e-05, + "loss": 1.0212, + "step": 1590 + }, + { + "epoch": 0.2727993655828707, + "grad_norm": 1.7421875, + "learning_rate": 1.9639669786669513e-05, + "loss": 1.1078, + "step": 1591 + }, + { + "epoch": 0.2729708296718606, + "grad_norm": 1.6640625, + "learning_rate": 1.9639189212991415e-05, + "loss": 1.0638, + "step": 1592 + }, + { + "epoch": 0.2731422937608505, + "grad_norm": 1.6953125, + "learning_rate": 1.9638708324943056e-05, + "loss": 1.0655, + "step": 1593 + }, + { + "epoch": 0.2733137578498403, + "grad_norm": 1.6484375, + "learning_rate": 1.963822712254013e-05, + "loss": 1.0881, + "step": 1594 + }, + { + "epoch": 0.2734852219388302, + "grad_norm": 1.65625, + "learning_rate": 1.9637745605798316e-05, + "loss": 1.2106, + "step": 1595 + }, + { + "epoch": 0.27365668602782006, + "grad_norm": 1.5703125, + "learning_rate": 1.9637263774733326e-05, + "loss": 1.0209, + "step": 1596 + }, + { + "epoch": 0.2738281501168099, + "grad_norm": 1.7109375, + "learning_rate": 1.963678162936088e-05, + "loss": 1.1191, + "step": 1597 + }, + { + "epoch": 0.27399961420579977, + "grad_norm": 1.7578125, + "learning_rate": 1.963629916969669e-05, + "loss": 1.0532, + "step": 1598 + }, + { + "epoch": 0.27417107829478965, + "grad_norm": 1.5625, + "learning_rate": 1.96358163957565e-05, + "loss": 1.0672, + "step": 1599 + }, + { + "epoch": 0.2743425423837795, + "grad_norm": 1.703125, + "learning_rate": 1.963533330755605e-05, + "loss": 1.0651, + "step": 1600 + }, + { + "epoch": 0.27451400647276936, + "grad_norm": 1.6328125, + "learning_rate": 1.96348499051111e-05, + "loss": 1.0525, + "step": 1601 + }, + { + "epoch": 0.27468547056175924, + "grad_norm": 1.6796875, + "learning_rate": 1.963436618843741e-05, + "loss": 0.9884, + "step": 1602 + }, + { + "epoch": 0.27485693465074906, + "grad_norm": 1.6796875, + "learning_rate": 1.963388215755076e-05, + "loss": 1.1011, + "step": 1603 + }, + { + "epoch": 0.27502839873973894, + "grad_norm": 1.6953125, + "learning_rate": 1.9633397812466938e-05, + "loss": 1.076, + "step": 1604 + }, + { + "epoch": 0.2751998628287288, + "grad_norm": 1.6875, + "learning_rate": 1.9632913153201733e-05, + "loss": 1.0313, + "step": 1605 + }, + { + "epoch": 0.27537132691771865, + "grad_norm": 1.6328125, + "learning_rate": 1.9632428179770958e-05, + "loss": 1.0222, + "step": 1606 + }, + { + "epoch": 0.27554279100670853, + "grad_norm": 1.90625, + "learning_rate": 1.9631942892190428e-05, + "loss": 1.0423, + "step": 1607 + }, + { + "epoch": 0.2757142550956984, + "grad_norm": 1.6484375, + "learning_rate": 1.963145729047597e-05, + "loss": 1.1188, + "step": 1608 + }, + { + "epoch": 0.27588571918468824, + "grad_norm": 1.59375, + "learning_rate": 1.963097137464342e-05, + "loss": 0.9623, + "step": 1609 + }, + { + "epoch": 0.2760571832736781, + "grad_norm": 1.6875, + "learning_rate": 1.9630485144708627e-05, + "loss": 1.0379, + "step": 1610 + }, + { + "epoch": 0.276228647362668, + "grad_norm": 1.6796875, + "learning_rate": 1.962999860068745e-05, + "loss": 1.1151, + "step": 1611 + }, + { + "epoch": 0.2764001114516578, + "grad_norm": 1.6328125, + "learning_rate": 1.9629511742595752e-05, + "loss": 1.0464, + "step": 1612 + }, + { + "epoch": 0.2765715755406477, + "grad_norm": 1.6953125, + "learning_rate": 1.962902457044942e-05, + "loss": 1.1254, + "step": 1613 + }, + { + "epoch": 0.2767430396296376, + "grad_norm": 1.8203125, + "learning_rate": 1.9628537084264333e-05, + "loss": 1.1636, + "step": 1614 + }, + 
{ + "epoch": 0.2769145037186274, + "grad_norm": 1.6875, + "learning_rate": 1.9628049284056395e-05, + "loss": 1.053, + "step": 1615 + }, + { + "epoch": 0.2770859678076173, + "grad_norm": 1.5859375, + "learning_rate": 1.9627561169841512e-05, + "loss": 1.0359, + "step": 1616 + }, + { + "epoch": 0.2772574318966072, + "grad_norm": 1.6796875, + "learning_rate": 1.9627072741635608e-05, + "loss": 0.9806, + "step": 1617 + }, + { + "epoch": 0.277428895985597, + "grad_norm": 1.515625, + "learning_rate": 1.9626583999454608e-05, + "loss": 1.066, + "step": 1618 + }, + { + "epoch": 0.2776003600745869, + "grad_norm": 1.6953125, + "learning_rate": 1.9626094943314452e-05, + "loss": 1.0277, + "step": 1619 + }, + { + "epoch": 0.27777182416357676, + "grad_norm": 1.6484375, + "learning_rate": 1.9625605573231093e-05, + "loss": 1.1298, + "step": 1620 + }, + { + "epoch": 0.2779432882525666, + "grad_norm": 1.671875, + "learning_rate": 1.962511588922049e-05, + "loss": 1.0838, + "step": 1621 + }, + { + "epoch": 0.27811475234155647, + "grad_norm": 1.578125, + "learning_rate": 1.9624625891298615e-05, + "loss": 1.0209, + "step": 1622 + }, + { + "epoch": 0.27828621643054635, + "grad_norm": 1.7421875, + "learning_rate": 1.9624135579481446e-05, + "loss": 1.09, + "step": 1623 + }, + { + "epoch": 0.2784576805195362, + "grad_norm": 1.609375, + "learning_rate": 1.9623644953784974e-05, + "loss": 1.0581, + "step": 1624 + }, + { + "epoch": 0.27862914460852606, + "grad_norm": 1.515625, + "learning_rate": 1.96231540142252e-05, + "loss": 1.0417, + "step": 1625 + }, + { + "epoch": 0.27880060869751594, + "grad_norm": 1.5703125, + "learning_rate": 1.9622662760818136e-05, + "loss": 1.0268, + "step": 1626 + }, + { + "epoch": 0.27897207278650576, + "grad_norm": 1.578125, + "learning_rate": 1.9622171193579806e-05, + "loss": 1.0325, + "step": 1627 + }, + { + "epoch": 0.27914353687549565, + "grad_norm": 1.6328125, + "learning_rate": 1.9621679312526234e-05, + "loss": 1.0768, + "step": 1628 + }, + { + "epoch": 0.2793150009644855, + "grad_norm": 1.6640625, + "learning_rate": 1.962118711767347e-05, + "loss": 0.9856, + "step": 1629 + }, + { + "epoch": 0.27948646505347535, + "grad_norm": 1.65625, + "learning_rate": 1.9620694609037567e-05, + "loss": 1.0344, + "step": 1630 + }, + { + "epoch": 0.27965792914246523, + "grad_norm": 1.6015625, + "learning_rate": 1.9620201786634584e-05, + "loss": 1.0485, + "step": 1631 + }, + { + "epoch": 0.2798293932314551, + "grad_norm": 1.5390625, + "learning_rate": 1.9619708650480595e-05, + "loss": 1.0014, + "step": 1632 + }, + { + "epoch": 0.28000085732044494, + "grad_norm": 1.6328125, + "learning_rate": 1.961921520059168e-05, + "loss": 1.0522, + "step": 1633 + }, + { + "epoch": 0.2801723214094348, + "grad_norm": 1.640625, + "learning_rate": 1.9618721436983935e-05, + "loss": 1.036, + "step": 1634 + }, + { + "epoch": 0.28034378549842465, + "grad_norm": 1.625, + "learning_rate": 1.9618227359673464e-05, + "loss": 1.181, + "step": 1635 + }, + { + "epoch": 0.28051524958741453, + "grad_norm": 1.6640625, + "learning_rate": 1.961773296867638e-05, + "loss": 1.013, + "step": 1636 + }, + { + "epoch": 0.2806867136764044, + "grad_norm": 1.578125, + "learning_rate": 1.9617238264008806e-05, + "loss": 1.0042, + "step": 1637 + }, + { + "epoch": 0.28085817776539423, + "grad_norm": 1.5859375, + "learning_rate": 1.961674324568688e-05, + "loss": 0.9712, + "step": 1638 + }, + { + "epoch": 0.2810296418543841, + "grad_norm": 1.6875, + "learning_rate": 1.9616247913726738e-05, + "loss": 1.0074, + "step": 1639 + }, + { + "epoch": 
0.281201105943374, + "grad_norm": 1.6640625, + "learning_rate": 1.9615752268144544e-05, + "loss": 1.1121, + "step": 1640 + }, + { + "epoch": 0.2813725700323638, + "grad_norm": 1.71875, + "learning_rate": 1.9615256308956458e-05, + "loss": 1.0317, + "step": 1641 + }, + { + "epoch": 0.2815440341213537, + "grad_norm": 1.5546875, + "learning_rate": 1.961476003617866e-05, + "loss": 1.0479, + "step": 1642 + }, + { + "epoch": 0.2817154982103436, + "grad_norm": 1.578125, + "learning_rate": 1.9614263449827324e-05, + "loss": 1.0782, + "step": 1643 + }, + { + "epoch": 0.2818869622993334, + "grad_norm": 1.671875, + "learning_rate": 1.9613766549918657e-05, + "loss": 1.0909, + "step": 1644 + }, + { + "epoch": 0.2820584263883233, + "grad_norm": 1.7109375, + "learning_rate": 1.9613269336468862e-05, + "loss": 1.0773, + "step": 1645 + }, + { + "epoch": 0.2822298904773132, + "grad_norm": 1.6328125, + "learning_rate": 1.961277180949415e-05, + "loss": 1.0504, + "step": 1646 + }, + { + "epoch": 0.282401354566303, + "grad_norm": 1.59375, + "learning_rate": 1.9612273969010755e-05, + "loss": 1.0147, + "step": 1647 + }, + { + "epoch": 0.2825728186552929, + "grad_norm": 1.71875, + "learning_rate": 1.9611775815034905e-05, + "loss": 1.0605, + "step": 1648 + }, + { + "epoch": 0.28274428274428276, + "grad_norm": 1.578125, + "learning_rate": 1.9611277347582853e-05, + "loss": 1.0812, + "step": 1649 + }, + { + "epoch": 0.2829157468332726, + "grad_norm": 1.7421875, + "learning_rate": 1.9610778566670858e-05, + "loss": 1.0974, + "step": 1650 + }, + { + "epoch": 0.28308721092226247, + "grad_norm": 1.6015625, + "learning_rate": 1.9610279472315177e-05, + "loss": 0.9812, + "step": 1651 + }, + { + "epoch": 0.28325867501125235, + "grad_norm": 1.6953125, + "learning_rate": 1.9609780064532095e-05, + "loss": 1.0965, + "step": 1652 + }, + { + "epoch": 0.2834301391002422, + "grad_norm": 1.6484375, + "learning_rate": 1.96092803433379e-05, + "loss": 1.0054, + "step": 1653 + }, + { + "epoch": 0.28360160318923205, + "grad_norm": 1.6875, + "learning_rate": 1.9608780308748886e-05, + "loss": 1.0366, + "step": 1654 + }, + { + "epoch": 0.28377306727822194, + "grad_norm": 1.6328125, + "learning_rate": 1.9608279960781363e-05, + "loss": 1.0656, + "step": 1655 + }, + { + "epoch": 0.28394453136721176, + "grad_norm": 1.6015625, + "learning_rate": 1.960777929945165e-05, + "loss": 1.0081, + "step": 1656 + }, + { + "epoch": 0.28411599545620164, + "grad_norm": 1.703125, + "learning_rate": 1.9607278324776072e-05, + "loss": 1.1173, + "step": 1657 + }, + { + "epoch": 0.2842874595451915, + "grad_norm": 1.640625, + "learning_rate": 1.9606777036770978e-05, + "loss": 1.0612, + "step": 1658 + }, + { + "epoch": 0.28445892363418135, + "grad_norm": 1.640625, + "learning_rate": 1.96062754354527e-05, + "loss": 0.9736, + "step": 1659 + }, + { + "epoch": 0.28463038772317123, + "grad_norm": 1.6484375, + "learning_rate": 1.960577352083761e-05, + "loss": 1.1171, + "step": 1660 + }, + { + "epoch": 0.2848018518121611, + "grad_norm": 1.6796875, + "learning_rate": 1.9605271292942073e-05, + "loss": 1.0382, + "step": 1661 + }, + { + "epoch": 0.28497331590115094, + "grad_norm": 1.640625, + "learning_rate": 1.9604768751782468e-05, + "loss": 1.0854, + "step": 1662 + }, + { + "epoch": 0.2851447799901408, + "grad_norm": 1.640625, + "learning_rate": 1.9604265897375187e-05, + "loss": 1.0552, + "step": 1663 + }, + { + "epoch": 0.2853162440791307, + "grad_norm": 1.625, + "learning_rate": 1.960376272973663e-05, + "loss": 1.076, + "step": 1664 + }, + { + "epoch": 0.2854877081681205, + 
"grad_norm": 1.53125, + "learning_rate": 1.9603259248883203e-05, + "loss": 0.9763, + "step": 1665 + }, + { + "epoch": 0.2856591722571104, + "grad_norm": 1.6640625, + "learning_rate": 1.9602755454831334e-05, + "loss": 1.0594, + "step": 1666 + }, + { + "epoch": 0.2858306363461003, + "grad_norm": 1.6796875, + "learning_rate": 1.9602251347597442e-05, + "loss": 0.9935, + "step": 1667 + }, + { + "epoch": 0.2860021004350901, + "grad_norm": 1.71875, + "learning_rate": 1.960174692719798e-05, + "loss": 1.0511, + "step": 1668 + }, + { + "epoch": 0.28617356452408, + "grad_norm": 1.640625, + "learning_rate": 1.9601242193649394e-05, + "loss": 1.0227, + "step": 1669 + }, + { + "epoch": 0.2863450286130699, + "grad_norm": 1.65625, + "learning_rate": 1.9600737146968143e-05, + "loss": 1.0802, + "step": 1670 + }, + { + "epoch": 0.2865164927020597, + "grad_norm": 1.609375, + "learning_rate": 1.9600231787170704e-05, + "loss": 1.0771, + "step": 1671 + }, + { + "epoch": 0.2866879567910496, + "grad_norm": 1.6171875, + "learning_rate": 1.959972611427355e-05, + "loss": 1.0267, + "step": 1672 + }, + { + "epoch": 0.28685942088003946, + "grad_norm": 1.6640625, + "learning_rate": 1.959922012829318e-05, + "loss": 1.0999, + "step": 1673 + }, + { + "epoch": 0.2870308849690293, + "grad_norm": 1.6484375, + "learning_rate": 1.9598713829246097e-05, + "loss": 0.9944, + "step": 1674 + }, + { + "epoch": 0.28720234905801917, + "grad_norm": 1.65625, + "learning_rate": 1.9598207217148806e-05, + "loss": 1.0756, + "step": 1675 + }, + { + "epoch": 0.28737381314700905, + "grad_norm": 1.6953125, + "learning_rate": 1.9597700292017838e-05, + "loss": 1.032, + "step": 1676 + }, + { + "epoch": 0.2875452772359989, + "grad_norm": 1.6484375, + "learning_rate": 1.959719305386972e-05, + "loss": 1.0701, + "step": 1677 + }, + { + "epoch": 0.28771674132498876, + "grad_norm": 1.5390625, + "learning_rate": 1.9596685502720997e-05, + "loss": 1.0368, + "step": 1678 + }, + { + "epoch": 0.28788820541397864, + "grad_norm": 1.609375, + "learning_rate": 1.9596177638588223e-05, + "loss": 1.0462, + "step": 1679 + }, + { + "epoch": 0.28805966950296846, + "grad_norm": 1.625, + "learning_rate": 1.959566946148796e-05, + "loss": 1.0831, + "step": 1680 + }, + { + "epoch": 0.28823113359195834, + "grad_norm": 1.75, + "learning_rate": 1.9595160971436784e-05, + "loss": 1.2239, + "step": 1681 + }, + { + "epoch": 0.28840259768094817, + "grad_norm": 1.6875, + "learning_rate": 1.9594652168451274e-05, + "loss": 1.0844, + "step": 1682 + }, + { + "epoch": 0.28857406176993805, + "grad_norm": 1.7109375, + "learning_rate": 1.9594143052548027e-05, + "loss": 1.1673, + "step": 1683 + }, + { + "epoch": 0.28874552585892793, + "grad_norm": 1.6171875, + "learning_rate": 1.9593633623743646e-05, + "loss": 0.9826, + "step": 1684 + }, + { + "epoch": 0.28891698994791776, + "grad_norm": 1.6328125, + "learning_rate": 1.9593123882054748e-05, + "loss": 0.9722, + "step": 1685 + }, + { + "epoch": 0.28908845403690764, + "grad_norm": 1.71875, + "learning_rate": 1.9592613827497953e-05, + "loss": 1.0444, + "step": 1686 + }, + { + "epoch": 0.2892599181258975, + "grad_norm": 1.6328125, + "learning_rate": 1.9592103460089903e-05, + "loss": 1.1105, + "step": 1687 + }, + { + "epoch": 0.28943138221488734, + "grad_norm": 1.7265625, + "learning_rate": 1.9591592779847234e-05, + "loss": 1.0662, + "step": 1688 + }, + { + "epoch": 0.2896028463038772, + "grad_norm": 1.6328125, + "learning_rate": 1.9591081786786608e-05, + "loss": 1.1187, + "step": 1689 + }, + { + "epoch": 0.2897743103928671, + "grad_norm": 1.5625, + 
"learning_rate": 1.959057048092469e-05, + "loss": 1.0104, + "step": 1690 + }, + { + "epoch": 0.28994577448185693, + "grad_norm": 1.6328125, + "learning_rate": 1.9590058862278154e-05, + "loss": 1.0834, + "step": 1691 + }, + { + "epoch": 0.2901172385708468, + "grad_norm": 1.6796875, + "learning_rate": 1.9589546930863685e-05, + "loss": 1.0711, + "step": 1692 + }, + { + "epoch": 0.2902887026598367, + "grad_norm": 1.5625, + "learning_rate": 1.9589034686697977e-05, + "loss": 1.0707, + "step": 1693 + }, + { + "epoch": 0.2904601667488265, + "grad_norm": 1.5390625, + "learning_rate": 1.9588522129797744e-05, + "loss": 1.0679, + "step": 1694 + }, + { + "epoch": 0.2906316308378164, + "grad_norm": 1.7890625, + "learning_rate": 1.9588009260179693e-05, + "loss": 1.1389, + "step": 1695 + }, + { + "epoch": 0.2908030949268063, + "grad_norm": 1.59375, + "learning_rate": 1.9587496077860553e-05, + "loss": 0.9303, + "step": 1696 + }, + { + "epoch": 0.2909745590157961, + "grad_norm": 1.609375, + "learning_rate": 1.9586982582857067e-05, + "loss": 1.0341, + "step": 1697 + }, + { + "epoch": 0.291146023104786, + "grad_norm": 1.6640625, + "learning_rate": 1.9586468775185975e-05, + "loss": 1.0564, + "step": 1698 + }, + { + "epoch": 0.29131748719377587, + "grad_norm": 1.5703125, + "learning_rate": 1.958595465486404e-05, + "loss": 1.0828, + "step": 1699 + }, + { + "epoch": 0.2914889512827657, + "grad_norm": 1.703125, + "learning_rate": 1.9585440221908026e-05, + "loss": 1.141, + "step": 1700 + }, + { + "epoch": 0.2916604153717556, + "grad_norm": 1.625, + "learning_rate": 1.958492547633471e-05, + "loss": 1.0198, + "step": 1701 + }, + { + "epoch": 0.29183187946074546, + "grad_norm": 1.78125, + "learning_rate": 1.9584410418160876e-05, + "loss": 1.1171, + "step": 1702 + }, + { + "epoch": 0.2920033435497353, + "grad_norm": 1.6171875, + "learning_rate": 1.9583895047403335e-05, + "loss": 1.064, + "step": 1703 + }, + { + "epoch": 0.29217480763872516, + "grad_norm": 1.8203125, + "learning_rate": 1.958337936407888e-05, + "loss": 1.0585, + "step": 1704 + }, + { + "epoch": 0.29234627172771505, + "grad_norm": 1.640625, + "learning_rate": 1.9582863368204342e-05, + "loss": 1.055, + "step": 1705 + }, + { + "epoch": 0.29251773581670487, + "grad_norm": 1.671875, + "learning_rate": 1.958234705979654e-05, + "loss": 1.0747, + "step": 1706 + }, + { + "epoch": 0.29268919990569475, + "grad_norm": 1.578125, + "learning_rate": 1.958183043887232e-05, + "loss": 1.0259, + "step": 1707 + }, + { + "epoch": 0.29286066399468463, + "grad_norm": 1.5859375, + "learning_rate": 1.958131350544852e-05, + "loss": 1.0217, + "step": 1708 + }, + { + "epoch": 0.29303212808367446, + "grad_norm": 1.6171875, + "learning_rate": 1.9580796259542018e-05, + "loss": 1.0129, + "step": 1709 + }, + { + "epoch": 0.29320359217266434, + "grad_norm": 1.5546875, + "learning_rate": 1.958027870116966e-05, + "loss": 0.9592, + "step": 1710 + }, + { + "epoch": 0.2933750562616542, + "grad_norm": 1.5859375, + "learning_rate": 1.9579760830348345e-05, + "loss": 1.1031, + "step": 1711 + }, + { + "epoch": 0.29354652035064405, + "grad_norm": 1.7109375, + "learning_rate": 1.9579242647094956e-05, + "loss": 1.117, + "step": 1712 + }, + { + "epoch": 0.29371798443963393, + "grad_norm": 1.796875, + "learning_rate": 1.957872415142639e-05, + "loss": 1.029, + "step": 1713 + }, + { + "epoch": 0.2938894485286238, + "grad_norm": 1.6171875, + "learning_rate": 1.957820534335956e-05, + "loss": 1.0084, + "step": 1714 + }, + { + "epoch": 0.29406091261761363, + "grad_norm": 1.640625, + "learning_rate": 
1.9577686222911386e-05, + "loss": 1.0399, + "step": 1715 + }, + { + "epoch": 0.2942323767066035, + "grad_norm": 1.578125, + "learning_rate": 1.9577166790098797e-05, + "loss": 1.0133, + "step": 1716 + }, + { + "epoch": 0.2944038407955934, + "grad_norm": 1.6875, + "learning_rate": 1.9576647044938733e-05, + "loss": 1.1458, + "step": 1717 + }, + { + "epoch": 0.2945753048845832, + "grad_norm": 1.703125, + "learning_rate": 1.957612698744815e-05, + "loss": 1.1054, + "step": 1718 + }, + { + "epoch": 0.2947467689735731, + "grad_norm": 1.6640625, + "learning_rate": 1.9575606617644e-05, + "loss": 1.0095, + "step": 1719 + }, + { + "epoch": 0.294918233062563, + "grad_norm": 1.6796875, + "learning_rate": 1.9575085935543266e-05, + "loss": 1.0516, + "step": 1720 + }, + { + "epoch": 0.2950896971515528, + "grad_norm": 1.65625, + "learning_rate": 1.9574564941162918e-05, + "loss": 1.0511, + "step": 1721 + }, + { + "epoch": 0.2952611612405427, + "grad_norm": 1.671875, + "learning_rate": 1.9574043634519957e-05, + "loss": 1.071, + "step": 1722 + }, + { + "epoch": 0.29543262532953257, + "grad_norm": 1.6796875, + "learning_rate": 1.9573522015631378e-05, + "loss": 1.1122, + "step": 1723 + }, + { + "epoch": 0.2956040894185224, + "grad_norm": 1.6328125, + "learning_rate": 1.9573000084514197e-05, + "loss": 1.0394, + "step": 1724 + }, + { + "epoch": 0.2957755535075123, + "grad_norm": 1.703125, + "learning_rate": 1.9572477841185435e-05, + "loss": 1.1326, + "step": 1725 + }, + { + "epoch": 0.29594701759650216, + "grad_norm": 1.5859375, + "learning_rate": 1.9571955285662124e-05, + "loss": 1.0425, + "step": 1726 + }, + { + "epoch": 0.296118481685492, + "grad_norm": 1.6484375, + "learning_rate": 1.9571432417961308e-05, + "loss": 1.0776, + "step": 1727 + }, + { + "epoch": 0.29628994577448187, + "grad_norm": 1.546875, + "learning_rate": 1.9570909238100034e-05, + "loss": 1.0177, + "step": 1728 + }, + { + "epoch": 0.2964614098634717, + "grad_norm": 1.6328125, + "learning_rate": 1.9570385746095372e-05, + "loss": 1.0884, + "step": 1729 + }, + { + "epoch": 0.2966328739524616, + "grad_norm": 1.640625, + "learning_rate": 1.956986194196439e-05, + "loss": 1.069, + "step": 1730 + }, + { + "epoch": 0.29680433804145145, + "grad_norm": 1.640625, + "learning_rate": 1.9569337825724174e-05, + "loss": 1.0551, + "step": 1731 + }, + { + "epoch": 0.2969758021304413, + "grad_norm": 1.578125, + "learning_rate": 1.9568813397391816e-05, + "loss": 1.0309, + "step": 1732 + }, + { + "epoch": 0.29714726621943116, + "grad_norm": 1.8359375, + "learning_rate": 1.9568288656984423e-05, + "loss": 1.1874, + "step": 1733 + }, + { + "epoch": 0.29731873030842104, + "grad_norm": 1.578125, + "learning_rate": 1.9567763604519105e-05, + "loss": 1.0737, + "step": 1734 + }, + { + "epoch": 0.29749019439741087, + "grad_norm": 1.609375, + "learning_rate": 1.9567238240012988e-05, + "loss": 1.0948, + "step": 1735 + }, + { + "epoch": 0.29766165848640075, + "grad_norm": 1.5703125, + "learning_rate": 1.9566712563483203e-05, + "loss": 1.0346, + "step": 1736 + }, + { + "epoch": 0.29783312257539063, + "grad_norm": 1.609375, + "learning_rate": 1.9566186574946894e-05, + "loss": 1.0707, + "step": 1737 + }, + { + "epoch": 0.29800458666438046, + "grad_norm": 1.75, + "learning_rate": 1.956566027442122e-05, + "loss": 1.1326, + "step": 1738 + }, + { + "epoch": 0.29817605075337034, + "grad_norm": 1.734375, + "learning_rate": 1.9565133661923346e-05, + "loss": 1.1118, + "step": 1739 + }, + { + "epoch": 0.2983475148423602, + "grad_norm": 1.6640625, + "learning_rate": 1.9564606737470444e-05, + 
"loss": 1.056, + "step": 1740 + }, + { + "epoch": 0.29851897893135004, + "grad_norm": 1.546875, + "learning_rate": 1.9564079501079698e-05, + "loss": 1.0614, + "step": 1741 + }, + { + "epoch": 0.2986904430203399, + "grad_norm": 1.5546875, + "learning_rate": 1.9563551952768303e-05, + "loss": 1.0242, + "step": 1742 + }, + { + "epoch": 0.2988619071093298, + "grad_norm": 1.6484375, + "learning_rate": 1.9563024092553465e-05, + "loss": 0.999, + "step": 1743 + }, + { + "epoch": 0.29903337119831963, + "grad_norm": 1.6640625, + "learning_rate": 1.9562495920452405e-05, + "loss": 1.0743, + "step": 1744 + }, + { + "epoch": 0.2992048352873095, + "grad_norm": 1.6796875, + "learning_rate": 1.9561967436482342e-05, + "loss": 1.1008, + "step": 1745 + }, + { + "epoch": 0.2993762993762994, + "grad_norm": 1.7734375, + "learning_rate": 1.956143864066051e-05, + "loss": 1.0411, + "step": 1746 + }, + { + "epoch": 0.2995477634652892, + "grad_norm": 1.609375, + "learning_rate": 1.9560909533004168e-05, + "loss": 1.0089, + "step": 1747 + }, + { + "epoch": 0.2997192275542791, + "grad_norm": 1.6171875, + "learning_rate": 1.9560380113530555e-05, + "loss": 1.083, + "step": 1748 + }, + { + "epoch": 0.299890691643269, + "grad_norm": 1.609375, + "learning_rate": 1.955985038225695e-05, + "loss": 1.0368, + "step": 1749 + }, + { + "epoch": 0.3000621557322588, + "grad_norm": 1.6640625, + "learning_rate": 1.955932033920062e-05, + "loss": 1.1021, + "step": 1750 + }, + { + "epoch": 0.3002336198212487, + "grad_norm": 1.6796875, + "learning_rate": 1.955878998437886e-05, + "loss": 1.0446, + "step": 1751 + }, + { + "epoch": 0.30040508391023857, + "grad_norm": 1.6171875, + "learning_rate": 1.9558259317808964e-05, + "loss": 1.1077, + "step": 1752 + }, + { + "epoch": 0.3005765479992284, + "grad_norm": 1.59375, + "learning_rate": 1.9557728339508238e-05, + "loss": 1.0784, + "step": 1753 + }, + { + "epoch": 0.3007480120882183, + "grad_norm": 1.5234375, + "learning_rate": 1.9557197049493997e-05, + "loss": 0.9533, + "step": 1754 + }, + { + "epoch": 0.30091947617720816, + "grad_norm": 1.6328125, + "learning_rate": 1.9556665447783577e-05, + "loss": 1.051, + "step": 1755 + }, + { + "epoch": 0.301090940266198, + "grad_norm": 1.8125, + "learning_rate": 1.9556133534394304e-05, + "loss": 1.1247, + "step": 1756 + }, + { + "epoch": 0.30126240435518786, + "grad_norm": 1.6328125, + "learning_rate": 1.9555601309343536e-05, + "loss": 1.1179, + "step": 1757 + }, + { + "epoch": 0.30143386844417774, + "grad_norm": 1.609375, + "learning_rate": 1.955506877264862e-05, + "loss": 1.0374, + "step": 1758 + }, + { + "epoch": 0.30160533253316757, + "grad_norm": 1.7578125, + "learning_rate": 1.9554535924326936e-05, + "loss": 1.1062, + "step": 1759 + }, + { + "epoch": 0.30177679662215745, + "grad_norm": 1.5703125, + "learning_rate": 1.9554002764395856e-05, + "loss": 0.9826, + "step": 1760 + }, + { + "epoch": 0.30194826071114733, + "grad_norm": 1.5546875, + "learning_rate": 1.9553469292872765e-05, + "loss": 0.9971, + "step": 1761 + }, + { + "epoch": 0.30211972480013716, + "grad_norm": 1.65625, + "learning_rate": 1.955293550977507e-05, + "loss": 1.0656, + "step": 1762 + }, + { + "epoch": 0.30229118888912704, + "grad_norm": 1.5625, + "learning_rate": 1.955240141512017e-05, + "loss": 1.0908, + "step": 1763 + }, + { + "epoch": 0.3024626529781169, + "grad_norm": 1.6015625, + "learning_rate": 1.9551867008925492e-05, + "loss": 1.0144, + "step": 1764 + }, + { + "epoch": 0.30263411706710674, + "grad_norm": 1.6875, + "learning_rate": 1.955133229120846e-05, + "loss": 1.0528, + 
"step": 1765 + }, + { + "epoch": 0.3028055811560966, + "grad_norm": 1.671875, + "learning_rate": 1.9550797261986516e-05, + "loss": 1.0964, + "step": 1766 + }, + { + "epoch": 0.3029770452450865, + "grad_norm": 1.546875, + "learning_rate": 1.9550261921277108e-05, + "loss": 1.0305, + "step": 1767 + }, + { + "epoch": 0.30314850933407633, + "grad_norm": 1.625, + "learning_rate": 1.9549726269097696e-05, + "loss": 1.0677, + "step": 1768 + }, + { + "epoch": 0.3033199734230662, + "grad_norm": 1.703125, + "learning_rate": 1.9549190305465754e-05, + "loss": 1.0878, + "step": 1769 + }, + { + "epoch": 0.3034914375120561, + "grad_norm": 1.625, + "learning_rate": 1.9548654030398754e-05, + "loss": 1.0571, + "step": 1770 + }, + { + "epoch": 0.3036629016010459, + "grad_norm": 1.5078125, + "learning_rate": 1.954811744391419e-05, + "loss": 0.9636, + "step": 1771 + }, + { + "epoch": 0.3038343656900358, + "grad_norm": 1.609375, + "learning_rate": 1.9547580546029555e-05, + "loss": 1.0493, + "step": 1772 + }, + { + "epoch": 0.3040058297790257, + "grad_norm": 1.640625, + "learning_rate": 1.9547043336762372e-05, + "loss": 1.0085, + "step": 1773 + }, + { + "epoch": 0.3041772938680155, + "grad_norm": 1.6171875, + "learning_rate": 1.9546505816130153e-05, + "loss": 1.1213, + "step": 1774 + }, + { + "epoch": 0.3043487579570054, + "grad_norm": 1.671875, + "learning_rate": 1.954596798415043e-05, + "loss": 1.0836, + "step": 1775 + }, + { + "epoch": 0.3045202220459952, + "grad_norm": 1.65625, + "learning_rate": 1.9545429840840744e-05, + "loss": 1.0237, + "step": 1776 + }, + { + "epoch": 0.3046916861349851, + "grad_norm": 1.6875, + "learning_rate": 1.9544891386218647e-05, + "loss": 1.0164, + "step": 1777 + }, + { + "epoch": 0.304863150223975, + "grad_norm": 1.6484375, + "learning_rate": 1.95443526203017e-05, + "loss": 1.037, + "step": 1778 + }, + { + "epoch": 0.3050346143129648, + "grad_norm": 1.640625, + "learning_rate": 1.9543813543107473e-05, + "loss": 1.038, + "step": 1779 + }, + { + "epoch": 0.3052060784019547, + "grad_norm": 1.6171875, + "learning_rate": 1.9543274154653544e-05, + "loss": 1.0393, + "step": 1780 + }, + { + "epoch": 0.30537754249094456, + "grad_norm": 1.5859375, + "learning_rate": 1.954273445495751e-05, + "loss": 1.0726, + "step": 1781 + }, + { + "epoch": 0.3055490065799344, + "grad_norm": 1.7734375, + "learning_rate": 1.9542194444036976e-05, + "loss": 1.157, + "step": 1782 + }, + { + "epoch": 0.30572047066892427, + "grad_norm": 1.5859375, + "learning_rate": 1.9541654121909543e-05, + "loss": 1.0675, + "step": 1783 + }, + { + "epoch": 0.30589193475791415, + "grad_norm": 1.625, + "learning_rate": 1.954111348859284e-05, + "loss": 1.0843, + "step": 1784 + }, + { + "epoch": 0.306063398846904, + "grad_norm": 1.7421875, + "learning_rate": 1.95405725441045e-05, + "loss": 1.1394, + "step": 1785 + }, + { + "epoch": 0.30623486293589386, + "grad_norm": 1.6953125, + "learning_rate": 1.9540031288462157e-05, + "loss": 1.0638, + "step": 1786 + }, + { + "epoch": 0.30640632702488374, + "grad_norm": 1.6015625, + "learning_rate": 1.9539489721683477e-05, + "loss": 1.0697, + "step": 1787 + }, + { + "epoch": 0.30657779111387357, + "grad_norm": 1.6484375, + "learning_rate": 1.953894784378611e-05, + "loss": 1.1335, + "step": 1788 + }, + { + "epoch": 0.30674925520286345, + "grad_norm": 1.703125, + "learning_rate": 1.9538405654787733e-05, + "loss": 1.1439, + "step": 1789 + }, + { + "epoch": 0.30692071929185333, + "grad_norm": 1.609375, + "learning_rate": 1.953786315470603e-05, + "loss": 1.1355, + "step": 1790 + }, + { + "epoch": 
0.30709218338084315, + "grad_norm": 1.625, + "learning_rate": 1.9537320343558696e-05, + "loss": 1.0443, + "step": 1791 + }, + { + "epoch": 0.30726364746983303, + "grad_norm": 1.6640625, + "learning_rate": 1.9536777221363427e-05, + "loss": 1.0282, + "step": 1792 + }, + { + "epoch": 0.3074351115588229, + "grad_norm": 1.5390625, + "learning_rate": 1.9536233788137945e-05, + "loss": 1.0803, + "step": 1793 + }, + { + "epoch": 0.30760657564781274, + "grad_norm": 1.609375, + "learning_rate": 1.9535690043899965e-05, + "loss": 1.0731, + "step": 1794 + }, + { + "epoch": 0.3077780397368026, + "grad_norm": 1.6484375, + "learning_rate": 1.953514598866723e-05, + "loss": 1.1285, + "step": 1795 + }, + { + "epoch": 0.3079495038257925, + "grad_norm": 1.625, + "learning_rate": 1.9534601622457473e-05, + "loss": 1.0675, + "step": 1796 + }, + { + "epoch": 0.30812096791478233, + "grad_norm": 1.5234375, + "learning_rate": 1.9534056945288454e-05, + "loss": 0.9412, + "step": 1797 + }, + { + "epoch": 0.3082924320037722, + "grad_norm": 1.6171875, + "learning_rate": 1.953351195717794e-05, + "loss": 1.0785, + "step": 1798 + }, + { + "epoch": 0.3084638960927621, + "grad_norm": 1.640625, + "learning_rate": 1.9532966658143697e-05, + "loss": 1.1037, + "step": 1799 + }, + { + "epoch": 0.3086353601817519, + "grad_norm": 1.7265625, + "learning_rate": 1.953242104820351e-05, + "loss": 1.1528, + "step": 1800 + }, + { + "epoch": 0.3088068242707418, + "grad_norm": 1.578125, + "learning_rate": 1.9531875127375185e-05, + "loss": 1.004, + "step": 1801 + }, + { + "epoch": 0.3089782883597317, + "grad_norm": 1.6171875, + "learning_rate": 1.9531328895676515e-05, + "loss": 0.9947, + "step": 1802 + }, + { + "epoch": 0.3091497524487215, + "grad_norm": 1.6796875, + "learning_rate": 1.9530782353125315e-05, + "loss": 0.9227, + "step": 1803 + }, + { + "epoch": 0.3093212165377114, + "grad_norm": 1.71875, + "learning_rate": 1.9530235499739417e-05, + "loss": 0.987, + "step": 1804 + }, + { + "epoch": 0.30949268062670127, + "grad_norm": 1.7265625, + "learning_rate": 1.952968833553665e-05, + "loss": 1.1483, + "step": 1805 + }, + { + "epoch": 0.3096641447156911, + "grad_norm": 1.75, + "learning_rate": 1.952914086053486e-05, + "loss": 1.0115, + "step": 1806 + }, + { + "epoch": 0.309835608804681, + "grad_norm": 1.7265625, + "learning_rate": 1.9528593074751903e-05, + "loss": 1.1096, + "step": 1807 + }, + { + "epoch": 0.31000707289367085, + "grad_norm": 1.578125, + "learning_rate": 1.952804497820565e-05, + "loss": 1.0568, + "step": 1808 + }, + { + "epoch": 0.3101785369826607, + "grad_norm": 1.5859375, + "learning_rate": 1.9527496570913964e-05, + "loss": 1.1535, + "step": 1809 + }, + { + "epoch": 0.31035000107165056, + "grad_norm": 1.6171875, + "learning_rate": 1.9526947852894743e-05, + "loss": 1.0408, + "step": 1810 + }, + { + "epoch": 0.31052146516064044, + "grad_norm": 1.59375, + "learning_rate": 1.9526398824165874e-05, + "loss": 1.0993, + "step": 1811 + }, + { + "epoch": 0.31069292924963027, + "grad_norm": 1.6484375, + "learning_rate": 1.9525849484745266e-05, + "loss": 1.0322, + "step": 1812 + }, + { + "epoch": 0.31086439333862015, + "grad_norm": 1.6171875, + "learning_rate": 1.9525299834650838e-05, + "loss": 1.0349, + "step": 1813 + }, + { + "epoch": 0.31103585742761003, + "grad_norm": 1.6796875, + "learning_rate": 1.9524749873900514e-05, + "loss": 1.108, + "step": 1814 + }, + { + "epoch": 0.31120732151659986, + "grad_norm": 1.703125, + "learning_rate": 1.9524199602512227e-05, + "loss": 1.1155, + "step": 1815 + }, + { + "epoch": 0.31137878560558974, + 
"grad_norm": 1.65625, + "learning_rate": 1.9523649020503925e-05, + "loss": 0.9738, + "step": 1816 + }, + { + "epoch": 0.3115502496945796, + "grad_norm": 1.65625, + "learning_rate": 1.9523098127893566e-05, + "loss": 1.0731, + "step": 1817 + }, + { + "epoch": 0.31172171378356944, + "grad_norm": 1.671875, + "learning_rate": 1.9522546924699117e-05, + "loss": 0.9524, + "step": 1818 + }, + { + "epoch": 0.3118931778725593, + "grad_norm": 1.7421875, + "learning_rate": 1.9521995410938556e-05, + "loss": 1.0931, + "step": 1819 + }, + { + "epoch": 0.3120646419615492, + "grad_norm": 1.7109375, + "learning_rate": 1.9521443586629866e-05, + "loss": 1.0354, + "step": 1820 + }, + { + "epoch": 0.31223610605053903, + "grad_norm": 1.6171875, + "learning_rate": 1.952089145179105e-05, + "loss": 1.0416, + "step": 1821 + }, + { + "epoch": 0.3124075701395289, + "grad_norm": 1.6015625, + "learning_rate": 1.9520339006440107e-05, + "loss": 1.0128, + "step": 1822 + }, + { + "epoch": 0.31257903422851874, + "grad_norm": 1.578125, + "learning_rate": 1.951978625059506e-05, + "loss": 1.0744, + "step": 1823 + }, + { + "epoch": 0.3127504983175086, + "grad_norm": 1.640625, + "learning_rate": 1.9519233184273937e-05, + "loss": 1.0984, + "step": 1824 + }, + { + "epoch": 0.3129219624064985, + "grad_norm": 1.7265625, + "learning_rate": 1.951867980749477e-05, + "loss": 1.1042, + "step": 1825 + }, + { + "epoch": 0.3130934264954883, + "grad_norm": 1.625, + "learning_rate": 1.9518126120275615e-05, + "loss": 1.0636, + "step": 1826 + }, + { + "epoch": 0.3132648905844782, + "grad_norm": 1.640625, + "learning_rate": 1.9517572122634522e-05, + "loss": 1.0009, + "step": 1827 + }, + { + "epoch": 0.3134363546734681, + "grad_norm": 1.6015625, + "learning_rate": 1.9517017814589562e-05, + "loss": 1.0826, + "step": 1828 + }, + { + "epoch": 0.3136078187624579, + "grad_norm": 1.6328125, + "learning_rate": 1.9516463196158818e-05, + "loss": 1.0033, + "step": 1829 + }, + { + "epoch": 0.3137792828514478, + "grad_norm": 1.546875, + "learning_rate": 1.951590826736037e-05, + "loss": 0.9962, + "step": 1830 + }, + { + "epoch": 0.3139507469404377, + "grad_norm": 1.6796875, + "learning_rate": 1.9515353028212317e-05, + "loss": 1.0731, + "step": 1831 + }, + { + "epoch": 0.3141222110294275, + "grad_norm": 1.6328125, + "learning_rate": 1.9514797478732773e-05, + "loss": 1.0536, + "step": 1832 + }, + { + "epoch": 0.3142936751184174, + "grad_norm": 1.5703125, + "learning_rate": 1.9514241618939855e-05, + "loss": 0.9774, + "step": 1833 + }, + { + "epoch": 0.31446513920740726, + "grad_norm": 1.6640625, + "learning_rate": 1.9513685448851688e-05, + "loss": 1.0468, + "step": 1834 + }, + { + "epoch": 0.3146366032963971, + "grad_norm": 1.5625, + "learning_rate": 1.9513128968486414e-05, + "loss": 1.0279, + "step": 1835 + }, + { + "epoch": 0.31480806738538697, + "grad_norm": 1.5390625, + "learning_rate": 1.9512572177862184e-05, + "loss": 0.9927, + "step": 1836 + }, + { + "epoch": 0.31497953147437685, + "grad_norm": 1.546875, + "learning_rate": 1.951201507699715e-05, + "loss": 1.0225, + "step": 1837 + }, + { + "epoch": 0.3151509955633667, + "grad_norm": 1.6015625, + "learning_rate": 1.951145766590949e-05, + "loss": 1.0497, + "step": 1838 + }, + { + "epoch": 0.31532245965235656, + "grad_norm": 1.59375, + "learning_rate": 1.9510899944617377e-05, + "loss": 0.9978, + "step": 1839 + }, + { + "epoch": 0.31549392374134644, + "grad_norm": 1.671875, + "learning_rate": 1.9510341913139e-05, + "loss": 1.0166, + "step": 1840 + }, + { + "epoch": 0.31566538783033626, + "grad_norm": 1.7265625, 
+ "learning_rate": 1.950978357149256e-05, + "loss": 1.1047, + "step": 1841 + }, + { + "epoch": 0.31583685191932614, + "grad_norm": 1.6328125, + "learning_rate": 1.9509224919696274e-05, + "loss": 0.9729, + "step": 1842 + }, + { + "epoch": 0.316008316008316, + "grad_norm": 1.5859375, + "learning_rate": 1.9508665957768345e-05, + "loss": 1.0282, + "step": 1843 + }, + { + "epoch": 0.31617978009730585, + "grad_norm": 1.609375, + "learning_rate": 1.950810668572702e-05, + "loss": 1.0229, + "step": 1844 + }, + { + "epoch": 0.31635124418629573, + "grad_norm": 1.6953125, + "learning_rate": 1.9507547103590528e-05, + "loss": 1.1291, + "step": 1845 + }, + { + "epoch": 0.3165227082752856, + "grad_norm": 1.609375, + "learning_rate": 1.9506987211377125e-05, + "loss": 1.1058, + "step": 1846 + }, + { + "epoch": 0.31669417236427544, + "grad_norm": 1.6171875, + "learning_rate": 1.950642700910507e-05, + "loss": 1.0973, + "step": 1847 + }, + { + "epoch": 0.3168656364532653, + "grad_norm": 42.25, + "learning_rate": 1.950586649679263e-05, + "loss": 1.2741, + "step": 1848 + }, + { + "epoch": 0.3170371005422552, + "grad_norm": 1.703125, + "learning_rate": 1.9505305674458087e-05, + "loss": 1.0999, + "step": 1849 + }, + { + "epoch": 0.317208564631245, + "grad_norm": 1.609375, + "learning_rate": 1.9504744542119735e-05, + "loss": 1.004, + "step": 1850 + }, + { + "epoch": 0.3173800287202349, + "grad_norm": 1.6171875, + "learning_rate": 1.950418309979587e-05, + "loss": 1.0514, + "step": 1851 + }, + { + "epoch": 0.3175514928092248, + "grad_norm": 1.671875, + "learning_rate": 1.9503621347504806e-05, + "loss": 1.0184, + "step": 1852 + }, + { + "epoch": 0.3177229568982146, + "grad_norm": 1.7265625, + "learning_rate": 1.950305928526486e-05, + "loss": 1.0488, + "step": 1853 + }, + { + "epoch": 0.3178944209872045, + "grad_norm": 1.609375, + "learning_rate": 1.950249691309437e-05, + "loss": 0.9829, + "step": 1854 + }, + { + "epoch": 0.3180658850761944, + "grad_norm": 1.546875, + "learning_rate": 1.950193423101167e-05, + "loss": 0.9648, + "step": 1855 + }, + { + "epoch": 0.3182373491651842, + "grad_norm": 1.6015625, + "learning_rate": 1.9501371239035113e-05, + "loss": 0.9902, + "step": 1856 + }, + { + "epoch": 0.3184088132541741, + "grad_norm": 1.5859375, + "learning_rate": 1.950080793718306e-05, + "loss": 1.0648, + "step": 1857 + }, + { + "epoch": 0.31858027734316396, + "grad_norm": 1.578125, + "learning_rate": 1.9500244325473888e-05, + "loss": 1.0418, + "step": 1858 + }, + { + "epoch": 0.3187517414321538, + "grad_norm": 1.65625, + "learning_rate": 1.949968040392597e-05, + "loss": 1.0923, + "step": 1859 + }, + { + "epoch": 0.31892320552114367, + "grad_norm": 1.671875, + "learning_rate": 1.9499116172557703e-05, + "loss": 1.1025, + "step": 1860 + }, + { + "epoch": 0.31909466961013355, + "grad_norm": 1.6328125, + "learning_rate": 1.949855163138749e-05, + "loss": 1.0503, + "step": 1861 + }, + { + "epoch": 0.3192661336991234, + "grad_norm": 1.6171875, + "learning_rate": 1.9497986780433735e-05, + "loss": 1.0073, + "step": 1862 + }, + { + "epoch": 0.31943759778811326, + "grad_norm": 1.640625, + "learning_rate": 1.9497421619714866e-05, + "loss": 1.0719, + "step": 1863 + }, + { + "epoch": 0.31960906187710314, + "grad_norm": 2.03125, + "learning_rate": 1.9496856149249315e-05, + "loss": 1.0481, + "step": 1864 + }, + { + "epoch": 0.31978052596609297, + "grad_norm": 1.59375, + "learning_rate": 1.9496290369055522e-05, + "loss": 1.0105, + "step": 1865 + }, + { + "epoch": 0.31995199005508285, + "grad_norm": 1.59375, + "learning_rate": 
1.9495724279151945e-05, + "loss": 1.0392, + "step": 1866 + }, + { + "epoch": 0.3201234541440727, + "grad_norm": 1.53125, + "learning_rate": 1.949515787955704e-05, + "loss": 1.0413, + "step": 1867 + }, + { + "epoch": 0.32029491823306255, + "grad_norm": 1.703125, + "learning_rate": 1.9494591170289276e-05, + "loss": 1.0007, + "step": 1868 + }, + { + "epoch": 0.32046638232205243, + "grad_norm": 1.59375, + "learning_rate": 1.9494024151367145e-05, + "loss": 1.0438, + "step": 1869 + }, + { + "epoch": 0.32063784641104226, + "grad_norm": 1.796875, + "learning_rate": 1.9493456822809135e-05, + "loss": 1.0791, + "step": 1870 + }, + { + "epoch": 0.32080931050003214, + "grad_norm": 1.65625, + "learning_rate": 1.949288918463375e-05, + "loss": 1.0887, + "step": 1871 + }, + { + "epoch": 0.320980774589022, + "grad_norm": 1.9140625, + "learning_rate": 1.9492321236859496e-05, + "loss": 1.0292, + "step": 1872 + }, + { + "epoch": 0.32115223867801185, + "grad_norm": 1.6171875, + "learning_rate": 1.9491752979504906e-05, + "loss": 1.0404, + "step": 1873 + }, + { + "epoch": 0.32132370276700173, + "grad_norm": 1.671875, + "learning_rate": 1.949118441258851e-05, + "loss": 1.0434, + "step": 1874 + }, + { + "epoch": 0.3214951668559916, + "grad_norm": 1.59375, + "learning_rate": 1.9490615536128847e-05, + "loss": 0.9624, + "step": 1875 + }, + { + "epoch": 0.32166663094498144, + "grad_norm": 1.6796875, + "learning_rate": 1.9490046350144475e-05, + "loss": 1.1028, + "step": 1876 + }, + { + "epoch": 0.3218380950339713, + "grad_norm": 1.6640625, + "learning_rate": 1.948947685465395e-05, + "loss": 1.1315, + "step": 1877 + }, + { + "epoch": 0.3220095591229612, + "grad_norm": 1.5625, + "learning_rate": 1.9488907049675858e-05, + "loss": 1.0452, + "step": 1878 + }, + { + "epoch": 0.322181023211951, + "grad_norm": 1.6640625, + "learning_rate": 1.9488336935228772e-05, + "loss": 1.1242, + "step": 1879 + }, + { + "epoch": 0.3223524873009409, + "grad_norm": 1.578125, + "learning_rate": 1.948776651133129e-05, + "loss": 1.0056, + "step": 1880 + }, + { + "epoch": 0.3225239513899308, + "grad_norm": 1.765625, + "learning_rate": 1.9487195778002013e-05, + "loss": 1.1483, + "step": 1881 + }, + { + "epoch": 0.3226954154789206, + "grad_norm": 1.7109375, + "learning_rate": 1.9486624735259557e-05, + "loss": 1.0993, + "step": 1882 + }, + { + "epoch": 0.3228668795679105, + "grad_norm": 1.5625, + "learning_rate": 1.9486053383122544e-05, + "loss": 1.0002, + "step": 1883 + }, + { + "epoch": 0.3230383436569004, + "grad_norm": 1.7578125, + "learning_rate": 1.948548172160961e-05, + "loss": 1.0762, + "step": 1884 + }, + { + "epoch": 0.3232098077458902, + "grad_norm": 1.5703125, + "learning_rate": 1.94849097507394e-05, + "loss": 1.0094, + "step": 1885 + }, + { + "epoch": 0.3233812718348801, + "grad_norm": 1.65625, + "learning_rate": 1.9484337470530563e-05, + "loss": 1.0485, + "step": 1886 + }, + { + "epoch": 0.32355273592386996, + "grad_norm": 1.75, + "learning_rate": 1.9483764881001767e-05, + "loss": 1.102, + "step": 1887 + }, + { + "epoch": 0.3237242000128598, + "grad_norm": 1.6875, + "learning_rate": 1.9483191982171686e-05, + "loss": 1.0389, + "step": 1888 + }, + { + "epoch": 0.32389566410184967, + "grad_norm": 1.5078125, + "learning_rate": 1.9482618774059007e-05, + "loss": 0.9699, + "step": 1889 + }, + { + "epoch": 0.32406712819083955, + "grad_norm": 1.7109375, + "learning_rate": 1.9482045256682415e-05, + "loss": 1.1713, + "step": 1890 + }, + { + "epoch": 0.3242385922798294, + "grad_norm": 1.734375, + "learning_rate": 1.9481471430060627e-05, + "loss": 
1.074, + "step": 1891 + }, + { + "epoch": 0.32441005636881926, + "grad_norm": 1.7421875, + "learning_rate": 1.9480897294212348e-05, + "loss": 1.0589, + "step": 1892 + }, + { + "epoch": 0.32458152045780914, + "grad_norm": 1.6875, + "learning_rate": 1.9480322849156307e-05, + "loss": 1.0421, + "step": 1893 + }, + { + "epoch": 0.32475298454679896, + "grad_norm": 1.6015625, + "learning_rate": 1.947974809491124e-05, + "loss": 1.0185, + "step": 1894 + }, + { + "epoch": 0.32492444863578884, + "grad_norm": 3.296875, + "learning_rate": 1.947917303149589e-05, + "loss": 1.1065, + "step": 1895 + }, + { + "epoch": 0.3250959127247787, + "grad_norm": 1.6171875, + "learning_rate": 1.9478597658929012e-05, + "loss": 1.0147, + "step": 1896 + }, + { + "epoch": 0.32526737681376855, + "grad_norm": 2.28125, + "learning_rate": 1.947802197722937e-05, + "loss": 1.055, + "step": 1897 + }, + { + "epoch": 0.32543884090275843, + "grad_norm": 1.84375, + "learning_rate": 1.9477445986415743e-05, + "loss": 1.0291, + "step": 1898 + }, + { + "epoch": 0.3256103049917483, + "grad_norm": 1.6328125, + "learning_rate": 1.947686968650691e-05, + "loss": 1.0111, + "step": 1899 + }, + { + "epoch": 0.32578176908073814, + "grad_norm": 1.59375, + "learning_rate": 1.9476293077521674e-05, + "loss": 1.0271, + "step": 1900 + }, + { + "epoch": 0.325953233169728, + "grad_norm": 1.640625, + "learning_rate": 1.9475716159478836e-05, + "loss": 1.0097, + "step": 1901 + }, + { + "epoch": 0.3261246972587179, + "grad_norm": 1.7734375, + "learning_rate": 1.947513893239721e-05, + "loss": 1.0723, + "step": 1902 + }, + { + "epoch": 0.3262961613477077, + "grad_norm": 1.703125, + "learning_rate": 1.947456139629562e-05, + "loss": 1.092, + "step": 1903 + }, + { + "epoch": 0.3264676254366976, + "grad_norm": 1.71875, + "learning_rate": 1.947398355119291e-05, + "loss": 1.0427, + "step": 1904 + }, + { + "epoch": 0.3266390895256875, + "grad_norm": 1.453125, + "learning_rate": 1.9473405397107917e-05, + "loss": 0.9551, + "step": 1905 + }, + { + "epoch": 0.3268105536146773, + "grad_norm": 1.6328125, + "learning_rate": 1.9472826934059506e-05, + "loss": 1.0296, + "step": 1906 + }, + { + "epoch": 0.3269820177036672, + "grad_norm": 1.5859375, + "learning_rate": 1.947224816206653e-05, + "loss": 1.072, + "step": 1907 + }, + { + "epoch": 0.3271534817926571, + "grad_norm": 1.6796875, + "learning_rate": 1.9471669081147878e-05, + "loss": 0.9822, + "step": 1908 + }, + { + "epoch": 0.3273249458816469, + "grad_norm": 1.640625, + "learning_rate": 1.947108969132243e-05, + "loss": 1.0869, + "step": 1909 + }, + { + "epoch": 0.3274964099706368, + "grad_norm": 1.6015625, + "learning_rate": 1.947050999260908e-05, + "loss": 1.0469, + "step": 1910 + }, + { + "epoch": 0.32766787405962666, + "grad_norm": 1.671875, + "learning_rate": 1.9469929985026738e-05, + "loss": 1.1537, + "step": 1911 + }, + { + "epoch": 0.3278393381486165, + "grad_norm": 1.59375, + "learning_rate": 1.9469349668594314e-05, + "loss": 1.093, + "step": 1912 + }, + { + "epoch": 0.32801080223760637, + "grad_norm": 1.6171875, + "learning_rate": 1.9468769043330744e-05, + "loss": 1.078, + "step": 1913 + }, + { + "epoch": 0.32818226632659625, + "grad_norm": 1.640625, + "learning_rate": 1.946818810925496e-05, + "loss": 1.0673, + "step": 1914 + }, + { + "epoch": 0.3283537304155861, + "grad_norm": 1.671875, + "learning_rate": 1.9467606866385905e-05, + "loss": 1.105, + "step": 1915 + }, + { + "epoch": 0.32852519450457596, + "grad_norm": 1.7109375, + "learning_rate": 1.946702531474254e-05, + "loss": 1.0411, + "step": 1916 + }, + { + 
"epoch": 0.3286966585935658, + "grad_norm": 1.578125, + "learning_rate": 1.946644345434383e-05, + "loss": 0.9658, + "step": 1917 + }, + { + "epoch": 0.32886812268255566, + "grad_norm": 1.671875, + "learning_rate": 1.9465861285208752e-05, + "loss": 1.0598, + "step": 1918 + }, + { + "epoch": 0.32903958677154554, + "grad_norm": 1.6328125, + "learning_rate": 1.9465278807356292e-05, + "loss": 1.104, + "step": 1919 + }, + { + "epoch": 0.32921105086053537, + "grad_norm": 1.65625, + "learning_rate": 1.9464696020805446e-05, + "loss": 1.1023, + "step": 1920 + }, + { + "epoch": 0.32938251494952525, + "grad_norm": 1.6484375, + "learning_rate": 1.9464112925575224e-05, + "loss": 1.1455, + "step": 1921 + }, + { + "epoch": 0.32955397903851513, + "grad_norm": 1.5859375, + "learning_rate": 1.9463529521684638e-05, + "loss": 1.0643, + "step": 1922 + }, + { + "epoch": 0.32972544312750496, + "grad_norm": 1.65625, + "learning_rate": 1.9462945809152722e-05, + "loss": 1.0266, + "step": 1923 + }, + { + "epoch": 0.32989690721649484, + "grad_norm": 1.6484375, + "learning_rate": 1.9462361787998503e-05, + "loss": 0.9947, + "step": 1924 + }, + { + "epoch": 0.3300683713054847, + "grad_norm": 1.625, + "learning_rate": 1.9461777458241038e-05, + "loss": 1.1242, + "step": 1925 + }, + { + "epoch": 0.33023983539447455, + "grad_norm": 1.6484375, + "learning_rate": 1.9461192819899383e-05, + "loss": 1.04, + "step": 1926 + }, + { + "epoch": 0.3304112994834644, + "grad_norm": 1.6171875, + "learning_rate": 1.94606078729926e-05, + "loss": 0.9667, + "step": 1927 + }, + { + "epoch": 0.3305827635724543, + "grad_norm": 1.6328125, + "learning_rate": 1.9460022617539765e-05, + "loss": 1.0544, + "step": 1928 + }, + { + "epoch": 0.33075422766144413, + "grad_norm": 1.6328125, + "learning_rate": 1.9459437053559974e-05, + "loss": 1.0008, + "step": 1929 + }, + { + "epoch": 0.330925691750434, + "grad_norm": 1.6328125, + "learning_rate": 1.9458851181072317e-05, + "loss": 1.121, + "step": 1930 + }, + { + "epoch": 0.3310971558394239, + "grad_norm": 1.6484375, + "learning_rate": 1.9458265000095905e-05, + "loss": 1.0576, + "step": 1931 + }, + { + "epoch": 0.3312686199284137, + "grad_norm": 1.6640625, + "learning_rate": 1.9457678510649857e-05, + "loss": 1.1327, + "step": 1932 + }, + { + "epoch": 0.3314400840174036, + "grad_norm": 1.53125, + "learning_rate": 1.9457091712753298e-05, + "loss": 1.0161, + "step": 1933 + }, + { + "epoch": 0.3316115481063935, + "grad_norm": 1.6328125, + "learning_rate": 1.9456504606425365e-05, + "loss": 0.9686, + "step": 1934 + }, + { + "epoch": 0.3317830121953833, + "grad_norm": 1.6015625, + "learning_rate": 1.9455917191685207e-05, + "loss": 0.9736, + "step": 1935 + }, + { + "epoch": 0.3319544762843732, + "grad_norm": 1.546875, + "learning_rate": 1.945532946855198e-05, + "loss": 1.0819, + "step": 1936 + }, + { + "epoch": 0.33212594037336307, + "grad_norm": 1.609375, + "learning_rate": 1.9454741437044858e-05, + "loss": 1.1065, + "step": 1937 + }, + { + "epoch": 0.3322974044623529, + "grad_norm": 1.5859375, + "learning_rate": 1.945415309718301e-05, + "loss": 0.9961, + "step": 1938 + }, + { + "epoch": 0.3324688685513428, + "grad_norm": 1.6875, + "learning_rate": 1.945356444898563e-05, + "loss": 1.1321, + "step": 1939 + }, + { + "epoch": 0.33264033264033266, + "grad_norm": 1.6484375, + "learning_rate": 1.9452975492471915e-05, + "loss": 1.0079, + "step": 1940 + }, + { + "epoch": 0.3328117967293225, + "grad_norm": 1.5390625, + "learning_rate": 1.9452386227661076e-05, + "loss": 1.0916, + "step": 1941 + }, + { + "epoch": 
0.33298326081831237, + "grad_norm": 1.6875, + "learning_rate": 1.945179665457232e-05, + "loss": 1.0289, + "step": 1942 + }, + { + "epoch": 0.33315472490730225, + "grad_norm": 1.7265625, + "learning_rate": 1.945120677322489e-05, + "loss": 1.1086, + "step": 1943 + }, + { + "epoch": 0.33332618899629207, + "grad_norm": 1.625, + "learning_rate": 1.9450616583638013e-05, + "loss": 0.9818, + "step": 1944 + }, + { + "epoch": 0.33349765308528195, + "grad_norm": 1.5703125, + "learning_rate": 1.9450026085830946e-05, + "loss": 1.0208, + "step": 1945 + }, + { + "epoch": 0.33366911717427183, + "grad_norm": 1.6484375, + "learning_rate": 1.9449435279822934e-05, + "loss": 1.0519, + "step": 1946 + }, + { + "epoch": 0.33384058126326166, + "grad_norm": 1.609375, + "learning_rate": 1.9448844165633264e-05, + "loss": 1.0612, + "step": 1947 + }, + { + "epoch": 0.33401204535225154, + "grad_norm": 1.6328125, + "learning_rate": 1.9448252743281196e-05, + "loss": 1.0525, + "step": 1948 + }, + { + "epoch": 0.3341835094412414, + "grad_norm": 1.6875, + "learning_rate": 1.9447661012786034e-05, + "loss": 1.0689, + "step": 1949 + }, + { + "epoch": 0.33435497353023125, + "grad_norm": 1.6640625, + "learning_rate": 1.9447068974167068e-05, + "loss": 1.1055, + "step": 1950 + }, + { + "epoch": 0.33452643761922113, + "grad_norm": 1.6328125, + "learning_rate": 1.9446476627443608e-05, + "loss": 0.9484, + "step": 1951 + }, + { + "epoch": 0.334697901708211, + "grad_norm": 1.5546875, + "learning_rate": 1.9445883972634973e-05, + "loss": 1.0176, + "step": 1952 + }, + { + "epoch": 0.33486936579720084, + "grad_norm": 1.5078125, + "learning_rate": 1.9445291009760493e-05, + "loss": 0.9543, + "step": 1953 + }, + { + "epoch": 0.3350408298861907, + "grad_norm": 1.609375, + "learning_rate": 1.9444697738839503e-05, + "loss": 1.065, + "step": 1954 + }, + { + "epoch": 0.3352122939751806, + "grad_norm": 1.7734375, + "learning_rate": 1.9444104159891353e-05, + "loss": 1.0328, + "step": 1955 + }, + { + "epoch": 0.3353837580641704, + "grad_norm": 1.703125, + "learning_rate": 1.9443510272935407e-05, + "loss": 1.175, + "step": 1956 + }, + { + "epoch": 0.3355552221531603, + "grad_norm": 1.6640625, + "learning_rate": 1.944291607799103e-05, + "loss": 0.9973, + "step": 1957 + }, + { + "epoch": 0.3357266862421502, + "grad_norm": 1.5625, + "learning_rate": 1.94423215750776e-05, + "loss": 1.0261, + "step": 1958 + }, + { + "epoch": 0.33589815033114, + "grad_norm": 1.578125, + "learning_rate": 1.9441726764214506e-05, + "loss": 1.0332, + "step": 1959 + }, + { + "epoch": 0.3360696144201299, + "grad_norm": 1.7109375, + "learning_rate": 1.9441131645421146e-05, + "loss": 1.091, + "step": 1960 + }, + { + "epoch": 0.3362410785091198, + "grad_norm": 1.6796875, + "learning_rate": 1.9440536218716934e-05, + "loss": 1.0468, + "step": 1961 + }, + { + "epoch": 0.3364125425981096, + "grad_norm": 1.671875, + "learning_rate": 1.9439940484121287e-05, + "loss": 1.0747, + "step": 1962 + }, + { + "epoch": 0.3365840066870995, + "grad_norm": 1.59375, + "learning_rate": 1.943934444165363e-05, + "loss": 1.0025, + "step": 1963 + }, + { + "epoch": 0.3367554707760893, + "grad_norm": 1.625, + "learning_rate": 1.943874809133341e-05, + "loss": 1.0103, + "step": 1964 + }, + { + "epoch": 0.3369269348650792, + "grad_norm": 1.5859375, + "learning_rate": 1.9438151433180068e-05, + "loss": 1.0693, + "step": 1965 + }, + { + "epoch": 0.33709839895406907, + "grad_norm": 1.546875, + "learning_rate": 1.9437554467213066e-05, + "loss": 1.0221, + "step": 1966 + }, + { + "epoch": 0.3372698630430589, + 
"grad_norm": 1.5390625, + "learning_rate": 1.943695719345188e-05, + "loss": 1.1032, + "step": 1967 + }, + { + "epoch": 0.3374413271320488, + "grad_norm": 1.578125, + "learning_rate": 1.9436359611915978e-05, + "loss": 1.0418, + "step": 1968 + }, + { + "epoch": 0.33761279122103866, + "grad_norm": 1.6640625, + "learning_rate": 1.943576172262486e-05, + "loss": 1.0459, + "step": 1969 + }, + { + "epoch": 0.3377842553100285, + "grad_norm": 1.6484375, + "learning_rate": 1.9435163525598017e-05, + "loss": 0.9038, + "step": 1970 + }, + { + "epoch": 0.33795571939901836, + "grad_norm": 1.6953125, + "learning_rate": 1.943456502085496e-05, + "loss": 1.061, + "step": 1971 + }, + { + "epoch": 0.33812718348800824, + "grad_norm": 1.625, + "learning_rate": 1.9433966208415212e-05, + "loss": 1.0721, + "step": 1972 + }, + { + "epoch": 0.33829864757699807, + "grad_norm": 1.5703125, + "learning_rate": 1.94333670882983e-05, + "loss": 1.0833, + "step": 1973 + }, + { + "epoch": 0.33847011166598795, + "grad_norm": 1.4921875, + "learning_rate": 1.9432767660523768e-05, + "loss": 0.9957, + "step": 1974 + }, + { + "epoch": 0.33864157575497783, + "grad_norm": 1.609375, + "learning_rate": 1.943216792511116e-05, + "loss": 0.9921, + "step": 1975 + }, + { + "epoch": 0.33881303984396766, + "grad_norm": 1.6640625, + "learning_rate": 1.9431567882080042e-05, + "loss": 1.002, + "step": 1976 + }, + { + "epoch": 0.33898450393295754, + "grad_norm": 1.6953125, + "learning_rate": 1.9430967531449973e-05, + "loss": 1.1183, + "step": 1977 + }, + { + "epoch": 0.3391559680219474, + "grad_norm": 1.65625, + "learning_rate": 1.943036687324054e-05, + "loss": 1.0156, + "step": 1978 + }, + { + "epoch": 0.33932743211093724, + "grad_norm": 1.671875, + "learning_rate": 1.9429765907471336e-05, + "loss": 1.1207, + "step": 1979 + }, + { + "epoch": 0.3394988961999271, + "grad_norm": 1.7265625, + "learning_rate": 1.942916463416195e-05, + "loss": 1.07, + "step": 1980 + }, + { + "epoch": 0.339670360288917, + "grad_norm": 1.6015625, + "learning_rate": 1.9428563053332004e-05, + "loss": 0.9844, + "step": 1981 + }, + { + "epoch": 0.33984182437790683, + "grad_norm": 1.59375, + "learning_rate": 1.942796116500111e-05, + "loss": 1.0781, + "step": 1982 + }, + { + "epoch": 0.3400132884668967, + "grad_norm": 1.6484375, + "learning_rate": 1.94273589691889e-05, + "loss": 1.078, + "step": 1983 + }, + { + "epoch": 0.3401847525558866, + "grad_norm": 1.6875, + "learning_rate": 1.9426756465915014e-05, + "loss": 1.1393, + "step": 1984 + }, + { + "epoch": 0.3403562166448764, + "grad_norm": 1.6015625, + "learning_rate": 1.94261536551991e-05, + "loss": 1.0316, + "step": 1985 + }, + { + "epoch": 0.3405276807338663, + "grad_norm": 1.578125, + "learning_rate": 1.9425550537060826e-05, + "loss": 1.1531, + "step": 1986 + }, + { + "epoch": 0.3406991448228562, + "grad_norm": 1.5546875, + "learning_rate": 1.942494711151985e-05, + "loss": 1.0325, + "step": 1987 + }, + { + "epoch": 0.340870608911846, + "grad_norm": 1.640625, + "learning_rate": 1.9424343378595857e-05, + "loss": 1.0903, + "step": 1988 + }, + { + "epoch": 0.3410420730008359, + "grad_norm": 1.65625, + "learning_rate": 1.942373933830854e-05, + "loss": 1.092, + "step": 1989 + }, + { + "epoch": 0.34121353708982577, + "grad_norm": 1.5625, + "learning_rate": 1.9423134990677596e-05, + "loss": 1.032, + "step": 1990 + }, + { + "epoch": 0.3413850011788156, + "grad_norm": 1.625, + "learning_rate": 1.9422530335722736e-05, + "loss": 1.0583, + "step": 1991 + }, + { + "epoch": 0.3415564652678055, + "grad_norm": 1.6015625, + "learning_rate": 
1.942192537346368e-05, + "loss": 1.0195, + "step": 1992 + }, + { + "epoch": 0.34172792935679536, + "grad_norm": 1.6640625, + "learning_rate": 1.942132010392016e-05, + "loss": 1.1263, + "step": 1993 + }, + { + "epoch": 0.3418993934457852, + "grad_norm": 1.6171875, + "learning_rate": 1.9420714527111907e-05, + "loss": 1.1151, + "step": 1994 + }, + { + "epoch": 0.34207085753477506, + "grad_norm": 1.640625, + "learning_rate": 1.9420108643058685e-05, + "loss": 1.0326, + "step": 1995 + }, + { + "epoch": 0.34224232162376494, + "grad_norm": 1.578125, + "learning_rate": 1.9419502451780243e-05, + "loss": 0.9469, + "step": 1996 + }, + { + "epoch": 0.34241378571275477, + "grad_norm": 1.546875, + "learning_rate": 1.941889595329636e-05, + "loss": 1.0316, + "step": 1997 + }, + { + "epoch": 0.34258524980174465, + "grad_norm": 1.6171875, + "learning_rate": 1.9418289147626804e-05, + "loss": 1.069, + "step": 1998 + }, + { + "epoch": 0.34275671389073453, + "grad_norm": 1.59375, + "learning_rate": 1.941768203479138e-05, + "loss": 1.049, + "step": 1999 + }, + { + "epoch": 0.34292817797972436, + "grad_norm": 1.6171875, + "learning_rate": 1.941707461480988e-05, + "loss": 1.0452, + "step": 2000 + }, + { + "epoch": 0.34309964206871424, + "grad_norm": 1.640625, + "learning_rate": 1.9416466887702116e-05, + "loss": 1.0781, + "step": 2001 + }, + { + "epoch": 0.3432711061577041, + "grad_norm": 1.671875, + "learning_rate": 1.9415858853487904e-05, + "loss": 0.991, + "step": 2002 + }, + { + "epoch": 0.34344257024669395, + "grad_norm": 1.5703125, + "learning_rate": 1.941525051218708e-05, + "loss": 1.0131, + "step": 2003 + }, + { + "epoch": 0.3436140343356838, + "grad_norm": 2.25, + "learning_rate": 1.9414641863819484e-05, + "loss": 1.1457, + "step": 2004 + }, + { + "epoch": 0.3437854984246737, + "grad_norm": 1.6171875, + "learning_rate": 1.9414032908404962e-05, + "loss": 1.1031, + "step": 2005 + }, + { + "epoch": 0.34395696251366353, + "grad_norm": 1.671875, + "learning_rate": 1.9413423645963378e-05, + "loss": 1.1239, + "step": 2006 + }, + { + "epoch": 0.3441284266026534, + "grad_norm": 1.578125, + "learning_rate": 1.94128140765146e-05, + "loss": 1.0762, + "step": 2007 + }, + { + "epoch": 0.3442998906916433, + "grad_norm": 1.5625, + "learning_rate": 1.9412204200078514e-05, + "loss": 1.0155, + "step": 2008 + }, + { + "epoch": 0.3444713547806331, + "grad_norm": 1.6328125, + "learning_rate": 1.9411594016675e-05, + "loss": 1.1015, + "step": 2009 + }, + { + "epoch": 0.344642818869623, + "grad_norm": 1.703125, + "learning_rate": 1.9410983526323964e-05, + "loss": 1.1553, + "step": 2010 + }, + { + "epoch": 0.3448142829586129, + "grad_norm": 1.6171875, + "learning_rate": 1.941037272904532e-05, + "loss": 1.1046, + "step": 2011 + }, + { + "epoch": 0.3449857470476027, + "grad_norm": 1.5234375, + "learning_rate": 1.940976162485899e-05, + "loss": 1.0013, + "step": 2012 + }, + { + "epoch": 0.3451572111365926, + "grad_norm": 1.6875, + "learning_rate": 1.9409150213784892e-05, + "loss": 1.0267, + "step": 2013 + }, + { + "epoch": 0.3453286752255824, + "grad_norm": 1.546875, + "learning_rate": 1.9408538495842973e-05, + "loss": 1.0041, + "step": 2014 + }, + { + "epoch": 0.3455001393145723, + "grad_norm": 1.6015625, + "learning_rate": 1.9407926471053188e-05, + "loss": 1.02, + "step": 2015 + }, + { + "epoch": 0.3456716034035622, + "grad_norm": 1.6171875, + "learning_rate": 1.9407314139435495e-05, + "loss": 1.0167, + "step": 2016 + }, + { + "epoch": 0.345843067492552, + "grad_norm": 1.7109375, + "learning_rate": 1.9406701501009862e-05, + "loss": 
1.0526, + "step": 2017 + }, + { + "epoch": 0.3460145315815419, + "grad_norm": 1.6328125, + "learning_rate": 1.9406088555796268e-05, + "loss": 1.0379, + "step": 2018 + }, + { + "epoch": 0.34618599567053177, + "grad_norm": 1.625, + "learning_rate": 1.940547530381471e-05, + "loss": 1.0715, + "step": 2019 + }, + { + "epoch": 0.3463574597595216, + "grad_norm": 1.6328125, + "learning_rate": 1.9404861745085184e-05, + "loss": 1.0622, + "step": 2020 + }, + { + "epoch": 0.34652892384851147, + "grad_norm": 1.5859375, + "learning_rate": 1.94042478796277e-05, + "loss": 0.9886, + "step": 2021 + }, + { + "epoch": 0.34670038793750135, + "grad_norm": 1.515625, + "learning_rate": 1.9403633707462282e-05, + "loss": 1.0259, + "step": 2022 + }, + { + "epoch": 0.3468718520264912, + "grad_norm": 1.703125, + "learning_rate": 1.9403019228608953e-05, + "loss": 0.9734, + "step": 2023 + }, + { + "epoch": 0.34704331611548106, + "grad_norm": 1.59375, + "learning_rate": 1.9402404443087763e-05, + "loss": 1.0332, + "step": 2024 + }, + { + "epoch": 0.34721478020447094, + "grad_norm": 1.625, + "learning_rate": 1.9401789350918755e-05, + "loss": 1.0274, + "step": 2025 + }, + { + "epoch": 0.34738624429346077, + "grad_norm": 1.7265625, + "learning_rate": 1.9401173952121993e-05, + "loss": 1.1486, + "step": 2026 + }, + { + "epoch": 0.34755770838245065, + "grad_norm": 1.5859375, + "learning_rate": 1.9400558246717547e-05, + "loss": 1.0557, + "step": 2027 + }, + { + "epoch": 0.34772917247144053, + "grad_norm": 1.578125, + "learning_rate": 1.93999422347255e-05, + "loss": 0.9885, + "step": 2028 + }, + { + "epoch": 0.34790063656043035, + "grad_norm": 1.5625, + "learning_rate": 1.9399325916165937e-05, + "loss": 1.1045, + "step": 2029 + }, + { + "epoch": 0.34807210064942024, + "grad_norm": 1.53125, + "learning_rate": 1.939870929105896e-05, + "loss": 0.9841, + "step": 2030 + }, + { + "epoch": 0.3482435647384101, + "grad_norm": 1.5859375, + "learning_rate": 1.9398092359424683e-05, + "loss": 0.9914, + "step": 2031 + }, + { + "epoch": 0.34841502882739994, + "grad_norm": 1.5703125, + "learning_rate": 1.9397475121283226e-05, + "loss": 1.0192, + "step": 2032 + }, + { + "epoch": 0.3485864929163898, + "grad_norm": 1.6015625, + "learning_rate": 1.9396857576654714e-05, + "loss": 1.0977, + "step": 2033 + }, + { + "epoch": 0.3487579570053797, + "grad_norm": 1.5703125, + "learning_rate": 1.9396239725559294e-05, + "loss": 1.0574, + "step": 2034 + }, + { + "epoch": 0.34892942109436953, + "grad_norm": 1.5859375, + "learning_rate": 1.939562156801711e-05, + "loss": 1.0004, + "step": 2035 + }, + { + "epoch": 0.3491008851833594, + "grad_norm": 1.6171875, + "learning_rate": 1.939500310404833e-05, + "loss": 1.0306, + "step": 2036 + }, + { + "epoch": 0.3492723492723493, + "grad_norm": 1.65625, + "learning_rate": 1.9394384333673117e-05, + "loss": 1.0474, + "step": 2037 + }, + { + "epoch": 0.3494438133613391, + "grad_norm": 1.6640625, + "learning_rate": 1.9393765256911657e-05, + "loss": 1.0378, + "step": 2038 + }, + { + "epoch": 0.349615277450329, + "grad_norm": 1.671875, + "learning_rate": 1.9393145873784137e-05, + "loss": 1.0387, + "step": 2039 + }, + { + "epoch": 0.3497867415393189, + "grad_norm": 1.6953125, + "learning_rate": 1.939252618431076e-05, + "loss": 1.0389, + "step": 2040 + }, + { + "epoch": 0.3499582056283087, + "grad_norm": 1.7578125, + "learning_rate": 1.9391906188511736e-05, + "loss": 1.041, + "step": 2041 + }, + { + "epoch": 0.3501296697172986, + "grad_norm": 1.6171875, + "learning_rate": 1.939128588640728e-05, + "loss": 1.1094, + "step": 2042 + 
}, + { + "epoch": 0.35030113380628847, + "grad_norm": 1.5859375, + "learning_rate": 1.9390665278017635e-05, + "loss": 1.0563, + "step": 2043 + }, + { + "epoch": 0.3504725978952783, + "grad_norm": 1.5625, + "learning_rate": 1.939004436336303e-05, + "loss": 1.0217, + "step": 2044 + }, + { + "epoch": 0.3506440619842682, + "grad_norm": 1.6328125, + "learning_rate": 1.9389423142463715e-05, + "loss": 1.0005, + "step": 2045 + }, + { + "epoch": 0.35081552607325805, + "grad_norm": 1.7265625, + "learning_rate": 1.9388801615339955e-05, + "loss": 1.0865, + "step": 2046 + }, + { + "epoch": 0.3509869901622479, + "grad_norm": 1.6171875, + "learning_rate": 1.9388179782012023e-05, + "loss": 1.0456, + "step": 2047 + }, + { + "epoch": 0.35115845425123776, + "grad_norm": 1.5625, + "learning_rate": 1.9387557642500192e-05, + "loss": 1.0332, + "step": 2048 + }, + { + "epoch": 0.35132991834022764, + "grad_norm": 1.6875, + "learning_rate": 1.9386935196824756e-05, + "loss": 1.057, + "step": 2049 + }, + { + "epoch": 0.35150138242921747, + "grad_norm": 1.71875, + "learning_rate": 1.938631244500602e-05, + "loss": 1.1701, + "step": 2050 + }, + { + "epoch": 0.35167284651820735, + "grad_norm": 1.640625, + "learning_rate": 1.9385689387064285e-05, + "loss": 0.9727, + "step": 2051 + }, + { + "epoch": 0.35184431060719723, + "grad_norm": 1.6640625, + "learning_rate": 1.9385066023019878e-05, + "loss": 1.0281, + "step": 2052 + }, + { + "epoch": 0.35201577469618706, + "grad_norm": 1.6015625, + "learning_rate": 1.9384442352893125e-05, + "loss": 1.0045, + "step": 2053 + }, + { + "epoch": 0.35218723878517694, + "grad_norm": 1.5859375, + "learning_rate": 1.938381837670437e-05, + "loss": 1.0382, + "step": 2054 + }, + { + "epoch": 0.3523587028741668, + "grad_norm": 1.6484375, + "learning_rate": 1.938319409447396e-05, + "loss": 1.0064, + "step": 2055 + }, + { + "epoch": 0.35253016696315664, + "grad_norm": 1.6171875, + "learning_rate": 1.938256950622226e-05, + "loss": 1.0506, + "step": 2056 + }, + { + "epoch": 0.3527016310521465, + "grad_norm": 1.5703125, + "learning_rate": 1.938194461196963e-05, + "loss": 1.0538, + "step": 2057 + }, + { + "epoch": 0.3528730951411364, + "grad_norm": 1.6015625, + "learning_rate": 1.9381319411736467e-05, + "loss": 1.0444, + "step": 2058 + }, + { + "epoch": 0.35304455923012623, + "grad_norm": 1.5390625, + "learning_rate": 1.9380693905543144e-05, + "loss": 1.0035, + "step": 2059 + }, + { + "epoch": 0.3532160233191161, + "grad_norm": 1.7578125, + "learning_rate": 1.938006809341007e-05, + "loss": 1.1376, + "step": 2060 + }, + { + "epoch": 0.35338748740810594, + "grad_norm": 1.671875, + "learning_rate": 1.9379441975357654e-05, + "loss": 1.0271, + "step": 2061 + }, + { + "epoch": 0.3535589514970958, + "grad_norm": 1.609375, + "learning_rate": 1.9378815551406315e-05, + "loss": 1.0318, + "step": 2062 + }, + { + "epoch": 0.3537304155860857, + "grad_norm": 1.6015625, + "learning_rate": 1.9378188821576484e-05, + "loss": 1.1232, + "step": 2063 + }, + { + "epoch": 0.3539018796750755, + "grad_norm": 1.6328125, + "learning_rate": 1.9377561785888602e-05, + "loss": 0.9949, + "step": 2064 + }, + { + "epoch": 0.3540733437640654, + "grad_norm": 1.6328125, + "learning_rate": 1.9376934444363114e-05, + "loss": 1.1025, + "step": 2065 + }, + { + "epoch": 0.3542448078530553, + "grad_norm": 1.6484375, + "learning_rate": 1.9376306797020486e-05, + "loss": 1.1526, + "step": 2066 + }, + { + "epoch": 0.3544162719420451, + "grad_norm": 20.375, + "learning_rate": 1.9375678843881186e-05, + "loss": 1.1018, + "step": 2067 + }, + { + "epoch": 
0.354587736031035, + "grad_norm": 1.53125, + "learning_rate": 1.937505058496569e-05, + "loss": 1.0327, + "step": 2068 + }, + { + "epoch": 0.3547592001200249, + "grad_norm": 1.5625, + "learning_rate": 1.937442202029449e-05, + "loss": 1.0052, + "step": 2069 + }, + { + "epoch": 0.3549306642090147, + "grad_norm": 1.6484375, + "learning_rate": 1.9373793149888092e-05, + "loss": 1.0494, + "step": 2070 + }, + { + "epoch": 0.3551021282980046, + "grad_norm": 1.7109375, + "learning_rate": 1.9373163973767e-05, + "loss": 1.0583, + "step": 2071 + }, + { + "epoch": 0.35527359238699446, + "grad_norm": 1.8203125, + "learning_rate": 1.937253449195174e-05, + "loss": 1.0328, + "step": 2072 + }, + { + "epoch": 0.3554450564759843, + "grad_norm": 1.5703125, + "learning_rate": 1.9371904704462826e-05, + "loss": 1.0797, + "step": 2073 + }, + { + "epoch": 0.35561652056497417, + "grad_norm": 1.5625, + "learning_rate": 1.9371274611320813e-05, + "loss": 1.0437, + "step": 2074 + }, + { + "epoch": 0.35578798465396405, + "grad_norm": 1.8203125, + "learning_rate": 1.9370644212546246e-05, + "loss": 1.0953, + "step": 2075 + }, + { + "epoch": 0.3559594487429539, + "grad_norm": 1.953125, + "learning_rate": 1.9370013508159685e-05, + "loss": 1.1619, + "step": 2076 + }, + { + "epoch": 0.35613091283194376, + "grad_norm": 1.7421875, + "learning_rate": 1.9369382498181698e-05, + "loss": 1.0185, + "step": 2077 + }, + { + "epoch": 0.35630237692093364, + "grad_norm": 1.75, + "learning_rate": 1.936875118263287e-05, + "loss": 1.112, + "step": 2078 + }, + { + "epoch": 0.35647384100992346, + "grad_norm": 1.7109375, + "learning_rate": 1.936811956153378e-05, + "loss": 1.0548, + "step": 2079 + }, + { + "epoch": 0.35664530509891335, + "grad_norm": 1.6484375, + "learning_rate": 1.936748763490504e-05, + "loss": 1.0316, + "step": 2080 + }, + { + "epoch": 0.3568167691879032, + "grad_norm": 1.6953125, + "learning_rate": 1.936685540276725e-05, + "loss": 0.9346, + "step": 2081 + }, + { + "epoch": 0.35698823327689305, + "grad_norm": 1.6953125, + "learning_rate": 1.9366222865141032e-05, + "loss": 1.0405, + "step": 2082 + }, + { + "epoch": 0.35715969736588293, + "grad_norm": 1.6015625, + "learning_rate": 1.936559002204702e-05, + "loss": 0.9518, + "step": 2083 + }, + { + "epoch": 0.3573311614548728, + "grad_norm": 1.703125, + "learning_rate": 1.936495687350585e-05, + "loss": 1.0378, + "step": 2084 + }, + { + "epoch": 0.35750262554386264, + "grad_norm": 1.6015625, + "learning_rate": 1.9364323419538166e-05, + "loss": 0.9886, + "step": 2085 + }, + { + "epoch": 0.3576740896328525, + "grad_norm": 1.7578125, + "learning_rate": 1.9363689660164637e-05, + "loss": 1.0009, + "step": 2086 + }, + { + "epoch": 0.3578455537218424, + "grad_norm": 1.6015625, + "learning_rate": 1.9363055595405928e-05, + "loss": 0.9917, + "step": 2087 + }, + { + "epoch": 0.3580170178108322, + "grad_norm": 1.609375, + "learning_rate": 1.9362421225282717e-05, + "loss": 1.0459, + "step": 2088 + }, + { + "epoch": 0.3581884818998221, + "grad_norm": 1.65625, + "learning_rate": 1.936178654981569e-05, + "loss": 1.1376, + "step": 2089 + }, + { + "epoch": 0.358359945988812, + "grad_norm": 1.53125, + "learning_rate": 1.9361151569025556e-05, + "loss": 1.0498, + "step": 2090 + }, + { + "epoch": 0.3585314100778018, + "grad_norm": 1.6171875, + "learning_rate": 1.936051628293302e-05, + "loss": 1.0736, + "step": 2091 + }, + { + "epoch": 0.3587028741667917, + "grad_norm": 1.6171875, + "learning_rate": 1.9359880691558796e-05, + "loss": 1.1096, + "step": 2092 + }, + { + "epoch": 0.3588743382557816, + 
"grad_norm": 1.6640625, + "learning_rate": 1.9359244794923615e-05, + "loss": 1.001, + "step": 2093 + }, + { + "epoch": 0.3590458023447714, + "grad_norm": 1.7265625, + "learning_rate": 1.935860859304822e-05, + "loss": 1.0068, + "step": 2094 + }, + { + "epoch": 0.3592172664337613, + "grad_norm": 1.6953125, + "learning_rate": 1.9357972085953356e-05, + "loss": 1.0759, + "step": 2095 + }, + { + "epoch": 0.35938873052275117, + "grad_norm": 1.5859375, + "learning_rate": 1.9357335273659785e-05, + "loss": 1.0363, + "step": 2096 + }, + { + "epoch": 0.359560194611741, + "grad_norm": 1.546875, + "learning_rate": 1.9356698156188273e-05, + "loss": 0.9934, + "step": 2097 + }, + { + "epoch": 0.35973165870073087, + "grad_norm": 1.7578125, + "learning_rate": 1.9356060733559602e-05, + "loss": 1.1027, + "step": 2098 + }, + { + "epoch": 0.35990312278972075, + "grad_norm": 1.734375, + "learning_rate": 1.9355423005794556e-05, + "loss": 1.0325, + "step": 2099 + }, + { + "epoch": 0.3600745868787106, + "grad_norm": 1.6796875, + "learning_rate": 1.9354784972913938e-05, + "loss": 1.116, + "step": 2100 + }, + { + "epoch": 0.3600745868787106, + "eval_loss": 0.8986930847167969, + "eval_runtime": 837.0926, + "eval_samples_per_second": 2.985, + "eval_steps_per_second": 2.985, + "step": 2100 + }, + { + "epoch": 0.36024605096770046, + "grad_norm": 1.625, + "learning_rate": 1.9354146634938558e-05, + "loss": 1.1254, + "step": 2101 + }, + { + "epoch": 0.36041751505669034, + "grad_norm": 1.6640625, + "learning_rate": 1.9353507991889228e-05, + "loss": 1.0357, + "step": 2102 + }, + { + "epoch": 0.36058897914568017, + "grad_norm": 1.6328125, + "learning_rate": 1.935286904378678e-05, + "loss": 1.0318, + "step": 2103 + }, + { + "epoch": 0.36076044323467005, + "grad_norm": 1.640625, + "learning_rate": 1.9352229790652056e-05, + "loss": 1.0231, + "step": 2104 + }, + { + "epoch": 0.36093190732365993, + "grad_norm": 1.765625, + "learning_rate": 1.9351590232505898e-05, + "loss": 1.0675, + "step": 2105 + }, + { + "epoch": 0.36110337141264975, + "grad_norm": 1.65625, + "learning_rate": 1.935095036936917e-05, + "loss": 1.0321, + "step": 2106 + }, + { + "epoch": 0.36127483550163964, + "grad_norm": 1.6796875, + "learning_rate": 1.9350310201262737e-05, + "loss": 1.0532, + "step": 2107 + }, + { + "epoch": 0.36144629959062946, + "grad_norm": 1.625, + "learning_rate": 1.934966972820748e-05, + "loss": 1.0822, + "step": 2108 + }, + { + "epoch": 0.36161776367961934, + "grad_norm": 1.5859375, + "learning_rate": 1.934902895022428e-05, + "loss": 1.0376, + "step": 2109 + }, + { + "epoch": 0.3617892277686092, + "grad_norm": 1.6484375, + "learning_rate": 1.9348387867334047e-05, + "loss": 1.0499, + "step": 2110 + }, + { + "epoch": 0.36196069185759905, + "grad_norm": 1.6171875, + "learning_rate": 1.934774647955768e-05, + "loss": 1.0162, + "step": 2111 + }, + { + "epoch": 0.36213215594658893, + "grad_norm": 1.734375, + "learning_rate": 1.9347104786916103e-05, + "loss": 1.042, + "step": 2112 + }, + { + "epoch": 0.3623036200355788, + "grad_norm": 1.671875, + "learning_rate": 1.934646278943024e-05, + "loss": 1.057, + "step": 2113 + }, + { + "epoch": 0.36247508412456864, + "grad_norm": 1.703125, + "learning_rate": 1.9345820487121026e-05, + "loss": 0.9956, + "step": 2114 + }, + { + "epoch": 0.3626465482135585, + "grad_norm": 1.5546875, + "learning_rate": 1.9345177880009417e-05, + "loss": 0.9508, + "step": 2115 + }, + { + "epoch": 0.3628180123025484, + "grad_norm": 1.8359375, + "learning_rate": 1.9344534968116365e-05, + "loss": 1.1172, + "step": 2116 + }, + { + 
"epoch": 0.3629894763915382, + "grad_norm": 1.6875, + "learning_rate": 1.9343891751462838e-05, + "loss": 1.1314, + "step": 2117 + }, + { + "epoch": 0.3631609404805281, + "grad_norm": 1.5703125, + "learning_rate": 1.9343248230069815e-05, + "loss": 1.0684, + "step": 2118 + }, + { + "epoch": 0.363332404569518, + "grad_norm": 1.625, + "learning_rate": 1.9342604403958287e-05, + "loss": 0.9849, + "step": 2119 + }, + { + "epoch": 0.3635038686585078, + "grad_norm": 1.5390625, + "learning_rate": 1.9341960273149246e-05, + "loss": 1.0186, + "step": 2120 + }, + { + "epoch": 0.3636753327474977, + "grad_norm": 1.6640625, + "learning_rate": 1.9341315837663704e-05, + "loss": 1.0682, + "step": 2121 + }, + { + "epoch": 0.3638467968364876, + "grad_norm": 1.671875, + "learning_rate": 1.9340671097522678e-05, + "loss": 1.0753, + "step": 2122 + }, + { + "epoch": 0.3640182609254774, + "grad_norm": 1.6484375, + "learning_rate": 1.934002605274719e-05, + "loss": 1.1382, + "step": 2123 + }, + { + "epoch": 0.3641897250144673, + "grad_norm": 1.5625, + "learning_rate": 1.933938070335828e-05, + "loss": 1.0476, + "step": 2124 + }, + { + "epoch": 0.36436118910345716, + "grad_norm": 1.5859375, + "learning_rate": 1.9338735049377e-05, + "loss": 1.0797, + "step": 2125 + }, + { + "epoch": 0.364532653192447, + "grad_norm": 1.53125, + "learning_rate": 1.9338089090824402e-05, + "loss": 1.0387, + "step": 2126 + }, + { + "epoch": 0.36470411728143687, + "grad_norm": 1.5859375, + "learning_rate": 1.9337442827721555e-05, + "loss": 1.0245, + "step": 2127 + }, + { + "epoch": 0.36487558137042675, + "grad_norm": 1.6015625, + "learning_rate": 1.9336796260089534e-05, + "loss": 1.0181, + "step": 2128 + }, + { + "epoch": 0.3650470454594166, + "grad_norm": 1.65625, + "learning_rate": 1.933614938794943e-05, + "loss": 1.0161, + "step": 2129 + }, + { + "epoch": 0.36521850954840646, + "grad_norm": 1.75, + "learning_rate": 1.933550221132234e-05, + "loss": 1.1429, + "step": 2130 + }, + { + "epoch": 0.36538997363739634, + "grad_norm": 1.75, + "learning_rate": 1.933485473022936e-05, + "loss": 1.0651, + "step": 2131 + }, + { + "epoch": 0.36556143772638616, + "grad_norm": 1.5703125, + "learning_rate": 1.9334206944691626e-05, + "loss": 1.0549, + "step": 2132 + }, + { + "epoch": 0.36573290181537604, + "grad_norm": 1.6796875, + "learning_rate": 1.933355885473025e-05, + "loss": 1.0231, + "step": 2133 + }, + { + "epoch": 0.3659043659043659, + "grad_norm": 1.640625, + "learning_rate": 1.933291046036637e-05, + "loss": 1.0812, + "step": 2134 + }, + { + "epoch": 0.36607582999335575, + "grad_norm": 1.6953125, + "learning_rate": 1.9332261761621138e-05, + "loss": 1.1038, + "step": 2135 + }, + { + "epoch": 0.36624729408234563, + "grad_norm": 1.6328125, + "learning_rate": 1.9331612758515706e-05, + "loss": 1.0191, + "step": 2136 + }, + { + "epoch": 0.3664187581713355, + "grad_norm": 1.640625, + "learning_rate": 1.933096345107125e-05, + "loss": 1.0076, + "step": 2137 + }, + { + "epoch": 0.36659022226032534, + "grad_norm": 1.53125, + "learning_rate": 1.933031383930893e-05, + "loss": 1.016, + "step": 2138 + }, + { + "epoch": 0.3667616863493152, + "grad_norm": 1.8984375, + "learning_rate": 1.9329663923249945e-05, + "loss": 1.0134, + "step": 2139 + }, + { + "epoch": 0.3669331504383051, + "grad_norm": 1.8046875, + "learning_rate": 1.9329013702915485e-05, + "loss": 1.2297, + "step": 2140 + }, + { + "epoch": 0.3671046145272949, + "grad_norm": 1.609375, + "learning_rate": 1.932836317832676e-05, + "loss": 0.9963, + "step": 2141 + }, + { + "epoch": 0.3672760786162848, + 
"grad_norm": 1.6328125, + "learning_rate": 1.9327712349504982e-05, + "loss": 1.0519, + "step": 2142 + }, + { + "epoch": 0.3674475427052747, + "grad_norm": 1.734375, + "learning_rate": 1.9327061216471382e-05, + "loss": 1.0634, + "step": 2143 + }, + { + "epoch": 0.3676190067942645, + "grad_norm": 1.609375, + "learning_rate": 1.932640977924719e-05, + "loss": 1.117, + "step": 2144 + }, + { + "epoch": 0.3677904708832544, + "grad_norm": 1.609375, + "learning_rate": 1.932575803785366e-05, + "loss": 1.0166, + "step": 2145 + }, + { + "epoch": 0.3679619349722443, + "grad_norm": 1.7265625, + "learning_rate": 1.932510599231204e-05, + "loss": 1.1124, + "step": 2146 + }, + { + "epoch": 0.3681333990612341, + "grad_norm": 1.6015625, + "learning_rate": 1.9324453642643598e-05, + "loss": 0.9047, + "step": 2147 + }, + { + "epoch": 0.368304863150224, + "grad_norm": 1.578125, + "learning_rate": 1.932380098886961e-05, + "loss": 1.0224, + "step": 2148 + }, + { + "epoch": 0.36847632723921386, + "grad_norm": 1.734375, + "learning_rate": 1.9323148031011363e-05, + "loss": 1.0019, + "step": 2149 + }, + { + "epoch": 0.3686477913282037, + "grad_norm": 1.671875, + "learning_rate": 1.932249476909015e-05, + "loss": 1.079, + "step": 2150 + }, + { + "epoch": 0.36881925541719357, + "grad_norm": 1.7109375, + "learning_rate": 1.9321841203127277e-05, + "loss": 1.0723, + "step": 2151 + }, + { + "epoch": 0.36899071950618345, + "grad_norm": 1.65625, + "learning_rate": 1.9321187333144064e-05, + "loss": 0.9337, + "step": 2152 + }, + { + "epoch": 0.3691621835951733, + "grad_norm": 1.6484375, + "learning_rate": 1.9320533159161824e-05, + "loss": 1.0433, + "step": 2153 + }, + { + "epoch": 0.36933364768416316, + "grad_norm": 1.6484375, + "learning_rate": 1.9319878681201905e-05, + "loss": 1.0609, + "step": 2154 + }, + { + "epoch": 0.369505111773153, + "grad_norm": 1.703125, + "learning_rate": 1.9319223899285643e-05, + "loss": 1.0004, + "step": 2155 + }, + { + "epoch": 0.36967657586214286, + "grad_norm": 1.640625, + "learning_rate": 1.93185688134344e-05, + "loss": 1.0564, + "step": 2156 + }, + { + "epoch": 0.36984803995113275, + "grad_norm": 1.8125, + "learning_rate": 1.9317913423669535e-05, + "loss": 1.1132, + "step": 2157 + }, + { + "epoch": 0.37001950404012257, + "grad_norm": 1.59375, + "learning_rate": 1.931725773001243e-05, + "loss": 0.9931, + "step": 2158 + }, + { + "epoch": 0.37019096812911245, + "grad_norm": 1.6953125, + "learning_rate": 1.931660173248446e-05, + "loss": 1.121, + "step": 2159 + }, + { + "epoch": 0.37036243221810233, + "grad_norm": 1.5546875, + "learning_rate": 1.931594543110703e-05, + "loss": 1.0856, + "step": 2160 + }, + { + "epoch": 0.37053389630709216, + "grad_norm": 1.6640625, + "learning_rate": 1.9315288825901534e-05, + "loss": 1.1187, + "step": 2161 + }, + { + "epoch": 0.37070536039608204, + "grad_norm": 1.7734375, + "learning_rate": 1.931463191688939e-05, + "loss": 1.0063, + "step": 2162 + }, + { + "epoch": 0.3708768244850719, + "grad_norm": 1.703125, + "learning_rate": 1.9313974704092028e-05, + "loss": 1.0472, + "step": 2163 + }, + { + "epoch": 0.37104828857406175, + "grad_norm": 1.65625, + "learning_rate": 1.9313317187530875e-05, + "loss": 1.1154, + "step": 2164 + }, + { + "epoch": 0.3712197526630516, + "grad_norm": 1.7109375, + "learning_rate": 1.931265936722738e-05, + "loss": 1.0989, + "step": 2165 + }, + { + "epoch": 0.3713912167520415, + "grad_norm": 1.71875, + "learning_rate": 1.931200124320299e-05, + "loss": 1.0552, + "step": 2166 + }, + { + "epoch": 0.37156268084103133, + "grad_norm": 1.6484375, + 
"learning_rate": 1.931134281547918e-05, + "loss": 1.1136, + "step": 2167 + }, + { + "epoch": 0.3717341449300212, + "grad_norm": 1.6953125, + "learning_rate": 1.9310684084077413e-05, + "loss": 0.9891, + "step": 2168 + }, + { + "epoch": 0.3719056090190111, + "grad_norm": 1.734375, + "learning_rate": 1.931002504901918e-05, + "loss": 1.0897, + "step": 2169 + }, + { + "epoch": 0.3720770731080009, + "grad_norm": 1.6484375, + "learning_rate": 1.930936571032597e-05, + "loss": 1.0432, + "step": 2170 + }, + { + "epoch": 0.3722485371969908, + "grad_norm": 1.6484375, + "learning_rate": 1.930870606801929e-05, + "loss": 0.9868, + "step": 2171 + }, + { + "epoch": 0.3724200012859807, + "grad_norm": 1.6328125, + "learning_rate": 1.9308046122120648e-05, + "loss": 1.037, + "step": 2172 + }, + { + "epoch": 0.3725914653749705, + "grad_norm": 1.59375, + "learning_rate": 1.9307385872651574e-05, + "loss": 1.0493, + "step": 2173 + }, + { + "epoch": 0.3727629294639604, + "grad_norm": 1.65625, + "learning_rate": 1.9306725319633603e-05, + "loss": 1.011, + "step": 2174 + }, + { + "epoch": 0.37293439355295027, + "grad_norm": 1.578125, + "learning_rate": 1.9306064463088267e-05, + "loss": 1.0288, + "step": 2175 + }, + { + "epoch": 0.3731058576419401, + "grad_norm": 1.671875, + "learning_rate": 1.9305403303037128e-05, + "loss": 1.0481, + "step": 2176 + }, + { + "epoch": 0.37327732173093, + "grad_norm": 1.6796875, + "learning_rate": 1.930474183950174e-05, + "loss": 0.994, + "step": 2177 + }, + { + "epoch": 0.37344878581991986, + "grad_norm": 1.6328125, + "learning_rate": 1.930408007250369e-05, + "loss": 1.0024, + "step": 2178 + }, + { + "epoch": 0.3736202499089097, + "grad_norm": 1.609375, + "learning_rate": 1.9303418002064553e-05, + "loss": 1.0486, + "step": 2179 + }, + { + "epoch": 0.37379171399789957, + "grad_norm": 1.578125, + "learning_rate": 1.9302755628205917e-05, + "loss": 1.0745, + "step": 2180 + }, + { + "epoch": 0.37396317808688945, + "grad_norm": 1.5859375, + "learning_rate": 1.9302092950949393e-05, + "loss": 1.0946, + "step": 2181 + }, + { + "epoch": 0.3741346421758793, + "grad_norm": 1.625, + "learning_rate": 1.9301429970316588e-05, + "loss": 1.0594, + "step": 2182 + }, + { + "epoch": 0.37430610626486915, + "grad_norm": 1.5234375, + "learning_rate": 1.9300766686329124e-05, + "loss": 1.0049, + "step": 2183 + }, + { + "epoch": 0.37447757035385904, + "grad_norm": 1.671875, + "learning_rate": 1.9300103099008634e-05, + "loss": 1.0796, + "step": 2184 + }, + { + "epoch": 0.37464903444284886, + "grad_norm": 1.671875, + "learning_rate": 1.9299439208376763e-05, + "loss": 1.0713, + "step": 2185 + }, + { + "epoch": 0.37482049853183874, + "grad_norm": 1.59375, + "learning_rate": 1.9298775014455163e-05, + "loss": 1.1084, + "step": 2186 + }, + { + "epoch": 0.3749919626208286, + "grad_norm": 1.65625, + "learning_rate": 1.9298110517265492e-05, + "loss": 0.9409, + "step": 2187 + }, + { + "epoch": 0.37516342670981845, + "grad_norm": 1.6015625, + "learning_rate": 1.9297445716829423e-05, + "loss": 0.9509, + "step": 2188 + }, + { + "epoch": 0.37533489079880833, + "grad_norm": 1.609375, + "learning_rate": 1.9296780613168638e-05, + "loss": 1.0959, + "step": 2189 + }, + { + "epoch": 0.3755063548877982, + "grad_norm": 1.59375, + "learning_rate": 1.929611520630483e-05, + "loss": 1.0904, + "step": 2190 + }, + { + "epoch": 0.37567781897678804, + "grad_norm": 1.6953125, + "learning_rate": 1.92954494962597e-05, + "loss": 1.0285, + "step": 2191 + }, + { + "epoch": 0.3758492830657779, + "grad_norm": 1.5703125, + "learning_rate": 
1.929478348305496e-05, + "loss": 1.0246, + "step": 2192 + }, + { + "epoch": 0.3760207471547678, + "grad_norm": 1.7578125, + "learning_rate": 1.9294117166712322e-05, + "loss": 1.1087, + "step": 2193 + }, + { + "epoch": 0.3761922112437576, + "grad_norm": 1.5859375, + "learning_rate": 1.929345054725353e-05, + "loss": 1.0125, + "step": 2194 + }, + { + "epoch": 0.3763636753327475, + "grad_norm": 1.734375, + "learning_rate": 1.929278362470032e-05, + "loss": 1.0662, + "step": 2195 + }, + { + "epoch": 0.3765351394217374, + "grad_norm": 1.6171875, + "learning_rate": 1.9292116399074442e-05, + "loss": 0.9725, + "step": 2196 + }, + { + "epoch": 0.3767066035107272, + "grad_norm": 1.59375, + "learning_rate": 1.9291448870397658e-05, + "loss": 0.9811, + "step": 2197 + }, + { + "epoch": 0.3768780675997171, + "grad_norm": 1.65625, + "learning_rate": 1.929078103869174e-05, + "loss": 1.0886, + "step": 2198 + }, + { + "epoch": 0.377049531688707, + "grad_norm": 1.578125, + "learning_rate": 1.929011290397846e-05, + "loss": 1.0572, + "step": 2199 + }, + { + "epoch": 0.3772209957776968, + "grad_norm": 1.4765625, + "learning_rate": 1.9289444466279622e-05, + "loss": 0.9523, + "step": 2200 + }, + { + "epoch": 0.3773924598666867, + "grad_norm": 1.5390625, + "learning_rate": 1.9288775725617014e-05, + "loss": 1.0021, + "step": 2201 + }, + { + "epoch": 0.3775639239556765, + "grad_norm": 1.5546875, + "learning_rate": 1.9288106682012456e-05, + "loss": 1.1431, + "step": 2202 + }, + { + "epoch": 0.3777353880446664, + "grad_norm": 1.7578125, + "learning_rate": 1.928743733548776e-05, + "loss": 1.1734, + "step": 2203 + }, + { + "epoch": 0.37790685213365627, + "grad_norm": 1.625, + "learning_rate": 1.9286767686064757e-05, + "loss": 1.0438, + "step": 2204 + }, + { + "epoch": 0.3780783162226461, + "grad_norm": 1.671875, + "learning_rate": 1.9286097733765296e-05, + "loss": 1.0226, + "step": 2205 + }, + { + "epoch": 0.378249780311636, + "grad_norm": 1.7734375, + "learning_rate": 1.9285427478611214e-05, + "loss": 1.0586, + "step": 2206 + }, + { + "epoch": 0.37842124440062586, + "grad_norm": 1.609375, + "learning_rate": 1.9284756920624378e-05, + "loss": 1.0845, + "step": 2207 + }, + { + "epoch": 0.3785927084896157, + "grad_norm": 1.578125, + "learning_rate": 1.9284086059826654e-05, + "loss": 1.048, + "step": 2208 + }, + { + "epoch": 0.37876417257860556, + "grad_norm": 2.296875, + "learning_rate": 1.9283414896239924e-05, + "loss": 0.9909, + "step": 2209 + }, + { + "epoch": 0.37893563666759544, + "grad_norm": 1.578125, + "learning_rate": 1.928274342988608e-05, + "loss": 1.0743, + "step": 2210 + }, + { + "epoch": 0.37910710075658527, + "grad_norm": 2.671875, + "learning_rate": 1.928207166078701e-05, + "loss": 1.0515, + "step": 2211 + }, + { + "epoch": 0.37927856484557515, + "grad_norm": 1.5625, + "learning_rate": 1.9281399588964636e-05, + "loss": 0.998, + "step": 2212 + }, + { + "epoch": 0.37945002893456503, + "grad_norm": 1.6640625, + "learning_rate": 1.9280727214440865e-05, + "loss": 1.0134, + "step": 2213 + }, + { + "epoch": 0.37962149302355486, + "grad_norm": 1.640625, + "learning_rate": 1.928005453723764e-05, + "loss": 0.9262, + "step": 2214 + }, + { + "epoch": 0.37979295711254474, + "grad_norm": 1.7265625, + "learning_rate": 1.9279381557376883e-05, + "loss": 1.1135, + "step": 2215 + }, + { + "epoch": 0.3799644212015346, + "grad_norm": 1.59375, + "learning_rate": 1.9278708274880556e-05, + "loss": 1.1122, + "step": 2216 + }, + { + "epoch": 0.38013588529052444, + "grad_norm": 1.53125, + "learning_rate": 1.927803468977061e-05, + 
"loss": 1.0032, + "step": 2217 + }, + { + "epoch": 0.3803073493795143, + "grad_norm": 1.6015625, + "learning_rate": 1.9277360802069013e-05, + "loss": 1.0734, + "step": 2218 + }, + { + "epoch": 0.3804788134685042, + "grad_norm": 1.6328125, + "learning_rate": 1.927668661179775e-05, + "loss": 1.0122, + "step": 2219 + }, + { + "epoch": 0.38065027755749403, + "grad_norm": 1.5, + "learning_rate": 1.9276012118978794e-05, + "loss": 0.9827, + "step": 2220 + }, + { + "epoch": 0.3808217416464839, + "grad_norm": 1.6640625, + "learning_rate": 1.927533732363416e-05, + "loss": 1.1203, + "step": 2221 + }, + { + "epoch": 0.3809932057354738, + "grad_norm": 1.6171875, + "learning_rate": 1.927466222578585e-05, + "loss": 1.0789, + "step": 2222 + }, + { + "epoch": 0.3811646698244636, + "grad_norm": 1.65625, + "learning_rate": 1.9273986825455874e-05, + "loss": 1.067, + "step": 2223 + }, + { + "epoch": 0.3813361339134535, + "grad_norm": 1.6171875, + "learning_rate": 1.927331112266627e-05, + "loss": 1.0309, + "step": 2224 + }, + { + "epoch": 0.3815075980024434, + "grad_norm": 1.5625, + "learning_rate": 1.9272635117439066e-05, + "loss": 1.0483, + "step": 2225 + }, + { + "epoch": 0.3816790620914332, + "grad_norm": 1.5078125, + "learning_rate": 1.9271958809796315e-05, + "loss": 1.0048, + "step": 2226 + }, + { + "epoch": 0.3818505261804231, + "grad_norm": 1.625, + "learning_rate": 1.9271282199760076e-05, + "loss": 1.0654, + "step": 2227 + }, + { + "epoch": 0.38202199026941297, + "grad_norm": 1.5546875, + "learning_rate": 1.9270605287352406e-05, + "loss": 1.0612, + "step": 2228 + }, + { + "epoch": 0.3821934543584028, + "grad_norm": 1.6015625, + "learning_rate": 1.9269928072595392e-05, + "loss": 1.0226, + "step": 2229 + }, + { + "epoch": 0.3823649184473927, + "grad_norm": 1.5703125, + "learning_rate": 1.926925055551111e-05, + "loss": 0.9787, + "step": 2230 + }, + { + "epoch": 0.38253638253638256, + "grad_norm": 1.53125, + "learning_rate": 1.9268572736121668e-05, + "loss": 1.0347, + "step": 2231 + }, + { + "epoch": 0.3827078466253724, + "grad_norm": 1.6015625, + "learning_rate": 1.9267894614449168e-05, + "loss": 1.1223, + "step": 2232 + }, + { + "epoch": 0.38287931071436226, + "grad_norm": 1.6015625, + "learning_rate": 1.9267216190515726e-05, + "loss": 1.0425, + "step": 2233 + }, + { + "epoch": 0.38305077480335215, + "grad_norm": 1.6484375, + "learning_rate": 1.926653746434346e-05, + "loss": 1.0454, + "step": 2234 + }, + { + "epoch": 0.38322223889234197, + "grad_norm": 1.6484375, + "learning_rate": 1.9265858435954515e-05, + "loss": 0.9754, + "step": 2235 + }, + { + "epoch": 0.38339370298133185, + "grad_norm": 1.75, + "learning_rate": 1.926517910537104e-05, + "loss": 1.148, + "step": 2236 + }, + { + "epoch": 0.38356516707032173, + "grad_norm": 1.7421875, + "learning_rate": 1.926449947261518e-05, + "loss": 1.041, + "step": 2237 + }, + { + "epoch": 0.38373663115931156, + "grad_norm": 1.703125, + "learning_rate": 1.9263819537709104e-05, + "loss": 1.0121, + "step": 2238 + }, + { + "epoch": 0.38390809524830144, + "grad_norm": 1.703125, + "learning_rate": 1.9263139300674994e-05, + "loss": 1.0594, + "step": 2239 + }, + { + "epoch": 0.3840795593372913, + "grad_norm": 1.7265625, + "learning_rate": 1.9262458761535026e-05, + "loss": 1.0862, + "step": 2240 + }, + { + "epoch": 0.38425102342628115, + "grad_norm": 1.546875, + "learning_rate": 1.92617779203114e-05, + "loss": 1.0087, + "step": 2241 + }, + { + "epoch": 0.384422487515271, + "grad_norm": 1.734375, + "learning_rate": 1.9261096777026318e-05, + "loss": 1.0964, + "step": 2242 + 
}, + { + "epoch": 0.3845939516042609, + "grad_norm": 1.546875, + "learning_rate": 1.9260415331701996e-05, + "loss": 0.9698, + "step": 2243 + }, + { + "epoch": 0.38476541569325073, + "grad_norm": 1.609375, + "learning_rate": 1.925973358436066e-05, + "loss": 0.9941, + "step": 2244 + }, + { + "epoch": 0.3849368797822406, + "grad_norm": 1.6484375, + "learning_rate": 1.9259051535024542e-05, + "loss": 1.0449, + "step": 2245 + }, + { + "epoch": 0.3851083438712305, + "grad_norm": 1.71875, + "learning_rate": 1.9258369183715887e-05, + "loss": 1.1611, + "step": 2246 + }, + { + "epoch": 0.3852798079602203, + "grad_norm": 1.6171875, + "learning_rate": 1.9257686530456947e-05, + "loss": 1.0714, + "step": 2247 + }, + { + "epoch": 0.3854512720492102, + "grad_norm": 1.5234375, + "learning_rate": 1.9257003575269992e-05, + "loss": 0.9581, + "step": 2248 + }, + { + "epoch": 0.38562273613820003, + "grad_norm": 1.578125, + "learning_rate": 1.925632031817729e-05, + "loss": 1.0036, + "step": 2249 + }, + { + "epoch": 0.3857942002271899, + "grad_norm": 1.5859375, + "learning_rate": 1.9255636759201127e-05, + "loss": 1.0602, + "step": 2250 + }, + { + "epoch": 0.3859656643161798, + "grad_norm": 1.703125, + "learning_rate": 1.925495289836379e-05, + "loss": 1.1031, + "step": 2251 + }, + { + "epoch": 0.3861371284051696, + "grad_norm": 1.65625, + "learning_rate": 1.9254268735687595e-05, + "loss": 1.1401, + "step": 2252 + }, + { + "epoch": 0.3863085924941595, + "grad_norm": 1.59375, + "learning_rate": 1.9253584271194848e-05, + "loss": 1.0456, + "step": 2253 + }, + { + "epoch": 0.3864800565831494, + "grad_norm": 1.6875, + "learning_rate": 1.925289950490787e-05, + "loss": 1.1156, + "step": 2254 + }, + { + "epoch": 0.3866515206721392, + "grad_norm": 1.75, + "learning_rate": 1.9252214436848996e-05, + "loss": 1.0134, + "step": 2255 + }, + { + "epoch": 0.3868229847611291, + "grad_norm": 1.5859375, + "learning_rate": 1.925152906704057e-05, + "loss": 1.0402, + "step": 2256 + }, + { + "epoch": 0.38699444885011897, + "grad_norm": 1.6953125, + "learning_rate": 1.925084339550494e-05, + "loss": 1.0452, + "step": 2257 + }, + { + "epoch": 0.3871659129391088, + "grad_norm": 1.671875, + "learning_rate": 1.9250157422264477e-05, + "loss": 1.0607, + "step": 2258 + }, + { + "epoch": 0.3873373770280987, + "grad_norm": 1.578125, + "learning_rate": 1.9249471147341544e-05, + "loss": 1.0349, + "step": 2259 + }, + { + "epoch": 0.38750884111708855, + "grad_norm": 1.6640625, + "learning_rate": 1.9248784570758526e-05, + "loss": 0.984, + "step": 2260 + }, + { + "epoch": 0.3876803052060784, + "grad_norm": 1.671875, + "learning_rate": 1.9248097692537815e-05, + "loss": 1.0693, + "step": 2261 + }, + { + "epoch": 0.38785176929506826, + "grad_norm": 1.65625, + "learning_rate": 1.9247410512701815e-05, + "loss": 1.0794, + "step": 2262 + }, + { + "epoch": 0.38802323338405814, + "grad_norm": 1.578125, + "learning_rate": 1.9246723031272934e-05, + "loss": 0.9385, + "step": 2263 + }, + { + "epoch": 0.38819469747304797, + "grad_norm": 1.578125, + "learning_rate": 1.9246035248273598e-05, + "loss": 1.04, + "step": 2264 + }, + { + "epoch": 0.38836616156203785, + "grad_norm": 1.703125, + "learning_rate": 1.9245347163726233e-05, + "loss": 1.0896, + "step": 2265 + }, + { + "epoch": 0.38853762565102773, + "grad_norm": 1.625, + "learning_rate": 1.9244658777653282e-05, + "loss": 1.0744, + "step": 2266 + }, + { + "epoch": 0.38870908974001755, + "grad_norm": 1.640625, + "learning_rate": 1.9243970090077196e-05, + "loss": 1.0505, + "step": 2267 + }, + { + "epoch": 
0.38888055382900744, + "grad_norm": 1.5625, + "learning_rate": 1.9243281101020436e-05, + "loss": 1.0416, + "step": 2268 + }, + { + "epoch": 0.3890520179179973, + "grad_norm": 1.609375, + "learning_rate": 1.9242591810505474e-05, + "loss": 0.9873, + "step": 2269 + }, + { + "epoch": 0.38922348200698714, + "grad_norm": 1.6875, + "learning_rate": 1.9241902218554787e-05, + "loss": 1.0796, + "step": 2270 + }, + { + "epoch": 0.389394946095977, + "grad_norm": 1.6640625, + "learning_rate": 1.9241212325190867e-05, + "loss": 1.1094, + "step": 2271 + }, + { + "epoch": 0.3895664101849669, + "grad_norm": 1.6484375, + "learning_rate": 1.9240522130436213e-05, + "loss": 1.0207, + "step": 2272 + }, + { + "epoch": 0.38973787427395673, + "grad_norm": 1.734375, + "learning_rate": 1.9239831634313338e-05, + "loss": 0.9764, + "step": 2273 + }, + { + "epoch": 0.3899093383629466, + "grad_norm": 1.6484375, + "learning_rate": 1.923914083684476e-05, + "loss": 1.0445, + "step": 2274 + }, + { + "epoch": 0.3900808024519365, + "grad_norm": 1.6171875, + "learning_rate": 1.9238449738053003e-05, + "loss": 1.0329, + "step": 2275 + }, + { + "epoch": 0.3902522665409263, + "grad_norm": 1.671875, + "learning_rate": 1.9237758337960616e-05, + "loss": 1.0006, + "step": 2276 + }, + { + "epoch": 0.3904237306299162, + "grad_norm": 1.5625, + "learning_rate": 1.9237066636590142e-05, + "loss": 0.9341, + "step": 2277 + }, + { + "epoch": 0.3905951947189061, + "grad_norm": 1.6875, + "learning_rate": 1.923637463396414e-05, + "loss": 1.0451, + "step": 2278 + }, + { + "epoch": 0.3907666588078959, + "grad_norm": 1.578125, + "learning_rate": 1.9235682330105182e-05, + "loss": 1.0136, + "step": 2279 + }, + { + "epoch": 0.3909381228968858, + "grad_norm": 1.703125, + "learning_rate": 1.9234989725035843e-05, + "loss": 1.0312, + "step": 2280 + }, + { + "epoch": 0.39110958698587567, + "grad_norm": 1.6640625, + "learning_rate": 1.923429681877872e-05, + "loss": 0.9722, + "step": 2281 + }, + { + "epoch": 0.3912810510748655, + "grad_norm": 1.65625, + "learning_rate": 1.9233603611356394e-05, + "loss": 1.036, + "step": 2282 + }, + { + "epoch": 0.3914525151638554, + "grad_norm": 1.671875, + "learning_rate": 1.923291010279149e-05, + "loss": 1.0469, + "step": 2283 + }, + { + "epoch": 0.39162397925284526, + "grad_norm": 1.5703125, + "learning_rate": 1.923221629310662e-05, + "loss": 1.0114, + "step": 2284 + }, + { + "epoch": 0.3917954433418351, + "grad_norm": 1.6328125, + "learning_rate": 1.923152218232441e-05, + "loss": 0.9944, + "step": 2285 + }, + { + "epoch": 0.39196690743082496, + "grad_norm": 1.65625, + "learning_rate": 1.9230827770467497e-05, + "loss": 0.985, + "step": 2286 + }, + { + "epoch": 0.39213837151981484, + "grad_norm": 1.6015625, + "learning_rate": 1.9230133057558533e-05, + "loss": 0.9982, + "step": 2287 + }, + { + "epoch": 0.39230983560880467, + "grad_norm": 1.671875, + "learning_rate": 1.922943804362017e-05, + "loss": 1.0644, + "step": 2288 + }, + { + "epoch": 0.39248129969779455, + "grad_norm": 1.6015625, + "learning_rate": 1.922874272867508e-05, + "loss": 1.0168, + "step": 2289 + }, + { + "epoch": 0.39265276378678443, + "grad_norm": 1.5703125, + "learning_rate": 1.9228047112745938e-05, + "loss": 1.0118, + "step": 2290 + }, + { + "epoch": 0.39282422787577426, + "grad_norm": 1.53125, + "learning_rate": 1.9227351195855428e-05, + "loss": 1.0358, + "step": 2291 + }, + { + "epoch": 0.39299569196476414, + "grad_norm": 1.546875, + "learning_rate": 1.9226654978026248e-05, + "loss": 0.9959, + "step": 2292 + }, + { + "epoch": 0.393167156053754, + 
"grad_norm": 1.578125, + "learning_rate": 1.9225958459281105e-05, + "loss": 1.0322, + "step": 2293 + }, + { + "epoch": 0.39333862014274384, + "grad_norm": 1.5703125, + "learning_rate": 1.9225261639642717e-05, + "loss": 0.9162, + "step": 2294 + }, + { + "epoch": 0.3935100842317337, + "grad_norm": 1.546875, + "learning_rate": 1.922456451913381e-05, + "loss": 0.9757, + "step": 2295 + }, + { + "epoch": 0.39368154832072355, + "grad_norm": 1.6953125, + "learning_rate": 1.9223867097777113e-05, + "loss": 1.0772, + "step": 2296 + }, + { + "epoch": 0.39385301240971343, + "grad_norm": 1.5546875, + "learning_rate": 1.9223169375595376e-05, + "loss": 0.9295, + "step": 2297 + }, + { + "epoch": 0.3940244764987033, + "grad_norm": 1.6171875, + "learning_rate": 1.9222471352611357e-05, + "loss": 1.0406, + "step": 2298 + }, + { + "epoch": 0.39419594058769314, + "grad_norm": 1.578125, + "learning_rate": 1.9221773028847817e-05, + "loss": 0.9843, + "step": 2299 + }, + { + "epoch": 0.394367404676683, + "grad_norm": 1.5859375, + "learning_rate": 1.9221074404327532e-05, + "loss": 1.0198, + "step": 2300 + }, + { + "epoch": 0.3945388687656729, + "grad_norm": 1.5703125, + "learning_rate": 1.922037547907329e-05, + "loss": 1.0301, + "step": 2301 + }, + { + "epoch": 0.3947103328546627, + "grad_norm": 1.9375, + "learning_rate": 1.921967625310788e-05, + "loss": 0.9537, + "step": 2302 + }, + { + "epoch": 0.3948817969436526, + "grad_norm": 1.5546875, + "learning_rate": 1.9218976726454112e-05, + "loss": 0.9385, + "step": 2303 + }, + { + "epoch": 0.3950532610326425, + "grad_norm": 1.59375, + "learning_rate": 1.9218276899134794e-05, + "loss": 1.0448, + "step": 2304 + }, + { + "epoch": 0.3952247251216323, + "grad_norm": 8.9375, + "learning_rate": 1.9217576771172756e-05, + "loss": 1.1329, + "step": 2305 + }, + { + "epoch": 0.3953961892106222, + "grad_norm": 1.59375, + "learning_rate": 1.9216876342590827e-05, + "loss": 0.9946, + "step": 2306 + }, + { + "epoch": 0.3955676532996121, + "grad_norm": 1.609375, + "learning_rate": 1.9216175613411857e-05, + "loss": 1.1019, + "step": 2307 + }, + { + "epoch": 0.3957391173886019, + "grad_norm": 1.59375, + "learning_rate": 1.9215474583658693e-05, + "loss": 0.9478, + "step": 2308 + }, + { + "epoch": 0.3959105814775918, + "grad_norm": 1.6875, + "learning_rate": 1.92147732533542e-05, + "loss": 1.1428, + "step": 2309 + }, + { + "epoch": 0.39608204556658166, + "grad_norm": 1.5390625, + "learning_rate": 1.921407162252125e-05, + "loss": 1.1084, + "step": 2310 + }, + { + "epoch": 0.3962535096555715, + "grad_norm": 1.6796875, + "learning_rate": 1.921336969118273e-05, + "loss": 1.0258, + "step": 2311 + }, + { + "epoch": 0.39642497374456137, + "grad_norm": 1.53125, + "learning_rate": 1.9212667459361532e-05, + "loss": 0.9763, + "step": 2312 + }, + { + "epoch": 0.39659643783355125, + "grad_norm": 1.6015625, + "learning_rate": 1.9211964927080552e-05, + "loss": 1.0425, + "step": 2313 + }, + { + "epoch": 0.3967679019225411, + "grad_norm": 1.640625, + "learning_rate": 1.921126209436271e-05, + "loss": 1.0274, + "step": 2314 + }, + { + "epoch": 0.39693936601153096, + "grad_norm": 1.640625, + "learning_rate": 1.9210558961230922e-05, + "loss": 0.9467, + "step": 2315 + }, + { + "epoch": 0.39711083010052084, + "grad_norm": 1.65625, + "learning_rate": 1.9209855527708127e-05, + "loss": 1.0797, + "step": 2316 + }, + { + "epoch": 0.39728229418951067, + "grad_norm": 1.625, + "learning_rate": 1.9209151793817257e-05, + "loss": 1.0166, + "step": 2317 + }, + { + "epoch": 0.39745375827850055, + "grad_norm": 1.6796875, + 
"learning_rate": 1.920844775958127e-05, + "loss": 1.0113, + "step": 2318 + }, + { + "epoch": 0.3976252223674904, + "grad_norm": 1.609375, + "learning_rate": 1.9207743425023125e-05, + "loss": 1.0353, + "step": 2319 + }, + { + "epoch": 0.39779668645648025, + "grad_norm": 1.6015625, + "learning_rate": 1.9207038790165796e-05, + "loss": 1.0908, + "step": 2320 + }, + { + "epoch": 0.39796815054547013, + "grad_norm": 1.625, + "learning_rate": 1.9206333855032262e-05, + "loss": 0.9886, + "step": 2321 + }, + { + "epoch": 0.39813961463446, + "grad_norm": 1.6015625, + "learning_rate": 1.9205628619645514e-05, + "loss": 1.0451, + "step": 2322 + }, + { + "epoch": 0.39831107872344984, + "grad_norm": 1.7421875, + "learning_rate": 1.9204923084028548e-05, + "loss": 1.0668, + "step": 2323 + }, + { + "epoch": 0.3984825428124397, + "grad_norm": 1.7578125, + "learning_rate": 1.920421724820438e-05, + "loss": 1.1112, + "step": 2324 + }, + { + "epoch": 0.3986540069014296, + "grad_norm": 1.59375, + "learning_rate": 1.9203511112196026e-05, + "loss": 1.0489, + "step": 2325 + }, + { + "epoch": 0.39882547099041943, + "grad_norm": 1.5390625, + "learning_rate": 1.920280467602652e-05, + "loss": 0.9801, + "step": 2326 + }, + { + "epoch": 0.3989969350794093, + "grad_norm": 1.6875, + "learning_rate": 1.9202097939718896e-05, + "loss": 1.1454, + "step": 2327 + }, + { + "epoch": 0.3991683991683992, + "grad_norm": 1.59375, + "learning_rate": 1.9201390903296208e-05, + "loss": 0.9652, + "step": 2328 + }, + { + "epoch": 0.399339863257389, + "grad_norm": 1.53125, + "learning_rate": 1.9200683566781513e-05, + "loss": 0.9748, + "step": 2329 + }, + { + "epoch": 0.3995113273463789, + "grad_norm": 1.578125, + "learning_rate": 1.919997593019788e-05, + "loss": 1.0532, + "step": 2330 + }, + { + "epoch": 0.3996827914353688, + "grad_norm": 1.5859375, + "learning_rate": 1.9199267993568387e-05, + "loss": 0.9792, + "step": 2331 + }, + { + "epoch": 0.3998542555243586, + "grad_norm": 1.6328125, + "learning_rate": 1.9198559756916123e-05, + "loss": 1.0428, + "step": 2332 + }, + { + "epoch": 0.4000257196133485, + "grad_norm": 1.703125, + "learning_rate": 1.919785122026419e-05, + "loss": 1.038, + "step": 2333 + }, + { + "epoch": 0.40019718370233837, + "grad_norm": 1.625, + "learning_rate": 1.919714238363569e-05, + "loss": 1.0486, + "step": 2334 + }, + { + "epoch": 0.4003686477913282, + "grad_norm": 1.7578125, + "learning_rate": 1.9196433247053746e-05, + "loss": 1.0839, + "step": 2335 + }, + { + "epoch": 0.4005401118803181, + "grad_norm": 1.75, + "learning_rate": 1.919572381054148e-05, + "loss": 1.1026, + "step": 2336 + }, + { + "epoch": 0.40071157596930795, + "grad_norm": 1.546875, + "learning_rate": 1.9195014074122037e-05, + "loss": 1.0809, + "step": 2337 + }, + { + "epoch": 0.4008830400582978, + "grad_norm": 1.703125, + "learning_rate": 1.9194304037818555e-05, + "loss": 1.1521, + "step": 2338 + }, + { + "epoch": 0.40105450414728766, + "grad_norm": 1.5859375, + "learning_rate": 1.9193593701654202e-05, + "loss": 1.0516, + "step": 2339 + }, + { + "epoch": 0.40122596823627754, + "grad_norm": 1.5546875, + "learning_rate": 1.9192883065652132e-05, + "loss": 1.058, + "step": 2340 + }, + { + "epoch": 0.40139743232526737, + "grad_norm": 1.578125, + "learning_rate": 1.9192172129835533e-05, + "loss": 1.053, + "step": 2341 + }, + { + "epoch": 0.40156889641425725, + "grad_norm": 1.59375, + "learning_rate": 1.919146089422758e-05, + "loss": 1.0059, + "step": 2342 + }, + { + "epoch": 0.4017403605032471, + "grad_norm": 1.546875, + "learning_rate": 
1.9190749358851485e-05, + "loss": 1.0745, + "step": 2343 + }, + { + "epoch": 0.40191182459223695, + "grad_norm": 1.59375, + "learning_rate": 1.9190037523730435e-05, + "loss": 0.991, + "step": 2344 + }, + { + "epoch": 0.40208328868122684, + "grad_norm": 1.6875, + "learning_rate": 1.918932538888766e-05, + "loss": 0.9776, + "step": 2345 + }, + { + "epoch": 0.40225475277021666, + "grad_norm": 1.6796875, + "learning_rate": 1.918861295434638e-05, + "loss": 1.0356, + "step": 2346 + }, + { + "epoch": 0.40242621685920654, + "grad_norm": 1.6328125, + "learning_rate": 1.918790022012983e-05, + "loss": 0.9826, + "step": 2347 + }, + { + "epoch": 0.4025976809481964, + "grad_norm": 1.59375, + "learning_rate": 1.9187187186261254e-05, + "loss": 1.0368, + "step": 2348 + }, + { + "epoch": 0.40276914503718625, + "grad_norm": 1.625, + "learning_rate": 1.918647385276391e-05, + "loss": 1.0812, + "step": 2349 + }, + { + "epoch": 0.40294060912617613, + "grad_norm": 1.6875, + "learning_rate": 1.9185760219661057e-05, + "loss": 1.0314, + "step": 2350 + }, + { + "epoch": 0.403112073215166, + "grad_norm": 1.5390625, + "learning_rate": 1.9185046286975978e-05, + "loss": 1.0582, + "step": 2351 + }, + { + "epoch": 0.40328353730415584, + "grad_norm": 1.703125, + "learning_rate": 1.9184332054731948e-05, + "loss": 1.0616, + "step": 2352 + }, + { + "epoch": 0.4034550013931457, + "grad_norm": 1.546875, + "learning_rate": 1.9183617522952267e-05, + "loss": 1.0125, + "step": 2353 + }, + { + "epoch": 0.4036264654821356, + "grad_norm": 1.7578125, + "learning_rate": 1.9182902691660235e-05, + "loss": 1.0078, + "step": 2354 + }, + { + "epoch": 0.4037979295711254, + "grad_norm": 1.671875, + "learning_rate": 1.9182187560879165e-05, + "loss": 1.0332, + "step": 2355 + }, + { + "epoch": 0.4039693936601153, + "grad_norm": 1.7109375, + "learning_rate": 1.9181472130632385e-05, + "loss": 1.0976, + "step": 2356 + }, + { + "epoch": 0.4041408577491052, + "grad_norm": 1.6796875, + "learning_rate": 1.9180756400943222e-05, + "loss": 1.0831, + "step": 2357 + }, + { + "epoch": 0.404312321838095, + "grad_norm": 1.5390625, + "learning_rate": 1.9180040371835025e-05, + "loss": 1.0052, + "step": 2358 + }, + { + "epoch": 0.4044837859270849, + "grad_norm": 1.6640625, + "learning_rate": 1.917932404333114e-05, + "loss": 1.1313, + "step": 2359 + }, + { + "epoch": 0.4046552500160748, + "grad_norm": 1.65625, + "learning_rate": 1.917860741545493e-05, + "loss": 1.087, + "step": 2360 + }, + { + "epoch": 0.4048267141050646, + "grad_norm": 1.6328125, + "learning_rate": 1.9177890488229775e-05, + "loss": 1.1294, + "step": 2361 + }, + { + "epoch": 0.4049981781940545, + "grad_norm": 1.5, + "learning_rate": 1.9177173261679045e-05, + "loss": 0.8989, + "step": 2362 + }, + { + "epoch": 0.40516964228304436, + "grad_norm": 1.5859375, + "learning_rate": 1.9176455735826136e-05, + "loss": 1.0715, + "step": 2363 + }, + { + "epoch": 0.4053411063720342, + "grad_norm": 1.7109375, + "learning_rate": 1.917573791069445e-05, + "loss": 1.042, + "step": 2364 + }, + { + "epoch": 0.40551257046102407, + "grad_norm": 1.640625, + "learning_rate": 1.9175019786307404e-05, + "loss": 1.0684, + "step": 2365 + }, + { + "epoch": 0.40568403455001395, + "grad_norm": 1.6875, + "learning_rate": 1.9174301362688408e-05, + "loss": 1.0898, + "step": 2366 + }, + { + "epoch": 0.4058554986390038, + "grad_norm": 1.6171875, + "learning_rate": 1.9173582639860895e-05, + "loss": 1.0967, + "step": 2367 + }, + { + "epoch": 0.40602696272799366, + "grad_norm": 1.7109375, + "learning_rate": 1.9172863617848315e-05, + "loss": 
1.0429, + "step": 2368 + }, + { + "epoch": 0.40619842681698354, + "grad_norm": 1.6796875, + "learning_rate": 1.9172144296674106e-05, + "loss": 1.0442, + "step": 2369 + }, + { + "epoch": 0.40636989090597336, + "grad_norm": 1.625, + "learning_rate": 1.917142467636173e-05, + "loss": 1.1039, + "step": 2370 + }, + { + "epoch": 0.40654135499496324, + "grad_norm": 1.625, + "learning_rate": 1.9170704756934663e-05, + "loss": 0.9351, + "step": 2371 + }, + { + "epoch": 0.4067128190839531, + "grad_norm": 1.6640625, + "learning_rate": 1.9169984538416382e-05, + "loss": 1.0478, + "step": 2372 + }, + { + "epoch": 0.40688428317294295, + "grad_norm": 1.53125, + "learning_rate": 1.916926402083037e-05, + "loss": 1.0912, + "step": 2373 + }, + { + "epoch": 0.40705574726193283, + "grad_norm": 1.71875, + "learning_rate": 1.916854320420013e-05, + "loss": 1.0144, + "step": 2374 + }, + { + "epoch": 0.4072272113509227, + "grad_norm": 1.640625, + "learning_rate": 1.9167822088549177e-05, + "loss": 1.1344, + "step": 2375 + }, + { + "epoch": 0.40739867543991254, + "grad_norm": 1.546875, + "learning_rate": 1.9167100673901014e-05, + "loss": 0.9543, + "step": 2376 + }, + { + "epoch": 0.4075701395289024, + "grad_norm": 1.6484375, + "learning_rate": 1.9166378960279185e-05, + "loss": 1.0708, + "step": 2377 + }, + { + "epoch": 0.4077416036178923, + "grad_norm": 1.609375, + "learning_rate": 1.9165656947707216e-05, + "loss": 1.021, + "step": 2378 + }, + { + "epoch": 0.4079130677068821, + "grad_norm": 1.640625, + "learning_rate": 1.916493463620866e-05, + "loss": 0.9727, + "step": 2379 + }, + { + "epoch": 0.408084531795872, + "grad_norm": 1.6796875, + "learning_rate": 1.9164212025807073e-05, + "loss": 1.0842, + "step": 2380 + }, + { + "epoch": 0.4082559958848619, + "grad_norm": 1.703125, + "learning_rate": 1.9163489116526025e-05, + "loss": 1.0958, + "step": 2381 + }, + { + "epoch": 0.4084274599738517, + "grad_norm": 1.5390625, + "learning_rate": 1.916276590838909e-05, + "loss": 1.0029, + "step": 2382 + }, + { + "epoch": 0.4085989240628416, + "grad_norm": 1.8515625, + "learning_rate": 1.9162042401419853e-05, + "loss": 1.0555, + "step": 2383 + }, + { + "epoch": 0.4087703881518315, + "grad_norm": 1.5703125, + "learning_rate": 1.9161318595641915e-05, + "loss": 1.0398, + "step": 2384 + }, + { + "epoch": 0.4089418522408213, + "grad_norm": 1.765625, + "learning_rate": 1.9160594491078875e-05, + "loss": 1.0446, + "step": 2385 + }, + { + "epoch": 0.4091133163298112, + "grad_norm": 1.625, + "learning_rate": 1.915987008775436e-05, + "loss": 1.0675, + "step": 2386 + }, + { + "epoch": 0.40928478041880106, + "grad_norm": 1.625, + "learning_rate": 1.915914538569198e-05, + "loss": 1.1024, + "step": 2387 + }, + { + "epoch": 0.4094562445077909, + "grad_norm": 1.625, + "learning_rate": 1.9158420384915387e-05, + "loss": 1.106, + "step": 2388 + }, + { + "epoch": 0.40962770859678077, + "grad_norm": 1.6015625, + "learning_rate": 1.915769508544821e-05, + "loss": 1.0341, + "step": 2389 + }, + { + "epoch": 0.4097991726857706, + "grad_norm": 1.6875, + "learning_rate": 1.915696948731412e-05, + "loss": 1.062, + "step": 2390 + }, + { + "epoch": 0.4099706367747605, + "grad_norm": 1.6015625, + "learning_rate": 1.9156243590536764e-05, + "loss": 1.1496, + "step": 2391 + }, + { + "epoch": 0.41014210086375036, + "grad_norm": 1.640625, + "learning_rate": 1.9155517395139833e-05, + "loss": 1.0598, + "step": 2392 + }, + { + "epoch": 0.4103135649527402, + "grad_norm": 1.65625, + "learning_rate": 1.9154790901147e-05, + "loss": 0.9867, + "step": 2393 + }, + { + "epoch": 
0.41048502904173007, + "grad_norm": 1.6015625, + "learning_rate": 1.915406410858196e-05, + "loss": 1.0782, + "step": 2394 + }, + { + "epoch": 0.41065649313071995, + "grad_norm": 1.65625, + "learning_rate": 1.9153337017468424e-05, + "loss": 1.0514, + "step": 2395 + }, + { + "epoch": 0.41082795721970977, + "grad_norm": 1.6484375, + "learning_rate": 1.9152609627830095e-05, + "loss": 1.0171, + "step": 2396 + }, + { + "epoch": 0.41099942130869965, + "grad_norm": 1.6484375, + "learning_rate": 1.91518819396907e-05, + "loss": 1.0764, + "step": 2397 + }, + { + "epoch": 0.41117088539768953, + "grad_norm": 1.8984375, + "learning_rate": 1.9151153953073976e-05, + "loss": 1.0801, + "step": 2398 + }, + { + "epoch": 0.41134234948667936, + "grad_norm": 1.5625, + "learning_rate": 1.9150425668003657e-05, + "loss": 1.0627, + "step": 2399 + }, + { + "epoch": 0.41151381357566924, + "grad_norm": 1.59375, + "learning_rate": 1.9149697084503502e-05, + "loss": 1.0157, + "step": 2400 + }, + { + "epoch": 0.4116852776646591, + "grad_norm": 1.53125, + "learning_rate": 1.9148968202597272e-05, + "loss": 0.9235, + "step": 2401 + }, + { + "epoch": 0.41185674175364895, + "grad_norm": 1.6171875, + "learning_rate": 1.9148239022308734e-05, + "loss": 1.0204, + "step": 2402 + }, + { + "epoch": 0.41202820584263883, + "grad_norm": 1.671875, + "learning_rate": 1.9147509543661677e-05, + "loss": 1.0096, + "step": 2403 + }, + { + "epoch": 0.4121996699316287, + "grad_norm": 1.703125, + "learning_rate": 1.9146779766679885e-05, + "loss": 1.0784, + "step": 2404 + }, + { + "epoch": 0.41237113402061853, + "grad_norm": 1.6953125, + "learning_rate": 1.9146049691387157e-05, + "loss": 0.9982, + "step": 2405 + }, + { + "epoch": 0.4125425981096084, + "grad_norm": 1.609375, + "learning_rate": 1.9145319317807314e-05, + "loss": 1.0323, + "step": 2406 + }, + { + "epoch": 0.4127140621985983, + "grad_norm": 1.6875, + "learning_rate": 1.9144588645964165e-05, + "loss": 1.0299, + "step": 2407 + }, + { + "epoch": 0.4128855262875881, + "grad_norm": 1.6484375, + "learning_rate": 1.914385767588155e-05, + "loss": 1.052, + "step": 2408 + }, + { + "epoch": 0.413056990376578, + "grad_norm": 1.7109375, + "learning_rate": 1.91431264075833e-05, + "loss": 1.0281, + "step": 2409 + }, + { + "epoch": 0.4132284544655679, + "grad_norm": 1.7109375, + "learning_rate": 1.914239484109327e-05, + "loss": 1.0676, + "step": 2410 + }, + { + "epoch": 0.4133999185545577, + "grad_norm": 1.640625, + "learning_rate": 1.9141662976435315e-05, + "loss": 1.1174, + "step": 2411 + }, + { + "epoch": 0.4135713826435476, + "grad_norm": 1.6640625, + "learning_rate": 1.9140930813633307e-05, + "loss": 1.1627, + "step": 2412 + }, + { + "epoch": 0.4137428467325375, + "grad_norm": 1.8046875, + "learning_rate": 1.9140198352711124e-05, + "loss": 1.0565, + "step": 2413 + }, + { + "epoch": 0.4139143108215273, + "grad_norm": 1.6484375, + "learning_rate": 1.9139465593692653e-05, + "loss": 1.0109, + "step": 2414 + }, + { + "epoch": 0.4140857749105172, + "grad_norm": 1.625, + "learning_rate": 1.9138732536601794e-05, + "loss": 1.0394, + "step": 2415 + }, + { + "epoch": 0.41425723899950706, + "grad_norm": 1.625, + "learning_rate": 1.9137999181462453e-05, + "loss": 1.023, + "step": 2416 + }, + { + "epoch": 0.4144287030884969, + "grad_norm": 1.8203125, + "learning_rate": 1.913726552829855e-05, + "loss": 1.0992, + "step": 2417 + }, + { + "epoch": 0.41460016717748677, + "grad_norm": 1.6640625, + "learning_rate": 1.9136531577134007e-05, + "loss": 1.1668, + "step": 2418 + }, + { + "epoch": 0.41477163126647665, + 
"grad_norm": 1.6171875, + "learning_rate": 1.9135797327992766e-05, + "loss": 0.9833, + "step": 2419 + }, + { + "epoch": 0.4149430953554665, + "grad_norm": 1.6640625, + "learning_rate": 1.9135062780898775e-05, + "loss": 1.0893, + "step": 2420 + }, + { + "epoch": 0.41511455944445635, + "grad_norm": 1.640625, + "learning_rate": 1.913432793587598e-05, + "loss": 1.0264, + "step": 2421 + }, + { + "epoch": 0.41528602353344624, + "grad_norm": 1.640625, + "learning_rate": 1.913359279294836e-05, + "loss": 0.9673, + "step": 2422 + }, + { + "epoch": 0.41545748762243606, + "grad_norm": 1.6171875, + "learning_rate": 1.9132857352139884e-05, + "loss": 1.0679, + "step": 2423 + }, + { + "epoch": 0.41562895171142594, + "grad_norm": 1.546875, + "learning_rate": 1.913212161347454e-05, + "loss": 0.9749, + "step": 2424 + }, + { + "epoch": 0.4158004158004158, + "grad_norm": 1.6640625, + "learning_rate": 1.913138557697632e-05, + "loss": 1.1008, + "step": 2425 + }, + { + "epoch": 0.41597187988940565, + "grad_norm": 1.5703125, + "learning_rate": 1.913064924266923e-05, + "loss": 1.0338, + "step": 2426 + }, + { + "epoch": 0.41614334397839553, + "grad_norm": 1.7109375, + "learning_rate": 1.9129912610577287e-05, + "loss": 1.0881, + "step": 2427 + }, + { + "epoch": 0.4163148080673854, + "grad_norm": 1.5625, + "learning_rate": 1.912917568072451e-05, + "loss": 1.1241, + "step": 2428 + }, + { + "epoch": 0.41648627215637524, + "grad_norm": 1.640625, + "learning_rate": 1.912843845313494e-05, + "loss": 1.1109, + "step": 2429 + }, + { + "epoch": 0.4166577362453651, + "grad_norm": 1.59375, + "learning_rate": 1.9127700927832616e-05, + "loss": 0.9705, + "step": 2430 + }, + { + "epoch": 0.416829200334355, + "grad_norm": 1.59375, + "learning_rate": 1.9126963104841595e-05, + "loss": 1.0619, + "step": 2431 + }, + { + "epoch": 0.4170006644233448, + "grad_norm": 1.6015625, + "learning_rate": 1.912622498418594e-05, + "loss": 1.0899, + "step": 2432 + }, + { + "epoch": 0.4171721285123347, + "grad_norm": 1.6484375, + "learning_rate": 1.912548656588972e-05, + "loss": 1.0203, + "step": 2433 + }, + { + "epoch": 0.4173435926013246, + "grad_norm": 1.703125, + "learning_rate": 1.9124747849977016e-05, + "loss": 1.0402, + "step": 2434 + }, + { + "epoch": 0.4175150566903144, + "grad_norm": 1.6328125, + "learning_rate": 1.9124008836471927e-05, + "loss": 1.0895, + "step": 2435 + }, + { + "epoch": 0.4176865207793043, + "grad_norm": 1.625, + "learning_rate": 1.9123269525398554e-05, + "loss": 1.0637, + "step": 2436 + }, + { + "epoch": 0.4178579848682941, + "grad_norm": 1.6953125, + "learning_rate": 1.9122529916781002e-05, + "loss": 0.9888, + "step": 2437 + }, + { + "epoch": 0.418029448957284, + "grad_norm": 1.671875, + "learning_rate": 1.9121790010643402e-05, + "loss": 1.0156, + "step": 2438 + }, + { + "epoch": 0.4182009130462739, + "grad_norm": 1.6328125, + "learning_rate": 1.912104980700988e-05, + "loss": 1.1407, + "step": 2439 + }, + { + "epoch": 0.4183723771352637, + "grad_norm": 1.5, + "learning_rate": 1.9120309305904575e-05, + "loss": 1.0139, + "step": 2440 + }, + { + "epoch": 0.4185438412242536, + "grad_norm": 1.640625, + "learning_rate": 1.911956850735164e-05, + "loss": 1.015, + "step": 2441 + }, + { + "epoch": 0.41871530531324347, + "grad_norm": 1.65625, + "learning_rate": 1.9118827411375233e-05, + "loss": 1.0958, + "step": 2442 + }, + { + "epoch": 0.4188867694022333, + "grad_norm": 1.546875, + "learning_rate": 1.911808601799953e-05, + "loss": 0.9161, + "step": 2443 + }, + { + "epoch": 0.4190582334912232, + "grad_norm": 1.6640625, + 
"learning_rate": 1.9117344327248704e-05, + "loss": 1.0281, + "step": 2444 + }, + { + "epoch": 0.41922969758021306, + "grad_norm": 1.5625, + "learning_rate": 1.9116602339146945e-05, + "loss": 1.0395, + "step": 2445 + }, + { + "epoch": 0.4194011616692029, + "grad_norm": 1.5625, + "learning_rate": 1.9115860053718455e-05, + "loss": 1.0219, + "step": 2446 + }, + { + "epoch": 0.41957262575819276, + "grad_norm": 1.609375, + "learning_rate": 1.911511747098744e-05, + "loss": 1.0217, + "step": 2447 + }, + { + "epoch": 0.41974408984718264, + "grad_norm": 1.6171875, + "learning_rate": 1.9114374590978123e-05, + "loss": 0.9609, + "step": 2448 + }, + { + "epoch": 0.41991555393617247, + "grad_norm": 2.296875, + "learning_rate": 1.911363141371473e-05, + "loss": 1.0245, + "step": 2449 + }, + { + "epoch": 0.42008701802516235, + "grad_norm": 1.6015625, + "learning_rate": 1.9112887939221495e-05, + "loss": 1.0485, + "step": 2450 + }, + { + "epoch": 0.42025848211415223, + "grad_norm": 1.703125, + "learning_rate": 1.911214416752267e-05, + "loss": 1.1228, + "step": 2451 + }, + { + "epoch": 0.42042994620314206, + "grad_norm": 1.953125, + "learning_rate": 1.911140009864251e-05, + "loss": 1.0937, + "step": 2452 + }, + { + "epoch": 0.42060141029213194, + "grad_norm": 1.6015625, + "learning_rate": 1.911065573260528e-05, + "loss": 1.0646, + "step": 2453 + }, + { + "epoch": 0.4207728743811218, + "grad_norm": 1.609375, + "learning_rate": 1.9109911069435263e-05, + "loss": 1.0335, + "step": 2454 + }, + { + "epoch": 0.42094433847011165, + "grad_norm": 1.640625, + "learning_rate": 1.910916610915674e-05, + "loss": 1.0465, + "step": 2455 + }, + { + "epoch": 0.4211158025591015, + "grad_norm": 1.5546875, + "learning_rate": 1.9108420851794007e-05, + "loss": 0.9507, + "step": 2456 + }, + { + "epoch": 0.4212872666480914, + "grad_norm": 1.6328125, + "learning_rate": 1.9107675297371372e-05, + "loss": 0.9774, + "step": 2457 + }, + { + "epoch": 0.42145873073708123, + "grad_norm": 1.6328125, + "learning_rate": 1.9106929445913147e-05, + "loss": 1.1207, + "step": 2458 + }, + { + "epoch": 0.4216301948260711, + "grad_norm": 1.5078125, + "learning_rate": 1.910618329744366e-05, + "loss": 1.0304, + "step": 2459 + }, + { + "epoch": 0.421801658915061, + "grad_norm": 1.578125, + "learning_rate": 1.9105436851987247e-05, + "loss": 1.0384, + "step": 2460 + }, + { + "epoch": 0.4219731230040508, + "grad_norm": 1.6484375, + "learning_rate": 1.9104690109568248e-05, + "loss": 1.0574, + "step": 2461 + }, + { + "epoch": 0.4221445870930407, + "grad_norm": 1.609375, + "learning_rate": 1.910394307021102e-05, + "loss": 0.9608, + "step": 2462 + }, + { + "epoch": 0.4223160511820306, + "grad_norm": 1.5703125, + "learning_rate": 1.910319573393993e-05, + "loss": 1.0314, + "step": 2463 + }, + { + "epoch": 0.4224875152710204, + "grad_norm": 1.6328125, + "learning_rate": 1.9102448100779343e-05, + "loss": 1.0206, + "step": 2464 + }, + { + "epoch": 0.4226589793600103, + "grad_norm": 1.6015625, + "learning_rate": 1.910170017075365e-05, + "loss": 1.0368, + "step": 2465 + }, + { + "epoch": 0.42283044344900017, + "grad_norm": 1.6796875, + "learning_rate": 1.910095194388724e-05, + "loss": 1.0978, + "step": 2466 + }, + { + "epoch": 0.42300190753799, + "grad_norm": 1.640625, + "learning_rate": 1.9100203420204516e-05, + "loss": 1.1047, + "step": 2467 + }, + { + "epoch": 0.4231733716269799, + "grad_norm": 1.5859375, + "learning_rate": 1.9099454599729887e-05, + "loss": 1.028, + "step": 2468 + }, + { + "epoch": 0.42334483571596976, + "grad_norm": 1.6328125, + "learning_rate": 
1.909870548248778e-05, + "loss": 1.0247, + "step": 2469 + }, + { + "epoch": 0.4235162998049596, + "grad_norm": 1.5703125, + "learning_rate": 1.9097956068502626e-05, + "loss": 1.0003, + "step": 2470 + }, + { + "epoch": 0.42368776389394947, + "grad_norm": 1.546875, + "learning_rate": 1.9097206357798864e-05, + "loss": 1.086, + "step": 2471 + }, + { + "epoch": 0.42385922798293935, + "grad_norm": 1.65625, + "learning_rate": 1.9096456350400948e-05, + "loss": 1.0451, + "step": 2472 + }, + { + "epoch": 0.42403069207192917, + "grad_norm": 1.6953125, + "learning_rate": 1.9095706046333333e-05, + "loss": 1.1459, + "step": 2473 + }, + { + "epoch": 0.42420215616091905, + "grad_norm": 1.6171875, + "learning_rate": 1.9094955445620492e-05, + "loss": 1.0954, + "step": 2474 + }, + { + "epoch": 0.42437362024990893, + "grad_norm": 1.6171875, + "learning_rate": 1.909420454828691e-05, + "loss": 0.9514, + "step": 2475 + }, + { + "epoch": 0.42454508433889876, + "grad_norm": 1.5703125, + "learning_rate": 1.9093453354357064e-05, + "loss": 1.0758, + "step": 2476 + }, + { + "epoch": 0.42471654842788864, + "grad_norm": 1.765625, + "learning_rate": 1.909270186385547e-05, + "loss": 1.0863, + "step": 2477 + }, + { + "epoch": 0.4248880125168785, + "grad_norm": 1.609375, + "learning_rate": 1.9091950076806626e-05, + "loss": 1.0663, + "step": 2478 + }, + { + "epoch": 0.42505947660586835, + "grad_norm": 1.5390625, + "learning_rate": 1.909119799323505e-05, + "loss": 0.9911, + "step": 2479 + }, + { + "epoch": 0.42523094069485823, + "grad_norm": 1.6328125, + "learning_rate": 1.9090445613165273e-05, + "loss": 1.0238, + "step": 2480 + }, + { + "epoch": 0.4254024047838481, + "grad_norm": 1.734375, + "learning_rate": 1.9089692936621836e-05, + "loss": 1.1141, + "step": 2481 + }, + { + "epoch": 0.42557386887283793, + "grad_norm": 1.4609375, + "learning_rate": 1.908893996362928e-05, + "loss": 0.9739, + "step": 2482 + }, + { + "epoch": 0.4257453329618278, + "grad_norm": 1.625, + "learning_rate": 1.9088186694212174e-05, + "loss": 1.0654, + "step": 2483 + }, + { + "epoch": 0.4259167970508177, + "grad_norm": 1.6484375, + "learning_rate": 1.9087433128395073e-05, + "loss": 1.0327, + "step": 2484 + }, + { + "epoch": 0.4260882611398075, + "grad_norm": 1.671875, + "learning_rate": 1.9086679266202554e-05, + "loss": 1.0202, + "step": 2485 + }, + { + "epoch": 0.4262597252287974, + "grad_norm": 1.5546875, + "learning_rate": 1.908592510765921e-05, + "loss": 1.1057, + "step": 2486 + }, + { + "epoch": 0.42643118931778723, + "grad_norm": 1.640625, + "learning_rate": 1.908517065278963e-05, + "loss": 1.0686, + "step": 2487 + }, + { + "epoch": 0.4266026534067771, + "grad_norm": 1.5390625, + "learning_rate": 1.9084415901618428e-05, + "loss": 0.9874, + "step": 2488 + }, + { + "epoch": 0.426774117495767, + "grad_norm": 1.7265625, + "learning_rate": 1.9083660854170212e-05, + "loss": 1.0316, + "step": 2489 + }, + { + "epoch": 0.4269455815847568, + "grad_norm": 1.625, + "learning_rate": 1.908290551046961e-05, + "loss": 1.0549, + "step": 2490 + }, + { + "epoch": 0.4271170456737467, + "grad_norm": 1.6484375, + "learning_rate": 1.9082149870541258e-05, + "loss": 1.039, + "step": 2491 + }, + { + "epoch": 0.4272885097627366, + "grad_norm": 1.59375, + "learning_rate": 1.9081393934409797e-05, + "loss": 1.0129, + "step": 2492 + }, + { + "epoch": 0.4274599738517264, + "grad_norm": 1.734375, + "learning_rate": 1.9080637702099883e-05, + "loss": 1.1138, + "step": 2493 + }, + { + "epoch": 0.4276314379407163, + "grad_norm": 1.59375, + "learning_rate": 1.9079881173636182e-05, + 
"loss": 1.013, + "step": 2494 + }, + { + "epoch": 0.42780290202970617, + "grad_norm": 1.640625, + "learning_rate": 1.907912434904336e-05, + "loss": 1.0412, + "step": 2495 + }, + { + "epoch": 0.427974366118696, + "grad_norm": 1.6328125, + "learning_rate": 1.9078367228346106e-05, + "loss": 1.0565, + "step": 2496 + }, + { + "epoch": 0.4281458302076859, + "grad_norm": 1.6171875, + "learning_rate": 1.9077609811569107e-05, + "loss": 1.0311, + "step": 2497 + }, + { + "epoch": 0.42831729429667575, + "grad_norm": 1.671875, + "learning_rate": 1.9076852098737074e-05, + "loss": 1.1392, + "step": 2498 + }, + { + "epoch": 0.4284887583856656, + "grad_norm": 1.640625, + "learning_rate": 1.907609408987471e-05, + "loss": 1.0764, + "step": 2499 + }, + { + "epoch": 0.42866022247465546, + "grad_norm": 1.703125, + "learning_rate": 1.9075335785006743e-05, + "loss": 1.063, + "step": 2500 + }, + { + "epoch": 0.42883168656364534, + "grad_norm": 1.6015625, + "learning_rate": 1.90745771841579e-05, + "loss": 1.1018, + "step": 2501 + }, + { + "epoch": 0.42900315065263517, + "grad_norm": 1.609375, + "learning_rate": 1.9073818287352925e-05, + "loss": 1.0077, + "step": 2502 + }, + { + "epoch": 0.42917461474162505, + "grad_norm": 1.6015625, + "learning_rate": 1.9073059094616565e-05, + "loss": 0.9708, + "step": 2503 + }, + { + "epoch": 0.42934607883061493, + "grad_norm": 1.625, + "learning_rate": 1.9072299605973582e-05, + "loss": 1.068, + "step": 2504 + }, + { + "epoch": 0.42951754291960476, + "grad_norm": 1.5625, + "learning_rate": 1.907153982144875e-05, + "loss": 1.0559, + "step": 2505 + }, + { + "epoch": 0.42968900700859464, + "grad_norm": 1.578125, + "learning_rate": 1.907077974106684e-05, + "loss": 1.1482, + "step": 2506 + }, + { + "epoch": 0.4298604710975845, + "grad_norm": 1.546875, + "learning_rate": 1.9070019364852646e-05, + "loss": 1.071, + "step": 2507 + }, + { + "epoch": 0.43003193518657434, + "grad_norm": 1.6484375, + "learning_rate": 1.9069258692830964e-05, + "loss": 1.0705, + "step": 2508 + }, + { + "epoch": 0.4302033992755642, + "grad_norm": 1.6171875, + "learning_rate": 1.9068497725026607e-05, + "loss": 1.0141, + "step": 2509 + }, + { + "epoch": 0.4303748633645541, + "grad_norm": 1.6484375, + "learning_rate": 1.906773646146439e-05, + "loss": 1.1033, + "step": 2510 + }, + { + "epoch": 0.43054632745354393, + "grad_norm": 1.71875, + "learning_rate": 1.9066974902169142e-05, + "loss": 0.9858, + "step": 2511 + }, + { + "epoch": 0.4307177915425338, + "grad_norm": 1.7109375, + "learning_rate": 1.9066213047165698e-05, + "loss": 1.0306, + "step": 2512 + }, + { + "epoch": 0.4308892556315237, + "grad_norm": 1.65625, + "learning_rate": 1.9065450896478904e-05, + "loss": 1.0003, + "step": 2513 + }, + { + "epoch": 0.4310607197205135, + "grad_norm": 1.5078125, + "learning_rate": 1.906468845013362e-05, + "loss": 0.9826, + "step": 2514 + }, + { + "epoch": 0.4312321838095034, + "grad_norm": 1.671875, + "learning_rate": 1.9063925708154713e-05, + "loss": 1.0373, + "step": 2515 + }, + { + "epoch": 0.4314036478984933, + "grad_norm": 1.5546875, + "learning_rate": 1.9063162670567057e-05, + "loss": 1.072, + "step": 2516 + }, + { + "epoch": 0.4315751119874831, + "grad_norm": 1.671875, + "learning_rate": 1.906239933739554e-05, + "loss": 1.0728, + "step": 2517 + }, + { + "epoch": 0.431746576076473, + "grad_norm": 1.5859375, + "learning_rate": 1.9061635708665053e-05, + "loss": 1.0666, + "step": 2518 + }, + { + "epoch": 0.43191804016546287, + "grad_norm": 1.5078125, + "learning_rate": 1.90608717844005e-05, + "loss": 1.0418, + "step": 2519 
+ }, + { + "epoch": 0.4320895042544527, + "grad_norm": 1.6171875, + "learning_rate": 1.90601075646268e-05, + "loss": 0.9639, + "step": 2520 + }, + { + "epoch": 0.4322609683434426, + "grad_norm": 1.5390625, + "learning_rate": 1.905934304936887e-05, + "loss": 0.9816, + "step": 2521 + }, + { + "epoch": 0.43243243243243246, + "grad_norm": 1.6875, + "learning_rate": 1.9058578238651655e-05, + "loss": 1.0473, + "step": 2522 + }, + { + "epoch": 0.4326038965214223, + "grad_norm": 1.609375, + "learning_rate": 1.9057813132500095e-05, + "loss": 1.105, + "step": 2523 + }, + { + "epoch": 0.43277536061041216, + "grad_norm": 1.6171875, + "learning_rate": 1.9057047730939132e-05, + "loss": 1.0386, + "step": 2524 + }, + { + "epoch": 0.43294682469940204, + "grad_norm": 1.6953125, + "learning_rate": 1.905628203399374e-05, + "loss": 1.0777, + "step": 2525 + }, + { + "epoch": 0.43311828878839187, + "grad_norm": 1.6328125, + "learning_rate": 1.905551604168889e-05, + "loss": 1.0285, + "step": 2526 + }, + { + "epoch": 0.43328975287738175, + "grad_norm": 1.7421875, + "learning_rate": 1.905474975404956e-05, + "loss": 1.0409, + "step": 2527 + }, + { + "epoch": 0.43346121696637163, + "grad_norm": 1.5859375, + "learning_rate": 1.9053983171100744e-05, + "loss": 1.0576, + "step": 2528 + }, + { + "epoch": 0.43363268105536146, + "grad_norm": 1.75, + "learning_rate": 1.9053216292867445e-05, + "loss": 1.1138, + "step": 2529 + }, + { + "epoch": 0.43380414514435134, + "grad_norm": 1.6171875, + "learning_rate": 1.905244911937467e-05, + "loss": 1.0694, + "step": 2530 + }, + { + "epoch": 0.4339756092333412, + "grad_norm": 1.640625, + "learning_rate": 1.9051681650647437e-05, + "loss": 0.955, + "step": 2531 + }, + { + "epoch": 0.43414707332233105, + "grad_norm": 1.546875, + "learning_rate": 1.9050913886710786e-05, + "loss": 1.021, + "step": 2532 + }, + { + "epoch": 0.4343185374113209, + "grad_norm": 1.703125, + "learning_rate": 1.9050145827589745e-05, + "loss": 1.098, + "step": 2533 + }, + { + "epoch": 0.43449000150031075, + "grad_norm": 1.609375, + "learning_rate": 1.904937747330937e-05, + "loss": 1.0156, + "step": 2534 + }, + { + "epoch": 0.43466146558930063, + "grad_norm": 1.6640625, + "learning_rate": 1.9048608823894722e-05, + "loss": 1.0958, + "step": 2535 + }, + { + "epoch": 0.4348329296782905, + "grad_norm": 1.71875, + "learning_rate": 1.9047839879370867e-05, + "loss": 1.0483, + "step": 2536 + }, + { + "epoch": 0.43500439376728034, + "grad_norm": 1.59375, + "learning_rate": 1.9047070639762878e-05, + "loss": 0.9646, + "step": 2537 + }, + { + "epoch": 0.4351758578562702, + "grad_norm": 1.5859375, + "learning_rate": 1.9046301105095847e-05, + "loss": 0.9332, + "step": 2538 + }, + { + "epoch": 0.4353473219452601, + "grad_norm": 1.671875, + "learning_rate": 1.9045531275394878e-05, + "loss": 1.0767, + "step": 2539 + }, + { + "epoch": 0.4355187860342499, + "grad_norm": 1.6171875, + "learning_rate": 1.904476115068507e-05, + "loss": 1.0839, + "step": 2540 + }, + { + "epoch": 0.4356902501232398, + "grad_norm": 1.5703125, + "learning_rate": 1.9043990730991536e-05, + "loss": 1.0746, + "step": 2541 + }, + { + "epoch": 0.4358617142122297, + "grad_norm": 1.5859375, + "learning_rate": 1.9043220016339414e-05, + "loss": 0.9487, + "step": 2542 + }, + { + "epoch": 0.4360331783012195, + "grad_norm": 1.6875, + "learning_rate": 1.9042449006753827e-05, + "loss": 1.0746, + "step": 2543 + }, + { + "epoch": 0.4362046423902094, + "grad_norm": 1.5546875, + "learning_rate": 1.904167770225993e-05, + "loss": 1.0214, + "step": 2544 + }, + { + "epoch": 
0.4363761064791993, + "grad_norm": 1.5703125, + "learning_rate": 1.904090610288288e-05, + "loss": 1.0514, + "step": 2545 + }, + { + "epoch": 0.4365475705681891, + "grad_norm": 1.53125, + "learning_rate": 1.904013420864783e-05, + "loss": 1.0005, + "step": 2546 + }, + { + "epoch": 0.436719034657179, + "grad_norm": 1.5625, + "learning_rate": 1.9039362019579965e-05, + "loss": 0.9825, + "step": 2547 + }, + { + "epoch": 0.43689049874616886, + "grad_norm": 1.75, + "learning_rate": 1.9038589535704467e-05, + "loss": 1.0655, + "step": 2548 + }, + { + "epoch": 0.4370619628351587, + "grad_norm": 1.6015625, + "learning_rate": 1.9037816757046528e-05, + "loss": 0.9854, + "step": 2549 + }, + { + "epoch": 0.43723342692414857, + "grad_norm": 1.5546875, + "learning_rate": 1.903704368363135e-05, + "loss": 1.0154, + "step": 2550 + }, + { + "epoch": 0.43740489101313845, + "grad_norm": 1.578125, + "learning_rate": 1.9036270315484144e-05, + "loss": 1.0617, + "step": 2551 + }, + { + "epoch": 0.4375763551021283, + "grad_norm": 1.6640625, + "learning_rate": 1.9035496652630138e-05, + "loss": 1.0488, + "step": 2552 + }, + { + "epoch": 0.43774781919111816, + "grad_norm": 1.640625, + "learning_rate": 1.9034722695094562e-05, + "loss": 1.0497, + "step": 2553 + }, + { + "epoch": 0.43791928328010804, + "grad_norm": 1.671875, + "learning_rate": 1.9033948442902658e-05, + "loss": 1.0427, + "step": 2554 + }, + { + "epoch": 0.43809074736909787, + "grad_norm": 1.625, + "learning_rate": 1.9033173896079677e-05, + "loss": 1.0454, + "step": 2555 + }, + { + "epoch": 0.43826221145808775, + "grad_norm": 1.53125, + "learning_rate": 1.9032399054650876e-05, + "loss": 0.962, + "step": 2556 + }, + { + "epoch": 0.43843367554707763, + "grad_norm": 1.5625, + "learning_rate": 1.903162391864153e-05, + "loss": 1.0095, + "step": 2557 + }, + { + "epoch": 0.43860513963606745, + "grad_norm": 1.75, + "learning_rate": 1.9030848488076924e-05, + "loss": 1.1543, + "step": 2558 + }, + { + "epoch": 0.43877660372505733, + "grad_norm": 1.671875, + "learning_rate": 1.9030072762982335e-05, + "loss": 1.0546, + "step": 2559 + }, + { + "epoch": 0.4389480678140472, + "grad_norm": 1.671875, + "learning_rate": 1.9029296743383074e-05, + "loss": 1.1123, + "step": 2560 + }, + { + "epoch": 0.43911953190303704, + "grad_norm": 1.71875, + "learning_rate": 1.902852042930444e-05, + "loss": 1.0054, + "step": 2561 + }, + { + "epoch": 0.4392909959920269, + "grad_norm": 1.6328125, + "learning_rate": 1.9027743820771756e-05, + "loss": 1.0301, + "step": 2562 + }, + { + "epoch": 0.4394624600810168, + "grad_norm": 1.671875, + "learning_rate": 1.9026966917810356e-05, + "loss": 1.0997, + "step": 2563 + }, + { + "epoch": 0.43963392417000663, + "grad_norm": 1.5390625, + "learning_rate": 1.9026189720445568e-05, + "loss": 1.0412, + "step": 2564 + }, + { + "epoch": 0.4398053882589965, + "grad_norm": 1.5078125, + "learning_rate": 1.9025412228702747e-05, + "loss": 1.0091, + "step": 2565 + }, + { + "epoch": 0.4399768523479864, + "grad_norm": 1.640625, + "learning_rate": 1.9024634442607245e-05, + "loss": 1.0701, + "step": 2566 + }, + { + "epoch": 0.4401483164369762, + "grad_norm": 1.6640625, + "learning_rate": 1.9023856362184433e-05, + "loss": 0.974, + "step": 2567 + }, + { + "epoch": 0.4403197805259661, + "grad_norm": 1.7109375, + "learning_rate": 1.902307798745968e-05, + "loss": 1.1008, + "step": 2568 + }, + { + "epoch": 0.440491244614956, + "grad_norm": 1.6015625, + "learning_rate": 1.9022299318458377e-05, + "loss": 1.0568, + "step": 2569 + }, + { + "epoch": 0.4406627087039458, + "grad_norm": 
1.7734375, + "learning_rate": 1.902152035520592e-05, + "loss": 1.1136, + "step": 2570 + }, + { + "epoch": 0.4408341727929357, + "grad_norm": 1.6328125, + "learning_rate": 1.902074109772771e-05, + "loss": 0.989, + "step": 2571 + }, + { + "epoch": 0.44100563688192557, + "grad_norm": 1.6328125, + "learning_rate": 1.9019961546049165e-05, + "loss": 0.9976, + "step": 2572 + }, + { + "epoch": 0.4411771009709154, + "grad_norm": 1.6484375, + "learning_rate": 1.9019181700195714e-05, + "loss": 1.0227, + "step": 2573 + }, + { + "epoch": 0.4413485650599053, + "grad_norm": 1.6484375, + "learning_rate": 1.9018401560192775e-05, + "loss": 1.0334, + "step": 2574 + }, + { + "epoch": 0.44152002914889515, + "grad_norm": 1.6640625, + "learning_rate": 1.9017621126065804e-05, + "loss": 1.0257, + "step": 2575 + }, + { + "epoch": 0.441691493237885, + "grad_norm": 1.546875, + "learning_rate": 1.9016840397840256e-05, + "loss": 1.0305, + "step": 2576 + }, + { + "epoch": 0.44186295732687486, + "grad_norm": 1.5859375, + "learning_rate": 1.901605937554158e-05, + "loss": 1.0111, + "step": 2577 + }, + { + "epoch": 0.44203442141586474, + "grad_norm": 1.671875, + "learning_rate": 1.901527805919526e-05, + "loss": 1.0746, + "step": 2578 + }, + { + "epoch": 0.44220588550485457, + "grad_norm": 1.6484375, + "learning_rate": 1.9014496448826775e-05, + "loss": 1.08, + "step": 2579 + }, + { + "epoch": 0.44237734959384445, + "grad_norm": 1.546875, + "learning_rate": 1.9013714544461617e-05, + "loss": 1.0462, + "step": 2580 + }, + { + "epoch": 0.4425488136828343, + "grad_norm": 1.625, + "learning_rate": 1.9012932346125282e-05, + "loss": 1.0524, + "step": 2581 + }, + { + "epoch": 0.44272027777182416, + "grad_norm": 1.578125, + "learning_rate": 1.9012149853843283e-05, + "loss": 0.9439, + "step": 2582 + }, + { + "epoch": 0.44289174186081404, + "grad_norm": 1.7578125, + "learning_rate": 1.9011367067641146e-05, + "loss": 1.0259, + "step": 2583 + }, + { + "epoch": 0.44306320594980386, + "grad_norm": 1.5625, + "learning_rate": 1.901058398754439e-05, + "loss": 1.0868, + "step": 2584 + }, + { + "epoch": 0.44323467003879374, + "grad_norm": 1.6875, + "learning_rate": 1.9009800613578563e-05, + "loss": 1.1054, + "step": 2585 + }, + { + "epoch": 0.4434061341277836, + "grad_norm": 1.578125, + "learning_rate": 1.9009016945769207e-05, + "loss": 0.9721, + "step": 2586 + }, + { + "epoch": 0.44357759821677345, + "grad_norm": 1.6875, + "learning_rate": 1.9008232984141885e-05, + "loss": 1.0474, + "step": 2587 + }, + { + "epoch": 0.44374906230576333, + "grad_norm": 1.640625, + "learning_rate": 1.9007448728722165e-05, + "loss": 1.0311, + "step": 2588 + }, + { + "epoch": 0.4439205263947532, + "grad_norm": 1.71875, + "learning_rate": 1.900666417953562e-05, + "loss": 1.021, + "step": 2589 + }, + { + "epoch": 0.44409199048374304, + "grad_norm": 1.65625, + "learning_rate": 1.9005879336607844e-05, + "loss": 1.0546, + "step": 2590 + }, + { + "epoch": 0.4442634545727329, + "grad_norm": 1.5390625, + "learning_rate": 1.9005094199964427e-05, + "loss": 1.0064, + "step": 2591 + }, + { + "epoch": 0.4444349186617228, + "grad_norm": 1.640625, + "learning_rate": 1.900430876963098e-05, + "loss": 0.9986, + "step": 2592 + }, + { + "epoch": 0.4446063827507126, + "grad_norm": 1.5703125, + "learning_rate": 1.9003523045633116e-05, + "loss": 1.0127, + "step": 2593 + }, + { + "epoch": 0.4447778468397025, + "grad_norm": 1.6875, + "learning_rate": 1.900273702799646e-05, + "loss": 1.1392, + "step": 2594 + }, + { + "epoch": 0.4449493109286924, + "grad_norm": 1.515625, + "learning_rate": 
1.9001950716746648e-05, + "loss": 0.9631, + "step": 2595 + }, + { + "epoch": 0.4451207750176822, + "grad_norm": 1.609375, + "learning_rate": 1.9001164111909327e-05, + "loss": 0.9925, + "step": 2596 + }, + { + "epoch": 0.4452922391066721, + "grad_norm": 1.578125, + "learning_rate": 1.9000377213510147e-05, + "loss": 1.0275, + "step": 2597 + }, + { + "epoch": 0.445463703195662, + "grad_norm": 1.7578125, + "learning_rate": 1.8999590021574776e-05, + "loss": 1.0874, + "step": 2598 + }, + { + "epoch": 0.4456351672846518, + "grad_norm": 1.546875, + "learning_rate": 1.8998802536128885e-05, + "loss": 0.979, + "step": 2599 + }, + { + "epoch": 0.4458066313736417, + "grad_norm": 1.6015625, + "learning_rate": 1.8998014757198152e-05, + "loss": 1.0238, + "step": 2600 + }, + { + "epoch": 0.44597809546263156, + "grad_norm": 1.75, + "learning_rate": 1.8997226684808282e-05, + "loss": 1.1384, + "step": 2601 + }, + { + "epoch": 0.4461495595516214, + "grad_norm": 1.515625, + "learning_rate": 1.8996438318984968e-05, + "loss": 1.0545, + "step": 2602 + }, + { + "epoch": 0.44632102364061127, + "grad_norm": 1.625, + "learning_rate": 1.8995649659753917e-05, + "loss": 1.0358, + "step": 2603 + }, + { + "epoch": 0.44649248772960115, + "grad_norm": 1.6015625, + "learning_rate": 1.8994860707140862e-05, + "loss": 1.0149, + "step": 2604 + }, + { + "epoch": 0.446663951818591, + "grad_norm": 1.5703125, + "learning_rate": 1.8994071461171522e-05, + "loss": 0.9884, + "step": 2605 + }, + { + "epoch": 0.44683541590758086, + "grad_norm": 1.5390625, + "learning_rate": 1.899328192187165e-05, + "loss": 1.0315, + "step": 2606 + }, + { + "epoch": 0.44700687999657074, + "grad_norm": 1.5703125, + "learning_rate": 1.8992492089266986e-05, + "loss": 0.9365, + "step": 2607 + }, + { + "epoch": 0.44717834408556056, + "grad_norm": 1.5546875, + "learning_rate": 1.899170196338329e-05, + "loss": 1.0134, + "step": 2608 + }, + { + "epoch": 0.44734980817455045, + "grad_norm": 1.65625, + "learning_rate": 1.8990911544246338e-05, + "loss": 1.0205, + "step": 2609 + }, + { + "epoch": 0.4475212722635403, + "grad_norm": 1.6015625, + "learning_rate": 1.89901208318819e-05, + "loss": 1.0267, + "step": 2610 + }, + { + "epoch": 0.44769273635253015, + "grad_norm": 1.5859375, + "learning_rate": 1.898932982631577e-05, + "loss": 1.1054, + "step": 2611 + }, + { + "epoch": 0.44786420044152003, + "grad_norm": 1.8203125, + "learning_rate": 1.8988538527573743e-05, + "loss": 0.9982, + "step": 2612 + }, + { + "epoch": 0.4480356645305099, + "grad_norm": 1.6015625, + "learning_rate": 1.8987746935681627e-05, + "loss": 1.0921, + "step": 2613 + }, + { + "epoch": 0.44820712861949974, + "grad_norm": 1.6484375, + "learning_rate": 1.898695505066524e-05, + "loss": 1.105, + "step": 2614 + }, + { + "epoch": 0.4483785927084896, + "grad_norm": 1.84375, + "learning_rate": 1.8986162872550405e-05, + "loss": 1.1337, + "step": 2615 + }, + { + "epoch": 0.4485500567974795, + "grad_norm": 1.6484375, + "learning_rate": 1.898537040136296e-05, + "loss": 1.0623, + "step": 2616 + }, + { + "epoch": 0.4487215208864693, + "grad_norm": 1.6953125, + "learning_rate": 1.8984577637128755e-05, + "loss": 1.1215, + "step": 2617 + }, + { + "epoch": 0.4488929849754592, + "grad_norm": 1.5, + "learning_rate": 1.8983784579873635e-05, + "loss": 0.9338, + "step": 2618 + }, + { + "epoch": 0.4490644490644491, + "grad_norm": 1.6484375, + "learning_rate": 1.898299122962347e-05, + "loss": 1.0079, + "step": 2619 + }, + { + "epoch": 0.4492359131534389, + "grad_norm": 1.65625, + "learning_rate": 1.8982197586404136e-05, + 
"loss": 1.0642, + "step": 2620 + }, + { + "epoch": 0.4494073772424288, + "grad_norm": 1.625, + "learning_rate": 1.8981403650241517e-05, + "loss": 1.0029, + "step": 2621 + }, + { + "epoch": 0.4495788413314187, + "grad_norm": 1.8125, + "learning_rate": 1.89806094211615e-05, + "loss": 1.0326, + "step": 2622 + }, + { + "epoch": 0.4497503054204085, + "grad_norm": 1.78125, + "learning_rate": 1.8979814899189993e-05, + "loss": 1.0617, + "step": 2623 + }, + { + "epoch": 0.4499217695093984, + "grad_norm": 1.6640625, + "learning_rate": 1.897902008435291e-05, + "loss": 1.0087, + "step": 2624 + }, + { + "epoch": 0.45009323359838826, + "grad_norm": 1.609375, + "learning_rate": 1.8978224976676168e-05, + "loss": 1.0239, + "step": 2625 + }, + { + "epoch": 0.4502646976873781, + "grad_norm": 1.6484375, + "learning_rate": 1.89774295761857e-05, + "loss": 1.0068, + "step": 2626 + }, + { + "epoch": 0.45043616177636797, + "grad_norm": 1.59375, + "learning_rate": 1.897663388290745e-05, + "loss": 0.9916, + "step": 2627 + }, + { + "epoch": 0.4506076258653578, + "grad_norm": 1.546875, + "learning_rate": 1.8975837896867365e-05, + "loss": 1.0105, + "step": 2628 + }, + { + "epoch": 0.4507790899543477, + "grad_norm": 1.5, + "learning_rate": 1.8975041618091406e-05, + "loss": 0.964, + "step": 2629 + }, + { + "epoch": 0.45095055404333756, + "grad_norm": 1.5859375, + "learning_rate": 1.8974245046605544e-05, + "loss": 1.0457, + "step": 2630 + }, + { + "epoch": 0.4511220181323274, + "grad_norm": 1.59375, + "learning_rate": 1.8973448182435757e-05, + "loss": 1.0377, + "step": 2631 + }, + { + "epoch": 0.45129348222131727, + "grad_norm": 1.640625, + "learning_rate": 1.897265102560803e-05, + "loss": 1.0343, + "step": 2632 + }, + { + "epoch": 0.45146494631030715, + "grad_norm": 1.59375, + "learning_rate": 1.897185357614837e-05, + "loss": 0.9959, + "step": 2633 + }, + { + "epoch": 0.451636410399297, + "grad_norm": 1.6953125, + "learning_rate": 1.8971055834082778e-05, + "loss": 1.0973, + "step": 2634 + }, + { + "epoch": 0.45180787448828685, + "grad_norm": 1.515625, + "learning_rate": 1.8970257799437274e-05, + "loss": 0.951, + "step": 2635 + }, + { + "epoch": 0.45197933857727673, + "grad_norm": 1.578125, + "learning_rate": 1.8969459472237886e-05, + "loss": 1.0957, + "step": 2636 + }, + { + "epoch": 0.45215080266626656, + "grad_norm": 1.671875, + "learning_rate": 1.8968660852510646e-05, + "loss": 1.0603, + "step": 2637 + }, + { + "epoch": 0.45232226675525644, + "grad_norm": 1.5625, + "learning_rate": 1.8967861940281603e-05, + "loss": 0.9595, + "step": 2638 + }, + { + "epoch": 0.4524937308442463, + "grad_norm": 1.6640625, + "learning_rate": 1.896706273557681e-05, + "loss": 1.0628, + "step": 2639 + }, + { + "epoch": 0.45266519493323615, + "grad_norm": 1.65625, + "learning_rate": 1.896626323842234e-05, + "loss": 1.0335, + "step": 2640 + }, + { + "epoch": 0.45283665902222603, + "grad_norm": 1.546875, + "learning_rate": 1.896546344884426e-05, + "loss": 0.9677, + "step": 2641 + }, + { + "epoch": 0.4530081231112159, + "grad_norm": 1.5625, + "learning_rate": 1.8964663366868655e-05, + "loss": 0.9559, + "step": 2642 + }, + { + "epoch": 0.45317958720020574, + "grad_norm": 1.5703125, + "learning_rate": 1.896386299252162e-05, + "loss": 1.0433, + "step": 2643 + }, + { + "epoch": 0.4533510512891956, + "grad_norm": 1.609375, + "learning_rate": 1.8963062325829256e-05, + "loss": 1.0023, + "step": 2644 + }, + { + "epoch": 0.4535225153781855, + "grad_norm": 1.6484375, + "learning_rate": 1.8962261366817684e-05, + "loss": 1.1453, + "step": 2645 + }, + { + 
"epoch": 0.4536939794671753, + "grad_norm": 1.59375, + "learning_rate": 1.8961460115513012e-05, + "loss": 0.9839, + "step": 2646 + }, + { + "epoch": 0.4538654435561652, + "grad_norm": 1.6015625, + "learning_rate": 1.8960658571941388e-05, + "loss": 1.1139, + "step": 2647 + }, + { + "epoch": 0.4540369076451551, + "grad_norm": 1.6484375, + "learning_rate": 1.895985673612894e-05, + "loss": 1.073, + "step": 2648 + }, + { + "epoch": 0.4542083717341449, + "grad_norm": 1.5390625, + "learning_rate": 1.8959054608101823e-05, + "loss": 1.0102, + "step": 2649 + }, + { + "epoch": 0.4543798358231348, + "grad_norm": 1.5859375, + "learning_rate": 1.8958252187886202e-05, + "loss": 1.0139, + "step": 2650 + }, + { + "epoch": 0.4545512999121247, + "grad_norm": 1.5859375, + "learning_rate": 1.8957449475508243e-05, + "loss": 0.9918, + "step": 2651 + }, + { + "epoch": 0.4547227640011145, + "grad_norm": 1.6171875, + "learning_rate": 1.8956646470994124e-05, + "loss": 1.0911, + "step": 2652 + }, + { + "epoch": 0.4548942280901044, + "grad_norm": 1.6328125, + "learning_rate": 1.895584317437004e-05, + "loss": 1.1194, + "step": 2653 + }, + { + "epoch": 0.45506569217909426, + "grad_norm": 1.609375, + "learning_rate": 1.8955039585662182e-05, + "loss": 1.0762, + "step": 2654 + }, + { + "epoch": 0.4552371562680841, + "grad_norm": 1.5703125, + "learning_rate": 1.895423570489676e-05, + "loss": 1.0032, + "step": 2655 + }, + { + "epoch": 0.45540862035707397, + "grad_norm": 1.6171875, + "learning_rate": 1.8953431532099994e-05, + "loss": 0.9551, + "step": 2656 + }, + { + "epoch": 0.45558008444606385, + "grad_norm": 1.59375, + "learning_rate": 1.8952627067298115e-05, + "loss": 1.0617, + "step": 2657 + }, + { + "epoch": 0.4557515485350537, + "grad_norm": 1.6953125, + "learning_rate": 1.895182231051735e-05, + "loss": 1.0103, + "step": 2658 + }, + { + "epoch": 0.45592301262404356, + "grad_norm": 1.6328125, + "learning_rate": 1.8951017261783954e-05, + "loss": 1.1338, + "step": 2659 + }, + { + "epoch": 0.45609447671303344, + "grad_norm": 1.5625, + "learning_rate": 1.8950211921124177e-05, + "loss": 1.0476, + "step": 2660 + }, + { + "epoch": 0.45626594080202326, + "grad_norm": 1.5390625, + "learning_rate": 1.8949406288564282e-05, + "loss": 1.0679, + "step": 2661 + }, + { + "epoch": 0.45643740489101314, + "grad_norm": 1.609375, + "learning_rate": 1.894860036413055e-05, + "loss": 1.0398, + "step": 2662 + }, + { + "epoch": 0.456608868980003, + "grad_norm": 1.6015625, + "learning_rate": 1.8947794147849264e-05, + "loss": 1.0968, + "step": 2663 + }, + { + "epoch": 0.45678033306899285, + "grad_norm": 1.609375, + "learning_rate": 1.8946987639746717e-05, + "loss": 1.0471, + "step": 2664 + }, + { + "epoch": 0.45695179715798273, + "grad_norm": 1.5703125, + "learning_rate": 1.894618083984921e-05, + "loss": 1.0101, + "step": 2665 + }, + { + "epoch": 0.4571232612469726, + "grad_norm": 1.6328125, + "learning_rate": 1.894537374818306e-05, + "loss": 1.0929, + "step": 2666 + }, + { + "epoch": 0.45729472533596244, + "grad_norm": 1.578125, + "learning_rate": 1.8944566364774585e-05, + "loss": 1.0572, + "step": 2667 + }, + { + "epoch": 0.4574661894249523, + "grad_norm": 1.625, + "learning_rate": 1.8943758689650117e-05, + "loss": 1.0125, + "step": 2668 + }, + { + "epoch": 0.4576376535139422, + "grad_norm": 1.5703125, + "learning_rate": 1.8942950722836e-05, + "loss": 1.0197, + "step": 2669 + }, + { + "epoch": 0.457809117602932, + "grad_norm": 1.5859375, + "learning_rate": 1.8942142464358587e-05, + "loss": 1.0973, + "step": 2670 + }, + { + "epoch": 
0.4579805816919219, + "grad_norm": 1.546875, + "learning_rate": 1.8941333914244234e-05, + "loss": 1.0297, + "step": 2671 + }, + { + "epoch": 0.4581520457809118, + "grad_norm": 1.6796875, + "learning_rate": 1.894052507251931e-05, + "loss": 1.0836, + "step": 2672 + }, + { + "epoch": 0.4583235098699016, + "grad_norm": 1.640625, + "learning_rate": 1.8939715939210202e-05, + "loss": 1.1439, + "step": 2673 + }, + { + "epoch": 0.4584949739588915, + "grad_norm": 1.6328125, + "learning_rate": 1.8938906514343287e-05, + "loss": 1.0319, + "step": 2674 + }, + { + "epoch": 0.4586664380478813, + "grad_norm": 1.6875, + "learning_rate": 1.8938096797944972e-05, + "loss": 1.0257, + "step": 2675 + }, + { + "epoch": 0.4588379021368712, + "grad_norm": 1.5546875, + "learning_rate": 1.893728679004166e-05, + "loss": 1.0049, + "step": 2676 + }, + { + "epoch": 0.4590093662258611, + "grad_norm": 1.4921875, + "learning_rate": 1.8936476490659778e-05, + "loss": 0.9021, + "step": 2677 + }, + { + "epoch": 0.4591808303148509, + "grad_norm": 1.6484375, + "learning_rate": 1.893566589982574e-05, + "loss": 0.9815, + "step": 2678 + }, + { + "epoch": 0.4593522944038408, + "grad_norm": 1.6015625, + "learning_rate": 1.893485501756599e-05, + "loss": 1.1038, + "step": 2679 + }, + { + "epoch": 0.45952375849283067, + "grad_norm": 1.5859375, + "learning_rate": 1.8934043843906975e-05, + "loss": 0.9767, + "step": 2680 + }, + { + "epoch": 0.4596952225818205, + "grad_norm": 1.625, + "learning_rate": 1.8933232378875145e-05, + "loss": 1.0821, + "step": 2681 + }, + { + "epoch": 0.4598666866708104, + "grad_norm": 1.53125, + "learning_rate": 1.8932420622496964e-05, + "loss": 0.9355, + "step": 2682 + }, + { + "epoch": 0.46003815075980026, + "grad_norm": 1.5, + "learning_rate": 1.8931608574798915e-05, + "loss": 0.9788, + "step": 2683 + }, + { + "epoch": 0.4602096148487901, + "grad_norm": 1.5625, + "learning_rate": 1.8930796235807478e-05, + "loss": 1.0064, + "step": 2684 + }, + { + "epoch": 0.46038107893777996, + "grad_norm": 1.578125, + "learning_rate": 1.8929983605549146e-05, + "loss": 1.1276, + "step": 2685 + }, + { + "epoch": 0.46055254302676985, + "grad_norm": 1.5625, + "learning_rate": 1.8929170684050414e-05, + "loss": 1.0831, + "step": 2686 + }, + { + "epoch": 0.46072400711575967, + "grad_norm": 1.625, + "learning_rate": 1.892835747133781e-05, + "loss": 0.9837, + "step": 2687 + }, + { + "epoch": 0.46089547120474955, + "grad_norm": 1.5, + "learning_rate": 1.8927543967437846e-05, + "loss": 0.9239, + "step": 2688 + }, + { + "epoch": 0.46106693529373943, + "grad_norm": 1.609375, + "learning_rate": 1.892673017237705e-05, + "loss": 1.0372, + "step": 2689 + }, + { + "epoch": 0.46123839938272926, + "grad_norm": 1.640625, + "learning_rate": 1.8925916086181975e-05, + "loss": 1.1304, + "step": 2690 + }, + { + "epoch": 0.46140986347171914, + "grad_norm": 1.6875, + "learning_rate": 1.8925101708879162e-05, + "loss": 1.0963, + "step": 2691 + }, + { + "epoch": 0.461581327560709, + "grad_norm": 1.5703125, + "learning_rate": 1.8924287040495174e-05, + "loss": 0.9772, + "step": 2692 + }, + { + "epoch": 0.46175279164969885, + "grad_norm": 1.5859375, + "learning_rate": 1.8923472081056577e-05, + "loss": 0.9871, + "step": 2693 + }, + { + "epoch": 0.4619242557386887, + "grad_norm": 1.734375, + "learning_rate": 1.8922656830589955e-05, + "loss": 1.1368, + "step": 2694 + }, + { + "epoch": 0.4620957198276786, + "grad_norm": 1.6484375, + "learning_rate": 1.8921841289121894e-05, + "loss": 1.1031, + "step": 2695 + }, + { + "epoch": 0.46226718391666843, + "grad_norm": 
1.5859375, + "learning_rate": 1.8921025456678988e-05, + "loss": 0.9948, + "step": 2696 + }, + { + "epoch": 0.4624386480056583, + "grad_norm": 1.609375, + "learning_rate": 1.8920209333287854e-05, + "loss": 0.9736, + "step": 2697 + }, + { + "epoch": 0.4626101120946482, + "grad_norm": 1.609375, + "learning_rate": 1.89193929189751e-05, + "loss": 1.0552, + "step": 2698 + }, + { + "epoch": 0.462781576183638, + "grad_norm": 1.6328125, + "learning_rate": 1.8918576213767358e-05, + "loss": 1.095, + "step": 2699 + }, + { + "epoch": 0.4629530402726279, + "grad_norm": 1.6015625, + "learning_rate": 1.891775921769126e-05, + "loss": 0.9765, + "step": 2700 + }, + { + "epoch": 0.4631245043616178, + "grad_norm": 1.6484375, + "learning_rate": 1.8916941930773457e-05, + "loss": 1.0, + "step": 2701 + }, + { + "epoch": 0.4632959684506076, + "grad_norm": 1.6328125, + "learning_rate": 1.8916124353040594e-05, + "loss": 1.013, + "step": 2702 + }, + { + "epoch": 0.4634674325395975, + "grad_norm": 1.6328125, + "learning_rate": 1.8915306484519344e-05, + "loss": 1.1182, + "step": 2703 + }, + { + "epoch": 0.46363889662858737, + "grad_norm": 1.6328125, + "learning_rate": 1.8914488325236373e-05, + "loss": 1.0811, + "step": 2704 + }, + { + "epoch": 0.4638103607175772, + "grad_norm": 1.6640625, + "learning_rate": 1.8913669875218375e-05, + "loss": 1.0969, + "step": 2705 + }, + { + "epoch": 0.4639818248065671, + "grad_norm": 1.546875, + "learning_rate": 1.8912851134492033e-05, + "loss": 1.0535, + "step": 2706 + }, + { + "epoch": 0.46415328889555696, + "grad_norm": 1.5703125, + "learning_rate": 1.8912032103084054e-05, + "loss": 1.0302, + "step": 2707 + }, + { + "epoch": 0.4643247529845468, + "grad_norm": 1.609375, + "learning_rate": 1.8911212781021148e-05, + "loss": 1.0506, + "step": 2708 + }, + { + "epoch": 0.46449621707353667, + "grad_norm": 1.6015625, + "learning_rate": 1.8910393168330036e-05, + "loss": 1.0178, + "step": 2709 + }, + { + "epoch": 0.46466768116252655, + "grad_norm": 1.6015625, + "learning_rate": 1.8909573265037448e-05, + "loss": 1.0261, + "step": 2710 + }, + { + "epoch": 0.4648391452515164, + "grad_norm": 1.5625, + "learning_rate": 1.8908753071170126e-05, + "loss": 1.0288, + "step": 2711 + }, + { + "epoch": 0.46501060934050625, + "grad_norm": 1.7109375, + "learning_rate": 1.8907932586754823e-05, + "loss": 1.0356, + "step": 2712 + }, + { + "epoch": 0.46518207342949613, + "grad_norm": 1.671875, + "learning_rate": 1.8907111811818292e-05, + "loss": 1.0415, + "step": 2713 + }, + { + "epoch": 0.46535353751848596, + "grad_norm": 1.578125, + "learning_rate": 1.89062907463873e-05, + "loss": 0.9969, + "step": 2714 + }, + { + "epoch": 0.46552500160747584, + "grad_norm": 1.65625, + "learning_rate": 1.8905469390488635e-05, + "loss": 1.0581, + "step": 2715 + }, + { + "epoch": 0.4656964656964657, + "grad_norm": 1.609375, + "learning_rate": 1.8904647744149077e-05, + "loss": 0.9877, + "step": 2716 + }, + { + "epoch": 0.46586792978545555, + "grad_norm": 1.6875, + "learning_rate": 1.8903825807395422e-05, + "loss": 0.997, + "step": 2717 + }, + { + "epoch": 0.46603939387444543, + "grad_norm": 1.5390625, + "learning_rate": 1.890300358025448e-05, + "loss": 1.0222, + "step": 2718 + }, + { + "epoch": 0.4662108579634353, + "grad_norm": 1.59375, + "learning_rate": 1.8902181062753064e-05, + "loss": 1.0158, + "step": 2719 + }, + { + "epoch": 0.46638232205242514, + "grad_norm": 1.609375, + "learning_rate": 1.8901358254918005e-05, + "loss": 0.9048, + "step": 2720 + }, + { + "epoch": 0.466553786141415, + "grad_norm": 1.6953125, + 
"learning_rate": 1.8900535156776128e-05, + "loss": 1.176, + "step": 2721 + }, + { + "epoch": 0.46672525023040484, + "grad_norm": 1.5546875, + "learning_rate": 1.8899711768354288e-05, + "loss": 1.059, + "step": 2722 + }, + { + "epoch": 0.4668967143193947, + "grad_norm": 1.59375, + "learning_rate": 1.8898888089679332e-05, + "loss": 0.9411, + "step": 2723 + }, + { + "epoch": 0.4670681784083846, + "grad_norm": 1.6875, + "learning_rate": 1.8898064120778126e-05, + "loss": 0.944, + "step": 2724 + }, + { + "epoch": 0.46723964249737443, + "grad_norm": 1.5625, + "learning_rate": 1.889723986167754e-05, + "loss": 1.0461, + "step": 2725 + }, + { + "epoch": 0.4674111065863643, + "grad_norm": 1.59375, + "learning_rate": 1.889641531240446e-05, + "loss": 1.095, + "step": 2726 + }, + { + "epoch": 0.4675825706753542, + "grad_norm": 1.640625, + "learning_rate": 1.8895590472985775e-05, + "loss": 1.0206, + "step": 2727 + }, + { + "epoch": 0.467754034764344, + "grad_norm": 1.6015625, + "learning_rate": 1.889476534344839e-05, + "loss": 1.0456, + "step": 2728 + }, + { + "epoch": 0.4679254988533339, + "grad_norm": 1.5703125, + "learning_rate": 1.889393992381921e-05, + "loss": 1.0462, + "step": 2729 + }, + { + "epoch": 0.4680969629423238, + "grad_norm": 1.5546875, + "learning_rate": 1.8893114214125154e-05, + "loss": 1.0204, + "step": 2730 + }, + { + "epoch": 0.4682684270313136, + "grad_norm": 1.734375, + "learning_rate": 1.889228821439316e-05, + "loss": 1.047, + "step": 2731 + }, + { + "epoch": 0.4684398911203035, + "grad_norm": 1.5703125, + "learning_rate": 1.8891461924650165e-05, + "loss": 0.9852, + "step": 2732 + }, + { + "epoch": 0.46861135520929337, + "grad_norm": 1.6328125, + "learning_rate": 1.8890635344923106e-05, + "loss": 1.0183, + "step": 2733 + }, + { + "epoch": 0.4687828192982832, + "grad_norm": 1.875, + "learning_rate": 1.8889808475238957e-05, + "loss": 1.0666, + "step": 2734 + }, + { + "epoch": 0.4689542833872731, + "grad_norm": 1.578125, + "learning_rate": 1.8888981315624672e-05, + "loss": 0.9941, + "step": 2735 + }, + { + "epoch": 0.46912574747626296, + "grad_norm": 1.5859375, + "learning_rate": 1.8888153866107236e-05, + "loss": 1.0518, + "step": 2736 + }, + { + "epoch": 0.4692972115652528, + "grad_norm": 1.609375, + "learning_rate": 1.888732612671363e-05, + "loss": 1.0533, + "step": 2737 + }, + { + "epoch": 0.46946867565424266, + "grad_norm": 1.609375, + "learning_rate": 1.888649809747086e-05, + "loss": 1.0669, + "step": 2738 + }, + { + "epoch": 0.46964013974323254, + "grad_norm": 1.6484375, + "learning_rate": 1.8885669778405917e-05, + "loss": 1.0136, + "step": 2739 + }, + { + "epoch": 0.46981160383222237, + "grad_norm": 1.5390625, + "learning_rate": 1.8884841169545826e-05, + "loss": 1.0767, + "step": 2740 + }, + { + "epoch": 0.46998306792121225, + "grad_norm": 1.65625, + "learning_rate": 1.8884012270917603e-05, + "loss": 0.9982, + "step": 2741 + }, + { + "epoch": 0.47015453201020213, + "grad_norm": 1.5703125, + "learning_rate": 1.888318308254829e-05, + "loss": 1.0528, + "step": 2742 + }, + { + "epoch": 0.47032599609919196, + "grad_norm": 1.5859375, + "learning_rate": 1.8882353604464923e-05, + "loss": 1.0539, + "step": 2743 + }, + { + "epoch": 0.47049746018818184, + "grad_norm": 1.71875, + "learning_rate": 1.8881523836694556e-05, + "loss": 1.0387, + "step": 2744 + }, + { + "epoch": 0.4706689242771717, + "grad_norm": 1.5546875, + "learning_rate": 1.8880693779264255e-05, + "loss": 0.991, + "step": 2745 + }, + { + "epoch": 0.47084038836616154, + "grad_norm": 1.53125, + "learning_rate": 
1.8879863432201086e-05, + "loss": 0.9775, + "step": 2746 + }, + { + "epoch": 0.4710118524551514, + "grad_norm": 1.5703125, + "learning_rate": 1.8879032795532132e-05, + "loss": 0.9837, + "step": 2747 + }, + { + "epoch": 0.4711833165441413, + "grad_norm": 1.578125, + "learning_rate": 1.8878201869284483e-05, + "loss": 0.9901, + "step": 2748 + }, + { + "epoch": 0.47135478063313113, + "grad_norm": 1.7265625, + "learning_rate": 1.8877370653485242e-05, + "loss": 1.1065, + "step": 2749 + }, + { + "epoch": 0.471526244722121, + "grad_norm": 1.5859375, + "learning_rate": 1.8876539148161514e-05, + "loss": 1.0289, + "step": 2750 + }, + { + "epoch": 0.4716977088111109, + "grad_norm": 1.5703125, + "learning_rate": 1.887570735334042e-05, + "loss": 1.0634, + "step": 2751 + }, + { + "epoch": 0.4718691729001007, + "grad_norm": 1.546875, + "learning_rate": 1.887487526904908e-05, + "loss": 0.9623, + "step": 2752 + }, + { + "epoch": 0.4720406369890906, + "grad_norm": 1.671875, + "learning_rate": 1.8874042895314643e-05, + "loss": 1.0526, + "step": 2753 + }, + { + "epoch": 0.4722121010780805, + "grad_norm": 1.6796875, + "learning_rate": 1.8873210232164248e-05, + "loss": 1.0761, + "step": 2754 + }, + { + "epoch": 0.4723835651670703, + "grad_norm": 1.6484375, + "learning_rate": 1.8872377279625057e-05, + "loss": 1.0276, + "step": 2755 + }, + { + "epoch": 0.4725550292560602, + "grad_norm": 1.578125, + "learning_rate": 1.887154403772423e-05, + "loss": 0.9716, + "step": 2756 + }, + { + "epoch": 0.47272649334505007, + "grad_norm": 1.5390625, + "learning_rate": 1.8870710506488944e-05, + "loss": 1.0427, + "step": 2757 + }, + { + "epoch": 0.4728979574340399, + "grad_norm": 1.578125, + "learning_rate": 1.8869876685946388e-05, + "loss": 1.0066, + "step": 2758 + }, + { + "epoch": 0.4730694215230298, + "grad_norm": 1.5625, + "learning_rate": 1.886904257612375e-05, + "loss": 1.0671, + "step": 2759 + }, + { + "epoch": 0.47324088561201966, + "grad_norm": 1.6015625, + "learning_rate": 1.8868208177048238e-05, + "loss": 0.9966, + "step": 2760 + }, + { + "epoch": 0.4734123497010095, + "grad_norm": 1.6484375, + "learning_rate": 1.8867373488747058e-05, + "loss": 1.091, + "step": 2761 + }, + { + "epoch": 0.47358381378999936, + "grad_norm": 1.5546875, + "learning_rate": 1.886653851124744e-05, + "loss": 1.0605, + "step": 2762 + }, + { + "epoch": 0.47375527787898924, + "grad_norm": 1.578125, + "learning_rate": 1.8865703244576613e-05, + "loss": 0.9792, + "step": 2763 + }, + { + "epoch": 0.47392674196797907, + "grad_norm": 1.6171875, + "learning_rate": 1.8864867688761818e-05, + "loss": 1.1191, + "step": 2764 + }, + { + "epoch": 0.47409820605696895, + "grad_norm": 1.59375, + "learning_rate": 1.8864031843830305e-05, + "loss": 1.0338, + "step": 2765 + }, + { + "epoch": 0.47426967014595883, + "grad_norm": 1.578125, + "learning_rate": 1.8863195709809336e-05, + "loss": 1.0051, + "step": 2766 + }, + { + "epoch": 0.47444113423494866, + "grad_norm": 1.6015625, + "learning_rate": 1.886235928672618e-05, + "loss": 0.9636, + "step": 2767 + }, + { + "epoch": 0.47461259832393854, + "grad_norm": 1.6484375, + "learning_rate": 1.8861522574608113e-05, + "loss": 1.0052, + "step": 2768 + }, + { + "epoch": 0.47478406241292836, + "grad_norm": 1.5859375, + "learning_rate": 1.8860685573482427e-05, + "loss": 1.0447, + "step": 2769 + }, + { + "epoch": 0.47495552650191825, + "grad_norm": 1.65625, + "learning_rate": 1.8859848283376417e-05, + "loss": 1.098, + "step": 2770 + }, + { + "epoch": 0.4751269905909081, + "grad_norm": 1.703125, + "learning_rate": 
1.885901070431739e-05, + "loss": 1.0925, + "step": 2771 + }, + { + "epoch": 0.47529845467989795, + "grad_norm": 1.6015625, + "learning_rate": 1.8858172836332667e-05, + "loss": 0.9781, + "step": 2772 + }, + { + "epoch": 0.47546991876888783, + "grad_norm": 1.5859375, + "learning_rate": 1.885733467944957e-05, + "loss": 1.0624, + "step": 2773 + }, + { + "epoch": 0.4756413828578777, + "grad_norm": 1.6171875, + "learning_rate": 1.8856496233695435e-05, + "loss": 1.097, + "step": 2774 + }, + { + "epoch": 0.47581284694686754, + "grad_norm": 1.6484375, + "learning_rate": 1.885565749909761e-05, + "loss": 1.0228, + "step": 2775 + }, + { + "epoch": 0.4759843110358574, + "grad_norm": 1.59375, + "learning_rate": 1.885481847568344e-05, + "loss": 1.0667, + "step": 2776 + }, + { + "epoch": 0.4761557751248473, + "grad_norm": 1.5703125, + "learning_rate": 1.8853979163480302e-05, + "loss": 0.9551, + "step": 2777 + }, + { + "epoch": 0.47632723921383713, + "grad_norm": 1.6484375, + "learning_rate": 1.885313956251556e-05, + "loss": 1.0382, + "step": 2778 + }, + { + "epoch": 0.476498703302827, + "grad_norm": 1.6484375, + "learning_rate": 1.8852299672816596e-05, + "loss": 1.0474, + "step": 2779 + }, + { + "epoch": 0.4766701673918169, + "grad_norm": 1.6484375, + "learning_rate": 1.885145949441081e-05, + "loss": 0.9978, + "step": 2780 + }, + { + "epoch": 0.4768416314808067, + "grad_norm": 1.5546875, + "learning_rate": 1.8850619027325595e-05, + "loss": 0.9717, + "step": 2781 + }, + { + "epoch": 0.4770130955697966, + "grad_norm": 1.625, + "learning_rate": 1.884977827158837e-05, + "loss": 1.0869, + "step": 2782 + }, + { + "epoch": 0.4771845596587865, + "grad_norm": 1.578125, + "learning_rate": 1.8848937227226542e-05, + "loss": 1.065, + "step": 2783 + }, + { + "epoch": 0.4773560237477763, + "grad_norm": 1.765625, + "learning_rate": 1.8848095894267556e-05, + "loss": 1.1608, + "step": 2784 + }, + { + "epoch": 0.4775274878367662, + "grad_norm": 1.7265625, + "learning_rate": 1.884725427273884e-05, + "loss": 1.1015, + "step": 2785 + }, + { + "epoch": 0.47769895192575607, + "grad_norm": 1.640625, + "learning_rate": 1.884641236266785e-05, + "loss": 1.0337, + "step": 2786 + }, + { + "epoch": 0.4778704160147459, + "grad_norm": 1.515625, + "learning_rate": 1.8845570164082038e-05, + "loss": 0.9727, + "step": 2787 + }, + { + "epoch": 0.4780418801037358, + "grad_norm": 1.5859375, + "learning_rate": 1.8844727677008877e-05, + "loss": 1.0569, + "step": 2788 + }, + { + "epoch": 0.47821334419272565, + "grad_norm": 1.59375, + "learning_rate": 1.8843884901475835e-05, + "loss": 1.0511, + "step": 2789 + }, + { + "epoch": 0.4783848082817155, + "grad_norm": 1.6796875, + "learning_rate": 1.8843041837510408e-05, + "loss": 1.0175, + "step": 2790 + }, + { + "epoch": 0.47855627237070536, + "grad_norm": 1.7265625, + "learning_rate": 1.8842198485140084e-05, + "loss": 1.0046, + "step": 2791 + }, + { + "epoch": 0.47872773645969524, + "grad_norm": 1.703125, + "learning_rate": 1.884135484439237e-05, + "loss": 1.0533, + "step": 2792 + }, + { + "epoch": 0.47889920054868507, + "grad_norm": 1.7109375, + "learning_rate": 1.8840510915294785e-05, + "loss": 1.0449, + "step": 2793 + }, + { + "epoch": 0.47907066463767495, + "grad_norm": 1.53125, + "learning_rate": 1.8839666697874845e-05, + "loss": 0.9818, + "step": 2794 + }, + { + "epoch": 0.47924212872666483, + "grad_norm": 1.5625, + "learning_rate": 1.883882219216009e-05, + "loss": 1.0963, + "step": 2795 + }, + { + "epoch": 0.47941359281565465, + "grad_norm": 1.515625, + "learning_rate": 1.8837977398178058e-05, + 
"loss": 1.0716, + "step": 2796 + }, + { + "epoch": 0.47958505690464454, + "grad_norm": 1.6328125, + "learning_rate": 1.88371323159563e-05, + "loss": 0.9346, + "step": 2797 + }, + { + "epoch": 0.4797565209936344, + "grad_norm": 1.546875, + "learning_rate": 1.8836286945522384e-05, + "loss": 1.0463, + "step": 2798 + }, + { + "epoch": 0.47992798508262424, + "grad_norm": 1.53125, + "learning_rate": 1.8835441286903874e-05, + "loss": 1.0739, + "step": 2799 + }, + { + "epoch": 0.4800994491716141, + "grad_norm": 1.5078125, + "learning_rate": 1.883459534012835e-05, + "loss": 0.9795, + "step": 2800 + }, + { + "epoch": 0.4800994491716141, + "eval_loss": 0.8867102265357971, + "eval_runtime": 837.1871, + "eval_samples_per_second": 2.985, + "eval_steps_per_second": 2.985, + "step": 2800 + }, + { + "epoch": 0.480270913260604, + "grad_norm": 1.578125, + "learning_rate": 1.8833749105223406e-05, + "loss": 1.0323, + "step": 2801 + }, + { + "epoch": 0.48044237734959383, + "grad_norm": 1.6171875, + "learning_rate": 1.883290258221664e-05, + "loss": 1.1122, + "step": 2802 + }, + { + "epoch": 0.4806138414385837, + "grad_norm": 1.640625, + "learning_rate": 1.8832055771135657e-05, + "loss": 1.1051, + "step": 2803 + }, + { + "epoch": 0.4807853055275736, + "grad_norm": 1.546875, + "learning_rate": 1.8831208672008082e-05, + "loss": 1.0378, + "step": 2804 + }, + { + "epoch": 0.4809567696165634, + "grad_norm": 1.59375, + "learning_rate": 1.8830361284861532e-05, + "loss": 1.0658, + "step": 2805 + }, + { + "epoch": 0.4811282337055533, + "grad_norm": 1.640625, + "learning_rate": 1.882951360972365e-05, + "loss": 1.0673, + "step": 2806 + }, + { + "epoch": 0.4812996977945432, + "grad_norm": 1.5703125, + "learning_rate": 1.882866564662208e-05, + "loss": 0.9835, + "step": 2807 + }, + { + "epoch": 0.481471161883533, + "grad_norm": 1.5703125, + "learning_rate": 1.8827817395584474e-05, + "loss": 1.0338, + "step": 2808 + }, + { + "epoch": 0.4816426259725229, + "grad_norm": 1.6953125, + "learning_rate": 1.8826968856638504e-05, + "loss": 1.0203, + "step": 2809 + }, + { + "epoch": 0.48181409006151277, + "grad_norm": 1.7265625, + "learning_rate": 1.8826120029811844e-05, + "loss": 1.0846, + "step": 2810 + }, + { + "epoch": 0.4819855541505026, + "grad_norm": 1.609375, + "learning_rate": 1.882527091513217e-05, + "loss": 1.1038, + "step": 2811 + }, + { + "epoch": 0.4821570182394925, + "grad_norm": 1.46875, + "learning_rate": 1.882442151262718e-05, + "loss": 0.9386, + "step": 2812 + }, + { + "epoch": 0.48232848232848236, + "grad_norm": 1.546875, + "learning_rate": 1.882357182232457e-05, + "loss": 1.0621, + "step": 2813 + }, + { + "epoch": 0.4824999464174722, + "grad_norm": 1.59375, + "learning_rate": 1.882272184425206e-05, + "loss": 1.088, + "step": 2814 + }, + { + "epoch": 0.48267141050646206, + "grad_norm": 1.6640625, + "learning_rate": 1.8821871578437367e-05, + "loss": 1.0637, + "step": 2815 + }, + { + "epoch": 0.4828428745954519, + "grad_norm": 1.578125, + "learning_rate": 1.8821021024908223e-05, + "loss": 1.0955, + "step": 2816 + }, + { + "epoch": 0.48301433868444177, + "grad_norm": 1.5625, + "learning_rate": 1.8820170183692364e-05, + "loss": 0.9826, + "step": 2817 + }, + { + "epoch": 0.48318580277343165, + "grad_norm": 1.6015625, + "learning_rate": 1.8819319054817543e-05, + "loss": 1.0452, + "step": 2818 + }, + { + "epoch": 0.4833572668624215, + "grad_norm": 1.640625, + "learning_rate": 1.8818467638311516e-05, + "loss": 1.0884, + "step": 2819 + }, + { + "epoch": 0.48352873095141136, + "grad_norm": 1.6015625, + "learning_rate": 
1.8817615934202055e-05, + "loss": 1.1022, + "step": 2820 + }, + { + "epoch": 0.48370019504040124, + "grad_norm": 1.5546875, + "learning_rate": 1.881676394251693e-05, + "loss": 1.0365, + "step": 2821 + }, + { + "epoch": 0.48387165912939106, + "grad_norm": 1.515625, + "learning_rate": 1.8815911663283936e-05, + "loss": 0.9715, + "step": 2822 + }, + { + "epoch": 0.48404312321838094, + "grad_norm": 1.578125, + "learning_rate": 1.8815059096530863e-05, + "loss": 1.056, + "step": 2823 + }, + { + "epoch": 0.4842145873073708, + "grad_norm": 1.5546875, + "learning_rate": 1.881420624228552e-05, + "loss": 1.0723, + "step": 2824 + }, + { + "epoch": 0.48438605139636065, + "grad_norm": 1.6484375, + "learning_rate": 1.8813353100575716e-05, + "loss": 0.942, + "step": 2825 + }, + { + "epoch": 0.48455751548535053, + "grad_norm": 1.5234375, + "learning_rate": 1.8812499671429286e-05, + "loss": 1.0086, + "step": 2826 + }, + { + "epoch": 0.4847289795743404, + "grad_norm": 1.6796875, + "learning_rate": 1.8811645954874053e-05, + "loss": 1.0392, + "step": 2827 + }, + { + "epoch": 0.48490044366333024, + "grad_norm": 1.4921875, + "learning_rate": 1.8810791950937864e-05, + "loss": 1.002, + "step": 2828 + }, + { + "epoch": 0.4850719077523201, + "grad_norm": 1.5546875, + "learning_rate": 1.880993765964857e-05, + "loss": 0.9703, + "step": 2829 + }, + { + "epoch": 0.48524337184131, + "grad_norm": 1.640625, + "learning_rate": 1.8809083081034035e-05, + "loss": 1.1082, + "step": 2830 + }, + { + "epoch": 0.4854148359302998, + "grad_norm": 1.640625, + "learning_rate": 1.8808228215122127e-05, + "loss": 0.9924, + "step": 2831 + }, + { + "epoch": 0.4855863000192897, + "grad_norm": 2.03125, + "learning_rate": 1.880737306194073e-05, + "loss": 1.0646, + "step": 2832 + }, + { + "epoch": 0.4857577641082796, + "grad_norm": 1.5625, + "learning_rate": 1.8806517621517733e-05, + "loss": 1.0148, + "step": 2833 + }, + { + "epoch": 0.4859292281972694, + "grad_norm": 1.6484375, + "learning_rate": 1.880566189388103e-05, + "loss": 0.9831, + "step": 2834 + }, + { + "epoch": 0.4861006922862593, + "grad_norm": 1.5078125, + "learning_rate": 1.8804805879058538e-05, + "loss": 0.9783, + "step": 2835 + }, + { + "epoch": 0.4862721563752492, + "grad_norm": 1.546875, + "learning_rate": 1.8803949577078172e-05, + "loss": 0.9806, + "step": 2836 + }, + { + "epoch": 0.486443620464239, + "grad_norm": 1.6171875, + "learning_rate": 1.8803092987967853e-05, + "loss": 1.0607, + "step": 2837 + }, + { + "epoch": 0.4866150845532289, + "grad_norm": 1.5546875, + "learning_rate": 1.8802236111755524e-05, + "loss": 0.9994, + "step": 2838 + }, + { + "epoch": 0.48678654864221876, + "grad_norm": 1.6328125, + "learning_rate": 1.880137894846913e-05, + "loss": 1.053, + "step": 2839 + }, + { + "epoch": 0.4869580127312086, + "grad_norm": 1.65625, + "learning_rate": 1.8800521498136622e-05, + "loss": 1.0909, + "step": 2840 + }, + { + "epoch": 0.48712947682019847, + "grad_norm": 1.6484375, + "learning_rate": 1.8799663760785973e-05, + "loss": 1.0357, + "step": 2841 + }, + { + "epoch": 0.48730094090918835, + "grad_norm": 1.609375, + "learning_rate": 1.8798805736445153e-05, + "loss": 1.0848, + "step": 2842 + }, + { + "epoch": 0.4874724049981782, + "grad_norm": 1.578125, + "learning_rate": 1.8797947425142137e-05, + "loss": 1.0419, + "step": 2843 + }, + { + "epoch": 0.48764386908716806, + "grad_norm": 1.6875, + "learning_rate": 1.879708882690493e-05, + "loss": 1.15, + "step": 2844 + }, + { + "epoch": 0.48781533317615794, + "grad_norm": 1.546875, + "learning_rate": 1.879622994176153e-05, + 
"loss": 1.1013, + "step": 2845 + }, + { + "epoch": 0.48798679726514776, + "grad_norm": 1.6875, + "learning_rate": 1.879537076973995e-05, + "loss": 1.0203, + "step": 2846 + }, + { + "epoch": 0.48815826135413765, + "grad_norm": 1.640625, + "learning_rate": 1.8794511310868208e-05, + "loss": 0.9994, + "step": 2847 + }, + { + "epoch": 0.4883297254431275, + "grad_norm": 1.6171875, + "learning_rate": 1.8793651565174333e-05, + "loss": 1.0928, + "step": 2848 + }, + { + "epoch": 0.48850118953211735, + "grad_norm": 1.671875, + "learning_rate": 1.879279153268637e-05, + "loss": 1.0635, + "step": 2849 + }, + { + "epoch": 0.48867265362110723, + "grad_norm": 1.65625, + "learning_rate": 1.879193121343236e-05, + "loss": 1.1594, + "step": 2850 + }, + { + "epoch": 0.4888441177100971, + "grad_norm": 1.6640625, + "learning_rate": 1.879107060744037e-05, + "loss": 0.9951, + "step": 2851 + }, + { + "epoch": 0.48901558179908694, + "grad_norm": 1.6328125, + "learning_rate": 1.8790209714738462e-05, + "loss": 1.0426, + "step": 2852 + }, + { + "epoch": 0.4891870458880768, + "grad_norm": 1.578125, + "learning_rate": 1.8789348535354714e-05, + "loss": 1.0581, + "step": 2853 + }, + { + "epoch": 0.4893585099770667, + "grad_norm": 1.6328125, + "learning_rate": 1.8788487069317208e-05, + "loss": 1.0374, + "step": 2854 + }, + { + "epoch": 0.48952997406605653, + "grad_norm": 1.640625, + "learning_rate": 1.878762531665405e-05, + "loss": 1.0132, + "step": 2855 + }, + { + "epoch": 0.4897014381550464, + "grad_norm": 1.625, + "learning_rate": 1.878676327739334e-05, + "loss": 1.0167, + "step": 2856 + }, + { + "epoch": 0.4898729022440363, + "grad_norm": 1.6640625, + "learning_rate": 1.878590095156319e-05, + "loss": 1.0232, + "step": 2857 + }, + { + "epoch": 0.4900443663330261, + "grad_norm": 1.546875, + "learning_rate": 1.8785038339191723e-05, + "loss": 1.0708, + "step": 2858 + }, + { + "epoch": 0.490215830422016, + "grad_norm": 1.6015625, + "learning_rate": 1.8784175440307076e-05, + "loss": 1.0704, + "step": 2859 + }, + { + "epoch": 0.4903872945110059, + "grad_norm": 1.640625, + "learning_rate": 1.878331225493739e-05, + "loss": 1.0848, + "step": 2860 + }, + { + "epoch": 0.4905587585999957, + "grad_norm": 1.734375, + "learning_rate": 1.878244878311082e-05, + "loss": 1.0802, + "step": 2861 + }, + { + "epoch": 0.4907302226889856, + "grad_norm": 1.5703125, + "learning_rate": 1.8781585024855517e-05, + "loss": 1.0438, + "step": 2862 + }, + { + "epoch": 0.4909016867779754, + "grad_norm": 1.640625, + "learning_rate": 1.878072098019966e-05, + "loss": 0.9577, + "step": 2863 + }, + { + "epoch": 0.4910731508669653, + "grad_norm": 1.6953125, + "learning_rate": 1.8779856649171427e-05, + "loss": 1.0269, + "step": 2864 + }, + { + "epoch": 0.49124461495595517, + "grad_norm": 1.65625, + "learning_rate": 1.877899203179901e-05, + "loss": 0.978, + "step": 2865 + }, + { + "epoch": 0.491416079044945, + "grad_norm": 1.7734375, + "learning_rate": 1.8778127128110602e-05, + "loss": 1.0305, + "step": 2866 + }, + { + "epoch": 0.4915875431339349, + "grad_norm": 1.5234375, + "learning_rate": 1.877726193813441e-05, + "loss": 1.0869, + "step": 2867 + }, + { + "epoch": 0.49175900722292476, + "grad_norm": 1.640625, + "learning_rate": 1.877639646189866e-05, + "loss": 1.0588, + "step": 2868 + }, + { + "epoch": 0.4919304713119146, + "grad_norm": 1.5703125, + "learning_rate": 1.8775530699431566e-05, + "loss": 1.0311, + "step": 2869 + }, + { + "epoch": 0.49210193540090447, + "grad_norm": 1.578125, + "learning_rate": 1.8774664650761373e-05, + "loss": 1.0879, + "step": 2870 + 
}, + { + "epoch": 0.49227339948989435, + "grad_norm": 1.625, + "learning_rate": 1.8773798315916324e-05, + "loss": 1.0146, + "step": 2871 + }, + { + "epoch": 0.4924448635788842, + "grad_norm": 1.609375, + "learning_rate": 1.8772931694924677e-05, + "loss": 1.0515, + "step": 2872 + }, + { + "epoch": 0.49261632766787405, + "grad_norm": 1.59375, + "learning_rate": 1.8772064787814686e-05, + "loss": 1.0498, + "step": 2873 + }, + { + "epoch": 0.49278779175686394, + "grad_norm": 1.6171875, + "learning_rate": 1.877119759461463e-05, + "loss": 0.9728, + "step": 2874 + }, + { + "epoch": 0.49295925584585376, + "grad_norm": 1.609375, + "learning_rate": 1.8770330115352797e-05, + "loss": 1.0578, + "step": 2875 + }, + { + "epoch": 0.49313071993484364, + "grad_norm": 1.6015625, + "learning_rate": 1.8769462350057467e-05, + "loss": 1.0311, + "step": 2876 + }, + { + "epoch": 0.4933021840238335, + "grad_norm": 1.6484375, + "learning_rate": 1.876859429875695e-05, + "loss": 1.0546, + "step": 2877 + }, + { + "epoch": 0.49347364811282335, + "grad_norm": 1.6875, + "learning_rate": 1.8767725961479548e-05, + "loss": 0.9694, + "step": 2878 + }, + { + "epoch": 0.49364511220181323, + "grad_norm": 1.5390625, + "learning_rate": 1.876685733825359e-05, + "loss": 0.9932, + "step": 2879 + }, + { + "epoch": 0.4938165762908031, + "grad_norm": 1.59375, + "learning_rate": 1.87659884291074e-05, + "loss": 1.0917, + "step": 2880 + }, + { + "epoch": 0.49398804037979294, + "grad_norm": 1.6953125, + "learning_rate": 1.876511923406932e-05, + "loss": 1.0881, + "step": 2881 + }, + { + "epoch": 0.4941595044687828, + "grad_norm": 1.609375, + "learning_rate": 1.8764249753167693e-05, + "loss": 1.0479, + "step": 2882 + }, + { + "epoch": 0.4943309685577727, + "grad_norm": 1.609375, + "learning_rate": 1.8763379986430884e-05, + "loss": 1.1382, + "step": 2883 + }, + { + "epoch": 0.4945024326467625, + "grad_norm": 1.6171875, + "learning_rate": 1.8762509933887248e-05, + "loss": 1.0526, + "step": 2884 + }, + { + "epoch": 0.4946738967357524, + "grad_norm": 1.5078125, + "learning_rate": 1.8761639595565166e-05, + "loss": 0.9602, + "step": 2885 + }, + { + "epoch": 0.4948453608247423, + "grad_norm": 1.7109375, + "learning_rate": 1.876076897149303e-05, + "loss": 0.9996, + "step": 2886 + }, + { + "epoch": 0.4950168249137321, + "grad_norm": 1.59375, + "learning_rate": 1.8759898061699223e-05, + "loss": 1.0105, + "step": 2887 + }, + { + "epoch": 0.495188289002722, + "grad_norm": 1.625, + "learning_rate": 1.875902686621215e-05, + "loss": 1.0099, + "step": 2888 + }, + { + "epoch": 0.4953597530917119, + "grad_norm": 1.578125, + "learning_rate": 1.8758155385060232e-05, + "loss": 1.0506, + "step": 2889 + }, + { + "epoch": 0.4955312171807017, + "grad_norm": 1.640625, + "learning_rate": 1.8757283618271887e-05, + "loss": 1.0378, + "step": 2890 + }, + { + "epoch": 0.4957026812696916, + "grad_norm": 1.6640625, + "learning_rate": 1.8756411565875545e-05, + "loss": 1.152, + "step": 2891 + }, + { + "epoch": 0.49587414535868146, + "grad_norm": 1.6328125, + "learning_rate": 1.875553922789965e-05, + "loss": 1.0665, + "step": 2892 + }, + { + "epoch": 0.4960456094476713, + "grad_norm": 1.5390625, + "learning_rate": 1.875466660437265e-05, + "loss": 1.0447, + "step": 2893 + }, + { + "epoch": 0.49621707353666117, + "grad_norm": 1.546875, + "learning_rate": 1.8753793695323e-05, + "loss": 0.9895, + "step": 2894 + }, + { + "epoch": 0.49638853762565105, + "grad_norm": 1.5859375, + "learning_rate": 1.875292050077918e-05, + "loss": 0.9806, + "step": 2895 + }, + { + "epoch": 
0.4965600017146409, + "grad_norm": 1.65625, + "learning_rate": 1.8752047020769663e-05, + "loss": 1.0423, + "step": 2896 + }, + { + "epoch": 0.49673146580363076, + "grad_norm": 1.8203125, + "learning_rate": 1.875117325532293e-05, + "loss": 1.0898, + "step": 2897 + }, + { + "epoch": 0.49690292989262064, + "grad_norm": 1.625, + "learning_rate": 1.8750299204467485e-05, + "loss": 1.0848, + "step": 2898 + }, + { + "epoch": 0.49707439398161046, + "grad_norm": 1.53125, + "learning_rate": 1.8749424868231837e-05, + "loss": 0.9389, + "step": 2899 + }, + { + "epoch": 0.49724585807060034, + "grad_norm": 1.6953125, + "learning_rate": 1.874855024664449e-05, + "loss": 1.0633, + "step": 2900 + }, + { + "epoch": 0.4974173221595902, + "grad_norm": 1.578125, + "learning_rate": 1.874767533973398e-05, + "loss": 1.0126, + "step": 2901 + }, + { + "epoch": 0.49758878624858005, + "grad_norm": 1.6328125, + "learning_rate": 1.874680014752883e-05, + "loss": 1.0707, + "step": 2902 + }, + { + "epoch": 0.49776025033756993, + "grad_norm": 1.7578125, + "learning_rate": 1.87459246700576e-05, + "loss": 1.0622, + "step": 2903 + }, + { + "epoch": 0.4979317144265598, + "grad_norm": 1.6484375, + "learning_rate": 1.8745048907348824e-05, + "loss": 1.0176, + "step": 2904 + }, + { + "epoch": 0.49810317851554964, + "grad_norm": 1.5625, + "learning_rate": 1.874417285943108e-05, + "loss": 1.0011, + "step": 2905 + }, + { + "epoch": 0.4982746426045395, + "grad_norm": 1.6015625, + "learning_rate": 1.8743296526332924e-05, + "loss": 1.0018, + "step": 2906 + }, + { + "epoch": 0.4984461066935294, + "grad_norm": 1.59375, + "learning_rate": 1.8742419908082946e-05, + "loss": 1.0249, + "step": 2907 + }, + { + "epoch": 0.4986175707825192, + "grad_norm": 1.5, + "learning_rate": 1.8741543004709735e-05, + "loss": 0.851, + "step": 2908 + }, + { + "epoch": 0.4987890348715091, + "grad_norm": 1.6015625, + "learning_rate": 1.874066581624189e-05, + "loss": 1.0617, + "step": 2909 + }, + { + "epoch": 0.498960498960499, + "grad_norm": 1.6875, + "learning_rate": 1.8739788342708016e-05, + "loss": 0.9858, + "step": 2910 + }, + { + "epoch": 0.4991319630494888, + "grad_norm": 1.546875, + "learning_rate": 1.8738910584136735e-05, + "loss": 1.0821, + "step": 2911 + }, + { + "epoch": 0.4993034271384787, + "grad_norm": 1.5625, + "learning_rate": 1.873803254055667e-05, + "loss": 1.0519, + "step": 2912 + }, + { + "epoch": 0.4994748912274685, + "grad_norm": 1.515625, + "learning_rate": 1.873715421199646e-05, + "loss": 0.9833, + "step": 2913 + }, + { + "epoch": 0.4996463553164584, + "grad_norm": 1.609375, + "learning_rate": 1.8736275598484753e-05, + "loss": 1.0917, + "step": 2914 + }, + { + "epoch": 0.4998178194054483, + "grad_norm": 1.5546875, + "learning_rate": 1.8735396700050202e-05, + "loss": 0.992, + "step": 2915 + }, + { + "epoch": 0.4999892834944381, + "grad_norm": 1.546875, + "learning_rate": 1.8734517516721467e-05, + "loss": 0.9666, + "step": 2916 + }, + { + "epoch": 0.500160747583428, + "grad_norm": 1.5546875, + "learning_rate": 1.8733638048527223e-05, + "loss": 1.0, + "step": 2917 + }, + { + "epoch": 0.5003322116724178, + "grad_norm": 1.5546875, + "learning_rate": 1.8732758295496158e-05, + "loss": 0.9783, + "step": 2918 + }, + { + "epoch": 0.5005036757614078, + "grad_norm": 1.640625, + "learning_rate": 1.8731878257656956e-05, + "loss": 1.1255, + "step": 2919 + }, + { + "epoch": 0.5006751398503976, + "grad_norm": 1.4921875, + "learning_rate": 1.8730997935038328e-05, + "loss": 1.0204, + "step": 2920 + }, + { + "epoch": 0.5008466039393874, + "grad_norm": 1.5703125, 
+ "learning_rate": 1.8730117327668975e-05, + "loss": 0.9479, + "step": 2921 + }, + { + "epoch": 0.5010180680283773, + "grad_norm": 1.625, + "learning_rate": 1.8729236435577625e-05, + "loss": 0.9952, + "step": 2922 + }, + { + "epoch": 0.5011895321173672, + "grad_norm": 1.609375, + "learning_rate": 1.8728355258793e-05, + "loss": 1.0075, + "step": 2923 + }, + { + "epoch": 0.501360996206357, + "grad_norm": 1.6796875, + "learning_rate": 1.8727473797343846e-05, + "loss": 1.0614, + "step": 2924 + }, + { + "epoch": 0.5015324602953469, + "grad_norm": 1.640625, + "learning_rate": 1.8726592051258906e-05, + "loss": 0.9874, + "step": 2925 + }, + { + "epoch": 0.5017039243843368, + "grad_norm": 1.625, + "learning_rate": 1.8725710020566936e-05, + "loss": 1.0229, + "step": 2926 + }, + { + "epoch": 0.5018753884733266, + "grad_norm": 1.5703125, + "learning_rate": 1.8724827705296706e-05, + "loss": 1.0575, + "step": 2927 + }, + { + "epoch": 0.5020468525623165, + "grad_norm": 1.6328125, + "learning_rate": 1.872394510547699e-05, + "loss": 1.0114, + "step": 2928 + }, + { + "epoch": 0.5022183166513063, + "grad_norm": 1.6796875, + "learning_rate": 1.872306222113657e-05, + "loss": 1.1254, + "step": 2929 + }, + { + "epoch": 0.5023897807402962, + "grad_norm": 1.59375, + "learning_rate": 1.8722179052304245e-05, + "loss": 1.0574, + "step": 2930 + }, + { + "epoch": 0.5025612448292861, + "grad_norm": 1.625, + "learning_rate": 1.8721295599008815e-05, + "loss": 1.0549, + "step": 2931 + }, + { + "epoch": 0.5027327089182759, + "grad_norm": 1.5078125, + "learning_rate": 1.8720411861279094e-05, + "loss": 0.9429, + "step": 2932 + }, + { + "epoch": 0.5029041730072658, + "grad_norm": 1.5546875, + "learning_rate": 1.8719527839143906e-05, + "loss": 0.9691, + "step": 2933 + }, + { + "epoch": 0.5030756370962557, + "grad_norm": 1.5859375, + "learning_rate": 1.8718643532632083e-05, + "loss": 1.0602, + "step": 2934 + }, + { + "epoch": 0.5032471011852455, + "grad_norm": 1.6484375, + "learning_rate": 1.8717758941772458e-05, + "loss": 1.0318, + "step": 2935 + }, + { + "epoch": 0.5034185652742353, + "grad_norm": 1.6484375, + "learning_rate": 1.8716874066593885e-05, + "loss": 0.9521, + "step": 2936 + }, + { + "epoch": 0.5035900293632253, + "grad_norm": 1.609375, + "learning_rate": 1.871598890712523e-05, + "loss": 1.0152, + "step": 2937 + }, + { + "epoch": 0.5037614934522151, + "grad_norm": 1.6171875, + "learning_rate": 1.8715103463395352e-05, + "loss": 1.0642, + "step": 2938 + }, + { + "epoch": 0.5039329575412049, + "grad_norm": 1.65625, + "learning_rate": 1.8714217735433132e-05, + "loss": 1.0613, + "step": 2939 + }, + { + "epoch": 0.5041044216301949, + "grad_norm": 1.703125, + "learning_rate": 1.871333172326746e-05, + "loss": 1.142, + "step": 2940 + }, + { + "epoch": 0.5042758857191847, + "grad_norm": 1.65625, + "learning_rate": 1.8712445426927225e-05, + "loss": 1.0557, + "step": 2941 + }, + { + "epoch": 0.5044473498081745, + "grad_norm": 1.5859375, + "learning_rate": 1.8711558846441336e-05, + "loss": 1.0868, + "step": 2942 + }, + { + "epoch": 0.5046188138971645, + "grad_norm": 1.671875, + "learning_rate": 1.871067198183871e-05, + "loss": 1.0208, + "step": 2943 + }, + { + "epoch": 0.5047902779861543, + "grad_norm": 1.65625, + "learning_rate": 1.870978483314827e-05, + "loss": 1.069, + "step": 2944 + }, + { + "epoch": 0.5049617420751441, + "grad_norm": 1.6875, + "learning_rate": 1.870889740039895e-05, + "loss": 1.0765, + "step": 2945 + }, + { + "epoch": 0.505133206164134, + "grad_norm": 1.5859375, + "learning_rate": 1.8708009683619684e-05, + 
"loss": 0.9706, + "step": 2946 + }, + { + "epoch": 0.5053046702531239, + "grad_norm": 1.65625, + "learning_rate": 1.870712168283944e-05, + "loss": 1.0681, + "step": 2947 + }, + { + "epoch": 0.5054761343421137, + "grad_norm": 1.5703125, + "learning_rate": 1.8706233398087166e-05, + "loss": 0.9911, + "step": 2948 + }, + { + "epoch": 0.5056475984311036, + "grad_norm": 1.7265625, + "learning_rate": 1.8705344829391835e-05, + "loss": 1.1497, + "step": 2949 + }, + { + "epoch": 0.5058190625200935, + "grad_norm": 1.6328125, + "learning_rate": 1.8704455976782427e-05, + "loss": 0.9842, + "step": 2950 + }, + { + "epoch": 0.5059905266090833, + "grad_norm": 1.6328125, + "learning_rate": 1.8703566840287934e-05, + "loss": 1.07, + "step": 2951 + }, + { + "epoch": 0.5061619906980732, + "grad_norm": 1.5625, + "learning_rate": 1.8702677419937353e-05, + "loss": 0.9317, + "step": 2952 + }, + { + "epoch": 0.506333454787063, + "grad_norm": 1.5859375, + "learning_rate": 1.870178771575969e-05, + "loss": 1.101, + "step": 2953 + }, + { + "epoch": 0.5065049188760529, + "grad_norm": 1.59375, + "learning_rate": 1.8700897727783957e-05, + "loss": 1.1308, + "step": 2954 + }, + { + "epoch": 0.5066763829650428, + "grad_norm": 1.5390625, + "learning_rate": 1.8700007456039188e-05, + "loss": 0.9863, + "step": 2955 + }, + { + "epoch": 0.5068478470540326, + "grad_norm": 1.6796875, + "learning_rate": 1.8699116900554414e-05, + "loss": 1.0991, + "step": 2956 + }, + { + "epoch": 0.5070193111430225, + "grad_norm": 1.53125, + "learning_rate": 1.8698226061358685e-05, + "loss": 1.126, + "step": 2957 + }, + { + "epoch": 0.5071907752320124, + "grad_norm": 1.5390625, + "learning_rate": 1.8697334938481044e-05, + "loss": 1.0232, + "step": 2958 + }, + { + "epoch": 0.5073622393210022, + "grad_norm": 1.65625, + "learning_rate": 1.869644353195056e-05, + "loss": 1.0008, + "step": 2959 + }, + { + "epoch": 0.507533703409992, + "grad_norm": 1.6171875, + "learning_rate": 1.8695551841796305e-05, + "loss": 1.014, + "step": 2960 + }, + { + "epoch": 0.507705167498982, + "grad_norm": 1.578125, + "learning_rate": 1.869465986804736e-05, + "loss": 0.9823, + "step": 2961 + }, + { + "epoch": 0.5078766315879718, + "grad_norm": 1.5546875, + "learning_rate": 1.8693767610732815e-05, + "loss": 0.9592, + "step": 2962 + }, + { + "epoch": 0.5080480956769616, + "grad_norm": 1.578125, + "learning_rate": 1.8692875069881773e-05, + "loss": 1.0477, + "step": 2963 + }, + { + "epoch": 0.5082195597659516, + "grad_norm": 1.671875, + "learning_rate": 1.8691982245523336e-05, + "loss": 1.0331, + "step": 2964 + }, + { + "epoch": 0.5083910238549414, + "grad_norm": 1.5859375, + "learning_rate": 1.8691089137686633e-05, + "loss": 0.9992, + "step": 2965 + }, + { + "epoch": 0.5085624879439312, + "grad_norm": 1.5546875, + "learning_rate": 1.869019574640078e-05, + "loss": 1.0193, + "step": 2966 + }, + { + "epoch": 0.5087339520329212, + "grad_norm": 1.5546875, + "learning_rate": 1.8689302071694925e-05, + "loss": 1.021, + "step": 2967 + }, + { + "epoch": 0.508905416121911, + "grad_norm": 1.6640625, + "learning_rate": 1.8688408113598205e-05, + "loss": 1.0477, + "step": 2968 + }, + { + "epoch": 0.5090768802109008, + "grad_norm": 1.546875, + "learning_rate": 1.868751387213978e-05, + "loss": 0.9475, + "step": 2969 + }, + { + "epoch": 0.5092483442998907, + "grad_norm": 1.6015625, + "learning_rate": 1.868661934734881e-05, + "loss": 1.0664, + "step": 2970 + }, + { + "epoch": 0.5094198083888806, + "grad_norm": 1.609375, + "learning_rate": 1.8685724539254478e-05, + "loss": 1.0631, + "step": 2971 + }, + 
{ + "epoch": 0.5095912724778704, + "grad_norm": 1.578125, + "learning_rate": 1.8684829447885958e-05, + "loss": 0.9959, + "step": 2972 + }, + { + "epoch": 0.5097627365668603, + "grad_norm": 1.640625, + "learning_rate": 1.868393407327245e-05, + "loss": 1.026, + "step": 2973 + }, + { + "epoch": 0.5099342006558502, + "grad_norm": 1.6015625, + "learning_rate": 1.8683038415443143e-05, + "loss": 0.9877, + "step": 2974 + }, + { + "epoch": 0.51010566474484, + "grad_norm": 1.5703125, + "learning_rate": 1.8682142474427264e-05, + "loss": 1.0109, + "step": 2975 + }, + { + "epoch": 0.5102771288338299, + "grad_norm": 1.671875, + "learning_rate": 1.868124625025402e-05, + "loss": 1.0571, + "step": 2976 + }, + { + "epoch": 0.5104485929228197, + "grad_norm": 1.6328125, + "learning_rate": 1.8680349742952648e-05, + "loss": 1.0695, + "step": 2977 + }, + { + "epoch": 0.5106200570118096, + "grad_norm": 1.5546875, + "learning_rate": 1.867945295255238e-05, + "loss": 1.0911, + "step": 2978 + }, + { + "epoch": 0.5107915211007995, + "grad_norm": 1.53125, + "learning_rate": 1.8678555879082473e-05, + "loss": 1.0142, + "step": 2979 + }, + { + "epoch": 0.5109629851897893, + "grad_norm": 1.6015625, + "learning_rate": 1.8677658522572173e-05, + "loss": 1.0067, + "step": 2980 + }, + { + "epoch": 0.5111344492787792, + "grad_norm": 1.609375, + "learning_rate": 1.8676760883050754e-05, + "loss": 0.9649, + "step": 2981 + }, + { + "epoch": 0.511305913367769, + "grad_norm": 1.5390625, + "learning_rate": 1.867586296054749e-05, + "loss": 1.0444, + "step": 2982 + }, + { + "epoch": 0.5114773774567589, + "grad_norm": 1.578125, + "learning_rate": 1.8674964755091663e-05, + "loss": 0.9835, + "step": 2983 + }, + { + "epoch": 0.5116488415457487, + "grad_norm": 1.5546875, + "learning_rate": 1.8674066266712567e-05, + "loss": 1.0934, + "step": 2984 + }, + { + "epoch": 0.5118203056347386, + "grad_norm": 1.5703125, + "learning_rate": 1.8673167495439507e-05, + "loss": 1.061, + "step": 2985 + }, + { + "epoch": 0.5119917697237285, + "grad_norm": 1.59375, + "learning_rate": 1.8672268441301797e-05, + "loss": 1.0825, + "step": 2986 + }, + { + "epoch": 0.5121632338127183, + "grad_norm": 1.640625, + "learning_rate": 1.8671369104328757e-05, + "loss": 1.0551, + "step": 2987 + }, + { + "epoch": 0.5123346979017082, + "grad_norm": 1.5625, + "learning_rate": 1.8670469484549716e-05, + "loss": 1.0457, + "step": 2988 + }, + { + "epoch": 0.5125061619906981, + "grad_norm": 1.578125, + "learning_rate": 1.8669569581994014e-05, + "loss": 1.0619, + "step": 2989 + }, + { + "epoch": 0.5126776260796879, + "grad_norm": 1.640625, + "learning_rate": 1.8668669396691003e-05, + "loss": 1.0387, + "step": 2990 + }, + { + "epoch": 0.5128490901686777, + "grad_norm": 1.546875, + "learning_rate": 1.8667768928670037e-05, + "loss": 0.9911, + "step": 2991 + }, + { + "epoch": 0.5130205542576677, + "grad_norm": 1.6875, + "learning_rate": 1.8666868177960492e-05, + "loss": 1.0664, + "step": 2992 + }, + { + "epoch": 0.5131920183466575, + "grad_norm": 1.5859375, + "learning_rate": 1.8665967144591733e-05, + "loss": 1.076, + "step": 2993 + }, + { + "epoch": 0.5133634824356473, + "grad_norm": 1.609375, + "learning_rate": 1.8665065828593155e-05, + "loss": 1.0758, + "step": 2994 + }, + { + "epoch": 0.5135349465246373, + "grad_norm": 1.609375, + "learning_rate": 1.8664164229994153e-05, + "loss": 1.1051, + "step": 2995 + }, + { + "epoch": 0.5137064106136271, + "grad_norm": 1.6328125, + "learning_rate": 1.8663262348824127e-05, + "loss": 1.0632, + "step": 2996 + }, + { + "epoch": 0.5138778747026169, + 
"grad_norm": 1.609375, + "learning_rate": 1.8662360185112495e-05, + "loss": 1.0605, + "step": 2997 + }, + { + "epoch": 0.5140493387916069, + "grad_norm": 1.53125, + "learning_rate": 1.8661457738888673e-05, + "loss": 1.0795, + "step": 2998 + }, + { + "epoch": 0.5142208028805967, + "grad_norm": 1.6328125, + "learning_rate": 1.86605550101821e-05, + "loss": 0.9883, + "step": 2999 + }, + { + "epoch": 0.5143922669695865, + "grad_norm": 1.5546875, + "learning_rate": 1.8659651999022218e-05, + "loss": 1.006, + "step": 3000 + }, + { + "epoch": 0.5145637310585764, + "grad_norm": 1.59375, + "learning_rate": 1.8658748705438474e-05, + "loss": 1.0278, + "step": 3001 + }, + { + "epoch": 0.5147351951475663, + "grad_norm": 1.5546875, + "learning_rate": 1.865784512946033e-05, + "loss": 0.9796, + "step": 3002 + }, + { + "epoch": 0.5149066592365561, + "grad_norm": 1.5625, + "learning_rate": 1.8656941271117252e-05, + "loss": 0.9706, + "step": 3003 + }, + { + "epoch": 0.515078123325546, + "grad_norm": 1.5625, + "learning_rate": 1.865603713043872e-05, + "loss": 0.9346, + "step": 3004 + }, + { + "epoch": 0.5152495874145359, + "grad_norm": 1.609375, + "learning_rate": 1.865513270745422e-05, + "loss": 1.0387, + "step": 3005 + }, + { + "epoch": 0.5154210515035257, + "grad_norm": 1.7109375, + "learning_rate": 1.8654228002193255e-05, + "loss": 1.0278, + "step": 3006 + }, + { + "epoch": 0.5155925155925156, + "grad_norm": 1.609375, + "learning_rate": 1.865332301468532e-05, + "loss": 1.0926, + "step": 3007 + }, + { + "epoch": 0.5157639796815054, + "grad_norm": 1.59375, + "learning_rate": 1.8652417744959942e-05, + "loss": 1.0507, + "step": 3008 + }, + { + "epoch": 0.5159354437704953, + "grad_norm": 1.59375, + "learning_rate": 1.865151219304664e-05, + "loss": 1.0022, + "step": 3009 + }, + { + "epoch": 0.5161069078594852, + "grad_norm": 1.5390625, + "learning_rate": 1.8650606358974942e-05, + "loss": 1.0055, + "step": 3010 + }, + { + "epoch": 0.516278371948475, + "grad_norm": 1.625, + "learning_rate": 1.8649700242774397e-05, + "loss": 1.0461, + "step": 3011 + }, + { + "epoch": 0.5164498360374649, + "grad_norm": 1.5859375, + "learning_rate": 1.8648793844474556e-05, + "loss": 1.0554, + "step": 3012 + }, + { + "epoch": 0.5166213001264548, + "grad_norm": 1.6328125, + "learning_rate": 1.8647887164104976e-05, + "loss": 1.0602, + "step": 3013 + }, + { + "epoch": 0.5167927642154446, + "grad_norm": 1.6328125, + "learning_rate": 1.8646980201695236e-05, + "loss": 1.0584, + "step": 3014 + }, + { + "epoch": 0.5169642283044344, + "grad_norm": 1.625, + "learning_rate": 1.8646072957274906e-05, + "loss": 1.0742, + "step": 3015 + }, + { + "epoch": 0.5171356923934244, + "grad_norm": 1.59375, + "learning_rate": 1.8645165430873578e-05, + "loss": 1.0544, + "step": 3016 + }, + { + "epoch": 0.5173071564824142, + "grad_norm": 1.671875, + "learning_rate": 1.8644257622520857e-05, + "loss": 1.0175, + "step": 3017 + }, + { + "epoch": 0.517478620571404, + "grad_norm": 1.6640625, + "learning_rate": 1.864334953224634e-05, + "loss": 0.9962, + "step": 3018 + }, + { + "epoch": 0.517650084660394, + "grad_norm": 1.5703125, + "learning_rate": 1.8642441160079644e-05, + "loss": 0.9973, + "step": 3019 + }, + { + "epoch": 0.5178215487493838, + "grad_norm": 1.6328125, + "learning_rate": 1.86415325060504e-05, + "loss": 1.0568, + "step": 3020 + }, + { + "epoch": 0.5179930128383736, + "grad_norm": 1.5390625, + "learning_rate": 1.864062357018824e-05, + "loss": 0.9947, + "step": 3021 + }, + { + "epoch": 0.5181644769273636, + "grad_norm": 1.5703125, + "learning_rate": 
1.863971435252281e-05, + "loss": 0.9692, + "step": 3022 + }, + { + "epoch": 0.5183359410163534, + "grad_norm": 1.59375, + "learning_rate": 1.8638804853083757e-05, + "loss": 1.0223, + "step": 3023 + }, + { + "epoch": 0.5185074051053432, + "grad_norm": 1.5234375, + "learning_rate": 1.863789507190075e-05, + "loss": 0.9897, + "step": 3024 + }, + { + "epoch": 0.5186788691943331, + "grad_norm": 1.59375, + "learning_rate": 1.8636985009003456e-05, + "loss": 1.0149, + "step": 3025 + }, + { + "epoch": 0.518850333283323, + "grad_norm": 1.5390625, + "learning_rate": 1.8636074664421556e-05, + "loss": 1.0892, + "step": 3026 + }, + { + "epoch": 0.5190217973723128, + "grad_norm": 1.5625, + "learning_rate": 1.8635164038184742e-05, + "loss": 1.1259, + "step": 3027 + }, + { + "epoch": 0.5191932614613027, + "grad_norm": 1.5546875, + "learning_rate": 1.8634253130322714e-05, + "loss": 1.0054, + "step": 3028 + }, + { + "epoch": 0.5193647255502926, + "grad_norm": 1.5546875, + "learning_rate": 1.8633341940865172e-05, + "loss": 1.0098, + "step": 3029 + }, + { + "epoch": 0.5195361896392824, + "grad_norm": 1.6953125, + "learning_rate": 1.8632430469841844e-05, + "loss": 1.0955, + "step": 3030 + }, + { + "epoch": 0.5197076537282723, + "grad_norm": 1.640625, + "learning_rate": 1.863151871728245e-05, + "loss": 1.0311, + "step": 3031 + }, + { + "epoch": 0.5198791178172621, + "grad_norm": 1.5, + "learning_rate": 1.863060668321673e-05, + "loss": 0.9735, + "step": 3032 + }, + { + "epoch": 0.520050581906252, + "grad_norm": 1.6953125, + "learning_rate": 1.862969436767442e-05, + "loss": 1.166, + "step": 3033 + }, + { + "epoch": 0.5202220459952419, + "grad_norm": 1.6015625, + "learning_rate": 1.8628781770685282e-05, + "loss": 0.9992, + "step": 3034 + }, + { + "epoch": 0.5203935100842317, + "grad_norm": 1.625, + "learning_rate": 1.8627868892279083e-05, + "loss": 0.9968, + "step": 3035 + }, + { + "epoch": 0.5205649741732216, + "grad_norm": 1.7734375, + "learning_rate": 1.8626955732485585e-05, + "loss": 1.084, + "step": 3036 + }, + { + "epoch": 0.5207364382622115, + "grad_norm": 1.5546875, + "learning_rate": 1.8626042291334572e-05, + "loss": 0.9351, + "step": 3037 + }, + { + "epoch": 0.5209079023512013, + "grad_norm": 1.7109375, + "learning_rate": 1.8625128568855844e-05, + "loss": 1.0052, + "step": 3038 + }, + { + "epoch": 0.5210793664401911, + "grad_norm": 1.6015625, + "learning_rate": 1.862421456507919e-05, + "loss": 1.0152, + "step": 3039 + }, + { + "epoch": 0.5212508305291811, + "grad_norm": 1.515625, + "learning_rate": 1.8623300280034424e-05, + "loss": 0.9943, + "step": 3040 + }, + { + "epoch": 0.5214222946181709, + "grad_norm": 1.6484375, + "learning_rate": 1.8622385713751363e-05, + "loss": 1.0114, + "step": 3041 + }, + { + "epoch": 0.5215937587071607, + "grad_norm": 1.640625, + "learning_rate": 1.8621470866259835e-05, + "loss": 1.1507, + "step": 3042 + }, + { + "epoch": 0.5217652227961507, + "grad_norm": 1.6171875, + "learning_rate": 1.862055573758968e-05, + "loss": 0.9616, + "step": 3043 + }, + { + "epoch": 0.5219366868851405, + "grad_norm": 1.5546875, + "learning_rate": 1.8619640327770735e-05, + "loss": 0.9324, + "step": 3044 + }, + { + "epoch": 0.5221081509741303, + "grad_norm": 1.625, + "learning_rate": 1.8618724636832866e-05, + "loss": 0.9974, + "step": 3045 + }, + { + "epoch": 0.5222796150631203, + "grad_norm": 1.609375, + "learning_rate": 1.8617808664805926e-05, + "loss": 1.0601, + "step": 3046 + }, + { + "epoch": 0.5224510791521101, + "grad_norm": 1.6640625, + "learning_rate": 1.8616892411719798e-05, + "loss": 
0.9951, + "step": 3047 + }, + { + "epoch": 0.5226225432410999, + "grad_norm": 1.734375, + "learning_rate": 1.8615975877604356e-05, + "loss": 1.1189, + "step": 3048 + }, + { + "epoch": 0.5227940073300898, + "grad_norm": 1.5625, + "learning_rate": 1.86150590624895e-05, + "loss": 1.0502, + "step": 3049 + }, + { + "epoch": 0.5229654714190797, + "grad_norm": 1.515625, + "learning_rate": 1.8614141966405125e-05, + "loss": 1.0211, + "step": 3050 + }, + { + "epoch": 0.5231369355080695, + "grad_norm": 1.640625, + "learning_rate": 1.8613224589381143e-05, + "loss": 1.0333, + "step": 3051 + }, + { + "epoch": 0.5233083995970594, + "grad_norm": 1.5390625, + "learning_rate": 1.8612306931447473e-05, + "loss": 1.0176, + "step": 3052 + }, + { + "epoch": 0.5234798636860493, + "grad_norm": 1.578125, + "learning_rate": 1.861138899263404e-05, + "loss": 1.1212, + "step": 3053 + }, + { + "epoch": 0.5236513277750391, + "grad_norm": 1.5390625, + "learning_rate": 1.8610470772970787e-05, + "loss": 1.0371, + "step": 3054 + }, + { + "epoch": 0.523822791864029, + "grad_norm": 1.5625, + "learning_rate": 1.860955227248766e-05, + "loss": 1.0394, + "step": 3055 + }, + { + "epoch": 0.5239942559530189, + "grad_norm": 1.609375, + "learning_rate": 1.8608633491214613e-05, + "loss": 1.0387, + "step": 3056 + }, + { + "epoch": 0.5241657200420087, + "grad_norm": 1.6796875, + "learning_rate": 1.8607714429181606e-05, + "loss": 1.1197, + "step": 3057 + }, + { + "epoch": 0.5243371841309986, + "grad_norm": 1.65625, + "learning_rate": 1.860679508641862e-05, + "loss": 1.1022, + "step": 3058 + }, + { + "epoch": 0.5245086482199884, + "grad_norm": 1.6171875, + "learning_rate": 1.8605875462955638e-05, + "loss": 1.0624, + "step": 3059 + }, + { + "epoch": 0.5246801123089783, + "grad_norm": 1.546875, + "learning_rate": 1.8604955558822647e-05, + "loss": 1.0418, + "step": 3060 + }, + { + "epoch": 0.5248515763979682, + "grad_norm": 1.6171875, + "learning_rate": 1.8604035374049654e-05, + "loss": 1.0223, + "step": 3061 + }, + { + "epoch": 0.525023040486958, + "grad_norm": 1.6171875, + "learning_rate": 1.860311490866667e-05, + "loss": 1.0532, + "step": 3062 + }, + { + "epoch": 0.5251945045759479, + "grad_norm": 1.5234375, + "learning_rate": 1.860219416270371e-05, + "loss": 0.9409, + "step": 3063 + }, + { + "epoch": 0.5253659686649378, + "grad_norm": 1.6328125, + "learning_rate": 1.8601273136190806e-05, + "loss": 1.0213, + "step": 3064 + }, + { + "epoch": 0.5255374327539276, + "grad_norm": 1.546875, + "learning_rate": 1.8600351829157996e-05, + "loss": 1.0198, + "step": 3065 + }, + { + "epoch": 0.5257088968429174, + "grad_norm": 1.546875, + "learning_rate": 1.859943024163532e-05, + "loss": 1.0321, + "step": 3066 + }, + { + "epoch": 0.5258803609319074, + "grad_norm": 1.609375, + "learning_rate": 1.859850837365285e-05, + "loss": 1.0562, + "step": 3067 + }, + { + "epoch": 0.5260518250208972, + "grad_norm": 1.6015625, + "learning_rate": 1.859758622524064e-05, + "loss": 1.0236, + "step": 3068 + }, + { + "epoch": 0.526223289109887, + "grad_norm": 1.5703125, + "learning_rate": 1.8596663796428766e-05, + "loss": 1.0014, + "step": 3069 + }, + { + "epoch": 0.526394753198877, + "grad_norm": 1.5234375, + "learning_rate": 1.859574108724732e-05, + "loss": 1.032, + "step": 3070 + }, + { + "epoch": 0.5265662172878668, + "grad_norm": 1.65625, + "learning_rate": 1.8594818097726382e-05, + "loss": 1.0493, + "step": 3071 + }, + { + "epoch": 0.5267376813768566, + "grad_norm": 1.5859375, + "learning_rate": 1.8593894827896063e-05, + "loss": 1.0415, + "step": 3072 + }, + { + 
"epoch": 0.5269091454658466, + "grad_norm": 1.59375, + "learning_rate": 1.859297127778647e-05, + "loss": 0.9893, + "step": 3073 + }, + { + "epoch": 0.5270806095548364, + "grad_norm": 1.5859375, + "learning_rate": 1.859204744742773e-05, + "loss": 1.0089, + "step": 3074 + }, + { + "epoch": 0.5272520736438262, + "grad_norm": 1.625, + "learning_rate": 1.8591123336849962e-05, + "loss": 1.0668, + "step": 3075 + }, + { + "epoch": 0.527423537732816, + "grad_norm": 1.703125, + "learning_rate": 1.8590198946083315e-05, + "loss": 1.1142, + "step": 3076 + }, + { + "epoch": 0.527595001821806, + "grad_norm": 1.6171875, + "learning_rate": 1.8589274275157935e-05, + "loss": 1.0525, + "step": 3077 + }, + { + "epoch": 0.5277664659107958, + "grad_norm": 1.5859375, + "learning_rate": 1.8588349324103974e-05, + "loss": 1.0156, + "step": 3078 + }, + { + "epoch": 0.5279379299997856, + "grad_norm": 1.5078125, + "learning_rate": 1.85874240929516e-05, + "loss": 1.02, + "step": 3079 + }, + { + "epoch": 0.5281093940887756, + "grad_norm": 1.5390625, + "learning_rate": 1.8586498581730987e-05, + "loss": 1.0628, + "step": 3080 + }, + { + "epoch": 0.5282808581777654, + "grad_norm": 1.4765625, + "learning_rate": 1.8585572790472326e-05, + "loss": 1.092, + "step": 3081 + }, + { + "epoch": 0.5284523222667552, + "grad_norm": 1.609375, + "learning_rate": 1.8584646719205803e-05, + "loss": 1.0836, + "step": 3082 + }, + { + "epoch": 0.5286237863557451, + "grad_norm": 1.53125, + "learning_rate": 1.8583720367961623e-05, + "loss": 0.9917, + "step": 3083 + }, + { + "epoch": 0.528795250444735, + "grad_norm": 1.609375, + "learning_rate": 1.858279373677e-05, + "loss": 1.0639, + "step": 3084 + }, + { + "epoch": 0.5289667145337248, + "grad_norm": 1.71875, + "learning_rate": 1.8581866825661157e-05, + "loss": 1.0244, + "step": 3085 + }, + { + "epoch": 0.5291381786227147, + "grad_norm": 1.6171875, + "learning_rate": 1.8580939634665315e-05, + "loss": 1.0353, + "step": 3086 + }, + { + "epoch": 0.5293096427117046, + "grad_norm": 1.5390625, + "learning_rate": 1.8580012163812724e-05, + "loss": 0.9599, + "step": 3087 + }, + { + "epoch": 0.5294811068006944, + "grad_norm": 1.5546875, + "learning_rate": 1.857908441313362e-05, + "loss": 1.024, + "step": 3088 + }, + { + "epoch": 0.5296525708896843, + "grad_norm": 1.6484375, + "learning_rate": 1.8578156382658275e-05, + "loss": 1.0846, + "step": 3089 + }, + { + "epoch": 0.5298240349786741, + "grad_norm": 1.78125, + "learning_rate": 1.8577228072416945e-05, + "loss": 1.1081, + "step": 3090 + }, + { + "epoch": 0.529995499067664, + "grad_norm": 1.59375, + "learning_rate": 1.8576299482439905e-05, + "loss": 1.0505, + "step": 3091 + }, + { + "epoch": 0.5301669631566539, + "grad_norm": 1.59375, + "learning_rate": 1.8575370612757447e-05, + "loss": 1.0444, + "step": 3092 + }, + { + "epoch": 0.5303384272456437, + "grad_norm": 1.609375, + "learning_rate": 1.8574441463399862e-05, + "loss": 1.0852, + "step": 3093 + }, + { + "epoch": 0.5305098913346336, + "grad_norm": 1.6015625, + "learning_rate": 1.8573512034397453e-05, + "loss": 1.0687, + "step": 3094 + }, + { + "epoch": 0.5306813554236235, + "grad_norm": 1.6015625, + "learning_rate": 1.857258232578053e-05, + "loss": 0.9603, + "step": 3095 + }, + { + "epoch": 0.5308528195126133, + "grad_norm": 1.625, + "learning_rate": 1.8571652337579414e-05, + "loss": 0.9849, + "step": 3096 + }, + { + "epoch": 0.5310242836016031, + "grad_norm": 1.6015625, + "learning_rate": 1.857072206982444e-05, + "loss": 0.9789, + "step": 3097 + }, + { + "epoch": 0.5311957476905931, + "grad_norm": 
1.65625, + "learning_rate": 1.8569791522545943e-05, + "loss": 1.0004, + "step": 3098 + }, + { + "epoch": 0.5313672117795829, + "grad_norm": 1.546875, + "learning_rate": 1.8568860695774278e-05, + "loss": 0.9379, + "step": 3099 + }, + { + "epoch": 0.5315386758685727, + "grad_norm": 1.6640625, + "learning_rate": 1.8567929589539795e-05, + "loss": 0.9668, + "step": 3100 + }, + { + "epoch": 0.5317101399575627, + "grad_norm": 1.59375, + "learning_rate": 1.8566998203872866e-05, + "loss": 0.9812, + "step": 3101 + }, + { + "epoch": 0.5318816040465525, + "grad_norm": 1.578125, + "learning_rate": 1.8566066538803863e-05, + "loss": 1.0347, + "step": 3102 + }, + { + "epoch": 0.5320530681355423, + "grad_norm": 1.59375, + "learning_rate": 1.8565134594363174e-05, + "loss": 1.0362, + "step": 3103 + }, + { + "epoch": 0.5322245322245323, + "grad_norm": 1.5859375, + "learning_rate": 1.8564202370581193e-05, + "loss": 0.993, + "step": 3104 + }, + { + "epoch": 0.5323959963135221, + "grad_norm": 1.609375, + "learning_rate": 1.8563269867488323e-05, + "loss": 1.0241, + "step": 3105 + }, + { + "epoch": 0.5325674604025119, + "grad_norm": 1.6953125, + "learning_rate": 1.8562337085114978e-05, + "loss": 1.0542, + "step": 3106 + }, + { + "epoch": 0.5327389244915018, + "grad_norm": 1.5, + "learning_rate": 1.8561404023491577e-05, + "loss": 0.9356, + "step": 3107 + }, + { + "epoch": 0.5329103885804917, + "grad_norm": 1.515625, + "learning_rate": 1.856047068264855e-05, + "loss": 0.9925, + "step": 3108 + }, + { + "epoch": 0.5330818526694815, + "grad_norm": 1.5390625, + "learning_rate": 1.8559537062616337e-05, + "loss": 0.9417, + "step": 3109 + }, + { + "epoch": 0.5332533167584714, + "grad_norm": 1.640625, + "learning_rate": 1.8558603163425392e-05, + "loss": 1.0574, + "step": 3110 + }, + { + "epoch": 0.5334247808474613, + "grad_norm": 1.6328125, + "learning_rate": 1.8557668985106164e-05, + "loss": 1.0717, + "step": 3111 + }, + { + "epoch": 0.5335962449364511, + "grad_norm": 1.640625, + "learning_rate": 1.855673452768913e-05, + "loss": 1.0645, + "step": 3112 + }, + { + "epoch": 0.533767709025441, + "grad_norm": 1.5859375, + "learning_rate": 1.8555799791204762e-05, + "loss": 0.9766, + "step": 3113 + }, + { + "epoch": 0.5339391731144308, + "grad_norm": 1.7734375, + "learning_rate": 1.855486477568354e-05, + "loss": 1.0897, + "step": 3114 + }, + { + "epoch": 0.5341106372034207, + "grad_norm": 1.625, + "learning_rate": 1.8553929481155966e-05, + "loss": 1.0915, + "step": 3115 + }, + { + "epoch": 0.5342821012924106, + "grad_norm": 1.421875, + "learning_rate": 1.8552993907652538e-05, + "loss": 0.907, + "step": 3116 + }, + { + "epoch": 0.5344535653814004, + "grad_norm": 1.6015625, + "learning_rate": 1.8552058055203773e-05, + "loss": 1.0162, + "step": 3117 + }, + { + "epoch": 0.5346250294703903, + "grad_norm": 1.5625, + "learning_rate": 1.855112192384019e-05, + "loss": 1.0393, + "step": 3118 + }, + { + "epoch": 0.5347964935593802, + "grad_norm": 1.546875, + "learning_rate": 1.8550185513592325e-05, + "loss": 0.9972, + "step": 3119 + }, + { + "epoch": 0.53496795764837, + "grad_norm": 1.5546875, + "learning_rate": 1.854924882449071e-05, + "loss": 1.0278, + "step": 3120 + }, + { + "epoch": 0.5351394217373598, + "grad_norm": 1.6171875, + "learning_rate": 1.8548311856565897e-05, + "loss": 0.98, + "step": 3121 + }, + { + "epoch": 0.5353108858263498, + "grad_norm": 1.609375, + "learning_rate": 1.8547374609848442e-05, + "loss": 1.089, + "step": 3122 + }, + { + "epoch": 0.5354823499153396, + "grad_norm": 1.671875, + "learning_rate": 
1.8546437084368917e-05, + "loss": 1.009, + "step": 3123 + }, + { + "epoch": 0.5356538140043294, + "grad_norm": 1.609375, + "learning_rate": 1.8545499280157897e-05, + "loss": 0.9897, + "step": 3124 + }, + { + "epoch": 0.5358252780933194, + "grad_norm": 1.6171875, + "learning_rate": 1.8544561197245966e-05, + "loss": 0.9395, + "step": 3125 + }, + { + "epoch": 0.5359967421823092, + "grad_norm": 1.5859375, + "learning_rate": 1.8543622835663715e-05, + "loss": 1.027, + "step": 3126 + }, + { + "epoch": 0.536168206271299, + "grad_norm": 1.5859375, + "learning_rate": 1.8542684195441754e-05, + "loss": 1.1007, + "step": 3127 + }, + { + "epoch": 0.536339670360289, + "grad_norm": 1.6015625, + "learning_rate": 1.8541745276610693e-05, + "loss": 1.0017, + "step": 3128 + }, + { + "epoch": 0.5365111344492788, + "grad_norm": 1.453125, + "learning_rate": 1.8540806079201152e-05, + "loss": 1.008, + "step": 3129 + }, + { + "epoch": 0.5366825985382686, + "grad_norm": 1.5390625, + "learning_rate": 1.8539866603243762e-05, + "loss": 1.0767, + "step": 3130 + }, + { + "epoch": 0.5368540626272585, + "grad_norm": 1.6015625, + "learning_rate": 1.8538926848769166e-05, + "loss": 1.1019, + "step": 3131 + }, + { + "epoch": 0.5370255267162484, + "grad_norm": 1.65625, + "learning_rate": 1.853798681580801e-05, + "loss": 1.0994, + "step": 3132 + }, + { + "epoch": 0.5371969908052382, + "grad_norm": 1.59375, + "learning_rate": 1.853704650439095e-05, + "loss": 1.0356, + "step": 3133 + }, + { + "epoch": 0.5373684548942281, + "grad_norm": 1.6953125, + "learning_rate": 1.853610591454866e-05, + "loss": 1.0899, + "step": 3134 + }, + { + "epoch": 0.537539918983218, + "grad_norm": 1.65625, + "learning_rate": 1.8535165046311814e-05, + "loss": 1.0018, + "step": 3135 + }, + { + "epoch": 0.5377113830722078, + "grad_norm": 1.546875, + "learning_rate": 1.853422389971109e-05, + "loss": 1.0659, + "step": 3136 + }, + { + "epoch": 0.5378828471611977, + "grad_norm": 1.5390625, + "learning_rate": 1.853328247477719e-05, + "loss": 0.9889, + "step": 3137 + }, + { + "epoch": 0.5380543112501875, + "grad_norm": 1.6171875, + "learning_rate": 1.8532340771540815e-05, + "loss": 1.2005, + "step": 3138 + }, + { + "epoch": 0.5382257753391774, + "grad_norm": 1.5234375, + "learning_rate": 1.853139879003268e-05, + "loss": 0.9841, + "step": 3139 + }, + { + "epoch": 0.5383972394281673, + "grad_norm": 1.609375, + "learning_rate": 1.85304565302835e-05, + "loss": 0.9865, + "step": 3140 + }, + { + "epoch": 0.5385687035171571, + "grad_norm": 1.578125, + "learning_rate": 1.8529513992324012e-05, + "loss": 1.0674, + "step": 3141 + }, + { + "epoch": 0.538740167606147, + "grad_norm": 1.515625, + "learning_rate": 1.8528571176184952e-05, + "loss": 1.0455, + "step": 3142 + }, + { + "epoch": 0.5389116316951369, + "grad_norm": 1.59375, + "learning_rate": 1.8527628081897076e-05, + "loss": 1.1141, + "step": 3143 + }, + { + "epoch": 0.5390830957841267, + "grad_norm": 1.484375, + "learning_rate": 1.852668470949113e-05, + "loss": 0.9851, + "step": 3144 + }, + { + "epoch": 0.5392545598731165, + "grad_norm": 1.703125, + "learning_rate": 1.8525741058997888e-05, + "loss": 1.08, + "step": 3145 + }, + { + "epoch": 0.5394260239621065, + "grad_norm": 1.6171875, + "learning_rate": 1.8524797130448127e-05, + "loss": 0.9748, + "step": 3146 + }, + { + "epoch": 0.5395974880510963, + "grad_norm": 1.5703125, + "learning_rate": 1.8523852923872628e-05, + "loss": 0.9932, + "step": 3147 + }, + { + "epoch": 0.5397689521400861, + "grad_norm": 1.6171875, + "learning_rate": 1.8522908439302193e-05, + "loss": 
1.0925, + "step": 3148 + }, + { + "epoch": 0.5399404162290761, + "grad_norm": 1.515625, + "learning_rate": 1.8521963676767614e-05, + "loss": 1.0554, + "step": 3149 + }, + { + "epoch": 0.5401118803180659, + "grad_norm": 1.6875, + "learning_rate": 1.852101863629971e-05, + "loss": 1.0495, + "step": 3150 + }, + { + "epoch": 0.5402833444070557, + "grad_norm": 1.484375, + "learning_rate": 1.85200733179293e-05, + "loss": 1.043, + "step": 3151 + }, + { + "epoch": 0.5404548084960457, + "grad_norm": 1.5078125, + "learning_rate": 1.8519127721687217e-05, + "loss": 1.0885, + "step": 3152 + }, + { + "epoch": 0.5406262725850355, + "grad_norm": 1.4765625, + "learning_rate": 1.8518181847604298e-05, + "loss": 0.8962, + "step": 3153 + }, + { + "epoch": 0.5407977366740253, + "grad_norm": 1.6484375, + "learning_rate": 1.851723569571139e-05, + "loss": 1.0868, + "step": 3154 + }, + { + "epoch": 0.5409692007630152, + "grad_norm": 1.53125, + "learning_rate": 1.8516289266039358e-05, + "loss": 1.0079, + "step": 3155 + }, + { + "epoch": 0.5411406648520051, + "grad_norm": 1.6640625, + "learning_rate": 1.851534255861906e-05, + "loss": 1.0173, + "step": 3156 + }, + { + "epoch": 0.5413121289409949, + "grad_norm": 1.5859375, + "learning_rate": 1.8514395573481377e-05, + "loss": 1.0323, + "step": 3157 + }, + { + "epoch": 0.5414835930299848, + "grad_norm": 1.5390625, + "learning_rate": 1.8513448310657193e-05, + "loss": 0.9832, + "step": 3158 + }, + { + "epoch": 0.5416550571189747, + "grad_norm": 1.59375, + "learning_rate": 1.85125007701774e-05, + "loss": 1.0065, + "step": 3159 + }, + { + "epoch": 0.5418265212079645, + "grad_norm": 1.5859375, + "learning_rate": 1.85115529520729e-05, + "loss": 1.1247, + "step": 3160 + }, + { + "epoch": 0.5419979852969544, + "grad_norm": 1.5625, + "learning_rate": 1.851060485637461e-05, + "loss": 1.0102, + "step": 3161 + }, + { + "epoch": 0.5421694493859442, + "grad_norm": 1.5859375, + "learning_rate": 1.8509656483113444e-05, + "loss": 1.0124, + "step": 3162 + }, + { + "epoch": 0.5423409134749341, + "grad_norm": 1.5234375, + "learning_rate": 1.8508707832320338e-05, + "loss": 1.0461, + "step": 3163 + }, + { + "epoch": 0.542512377563924, + "grad_norm": 1.5, + "learning_rate": 1.850775890402623e-05, + "loss": 1.027, + "step": 3164 + }, + { + "epoch": 0.5426838416529138, + "grad_norm": 1.515625, + "learning_rate": 1.8506809698262064e-05, + "loss": 0.899, + "step": 3165 + }, + { + "epoch": 0.5428553057419037, + "grad_norm": 1.5703125, + "learning_rate": 1.85058602150588e-05, + "loss": 0.9396, + "step": 3166 + }, + { + "epoch": 0.5430267698308936, + "grad_norm": 1.5859375, + "learning_rate": 1.8504910454447407e-05, + "loss": 1.0611, + "step": 3167 + }, + { + "epoch": 0.5431982339198834, + "grad_norm": 1.5390625, + "learning_rate": 1.8503960416458854e-05, + "loss": 1.0304, + "step": 3168 + }, + { + "epoch": 0.5433696980088732, + "grad_norm": 1.609375, + "learning_rate": 1.8503010101124132e-05, + "loss": 0.9509, + "step": 3169 + }, + { + "epoch": 0.5435411620978632, + "grad_norm": 1.53125, + "learning_rate": 1.8502059508474232e-05, + "loss": 0.9562, + "step": 3170 + }, + { + "epoch": 0.543712626186853, + "grad_norm": 1.5546875, + "learning_rate": 1.850110863854015e-05, + "loss": 0.9525, + "step": 3171 + }, + { + "epoch": 0.5438840902758428, + "grad_norm": 1.671875, + "learning_rate": 1.850015749135291e-05, + "loss": 0.9873, + "step": 3172 + }, + { + "epoch": 0.5440555543648327, + "grad_norm": 1.6015625, + "learning_rate": 1.849920606694352e-05, + "loss": 1.0424, + "step": 3173 + }, + { + "epoch": 
0.5442270184538226, + "grad_norm": 1.6015625, + "learning_rate": 1.849825436534302e-05, + "loss": 1.0221, + "step": 3174 + }, + { + "epoch": 0.5443984825428124, + "grad_norm": 1.5859375, + "learning_rate": 1.849730238658244e-05, + "loss": 1.0602, + "step": 3175 + }, + { + "epoch": 0.5445699466318022, + "grad_norm": 1.625, + "learning_rate": 1.849635013069283e-05, + "loss": 1.0207, + "step": 3176 + }, + { + "epoch": 0.5447414107207922, + "grad_norm": 1.625, + "learning_rate": 1.8495397597705253e-05, + "loss": 1.1356, + "step": 3177 + }, + { + "epoch": 0.544912874809782, + "grad_norm": 1.578125, + "learning_rate": 1.8494444787650768e-05, + "loss": 0.9768, + "step": 3178 + }, + { + "epoch": 0.5450843388987718, + "grad_norm": 1.671875, + "learning_rate": 1.849349170056045e-05, + "loss": 1.0797, + "step": 3179 + }, + { + "epoch": 0.5452558029877618, + "grad_norm": 1.4765625, + "learning_rate": 1.8492538336465387e-05, + "loss": 0.9896, + "step": 3180 + }, + { + "epoch": 0.5454272670767516, + "grad_norm": 1.609375, + "learning_rate": 1.8491584695396666e-05, + "loss": 0.9773, + "step": 3181 + }, + { + "epoch": 0.5455987311657414, + "grad_norm": 1.5390625, + "learning_rate": 1.8490630777385393e-05, + "loss": 1.0454, + "step": 3182 + }, + { + "epoch": 0.5457701952547314, + "grad_norm": 1.640625, + "learning_rate": 1.8489676582462675e-05, + "loss": 1.0276, + "step": 3183 + }, + { + "epoch": 0.5459416593437212, + "grad_norm": 1.609375, + "learning_rate": 1.8488722110659635e-05, + "loss": 1.0258, + "step": 3184 + }, + { + "epoch": 0.546113123432711, + "grad_norm": 1.5078125, + "learning_rate": 1.8487767362007403e-05, + "loss": 0.9235, + "step": 3185 + }, + { + "epoch": 0.546284587521701, + "grad_norm": 1.578125, + "learning_rate": 1.8486812336537117e-05, + "loss": 1.0482, + "step": 3186 + }, + { + "epoch": 0.5464560516106908, + "grad_norm": 1.546875, + "learning_rate": 1.848585703427992e-05, + "loss": 0.9701, + "step": 3187 + }, + { + "epoch": 0.5466275156996806, + "grad_norm": 1.484375, + "learning_rate": 1.848490145526697e-05, + "loss": 0.9514, + "step": 3188 + }, + { + "epoch": 0.5467989797886705, + "grad_norm": 1.578125, + "learning_rate": 1.8483945599529436e-05, + "loss": 1.0126, + "step": 3189 + }, + { + "epoch": 0.5469704438776604, + "grad_norm": 1.59375, + "learning_rate": 1.8482989467098483e-05, + "loss": 1.0683, + "step": 3190 + }, + { + "epoch": 0.5471419079666502, + "grad_norm": 1.7109375, + "learning_rate": 1.8482033058005305e-05, + "loss": 1.0875, + "step": 3191 + }, + { + "epoch": 0.5473133720556401, + "grad_norm": 1.625, + "learning_rate": 1.8481076372281083e-05, + "loss": 0.9823, + "step": 3192 + }, + { + "epoch": 0.54748483614463, + "grad_norm": 1.6015625, + "learning_rate": 1.8480119409957026e-05, + "loss": 1.0217, + "step": 3193 + }, + { + "epoch": 0.5476563002336198, + "grad_norm": 1.640625, + "learning_rate": 1.8479162171064345e-05, + "loss": 1.0225, + "step": 3194 + }, + { + "epoch": 0.5478277643226097, + "grad_norm": 1.609375, + "learning_rate": 1.847820465563425e-05, + "loss": 1.0182, + "step": 3195 + }, + { + "epoch": 0.5479992284115995, + "grad_norm": 1.734375, + "learning_rate": 1.847724686369798e-05, + "loss": 1.0994, + "step": 3196 + }, + { + "epoch": 0.5481706925005894, + "grad_norm": 1.625, + "learning_rate": 1.8476288795286764e-05, + "loss": 1.125, + "step": 3197 + }, + { + "epoch": 0.5483421565895793, + "grad_norm": 1.5859375, + "learning_rate": 1.8475330450431858e-05, + "loss": 1.0885, + "step": 3198 + }, + { + "epoch": 0.5485136206785691, + "grad_norm": 1.7421875, + 
"learning_rate": 1.8474371829164504e-05, + "loss": 1.0607, + "step": 3199 + }, + { + "epoch": 0.548685084767559, + "grad_norm": 1.609375, + "learning_rate": 1.8473412931515974e-05, + "loss": 1.0336, + "step": 3200 + }, + { + "epoch": 0.5488565488565489, + "grad_norm": 1.609375, + "learning_rate": 1.8472453757517544e-05, + "loss": 1.0461, + "step": 3201 + }, + { + "epoch": 0.5490280129455387, + "grad_norm": 1.6484375, + "learning_rate": 1.847149430720049e-05, + "loss": 1.0338, + "step": 3202 + }, + { + "epoch": 0.5491994770345285, + "grad_norm": 1.6171875, + "learning_rate": 1.8470534580596106e-05, + "loss": 1.0947, + "step": 3203 + }, + { + "epoch": 0.5493709411235185, + "grad_norm": 1.6484375, + "learning_rate": 1.8469574577735694e-05, + "loss": 1.0113, + "step": 3204 + }, + { + "epoch": 0.5495424052125083, + "grad_norm": 1.71875, + "learning_rate": 1.8468614298650562e-05, + "loss": 1.0102, + "step": 3205 + }, + { + "epoch": 0.5497138693014981, + "grad_norm": 1.6328125, + "learning_rate": 1.8467653743372026e-05, + "loss": 1.0543, + "step": 3206 + }, + { + "epoch": 0.5498853333904881, + "grad_norm": 1.6484375, + "learning_rate": 1.8466692911931414e-05, + "loss": 1.0869, + "step": 3207 + }, + { + "epoch": 0.5500567974794779, + "grad_norm": 1.59375, + "learning_rate": 1.8465731804360064e-05, + "loss": 1.0763, + "step": 3208 + }, + { + "epoch": 0.5502282615684677, + "grad_norm": 1.5078125, + "learning_rate": 1.8464770420689323e-05, + "loss": 0.967, + "step": 3209 + }, + { + "epoch": 0.5503997256574577, + "grad_norm": 1.6015625, + "learning_rate": 1.8463808760950544e-05, + "loss": 0.9791, + "step": 3210 + }, + { + "epoch": 0.5505711897464475, + "grad_norm": 1.6015625, + "learning_rate": 1.8462846825175087e-05, + "loss": 0.9765, + "step": 3211 + }, + { + "epoch": 0.5507426538354373, + "grad_norm": 1.53125, + "learning_rate": 1.8461884613394328e-05, + "loss": 1.0778, + "step": 3212 + }, + { + "epoch": 0.5509141179244272, + "grad_norm": 1.5390625, + "learning_rate": 1.8460922125639643e-05, + "loss": 1.0572, + "step": 3213 + }, + { + "epoch": 0.5510855820134171, + "grad_norm": 1.6640625, + "learning_rate": 1.845995936194243e-05, + "loss": 1.0458, + "step": 3214 + }, + { + "epoch": 0.5512570461024069, + "grad_norm": 1.640625, + "learning_rate": 1.8458996322334085e-05, + "loss": 1.0645, + "step": 3215 + }, + { + "epoch": 0.5514285101913968, + "grad_norm": 1.5703125, + "learning_rate": 1.845803300684602e-05, + "loss": 0.9418, + "step": 3216 + }, + { + "epoch": 0.5515999742803867, + "grad_norm": 1.5625, + "learning_rate": 1.8457069415509642e-05, + "loss": 1.0402, + "step": 3217 + }, + { + "epoch": 0.5517714383693765, + "grad_norm": 1.828125, + "learning_rate": 1.8456105548356392e-05, + "loss": 0.9247, + "step": 3218 + }, + { + "epoch": 0.5519429024583664, + "grad_norm": 1.5625, + "learning_rate": 1.845514140541769e-05, + "loss": 1.0088, + "step": 3219 + }, + { + "epoch": 0.5521143665473562, + "grad_norm": 1.6171875, + "learning_rate": 1.845417698672499e-05, + "loss": 1.0644, + "step": 3220 + }, + { + "epoch": 0.5522858306363461, + "grad_norm": 1.6328125, + "learning_rate": 1.8453212292309743e-05, + "loss": 1.1305, + "step": 3221 + }, + { + "epoch": 0.552457294725336, + "grad_norm": 1.5625, + "learning_rate": 1.8452247322203413e-05, + "loss": 1.0003, + "step": 3222 + }, + { + "epoch": 0.5526287588143258, + "grad_norm": 1.671875, + "learning_rate": 1.8451282076437468e-05, + "loss": 0.9957, + "step": 3223 + }, + { + "epoch": 0.5528002229033157, + "grad_norm": 1.65625, + "learning_rate": 
1.845031655504339e-05, + "loss": 1.0345, + "step": 3224 + }, + { + "epoch": 0.5529716869923056, + "grad_norm": 1.515625, + "learning_rate": 1.8449350758052668e-05, + "loss": 0.9799, + "step": 3225 + }, + { + "epoch": 0.5531431510812954, + "grad_norm": 1.5703125, + "learning_rate": 1.84483846854968e-05, + "loss": 1.0793, + "step": 3226 + }, + { + "epoch": 0.5533146151702852, + "grad_norm": 1.59375, + "learning_rate": 1.8447418337407295e-05, + "loss": 1.0526, + "step": 3227 + }, + { + "epoch": 0.5534860792592752, + "grad_norm": 1.7421875, + "learning_rate": 1.8446451713815667e-05, + "loss": 1.1667, + "step": 3228 + }, + { + "epoch": 0.553657543348265, + "grad_norm": 1.59375, + "learning_rate": 1.8445484814753444e-05, + "loss": 1.0139, + "step": 3229 + }, + { + "epoch": 0.5538290074372548, + "grad_norm": 1.578125, + "learning_rate": 1.8444517640252156e-05, + "loss": 1.0396, + "step": 3230 + }, + { + "epoch": 0.5540004715262448, + "grad_norm": 1.7421875, + "learning_rate": 1.844355019034335e-05, + "loss": 0.9346, + "step": 3231 + }, + { + "epoch": 0.5541719356152346, + "grad_norm": 1.578125, + "learning_rate": 1.8442582465058577e-05, + "loss": 0.9973, + "step": 3232 + }, + { + "epoch": 0.5543433997042244, + "grad_norm": 1.5859375, + "learning_rate": 1.8441614464429396e-05, + "loss": 1.0392, + "step": 3233 + }, + { + "epoch": 0.5545148637932144, + "grad_norm": 1.5234375, + "learning_rate": 1.8440646188487382e-05, + "loss": 0.9467, + "step": 3234 + }, + { + "epoch": 0.5546863278822042, + "grad_norm": 1.625, + "learning_rate": 1.843967763726411e-05, + "loss": 1.0302, + "step": 3235 + }, + { + "epoch": 0.554857791971194, + "grad_norm": 1.5234375, + "learning_rate": 1.8438708810791167e-05, + "loss": 1.0771, + "step": 3236 + }, + { + "epoch": 0.5550292560601839, + "grad_norm": 1.640625, + "learning_rate": 1.8437739709100155e-05, + "loss": 1.0512, + "step": 3237 + }, + { + "epoch": 0.5552007201491738, + "grad_norm": 1.546875, + "learning_rate": 1.8436770332222677e-05, + "loss": 1.0086, + "step": 3238 + }, + { + "epoch": 0.5553721842381636, + "grad_norm": 1.4921875, + "learning_rate": 1.8435800680190347e-05, + "loss": 0.9962, + "step": 3239 + }, + { + "epoch": 0.5555436483271535, + "grad_norm": 1.6015625, + "learning_rate": 1.8434830753034795e-05, + "loss": 0.9521, + "step": 3240 + }, + { + "epoch": 0.5557151124161434, + "grad_norm": 1.5859375, + "learning_rate": 1.8433860550787647e-05, + "loss": 0.9977, + "step": 3241 + }, + { + "epoch": 0.5558865765051332, + "grad_norm": 1.6796875, + "learning_rate": 1.8432890073480546e-05, + "loss": 1.011, + "step": 3242 + }, + { + "epoch": 0.5560580405941231, + "grad_norm": 1.5859375, + "learning_rate": 1.8431919321145146e-05, + "loss": 1.0422, + "step": 3243 + }, + { + "epoch": 0.5562295046831129, + "grad_norm": 1.4765625, + "learning_rate": 1.8430948293813105e-05, + "loss": 0.9491, + "step": 3244 + }, + { + "epoch": 0.5564009687721028, + "grad_norm": 1.65625, + "learning_rate": 1.8429976991516093e-05, + "loss": 1.0425, + "step": 3245 + }, + { + "epoch": 0.5565724328610927, + "grad_norm": 1.640625, + "learning_rate": 1.8429005414285783e-05, + "loss": 1.0996, + "step": 3246 + }, + { + "epoch": 0.5567438969500825, + "grad_norm": 1.59375, + "learning_rate": 1.842803356215387e-05, + "loss": 0.9526, + "step": 3247 + }, + { + "epoch": 0.5569153610390724, + "grad_norm": 1.6796875, + "learning_rate": 1.8427061435152045e-05, + "loss": 0.9868, + "step": 3248 + }, + { + "epoch": 0.5570868251280623, + "grad_norm": 1.546875, + "learning_rate": 1.8426089033312013e-05, + 
"loss": 0.9752, + "step": 3249 + }, + { + "epoch": 0.5572582892170521, + "grad_norm": 1.6328125, + "learning_rate": 1.8425116356665484e-05, + "loss": 1.0016, + "step": 3250 + }, + { + "epoch": 0.5574297533060419, + "grad_norm": 1.6328125, + "learning_rate": 1.842414340524419e-05, + "loss": 1.0322, + "step": 3251 + }, + { + "epoch": 0.5576012173950319, + "grad_norm": 1.6171875, + "learning_rate": 1.8423170179079856e-05, + "loss": 0.9072, + "step": 3252 + }, + { + "epoch": 0.5577726814840217, + "grad_norm": 1.6953125, + "learning_rate": 1.8422196678204224e-05, + "loss": 1.0137, + "step": 3253 + }, + { + "epoch": 0.5579441455730115, + "grad_norm": 1.5625, + "learning_rate": 1.8421222902649042e-05, + "loss": 1.0086, + "step": 3254 + }, + { + "epoch": 0.5581156096620015, + "grad_norm": 1.5703125, + "learning_rate": 1.842024885244607e-05, + "loss": 1.0747, + "step": 3255 + }, + { + "epoch": 0.5582870737509913, + "grad_norm": 1.640625, + "learning_rate": 1.8419274527627072e-05, + "loss": 0.9744, + "step": 3256 + }, + { + "epoch": 0.5584585378399811, + "grad_norm": 1.53125, + "learning_rate": 1.8418299928223834e-05, + "loss": 1.0025, + "step": 3257 + }, + { + "epoch": 0.558630001928971, + "grad_norm": 1.6328125, + "learning_rate": 1.841732505426813e-05, + "loss": 1.051, + "step": 3258 + }, + { + "epoch": 0.5588014660179609, + "grad_norm": 1.5703125, + "learning_rate": 1.841634990579176e-05, + "loss": 0.9937, + "step": 3259 + }, + { + "epoch": 0.5589729301069507, + "grad_norm": 1.546875, + "learning_rate": 1.8415374482826526e-05, + "loss": 0.9564, + "step": 3260 + }, + { + "epoch": 0.5591443941959406, + "grad_norm": 1.59375, + "learning_rate": 1.8414398785404245e-05, + "loss": 1.1329, + "step": 3261 + }, + { + "epoch": 0.5593158582849305, + "grad_norm": 1.6328125, + "learning_rate": 1.8413422813556727e-05, + "loss": 0.9782, + "step": 3262 + }, + { + "epoch": 0.5594873223739203, + "grad_norm": 1.6171875, + "learning_rate": 1.8412446567315816e-05, + "loss": 1.0287, + "step": 3263 + }, + { + "epoch": 0.5596587864629102, + "grad_norm": 1.625, + "learning_rate": 1.841147004671334e-05, + "loss": 1.0996, + "step": 3264 + }, + { + "epoch": 0.5598302505519, + "grad_norm": 1.6015625, + "learning_rate": 1.841049325178115e-05, + "loss": 1.0127, + "step": 3265 + }, + { + "epoch": 0.5600017146408899, + "grad_norm": 1.578125, + "learning_rate": 1.840951618255111e-05, + "loss": 0.9634, + "step": 3266 + }, + { + "epoch": 0.5601731787298797, + "grad_norm": 1.6171875, + "learning_rate": 1.8408538839055078e-05, + "loss": 1.0352, + "step": 3267 + }, + { + "epoch": 0.5603446428188696, + "grad_norm": 1.515625, + "learning_rate": 1.8407561221324925e-05, + "loss": 0.9594, + "step": 3268 + }, + { + "epoch": 0.5605161069078595, + "grad_norm": 1.609375, + "learning_rate": 1.8406583329392547e-05, + "loss": 1.0399, + "step": 3269 + }, + { + "epoch": 0.5606875709968493, + "grad_norm": 1.578125, + "learning_rate": 1.8405605163289828e-05, + "loss": 1.0761, + "step": 3270 + }, + { + "epoch": 0.5608590350858392, + "grad_norm": 1.6875, + "learning_rate": 1.840462672304867e-05, + "loss": 0.9835, + "step": 3271 + }, + { + "epoch": 0.5610304991748291, + "grad_norm": 1.546875, + "learning_rate": 1.840364800870099e-05, + "loss": 1.0436, + "step": 3272 + }, + { + "epoch": 0.5612019632638189, + "grad_norm": 1.6484375, + "learning_rate": 1.84026690202787e-05, + "loss": 1.0471, + "step": 3273 + }, + { + "epoch": 0.5613734273528088, + "grad_norm": 1.46875, + "learning_rate": 1.8401689757813734e-05, + "loss": 0.98, + "step": 3274 + }, + { + 
"epoch": 0.5615448914417986, + "grad_norm": 1.6328125, + "learning_rate": 1.8400710221338026e-05, + "loss": 1.0721, + "step": 3275 + }, + { + "epoch": 0.5617163555307885, + "grad_norm": 1.6484375, + "learning_rate": 1.8399730410883524e-05, + "loss": 1.0973, + "step": 3276 + }, + { + "epoch": 0.5618878196197784, + "grad_norm": 1.578125, + "learning_rate": 1.8398750326482182e-05, + "loss": 0.991, + "step": 3277 + }, + { + "epoch": 0.5620592837087682, + "grad_norm": 1.5546875, + "learning_rate": 1.8397769968165967e-05, + "loss": 1.0093, + "step": 3278 + }, + { + "epoch": 0.5622307477977581, + "grad_norm": 1.6171875, + "learning_rate": 1.8396789335966846e-05, + "loss": 1.0551, + "step": 3279 + }, + { + "epoch": 0.562402211886748, + "grad_norm": 1.65625, + "learning_rate": 1.839580842991681e-05, + "loss": 0.969, + "step": 3280 + }, + { + "epoch": 0.5625736759757378, + "grad_norm": 1.5546875, + "learning_rate": 1.8394827250047842e-05, + "loss": 0.929, + "step": 3281 + }, + { + "epoch": 0.5627451400647276, + "grad_norm": 1.625, + "learning_rate": 1.8393845796391947e-05, + "loss": 1.0078, + "step": 3282 + }, + { + "epoch": 0.5629166041537176, + "grad_norm": 1.5703125, + "learning_rate": 1.839286406898113e-05, + "loss": 1.012, + "step": 3283 + }, + { + "epoch": 0.5630880682427074, + "grad_norm": 1.59375, + "learning_rate": 1.8391882067847414e-05, + "loss": 0.9506, + "step": 3284 + }, + { + "epoch": 0.5632595323316972, + "grad_norm": 1.640625, + "learning_rate": 1.8390899793022824e-05, + "loss": 1.0795, + "step": 3285 + }, + { + "epoch": 0.5634309964206872, + "grad_norm": 1.6484375, + "learning_rate": 1.8389917244539392e-05, + "loss": 1.0464, + "step": 3286 + }, + { + "epoch": 0.563602460509677, + "grad_norm": 1.640625, + "learning_rate": 1.8388934422429167e-05, + "loss": 1.1251, + "step": 3287 + }, + { + "epoch": 0.5637739245986668, + "grad_norm": 1.5859375, + "learning_rate": 1.83879513267242e-05, + "loss": 0.9987, + "step": 3288 + }, + { + "epoch": 0.5639453886876568, + "grad_norm": 1.6484375, + "learning_rate": 1.8386967957456555e-05, + "loss": 1.0367, + "step": 3289 + }, + { + "epoch": 0.5641168527766466, + "grad_norm": 1.6015625, + "learning_rate": 1.83859843146583e-05, + "loss": 1.0065, + "step": 3290 + }, + { + "epoch": 0.5642883168656364, + "grad_norm": 1.546875, + "learning_rate": 1.8385000398361517e-05, + "loss": 0.9502, + "step": 3291 + }, + { + "epoch": 0.5644597809546263, + "grad_norm": 1.640625, + "learning_rate": 1.83840162085983e-05, + "loss": 0.9974, + "step": 3292 + }, + { + "epoch": 0.5646312450436162, + "grad_norm": 1.5703125, + "learning_rate": 1.838303174540074e-05, + "loss": 1.1014, + "step": 3293 + }, + { + "epoch": 0.564802709132606, + "grad_norm": 1.5703125, + "learning_rate": 1.8382047008800947e-05, + "loss": 0.9488, + "step": 3294 + }, + { + "epoch": 0.5649741732215959, + "grad_norm": 1.6015625, + "learning_rate": 1.838106199883104e-05, + "loss": 1.0501, + "step": 3295 + }, + { + "epoch": 0.5651456373105858, + "grad_norm": 1.5625, + "learning_rate": 1.838007671552314e-05, + "loss": 0.9358, + "step": 3296 + }, + { + "epoch": 0.5653171013995756, + "grad_norm": 1.5859375, + "learning_rate": 1.837909115890938e-05, + "loss": 1.0654, + "step": 3297 + }, + { + "epoch": 0.5654885654885655, + "grad_norm": 1.5, + "learning_rate": 1.8378105329021908e-05, + "loss": 0.9435, + "step": 3298 + }, + { + "epoch": 0.5656600295775553, + "grad_norm": 1.65625, + "learning_rate": 1.8377119225892868e-05, + "loss": 0.9904, + "step": 3299 + }, + { + "epoch": 0.5658314936665452, + "grad_norm": 
1.6015625, + "learning_rate": 1.8376132849554428e-05, + "loss": 1.0634, + "step": 3300 + }, + { + "epoch": 0.5660029577555351, + "grad_norm": 1.6015625, + "learning_rate": 1.8375146200038752e-05, + "loss": 1.0184, + "step": 3301 + }, + { + "epoch": 0.5661744218445249, + "grad_norm": 1.65625, + "learning_rate": 1.8374159277378024e-05, + "loss": 1.0845, + "step": 3302 + }, + { + "epoch": 0.5663458859335148, + "grad_norm": 1.578125, + "learning_rate": 1.8373172081604424e-05, + "loss": 0.9841, + "step": 3303 + }, + { + "epoch": 0.5665173500225047, + "grad_norm": 1.5859375, + "learning_rate": 1.8372184612750152e-05, + "loss": 1.0651, + "step": 3304 + }, + { + "epoch": 0.5666888141114945, + "grad_norm": 1.7265625, + "learning_rate": 1.8371196870847418e-05, + "loss": 1.0452, + "step": 3305 + }, + { + "epoch": 0.5668602782004843, + "grad_norm": 1.59375, + "learning_rate": 1.8370208855928427e-05, + "loss": 1.0319, + "step": 3306 + }, + { + "epoch": 0.5670317422894743, + "grad_norm": 1.6953125, + "learning_rate": 1.8369220568025405e-05, + "loss": 1.0709, + "step": 3307 + }, + { + "epoch": 0.5672032063784641, + "grad_norm": 1.6015625, + "learning_rate": 1.8368232007170587e-05, + "loss": 1.0725, + "step": 3308 + }, + { + "epoch": 0.5673746704674539, + "grad_norm": 1.5390625, + "learning_rate": 1.836724317339621e-05, + "loss": 1.0185, + "step": 3309 + }, + { + "epoch": 0.5675461345564439, + "grad_norm": 1.640625, + "learning_rate": 1.8366254066734526e-05, + "loss": 1.073, + "step": 3310 + }, + { + "epoch": 0.5677175986454337, + "grad_norm": 1.625, + "learning_rate": 1.836526468721779e-05, + "loss": 1.0662, + "step": 3311 + }, + { + "epoch": 0.5678890627344235, + "grad_norm": 1.5078125, + "learning_rate": 1.8364275034878277e-05, + "loss": 1.0078, + "step": 3312 + }, + { + "epoch": 0.5680605268234135, + "grad_norm": 1.5390625, + "learning_rate": 1.8363285109748253e-05, + "loss": 1.0112, + "step": 3313 + }, + { + "epoch": 0.5682319909124033, + "grad_norm": 1.6171875, + "learning_rate": 1.836229491186001e-05, + "loss": 1.0086, + "step": 3314 + }, + { + "epoch": 0.5684034550013931, + "grad_norm": 1.5625, + "learning_rate": 1.8361304441245843e-05, + "loss": 1.0831, + "step": 3315 + }, + { + "epoch": 0.568574919090383, + "grad_norm": 1.59375, + "learning_rate": 1.836031369793805e-05, + "loss": 1.0802, + "step": 3316 + }, + { + "epoch": 0.5687463831793729, + "grad_norm": 1.53125, + "learning_rate": 1.8359322681968942e-05, + "loss": 1.0011, + "step": 3317 + }, + { + "epoch": 0.5689178472683627, + "grad_norm": 1.5234375, + "learning_rate": 1.8358331393370847e-05, + "loss": 1.0521, + "step": 3318 + }, + { + "epoch": 0.5690893113573526, + "grad_norm": 1.6484375, + "learning_rate": 1.8357339832176092e-05, + "loss": 1.0235, + "step": 3319 + }, + { + "epoch": 0.5692607754463425, + "grad_norm": 1.5859375, + "learning_rate": 1.835634799841701e-05, + "loss": 1.0332, + "step": 3320 + }, + { + "epoch": 0.5694322395353323, + "grad_norm": 1.578125, + "learning_rate": 1.835535589212596e-05, + "loss": 1.0653, + "step": 3321 + }, + { + "epoch": 0.5696037036243222, + "grad_norm": 1.6484375, + "learning_rate": 1.8354363513335282e-05, + "loss": 1.1728, + "step": 3322 + }, + { + "epoch": 0.569775167713312, + "grad_norm": 1.53125, + "learning_rate": 1.8353370862077358e-05, + "loss": 1.0377, + "step": 3323 + }, + { + "epoch": 0.5699466318023019, + "grad_norm": 1.65625, + "learning_rate": 1.835237793838455e-05, + "loss": 1.0604, + "step": 3324 + }, + { + "epoch": 0.5701180958912918, + "grad_norm": 1.609375, + "learning_rate": 
1.8351384742289244e-05, + "loss": 1.0777, + "step": 3325 + }, + { + "epoch": 0.5702895599802816, + "grad_norm": 1.53125, + "learning_rate": 1.8350391273823836e-05, + "loss": 1.0222, + "step": 3326 + }, + { + "epoch": 0.5704610240692715, + "grad_norm": 1.6015625, + "learning_rate": 1.8349397533020723e-05, + "loss": 1.0333, + "step": 3327 + }, + { + "epoch": 0.5706324881582614, + "grad_norm": 1.578125, + "learning_rate": 1.8348403519912316e-05, + "loss": 1.0752, + "step": 3328 + }, + { + "epoch": 0.5708039522472512, + "grad_norm": 1.5625, + "learning_rate": 1.8347409234531034e-05, + "loss": 1.0549, + "step": 3329 + }, + { + "epoch": 0.570975416336241, + "grad_norm": 1.6015625, + "learning_rate": 1.83464146769093e-05, + "loss": 1.0061, + "step": 3330 + }, + { + "epoch": 0.571146880425231, + "grad_norm": 1.6953125, + "learning_rate": 1.8345419847079555e-05, + "loss": 0.9759, + "step": 3331 + }, + { + "epoch": 0.5713183445142208, + "grad_norm": 1.5390625, + "learning_rate": 1.8344424745074245e-05, + "loss": 1.0317, + "step": 3332 + }, + { + "epoch": 0.5714898086032106, + "grad_norm": 1.6015625, + "learning_rate": 1.834342937092582e-05, + "loss": 1.0236, + "step": 3333 + }, + { + "epoch": 0.5716612726922006, + "grad_norm": 1.5859375, + "learning_rate": 1.8342433724666745e-05, + "loss": 1.032, + "step": 3334 + }, + { + "epoch": 0.5718327367811904, + "grad_norm": 1.5703125, + "learning_rate": 1.8341437806329492e-05, + "loss": 1.0583, + "step": 3335 + }, + { + "epoch": 0.5720042008701802, + "grad_norm": 1.5703125, + "learning_rate": 1.8340441615946544e-05, + "loss": 1.0017, + "step": 3336 + }, + { + "epoch": 0.5721756649591702, + "grad_norm": 1.6875, + "learning_rate": 1.8339445153550382e-05, + "loss": 1.1914, + "step": 3337 + }, + { + "epoch": 0.57234712904816, + "grad_norm": 1.65625, + "learning_rate": 1.8338448419173514e-05, + "loss": 1.0421, + "step": 3338 + }, + { + "epoch": 0.5725185931371498, + "grad_norm": 1.578125, + "learning_rate": 1.833745141284844e-05, + "loss": 1.0538, + "step": 3339 + }, + { + "epoch": 0.5726900572261397, + "grad_norm": 1.5546875, + "learning_rate": 1.833645413460768e-05, + "loss": 1.019, + "step": 3340 + }, + { + "epoch": 0.5728615213151296, + "grad_norm": 1.6796875, + "learning_rate": 1.8335456584483763e-05, + "loss": 1.0864, + "step": 3341 + }, + { + "epoch": 0.5730329854041194, + "grad_norm": 1.5234375, + "learning_rate": 1.8334458762509214e-05, + "loss": 0.988, + "step": 3342 + }, + { + "epoch": 0.5732044494931093, + "grad_norm": 1.65625, + "learning_rate": 1.833346066871658e-05, + "loss": 1.0151, + "step": 3343 + }, + { + "epoch": 0.5733759135820992, + "grad_norm": 1.6953125, + "learning_rate": 1.8332462303138418e-05, + "loss": 1.0402, + "step": 3344 + }, + { + "epoch": 0.573547377671089, + "grad_norm": 1.9375, + "learning_rate": 1.8331463665807278e-05, + "loss": 1.0646, + "step": 3345 + }, + { + "epoch": 0.5737188417600789, + "grad_norm": 1.5390625, + "learning_rate": 1.8330464756755737e-05, + "loss": 0.9698, + "step": 3346 + }, + { + "epoch": 0.5738903058490687, + "grad_norm": 1.484375, + "learning_rate": 1.832946557601637e-05, + "loss": 0.9967, + "step": 3347 + }, + { + "epoch": 0.5740617699380586, + "grad_norm": 1.6015625, + "learning_rate": 1.832846612362176e-05, + "loss": 0.9096, + "step": 3348 + }, + { + "epoch": 0.5742332340270485, + "grad_norm": 1.5546875, + "learning_rate": 1.8327466399604517e-05, + "loss": 1.0488, + "step": 3349 + }, + { + "epoch": 0.5744046981160383, + "grad_norm": 1.578125, + "learning_rate": 1.8326466403997228e-05, + "loss": 
1.0233, + "step": 3350 + }, + { + "epoch": 0.5745761622050282, + "grad_norm": 1.65625, + "learning_rate": 1.832546613683252e-05, + "loss": 0.9991, + "step": 3351 + }, + { + "epoch": 0.5747476262940181, + "grad_norm": 1.6640625, + "learning_rate": 1.832446559814301e-05, + "loss": 1.12, + "step": 3352 + }, + { + "epoch": 0.5749190903830079, + "grad_norm": 1.7109375, + "learning_rate": 1.8323464787961327e-05, + "loss": 1.0985, + "step": 3353 + }, + { + "epoch": 0.5750905544719978, + "grad_norm": 1.4609375, + "learning_rate": 1.8322463706320116e-05, + "loss": 0.9235, + "step": 3354 + }, + { + "epoch": 0.5752620185609877, + "grad_norm": 1.5078125, + "learning_rate": 1.8321462353252024e-05, + "loss": 0.9686, + "step": 3355 + }, + { + "epoch": 0.5754334826499775, + "grad_norm": 1.609375, + "learning_rate": 1.8320460728789705e-05, + "loss": 0.991, + "step": 3356 + }, + { + "epoch": 0.5756049467389673, + "grad_norm": 1.625, + "learning_rate": 1.8319458832965836e-05, + "loss": 1.0094, + "step": 3357 + }, + { + "epoch": 0.5757764108279573, + "grad_norm": 1.625, + "learning_rate": 1.831845666581308e-05, + "loss": 1.0868, + "step": 3358 + }, + { + "epoch": 0.5759478749169471, + "grad_norm": 1.5703125, + "learning_rate": 1.8317454227364134e-05, + "loss": 0.9983, + "step": 3359 + }, + { + "epoch": 0.5761193390059369, + "grad_norm": 1.5703125, + "learning_rate": 1.8316451517651683e-05, + "loss": 1.0399, + "step": 3360 + }, + { + "epoch": 0.5762908030949268, + "grad_norm": 1.484375, + "learning_rate": 1.831544853670843e-05, + "loss": 0.9766, + "step": 3361 + }, + { + "epoch": 0.5764622671839167, + "grad_norm": 1.5859375, + "learning_rate": 1.8314445284567087e-05, + "loss": 0.9866, + "step": 3362 + }, + { + "epoch": 0.5766337312729065, + "grad_norm": 1.5703125, + "learning_rate": 1.8313441761260374e-05, + "loss": 1.0635, + "step": 3363 + }, + { + "epoch": 0.5768051953618963, + "grad_norm": 1.6015625, + "learning_rate": 1.831243796682102e-05, + "loss": 0.9684, + "step": 3364 + }, + { + "epoch": 0.5769766594508863, + "grad_norm": 1.5625, + "learning_rate": 1.831143390128176e-05, + "loss": 1.0406, + "step": 3365 + }, + { + "epoch": 0.5771481235398761, + "grad_norm": 1.6015625, + "learning_rate": 1.8310429564675347e-05, + "loss": 0.943, + "step": 3366 + }, + { + "epoch": 0.5773195876288659, + "grad_norm": 1.625, + "learning_rate": 1.8309424957034528e-05, + "loss": 1.0568, + "step": 3367 + }, + { + "epoch": 0.5774910517178559, + "grad_norm": 1.5390625, + "learning_rate": 1.8308420078392073e-05, + "loss": 1.0687, + "step": 3368 + }, + { + "epoch": 0.5776625158068457, + "grad_norm": 1.71875, + "learning_rate": 1.8307414928780753e-05, + "loss": 1.098, + "step": 3369 + }, + { + "epoch": 0.5778339798958355, + "grad_norm": 1.7109375, + "learning_rate": 1.8306409508233347e-05, + "loss": 1.1567, + "step": 3370 + }, + { + "epoch": 0.5780054439848255, + "grad_norm": 1.59375, + "learning_rate": 1.830540381678265e-05, + "loss": 1.0479, + "step": 3371 + }, + { + "epoch": 0.5781769080738153, + "grad_norm": 1.5546875, + "learning_rate": 1.830439785446146e-05, + "loss": 1.0058, + "step": 3372 + }, + { + "epoch": 0.5783483721628051, + "grad_norm": 1.65625, + "learning_rate": 1.830339162130258e-05, + "loss": 1.0528, + "step": 3373 + }, + { + "epoch": 0.578519836251795, + "grad_norm": 1.5859375, + "learning_rate": 1.8302385117338836e-05, + "loss": 1.0247, + "step": 3374 + }, + { + "epoch": 0.5786913003407849, + "grad_norm": 1.46875, + "learning_rate": 1.8301378342603045e-05, + "loss": 1.0035, + "step": 3375 + }, + { + "epoch": 
0.5788627644297747, + "grad_norm": 1.5390625, + "learning_rate": 1.830037129712805e-05, + "loss": 1.0463, + "step": 3376 + }, + { + "epoch": 0.5790342285187646, + "grad_norm": 1.5859375, + "learning_rate": 1.829936398094669e-05, + "loss": 1.0622, + "step": 3377 + }, + { + "epoch": 0.5792056926077545, + "grad_norm": 1.59375, + "learning_rate": 1.829835639409182e-05, + "loss": 1.0306, + "step": 3378 + }, + { + "epoch": 0.5793771566967443, + "grad_norm": 1.578125, + "learning_rate": 1.8297348536596297e-05, + "loss": 1.0617, + "step": 3379 + }, + { + "epoch": 0.5795486207857342, + "grad_norm": 1.5390625, + "learning_rate": 1.829634040849299e-05, + "loss": 1.0848, + "step": 3380 + }, + { + "epoch": 0.579720084874724, + "grad_norm": 1.5546875, + "learning_rate": 1.829533200981479e-05, + "loss": 0.9907, + "step": 3381 + }, + { + "epoch": 0.5798915489637139, + "grad_norm": 1.578125, + "learning_rate": 1.8294323340594565e-05, + "loss": 1.0193, + "step": 3382 + }, + { + "epoch": 0.5800630130527038, + "grad_norm": 1.5390625, + "learning_rate": 1.829331440086523e-05, + "loss": 0.9932, + "step": 3383 + }, + { + "epoch": 0.5802344771416936, + "grad_norm": 1.5390625, + "learning_rate": 1.829230519065968e-05, + "loss": 0.9571, + "step": 3384 + }, + { + "epoch": 0.5804059412306835, + "grad_norm": 1.4296875, + "learning_rate": 1.829129571001083e-05, + "loss": 0.969, + "step": 3385 + }, + { + "epoch": 0.5805774053196734, + "grad_norm": 1.5, + "learning_rate": 1.829028595895161e-05, + "loss": 0.963, + "step": 3386 + }, + { + "epoch": 0.5807488694086632, + "grad_norm": 1.5390625, + "learning_rate": 1.828927593751494e-05, + "loss": 0.9628, + "step": 3387 + }, + { + "epoch": 0.580920333497653, + "grad_norm": 1.5234375, + "learning_rate": 1.828826564573377e-05, + "loss": 0.9359, + "step": 3388 + }, + { + "epoch": 0.581091797586643, + "grad_norm": 1.546875, + "learning_rate": 1.8287255083641048e-05, + "loss": 1.0194, + "step": 3389 + }, + { + "epoch": 0.5812632616756328, + "grad_norm": 1.515625, + "learning_rate": 1.828624425126973e-05, + "loss": 1.0237, + "step": 3390 + }, + { + "epoch": 0.5814347257646226, + "grad_norm": 1.6328125, + "learning_rate": 1.8285233148652784e-05, + "loss": 1.0385, + "step": 3391 + }, + { + "epoch": 0.5816061898536126, + "grad_norm": 1.5078125, + "learning_rate": 1.8284221775823187e-05, + "loss": 1.0283, + "step": 3392 + }, + { + "epoch": 0.5817776539426024, + "grad_norm": 1.5859375, + "learning_rate": 1.828321013281392e-05, + "loss": 1.0483, + "step": 3393 + }, + { + "epoch": 0.5819491180315922, + "grad_norm": 1.609375, + "learning_rate": 1.828219821965798e-05, + "loss": 1.0926, + "step": 3394 + }, + { + "epoch": 0.5821205821205822, + "grad_norm": 1.609375, + "learning_rate": 1.8281186036388368e-05, + "loss": 1.0863, + "step": 3395 + }, + { + "epoch": 0.582292046209572, + "grad_norm": 1.5390625, + "learning_rate": 1.8280173583038095e-05, + "loss": 1.0118, + "step": 3396 + }, + { + "epoch": 0.5824635102985618, + "grad_norm": 1.5859375, + "learning_rate": 1.8279160859640182e-05, + "loss": 0.978, + "step": 3397 + }, + { + "epoch": 0.5826349743875517, + "grad_norm": 1.4609375, + "learning_rate": 1.8278147866227658e-05, + "loss": 1.029, + "step": 3398 + }, + { + "epoch": 0.5828064384765416, + "grad_norm": 1.53125, + "learning_rate": 1.8277134602833556e-05, + "loss": 1.0615, + "step": 3399 + }, + { + "epoch": 0.5829779025655314, + "grad_norm": 1.515625, + "learning_rate": 1.8276121069490925e-05, + "loss": 1.0119, + "step": 3400 + }, + { + "epoch": 0.5831493666545213, + "grad_norm": 
1.6171875, + "learning_rate": 1.8275107266232826e-05, + "loss": 1.0311, + "step": 3401 + }, + { + "epoch": 0.5833208307435112, + "grad_norm": 1.578125, + "learning_rate": 1.8274093193092313e-05, + "loss": 1.0166, + "step": 3402 + }, + { + "epoch": 0.583492294832501, + "grad_norm": 1.6015625, + "learning_rate": 1.827307885010247e-05, + "loss": 1.0905, + "step": 3403 + }, + { + "epoch": 0.5836637589214909, + "grad_norm": 1.75, + "learning_rate": 1.8272064237296367e-05, + "loss": 1.0299, + "step": 3404 + }, + { + "epoch": 0.5838352230104807, + "grad_norm": 1.546875, + "learning_rate": 1.82710493547071e-05, + "loss": 0.9962, + "step": 3405 + }, + { + "epoch": 0.5840066870994706, + "grad_norm": 1.625, + "learning_rate": 1.827003420236777e-05, + "loss": 0.9978, + "step": 3406 + }, + { + "epoch": 0.5841781511884605, + "grad_norm": 1.6171875, + "learning_rate": 1.826901878031148e-05, + "loss": 1.0804, + "step": 3407 + }, + { + "epoch": 0.5843496152774503, + "grad_norm": 1.6171875, + "learning_rate": 1.8268003088571352e-05, + "loss": 1.0453, + "step": 3408 + }, + { + "epoch": 0.5845210793664402, + "grad_norm": 1.5, + "learning_rate": 1.8266987127180507e-05, + "loss": 0.9565, + "step": 3409 + }, + { + "epoch": 0.5846925434554301, + "grad_norm": 1.671875, + "learning_rate": 1.8265970896172082e-05, + "loss": 1.0409, + "step": 3410 + }, + { + "epoch": 0.5848640075444199, + "grad_norm": 1.65625, + "learning_rate": 1.8264954395579216e-05, + "loss": 1.0311, + "step": 3411 + }, + { + "epoch": 0.5850354716334097, + "grad_norm": 1.7265625, + "learning_rate": 1.826393762543507e-05, + "loss": 1.0295, + "step": 3412 + }, + { + "epoch": 0.5852069357223997, + "grad_norm": 1.546875, + "learning_rate": 1.8262920585772793e-05, + "loss": 0.9599, + "step": 3413 + }, + { + "epoch": 0.5853783998113895, + "grad_norm": 1.5703125, + "learning_rate": 1.8261903276625562e-05, + "loss": 0.9765, + "step": 3414 + }, + { + "epoch": 0.5855498639003793, + "grad_norm": 1.546875, + "learning_rate": 1.826088569802656e-05, + "loss": 1.0553, + "step": 3415 + }, + { + "epoch": 0.5857213279893693, + "grad_norm": 1.6328125, + "learning_rate": 1.825986785000896e-05, + "loss": 0.9898, + "step": 3416 + }, + { + "epoch": 0.5858927920783591, + "grad_norm": 1.640625, + "learning_rate": 1.825884973260597e-05, + "loss": 1.1863, + "step": 3417 + }, + { + "epoch": 0.5860642561673489, + "grad_norm": 1.5625, + "learning_rate": 1.8257831345850787e-05, + "loss": 0.9711, + "step": 3418 + }, + { + "epoch": 0.5862357202563389, + "grad_norm": 1.5859375, + "learning_rate": 1.825681268977663e-05, + "loss": 1.0413, + "step": 3419 + }, + { + "epoch": 0.5864071843453287, + "grad_norm": 1.5546875, + "learning_rate": 1.8255793764416718e-05, + "loss": 1.0048, + "step": 3420 + }, + { + "epoch": 0.5865786484343185, + "grad_norm": 1.5625, + "learning_rate": 1.8254774569804284e-05, + "loss": 1.0406, + "step": 3421 + }, + { + "epoch": 0.5867501125233084, + "grad_norm": 1.609375, + "learning_rate": 1.8253755105972565e-05, + "loss": 1.0598, + "step": 3422 + }, + { + "epoch": 0.5869215766122983, + "grad_norm": 1.6640625, + "learning_rate": 1.8252735372954812e-05, + "loss": 1.0493, + "step": 3423 + }, + { + "epoch": 0.5870930407012881, + "grad_norm": 1.71875, + "learning_rate": 1.8251715370784283e-05, + "loss": 1.0204, + "step": 3424 + }, + { + "epoch": 0.587264504790278, + "grad_norm": 1.5390625, + "learning_rate": 1.825069509949424e-05, + "loss": 1.0498, + "step": 3425 + }, + { + "epoch": 0.5874359688792679, + "grad_norm": 1.6875, + "learning_rate": 
1.8249674559117963e-05, + "loss": 1.0432, + "step": 3426 + }, + { + "epoch": 0.5876074329682577, + "grad_norm": 1.5546875, + "learning_rate": 1.824865374968873e-05, + "loss": 1.0371, + "step": 3427 + }, + { + "epoch": 0.5877788970572476, + "grad_norm": 1.5859375, + "learning_rate": 1.824763267123984e-05, + "loss": 1.0738, + "step": 3428 + }, + { + "epoch": 0.5879503611462374, + "grad_norm": 1.4921875, + "learning_rate": 1.824661132380459e-05, + "loss": 0.9195, + "step": 3429 + }, + { + "epoch": 0.5881218252352273, + "grad_norm": 1.5703125, + "learning_rate": 1.824558970741629e-05, + "loss": 1.043, + "step": 3430 + }, + { + "epoch": 0.5882932893242172, + "grad_norm": 1.546875, + "learning_rate": 1.8244567822108262e-05, + "loss": 0.9982, + "step": 3431 + }, + { + "epoch": 0.588464753413207, + "grad_norm": 1.546875, + "learning_rate": 1.824354566791383e-05, + "loss": 1.0171, + "step": 3432 + }, + { + "epoch": 0.5886362175021969, + "grad_norm": 1.5703125, + "learning_rate": 1.824252324486633e-05, + "loss": 0.9613, + "step": 3433 + }, + { + "epoch": 0.5888076815911868, + "grad_norm": 1.65625, + "learning_rate": 1.824150055299911e-05, + "loss": 1.1034, + "step": 3434 + }, + { + "epoch": 0.5889791456801766, + "grad_norm": 1.5546875, + "learning_rate": 1.8240477592345522e-05, + "loss": 0.9503, + "step": 3435 + }, + { + "epoch": 0.5891506097691664, + "grad_norm": 1.5703125, + "learning_rate": 1.8239454362938925e-05, + "loss": 1.0519, + "step": 3436 + }, + { + "epoch": 0.5893220738581564, + "grad_norm": 1.59375, + "learning_rate": 1.82384308648127e-05, + "loss": 1.0554, + "step": 3437 + }, + { + "epoch": 0.5894935379471462, + "grad_norm": 1.578125, + "learning_rate": 1.823740709800022e-05, + "loss": 0.9954, + "step": 3438 + }, + { + "epoch": 0.589665002036136, + "grad_norm": 1.8359375, + "learning_rate": 1.8236383062534874e-05, + "loss": 0.961, + "step": 3439 + }, + { + "epoch": 0.589836466125126, + "grad_norm": 1.7265625, + "learning_rate": 1.8235358758450062e-05, + "loss": 1.0856, + "step": 3440 + }, + { + "epoch": 0.5900079302141158, + "grad_norm": 1.5546875, + "learning_rate": 1.8234334185779186e-05, + "loss": 1.0408, + "step": 3441 + }, + { + "epoch": 0.5901793943031056, + "grad_norm": 1.5859375, + "learning_rate": 1.823330934455567e-05, + "loss": 1.0096, + "step": 3442 + }, + { + "epoch": 0.5903508583920956, + "grad_norm": 1.640625, + "learning_rate": 1.823228423481293e-05, + "loss": 1.0386, + "step": 3443 + }, + { + "epoch": 0.5905223224810854, + "grad_norm": 1.6171875, + "learning_rate": 1.82312588565844e-05, + "loss": 1.0921, + "step": 3444 + }, + { + "epoch": 0.5906937865700752, + "grad_norm": 1.6484375, + "learning_rate": 1.8230233209903527e-05, + "loss": 1.108, + "step": 3445 + }, + { + "epoch": 0.5908652506590651, + "grad_norm": 1.5859375, + "learning_rate": 1.8229207294803753e-05, + "loss": 0.9812, + "step": 3446 + }, + { + "epoch": 0.591036714748055, + "grad_norm": 1.5390625, + "learning_rate": 1.8228181111318538e-05, + "loss": 1.0402, + "step": 3447 + }, + { + "epoch": 0.5912081788370448, + "grad_norm": 1.578125, + "learning_rate": 1.822715465948136e-05, + "loss": 0.9975, + "step": 3448 + }, + { + "epoch": 0.5913796429260347, + "grad_norm": 1.5859375, + "learning_rate": 1.8226127939325683e-05, + "loss": 1.0768, + "step": 3449 + }, + { + "epoch": 0.5915511070150246, + "grad_norm": 1.5, + "learning_rate": 1.8225100950884997e-05, + "loss": 1.0326, + "step": 3450 + }, + { + "epoch": 0.5917225711040144, + "grad_norm": 1.5625, + "learning_rate": 1.8224073694192798e-05, + "loss": 1.0055, + 
"step": 3451 + }, + { + "epoch": 0.5918940351930043, + "grad_norm": 1.71875, + "learning_rate": 1.8223046169282585e-05, + "loss": 1.0598, + "step": 3452 + }, + { + "epoch": 0.5920654992819941, + "grad_norm": 1.6171875, + "learning_rate": 1.8222018376187873e-05, + "loss": 1.0051, + "step": 3453 + }, + { + "epoch": 0.592236963370984, + "grad_norm": 1.4765625, + "learning_rate": 1.822099031494218e-05, + "loss": 0.937, + "step": 3454 + }, + { + "epoch": 0.5924084274599738, + "grad_norm": 1.515625, + "learning_rate": 1.8219961985579035e-05, + "loss": 0.9945, + "step": 3455 + }, + { + "epoch": 0.5925798915489637, + "grad_norm": 1.6328125, + "learning_rate": 1.821893338813198e-05, + "loss": 1.1318, + "step": 3456 + }, + { + "epoch": 0.5927513556379536, + "grad_norm": 1.6171875, + "learning_rate": 1.8217904522634553e-05, + "loss": 1.0298, + "step": 3457 + }, + { + "epoch": 0.5929228197269434, + "grad_norm": 1.5546875, + "learning_rate": 1.8216875389120316e-05, + "loss": 0.9916, + "step": 3458 + }, + { + "epoch": 0.5930942838159333, + "grad_norm": 1.5390625, + "learning_rate": 1.8215845987622828e-05, + "loss": 1.0565, + "step": 3459 + }, + { + "epoch": 0.5932657479049231, + "grad_norm": 1.546875, + "learning_rate": 1.8214816318175663e-05, + "loss": 0.9459, + "step": 3460 + }, + { + "epoch": 0.593437211993913, + "grad_norm": 1.6015625, + "learning_rate": 1.821378638081241e-05, + "loss": 1.0692, + "step": 3461 + }, + { + "epoch": 0.5936086760829029, + "grad_norm": 1.6015625, + "learning_rate": 1.8212756175566646e-05, + "loss": 1.0501, + "step": 3462 + }, + { + "epoch": 0.5937801401718927, + "grad_norm": 1.734375, + "learning_rate": 1.8211725702471984e-05, + "loss": 1.0388, + "step": 3463 + }, + { + "epoch": 0.5939516042608826, + "grad_norm": 1.5859375, + "learning_rate": 1.8210694961562015e-05, + "loss": 1.0025, + "step": 3464 + }, + { + "epoch": 0.5941230683498725, + "grad_norm": 1.6796875, + "learning_rate": 1.8209663952870373e-05, + "loss": 1.046, + "step": 3465 + }, + { + "epoch": 0.5942945324388623, + "grad_norm": 1.59375, + "learning_rate": 1.8208632676430674e-05, + "loss": 1.0592, + "step": 3466 + }, + { + "epoch": 0.5944659965278521, + "grad_norm": 1.515625, + "learning_rate": 1.820760113227655e-05, + "loss": 0.9688, + "step": 3467 + }, + { + "epoch": 0.5946374606168421, + "grad_norm": 1.65625, + "learning_rate": 1.8206569320441645e-05, + "loss": 0.9846, + "step": 3468 + }, + { + "epoch": 0.5948089247058319, + "grad_norm": 1.59375, + "learning_rate": 1.8205537240959615e-05, + "loss": 0.9957, + "step": 3469 + }, + { + "epoch": 0.5949803887948217, + "grad_norm": 1.640625, + "learning_rate": 1.820450489386411e-05, + "loss": 1.0043, + "step": 3470 + }, + { + "epoch": 0.5951518528838117, + "grad_norm": 1.5859375, + "learning_rate": 1.820347227918881e-05, + "loss": 1.0168, + "step": 3471 + }, + { + "epoch": 0.5953233169728015, + "grad_norm": 1.6484375, + "learning_rate": 1.820243939696739e-05, + "loss": 1.0769, + "step": 3472 + }, + { + "epoch": 0.5954947810617913, + "grad_norm": 1.5546875, + "learning_rate": 1.820140624723353e-05, + "loss": 1.0118, + "step": 3473 + }, + { + "epoch": 0.5956662451507813, + "grad_norm": 1.578125, + "learning_rate": 1.820037283002093e-05, + "loss": 1.0421, + "step": 3474 + }, + { + "epoch": 0.5958377092397711, + "grad_norm": 1.6015625, + "learning_rate": 1.8199339145363292e-05, + "loss": 1.061, + "step": 3475 + }, + { + "epoch": 0.5960091733287609, + "grad_norm": 1.625, + "learning_rate": 1.819830519329433e-05, + "loss": 1.0801, + "step": 3476 + }, + { + "epoch": 
0.5961806374177508, + "grad_norm": 1.671875, + "learning_rate": 1.819727097384776e-05, + "loss": 1.0314, + "step": 3477 + }, + { + "epoch": 0.5963521015067407, + "grad_norm": 1.625, + "learning_rate": 1.819623648705732e-05, + "loss": 0.9982, + "step": 3478 + }, + { + "epoch": 0.5965235655957305, + "grad_norm": 1.515625, + "learning_rate": 1.819520173295674e-05, + "loss": 1.0165, + "step": 3479 + }, + { + "epoch": 0.5966950296847204, + "grad_norm": 1.6484375, + "learning_rate": 1.819416671157977e-05, + "loss": 1.0337, + "step": 3480 + }, + { + "epoch": 0.5968664937737103, + "grad_norm": 1.5703125, + "learning_rate": 1.819313142296017e-05, + "loss": 1.0332, + "step": 3481 + }, + { + "epoch": 0.5970379578627001, + "grad_norm": 1.625, + "learning_rate": 1.8192095867131705e-05, + "loss": 1.0734, + "step": 3482 + }, + { + "epoch": 0.59720942195169, + "grad_norm": 1.5859375, + "learning_rate": 1.819106004412814e-05, + "loss": 1.1012, + "step": 3483 + }, + { + "epoch": 0.5973808860406798, + "grad_norm": 1.625, + "learning_rate": 1.8190023953983264e-05, + "loss": 0.9893, + "step": 3484 + }, + { + "epoch": 0.5975523501296697, + "grad_norm": 1.6171875, + "learning_rate": 1.818898759673087e-05, + "loss": 1.0762, + "step": 3485 + }, + { + "epoch": 0.5977238142186596, + "grad_norm": 1.640625, + "learning_rate": 1.8187950972404746e-05, + "loss": 1.093, + "step": 3486 + }, + { + "epoch": 0.5978952783076494, + "grad_norm": 1.59375, + "learning_rate": 1.818691408103871e-05, + "loss": 0.9958, + "step": 3487 + }, + { + "epoch": 0.5980667423966393, + "grad_norm": 1.5859375, + "learning_rate": 1.818587692266658e-05, + "loss": 1.0037, + "step": 3488 + }, + { + "epoch": 0.5982382064856292, + "grad_norm": 1.625, + "learning_rate": 1.818483949732218e-05, + "loss": 1.0488, + "step": 3489 + }, + { + "epoch": 0.598409670574619, + "grad_norm": 1.5703125, + "learning_rate": 1.8183801805039337e-05, + "loss": 1.027, + "step": 3490 + }, + { + "epoch": 0.5985811346636088, + "grad_norm": 1.734375, + "learning_rate": 1.8182763845851902e-05, + "loss": 1.053, + "step": 3491 + }, + { + "epoch": 0.5987525987525988, + "grad_norm": 1.7109375, + "learning_rate": 1.8181725619793727e-05, + "loss": 1.0962, + "step": 3492 + }, + { + "epoch": 0.5989240628415886, + "grad_norm": 1.5390625, + "learning_rate": 1.8180687126898666e-05, + "loss": 0.9581, + "step": 3493 + }, + { + "epoch": 0.5990955269305784, + "grad_norm": 1.5703125, + "learning_rate": 1.8179648367200596e-05, + "loss": 0.9726, + "step": 3494 + }, + { + "epoch": 0.5992669910195684, + "grad_norm": 1.515625, + "learning_rate": 1.817860934073339e-05, + "loss": 1.0337, + "step": 3495 + }, + { + "epoch": 0.5994384551085582, + "grad_norm": 1.59375, + "learning_rate": 1.8177570047530933e-05, + "loss": 1.0834, + "step": 3496 + }, + { + "epoch": 0.599609919197548, + "grad_norm": 1.546875, + "learning_rate": 1.8176530487627123e-05, + "loss": 0.9813, + "step": 3497 + }, + { + "epoch": 0.599781383286538, + "grad_norm": 1.5390625, + "learning_rate": 1.8175490661055864e-05, + "loss": 0.9932, + "step": 3498 + }, + { + "epoch": 0.5999528473755278, + "grad_norm": 1.609375, + "learning_rate": 1.817445056785107e-05, + "loss": 1.056, + "step": 3499 + }, + { + "epoch": 0.6001243114645176, + "grad_norm": 1.5703125, + "learning_rate": 1.8173410208046666e-05, + "loss": 1.021, + "step": 3500 + }, + { + "epoch": 0.6001243114645176, + "eval_loss": 0.8751522302627563, + "eval_runtime": 837.1447, + "eval_samples_per_second": 2.985, + "eval_steps_per_second": 2.985, + "step": 3500 + }, + { + "epoch": 
0.6002957755535075, + "grad_norm": 1.6875, + "learning_rate": 1.8172369581676567e-05, + "loss": 1.0605, + "step": 3501 + }, + { + "epoch": 0.6004672396424974, + "grad_norm": 1.5625, + "learning_rate": 1.817132868877473e-05, + "loss": 0.9984, + "step": 3502 + }, + { + "epoch": 0.6006387037314872, + "grad_norm": 1.5390625, + "learning_rate": 1.817028752937509e-05, + "loss": 0.9996, + "step": 3503 + }, + { + "epoch": 0.6008101678204771, + "grad_norm": 1.578125, + "learning_rate": 1.8169246103511605e-05, + "loss": 1.0918, + "step": 3504 + }, + { + "epoch": 0.600981631909467, + "grad_norm": 1.6171875, + "learning_rate": 1.8168204411218247e-05, + "loss": 1.115, + "step": 3505 + }, + { + "epoch": 0.6011530959984568, + "grad_norm": 1.625, + "learning_rate": 1.8167162452528982e-05, + "loss": 0.9894, + "step": 3506 + }, + { + "epoch": 0.6013245600874467, + "grad_norm": 1.6015625, + "learning_rate": 1.8166120227477796e-05, + "loss": 1.0847, + "step": 3507 + }, + { + "epoch": 0.6014960241764366, + "grad_norm": 1.515625, + "learning_rate": 1.816507773609868e-05, + "loss": 0.9875, + "step": 3508 + }, + { + "epoch": 0.6016674882654264, + "grad_norm": 1.6875, + "learning_rate": 1.8164034978425627e-05, + "loss": 1.0064, + "step": 3509 + }, + { + "epoch": 0.6018389523544163, + "grad_norm": 1.734375, + "learning_rate": 1.8162991954492654e-05, + "loss": 1.1553, + "step": 3510 + }, + { + "epoch": 0.6020104164434061, + "grad_norm": 1.515625, + "learning_rate": 1.8161948664333773e-05, + "loss": 0.9663, + "step": 3511 + }, + { + "epoch": 0.602181880532396, + "grad_norm": 1.5625, + "learning_rate": 1.816090510798301e-05, + "loss": 0.9988, + "step": 3512 + }, + { + "epoch": 0.6023533446213859, + "grad_norm": 1.5234375, + "learning_rate": 1.81598612854744e-05, + "loss": 0.9419, + "step": 3513 + }, + { + "epoch": 0.6025248087103757, + "grad_norm": 1.578125, + "learning_rate": 1.8158817196841993e-05, + "loss": 1.1279, + "step": 3514 + }, + { + "epoch": 0.6026962727993656, + "grad_norm": 1.59375, + "learning_rate": 1.815777284211983e-05, + "loss": 1.0023, + "step": 3515 + }, + { + "epoch": 0.6028677368883555, + "grad_norm": 1.6015625, + "learning_rate": 1.815672822134197e-05, + "loss": 1.1439, + "step": 3516 + }, + { + "epoch": 0.6030392009773453, + "grad_norm": 1.6953125, + "learning_rate": 1.8155683334542493e-05, + "loss": 1.0355, + "step": 3517 + }, + { + "epoch": 0.6032106650663351, + "grad_norm": 1.4921875, + "learning_rate": 1.8154638181755467e-05, + "loss": 1.0313, + "step": 3518 + }, + { + "epoch": 0.6033821291553251, + "grad_norm": 1.734375, + "learning_rate": 1.8153592763014987e-05, + "loss": 1.1347, + "step": 3519 + }, + { + "epoch": 0.6035535932443149, + "grad_norm": 1.4921875, + "learning_rate": 1.8152547078355143e-05, + "loss": 0.9597, + "step": 3520 + }, + { + "epoch": 0.6037250573333047, + "grad_norm": 1.546875, + "learning_rate": 1.8151501127810038e-05, + "loss": 1.0531, + "step": 3521 + }, + { + "epoch": 0.6038965214222947, + "grad_norm": 1.5078125, + "learning_rate": 1.8150454911413783e-05, + "loss": 0.9631, + "step": 3522 + }, + { + "epoch": 0.6040679855112845, + "grad_norm": 1.5390625, + "learning_rate": 1.8149408429200503e-05, + "loss": 1.0816, + "step": 3523 + }, + { + "epoch": 0.6042394496002743, + "grad_norm": 1.6015625, + "learning_rate": 1.814836168120433e-05, + "loss": 0.9617, + "step": 3524 + }, + { + "epoch": 0.6044109136892643, + "grad_norm": 1.53125, + "learning_rate": 1.8147314667459394e-05, + "loss": 0.9863, + "step": 3525 + }, + { + "epoch": 0.6045823777782541, + "grad_norm": 
1.5859375, + "learning_rate": 1.814626738799985e-05, + "loss": 1.0777, + "step": 3526 + }, + { + "epoch": 0.6047538418672439, + "grad_norm": 1.5703125, + "learning_rate": 1.814521984285985e-05, + "loss": 1.0113, + "step": 3527 + }, + { + "epoch": 0.6049253059562338, + "grad_norm": 1.5859375, + "learning_rate": 1.8144172032073558e-05, + "loss": 0.9739, + "step": 3528 + }, + { + "epoch": 0.6050967700452237, + "grad_norm": 1.6015625, + "learning_rate": 1.8143123955675147e-05, + "loss": 1.0028, + "step": 3529 + }, + { + "epoch": 0.6052682341342135, + "grad_norm": 1.671875, + "learning_rate": 1.8142075613698798e-05, + "loss": 1.0616, + "step": 3530 + }, + { + "epoch": 0.6054396982232034, + "grad_norm": 1.578125, + "learning_rate": 1.8141027006178707e-05, + "loss": 1.0612, + "step": 3531 + }, + { + "epoch": 0.6056111623121933, + "grad_norm": 1.46875, + "learning_rate": 1.813997813314907e-05, + "loss": 0.9675, + "step": 3532 + }, + { + "epoch": 0.6057826264011831, + "grad_norm": 1.5546875, + "learning_rate": 1.8138928994644088e-05, + "loss": 1.0159, + "step": 3533 + }, + { + "epoch": 0.605954090490173, + "grad_norm": 1.6015625, + "learning_rate": 1.8137879590697986e-05, + "loss": 1.0209, + "step": 3534 + }, + { + "epoch": 0.6061255545791628, + "grad_norm": 1.53125, + "learning_rate": 1.8136829921344986e-05, + "loss": 1.0564, + "step": 3535 + }, + { + "epoch": 0.6062970186681527, + "grad_norm": 1.6484375, + "learning_rate": 1.8135779986619324e-05, + "loss": 1.0058, + "step": 3536 + }, + { + "epoch": 0.6064684827571426, + "grad_norm": 1.578125, + "learning_rate": 1.813472978655524e-05, + "loss": 1.0321, + "step": 3537 + }, + { + "epoch": 0.6066399468461324, + "grad_norm": 1.59375, + "learning_rate": 1.813367932118698e-05, + "loss": 1.0533, + "step": 3538 + }, + { + "epoch": 0.6068114109351223, + "grad_norm": 1.59375, + "learning_rate": 1.8132628590548813e-05, + "loss": 0.9243, + "step": 3539 + }, + { + "epoch": 0.6069828750241122, + "grad_norm": 1.609375, + "learning_rate": 1.8131577594675003e-05, + "loss": 1.053, + "step": 3540 + }, + { + "epoch": 0.607154339113102, + "grad_norm": 1.6484375, + "learning_rate": 1.8130526333599826e-05, + "loss": 1.0014, + "step": 3541 + }, + { + "epoch": 0.6073258032020918, + "grad_norm": 1.8359375, + "learning_rate": 1.8129474807357567e-05, + "loss": 0.9082, + "step": 3542 + }, + { + "epoch": 0.6074972672910818, + "grad_norm": 1.546875, + "learning_rate": 1.8128423015982526e-05, + "loss": 1.0552, + "step": 3543 + }, + { + "epoch": 0.6076687313800716, + "grad_norm": 1.625, + "learning_rate": 1.8127370959508995e-05, + "loss": 1.1296, + "step": 3544 + }, + { + "epoch": 0.6078401954690614, + "grad_norm": 1.5703125, + "learning_rate": 1.81263186379713e-05, + "loss": 0.9598, + "step": 3545 + }, + { + "epoch": 0.6080116595580514, + "grad_norm": 1.59375, + "learning_rate": 1.812526605140375e-05, + "loss": 1.0185, + "step": 3546 + }, + { + "epoch": 0.6081831236470412, + "grad_norm": 1.6015625, + "learning_rate": 1.8124213199840674e-05, + "loss": 0.9668, + "step": 3547 + }, + { + "epoch": 0.608354587736031, + "grad_norm": 1.546875, + "learning_rate": 1.8123160083316417e-05, + "loss": 1.0008, + "step": 3548 + }, + { + "epoch": 0.6085260518250208, + "grad_norm": 1.5859375, + "learning_rate": 1.8122106701865317e-05, + "loss": 0.9254, + "step": 3549 + }, + { + "epoch": 0.6086975159140108, + "grad_norm": 1.5859375, + "learning_rate": 1.8121053055521736e-05, + "loss": 1.021, + "step": 3550 + }, + { + "epoch": 0.6088689800030006, + "grad_norm": 1.59375, + "learning_rate": 
1.811999914432003e-05, + "loss": 1.0135, + "step": 3551 + }, + { + "epoch": 0.6090404440919904, + "grad_norm": 1.515625, + "learning_rate": 1.8118944968294578e-05, + "loss": 0.9984, + "step": 3552 + }, + { + "epoch": 0.6092119081809804, + "grad_norm": 1.4609375, + "learning_rate": 1.8117890527479757e-05, + "loss": 0.8945, + "step": 3553 + }, + { + "epoch": 0.6093833722699702, + "grad_norm": 1.4921875, + "learning_rate": 1.811683582190996e-05, + "loss": 0.9852, + "step": 3554 + }, + { + "epoch": 0.60955483635896, + "grad_norm": 1.578125, + "learning_rate": 1.8115780851619575e-05, + "loss": 1.0574, + "step": 3555 + }, + { + "epoch": 0.60972630044795, + "grad_norm": 1.65625, + "learning_rate": 1.8114725616643018e-05, + "loss": 1.0476, + "step": 3556 + }, + { + "epoch": 0.6098977645369398, + "grad_norm": 1.546875, + "learning_rate": 1.8113670117014702e-05, + "loss": 1.0345, + "step": 3557 + }, + { + "epoch": 0.6100692286259296, + "grad_norm": 1.484375, + "learning_rate": 1.8112614352769048e-05, + "loss": 0.9556, + "step": 3558 + }, + { + "epoch": 0.6102406927149195, + "grad_norm": 1.5859375, + "learning_rate": 1.8111558323940493e-05, + "loss": 1.0069, + "step": 3559 + }, + { + "epoch": 0.6104121568039094, + "grad_norm": 1.640625, + "learning_rate": 1.8110502030563477e-05, + "loss": 1.0486, + "step": 3560 + }, + { + "epoch": 0.6105836208928992, + "grad_norm": 1.6171875, + "learning_rate": 1.8109445472672448e-05, + "loss": 0.9778, + "step": 3561 + }, + { + "epoch": 0.6107550849818891, + "grad_norm": 1.5859375, + "learning_rate": 1.8108388650301862e-05, + "loss": 1.0094, + "step": 3562 + }, + { + "epoch": 0.610926549070879, + "grad_norm": 1.5859375, + "learning_rate": 1.810733156348619e-05, + "loss": 0.9897, + "step": 3563 + }, + { + "epoch": 0.6110980131598688, + "grad_norm": 1.625, + "learning_rate": 1.8106274212259906e-05, + "loss": 1.0612, + "step": 3564 + }, + { + "epoch": 0.6112694772488587, + "grad_norm": 1.6015625, + "learning_rate": 1.8105216596657497e-05, + "loss": 0.9924, + "step": 3565 + }, + { + "epoch": 0.6114409413378485, + "grad_norm": 1.5546875, + "learning_rate": 1.810415871671345e-05, + "loss": 1.0056, + "step": 3566 + }, + { + "epoch": 0.6116124054268384, + "grad_norm": 1.671875, + "learning_rate": 1.810310057246227e-05, + "loss": 1.1138, + "step": 3567 + }, + { + "epoch": 0.6117838695158283, + "grad_norm": 1.6328125, + "learning_rate": 1.8102042163938472e-05, + "loss": 1.1251, + "step": 3568 + }, + { + "epoch": 0.6119553336048181, + "grad_norm": 1.5703125, + "learning_rate": 1.8100983491176563e-05, + "loss": 0.981, + "step": 3569 + }, + { + "epoch": 0.612126797693808, + "grad_norm": 1.53125, + "learning_rate": 1.809992455421108e-05, + "loss": 1.0471, + "step": 3570 + }, + { + "epoch": 0.6122982617827979, + "grad_norm": 1.6328125, + "learning_rate": 1.8098865353076554e-05, + "loss": 1.0886, + "step": 3571 + }, + { + "epoch": 0.6124697258717877, + "grad_norm": 1.7109375, + "learning_rate": 1.8097805887807536e-05, + "loss": 1.1298, + "step": 3572 + }, + { + "epoch": 0.6126411899607775, + "grad_norm": 1.625, + "learning_rate": 1.8096746158438568e-05, + "loss": 1.0555, + "step": 3573 + }, + { + "epoch": 0.6128126540497675, + "grad_norm": 1.6171875, + "learning_rate": 1.8095686165004222e-05, + "loss": 1.0145, + "step": 3574 + }, + { + "epoch": 0.6129841181387573, + "grad_norm": 1.5390625, + "learning_rate": 1.8094625907539065e-05, + "loss": 0.9968, + "step": 3575 + }, + { + "epoch": 0.6131555822277471, + "grad_norm": 1.5625, + "learning_rate": 1.8093565386077675e-05, + "loss": 
0.9829, + "step": 3576 + }, + { + "epoch": 0.6133270463167371, + "grad_norm": 1.609375, + "learning_rate": 1.8092504600654642e-05, + "loss": 1.142, + "step": 3577 + }, + { + "epoch": 0.6134985104057269, + "grad_norm": 1.5234375, + "learning_rate": 1.809144355130456e-05, + "loss": 1.0224, + "step": 3578 + }, + { + "epoch": 0.6136699744947167, + "grad_norm": 1.6171875, + "learning_rate": 1.8090382238062028e-05, + "loss": 1.0181, + "step": 3579 + }, + { + "epoch": 0.6138414385837067, + "grad_norm": 1.5234375, + "learning_rate": 1.808932066096167e-05, + "loss": 1.0756, + "step": 3580 + }, + { + "epoch": 0.6140129026726965, + "grad_norm": 1.5546875, + "learning_rate": 1.8088258820038105e-05, + "loss": 1.0064, + "step": 3581 + }, + { + "epoch": 0.6141843667616863, + "grad_norm": 1.5078125, + "learning_rate": 1.8087196715325962e-05, + "loss": 0.9854, + "step": 3582 + }, + { + "epoch": 0.6143558308506762, + "grad_norm": 1.625, + "learning_rate": 1.808613434685988e-05, + "loss": 0.9552, + "step": 3583 + }, + { + "epoch": 0.6145272949396661, + "grad_norm": 1.515625, + "learning_rate": 1.8085071714674506e-05, + "loss": 0.979, + "step": 3584 + }, + { + "epoch": 0.6146987590286559, + "grad_norm": 1.5859375, + "learning_rate": 1.80840088188045e-05, + "loss": 1.1012, + "step": 3585 + }, + { + "epoch": 0.6148702231176458, + "grad_norm": 1.546875, + "learning_rate": 1.8082945659284522e-05, + "loss": 1.0108, + "step": 3586 + }, + { + "epoch": 0.6150416872066357, + "grad_norm": 1.5390625, + "learning_rate": 1.8081882236149253e-05, + "loss": 0.9735, + "step": 3587 + }, + { + "epoch": 0.6152131512956255, + "grad_norm": 1.59375, + "learning_rate": 1.8080818549433367e-05, + "loss": 1.0742, + "step": 3588 + }, + { + "epoch": 0.6153846153846154, + "grad_norm": 1.59375, + "learning_rate": 1.807975459917156e-05, + "loss": 1.1254, + "step": 3589 + }, + { + "epoch": 0.6155560794736052, + "grad_norm": 1.5625, + "learning_rate": 1.807869038539853e-05, + "loss": 1.0363, + "step": 3590 + }, + { + "epoch": 0.6157275435625951, + "grad_norm": 1.578125, + "learning_rate": 1.8077625908148983e-05, + "loss": 1.0375, + "step": 3591 + }, + { + "epoch": 0.615899007651585, + "grad_norm": 1.5, + "learning_rate": 1.807656116745764e-05, + "loss": 0.9396, + "step": 3592 + }, + { + "epoch": 0.6160704717405748, + "grad_norm": 1.6015625, + "learning_rate": 1.8075496163359224e-05, + "loss": 1.0285, + "step": 3593 + }, + { + "epoch": 0.6162419358295647, + "grad_norm": 1.5625, + "learning_rate": 1.8074430895888466e-05, + "loss": 0.9645, + "step": 3594 + }, + { + "epoch": 0.6164133999185546, + "grad_norm": 1.625, + "learning_rate": 1.8073365365080114e-05, + "loss": 1.091, + "step": 3595 + }, + { + "epoch": 0.6165848640075444, + "grad_norm": 1.6875, + "learning_rate": 1.8072299570968915e-05, + "loss": 1.0924, + "step": 3596 + }, + { + "epoch": 0.6167563280965342, + "grad_norm": 1.65625, + "learning_rate": 1.8071233513589626e-05, + "loss": 1.0404, + "step": 3597 + }, + { + "epoch": 0.6169277921855242, + "grad_norm": 1.59375, + "learning_rate": 1.8070167192977024e-05, + "loss": 1.0805, + "step": 3598 + }, + { + "epoch": 0.617099256274514, + "grad_norm": 1.5390625, + "learning_rate": 1.8069100609165878e-05, + "loss": 1.0391, + "step": 3599 + }, + { + "epoch": 0.6172707203635038, + "grad_norm": 1.6015625, + "learning_rate": 1.8068033762190977e-05, + "loss": 1.083, + "step": 3600 + }, + { + "epoch": 0.6174421844524938, + "grad_norm": 1.6328125, + "learning_rate": 1.8066966652087114e-05, + "loss": 0.9962, + "step": 3601 + }, + { + "epoch": 
0.6176136485414836, + "grad_norm": 1.75, + "learning_rate": 1.8065899278889088e-05, + "loss": 1.0419, + "step": 3602 + }, + { + "epoch": 0.6177851126304734, + "grad_norm": 1.53125, + "learning_rate": 1.8064831642631712e-05, + "loss": 1.1506, + "step": 3603 + }, + { + "epoch": 0.6179565767194634, + "grad_norm": 1.640625, + "learning_rate": 1.8063763743349813e-05, + "loss": 1.063, + "step": 3604 + }, + { + "epoch": 0.6181280408084532, + "grad_norm": 1.515625, + "learning_rate": 1.806269558107821e-05, + "loss": 0.9929, + "step": 3605 + }, + { + "epoch": 0.618299504897443, + "grad_norm": 1.53125, + "learning_rate": 1.806162715585174e-05, + "loss": 0.9639, + "step": 3606 + }, + { + "epoch": 0.618470968986433, + "grad_norm": 1.5859375, + "learning_rate": 1.8060558467705254e-05, + "loss": 0.9289, + "step": 3607 + }, + { + "epoch": 0.6186424330754228, + "grad_norm": 1.5703125, + "learning_rate": 1.8059489516673603e-05, + "loss": 0.9864, + "step": 3608 + }, + { + "epoch": 0.6188138971644126, + "grad_norm": 1.6328125, + "learning_rate": 1.8058420302791647e-05, + "loss": 1.0511, + "step": 3609 + }, + { + "epoch": 0.6189853612534025, + "grad_norm": 1.5859375, + "learning_rate": 1.8057350826094263e-05, + "loss": 0.9789, + "step": 3610 + }, + { + "epoch": 0.6191568253423924, + "grad_norm": 1.515625, + "learning_rate": 1.805628108661633e-05, + "loss": 1.0056, + "step": 3611 + }, + { + "epoch": 0.6193282894313822, + "grad_norm": 1.625, + "learning_rate": 1.8055211084392728e-05, + "loss": 0.9905, + "step": 3612 + }, + { + "epoch": 0.6194997535203721, + "grad_norm": 1.6015625, + "learning_rate": 1.8054140819458362e-05, + "loss": 0.9457, + "step": 3613 + }, + { + "epoch": 0.619671217609362, + "grad_norm": 2.078125, + "learning_rate": 1.8053070291848132e-05, + "loss": 1.0677, + "step": 3614 + }, + { + "epoch": 0.6198426816983518, + "grad_norm": 1.453125, + "learning_rate": 1.805199950159696e-05, + "loss": 0.9021, + "step": 3615 + }, + { + "epoch": 0.6200141457873417, + "grad_norm": 1.6484375, + "learning_rate": 1.805092844873976e-05, + "loss": 1.0903, + "step": 3616 + }, + { + "epoch": 0.6201856098763315, + "grad_norm": 1.59375, + "learning_rate": 1.8049857133311467e-05, + "loss": 1.01, + "step": 3617 + }, + { + "epoch": 0.6203570739653214, + "grad_norm": 1.640625, + "learning_rate": 1.804878555534702e-05, + "loss": 0.9704, + "step": 3618 + }, + { + "epoch": 0.6205285380543113, + "grad_norm": 1.609375, + "learning_rate": 1.8047713714881366e-05, + "loss": 0.9934, + "step": 3619 + }, + { + "epoch": 0.6207000021433011, + "grad_norm": 1.6015625, + "learning_rate": 1.8046641611949463e-05, + "loss": 1.0609, + "step": 3620 + }, + { + "epoch": 0.620871466232291, + "grad_norm": 2.3125, + "learning_rate": 1.8045569246586274e-05, + "loss": 1.0085, + "step": 3621 + }, + { + "epoch": 0.6210429303212809, + "grad_norm": 1.609375, + "learning_rate": 1.804449661882678e-05, + "loss": 1.0796, + "step": 3622 + }, + { + "epoch": 0.6212143944102707, + "grad_norm": 1.6484375, + "learning_rate": 1.8043423728705955e-05, + "loss": 1.0437, + "step": 3623 + }, + { + "epoch": 0.6213858584992605, + "grad_norm": 1.53125, + "learning_rate": 1.8042350576258796e-05, + "loss": 1.0536, + "step": 3624 + }, + { + "epoch": 0.6215573225882505, + "grad_norm": 1.5546875, + "learning_rate": 1.8041277161520296e-05, + "loss": 0.9373, + "step": 3625 + }, + { + "epoch": 0.6217287866772403, + "grad_norm": 2.59375, + "learning_rate": 1.804020348452547e-05, + "loss": 1.0301, + "step": 3626 + }, + { + "epoch": 0.6219002507662301, + "grad_norm": 1.5703125, + 
"learning_rate": 1.8039129545309333e-05, + "loss": 1.0713, + "step": 3627 + }, + { + "epoch": 0.6220717148552201, + "grad_norm": 1.546875, + "learning_rate": 1.8038055343906906e-05, + "loss": 0.9714, + "step": 3628 + }, + { + "epoch": 0.6222431789442099, + "grad_norm": 1.515625, + "learning_rate": 1.803698088035323e-05, + "loss": 1.069, + "step": 3629 + }, + { + "epoch": 0.6224146430331997, + "grad_norm": 1.6875, + "learning_rate": 1.8035906154683342e-05, + "loss": 1.0507, + "step": 3630 + }, + { + "epoch": 0.6225861071221896, + "grad_norm": 1.8359375, + "learning_rate": 1.8034831166932288e-05, + "loss": 1.0842, + "step": 3631 + }, + { + "epoch": 0.6227575712111795, + "grad_norm": 1.609375, + "learning_rate": 1.803375591713514e-05, + "loss": 0.9876, + "step": 3632 + }, + { + "epoch": 0.6229290353001693, + "grad_norm": 1.5390625, + "learning_rate": 1.8032680405326956e-05, + "loss": 0.9642, + "step": 3633 + }, + { + "epoch": 0.6231004993891592, + "grad_norm": 1.7421875, + "learning_rate": 1.8031604631542815e-05, + "loss": 1.0226, + "step": 3634 + }, + { + "epoch": 0.6232719634781491, + "grad_norm": 1.5859375, + "learning_rate": 1.8030528595817804e-05, + "loss": 1.0634, + "step": 3635 + }, + { + "epoch": 0.6234434275671389, + "grad_norm": 1.4453125, + "learning_rate": 1.8029452298187015e-05, + "loss": 0.9581, + "step": 3636 + }, + { + "epoch": 0.6236148916561288, + "grad_norm": 1.6953125, + "learning_rate": 1.802837573868555e-05, + "loss": 1.0147, + "step": 3637 + }, + { + "epoch": 0.6237863557451186, + "grad_norm": 1.7109375, + "learning_rate": 1.802729891734852e-05, + "loss": 0.9945, + "step": 3638 + }, + { + "epoch": 0.6239578198341085, + "grad_norm": 1.578125, + "learning_rate": 1.8026221834211045e-05, + "loss": 0.9873, + "step": 3639 + }, + { + "epoch": 0.6241292839230984, + "grad_norm": 2.296875, + "learning_rate": 1.8025144489308255e-05, + "loss": 1.0331, + "step": 3640 + }, + { + "epoch": 0.6243007480120882, + "grad_norm": 1.5625, + "learning_rate": 1.802406688267528e-05, + "loss": 0.9741, + "step": 3641 + }, + { + "epoch": 0.6244722121010781, + "grad_norm": 1.5703125, + "learning_rate": 1.802298901434727e-05, + "loss": 1.0698, + "step": 3642 + }, + { + "epoch": 0.624643676190068, + "grad_norm": 1.71875, + "learning_rate": 1.8021910884359373e-05, + "loss": 1.0824, + "step": 3643 + }, + { + "epoch": 0.6248151402790578, + "grad_norm": 1.546875, + "learning_rate": 1.802083249274676e-05, + "loss": 0.9868, + "step": 3644 + }, + { + "epoch": 0.6249866043680476, + "grad_norm": 1.90625, + "learning_rate": 1.8019753839544588e-05, + "loss": 0.9885, + "step": 3645 + }, + { + "epoch": 0.6251580684570375, + "grad_norm": 1.5546875, + "learning_rate": 1.801867492478805e-05, + "loss": 0.9759, + "step": 3646 + }, + { + "epoch": 0.6253295325460274, + "grad_norm": 1.53125, + "learning_rate": 1.8017595748512327e-05, + "loss": 0.9293, + "step": 3647 + }, + { + "epoch": 0.6255009966350172, + "grad_norm": 1.625, + "learning_rate": 1.8016516310752614e-05, + "loss": 1.0215, + "step": 3648 + }, + { + "epoch": 0.6256724607240071, + "grad_norm": 1.5390625, + "learning_rate": 1.8015436611544112e-05, + "loss": 1.0435, + "step": 3649 + }, + { + "epoch": 0.625843924812997, + "grad_norm": 1.5859375, + "learning_rate": 1.8014356650922043e-05, + "loss": 1.016, + "step": 3650 + }, + { + "epoch": 0.6260153889019868, + "grad_norm": 1.6171875, + "learning_rate": 1.8013276428921625e-05, + "loss": 1.0187, + "step": 3651 + }, + { + "epoch": 0.6261868529909767, + "grad_norm": 1.6484375, + "learning_rate": 
1.8012195945578085e-05, + "loss": 1.0304, + "step": 3652 + }, + { + "epoch": 0.6263583170799666, + "grad_norm": 1.6328125, + "learning_rate": 1.8011115200926666e-05, + "loss": 1.1196, + "step": 3653 + }, + { + "epoch": 0.6265297811689564, + "grad_norm": 1.5390625, + "learning_rate": 1.8010034195002616e-05, + "loss": 0.9716, + "step": 3654 + }, + { + "epoch": 0.6267012452579462, + "grad_norm": 1.5703125, + "learning_rate": 1.800895292784118e-05, + "loss": 1.0112, + "step": 3655 + }, + { + "epoch": 0.6268727093469362, + "grad_norm": 1.6640625, + "learning_rate": 1.800787139947764e-05, + "loss": 1.0518, + "step": 3656 + }, + { + "epoch": 0.627044173435926, + "grad_norm": 1.5546875, + "learning_rate": 1.800678960994725e-05, + "loss": 0.9209, + "step": 3657 + }, + { + "epoch": 0.6272156375249158, + "grad_norm": 1.640625, + "learning_rate": 1.8005707559285306e-05, + "loss": 1.0306, + "step": 3658 + }, + { + "epoch": 0.6273871016139058, + "grad_norm": 1.5625, + "learning_rate": 1.800462524752709e-05, + "loss": 1.0427, + "step": 3659 + }, + { + "epoch": 0.6275585657028956, + "grad_norm": 1.609375, + "learning_rate": 1.80035426747079e-05, + "loss": 1.0499, + "step": 3660 + }, + { + "epoch": 0.6277300297918854, + "grad_norm": 1.5234375, + "learning_rate": 1.8002459840863044e-05, + "loss": 0.9191, + "step": 3661 + }, + { + "epoch": 0.6279014938808753, + "grad_norm": 1.6875, + "learning_rate": 1.8001376746027844e-05, + "loss": 1.1145, + "step": 3662 + }, + { + "epoch": 0.6280729579698652, + "grad_norm": 1.5390625, + "learning_rate": 1.800029339023761e-05, + "loss": 0.9926, + "step": 3663 + }, + { + "epoch": 0.628244422058855, + "grad_norm": 1.609375, + "learning_rate": 1.799920977352769e-05, + "loss": 1.0446, + "step": 3664 + }, + { + "epoch": 0.6284158861478449, + "grad_norm": 1.6328125, + "learning_rate": 1.799812589593341e-05, + "loss": 1.0364, + "step": 3665 + }, + { + "epoch": 0.6285873502368348, + "grad_norm": 1.4765625, + "learning_rate": 1.7997041757490133e-05, + "loss": 0.954, + "step": 3666 + }, + { + "epoch": 0.6287588143258246, + "grad_norm": 1.671875, + "learning_rate": 1.7995957358233206e-05, + "loss": 1.0657, + "step": 3667 + }, + { + "epoch": 0.6289302784148145, + "grad_norm": 1.6015625, + "learning_rate": 1.7994872698198e-05, + "loss": 1.0128, + "step": 3668 + }, + { + "epoch": 0.6291017425038044, + "grad_norm": 1.5234375, + "learning_rate": 1.799378777741989e-05, + "loss": 1.011, + "step": 3669 + }, + { + "epoch": 0.6292732065927942, + "grad_norm": 1.625, + "learning_rate": 1.7992702595934262e-05, + "loss": 1.1297, + "step": 3670 + }, + { + "epoch": 0.6294446706817841, + "grad_norm": 1.578125, + "learning_rate": 1.79916171537765e-05, + "loss": 1.0064, + "step": 3671 + }, + { + "epoch": 0.6296161347707739, + "grad_norm": 1.5703125, + "learning_rate": 1.7990531450982013e-05, + "loss": 0.9921, + "step": 3672 + }, + { + "epoch": 0.6297875988597638, + "grad_norm": 1.4765625, + "learning_rate": 1.7989445487586202e-05, + "loss": 1.0384, + "step": 3673 + }, + { + "epoch": 0.6299590629487537, + "grad_norm": 1.5546875, + "learning_rate": 1.798835926362449e-05, + "loss": 1.1058, + "step": 3674 + }, + { + "epoch": 0.6301305270377435, + "grad_norm": 1.515625, + "learning_rate": 1.79872727791323e-05, + "loss": 0.9876, + "step": 3675 + }, + { + "epoch": 0.6303019911267334, + "grad_norm": 1.6171875, + "learning_rate": 1.798618603414507e-05, + "loss": 1.0731, + "step": 3676 + }, + { + "epoch": 0.6304734552157233, + "grad_norm": 1.6953125, + "learning_rate": 1.7985099028698237e-05, + "loss": 1.0111, + 
"step": 3677 + }, + { + "epoch": 0.6306449193047131, + "grad_norm": 1.5625, + "learning_rate": 1.7984011762827257e-05, + "loss": 0.9985, + "step": 3678 + }, + { + "epoch": 0.6308163833937029, + "grad_norm": 1.609375, + "learning_rate": 1.7982924236567587e-05, + "loss": 1.0882, + "step": 3679 + }, + { + "epoch": 0.6309878474826929, + "grad_norm": 1.609375, + "learning_rate": 1.79818364499547e-05, + "loss": 1.0011, + "step": 3680 + }, + { + "epoch": 0.6311593115716827, + "grad_norm": 1.6875, + "learning_rate": 1.7980748403024067e-05, + "loss": 1.0398, + "step": 3681 + }, + { + "epoch": 0.6313307756606725, + "grad_norm": 1.59375, + "learning_rate": 1.7979660095811177e-05, + "loss": 1.0896, + "step": 3682 + }, + { + "epoch": 0.6315022397496625, + "grad_norm": 1.7109375, + "learning_rate": 1.7978571528351517e-05, + "loss": 1.0367, + "step": 3683 + }, + { + "epoch": 0.6316737038386523, + "grad_norm": 1.6015625, + "learning_rate": 1.7977482700680605e-05, + "loss": 1.0867, + "step": 3684 + }, + { + "epoch": 0.6318451679276421, + "grad_norm": 1.640625, + "learning_rate": 1.7976393612833935e-05, + "loss": 1.028, + "step": 3685 + }, + { + "epoch": 0.632016632016632, + "grad_norm": 1.6328125, + "learning_rate": 1.7975304264847035e-05, + "loss": 1.0476, + "step": 3686 + }, + { + "epoch": 0.6321880961056219, + "grad_norm": 1.578125, + "learning_rate": 1.7974214656755432e-05, + "loss": 0.9904, + "step": 3687 + }, + { + "epoch": 0.6323595601946117, + "grad_norm": 1.6484375, + "learning_rate": 1.7973124788594658e-05, + "loss": 1.0644, + "step": 3688 + }, + { + "epoch": 0.6325310242836016, + "grad_norm": 1.5234375, + "learning_rate": 1.797203466040026e-05, + "loss": 1.0607, + "step": 3689 + }, + { + "epoch": 0.6327024883725915, + "grad_norm": 1.53125, + "learning_rate": 1.7970944272207795e-05, + "loss": 0.9432, + "step": 3690 + }, + { + "epoch": 0.6328739524615813, + "grad_norm": 1.53125, + "learning_rate": 1.796985362405282e-05, + "loss": 1.0377, + "step": 3691 + }, + { + "epoch": 0.6330454165505712, + "grad_norm": 1.7265625, + "learning_rate": 1.7968762715970905e-05, + "loss": 1.1736, + "step": 3692 + }, + { + "epoch": 0.633216880639561, + "grad_norm": 1.578125, + "learning_rate": 1.7967671547997633e-05, + "loss": 1.0287, + "step": 3693 + }, + { + "epoch": 0.6333883447285509, + "grad_norm": 1.5390625, + "learning_rate": 1.7966580120168588e-05, + "loss": 1.0058, + "step": 3694 + }, + { + "epoch": 0.6335598088175408, + "grad_norm": 1.7109375, + "learning_rate": 1.7965488432519363e-05, + "loss": 1.0454, + "step": 3695 + }, + { + "epoch": 0.6337312729065306, + "grad_norm": 1.640625, + "learning_rate": 1.796439648508557e-05, + "loss": 0.9996, + "step": 3696 + }, + { + "epoch": 0.6339027369955205, + "grad_norm": 1.5625, + "learning_rate": 1.7963304277902815e-05, + "loss": 0.9916, + "step": 3697 + }, + { + "epoch": 0.6340742010845104, + "grad_norm": 1.5390625, + "learning_rate": 1.7962211811006717e-05, + "loss": 1.0748, + "step": 3698 + }, + { + "epoch": 0.6342456651735002, + "grad_norm": 1.578125, + "learning_rate": 1.7961119084432914e-05, + "loss": 0.9708, + "step": 3699 + }, + { + "epoch": 0.63441712926249, + "grad_norm": 1.4921875, + "learning_rate": 1.7960026098217033e-05, + "loss": 1.0117, + "step": 3700 + }, + { + "epoch": 0.63458859335148, + "grad_norm": 1.625, + "learning_rate": 1.795893285239473e-05, + "loss": 1.0854, + "step": 3701 + }, + { + "epoch": 0.6347600574404698, + "grad_norm": 1.5625, + "learning_rate": 1.7957839347001656e-05, + "loss": 1.0339, + "step": 3702 + }, + { + "epoch": 
0.6349315215294596, + "grad_norm": 1.578125, + "learning_rate": 1.7956745582073472e-05, + "loss": 1.0549, + "step": 3703 + }, + { + "epoch": 0.6351029856184496, + "grad_norm": 1.65625, + "learning_rate": 1.7955651557645856e-05, + "loss": 1.0122, + "step": 3704 + }, + { + "epoch": 0.6352744497074394, + "grad_norm": 1.5234375, + "learning_rate": 1.795455727375448e-05, + "loss": 0.9592, + "step": 3705 + }, + { + "epoch": 0.6354459137964292, + "grad_norm": 1.6484375, + "learning_rate": 1.7953462730435043e-05, + "loss": 1.1369, + "step": 3706 + }, + { + "epoch": 0.6356173778854192, + "grad_norm": 1.53125, + "learning_rate": 1.795236792772323e-05, + "loss": 0.9327, + "step": 3707 + }, + { + "epoch": 0.635788841974409, + "grad_norm": 1.546875, + "learning_rate": 1.795127286565476e-05, + "loss": 1.0152, + "step": 3708 + }, + { + "epoch": 0.6359603060633988, + "grad_norm": 1.6015625, + "learning_rate": 1.7950177544265333e-05, + "loss": 1.049, + "step": 3709 + }, + { + "epoch": 0.6361317701523888, + "grad_norm": 1.5390625, + "learning_rate": 1.7949081963590683e-05, + "loss": 1.0378, + "step": 3710 + }, + { + "epoch": 0.6363032342413786, + "grad_norm": 1.609375, + "learning_rate": 1.7947986123666536e-05, + "loss": 1.0444, + "step": 3711 + }, + { + "epoch": 0.6364746983303684, + "grad_norm": 1.5546875, + "learning_rate": 1.7946890024528633e-05, + "loss": 1.0106, + "step": 3712 + }, + { + "epoch": 0.6366461624193583, + "grad_norm": 1.5625, + "learning_rate": 1.794579366621272e-05, + "loss": 1.0274, + "step": 3713 + }, + { + "epoch": 0.6368176265083482, + "grad_norm": 1.5546875, + "learning_rate": 1.7944697048754552e-05, + "loss": 0.9483, + "step": 3714 + }, + { + "epoch": 0.636989090597338, + "grad_norm": 1.5859375, + "learning_rate": 1.79436001721899e-05, + "loss": 1.096, + "step": 3715 + }, + { + "epoch": 0.6371605546863279, + "grad_norm": 1.5234375, + "learning_rate": 1.794250303655453e-05, + "loss": 1.0592, + "step": 3716 + }, + { + "epoch": 0.6373320187753178, + "grad_norm": 1.5703125, + "learning_rate": 1.794140564188423e-05, + "loss": 1.0463, + "step": 3717 + }, + { + "epoch": 0.6375034828643076, + "grad_norm": 1.546875, + "learning_rate": 1.794030798821479e-05, + "loss": 0.9117, + "step": 3718 + }, + { + "epoch": 0.6376749469532975, + "grad_norm": 1.671875, + "learning_rate": 1.7939210075582003e-05, + "loss": 1.051, + "step": 3719 + }, + { + "epoch": 0.6378464110422873, + "grad_norm": 1.5703125, + "learning_rate": 1.7938111904021677e-05, + "loss": 1.0476, + "step": 3720 + }, + { + "epoch": 0.6380178751312772, + "grad_norm": 1.609375, + "learning_rate": 1.7937013473569633e-05, + "loss": 0.9729, + "step": 3721 + }, + { + "epoch": 0.6381893392202671, + "grad_norm": 1.625, + "learning_rate": 1.793591478426169e-05, + "loss": 1.0366, + "step": 3722 + }, + { + "epoch": 0.6383608033092569, + "grad_norm": 1.5625, + "learning_rate": 1.7934815836133684e-05, + "loss": 1.0052, + "step": 3723 + }, + { + "epoch": 0.6385322673982468, + "grad_norm": 1.6328125, + "learning_rate": 1.7933716629221455e-05, + "loss": 1.0623, + "step": 3724 + }, + { + "epoch": 0.6387037314872367, + "grad_norm": 1.5546875, + "learning_rate": 1.7932617163560847e-05, + "loss": 0.9928, + "step": 3725 + }, + { + "epoch": 0.6388751955762265, + "grad_norm": 1.5625, + "learning_rate": 1.793151743918772e-05, + "loss": 1.0026, + "step": 3726 + }, + { + "epoch": 0.6390466596652163, + "grad_norm": 1.5234375, + "learning_rate": 1.793041745613795e-05, + "loss": 1.0157, + "step": 3727 + }, + { + "epoch": 0.6392181237542063, + "grad_norm": 1.46875, 
+ "learning_rate": 1.7929317214447395e-05, + "loss": 1.0197, + "step": 3728 + }, + { + "epoch": 0.6393895878431961, + "grad_norm": 1.515625, + "learning_rate": 1.7928216714151953e-05, + "loss": 0.9981, + "step": 3729 + }, + { + "epoch": 0.6395610519321859, + "grad_norm": 1.5546875, + "learning_rate": 1.7927115955287506e-05, + "loss": 1.009, + "step": 3730 + }, + { + "epoch": 0.6397325160211759, + "grad_norm": 1.6171875, + "learning_rate": 1.7926014937889956e-05, + "loss": 1.0699, + "step": 3731 + }, + { + "epoch": 0.6399039801101657, + "grad_norm": 1.609375, + "learning_rate": 1.7924913661995212e-05, + "loss": 1.0051, + "step": 3732 + }, + { + "epoch": 0.6400754441991555, + "grad_norm": 1.7265625, + "learning_rate": 1.7923812127639194e-05, + "loss": 1.063, + "step": 3733 + }, + { + "epoch": 0.6402469082881455, + "grad_norm": 1.6484375, + "learning_rate": 1.7922710334857824e-05, + "loss": 0.9803, + "step": 3734 + }, + { + "epoch": 0.6404183723771353, + "grad_norm": 1.609375, + "learning_rate": 1.7921608283687035e-05, + "loss": 1.0446, + "step": 3735 + }, + { + "epoch": 0.6405898364661251, + "grad_norm": 1.640625, + "learning_rate": 1.7920505974162766e-05, + "loss": 1.02, + "step": 3736 + }, + { + "epoch": 0.640761300555115, + "grad_norm": 1.6015625, + "learning_rate": 1.7919403406320978e-05, + "loss": 1.007, + "step": 3737 + }, + { + "epoch": 0.6409327646441049, + "grad_norm": 1.609375, + "learning_rate": 1.7918300580197617e-05, + "loss": 1.0234, + "step": 3738 + }, + { + "epoch": 0.6411042287330947, + "grad_norm": 1.578125, + "learning_rate": 1.7917197495828663e-05, + "loss": 0.9882, + "step": 3739 + }, + { + "epoch": 0.6412756928220845, + "grad_norm": 1.640625, + "learning_rate": 1.791609415325008e-05, + "loss": 1.0269, + "step": 3740 + }, + { + "epoch": 0.6414471569110745, + "grad_norm": 1.6015625, + "learning_rate": 1.7914990552497863e-05, + "loss": 1.018, + "step": 3741 + }, + { + "epoch": 0.6416186210000643, + "grad_norm": 1.5625, + "learning_rate": 1.7913886693607992e-05, + "loss": 1.0189, + "step": 3742 + }, + { + "epoch": 0.6417900850890541, + "grad_norm": 1.5234375, + "learning_rate": 1.791278257661648e-05, + "loss": 1.0041, + "step": 3743 + }, + { + "epoch": 0.641961549178044, + "grad_norm": 1.5703125, + "learning_rate": 1.791167820155933e-05, + "loss": 1.0526, + "step": 3744 + }, + { + "epoch": 0.6421330132670339, + "grad_norm": 1.515625, + "learning_rate": 1.7910573568472556e-05, + "loss": 1.0612, + "step": 3745 + }, + { + "epoch": 0.6423044773560237, + "grad_norm": 1.5390625, + "learning_rate": 1.7909468677392198e-05, + "loss": 1.0807, + "step": 3746 + }, + { + "epoch": 0.6424759414450136, + "grad_norm": 1.5390625, + "learning_rate": 1.790836352835428e-05, + "loss": 1.0416, + "step": 3747 + }, + { + "epoch": 0.6426474055340035, + "grad_norm": 1.5234375, + "learning_rate": 1.7907258121394843e-05, + "loss": 0.9954, + "step": 3748 + }, + { + "epoch": 0.6428188696229933, + "grad_norm": 1.515625, + "learning_rate": 1.7906152456549946e-05, + "loss": 0.9858, + "step": 3749 + }, + { + "epoch": 0.6429903337119832, + "grad_norm": 1.546875, + "learning_rate": 1.7905046533855642e-05, + "loss": 1.0425, + "step": 3750 + }, + { + "epoch": 0.643161797800973, + "grad_norm": 1.5234375, + "learning_rate": 1.790394035334801e-05, + "loss": 0.9905, + "step": 3751 + }, + { + "epoch": 0.6433332618899629, + "grad_norm": 1.609375, + "learning_rate": 1.790283391506311e-05, + "loss": 0.9942, + "step": 3752 + }, + { + "epoch": 0.6435047259789528, + "grad_norm": 1.59375, + "learning_rate": 
1.7901727219037047e-05, + "loss": 1.0426, + "step": 3753 + }, + { + "epoch": 0.6436761900679426, + "grad_norm": 1.5546875, + "learning_rate": 1.7900620265305895e-05, + "loss": 1.1176, + "step": 3754 + }, + { + "epoch": 0.6438476541569325, + "grad_norm": 1.6171875, + "learning_rate": 1.7899513053905772e-05, + "loss": 1.0297, + "step": 3755 + }, + { + "epoch": 0.6440191182459224, + "grad_norm": 1.6484375, + "learning_rate": 1.789840558487278e-05, + "loss": 1.0504, + "step": 3756 + }, + { + "epoch": 0.6441905823349122, + "grad_norm": 1.609375, + "learning_rate": 1.7897297858243036e-05, + "loss": 1.021, + "step": 3757 + }, + { + "epoch": 0.644362046423902, + "grad_norm": 1.640625, + "learning_rate": 1.7896189874052675e-05, + "loss": 1.0672, + "step": 3758 + }, + { + "epoch": 0.644533510512892, + "grad_norm": 1.5234375, + "learning_rate": 1.7895081632337826e-05, + "loss": 0.9723, + "step": 3759 + }, + { + "epoch": 0.6447049746018818, + "grad_norm": 1.5234375, + "learning_rate": 1.7893973133134634e-05, + "loss": 0.946, + "step": 3760 + }, + { + "epoch": 0.6448764386908716, + "grad_norm": 1.6484375, + "learning_rate": 1.7892864376479254e-05, + "loss": 0.978, + "step": 3761 + }, + { + "epoch": 0.6450479027798616, + "grad_norm": 1.5859375, + "learning_rate": 1.789175536240784e-05, + "loss": 1.0083, + "step": 3762 + }, + { + "epoch": 0.6452193668688514, + "grad_norm": 1.5078125, + "learning_rate": 1.789064609095657e-05, + "loss": 1.0333, + "step": 3763 + }, + { + "epoch": 0.6453908309578412, + "grad_norm": 1.6328125, + "learning_rate": 1.788953656216162e-05, + "loss": 1.0297, + "step": 3764 + }, + { + "epoch": 0.6455622950468312, + "grad_norm": 1.5625, + "learning_rate": 1.7888426776059166e-05, + "loss": 0.9605, + "step": 3765 + }, + { + "epoch": 0.645733759135821, + "grad_norm": 1.6640625, + "learning_rate": 1.7887316732685415e-05, + "loss": 1.0634, + "step": 3766 + }, + { + "epoch": 0.6459052232248108, + "grad_norm": 1.65625, + "learning_rate": 1.7886206432076566e-05, + "loss": 1.0777, + "step": 3767 + }, + { + "epoch": 0.6460766873138007, + "grad_norm": 1.6328125, + "learning_rate": 1.7885095874268826e-05, + "loss": 1.0468, + "step": 3768 + }, + { + "epoch": 0.6462481514027906, + "grad_norm": 1.546875, + "learning_rate": 1.7883985059298418e-05, + "loss": 0.9256, + "step": 3769 + }, + { + "epoch": 0.6464196154917804, + "grad_norm": 1.671875, + "learning_rate": 1.7882873987201566e-05, + "loss": 1.0545, + "step": 3770 + }, + { + "epoch": 0.6465910795807703, + "grad_norm": 1.546875, + "learning_rate": 1.788176265801451e-05, + "loss": 0.9954, + "step": 3771 + }, + { + "epoch": 0.6467625436697602, + "grad_norm": 1.5390625, + "learning_rate": 1.7880651071773495e-05, + "loss": 0.9356, + "step": 3772 + }, + { + "epoch": 0.64693400775875, + "grad_norm": 1.6875, + "learning_rate": 1.7879539228514775e-05, + "loss": 1.0697, + "step": 3773 + }, + { + "epoch": 0.6471054718477399, + "grad_norm": 1.59375, + "learning_rate": 1.7878427128274607e-05, + "loss": 0.9816, + "step": 3774 + }, + { + "epoch": 0.6472769359367297, + "grad_norm": 1.5234375, + "learning_rate": 1.787731477108926e-05, + "loss": 0.9684, + "step": 3775 + }, + { + "epoch": 0.6474484000257196, + "grad_norm": 1.609375, + "learning_rate": 1.7876202156995018e-05, + "loss": 1.05, + "step": 3776 + }, + { + "epoch": 0.6476198641147095, + "grad_norm": 1.5859375, + "learning_rate": 1.7875089286028167e-05, + "loss": 0.9508, + "step": 3777 + }, + { + "epoch": 0.6477913282036993, + "grad_norm": 1.5859375, + "learning_rate": 1.7873976158225e-05, + "loss": 
0.9991, + "step": 3778 + }, + { + "epoch": 0.6479627922926892, + "grad_norm": 1.5, + "learning_rate": 1.7872862773621814e-05, + "loss": 0.9117, + "step": 3779 + }, + { + "epoch": 0.6481342563816791, + "grad_norm": 1.6484375, + "learning_rate": 1.787174913225493e-05, + "loss": 0.9993, + "step": 3780 + }, + { + "epoch": 0.6483057204706689, + "grad_norm": 45.5, + "learning_rate": 1.7870635234160663e-05, + "loss": 1.1466, + "step": 3781 + }, + { + "epoch": 0.6484771845596587, + "grad_norm": 1.65625, + "learning_rate": 1.7869521079375345e-05, + "loss": 0.9891, + "step": 3782 + }, + { + "epoch": 0.6486486486486487, + "grad_norm": 1.5390625, + "learning_rate": 1.786840666793531e-05, + "loss": 0.9753, + "step": 3783 + }, + { + "epoch": 0.6488201127376385, + "grad_norm": 1.6171875, + "learning_rate": 1.7867291999876905e-05, + "loss": 1.0144, + "step": 3784 + }, + { + "epoch": 0.6489915768266283, + "grad_norm": 1.625, + "learning_rate": 1.786617707523648e-05, + "loss": 1.1283, + "step": 3785 + }, + { + "epoch": 0.6491630409156183, + "grad_norm": 1.6171875, + "learning_rate": 1.78650618940504e-05, + "loss": 0.996, + "step": 3786 + }, + { + "epoch": 0.6493345050046081, + "grad_norm": 1.6328125, + "learning_rate": 1.7863946456355036e-05, + "loss": 1.0807, + "step": 3787 + }, + { + "epoch": 0.6495059690935979, + "grad_norm": 1.5390625, + "learning_rate": 1.786283076218676e-05, + "loss": 1.0865, + "step": 3788 + }, + { + "epoch": 0.6496774331825879, + "grad_norm": 1.484375, + "learning_rate": 1.786171481158197e-05, + "loss": 1.019, + "step": 3789 + }, + { + "epoch": 0.6498488972715777, + "grad_norm": 1.4921875, + "learning_rate": 1.786059860457705e-05, + "loss": 0.8982, + "step": 3790 + }, + { + "epoch": 0.6500203613605675, + "grad_norm": 1.5234375, + "learning_rate": 1.7859482141208413e-05, + "loss": 0.9797, + "step": 3791 + }, + { + "epoch": 0.6501918254495574, + "grad_norm": 1.5234375, + "learning_rate": 1.7858365421512467e-05, + "loss": 0.9876, + "step": 3792 + }, + { + "epoch": 0.6503632895385473, + "grad_norm": 1.546875, + "learning_rate": 1.7857248445525636e-05, + "loss": 1.0003, + "step": 3793 + }, + { + "epoch": 0.6505347536275371, + "grad_norm": 1.6015625, + "learning_rate": 1.785613121328434e-05, + "loss": 0.9599, + "step": 3794 + }, + { + "epoch": 0.650706217716527, + "grad_norm": 1.59375, + "learning_rate": 1.785501372482503e-05, + "loss": 1.0637, + "step": 3795 + }, + { + "epoch": 0.6508776818055169, + "grad_norm": 1.546875, + "learning_rate": 1.7853895980184136e-05, + "loss": 0.9595, + "step": 3796 + }, + { + "epoch": 0.6510491458945067, + "grad_norm": 1.546875, + "learning_rate": 1.785277797939812e-05, + "loss": 0.9918, + "step": 3797 + }, + { + "epoch": 0.6512206099834966, + "grad_norm": 1.5859375, + "learning_rate": 1.7851659722503442e-05, + "loss": 0.9836, + "step": 3798 + }, + { + "epoch": 0.6513920740724864, + "grad_norm": 1.5859375, + "learning_rate": 1.7850541209536575e-05, + "loss": 1.044, + "step": 3799 + }, + { + "epoch": 0.6515635381614763, + "grad_norm": 1.5234375, + "learning_rate": 1.7849422440533998e-05, + "loss": 0.9397, + "step": 3800 + }, + { + "epoch": 0.6517350022504662, + "grad_norm": 1.59375, + "learning_rate": 1.7848303415532197e-05, + "loss": 0.9828, + "step": 3801 + }, + { + "epoch": 0.651906466339456, + "grad_norm": 1.640625, + "learning_rate": 1.7847184134567664e-05, + "loss": 1.0753, + "step": 3802 + }, + { + "epoch": 0.6520779304284459, + "grad_norm": 1.5859375, + "learning_rate": 1.7846064597676913e-05, + "loss": 1.0568, + "step": 3803 + }, + { + "epoch": 
0.6522493945174358, + "grad_norm": 1.7265625, + "learning_rate": 1.7844944804896446e-05, + "loss": 0.978, + "step": 3804 + }, + { + "epoch": 0.6524208586064256, + "grad_norm": 1.53125, + "learning_rate": 1.784382475626279e-05, + "loss": 1.0221, + "step": 3805 + }, + { + "epoch": 0.6525923226954154, + "grad_norm": 1.6015625, + "learning_rate": 1.784270445181247e-05, + "loss": 1.0098, + "step": 3806 + }, + { + "epoch": 0.6527637867844054, + "grad_norm": 1.6015625, + "learning_rate": 1.784158389158202e-05, + "loss": 1.0093, + "step": 3807 + }, + { + "epoch": 0.6529352508733952, + "grad_norm": 1.6796875, + "learning_rate": 1.7840463075607996e-05, + "loss": 1.0862, + "step": 3808 + }, + { + "epoch": 0.653106714962385, + "grad_norm": 1.53125, + "learning_rate": 1.7839342003926945e-05, + "loss": 1.0648, + "step": 3809 + }, + { + "epoch": 0.653278179051375, + "grad_norm": 1.5234375, + "learning_rate": 1.7838220676575433e-05, + "loss": 1.0024, + "step": 3810 + }, + { + "epoch": 0.6534496431403648, + "grad_norm": 1.5859375, + "learning_rate": 1.783709909359003e-05, + "loss": 1.1038, + "step": 3811 + }, + { + "epoch": 0.6536211072293546, + "grad_norm": 1.5546875, + "learning_rate": 1.783597725500731e-05, + "loss": 0.9757, + "step": 3812 + }, + { + "epoch": 0.6537925713183446, + "grad_norm": 1.6796875, + "learning_rate": 1.7834855160863864e-05, + "loss": 1.0116, + "step": 3813 + }, + { + "epoch": 0.6539640354073344, + "grad_norm": 1.625, + "learning_rate": 1.783373281119629e-05, + "loss": 1.0175, + "step": 3814 + }, + { + "epoch": 0.6541354994963242, + "grad_norm": 1.625, + "learning_rate": 1.783261020604119e-05, + "loss": 0.991, + "step": 3815 + }, + { + "epoch": 0.6543069635853141, + "grad_norm": 1.7421875, + "learning_rate": 1.7831487345435175e-05, + "loss": 1.0568, + "step": 3816 + }, + { + "epoch": 0.654478427674304, + "grad_norm": 1.5625, + "learning_rate": 1.783036422941487e-05, + "loss": 0.976, + "step": 3817 + }, + { + "epoch": 0.6546498917632938, + "grad_norm": 1.5859375, + "learning_rate": 1.7829240858016896e-05, + "loss": 0.9601, + "step": 3818 + }, + { + "epoch": 0.6548213558522837, + "grad_norm": 1.6015625, + "learning_rate": 1.78281172312779e-05, + "loss": 1.0032, + "step": 3819 + }, + { + "epoch": 0.6549928199412736, + "grad_norm": 1.59375, + "learning_rate": 1.782699334923452e-05, + "loss": 1.0149, + "step": 3820 + }, + { + "epoch": 0.6551642840302634, + "grad_norm": 1.5234375, + "learning_rate": 1.7825869211923415e-05, + "loss": 0.9881, + "step": 3821 + }, + { + "epoch": 0.6553357481192533, + "grad_norm": 1.6640625, + "learning_rate": 1.7824744819381244e-05, + "loss": 1.096, + "step": 3822 + }, + { + "epoch": 0.6555072122082432, + "grad_norm": 1.6328125, + "learning_rate": 1.782362017164468e-05, + "loss": 1.0724, + "step": 3823 + }, + { + "epoch": 0.655678676297233, + "grad_norm": 1.6328125, + "learning_rate": 1.7822495268750402e-05, + "loss": 0.9867, + "step": 3824 + }, + { + "epoch": 0.6558501403862229, + "grad_norm": 1.640625, + "learning_rate": 1.7821370110735094e-05, + "loss": 1.0498, + "step": 3825 + }, + { + "epoch": 0.6560216044752127, + "grad_norm": 1.5625, + "learning_rate": 1.7820244697635458e-05, + "loss": 1.0758, + "step": 3826 + }, + { + "epoch": 0.6561930685642026, + "grad_norm": 1.6171875, + "learning_rate": 1.781911902948819e-05, + "loss": 1.0126, + "step": 3827 + }, + { + "epoch": 0.6563645326531925, + "grad_norm": 1.609375, + "learning_rate": 1.781799310633001e-05, + "loss": 1.1322, + "step": 3828 + }, + { + "epoch": 0.6565359967421823, + "grad_norm": 1.578125, + 
"learning_rate": 1.7816866928197632e-05, + "loss": 1.1316, + "step": 3829 + }, + { + "epoch": 0.6567074608311722, + "grad_norm": 1.5625, + "learning_rate": 1.781574049512779e-05, + "loss": 0.9376, + "step": 3830 + }, + { + "epoch": 0.6568789249201621, + "grad_norm": 1.609375, + "learning_rate": 1.781461380715722e-05, + "loss": 0.9902, + "step": 3831 + }, + { + "epoch": 0.6570503890091519, + "grad_norm": 1.609375, + "learning_rate": 1.781348686432266e-05, + "loss": 1.0493, + "step": 3832 + }, + { + "epoch": 0.6572218530981417, + "grad_norm": 1.515625, + "learning_rate": 1.781235966666088e-05, + "loss": 0.9787, + "step": 3833 + }, + { + "epoch": 0.6573933171871316, + "grad_norm": 1.6328125, + "learning_rate": 1.7811232214208626e-05, + "loss": 0.9926, + "step": 3834 + }, + { + "epoch": 0.6575647812761215, + "grad_norm": 1.65625, + "learning_rate": 1.7810104507002675e-05, + "loss": 1.0555, + "step": 3835 + }, + { + "epoch": 0.6577362453651113, + "grad_norm": 1.640625, + "learning_rate": 1.780897654507981e-05, + "loss": 0.9789, + "step": 3836 + }, + { + "epoch": 0.6579077094541012, + "grad_norm": 1.7109375, + "learning_rate": 1.7807848328476813e-05, + "loss": 1.0648, + "step": 3837 + }, + { + "epoch": 0.6580791735430911, + "grad_norm": 1.5859375, + "learning_rate": 1.780671985723048e-05, + "loss": 0.8998, + "step": 3838 + }, + { + "epoch": 0.6582506376320809, + "grad_norm": 3.859375, + "learning_rate": 1.7805591131377612e-05, + "loss": 1.0997, + "step": 3839 + }, + { + "epoch": 0.6584221017210707, + "grad_norm": 1.6875, + "learning_rate": 1.780446215095503e-05, + "loss": 1.0947, + "step": 3840 + }, + { + "epoch": 0.6585935658100607, + "grad_norm": 1.5390625, + "learning_rate": 1.7803332915999542e-05, + "loss": 1.0082, + "step": 3841 + }, + { + "epoch": 0.6587650298990505, + "grad_norm": 1.640625, + "learning_rate": 1.780220342654799e-05, + "loss": 1.0477, + "step": 3842 + }, + { + "epoch": 0.6589364939880403, + "grad_norm": 1.625, + "learning_rate": 1.78010736826372e-05, + "loss": 1.1199, + "step": 3843 + }, + { + "epoch": 0.6591079580770303, + "grad_norm": 1.515625, + "learning_rate": 1.7799943684304016e-05, + "loss": 0.9562, + "step": 3844 + }, + { + "epoch": 0.6592794221660201, + "grad_norm": 1.7421875, + "learning_rate": 1.77988134315853e-05, + "loss": 1.0783, + "step": 3845 + }, + { + "epoch": 0.6594508862550099, + "grad_norm": 1.4765625, + "learning_rate": 1.7797682924517917e-05, + "loss": 0.8665, + "step": 3846 + }, + { + "epoch": 0.6596223503439999, + "grad_norm": 1.6015625, + "learning_rate": 1.7796552163138722e-05, + "loss": 1.1352, + "step": 3847 + }, + { + "epoch": 0.6597938144329897, + "grad_norm": 1.5078125, + "learning_rate": 1.7795421147484608e-05, + "loss": 1.0046, + "step": 3848 + }, + { + "epoch": 0.6599652785219795, + "grad_norm": 1.515625, + "learning_rate": 1.7794289877592453e-05, + "loss": 1.0031, + "step": 3849 + }, + { + "epoch": 0.6601367426109694, + "grad_norm": 1.4609375, + "learning_rate": 1.779315835349915e-05, + "loss": 0.9268, + "step": 3850 + }, + { + "epoch": 0.6603082066999593, + "grad_norm": 1.625, + "learning_rate": 1.7792026575241616e-05, + "loss": 0.9385, + "step": 3851 + }, + { + "epoch": 0.6604796707889491, + "grad_norm": 1.6640625, + "learning_rate": 1.7790894542856748e-05, + "loss": 1.0262, + "step": 3852 + }, + { + "epoch": 0.660651134877939, + "grad_norm": 1.609375, + "learning_rate": 1.778976225638147e-05, + "loss": 1.0465, + "step": 3853 + }, + { + "epoch": 0.6608225989669289, + "grad_norm": 1.7265625, + "learning_rate": 1.778862971585271e-05, + 
"loss": 1.0516, + "step": 3854 + }, + { + "epoch": 0.6609940630559187, + "grad_norm": 1.6171875, + "learning_rate": 1.778749692130741e-05, + "loss": 0.9965, + "step": 3855 + }, + { + "epoch": 0.6611655271449086, + "grad_norm": 1.65625, + "learning_rate": 1.778636387278251e-05, + "loss": 1.0474, + "step": 3856 + }, + { + "epoch": 0.6613369912338984, + "grad_norm": 1.65625, + "learning_rate": 1.7785230570314963e-05, + "loss": 1.0767, + "step": 3857 + }, + { + "epoch": 0.6615084553228883, + "grad_norm": 1.5625, + "learning_rate": 1.7784097013941732e-05, + "loss": 1.0504, + "step": 3858 + }, + { + "epoch": 0.6616799194118782, + "grad_norm": 1.578125, + "learning_rate": 1.778296320369978e-05, + "loss": 1.009, + "step": 3859 + }, + { + "epoch": 0.661851383500868, + "grad_norm": 1.59375, + "learning_rate": 1.7781829139626096e-05, + "loss": 1.0706, + "step": 3860 + }, + { + "epoch": 0.6620228475898579, + "grad_norm": 1.65625, + "learning_rate": 1.7780694821757654e-05, + "loss": 0.9951, + "step": 3861 + }, + { + "epoch": 0.6621943116788478, + "grad_norm": 1.6171875, + "learning_rate": 1.7779560250131457e-05, + "loss": 1.0454, + "step": 3862 + }, + { + "epoch": 0.6623657757678376, + "grad_norm": 1.609375, + "learning_rate": 1.777842542478451e-05, + "loss": 1.1059, + "step": 3863 + }, + { + "epoch": 0.6625372398568274, + "grad_norm": 1.5390625, + "learning_rate": 1.7777290345753814e-05, + "loss": 1.0174, + "step": 3864 + }, + { + "epoch": 0.6627087039458174, + "grad_norm": 1.609375, + "learning_rate": 1.7776155013076394e-05, + "loss": 1.0829, + "step": 3865 + }, + { + "epoch": 0.6628801680348072, + "grad_norm": 1.515625, + "learning_rate": 1.7775019426789274e-05, + "loss": 0.997, + "step": 3866 + }, + { + "epoch": 0.663051632123797, + "grad_norm": 1.546875, + "learning_rate": 1.7773883586929498e-05, + "loss": 0.9961, + "step": 3867 + }, + { + "epoch": 0.663223096212787, + "grad_norm": 1.4765625, + "learning_rate": 1.77727474935341e-05, + "loss": 0.9763, + "step": 3868 + }, + { + "epoch": 0.6633945603017768, + "grad_norm": 1.65625, + "learning_rate": 1.777161114664014e-05, + "loss": 1.0266, + "step": 3869 + }, + { + "epoch": 0.6635660243907666, + "grad_norm": 1.6171875, + "learning_rate": 1.7770474546284674e-05, + "loss": 1.0637, + "step": 3870 + }, + { + "epoch": 0.6637374884797566, + "grad_norm": 1.6171875, + "learning_rate": 1.776933769250477e-05, + "loss": 1.0638, + "step": 3871 + }, + { + "epoch": 0.6639089525687464, + "grad_norm": 1.5625, + "learning_rate": 1.776820058533751e-05, + "loss": 1.0038, + "step": 3872 + }, + { + "epoch": 0.6640804166577362, + "grad_norm": 1.6171875, + "learning_rate": 1.7767063224819976e-05, + "loss": 1.0904, + "step": 3873 + }, + { + "epoch": 0.6642518807467261, + "grad_norm": 1.6640625, + "learning_rate": 1.7765925610989263e-05, + "loss": 0.968, + "step": 3874 + }, + { + "epoch": 0.664423344835716, + "grad_norm": 1.6953125, + "learning_rate": 1.776478774388247e-05, + "loss": 1.1252, + "step": 3875 + }, + { + "epoch": 0.6645948089247058, + "grad_norm": 1.5546875, + "learning_rate": 1.7763649623536712e-05, + "loss": 1.0071, + "step": 3876 + }, + { + "epoch": 0.6647662730136957, + "grad_norm": 1.6640625, + "learning_rate": 1.7762511249989104e-05, + "loss": 1.214, + "step": 3877 + }, + { + "epoch": 0.6649377371026856, + "grad_norm": 1.65625, + "learning_rate": 1.776137262327677e-05, + "loss": 1.0238, + "step": 3878 + }, + { + "epoch": 0.6651092011916754, + "grad_norm": 1.5390625, + "learning_rate": 1.776023374343685e-05, + "loss": 0.9687, + "step": 3879 + }, + { + 
"epoch": 0.6652806652806653, + "grad_norm": 1.5859375, + "learning_rate": 1.7759094610506486e-05, + "loss": 0.9881, + "step": 3880 + }, + { + "epoch": 0.6654521293696551, + "grad_norm": 1.46875, + "learning_rate": 1.775795522452283e-05, + "loss": 1.0175, + "step": 3881 + }, + { + "epoch": 0.665623593458645, + "grad_norm": 1.578125, + "learning_rate": 1.7756815585523038e-05, + "loss": 1.0085, + "step": 3882 + }, + { + "epoch": 0.6657950575476349, + "grad_norm": 1.578125, + "learning_rate": 1.7755675693544277e-05, + "loss": 0.9782, + "step": 3883 + }, + { + "epoch": 0.6659665216366247, + "grad_norm": 1.578125, + "learning_rate": 1.775453554862373e-05, + "loss": 0.9616, + "step": 3884 + }, + { + "epoch": 0.6661379857256146, + "grad_norm": 1.6796875, + "learning_rate": 1.775339515079858e-05, + "loss": 1.0437, + "step": 3885 + }, + { + "epoch": 0.6663094498146045, + "grad_norm": 1.65625, + "learning_rate": 1.7752254500106016e-05, + "loss": 1.146, + "step": 3886 + }, + { + "epoch": 0.6664809139035943, + "grad_norm": 1.7265625, + "learning_rate": 1.775111359658324e-05, + "loss": 1.0646, + "step": 3887 + }, + { + "epoch": 0.6666523779925841, + "grad_norm": 1.640625, + "learning_rate": 1.7749972440267463e-05, + "loss": 1.0783, + "step": 3888 + }, + { + "epoch": 0.6668238420815741, + "grad_norm": 1.609375, + "learning_rate": 1.7748831031195898e-05, + "loss": 1.0681, + "step": 3889 + }, + { + "epoch": 0.6669953061705639, + "grad_norm": 1.609375, + "learning_rate": 1.774768936940578e-05, + "loss": 1.0427, + "step": 3890 + }, + { + "epoch": 0.6671667702595537, + "grad_norm": 1.609375, + "learning_rate": 1.7746547454934327e-05, + "loss": 1.064, + "step": 3891 + }, + { + "epoch": 0.6673382343485437, + "grad_norm": 1.578125, + "learning_rate": 1.77454052878188e-05, + "loss": 0.9948, + "step": 3892 + }, + { + "epoch": 0.6675096984375335, + "grad_norm": 1.6640625, + "learning_rate": 1.7744262868096432e-05, + "loss": 1.0956, + "step": 3893 + }, + { + "epoch": 0.6676811625265233, + "grad_norm": 1.609375, + "learning_rate": 1.7743120195804498e-05, + "loss": 0.9624, + "step": 3894 + }, + { + "epoch": 0.6678526266155133, + "grad_norm": 1.7890625, + "learning_rate": 1.774197727098025e-05, + "loss": 1.0827, + "step": 3895 + }, + { + "epoch": 0.6680240907045031, + "grad_norm": 1.5859375, + "learning_rate": 1.7740834093660974e-05, + "loss": 0.8746, + "step": 3896 + }, + { + "epoch": 0.6681955547934929, + "grad_norm": 1.5703125, + "learning_rate": 1.7739690663883948e-05, + "loss": 1.0361, + "step": 3897 + }, + { + "epoch": 0.6683670188824828, + "grad_norm": 1.6484375, + "learning_rate": 1.7738546981686458e-05, + "loss": 0.9436, + "step": 3898 + }, + { + "epoch": 0.6685384829714727, + "grad_norm": 1.46875, + "learning_rate": 1.773740304710582e-05, + "loss": 0.9772, + "step": 3899 + }, + { + "epoch": 0.6687099470604625, + "grad_norm": 1.546875, + "learning_rate": 1.7736258860179326e-05, + "loss": 1.015, + "step": 3900 + }, + { + "epoch": 0.6688814111494524, + "grad_norm": 1.59375, + "learning_rate": 1.77351144209443e-05, + "loss": 0.9838, + "step": 3901 + }, + { + "epoch": 0.6690528752384423, + "grad_norm": 1.65625, + "learning_rate": 1.7733969729438064e-05, + "loss": 1.0158, + "step": 3902 + }, + { + "epoch": 0.6692243393274321, + "grad_norm": 1.5234375, + "learning_rate": 1.7732824785697956e-05, + "loss": 0.9799, + "step": 3903 + }, + { + "epoch": 0.669395803416422, + "grad_norm": 1.625, + "learning_rate": 1.7731679589761307e-05, + "loss": 0.9674, + "step": 3904 + }, + { + "epoch": 0.6695672675054118, + "grad_norm": 
1.625, + "learning_rate": 1.7730534141665473e-05, + "loss": 1.0857, + "step": 3905 + }, + { + "epoch": 0.6697387315944017, + "grad_norm": 1.5546875, + "learning_rate": 1.7729388441447813e-05, + "loss": 1.0258, + "step": 3906 + }, + { + "epoch": 0.6699101956833916, + "grad_norm": 1.5625, + "learning_rate": 1.7728242489145687e-05, + "loss": 1.0592, + "step": 3907 + }, + { + "epoch": 0.6700816597723814, + "grad_norm": 1.5546875, + "learning_rate": 1.7727096284796476e-05, + "loss": 1.0646, + "step": 3908 + }, + { + "epoch": 0.6702531238613713, + "grad_norm": 1.5390625, + "learning_rate": 1.7725949828437552e-05, + "loss": 0.9314, + "step": 3909 + }, + { + "epoch": 0.6704245879503612, + "grad_norm": 1.5078125, + "learning_rate": 1.7724803120106312e-05, + "loss": 1.0248, + "step": 3910 + }, + { + "epoch": 0.670596052039351, + "grad_norm": 1.515625, + "learning_rate": 1.7723656159840156e-05, + "loss": 0.9268, + "step": 3911 + }, + { + "epoch": 0.6707675161283408, + "grad_norm": 1.5, + "learning_rate": 1.7722508947676488e-05, + "loss": 0.9512, + "step": 3912 + }, + { + "epoch": 0.6709389802173308, + "grad_norm": 1.5859375, + "learning_rate": 1.772136148365272e-05, + "loss": 0.9944, + "step": 3913 + }, + { + "epoch": 0.6711104443063206, + "grad_norm": 1.5390625, + "learning_rate": 1.772021376780628e-05, + "loss": 1.0312, + "step": 3914 + }, + { + "epoch": 0.6712819083953104, + "grad_norm": 1.6875, + "learning_rate": 1.7719065800174595e-05, + "loss": 1.0387, + "step": 3915 + }, + { + "epoch": 0.6714533724843004, + "grad_norm": 1.578125, + "learning_rate": 1.7717917580795108e-05, + "loss": 1.0055, + "step": 3916 + }, + { + "epoch": 0.6716248365732902, + "grad_norm": 1.515625, + "learning_rate": 1.771676910970527e-05, + "loss": 1.0512, + "step": 3917 + }, + { + "epoch": 0.67179630066228, + "grad_norm": 1.5390625, + "learning_rate": 1.7715620386942528e-05, + "loss": 1.0302, + "step": 3918 + }, + { + "epoch": 0.67196776475127, + "grad_norm": 1.578125, + "learning_rate": 1.7714471412544353e-05, + "loss": 1.0381, + "step": 3919 + }, + { + "epoch": 0.6721392288402598, + "grad_norm": 1.59375, + "learning_rate": 1.771332218654821e-05, + "loss": 1.0399, + "step": 3920 + }, + { + "epoch": 0.6723106929292496, + "grad_norm": 1.4921875, + "learning_rate": 1.7712172708991594e-05, + "loss": 1.0186, + "step": 3921 + }, + { + "epoch": 0.6724821570182395, + "grad_norm": 1.5546875, + "learning_rate": 1.7711022979911977e-05, + "loss": 1.0577, + "step": 3922 + }, + { + "epoch": 0.6726536211072294, + "grad_norm": 1.578125, + "learning_rate": 1.7709872999346867e-05, + "loss": 1.1428, + "step": 3923 + }, + { + "epoch": 0.6728250851962192, + "grad_norm": 1.625, + "learning_rate": 1.7708722767333766e-05, + "loss": 0.9536, + "step": 3924 + }, + { + "epoch": 0.6729965492852091, + "grad_norm": 1.6953125, + "learning_rate": 1.7707572283910184e-05, + "loss": 1.112, + "step": 3925 + }, + { + "epoch": 0.673168013374199, + "grad_norm": 1.4921875, + "learning_rate": 1.770642154911365e-05, + "loss": 0.9273, + "step": 3926 + }, + { + "epoch": 0.6733394774631888, + "grad_norm": 1.5, + "learning_rate": 1.7705270562981688e-05, + "loss": 1.0, + "step": 3927 + }, + { + "epoch": 0.6735109415521786, + "grad_norm": 1.5859375, + "learning_rate": 1.7704119325551835e-05, + "loss": 0.9893, + "step": 3928 + }, + { + "epoch": 0.6736824056411685, + "grad_norm": 1.609375, + "learning_rate": 1.7702967836861643e-05, + "loss": 1.0211, + "step": 3929 + }, + { + "epoch": 0.6738538697301584, + "grad_norm": 1.5625, + "learning_rate": 1.7701816096948665e-05, + 
"loss": 1.0401, + "step": 3930 + }, + { + "epoch": 0.6740253338191482, + "grad_norm": 1.546875, + "learning_rate": 1.7700664105850454e-05, + "loss": 1.0329, + "step": 3931 + }, + { + "epoch": 0.6741967979081381, + "grad_norm": 1.625, + "learning_rate": 1.7699511863604597e-05, + "loss": 1.0191, + "step": 3932 + }, + { + "epoch": 0.674368261997128, + "grad_norm": 1.671875, + "learning_rate": 1.769835937024866e-05, + "loss": 0.9552, + "step": 3933 + }, + { + "epoch": 0.6745397260861178, + "grad_norm": 1.59375, + "learning_rate": 1.7697206625820237e-05, + "loss": 1.0417, + "step": 3934 + }, + { + "epoch": 0.6747111901751077, + "grad_norm": 1.5703125, + "learning_rate": 1.7696053630356918e-05, + "loss": 1.0664, + "step": 3935 + }, + { + "epoch": 0.6748826542640975, + "grad_norm": 1.640625, + "learning_rate": 1.769490038389631e-05, + "loss": 1.1068, + "step": 3936 + }, + { + "epoch": 0.6750541183530874, + "grad_norm": 1.5234375, + "learning_rate": 1.7693746886476028e-05, + "loss": 1.0836, + "step": 3937 + }, + { + "epoch": 0.6752255824420773, + "grad_norm": 1.671875, + "learning_rate": 1.7692593138133684e-05, + "loss": 0.974, + "step": 3938 + }, + { + "epoch": 0.6753970465310671, + "grad_norm": 1.59375, + "learning_rate": 1.769143913890691e-05, + "loss": 1.0171, + "step": 3939 + }, + { + "epoch": 0.675568510620057, + "grad_norm": 1.5546875, + "learning_rate": 1.7690284888833344e-05, + "loss": 1.1185, + "step": 3940 + }, + { + "epoch": 0.6757399747090469, + "grad_norm": 1.515625, + "learning_rate": 1.768913038795063e-05, + "loss": 1.0596, + "step": 3941 + }, + { + "epoch": 0.6759114387980367, + "grad_norm": 1.4765625, + "learning_rate": 1.7687975636296414e-05, + "loss": 1.0241, + "step": 3942 + }, + { + "epoch": 0.6760829028870265, + "grad_norm": 1.625, + "learning_rate": 1.7686820633908368e-05, + "loss": 1.1086, + "step": 3943 + }, + { + "epoch": 0.6762543669760165, + "grad_norm": 1.53125, + "learning_rate": 1.7685665380824156e-05, + "loss": 0.9996, + "step": 3944 + }, + { + "epoch": 0.6764258310650063, + "grad_norm": 1.5234375, + "learning_rate": 1.768450987708145e-05, + "loss": 0.9955, + "step": 3945 + }, + { + "epoch": 0.6765972951539961, + "grad_norm": 1.4375, + "learning_rate": 1.7683354122717942e-05, + "loss": 0.937, + "step": 3946 + }, + { + "epoch": 0.6767687592429861, + "grad_norm": 1.546875, + "learning_rate": 1.768219811777132e-05, + "loss": 1.054, + "step": 3947 + }, + { + "epoch": 0.6769402233319759, + "grad_norm": 1.625, + "learning_rate": 1.7681041862279294e-05, + "loss": 0.9462, + "step": 3948 + }, + { + "epoch": 0.6771116874209657, + "grad_norm": 1.546875, + "learning_rate": 1.7679885356279566e-05, + "loss": 0.97, + "step": 3949 + }, + { + "epoch": 0.6772831515099557, + "grad_norm": 1.5390625, + "learning_rate": 1.7678728599809858e-05, + "loss": 0.9661, + "step": 3950 + }, + { + "epoch": 0.6774546155989455, + "grad_norm": 1.5703125, + "learning_rate": 1.7677571592907893e-05, + "loss": 1.0832, + "step": 3951 + }, + { + "epoch": 0.6776260796879353, + "grad_norm": 1.546875, + "learning_rate": 1.767641433561141e-05, + "loss": 1.0797, + "step": 3952 + }, + { + "epoch": 0.6777975437769252, + "grad_norm": 1.6484375, + "learning_rate": 1.7675256827958148e-05, + "loss": 1.0648, + "step": 3953 + }, + { + "epoch": 0.6779690078659151, + "grad_norm": 1.5, + "learning_rate": 1.7674099069985855e-05, + "loss": 0.9913, + "step": 3954 + }, + { + "epoch": 0.6781404719549049, + "grad_norm": 1.5703125, + "learning_rate": 1.7672941061732297e-05, + "loss": 1.0111, + "step": 3955 + }, + { + "epoch": 
0.6783119360438948, + "grad_norm": 1.5546875, + "learning_rate": 1.7671782803235235e-05, + "loss": 0.9879, + "step": 3956 + }, + { + "epoch": 0.6784834001328847, + "grad_norm": 1.625, + "learning_rate": 1.767062429453245e-05, + "loss": 1.0341, + "step": 3957 + }, + { + "epoch": 0.6786548642218745, + "grad_norm": 1.5703125, + "learning_rate": 1.766946553566172e-05, + "loss": 1.0252, + "step": 3958 + }, + { + "epoch": 0.6788263283108644, + "grad_norm": 1.5703125, + "learning_rate": 1.7668306526660836e-05, + "loss": 1.0166, + "step": 3959 + }, + { + "epoch": 0.6789977923998542, + "grad_norm": 1.53125, + "learning_rate": 1.7667147267567602e-05, + "loss": 0.9378, + "step": 3960 + }, + { + "epoch": 0.6791692564888441, + "grad_norm": 1.5859375, + "learning_rate": 1.7665987758419824e-05, + "loss": 1.0179, + "step": 3961 + }, + { + "epoch": 0.679340720577834, + "grad_norm": 1.578125, + "learning_rate": 1.766482799925532e-05, + "loss": 1.0277, + "step": 3962 + }, + { + "epoch": 0.6795121846668238, + "grad_norm": 1.5234375, + "learning_rate": 1.7663667990111908e-05, + "loss": 1.0449, + "step": 3963 + }, + { + "epoch": 0.6796836487558137, + "grad_norm": 1.7109375, + "learning_rate": 1.7662507731027426e-05, + "loss": 1.0957, + "step": 3964 + }, + { + "epoch": 0.6798551128448036, + "grad_norm": 1.6953125, + "learning_rate": 1.7661347222039714e-05, + "loss": 1.0475, + "step": 3965 + }, + { + "epoch": 0.6800265769337934, + "grad_norm": 1.5703125, + "learning_rate": 1.7660186463186617e-05, + "loss": 0.8924, + "step": 3966 + }, + { + "epoch": 0.6801980410227833, + "grad_norm": 1.5859375, + "learning_rate": 1.7659025454505994e-05, + "loss": 1.0355, + "step": 3967 + }, + { + "epoch": 0.6803695051117732, + "grad_norm": 1.5234375, + "learning_rate": 1.765786419603571e-05, + "loss": 0.9575, + "step": 3968 + }, + { + "epoch": 0.680540969200763, + "grad_norm": 1.7109375, + "learning_rate": 1.765670268781364e-05, + "loss": 1.0357, + "step": 3969 + }, + { + "epoch": 0.6807124332897528, + "grad_norm": 1.5703125, + "learning_rate": 1.765554092987766e-05, + "loss": 0.9695, + "step": 3970 + }, + { + "epoch": 0.6808838973787428, + "grad_norm": 1.5546875, + "learning_rate": 1.765437892226566e-05, + "loss": 1.1055, + "step": 3971 + }, + { + "epoch": 0.6810553614677326, + "grad_norm": 1.5546875, + "learning_rate": 1.765321666501554e-05, + "loss": 0.951, + "step": 3972 + }, + { + "epoch": 0.6812268255567224, + "grad_norm": 1.515625, + "learning_rate": 1.765205415816521e-05, + "loss": 0.9672, + "step": 3973 + }, + { + "epoch": 0.6813982896457124, + "grad_norm": 1.6015625, + "learning_rate": 1.7650891401752578e-05, + "loss": 1.0185, + "step": 3974 + }, + { + "epoch": 0.6815697537347022, + "grad_norm": 1.6953125, + "learning_rate": 1.7649728395815567e-05, + "loss": 1.0638, + "step": 3975 + }, + { + "epoch": 0.681741217823692, + "grad_norm": 1.4609375, + "learning_rate": 1.7648565140392103e-05, + "loss": 0.9564, + "step": 3976 + }, + { + "epoch": 0.681912681912682, + "grad_norm": 1.515625, + "learning_rate": 1.764740163552013e-05, + "loss": 1.0024, + "step": 3977 + }, + { + "epoch": 0.6820841460016718, + "grad_norm": 1.53125, + "learning_rate": 1.764623788123759e-05, + "loss": 1.0701, + "step": 3978 + }, + { + "epoch": 0.6822556100906616, + "grad_norm": 1.515625, + "learning_rate": 1.7645073877582445e-05, + "loss": 1.0215, + "step": 3979 + }, + { + "epoch": 0.6824270741796515, + "grad_norm": 1.5703125, + "learning_rate": 1.764390962459265e-05, + "loss": 1.0535, + "step": 3980 + }, + { + "epoch": 0.6825985382686414, + 
"grad_norm": 1.5859375, + "learning_rate": 1.7642745122306177e-05, + "loss": 0.9549, + "step": 3981 + }, + { + "epoch": 0.6827700023576312, + "grad_norm": 1.5234375, + "learning_rate": 1.7641580370761e-05, + "loss": 0.9616, + "step": 3982 + }, + { + "epoch": 0.6829414664466211, + "grad_norm": 1.6171875, + "learning_rate": 1.764041536999512e-05, + "loss": 1.0469, + "step": 3983 + }, + { + "epoch": 0.683112930535611, + "grad_norm": 1.5703125, + "learning_rate": 1.763925012004652e-05, + "loss": 1.0718, + "step": 3984 + }, + { + "epoch": 0.6832843946246008, + "grad_norm": 1.5859375, + "learning_rate": 1.763808462095321e-05, + "loss": 1.0762, + "step": 3985 + }, + { + "epoch": 0.6834558587135907, + "grad_norm": 1.484375, + "learning_rate": 1.7636918872753194e-05, + "loss": 0.9343, + "step": 3986 + }, + { + "epoch": 0.6836273228025805, + "grad_norm": 1.6171875, + "learning_rate": 1.76357528754845e-05, + "loss": 1.0311, + "step": 3987 + }, + { + "epoch": 0.6837987868915704, + "grad_norm": 1.59375, + "learning_rate": 1.7634586629185147e-05, + "loss": 1.0325, + "step": 3988 + }, + { + "epoch": 0.6839702509805603, + "grad_norm": 1.53125, + "learning_rate": 1.7633420133893176e-05, + "loss": 0.9296, + "step": 3989 + }, + { + "epoch": 0.6841417150695501, + "grad_norm": 1.6171875, + "learning_rate": 1.763225338964663e-05, + "loss": 1.0297, + "step": 3990 + }, + { + "epoch": 0.68431317915854, + "grad_norm": 1.8046875, + "learning_rate": 1.7631086396483562e-05, + "loss": 1.0326, + "step": 3991 + }, + { + "epoch": 0.6844846432475299, + "grad_norm": 1.5703125, + "learning_rate": 1.762991915444203e-05, + "loss": 0.9755, + "step": 3992 + }, + { + "epoch": 0.6846561073365197, + "grad_norm": 1.515625, + "learning_rate": 1.76287516635601e-05, + "loss": 0.9867, + "step": 3993 + }, + { + "epoch": 0.6848275714255095, + "grad_norm": 1.59375, + "learning_rate": 1.762758392387585e-05, + "loss": 1.0919, + "step": 3994 + }, + { + "epoch": 0.6849990355144995, + "grad_norm": 1.484375, + "learning_rate": 1.7626415935427373e-05, + "loss": 0.9688, + "step": 3995 + }, + { + "epoch": 0.6851704996034893, + "grad_norm": 1.5625, + "learning_rate": 1.762524769825275e-05, + "loss": 1.0599, + "step": 3996 + }, + { + "epoch": 0.6853419636924791, + "grad_norm": 1.6328125, + "learning_rate": 1.762407921239008e-05, + "loss": 1.0304, + "step": 3997 + }, + { + "epoch": 0.6855134277814691, + "grad_norm": 1.6484375, + "learning_rate": 1.7622910477877484e-05, + "loss": 1.0556, + "step": 3998 + }, + { + "epoch": 0.6856848918704589, + "grad_norm": 1.6015625, + "learning_rate": 1.762174149475307e-05, + "loss": 1.007, + "step": 3999 + }, + { + "epoch": 0.6858563559594487, + "grad_norm": 1.578125, + "learning_rate": 1.7620572263054964e-05, + "loss": 0.9835, + "step": 4000 + }, + { + "epoch": 0.6860278200484387, + "grad_norm": 1.609375, + "learning_rate": 1.7619402782821306e-05, + "loss": 0.9957, + "step": 4001 + }, + { + "epoch": 0.6861992841374285, + "grad_norm": 1.59375, + "learning_rate": 1.7618233054090223e-05, + "loss": 1.0368, + "step": 4002 + }, + { + "epoch": 0.6863707482264183, + "grad_norm": 1.5546875, + "learning_rate": 1.7617063076899875e-05, + "loss": 1.0038, + "step": 4003 + }, + { + "epoch": 0.6865422123154082, + "grad_norm": 1.6484375, + "learning_rate": 1.7615892851288417e-05, + "loss": 1.0597, + "step": 4004 + }, + { + "epoch": 0.6867136764043981, + "grad_norm": 1.59375, + "learning_rate": 1.7614722377294017e-05, + "loss": 0.9894, + "step": 4005 + }, + { + "epoch": 0.6868851404933879, + "grad_norm": 1.5234375, + "learning_rate": 
1.7613551654954846e-05, + "loss": 1.0048, + "step": 4006 + }, + { + "epoch": 0.6870566045823778, + "grad_norm": 1.7109375, + "learning_rate": 1.761238068430908e-05, + "loss": 1.1029, + "step": 4007 + }, + { + "epoch": 0.6872280686713677, + "grad_norm": 1.734375, + "learning_rate": 1.761120946539492e-05, + "loss": 1.1666, + "step": 4008 + }, + { + "epoch": 0.6873995327603575, + "grad_norm": 1.5625, + "learning_rate": 1.7610037998250555e-05, + "loss": 1.0329, + "step": 4009 + }, + { + "epoch": 0.6875709968493474, + "grad_norm": 1.5703125, + "learning_rate": 1.7608866282914195e-05, + "loss": 0.9206, + "step": 4010 + }, + { + "epoch": 0.6877424609383372, + "grad_norm": 1.484375, + "learning_rate": 1.7607694319424054e-05, + "loss": 0.9154, + "step": 4011 + }, + { + "epoch": 0.6879139250273271, + "grad_norm": 1.53125, + "learning_rate": 1.7606522107818355e-05, + "loss": 1.0722, + "step": 4012 + }, + { + "epoch": 0.688085389116317, + "grad_norm": 1.6328125, + "learning_rate": 1.760534964813532e-05, + "loss": 1.0418, + "step": 4013 + }, + { + "epoch": 0.6882568532053068, + "grad_norm": 1.5859375, + "learning_rate": 1.76041769404132e-05, + "loss": 0.9458, + "step": 4014 + }, + { + "epoch": 0.6884283172942967, + "grad_norm": 1.6953125, + "learning_rate": 1.7603003984690234e-05, + "loss": 1.0555, + "step": 4015 + }, + { + "epoch": 0.6885997813832866, + "grad_norm": 1.546875, + "learning_rate": 1.7601830781004676e-05, + "loss": 1.0789, + "step": 4016 + }, + { + "epoch": 0.6887712454722764, + "grad_norm": 1.5703125, + "learning_rate": 1.7600657329394794e-05, + "loss": 0.9962, + "step": 4017 + }, + { + "epoch": 0.6889427095612662, + "grad_norm": 1.6328125, + "learning_rate": 1.759948362989885e-05, + "loss": 1.0139, + "step": 4018 + }, + { + "epoch": 0.6891141736502562, + "grad_norm": 1.6171875, + "learning_rate": 1.7598309682555133e-05, + "loss": 1.0394, + "step": 4019 + }, + { + "epoch": 0.689285637739246, + "grad_norm": 1.515625, + "learning_rate": 1.759713548740192e-05, + "loss": 0.9697, + "step": 4020 + }, + { + "epoch": 0.6894571018282358, + "grad_norm": 1.5859375, + "learning_rate": 1.7595961044477516e-05, + "loss": 1.0688, + "step": 4021 + }, + { + "epoch": 0.6896285659172258, + "grad_norm": 1.6171875, + "learning_rate": 1.7594786353820215e-05, + "loss": 0.9847, + "step": 4022 + }, + { + "epoch": 0.6898000300062156, + "grad_norm": 1.6875, + "learning_rate": 1.759361141546833e-05, + "loss": 1.0528, + "step": 4023 + }, + { + "epoch": 0.6899714940952054, + "grad_norm": 1.578125, + "learning_rate": 1.7592436229460185e-05, + "loss": 1.0234, + "step": 4024 + }, + { + "epoch": 0.6901429581841952, + "grad_norm": 1.6171875, + "learning_rate": 1.7591260795834104e-05, + "loss": 1.1078, + "step": 4025 + }, + { + "epoch": 0.6903144222731852, + "grad_norm": 1.6796875, + "learning_rate": 1.7590085114628422e-05, + "loss": 1.1082, + "step": 4026 + }, + { + "epoch": 0.690485886362175, + "grad_norm": 1.53125, + "learning_rate": 1.758890918588148e-05, + "loss": 1.0392, + "step": 4027 + }, + { + "epoch": 0.6906573504511648, + "grad_norm": 1.5859375, + "learning_rate": 1.7587733009631637e-05, + "loss": 0.9816, + "step": 4028 + }, + { + "epoch": 0.6908288145401548, + "grad_norm": 1.5390625, + "learning_rate": 1.758655658591724e-05, + "loss": 1.0778, + "step": 4029 + }, + { + "epoch": 0.6910002786291446, + "grad_norm": 1.5078125, + "learning_rate": 1.7585379914776672e-05, + "loss": 1.0351, + "step": 4030 + }, + { + "epoch": 0.6911717427181344, + "grad_norm": 1.609375, + "learning_rate": 1.75842029962483e-05, + "loss": 
1.0163, + "step": 4031 + }, + { + "epoch": 0.6913432068071244, + "grad_norm": 1.46875, + "learning_rate": 1.7583025830370507e-05, + "loss": 0.9156, + "step": 4032 + }, + { + "epoch": 0.6915146708961142, + "grad_norm": 1.8203125, + "learning_rate": 1.7581848417181687e-05, + "loss": 1.0033, + "step": 4033 + }, + { + "epoch": 0.691686134985104, + "grad_norm": 1.59375, + "learning_rate": 1.758067075672024e-05, + "loss": 1.0246, + "step": 4034 + }, + { + "epoch": 0.6918575990740939, + "grad_norm": 1.5234375, + "learning_rate": 1.7579492849024574e-05, + "loss": 0.9565, + "step": 4035 + }, + { + "epoch": 0.6920290631630838, + "grad_norm": 1.546875, + "learning_rate": 1.7578314694133105e-05, + "loss": 1.0043, + "step": 4036 + }, + { + "epoch": 0.6922005272520736, + "grad_norm": 1.5390625, + "learning_rate": 1.7577136292084255e-05, + "loss": 0.931, + "step": 4037 + }, + { + "epoch": 0.6923719913410635, + "grad_norm": 1.6015625, + "learning_rate": 1.7575957642916453e-05, + "loss": 1.0022, + "step": 4038 + }, + { + "epoch": 0.6925434554300534, + "grad_norm": 1.6015625, + "learning_rate": 1.7574778746668152e-05, + "loss": 0.9138, + "step": 4039 + }, + { + "epoch": 0.6927149195190432, + "grad_norm": 1.5390625, + "learning_rate": 1.757359960337779e-05, + "loss": 0.957, + "step": 4040 + }, + { + "epoch": 0.6928863836080331, + "grad_norm": 1.578125, + "learning_rate": 1.7572420213083822e-05, + "loss": 1.0356, + "step": 4041 + }, + { + "epoch": 0.6930578476970229, + "grad_norm": 1.6015625, + "learning_rate": 1.757124057582472e-05, + "loss": 1.0043, + "step": 4042 + }, + { + "epoch": 0.6932293117860128, + "grad_norm": 1.671875, + "learning_rate": 1.757006069163895e-05, + "loss": 1.0444, + "step": 4043 + }, + { + "epoch": 0.6934007758750027, + "grad_norm": 1.5625, + "learning_rate": 1.7568880560564994e-05, + "loss": 1.0236, + "step": 4044 + }, + { + "epoch": 0.6935722399639925, + "grad_norm": 1.5390625, + "learning_rate": 1.756770018264134e-05, + "loss": 0.9874, + "step": 4045 + }, + { + "epoch": 0.6937437040529824, + "grad_norm": 1.46875, + "learning_rate": 1.7566519557906488e-05, + "loss": 1.0284, + "step": 4046 + }, + { + "epoch": 0.6939151681419723, + "grad_norm": 1.671875, + "learning_rate": 1.756533868639894e-05, + "loss": 1.1392, + "step": 4047 + }, + { + "epoch": 0.6940866322309621, + "grad_norm": 1.5859375, + "learning_rate": 1.7564157568157208e-05, + "loss": 0.9929, + "step": 4048 + }, + { + "epoch": 0.694258096319952, + "grad_norm": 1.578125, + "learning_rate": 1.7562976203219815e-05, + "loss": 1.0021, + "step": 4049 + }, + { + "epoch": 0.6944295604089419, + "grad_norm": 1.5703125, + "learning_rate": 1.756179459162529e-05, + "loss": 0.9882, + "step": 4050 + }, + { + "epoch": 0.6946010244979317, + "grad_norm": 1.53125, + "learning_rate": 1.7560612733412167e-05, + "loss": 1.0035, + "step": 4051 + }, + { + "epoch": 0.6947724885869215, + "grad_norm": 1.609375, + "learning_rate": 1.755943062861899e-05, + "loss": 1.0026, + "step": 4052 + }, + { + "epoch": 0.6949439526759115, + "grad_norm": 1.609375, + "learning_rate": 1.7558248277284318e-05, + "loss": 0.9078, + "step": 4053 + }, + { + "epoch": 0.6951154167649013, + "grad_norm": 1.546875, + "learning_rate": 1.7557065679446705e-05, + "loss": 1.0097, + "step": 4054 + }, + { + "epoch": 0.6952868808538911, + "grad_norm": 1.5703125, + "learning_rate": 1.755588283514472e-05, + "loss": 1.1547, + "step": 4055 + }, + { + "epoch": 0.6954583449428811, + "grad_norm": 1.5625, + "learning_rate": 1.755469974441695e-05, + "loss": 0.9205, + "step": 4056 + }, + { + 
"epoch": 0.6956298090318709, + "grad_norm": 1.703125, + "learning_rate": 1.7553516407301967e-05, + "loss": 1.0198, + "step": 4057 + }, + { + "epoch": 0.6958012731208607, + "grad_norm": 1.5, + "learning_rate": 1.7552332823838375e-05, + "loss": 0.9808, + "step": 4058 + }, + { + "epoch": 0.6959727372098506, + "grad_norm": 1.5546875, + "learning_rate": 1.7551148994064765e-05, + "loss": 0.9738, + "step": 4059 + }, + { + "epoch": 0.6961442012988405, + "grad_norm": 1.4375, + "learning_rate": 1.7549964918019754e-05, + "loss": 0.9615, + "step": 4060 + }, + { + "epoch": 0.6963156653878303, + "grad_norm": 1.6171875, + "learning_rate": 1.7548780595741957e-05, + "loss": 1.0502, + "step": 4061 + }, + { + "epoch": 0.6964871294768202, + "grad_norm": 1.5390625, + "learning_rate": 1.754759602727e-05, + "loss": 1.0078, + "step": 4062 + }, + { + "epoch": 0.6966585935658101, + "grad_norm": 1.5390625, + "learning_rate": 1.754641121264251e-05, + "loss": 0.9771, + "step": 4063 + }, + { + "epoch": 0.6968300576547999, + "grad_norm": 1.6328125, + "learning_rate": 1.7545226151898134e-05, + "loss": 1.1078, + "step": 4064 + }, + { + "epoch": 0.6970015217437898, + "grad_norm": 1.671875, + "learning_rate": 1.7544040845075528e-05, + "loss": 1.0964, + "step": 4065 + }, + { + "epoch": 0.6971729858327796, + "grad_norm": 1.5546875, + "learning_rate": 1.7542855292213334e-05, + "loss": 1.0188, + "step": 4066 + }, + { + "epoch": 0.6973444499217695, + "grad_norm": 1.515625, + "learning_rate": 1.7541669493350227e-05, + "loss": 0.9677, + "step": 4067 + }, + { + "epoch": 0.6975159140107594, + "grad_norm": 1.640625, + "learning_rate": 1.754048344852488e-05, + "loss": 1.0634, + "step": 4068 + }, + { + "epoch": 0.6976873780997492, + "grad_norm": 1.6015625, + "learning_rate": 1.753929715777597e-05, + "loss": 1.0472, + "step": 4069 + }, + { + "epoch": 0.6978588421887391, + "grad_norm": 1.6796875, + "learning_rate": 1.7538110621142194e-05, + "loss": 1.0953, + "step": 4070 + }, + { + "epoch": 0.698030306277729, + "grad_norm": 1.609375, + "learning_rate": 1.7536923838662243e-05, + "loss": 1.0702, + "step": 4071 + }, + { + "epoch": 0.6982017703667188, + "grad_norm": 1.4921875, + "learning_rate": 1.7535736810374822e-05, + "loss": 1.0504, + "step": 4072 + }, + { + "epoch": 0.6983732344557086, + "grad_norm": 1.5078125, + "learning_rate": 1.7534549536318647e-05, + "loss": 0.9602, + "step": 4073 + }, + { + "epoch": 0.6985446985446986, + "grad_norm": 1.5859375, + "learning_rate": 1.753336201653244e-05, + "loss": 1.083, + "step": 4074 + }, + { + "epoch": 0.6987161626336884, + "grad_norm": 1.5859375, + "learning_rate": 1.753217425105493e-05, + "loss": 0.9854, + "step": 4075 + }, + { + "epoch": 0.6988876267226782, + "grad_norm": 1.5625, + "learning_rate": 1.7530986239924858e-05, + "loss": 0.9757, + "step": 4076 + }, + { + "epoch": 0.6990590908116682, + "grad_norm": 1.5703125, + "learning_rate": 1.7529797983180962e-05, + "loss": 1.0032, + "step": 4077 + }, + { + "epoch": 0.699230554900658, + "grad_norm": 1.6015625, + "learning_rate": 1.7528609480861995e-05, + "loss": 0.9975, + "step": 4078 + }, + { + "epoch": 0.6994020189896478, + "grad_norm": 1.5859375, + "learning_rate": 1.752742073300673e-05, + "loss": 1.0549, + "step": 4079 + }, + { + "epoch": 0.6995734830786378, + "grad_norm": 1.5625, + "learning_rate": 1.7526231739653923e-05, + "loss": 1.051, + "step": 4080 + }, + { + "epoch": 0.6997449471676276, + "grad_norm": 1.5234375, + "learning_rate": 1.7525042500842363e-05, + "loss": 1.0099, + "step": 4081 + }, + { + "epoch": 0.6999164112566174, + 
"grad_norm": 1.5546875, + "learning_rate": 1.7523853016610826e-05, + "loss": 1.0453, + "step": 4082 + }, + { + "epoch": 0.7000878753456073, + "grad_norm": 1.7421875, + "learning_rate": 1.7522663286998113e-05, + "loss": 1.1041, + "step": 4083 + }, + { + "epoch": 0.7002593394345972, + "grad_norm": 1.546875, + "learning_rate": 1.752147331204302e-05, + "loss": 0.9947, + "step": 4084 + }, + { + "epoch": 0.700430803523587, + "grad_norm": 1.53125, + "learning_rate": 1.752028309178436e-05, + "loss": 1.012, + "step": 4085 + }, + { + "epoch": 0.7006022676125769, + "grad_norm": 1.5234375, + "learning_rate": 1.751909262626095e-05, + "loss": 1.0268, + "step": 4086 + }, + { + "epoch": 0.7007737317015668, + "grad_norm": 1.59375, + "learning_rate": 1.7517901915511614e-05, + "loss": 0.9548, + "step": 4087 + }, + { + "epoch": 0.7009451957905566, + "grad_norm": 1.6484375, + "learning_rate": 1.7516710959575188e-05, + "loss": 1.0622, + "step": 4088 + }, + { + "epoch": 0.7011166598795465, + "grad_norm": 1.5234375, + "learning_rate": 1.751551975849051e-05, + "loss": 0.9667, + "step": 4089 + }, + { + "epoch": 0.7012881239685363, + "grad_norm": 1.625, + "learning_rate": 1.751432831229644e-05, + "loss": 1.0628, + "step": 4090 + }, + { + "epoch": 0.7014595880575262, + "grad_norm": 1.7578125, + "learning_rate": 1.751313662103182e-05, + "loss": 1.0634, + "step": 4091 + }, + { + "epoch": 0.7016310521465161, + "grad_norm": 1.515625, + "learning_rate": 1.7511944684735523e-05, + "loss": 1.0062, + "step": 4092 + }, + { + "epoch": 0.7018025162355059, + "grad_norm": 1.609375, + "learning_rate": 1.7510752503446423e-05, + "loss": 1.0688, + "step": 4093 + }, + { + "epoch": 0.7019739803244958, + "grad_norm": 1.6015625, + "learning_rate": 1.7509560077203404e-05, + "loss": 0.9527, + "step": 4094 + }, + { + "epoch": 0.7021454444134857, + "grad_norm": 1.578125, + "learning_rate": 1.7508367406045348e-05, + "loss": 1.0027, + "step": 4095 + }, + { + "epoch": 0.7023169085024755, + "grad_norm": 1.46875, + "learning_rate": 1.750717449001116e-05, + "loss": 0.9678, + "step": 4096 + }, + { + "epoch": 0.7024883725914653, + "grad_norm": 1.484375, + "learning_rate": 1.750598132913974e-05, + "loss": 0.9813, + "step": 4097 + }, + { + "epoch": 0.7026598366804553, + "grad_norm": 1.59375, + "learning_rate": 1.7504787923470007e-05, + "loss": 1.0437, + "step": 4098 + }, + { + "epoch": 0.7028313007694451, + "grad_norm": 1.46875, + "learning_rate": 1.7503594273040877e-05, + "loss": 0.981, + "step": 4099 + }, + { + "epoch": 0.7030027648584349, + "grad_norm": 1.53125, + "learning_rate": 1.7502400377891285e-05, + "loss": 0.9728, + "step": 4100 + }, + { + "epoch": 0.7031742289474249, + "grad_norm": 1.6484375, + "learning_rate": 1.7501206238060162e-05, + "loss": 1.0596, + "step": 4101 + }, + { + "epoch": 0.7033456930364147, + "grad_norm": 1.4921875, + "learning_rate": 1.7500011853586457e-05, + "loss": 0.9799, + "step": 4102 + }, + { + "epoch": 0.7035171571254045, + "grad_norm": 1.5859375, + "learning_rate": 1.7498817224509124e-05, + "loss": 0.9499, + "step": 4103 + }, + { + "epoch": 0.7036886212143945, + "grad_norm": 1.6171875, + "learning_rate": 1.7497622350867125e-05, + "loss": 1.1101, + "step": 4104 + }, + { + "epoch": 0.7038600853033843, + "grad_norm": 1.578125, + "learning_rate": 1.7496427232699423e-05, + "loss": 1.0013, + "step": 4105 + }, + { + "epoch": 0.7040315493923741, + "grad_norm": 1.53125, + "learning_rate": 1.7495231870045002e-05, + "loss": 0.9436, + "step": 4106 + }, + { + "epoch": 0.704203013481364, + "grad_norm": 1.6875, + "learning_rate": 
1.749403626294285e-05, + "loss": 1.1108, + "step": 4107 + }, + { + "epoch": 0.7043744775703539, + "grad_norm": 1.578125, + "learning_rate": 1.7492840411431952e-05, + "loss": 0.9605, + "step": 4108 + }, + { + "epoch": 0.7045459416593437, + "grad_norm": 1.578125, + "learning_rate": 1.7491644315551314e-05, + "loss": 1.0315, + "step": 4109 + }, + { + "epoch": 0.7047174057483336, + "grad_norm": 1.5, + "learning_rate": 1.7490447975339943e-05, + "loss": 0.9563, + "step": 4110 + }, + { + "epoch": 0.7048888698373235, + "grad_norm": 1.546875, + "learning_rate": 1.7489251390836853e-05, + "loss": 0.9553, + "step": 4111 + }, + { + "epoch": 0.7050603339263133, + "grad_norm": 1.609375, + "learning_rate": 1.748805456208108e-05, + "loss": 1.0408, + "step": 4112 + }, + { + "epoch": 0.7052317980153032, + "grad_norm": 1.5390625, + "learning_rate": 1.7486857489111643e-05, + "loss": 1.0166, + "step": 4113 + }, + { + "epoch": 0.705403262104293, + "grad_norm": 1.4921875, + "learning_rate": 1.7485660171967595e-05, + "loss": 0.9863, + "step": 4114 + }, + { + "epoch": 0.7055747261932829, + "grad_norm": 1.5625, + "learning_rate": 1.748446261068798e-05, + "loss": 0.9943, + "step": 4115 + }, + { + "epoch": 0.7057461902822728, + "grad_norm": 1.5546875, + "learning_rate": 1.7483264805311856e-05, + "loss": 0.9912, + "step": 4116 + }, + { + "epoch": 0.7059176543712626, + "grad_norm": 1.5, + "learning_rate": 1.7482066755878287e-05, + "loss": 1.0156, + "step": 4117 + }, + { + "epoch": 0.7060891184602525, + "grad_norm": 1.6015625, + "learning_rate": 1.7480868462426345e-05, + "loss": 1.0942, + "step": 4118 + }, + { + "epoch": 0.7062605825492423, + "grad_norm": 1.4921875, + "learning_rate": 1.7479669924995117e-05, + "loss": 0.9926, + "step": 4119 + }, + { + "epoch": 0.7064320466382322, + "grad_norm": 1.5390625, + "learning_rate": 1.747847114362368e-05, + "loss": 1.0409, + "step": 4120 + }, + { + "epoch": 0.706603510727222, + "grad_norm": 1.5859375, + "learning_rate": 1.747727211835114e-05, + "loss": 1.0482, + "step": 4121 + }, + { + "epoch": 0.7067749748162119, + "grad_norm": 1.5625, + "learning_rate": 1.7476072849216602e-05, + "loss": 1.0351, + "step": 4122 + }, + { + "epoch": 0.7069464389052018, + "grad_norm": 1.6171875, + "learning_rate": 1.7474873336259172e-05, + "loss": 1.0221, + "step": 4123 + }, + { + "epoch": 0.7071179029941916, + "grad_norm": 1.59375, + "learning_rate": 1.747367357951798e-05, + "loss": 0.9936, + "step": 4124 + }, + { + "epoch": 0.7072893670831815, + "grad_norm": 1.6640625, + "learning_rate": 1.747247357903214e-05, + "loss": 1.1066, + "step": 4125 + }, + { + "epoch": 0.7074608311721714, + "grad_norm": 1.546875, + "learning_rate": 1.7471273334840807e-05, + "loss": 1.0051, + "step": 4126 + }, + { + "epoch": 0.7076322952611612, + "grad_norm": 1.546875, + "learning_rate": 1.7470072846983112e-05, + "loss": 1.006, + "step": 4127 + }, + { + "epoch": 0.707803759350151, + "grad_norm": 1.6171875, + "learning_rate": 1.7468872115498216e-05, + "loss": 0.9908, + "step": 4128 + }, + { + "epoch": 0.707975223439141, + "grad_norm": 1.5625, + "learning_rate": 1.7467671140425272e-05, + "loss": 0.9295, + "step": 4129 + }, + { + "epoch": 0.7081466875281308, + "grad_norm": 1.5234375, + "learning_rate": 1.746646992180345e-05, + "loss": 0.9709, + "step": 4130 + }, + { + "epoch": 0.7083181516171206, + "grad_norm": 1.5234375, + "learning_rate": 1.7465268459671932e-05, + "loss": 1.0124, + "step": 4131 + }, + { + "epoch": 0.7084896157061106, + "grad_norm": 1.5390625, + "learning_rate": 1.7464066754069893e-05, + "loss": 0.966, + 
"step": 4132 + }, + { + "epoch": 0.7086610797951004, + "grad_norm": 1.53125, + "learning_rate": 1.7462864805036535e-05, + "loss": 0.9522, + "step": 4133 + }, + { + "epoch": 0.7088325438840902, + "grad_norm": 1.4921875, + "learning_rate": 1.746166261261105e-05, + "loss": 0.9749, + "step": 4134 + }, + { + "epoch": 0.7090040079730802, + "grad_norm": 1.5625, + "learning_rate": 1.746046017683265e-05, + "loss": 0.9992, + "step": 4135 + }, + { + "epoch": 0.70917547206207, + "grad_norm": 1.6015625, + "learning_rate": 1.7459257497740548e-05, + "loss": 1.0595, + "step": 4136 + }, + { + "epoch": 0.7093469361510598, + "grad_norm": 1.5390625, + "learning_rate": 1.7458054575373973e-05, + "loss": 1.0378, + "step": 4137 + }, + { + "epoch": 0.7095184002400498, + "grad_norm": 1.4375, + "learning_rate": 1.7456851409772153e-05, + "loss": 0.9598, + "step": 4138 + }, + { + "epoch": 0.7096898643290396, + "grad_norm": 1.609375, + "learning_rate": 1.745564800097433e-05, + "loss": 1.0255, + "step": 4139 + }, + { + "epoch": 0.7098613284180294, + "grad_norm": 1.5390625, + "learning_rate": 1.7454444349019747e-05, + "loss": 1.005, + "step": 4140 + }, + { + "epoch": 0.7100327925070193, + "grad_norm": 1.6953125, + "learning_rate": 1.7453240453947664e-05, + "loss": 1.0376, + "step": 4141 + }, + { + "epoch": 0.7102042565960092, + "grad_norm": 1.53125, + "learning_rate": 1.7452036315797346e-05, + "loss": 0.9216, + "step": 4142 + }, + { + "epoch": 0.710375720684999, + "grad_norm": 1.703125, + "learning_rate": 1.745083193460806e-05, + "loss": 1.0864, + "step": 4143 + }, + { + "epoch": 0.7105471847739889, + "grad_norm": 1.5546875, + "learning_rate": 1.7449627310419086e-05, + "loss": 0.9865, + "step": 4144 + }, + { + "epoch": 0.7107186488629788, + "grad_norm": 1.640625, + "learning_rate": 1.7448422443269716e-05, + "loss": 1.0528, + "step": 4145 + }, + { + "epoch": 0.7108901129519686, + "grad_norm": 1.640625, + "learning_rate": 1.744721733319924e-05, + "loss": 1.0079, + "step": 4146 + }, + { + "epoch": 0.7110615770409585, + "grad_norm": 1.5078125, + "learning_rate": 1.7446011980246963e-05, + "loss": 0.9296, + "step": 4147 + }, + { + "epoch": 0.7112330411299483, + "grad_norm": 1.578125, + "learning_rate": 1.7444806384452198e-05, + "loss": 1.0596, + "step": 4148 + }, + { + "epoch": 0.7114045052189382, + "grad_norm": 1.5859375, + "learning_rate": 1.744360054585426e-05, + "loss": 0.9702, + "step": 4149 + }, + { + "epoch": 0.7115759693079281, + "grad_norm": 1.5078125, + "learning_rate": 1.744239446449248e-05, + "loss": 0.933, + "step": 4150 + }, + { + "epoch": 0.7117474333969179, + "grad_norm": 1.5859375, + "learning_rate": 1.744118814040619e-05, + "loss": 1.1285, + "step": 4151 + }, + { + "epoch": 0.7119188974859078, + "grad_norm": 1.5625, + "learning_rate": 1.7439981573634734e-05, + "loss": 1.0722, + "step": 4152 + }, + { + "epoch": 0.7120903615748977, + "grad_norm": 1.609375, + "learning_rate": 1.743877476421746e-05, + "loss": 1.0448, + "step": 4153 + }, + { + "epoch": 0.7122618256638875, + "grad_norm": 1.5390625, + "learning_rate": 1.7437567712193732e-05, + "loss": 0.93, + "step": 4154 + }, + { + "epoch": 0.7124332897528773, + "grad_norm": 1.5625, + "learning_rate": 1.7436360417602915e-05, + "loss": 0.989, + "step": 4155 + }, + { + "epoch": 0.7126047538418673, + "grad_norm": 1.53125, + "learning_rate": 1.743515288048438e-05, + "loss": 1.0188, + "step": 4156 + }, + { + "epoch": 0.7127762179308571, + "grad_norm": 1.6171875, + "learning_rate": 1.7433945100877513e-05, + "loss": 1.0177, + "step": 4157 + }, + { + "epoch": 
0.7129476820198469, + "grad_norm": 1.5234375, + "learning_rate": 1.74327370788217e-05, + "loss": 0.9703, + "step": 4158 + }, + { + "epoch": 0.7131191461088369, + "grad_norm": 1.6015625, + "learning_rate": 1.743152881435634e-05, + "loss": 1.0018, + "step": 4159 + }, + { + "epoch": 0.7132906101978267, + "grad_norm": 1.65625, + "learning_rate": 1.7430320307520844e-05, + "loss": 1.0156, + "step": 4160 + }, + { + "epoch": 0.7134620742868165, + "grad_norm": 1.6015625, + "learning_rate": 1.7429111558354624e-05, + "loss": 1.0338, + "step": 4161 + }, + { + "epoch": 0.7136335383758065, + "grad_norm": 1.6640625, + "learning_rate": 1.74279025668971e-05, + "loss": 1.0291, + "step": 4162 + }, + { + "epoch": 0.7138050024647963, + "grad_norm": 1.640625, + "learning_rate": 1.7426693333187702e-05, + "loss": 1.1897, + "step": 4163 + }, + { + "epoch": 0.7139764665537861, + "grad_norm": 1.6328125, + "learning_rate": 1.7425483857265865e-05, + "loss": 1.0438, + "step": 4164 + }, + { + "epoch": 0.714147930642776, + "grad_norm": 1.5390625, + "learning_rate": 1.742427413917104e-05, + "loss": 1.0547, + "step": 4165 + }, + { + "epoch": 0.7143193947317659, + "grad_norm": 1.546875, + "learning_rate": 1.742306417894268e-05, + "loss": 1.0058, + "step": 4166 + }, + { + "epoch": 0.7144908588207557, + "grad_norm": 1.6015625, + "learning_rate": 1.7421853976620245e-05, + "loss": 1.0136, + "step": 4167 + }, + { + "epoch": 0.7146623229097456, + "grad_norm": 1.515625, + "learning_rate": 1.74206435322432e-05, + "loss": 1.0062, + "step": 4168 + }, + { + "epoch": 0.7148337869987355, + "grad_norm": 1.6328125, + "learning_rate": 1.7419432845851027e-05, + "loss": 1.1097, + "step": 4169 + }, + { + "epoch": 0.7150052510877253, + "grad_norm": 1.6640625, + "learning_rate": 1.741822191748321e-05, + "loss": 1.017, + "step": 4170 + }, + { + "epoch": 0.7151767151767152, + "grad_norm": 1.609375, + "learning_rate": 1.7417010747179245e-05, + "loss": 0.9241, + "step": 4171 + }, + { + "epoch": 0.715348179265705, + "grad_norm": 1.5859375, + "learning_rate": 1.7415799334978624e-05, + "loss": 0.9328, + "step": 4172 + }, + { + "epoch": 0.7155196433546949, + "grad_norm": 1.5859375, + "learning_rate": 1.7414587680920864e-05, + "loss": 0.9638, + "step": 4173 + }, + { + "epoch": 0.7156911074436848, + "grad_norm": 1.5625, + "learning_rate": 1.741337578504548e-05, + "loss": 1.0168, + "step": 4174 + }, + { + "epoch": 0.7158625715326746, + "grad_norm": 1.609375, + "learning_rate": 1.7412163647391996e-05, + "loss": 1.0322, + "step": 4175 + }, + { + "epoch": 0.7160340356216645, + "grad_norm": 1.59375, + "learning_rate": 1.7410951267999943e-05, + "loss": 1.0177, + "step": 4176 + }, + { + "epoch": 0.7162054997106544, + "grad_norm": 1.5625, + "learning_rate": 1.740973864690886e-05, + "loss": 1.0237, + "step": 4177 + }, + { + "epoch": 0.7163769637996442, + "grad_norm": 1.5546875, + "learning_rate": 1.7408525784158298e-05, + "loss": 0.9636, + "step": 4178 + }, + { + "epoch": 0.716548427888634, + "grad_norm": 1.6171875, + "learning_rate": 1.7407312679787814e-05, + "loss": 0.9987, + "step": 4179 + }, + { + "epoch": 0.716719891977624, + "grad_norm": 1.5234375, + "learning_rate": 1.740609933383697e-05, + "loss": 0.9429, + "step": 4180 + }, + { + "epoch": 0.7168913560666138, + "grad_norm": 1.625, + "learning_rate": 1.7404885746345333e-05, + "loss": 1.0416, + "step": 4181 + }, + { + "epoch": 0.7170628201556036, + "grad_norm": 1.6171875, + "learning_rate": 1.7403671917352492e-05, + "loss": 1.0059, + "step": 4182 + }, + { + "epoch": 0.7172342842445936, + "grad_norm": 
1.6796875, + "learning_rate": 1.7402457846898032e-05, + "loss": 1.0392, + "step": 4183 + }, + { + "epoch": 0.7174057483335834, + "grad_norm": 1.5390625, + "learning_rate": 1.7401243535021547e-05, + "loss": 1.034, + "step": 4184 + }, + { + "epoch": 0.7175772124225732, + "grad_norm": 1.5703125, + "learning_rate": 1.7400028981762634e-05, + "loss": 1.0569, + "step": 4185 + }, + { + "epoch": 0.7177486765115632, + "grad_norm": 1.53125, + "learning_rate": 1.7398814187160913e-05, + "loss": 0.996, + "step": 4186 + }, + { + "epoch": 0.717920140600553, + "grad_norm": 1.59375, + "learning_rate": 1.7397599151256003e-05, + "loss": 1.0013, + "step": 4187 + }, + { + "epoch": 0.7180916046895428, + "grad_norm": 1.578125, + "learning_rate": 1.739638387408752e-05, + "loss": 0.9945, + "step": 4188 + }, + { + "epoch": 0.7182630687785327, + "grad_norm": 1.578125, + "learning_rate": 1.7395168355695116e-05, + "loss": 1.0584, + "step": 4189 + }, + { + "epoch": 0.7184345328675226, + "grad_norm": 1.5390625, + "learning_rate": 1.739395259611842e-05, + "loss": 0.9664, + "step": 4190 + }, + { + "epoch": 0.7186059969565124, + "grad_norm": 1.578125, + "learning_rate": 1.7392736595397086e-05, + "loss": 1.0509, + "step": 4191 + }, + { + "epoch": 0.7187774610455023, + "grad_norm": 1.5546875, + "learning_rate": 1.7391520353570772e-05, + "loss": 0.9658, + "step": 4192 + }, + { + "epoch": 0.7189489251344922, + "grad_norm": 1.5546875, + "learning_rate": 1.739030387067915e-05, + "loss": 1.0161, + "step": 4193 + }, + { + "epoch": 0.719120389223482, + "grad_norm": 1.5625, + "learning_rate": 1.7389087146761886e-05, + "loss": 0.998, + "step": 4194 + }, + { + "epoch": 0.7192918533124719, + "grad_norm": 1.6640625, + "learning_rate": 1.7387870181858666e-05, + "loss": 1.0322, + "step": 4195 + }, + { + "epoch": 0.7194633174014617, + "grad_norm": 1.65625, + "learning_rate": 1.738665297600918e-05, + "loss": 1.0163, + "step": 4196 + }, + { + "epoch": 0.7196347814904516, + "grad_norm": 1.59375, + "learning_rate": 1.738543552925312e-05, + "loss": 0.9975, + "step": 4197 + }, + { + "epoch": 0.7198062455794415, + "grad_norm": 1.5, + "learning_rate": 1.7384217841630207e-05, + "loss": 1.0121, + "step": 4198 + }, + { + "epoch": 0.7199777096684313, + "grad_norm": 1.6171875, + "learning_rate": 1.7382999913180135e-05, + "loss": 1.0565, + "step": 4199 + }, + { + "epoch": 0.7201491737574212, + "grad_norm": 1.671875, + "learning_rate": 1.7381781743942636e-05, + "loss": 0.9933, + "step": 4200 + }, + { + "epoch": 0.7201491737574212, + "eval_loss": 0.8648467063903809, + "eval_runtime": 837.3272, + "eval_samples_per_second": 2.984, + "eval_steps_per_second": 2.984, + "step": 4200 + }, + { + "epoch": 0.7203206378464111, + "grad_norm": 1.5625, + "learning_rate": 1.738056333395744e-05, + "loss": 1.0291, + "step": 4201 + }, + { + "epoch": 0.7204921019354009, + "grad_norm": 1.5625, + "learning_rate": 1.7379344683264275e-05, + "loss": 1.0251, + "step": 4202 + }, + { + "epoch": 0.7206635660243907, + "grad_norm": 1.5, + "learning_rate": 1.7378125791902897e-05, + "loss": 0.9141, + "step": 4203 + }, + { + "epoch": 0.7208350301133807, + "grad_norm": 1.5625, + "learning_rate": 1.737690665991305e-05, + "loss": 0.9753, + "step": 4204 + }, + { + "epoch": 0.7210064942023705, + "grad_norm": 1.5703125, + "learning_rate": 1.7375687287334502e-05, + "loss": 1.0581, + "step": 4205 + }, + { + "epoch": 0.7211779582913603, + "grad_norm": 1.6640625, + "learning_rate": 1.7374467674207015e-05, + "loss": 1.0378, + "step": 4206 + }, + { + "epoch": 0.7213494223803503, + "grad_norm": 
1.5859375, + "learning_rate": 1.7373247820570368e-05, + "loss": 1.0007, + "step": 4207 + }, + { + "epoch": 0.7215208864693401, + "grad_norm": 1.640625, + "learning_rate": 1.7372027726464346e-05, + "loss": 0.9991, + "step": 4208 + }, + { + "epoch": 0.7216923505583299, + "grad_norm": 1.515625, + "learning_rate": 1.7370807391928737e-05, + "loss": 0.9788, + "step": 4209 + }, + { + "epoch": 0.7218638146473199, + "grad_norm": 1.5625, + "learning_rate": 1.7369586817003345e-05, + "loss": 0.9624, + "step": 4210 + }, + { + "epoch": 0.7220352787363097, + "grad_norm": 1.6171875, + "learning_rate": 1.7368366001727973e-05, + "loss": 0.9637, + "step": 4211 + }, + { + "epoch": 0.7222067428252995, + "grad_norm": 1.65625, + "learning_rate": 1.736714494614244e-05, + "loss": 1.0159, + "step": 4212 + }, + { + "epoch": 0.7223782069142893, + "grad_norm": 1.640625, + "learning_rate": 1.736592365028657e-05, + "loss": 0.9424, + "step": 4213 + }, + { + "epoch": 0.7225496710032793, + "grad_norm": 1.59375, + "learning_rate": 1.736470211420019e-05, + "loss": 1.0695, + "step": 4214 + }, + { + "epoch": 0.7227211350922691, + "grad_norm": 1.5625, + "learning_rate": 1.7363480337923143e-05, + "loss": 0.9998, + "step": 4215 + }, + { + "epoch": 0.7228925991812589, + "grad_norm": 1.5546875, + "learning_rate": 1.736225832149527e-05, + "loss": 0.9964, + "step": 4216 + }, + { + "epoch": 0.7230640632702489, + "grad_norm": 1.640625, + "learning_rate": 1.7361036064956433e-05, + "loss": 0.9818, + "step": 4217 + }, + { + "epoch": 0.7232355273592387, + "grad_norm": 1.8046875, + "learning_rate": 1.7359813568346492e-05, + "loss": 1.0565, + "step": 4218 + }, + { + "epoch": 0.7234069914482285, + "grad_norm": 1.546875, + "learning_rate": 1.735859083170531e-05, + "loss": 0.9854, + "step": 4219 + }, + { + "epoch": 0.7235784555372184, + "grad_norm": 1.625, + "learning_rate": 1.7357367855072778e-05, + "loss": 1.0865, + "step": 4220 + }, + { + "epoch": 0.7237499196262083, + "grad_norm": 1.609375, + "learning_rate": 1.7356144638488772e-05, + "loss": 0.9946, + "step": 4221 + }, + { + "epoch": 0.7239213837151981, + "grad_norm": 1.5859375, + "learning_rate": 1.7354921181993187e-05, + "loss": 0.948, + "step": 4222 + }, + { + "epoch": 0.724092847804188, + "grad_norm": 1.5703125, + "learning_rate": 1.7353697485625928e-05, + "loss": 1.0151, + "step": 4223 + }, + { + "epoch": 0.7242643118931779, + "grad_norm": 1.5, + "learning_rate": 1.73524735494269e-05, + "loss": 0.953, + "step": 4224 + }, + { + "epoch": 0.7244357759821677, + "grad_norm": 1.734375, + "learning_rate": 1.7351249373436028e-05, + "loss": 1.007, + "step": 4225 + }, + { + "epoch": 0.7246072400711576, + "grad_norm": 1.71875, + "learning_rate": 1.7350024957693232e-05, + "loss": 1.072, + "step": 4226 + }, + { + "epoch": 0.7247787041601474, + "grad_norm": 1.6171875, + "learning_rate": 1.734880030223844e-05, + "loss": 1.1049, + "step": 4227 + }, + { + "epoch": 0.7249501682491373, + "grad_norm": 1.6015625, + "learning_rate": 1.7347575407111595e-05, + "loss": 1.0501, + "step": 4228 + }, + { + "epoch": 0.7251216323381272, + "grad_norm": 1.6796875, + "learning_rate": 1.7346350272352653e-05, + "loss": 1.1176, + "step": 4229 + }, + { + "epoch": 0.725293096427117, + "grad_norm": 1.59375, + "learning_rate": 1.7345124898001562e-05, + "loss": 1.0401, + "step": 4230 + }, + { + "epoch": 0.7254645605161069, + "grad_norm": 1.6015625, + "learning_rate": 1.7343899284098292e-05, + "loss": 1.073, + "step": 4231 + }, + { + "epoch": 0.7256360246050968, + "grad_norm": 1.671875, + "learning_rate": 
1.734267343068281e-05, + "loss": 1.0733, + "step": 4232 + }, + { + "epoch": 0.7258074886940866, + "grad_norm": 1.609375, + "learning_rate": 1.7341447337795098e-05, + "loss": 1.0307, + "step": 4233 + }, + { + "epoch": 0.7259789527830764, + "grad_norm": 1.6328125, + "learning_rate": 1.734022100547514e-05, + "loss": 1.0375, + "step": 4234 + }, + { + "epoch": 0.7261504168720664, + "grad_norm": 1.46875, + "learning_rate": 1.7338994433762938e-05, + "loss": 0.8565, + "step": 4235 + }, + { + "epoch": 0.7263218809610562, + "grad_norm": 1.578125, + "learning_rate": 1.733776762269849e-05, + "loss": 1.0315, + "step": 4236 + }, + { + "epoch": 0.726493345050046, + "grad_norm": 1.6171875, + "learning_rate": 1.7336540572321807e-05, + "loss": 0.9393, + "step": 4237 + }, + { + "epoch": 0.726664809139036, + "grad_norm": 1.59375, + "learning_rate": 1.733531328267291e-05, + "loss": 1.0198, + "step": 4238 + }, + { + "epoch": 0.7268362732280258, + "grad_norm": 1.5078125, + "learning_rate": 1.7334085753791824e-05, + "loss": 0.9681, + "step": 4239 + }, + { + "epoch": 0.7270077373170156, + "grad_norm": 1.59375, + "learning_rate": 1.7332857985718584e-05, + "loss": 1.0287, + "step": 4240 + }, + { + "epoch": 0.7271792014060056, + "grad_norm": 1.640625, + "learning_rate": 1.7331629978493234e-05, + "loss": 1.0395, + "step": 4241 + }, + { + "epoch": 0.7273506654949954, + "grad_norm": 1.6953125, + "learning_rate": 1.733040173215582e-05, + "loss": 1.0084, + "step": 4242 + }, + { + "epoch": 0.7275221295839852, + "grad_norm": 1.5703125, + "learning_rate": 1.7329173246746406e-05, + "loss": 1.0966, + "step": 4243 + }, + { + "epoch": 0.7276935936729751, + "grad_norm": 1.6953125, + "learning_rate": 1.732794452230505e-05, + "loss": 1.1253, + "step": 4244 + }, + { + "epoch": 0.727865057761965, + "grad_norm": 1.5, + "learning_rate": 1.7326715558871826e-05, + "loss": 0.9047, + "step": 4245 + }, + { + "epoch": 0.7280365218509548, + "grad_norm": 1.6328125, + "learning_rate": 1.7325486356486823e-05, + "loss": 1.1244, + "step": 4246 + }, + { + "epoch": 0.7282079859399447, + "grad_norm": 1.6953125, + "learning_rate": 1.732425691519012e-05, + "loss": 1.0456, + "step": 4247 + }, + { + "epoch": 0.7283794500289346, + "grad_norm": 1.59375, + "learning_rate": 1.732302723502182e-05, + "loss": 0.9747, + "step": 4248 + }, + { + "epoch": 0.7285509141179244, + "grad_norm": 1.5, + "learning_rate": 1.732179731602203e-05, + "loss": 1.0131, + "step": 4249 + }, + { + "epoch": 0.7287223782069143, + "grad_norm": 1.46875, + "learning_rate": 1.7320567158230855e-05, + "loss": 0.988, + "step": 4250 + }, + { + "epoch": 0.7288938422959041, + "grad_norm": 1.609375, + "learning_rate": 1.7319336761688415e-05, + "loss": 1.0683, + "step": 4251 + }, + { + "epoch": 0.729065306384894, + "grad_norm": 1.5703125, + "learning_rate": 1.7318106126434845e-05, + "loss": 1.0612, + "step": 4252 + }, + { + "epoch": 0.7292367704738839, + "grad_norm": 1.4765625, + "learning_rate": 1.7316875252510274e-05, + "loss": 0.9791, + "step": 4253 + }, + { + "epoch": 0.7294082345628737, + "grad_norm": 1.4609375, + "learning_rate": 1.7315644139954852e-05, + "loss": 0.9713, + "step": 4254 + }, + { + "epoch": 0.7295796986518636, + "grad_norm": 1.5546875, + "learning_rate": 1.7314412788808727e-05, + "loss": 1.002, + "step": 4255 + }, + { + "epoch": 0.7297511627408535, + "grad_norm": 1.671875, + "learning_rate": 1.731318119911205e-05, + "loss": 1.0398, + "step": 4256 + }, + { + "epoch": 0.7299226268298433, + "grad_norm": 1.4921875, + "learning_rate": 1.7311949370905e-05, + "loss": 0.9147, + 
"step": 4257 + }, + { + "epoch": 0.7300940909188331, + "grad_norm": 1.4921875, + "learning_rate": 1.731071730422775e-05, + "loss": 1.0034, + "step": 4258 + }, + { + "epoch": 0.7302655550078231, + "grad_norm": 1.5, + "learning_rate": 1.7309484999120475e-05, + "loss": 0.9908, + "step": 4259 + }, + { + "epoch": 0.7304370190968129, + "grad_norm": 1.59375, + "learning_rate": 1.730825245562337e-05, + "loss": 1.0843, + "step": 4260 + }, + { + "epoch": 0.7306084831858027, + "grad_norm": 1.703125, + "learning_rate": 1.730701967377663e-05, + "loss": 1.0963, + "step": 4261 + }, + { + "epoch": 0.7307799472747927, + "grad_norm": 1.515625, + "learning_rate": 1.7305786653620466e-05, + "loss": 0.9635, + "step": 4262 + }, + { + "epoch": 0.7309514113637825, + "grad_norm": 1.515625, + "learning_rate": 1.730455339519509e-05, + "loss": 1.0101, + "step": 4263 + }, + { + "epoch": 0.7311228754527723, + "grad_norm": 1.640625, + "learning_rate": 1.730331989854072e-05, + "loss": 1.0204, + "step": 4264 + }, + { + "epoch": 0.7312943395417623, + "grad_norm": 1.609375, + "learning_rate": 1.730208616369758e-05, + "loss": 1.019, + "step": 4265 + }, + { + "epoch": 0.7314658036307521, + "grad_norm": 1.4609375, + "learning_rate": 1.730085219070592e-05, + "loss": 0.928, + "step": 4266 + }, + { + "epoch": 0.7316372677197419, + "grad_norm": 1.6015625, + "learning_rate": 1.7299617979605976e-05, + "loss": 1.0206, + "step": 4267 + }, + { + "epoch": 0.7318087318087318, + "grad_norm": 1.5546875, + "learning_rate": 1.7298383530438002e-05, + "loss": 0.9391, + "step": 4268 + }, + { + "epoch": 0.7319801958977217, + "grad_norm": 1.4921875, + "learning_rate": 1.7297148843242257e-05, + "loss": 0.9757, + "step": 4269 + }, + { + "epoch": 0.7321516599867115, + "grad_norm": 1.5546875, + "learning_rate": 1.7295913918059008e-05, + "loss": 0.993, + "step": 4270 + }, + { + "epoch": 0.7323231240757014, + "grad_norm": 1.59375, + "learning_rate": 1.7294678754928535e-05, + "loss": 0.9771, + "step": 4271 + }, + { + "epoch": 0.7324945881646913, + "grad_norm": 1.5625, + "learning_rate": 1.7293443353891118e-05, + "loss": 1.0116, + "step": 4272 + }, + { + "epoch": 0.7326660522536811, + "grad_norm": 1.4921875, + "learning_rate": 1.729220771498705e-05, + "loss": 0.9867, + "step": 4273 + }, + { + "epoch": 0.732837516342671, + "grad_norm": 1.59375, + "learning_rate": 1.7290971838256624e-05, + "loss": 1.0323, + "step": 4274 + }, + { + "epoch": 0.7330089804316609, + "grad_norm": 1.6171875, + "learning_rate": 1.7289735723740157e-05, + "loss": 0.9824, + "step": 4275 + }, + { + "epoch": 0.7331804445206507, + "grad_norm": 1.578125, + "learning_rate": 1.7288499371477954e-05, + "loss": 0.9785, + "step": 4276 + }, + { + "epoch": 0.7333519086096406, + "grad_norm": 1.625, + "learning_rate": 1.728726278151034e-05, + "loss": 1.0296, + "step": 4277 + }, + { + "epoch": 0.7335233726986304, + "grad_norm": 1.5078125, + "learning_rate": 1.7286025953877644e-05, + "loss": 1.1321, + "step": 4278 + }, + { + "epoch": 0.7336948367876203, + "grad_norm": 1.5703125, + "learning_rate": 1.728478888862021e-05, + "loss": 1.0588, + "step": 4279 + }, + { + "epoch": 0.7338663008766102, + "grad_norm": 1.609375, + "learning_rate": 1.7283551585778375e-05, + "loss": 1.0378, + "step": 4280 + }, + { + "epoch": 0.7340377649656, + "grad_norm": 1.875, + "learning_rate": 1.72823140453925e-05, + "loss": 0.9893, + "step": 4281 + }, + { + "epoch": 0.7342092290545899, + "grad_norm": 1.625, + "learning_rate": 1.7281076267502936e-05, + "loss": 1.0624, + "step": 4282 + }, + { + "epoch": 0.7343806931435798, + 
"grad_norm": 1.5859375, + "learning_rate": 1.7279838252150057e-05, + "loss": 1.083, + "step": 4283 + }, + { + "epoch": 0.7345521572325696, + "grad_norm": 1.546875, + "learning_rate": 1.727859999937424e-05, + "loss": 1.0401, + "step": 4284 + }, + { + "epoch": 0.7347236213215594, + "grad_norm": 1.546875, + "learning_rate": 1.727736150921587e-05, + "loss": 1.0169, + "step": 4285 + }, + { + "epoch": 0.7348950854105494, + "grad_norm": 1.546875, + "learning_rate": 1.7276122781715335e-05, + "loss": 1.0907, + "step": 4286 + }, + { + "epoch": 0.7350665494995392, + "grad_norm": 1.6015625, + "learning_rate": 1.727488381691304e-05, + "loss": 1.038, + "step": 4287 + }, + { + "epoch": 0.735238013588529, + "grad_norm": 1.53125, + "learning_rate": 1.727364461484939e-05, + "loss": 1.0173, + "step": 4288 + }, + { + "epoch": 0.735409477677519, + "grad_norm": 1.6328125, + "learning_rate": 1.727240517556479e-05, + "loss": 1.078, + "step": 4289 + }, + { + "epoch": 0.7355809417665088, + "grad_norm": 1.59375, + "learning_rate": 1.7271165499099682e-05, + "loss": 1.0485, + "step": 4290 + }, + { + "epoch": 0.7357524058554986, + "grad_norm": 1.5625, + "learning_rate": 1.7269925585494483e-05, + "loss": 0.9896, + "step": 4291 + }, + { + "epoch": 0.7359238699444886, + "grad_norm": 1.59375, + "learning_rate": 1.7268685434789634e-05, + "loss": 1.0272, + "step": 4292 + }, + { + "epoch": 0.7360953340334784, + "grad_norm": 1.546875, + "learning_rate": 1.7267445047025582e-05, + "loss": 0.9564, + "step": 4293 + }, + { + "epoch": 0.7362667981224682, + "grad_norm": 1.5390625, + "learning_rate": 1.726620442224278e-05, + "loss": 1.0263, + "step": 4294 + }, + { + "epoch": 0.7364382622114581, + "grad_norm": 1.59375, + "learning_rate": 1.7264963560481688e-05, + "loss": 1.0433, + "step": 4295 + }, + { + "epoch": 0.736609726300448, + "grad_norm": 1.65625, + "learning_rate": 1.7263722461782782e-05, + "loss": 0.9827, + "step": 4296 + }, + { + "epoch": 0.7367811903894378, + "grad_norm": 1.6015625, + "learning_rate": 1.726248112618653e-05, + "loss": 1.0504, + "step": 4297 + }, + { + "epoch": 0.7369526544784277, + "grad_norm": 1.5546875, + "learning_rate": 1.7261239553733424e-05, + "loss": 0.9999, + "step": 4298 + }, + { + "epoch": 0.7371241185674176, + "grad_norm": 1.5234375, + "learning_rate": 1.7259997744463948e-05, + "loss": 0.9925, + "step": 4299 + }, + { + "epoch": 0.7372955826564074, + "grad_norm": 1.59375, + "learning_rate": 1.7258755698418606e-05, + "loss": 0.9823, + "step": 4300 + }, + { + "epoch": 0.7374670467453973, + "grad_norm": 1.640625, + "learning_rate": 1.7257513415637913e-05, + "loss": 1.0207, + "step": 4301 + }, + { + "epoch": 0.7376385108343871, + "grad_norm": 1.671875, + "learning_rate": 1.7256270896162373e-05, + "loss": 0.9931, + "step": 4302 + }, + { + "epoch": 0.737809974923377, + "grad_norm": 1.5234375, + "learning_rate": 1.7255028140032517e-05, + "loss": 0.985, + "step": 4303 + }, + { + "epoch": 0.7379814390123669, + "grad_norm": 1.625, + "learning_rate": 1.7253785147288875e-05, + "loss": 1.0894, + "step": 4304 + }, + { + "epoch": 0.7381529031013567, + "grad_norm": 1.671875, + "learning_rate": 1.7252541917971984e-05, + "loss": 1.0508, + "step": 4305 + }, + { + "epoch": 0.7383243671903466, + "grad_norm": 1.6484375, + "learning_rate": 1.7251298452122385e-05, + "loss": 1.0104, + "step": 4306 + }, + { + "epoch": 0.7384958312793364, + "grad_norm": 1.625, + "learning_rate": 1.7250054749780643e-05, + "loss": 1.0856, + "step": 4307 + }, + { + "epoch": 0.7386672953683263, + "grad_norm": 1.609375, + "learning_rate": 
1.7248810810987312e-05, + "loss": 0.968, + "step": 4308 + }, + { + "epoch": 0.7388387594573161, + "grad_norm": 1.59375, + "learning_rate": 1.7247566635782965e-05, + "loss": 1.0389, + "step": 4309 + }, + { + "epoch": 0.739010223546306, + "grad_norm": 1.578125, + "learning_rate": 1.724632222420818e-05, + "loss": 0.9872, + "step": 4310 + }, + { + "epoch": 0.7391816876352959, + "grad_norm": 1.59375, + "learning_rate": 1.7245077576303537e-05, + "loss": 1.0246, + "step": 4311 + }, + { + "epoch": 0.7393531517242857, + "grad_norm": 1.7421875, + "learning_rate": 1.7243832692109632e-05, + "loss": 0.9628, + "step": 4312 + }, + { + "epoch": 0.7395246158132756, + "grad_norm": 1.5390625, + "learning_rate": 1.7242587571667067e-05, + "loss": 1.0645, + "step": 4313 + }, + { + "epoch": 0.7396960799022655, + "grad_norm": 1.53125, + "learning_rate": 1.7241342215016446e-05, + "loss": 0.9326, + "step": 4314 + }, + { + "epoch": 0.7398675439912553, + "grad_norm": 1.65625, + "learning_rate": 1.724009662219839e-05, + "loss": 1.1156, + "step": 4315 + }, + { + "epoch": 0.7400390080802451, + "grad_norm": 1.5390625, + "learning_rate": 1.7238850793253516e-05, + "loss": 0.974, + "step": 4316 + }, + { + "epoch": 0.7402104721692351, + "grad_norm": 1.5234375, + "learning_rate": 1.723760472822246e-05, + "loss": 0.9032, + "step": 4317 + }, + { + "epoch": 0.7403819362582249, + "grad_norm": 1.8671875, + "learning_rate": 1.723635842714586e-05, + "loss": 0.9871, + "step": 4318 + }, + { + "epoch": 0.7405534003472147, + "grad_norm": 1.7734375, + "learning_rate": 1.7235111890064363e-05, + "loss": 1.0518, + "step": 4319 + }, + { + "epoch": 0.7407248644362047, + "grad_norm": 1.6875, + "learning_rate": 1.723386511701862e-05, + "loss": 0.998, + "step": 4320 + }, + { + "epoch": 0.7408963285251945, + "grad_norm": 3.078125, + "learning_rate": 1.72326181080493e-05, + "loss": 1.138, + "step": 4321 + }, + { + "epoch": 0.7410677926141843, + "grad_norm": 2.921875, + "learning_rate": 1.723137086319706e-05, + "loss": 0.8734, + "step": 4322 + }, + { + "epoch": 0.7412392567031743, + "grad_norm": 4.34375, + "learning_rate": 1.7230123382502592e-05, + "loss": 1.0155, + "step": 4323 + }, + { + "epoch": 0.7414107207921641, + "grad_norm": 2.140625, + "learning_rate": 1.7228875666006572e-05, + "loss": 1.0432, + "step": 4324 + }, + { + "epoch": 0.7415821848811539, + "grad_norm": 1.765625, + "learning_rate": 1.72276277137497e-05, + "loss": 1.0208, + "step": 4325 + }, + { + "epoch": 0.7417536489701438, + "grad_norm": 1.6796875, + "learning_rate": 1.7226379525772664e-05, + "loss": 1.0444, + "step": 4326 + }, + { + "epoch": 0.7419251130591337, + "grad_norm": 1.5625, + "learning_rate": 1.7225131102116185e-05, + "loss": 1.0232, + "step": 4327 + }, + { + "epoch": 0.7420965771481235, + "grad_norm": 1.5546875, + "learning_rate": 1.7223882442820968e-05, + "loss": 1.0519, + "step": 4328 + }, + { + "epoch": 0.7422680412371134, + "grad_norm": 1.671875, + "learning_rate": 1.7222633547927745e-05, + "loss": 1.0556, + "step": 4329 + }, + { + "epoch": 0.7424395053261033, + "grad_norm": 1.6796875, + "learning_rate": 1.7221384417477244e-05, + "loss": 1.0376, + "step": 4330 + }, + { + "epoch": 0.7426109694150931, + "grad_norm": 1.5859375, + "learning_rate": 1.7220135051510202e-05, + "loss": 1.0469, + "step": 4331 + }, + { + "epoch": 0.742782433504083, + "grad_norm": 1.53125, + "learning_rate": 1.721888545006737e-05, + "loss": 0.9343, + "step": 4332 + }, + { + "epoch": 0.7429538975930728, + "grad_norm": 2.390625, + "learning_rate": 1.72176356131895e-05, + "loss": 1.045, + 
"step": 4333 + }, + { + "epoch": 0.7431253616820627, + "grad_norm": 1.828125, + "learning_rate": 1.7216385540917353e-05, + "loss": 1.0854, + "step": 4334 + }, + { + "epoch": 0.7432968257710526, + "grad_norm": 1.765625, + "learning_rate": 1.7215135233291698e-05, + "loss": 1.0709, + "step": 4335 + }, + { + "epoch": 0.7434682898600424, + "grad_norm": 1.625, + "learning_rate": 1.7213884690353317e-05, + "loss": 1.0443, + "step": 4336 + }, + { + "epoch": 0.7436397539490323, + "grad_norm": 1.953125, + "learning_rate": 1.7212633912142986e-05, + "loss": 1.0119, + "step": 4337 + }, + { + "epoch": 0.7438112180380222, + "grad_norm": 1.4921875, + "learning_rate": 1.7211382898701506e-05, + "loss": 0.9788, + "step": 4338 + }, + { + "epoch": 0.743982682127012, + "grad_norm": 1.625, + "learning_rate": 1.7210131650069674e-05, + "loss": 0.9576, + "step": 4339 + }, + { + "epoch": 0.7441541462160018, + "grad_norm": 1.546875, + "learning_rate": 1.7208880166288294e-05, + "loss": 0.9501, + "step": 4340 + }, + { + "epoch": 0.7443256103049918, + "grad_norm": 1.671875, + "learning_rate": 1.7207628447398192e-05, + "loss": 1.085, + "step": 4341 + }, + { + "epoch": 0.7444970743939816, + "grad_norm": 1.625, + "learning_rate": 1.720637649344018e-05, + "loss": 0.9839, + "step": 4342 + }, + { + "epoch": 0.7446685384829714, + "grad_norm": 1.6875, + "learning_rate": 1.7205124304455098e-05, + "loss": 1.0275, + "step": 4343 + }, + { + "epoch": 0.7448400025719614, + "grad_norm": 1.5859375, + "learning_rate": 1.7203871880483776e-05, + "loss": 0.9456, + "step": 4344 + }, + { + "epoch": 0.7450114666609512, + "grad_norm": 1.515625, + "learning_rate": 1.720261922156707e-05, + "loss": 0.978, + "step": 4345 + }, + { + "epoch": 0.745182930749941, + "grad_norm": 1.59375, + "learning_rate": 1.7201366327745824e-05, + "loss": 0.9333, + "step": 4346 + }, + { + "epoch": 0.745354394838931, + "grad_norm": 1.515625, + "learning_rate": 1.7200113199060905e-05, + "loss": 1.0612, + "step": 4347 + }, + { + "epoch": 0.7455258589279208, + "grad_norm": 1.5859375, + "learning_rate": 1.7198859835553184e-05, + "loss": 1.0532, + "step": 4348 + }, + { + "epoch": 0.7456973230169106, + "grad_norm": 1.59375, + "learning_rate": 1.7197606237263533e-05, + "loss": 1.0471, + "step": 4349 + }, + { + "epoch": 0.7458687871059005, + "grad_norm": 1.5078125, + "learning_rate": 1.719635240423284e-05, + "loss": 0.9832, + "step": 4350 + }, + { + "epoch": 0.7460402511948904, + "grad_norm": 1.5546875, + "learning_rate": 1.7195098336501997e-05, + "loss": 0.962, + "step": 4351 + }, + { + "epoch": 0.7462117152838802, + "grad_norm": 1.640625, + "learning_rate": 1.71938440341119e-05, + "loss": 1.0084, + "step": 4352 + }, + { + "epoch": 0.7463831793728701, + "grad_norm": 1.546875, + "learning_rate": 1.7192589497103462e-05, + "loss": 1.0764, + "step": 4353 + }, + { + "epoch": 0.74655464346186, + "grad_norm": 1.5703125, + "learning_rate": 1.7191334725517593e-05, + "loss": 0.95, + "step": 4354 + }, + { + "epoch": 0.7467261075508498, + "grad_norm": 1.609375, + "learning_rate": 1.7190079719395222e-05, + "loss": 0.9804, + "step": 4355 + }, + { + "epoch": 0.7468975716398397, + "grad_norm": 1.5078125, + "learning_rate": 1.7188824478777275e-05, + "loss": 0.9459, + "step": 4356 + }, + { + "epoch": 0.7470690357288295, + "grad_norm": 1.4765625, + "learning_rate": 1.7187569003704688e-05, + "loss": 0.9771, + "step": 4357 + }, + { + "epoch": 0.7472404998178194, + "grad_norm": 1.5703125, + "learning_rate": 1.7186313294218416e-05, + "loss": 0.9737, + "step": 4358 + }, + { + "epoch": 
0.7474119639068093, + "grad_norm": 1.6015625, + "learning_rate": 1.71850573503594e-05, + "loss": 1.0141, + "step": 4359 + }, + { + "epoch": 0.7475834279957991, + "grad_norm": 1.515625, + "learning_rate": 1.718380117216861e-05, + "loss": 0.9638, + "step": 4360 + }, + { + "epoch": 0.747754892084789, + "grad_norm": 1.5078125, + "learning_rate": 1.7182544759687015e-05, + "loss": 0.9517, + "step": 4361 + }, + { + "epoch": 0.7479263561737789, + "grad_norm": 1.5546875, + "learning_rate": 1.7181288112955586e-05, + "loss": 1.0122, + "step": 4362 + }, + { + "epoch": 0.7480978202627687, + "grad_norm": 1.4609375, + "learning_rate": 1.7180031232015307e-05, + "loss": 0.9521, + "step": 4363 + }, + { + "epoch": 0.7482692843517585, + "grad_norm": 1.65625, + "learning_rate": 1.7178774116907177e-05, + "loss": 0.994, + "step": 4364 + }, + { + "epoch": 0.7484407484407485, + "grad_norm": 1.5546875, + "learning_rate": 1.7177516767672187e-05, + "loss": 1.0372, + "step": 4365 + }, + { + "epoch": 0.7486122125297383, + "grad_norm": 1.5078125, + "learning_rate": 1.717625918435135e-05, + "loss": 0.9872, + "step": 4366 + }, + { + "epoch": 0.7487836766187281, + "grad_norm": 1.625, + "learning_rate": 1.717500136698567e-05, + "loss": 1.1055, + "step": 4367 + }, + { + "epoch": 0.7489551407077181, + "grad_norm": 1.6640625, + "learning_rate": 1.7173743315616184e-05, + "loss": 1.1213, + "step": 4368 + }, + { + "epoch": 0.7491266047967079, + "grad_norm": 1.6015625, + "learning_rate": 1.7172485030283912e-05, + "loss": 1.0286, + "step": 4369 + }, + { + "epoch": 0.7492980688856977, + "grad_norm": 1.609375, + "learning_rate": 1.7171226511029895e-05, + "loss": 1.0856, + "step": 4370 + }, + { + "epoch": 0.7494695329746877, + "grad_norm": 1.53125, + "learning_rate": 1.716996775789518e-05, + "loss": 1.0403, + "step": 4371 + }, + { + "epoch": 0.7496409970636775, + "grad_norm": 1.609375, + "learning_rate": 1.7168708770920815e-05, + "loss": 1.0312, + "step": 4372 + }, + { + "epoch": 0.7498124611526673, + "grad_norm": 1.484375, + "learning_rate": 1.716744955014786e-05, + "loss": 0.9761, + "step": 4373 + }, + { + "epoch": 0.7499839252416572, + "grad_norm": 1.5625, + "learning_rate": 1.7166190095617386e-05, + "loss": 1.0441, + "step": 4374 + }, + { + "epoch": 0.7501553893306471, + "grad_norm": 1.5390625, + "learning_rate": 1.7164930407370465e-05, + "loss": 0.9607, + "step": 4375 + }, + { + "epoch": 0.7503268534196369, + "grad_norm": 1.6796875, + "learning_rate": 1.716367048544819e-05, + "loss": 1.0862, + "step": 4376 + }, + { + "epoch": 0.7504983175086268, + "grad_norm": 1.53125, + "learning_rate": 1.7162410329891636e-05, + "loss": 1.0016, + "step": 4377 + }, + { + "epoch": 0.7506697815976167, + "grad_norm": 1.5078125, + "learning_rate": 1.7161149940741918e-05, + "loss": 1.0165, + "step": 4378 + }, + { + "epoch": 0.7508412456866065, + "grad_norm": 1.5390625, + "learning_rate": 1.7159889318040128e-05, + "loss": 1.0394, + "step": 4379 + }, + { + "epoch": 0.7510127097755964, + "grad_norm": 1.5859375, + "learning_rate": 1.7158628461827386e-05, + "loss": 0.9978, + "step": 4380 + }, + { + "epoch": 0.7511841738645862, + "grad_norm": 1.5, + "learning_rate": 1.715736737214482e-05, + "loss": 0.9515, + "step": 4381 + }, + { + "epoch": 0.7513556379535761, + "grad_norm": 1.671875, + "learning_rate": 1.7156106049033544e-05, + "loss": 1.0019, + "step": 4382 + }, + { + "epoch": 0.751527102042566, + "grad_norm": 1.7890625, + "learning_rate": 1.7154844492534704e-05, + "loss": 1.1323, + "step": 4383 + }, + { + "epoch": 0.7516985661315558, + "grad_norm": 
1.5546875, + "learning_rate": 1.7153582702689445e-05, + "loss": 1.0557, + "step": 4384 + }, + { + "epoch": 0.7518700302205457, + "grad_norm": 1.5703125, + "learning_rate": 1.7152320679538914e-05, + "loss": 1.0072, + "step": 4385 + }, + { + "epoch": 0.7520414943095356, + "grad_norm": 1.4453125, + "learning_rate": 1.7151058423124275e-05, + "loss": 0.889, + "step": 4386 + }, + { + "epoch": 0.7522129583985254, + "grad_norm": 1.5703125, + "learning_rate": 1.714979593348669e-05, + "loss": 1.0677, + "step": 4387 + }, + { + "epoch": 0.7523844224875152, + "grad_norm": 1.609375, + "learning_rate": 1.7148533210667337e-05, + "loss": 1.0883, + "step": 4388 + }, + { + "epoch": 0.7525558865765052, + "grad_norm": 1.5, + "learning_rate": 1.7147270254707394e-05, + "loss": 0.9649, + "step": 4389 + }, + { + "epoch": 0.752727350665495, + "grad_norm": 1.5859375, + "learning_rate": 1.714600706564806e-05, + "loss": 1.0362, + "step": 4390 + }, + { + "epoch": 0.7528988147544848, + "grad_norm": 1.59375, + "learning_rate": 1.714474364353052e-05, + "loss": 1.0016, + "step": 4391 + }, + { + "epoch": 0.7530702788434748, + "grad_norm": 1.53125, + "learning_rate": 1.714347998839599e-05, + "loss": 0.9474, + "step": 4392 + }, + { + "epoch": 0.7532417429324646, + "grad_norm": 1.6171875, + "learning_rate": 1.7142216100285672e-05, + "loss": 1.0142, + "step": 4393 + }, + { + "epoch": 0.7534132070214544, + "grad_norm": 1.6796875, + "learning_rate": 1.7140951979240797e-05, + "loss": 1.0681, + "step": 4394 + }, + { + "epoch": 0.7535846711104444, + "grad_norm": 1.6328125, + "learning_rate": 1.7139687625302587e-05, + "loss": 1.041, + "step": 4395 + }, + { + "epoch": 0.7537561351994342, + "grad_norm": 1.6015625, + "learning_rate": 1.7138423038512275e-05, + "loss": 0.9334, + "step": 4396 + }, + { + "epoch": 0.753927599288424, + "grad_norm": 1.5390625, + "learning_rate": 1.713715821891111e-05, + "loss": 0.9708, + "step": 4397 + }, + { + "epoch": 0.754099063377414, + "grad_norm": 1.6015625, + "learning_rate": 1.713589316654034e-05, + "loss": 0.9977, + "step": 4398 + }, + { + "epoch": 0.7542705274664038, + "grad_norm": 1.53125, + "learning_rate": 1.713462788144122e-05, + "loss": 0.9526, + "step": 4399 + }, + { + "epoch": 0.7544419915553936, + "grad_norm": 1.5390625, + "learning_rate": 1.713336236365502e-05, + "loss": 1.0819, + "step": 4400 + }, + { + "epoch": 0.7546134556443834, + "grad_norm": 1.5078125, + "learning_rate": 1.713209661322301e-05, + "loss": 0.9711, + "step": 4401 + }, + { + "epoch": 0.7547849197333734, + "grad_norm": 1.640625, + "learning_rate": 1.7130830630186476e-05, + "loss": 1.0617, + "step": 4402 + }, + { + "epoch": 0.7549563838223632, + "grad_norm": 1.5546875, + "learning_rate": 1.7129564414586698e-05, + "loss": 1.0486, + "step": 4403 + }, + { + "epoch": 0.755127847911353, + "grad_norm": 1.671875, + "learning_rate": 1.712829796646498e-05, + "loss": 1.0216, + "step": 4404 + }, + { + "epoch": 0.755299312000343, + "grad_norm": 1.546875, + "learning_rate": 1.712703128586262e-05, + "loss": 0.9582, + "step": 4405 + }, + { + "epoch": 0.7554707760893328, + "grad_norm": 1.625, + "learning_rate": 1.712576437282093e-05, + "loss": 1.029, + "step": 4406 + }, + { + "epoch": 0.7556422401783226, + "grad_norm": 1.546875, + "learning_rate": 1.7124497227381238e-05, + "loss": 1.0258, + "step": 4407 + }, + { + "epoch": 0.7558137042673125, + "grad_norm": 1.4921875, + "learning_rate": 1.712322984958486e-05, + "loss": 0.9825, + "step": 4408 + }, + { + "epoch": 0.7559851683563024, + "grad_norm": 1.6875, + "learning_rate": 
1.7121962239473134e-05, + "loss": 1.0222, + "step": 4409 + }, + { + "epoch": 0.7561566324452922, + "grad_norm": 1.5703125, + "learning_rate": 1.7120694397087396e-05, + "loss": 0.9514, + "step": 4410 + }, + { + "epoch": 0.7563280965342821, + "grad_norm": 1.546875, + "learning_rate": 1.7119426322469002e-05, + "loss": 0.9866, + "step": 4411 + }, + { + "epoch": 0.756499560623272, + "grad_norm": 1.5703125, + "learning_rate": 1.7118158015659308e-05, + "loss": 0.9585, + "step": 4412 + }, + { + "epoch": 0.7566710247122618, + "grad_norm": 1.5390625, + "learning_rate": 1.7116889476699675e-05, + "loss": 0.985, + "step": 4413 + }, + { + "epoch": 0.7568424888012517, + "grad_norm": 1.5625, + "learning_rate": 1.7115620705631477e-05, + "loss": 0.9986, + "step": 4414 + }, + { + "epoch": 0.7570139528902415, + "grad_norm": 1.5234375, + "learning_rate": 1.711435170249609e-05, + "loss": 0.9748, + "step": 4415 + }, + { + "epoch": 0.7571854169792314, + "grad_norm": 1.5703125, + "learning_rate": 1.711308246733491e-05, + "loss": 0.9782, + "step": 4416 + }, + { + "epoch": 0.7573568810682213, + "grad_norm": 1.625, + "learning_rate": 1.7111813000189322e-05, + "loss": 0.9702, + "step": 4417 + }, + { + "epoch": 0.7575283451572111, + "grad_norm": 1.5, + "learning_rate": 1.711054330110073e-05, + "loss": 0.9542, + "step": 4418 + }, + { + "epoch": 0.757699809246201, + "grad_norm": 1.7109375, + "learning_rate": 1.7109273370110547e-05, + "loss": 1.0391, + "step": 4419 + }, + { + "epoch": 0.7578712733351909, + "grad_norm": 1.5859375, + "learning_rate": 1.7108003207260188e-05, + "loss": 0.9986, + "step": 4420 + }, + { + "epoch": 0.7580427374241807, + "grad_norm": 1.6171875, + "learning_rate": 1.7106732812591077e-05, + "loss": 1.0372, + "step": 4421 + }, + { + "epoch": 0.7582142015131705, + "grad_norm": 1.578125, + "learning_rate": 1.7105462186144652e-05, + "loss": 1.0716, + "step": 4422 + }, + { + "epoch": 0.7583856656021605, + "grad_norm": 1.6328125, + "learning_rate": 1.7104191327962345e-05, + "loss": 1.0082, + "step": 4423 + }, + { + "epoch": 0.7585571296911503, + "grad_norm": 1.703125, + "learning_rate": 1.7102920238085606e-05, + "loss": 0.9464, + "step": 4424 + }, + { + "epoch": 0.7587285937801401, + "grad_norm": 1.6015625, + "learning_rate": 1.710164891655589e-05, + "loss": 1.0328, + "step": 4425 + }, + { + "epoch": 0.7589000578691301, + "grad_norm": 1.59375, + "learning_rate": 1.7100377363414665e-05, + "loss": 0.9934, + "step": 4426 + }, + { + "epoch": 0.7590715219581199, + "grad_norm": 1.5703125, + "learning_rate": 1.7099105578703393e-05, + "loss": 1.0205, + "step": 4427 + }, + { + "epoch": 0.7592429860471097, + "grad_norm": 1.53125, + "learning_rate": 1.7097833562463556e-05, + "loss": 0.9942, + "step": 4428 + }, + { + "epoch": 0.7594144501360997, + "grad_norm": 1.5546875, + "learning_rate": 1.7096561314736638e-05, + "loss": 0.9955, + "step": 4429 + }, + { + "epoch": 0.7595859142250895, + "grad_norm": 1.5078125, + "learning_rate": 1.709528883556413e-05, + "loss": 0.936, + "step": 4430 + }, + { + "epoch": 0.7597573783140793, + "grad_norm": 1.65625, + "learning_rate": 1.709401612498754e-05, + "loss": 0.9424, + "step": 4431 + }, + { + "epoch": 0.7599288424030692, + "grad_norm": 1.6875, + "learning_rate": 1.709274318304837e-05, + "loss": 1.1093, + "step": 4432 + }, + { + "epoch": 0.7601003064920591, + "grad_norm": 1.5546875, + "learning_rate": 1.7091470009788135e-05, + "loss": 0.9639, + "step": 4433 + }, + { + "epoch": 0.7602717705810489, + "grad_norm": 1.5625, + "learning_rate": 1.7090196605248358e-05, + "loss": 0.974, + 
"step": 4434 + }, + { + "epoch": 0.7604432346700388, + "grad_norm": 1.578125, + "learning_rate": 1.708892296947057e-05, + "loss": 1.0629, + "step": 4435 + }, + { + "epoch": 0.7606146987590287, + "grad_norm": 1.609375, + "learning_rate": 1.708764910249631e-05, + "loss": 1.1132, + "step": 4436 + }, + { + "epoch": 0.7607861628480185, + "grad_norm": 1.546875, + "learning_rate": 1.7086375004367124e-05, + "loss": 0.994, + "step": 4437 + }, + { + "epoch": 0.7609576269370084, + "grad_norm": 1.640625, + "learning_rate": 1.708510067512456e-05, + "loss": 1.0587, + "step": 4438 + }, + { + "epoch": 0.7611290910259982, + "grad_norm": 1.5078125, + "learning_rate": 1.708382611481019e-05, + "loss": 0.9997, + "step": 4439 + }, + { + "epoch": 0.7613005551149881, + "grad_norm": 1.5703125, + "learning_rate": 1.7082551323465573e-05, + "loss": 1.0589, + "step": 4440 + }, + { + "epoch": 0.761472019203978, + "grad_norm": 1.6171875, + "learning_rate": 1.7081276301132284e-05, + "loss": 1.0245, + "step": 4441 + }, + { + "epoch": 0.7616434832929678, + "grad_norm": 1.484375, + "learning_rate": 1.7080001047851912e-05, + "loss": 0.9763, + "step": 4442 + }, + { + "epoch": 0.7618149473819577, + "grad_norm": 1.5234375, + "learning_rate": 1.7078725563666045e-05, + "loss": 0.8622, + "step": 4443 + }, + { + "epoch": 0.7619864114709476, + "grad_norm": 1.6484375, + "learning_rate": 1.707744984861628e-05, + "loss": 1.0999, + "step": 4444 + }, + { + "epoch": 0.7621578755599374, + "grad_norm": 1.609375, + "learning_rate": 1.7076173902744226e-05, + "loss": 1.0719, + "step": 4445 + }, + { + "epoch": 0.7623293396489272, + "grad_norm": 1.6171875, + "learning_rate": 1.7074897726091492e-05, + "loss": 1.0189, + "step": 4446 + }, + { + "epoch": 0.7625008037379172, + "grad_norm": 1.5859375, + "learning_rate": 1.7073621318699703e-05, + "loss": 0.9854, + "step": 4447 + }, + { + "epoch": 0.762672267826907, + "grad_norm": 1.5859375, + "learning_rate": 1.7072344680610485e-05, + "loss": 1.007, + "step": 4448 + }, + { + "epoch": 0.7628437319158968, + "grad_norm": 1.6015625, + "learning_rate": 1.7071067811865477e-05, + "loss": 1.0101, + "step": 4449 + }, + { + "epoch": 0.7630151960048868, + "grad_norm": 1.625, + "learning_rate": 1.7069790712506317e-05, + "loss": 0.9622, + "step": 4450 + }, + { + "epoch": 0.7631866600938766, + "grad_norm": 1.546875, + "learning_rate": 1.7068513382574665e-05, + "loss": 0.9194, + "step": 4451 + }, + { + "epoch": 0.7633581241828664, + "grad_norm": 1.546875, + "learning_rate": 1.7067235822112168e-05, + "loss": 1.0135, + "step": 4452 + }, + { + "epoch": 0.7635295882718564, + "grad_norm": 1.671875, + "learning_rate": 1.7065958031160503e-05, + "loss": 0.9483, + "step": 4453 + }, + { + "epoch": 0.7637010523608462, + "grad_norm": 1.6875, + "learning_rate": 1.7064680009761338e-05, + "loss": 1.0235, + "step": 4454 + }, + { + "epoch": 0.763872516449836, + "grad_norm": 1.5859375, + "learning_rate": 1.7063401757956353e-05, + "loss": 1.05, + "step": 4455 + }, + { + "epoch": 0.7640439805388259, + "grad_norm": 1.546875, + "learning_rate": 1.7062123275787236e-05, + "loss": 0.9476, + "step": 4456 + }, + { + "epoch": 0.7642154446278158, + "grad_norm": 1.6171875, + "learning_rate": 1.706084456329569e-05, + "loss": 1.05, + "step": 4457 + }, + { + "epoch": 0.7643869087168056, + "grad_norm": 1.5234375, + "learning_rate": 1.7059565620523414e-05, + "loss": 1.0445, + "step": 4458 + }, + { + "epoch": 0.7645583728057955, + "grad_norm": 1.625, + "learning_rate": 1.7058286447512115e-05, + "loss": 1.0614, + "step": 4459 + }, + { + "epoch": 
0.7647298368947854, + "grad_norm": 1.484375, + "learning_rate": 1.7057007044303518e-05, + "loss": 1.0182, + "step": 4460 + }, + { + "epoch": 0.7649013009837752, + "grad_norm": 1.578125, + "learning_rate": 1.705572741093935e-05, + "loss": 0.9285, + "step": 4461 + }, + { + "epoch": 0.7650727650727651, + "grad_norm": 1.5625, + "learning_rate": 1.7054447547461337e-05, + "loss": 0.9481, + "step": 4462 + }, + { + "epoch": 0.7652442291617549, + "grad_norm": 1.515625, + "learning_rate": 1.705316745391123e-05, + "loss": 1.0469, + "step": 4463 + }, + { + "epoch": 0.7654156932507448, + "grad_norm": 1.5703125, + "learning_rate": 1.7051887130330767e-05, + "loss": 1.0112, + "step": 4464 + }, + { + "epoch": 0.7655871573397347, + "grad_norm": 1.5234375, + "learning_rate": 1.7050606576761714e-05, + "loss": 0.9453, + "step": 4465 + }, + { + "epoch": 0.7657586214287245, + "grad_norm": 1.5234375, + "learning_rate": 1.704932579324583e-05, + "loss": 0.9819, + "step": 4466 + }, + { + "epoch": 0.7659300855177144, + "grad_norm": 1.5703125, + "learning_rate": 1.7048044779824885e-05, + "loss": 1.0449, + "step": 4467 + }, + { + "epoch": 0.7661015496067043, + "grad_norm": 1.640625, + "learning_rate": 1.7046763536540657e-05, + "loss": 0.9743, + "step": 4468 + }, + { + "epoch": 0.7662730136956941, + "grad_norm": 1.75, + "learning_rate": 1.704548206343494e-05, + "loss": 1.0509, + "step": 4469 + }, + { + "epoch": 0.7664444777846839, + "grad_norm": 1.6171875, + "learning_rate": 1.704420036054952e-05, + "loss": 0.9419, + "step": 4470 + }, + { + "epoch": 0.7666159418736739, + "grad_norm": 1.5625, + "learning_rate": 1.70429184279262e-05, + "loss": 0.9786, + "step": 4471 + }, + { + "epoch": 0.7667874059626637, + "grad_norm": 1.4609375, + "learning_rate": 1.7041636265606786e-05, + "loss": 0.9696, + "step": 4472 + }, + { + "epoch": 0.7669588700516535, + "grad_norm": 1.5625, + "learning_rate": 1.7040353873633097e-05, + "loss": 1.0663, + "step": 4473 + }, + { + "epoch": 0.7671303341406435, + "grad_norm": 1.5390625, + "learning_rate": 1.703907125204696e-05, + "loss": 0.975, + "step": 4474 + }, + { + "epoch": 0.7673017982296333, + "grad_norm": 1.5, + "learning_rate": 1.7037788400890206e-05, + "loss": 0.963, + "step": 4475 + }, + { + "epoch": 0.7674732623186231, + "grad_norm": 1.65625, + "learning_rate": 1.7036505320204664e-05, + "loss": 1.0571, + "step": 4476 + }, + { + "epoch": 0.767644726407613, + "grad_norm": 1.5703125, + "learning_rate": 1.703522201003219e-05, + "loss": 0.9805, + "step": 4477 + }, + { + "epoch": 0.7678161904966029, + "grad_norm": 1.5703125, + "learning_rate": 1.703393847041463e-05, + "loss": 1.014, + "step": 4478 + }, + { + "epoch": 0.7679876545855927, + "grad_norm": 1.546875, + "learning_rate": 1.7032654701393853e-05, + "loss": 1.036, + "step": 4479 + }, + { + "epoch": 0.7681591186745826, + "grad_norm": 1.5859375, + "learning_rate": 1.7031370703011724e-05, + "loss": 1.0157, + "step": 4480 + }, + { + "epoch": 0.7683305827635725, + "grad_norm": 1.5625, + "learning_rate": 1.7030086475310116e-05, + "loss": 0.9388, + "step": 4481 + }, + { + "epoch": 0.7685020468525623, + "grad_norm": 1.578125, + "learning_rate": 1.7028802018330915e-05, + "loss": 0.9365, + "step": 4482 + }, + { + "epoch": 0.7686735109415522, + "grad_norm": 1.6171875, + "learning_rate": 1.7027517332116014e-05, + "loss": 1.0533, + "step": 4483 + }, + { + "epoch": 0.768844975030542, + "grad_norm": 1.4921875, + "learning_rate": 1.7026232416707312e-05, + "loss": 0.9399, + "step": 4484 + }, + { + "epoch": 0.7690164391195319, + "grad_norm": 1.515625, + 
"learning_rate": 1.7024947272146704e-05, + "loss": 1.0067, + "step": 4485 + }, + { + "epoch": 0.7691879032085218, + "grad_norm": 1.546875, + "learning_rate": 1.702366189847612e-05, + "loss": 0.9739, + "step": 4486 + }, + { + "epoch": 0.7693593672975116, + "grad_norm": 1.578125, + "learning_rate": 1.7022376295737473e-05, + "loss": 1.0058, + "step": 4487 + }, + { + "epoch": 0.7695308313865015, + "grad_norm": 1.6875, + "learning_rate": 1.702109046397269e-05, + "loss": 0.9879, + "step": 4488 + }, + { + "epoch": 0.7697022954754914, + "grad_norm": 1.5546875, + "learning_rate": 1.7019804403223702e-05, + "loss": 0.9565, + "step": 4489 + }, + { + "epoch": 0.7698737595644812, + "grad_norm": 1.625, + "learning_rate": 1.7018518113532467e-05, + "loss": 1.0858, + "step": 4490 + }, + { + "epoch": 0.7700452236534711, + "grad_norm": 1.4921875, + "learning_rate": 1.701723159494092e-05, + "loss": 1.0049, + "step": 4491 + }, + { + "epoch": 0.770216687742461, + "grad_norm": 1.578125, + "learning_rate": 1.7015944847491027e-05, + "loss": 1.0759, + "step": 4492 + }, + { + "epoch": 0.7703881518314508, + "grad_norm": 1.5625, + "learning_rate": 1.7014657871224758e-05, + "loss": 0.9866, + "step": 4493 + }, + { + "epoch": 0.7705596159204406, + "grad_norm": 1.515625, + "learning_rate": 1.7013370666184078e-05, + "loss": 0.9232, + "step": 4494 + }, + { + "epoch": 0.7707310800094306, + "grad_norm": 1.5859375, + "learning_rate": 1.701208323241097e-05, + "loss": 0.9587, + "step": 4495 + }, + { + "epoch": 0.7709025440984204, + "grad_norm": 1.53125, + "learning_rate": 1.7010795569947426e-05, + "loss": 0.9278, + "step": 4496 + }, + { + "epoch": 0.7710740081874102, + "grad_norm": 1.5859375, + "learning_rate": 1.7009507678835432e-05, + "loss": 1.004, + "step": 4497 + }, + { + "epoch": 0.7712454722764001, + "grad_norm": 1.65625, + "learning_rate": 1.7008219559117002e-05, + "loss": 1.0233, + "step": 4498 + }, + { + "epoch": 0.77141693636539, + "grad_norm": 1.4296875, + "learning_rate": 1.7006931210834144e-05, + "loss": 0.929, + "step": 4499 + }, + { + "epoch": 0.7715884004543798, + "grad_norm": 1.5859375, + "learning_rate": 1.700564263402887e-05, + "loss": 0.9735, + "step": 4500 + }, + { + "epoch": 0.7717598645433696, + "grad_norm": 1.578125, + "learning_rate": 1.7004353828743206e-05, + "loss": 1.009, + "step": 4501 + }, + { + "epoch": 0.7719313286323596, + "grad_norm": 1.5625, + "learning_rate": 1.7003064795019193e-05, + "loss": 1.008, + "step": 4502 + }, + { + "epoch": 0.7721027927213494, + "grad_norm": 1.6015625, + "learning_rate": 1.7001775532898865e-05, + "loss": 0.9422, + "step": 4503 + }, + { + "epoch": 0.7722742568103392, + "grad_norm": 1.6171875, + "learning_rate": 1.7000486042424268e-05, + "loss": 1.0274, + "step": 4504 + }, + { + "epoch": 0.7724457208993292, + "grad_norm": 1.5859375, + "learning_rate": 1.6999196323637463e-05, + "loss": 0.9829, + "step": 4505 + }, + { + "epoch": 0.772617184988319, + "grad_norm": 1.59375, + "learning_rate": 1.6997906376580506e-05, + "loss": 1.1202, + "step": 4506 + }, + { + "epoch": 0.7727886490773088, + "grad_norm": 1.4765625, + "learning_rate": 1.6996616201295472e-05, + "loss": 0.9832, + "step": 4507 + }, + { + "epoch": 0.7729601131662988, + "grad_norm": 1.546875, + "learning_rate": 1.6995325797824434e-05, + "loss": 0.9579, + "step": 4508 + }, + { + "epoch": 0.7731315772552886, + "grad_norm": 1.59375, + "learning_rate": 1.6994035166209483e-05, + "loss": 1.0081, + "step": 4509 + }, + { + "epoch": 0.7733030413442784, + "grad_norm": 1.4765625, + "learning_rate": 1.699274430649271e-05, + 
"loss": 0.9441, + "step": 4510 + }, + { + "epoch": 0.7734745054332683, + "grad_norm": 1.609375, + "learning_rate": 1.6991453218716212e-05, + "loss": 0.9585, + "step": 4511 + }, + { + "epoch": 0.7736459695222582, + "grad_norm": 1.5234375, + "learning_rate": 1.6990161902922094e-05, + "loss": 0.9768, + "step": 4512 + }, + { + "epoch": 0.773817433611248, + "grad_norm": 1.6015625, + "learning_rate": 1.6988870359152473e-05, + "loss": 1.0389, + "step": 4513 + }, + { + "epoch": 0.7739888977002379, + "grad_norm": 1.6640625, + "learning_rate": 1.6987578587449478e-05, + "loss": 0.9938, + "step": 4514 + }, + { + "epoch": 0.7741603617892278, + "grad_norm": 1.515625, + "learning_rate": 1.6986286587855226e-05, + "loss": 0.9872, + "step": 4515 + }, + { + "epoch": 0.7743318258782176, + "grad_norm": 1.546875, + "learning_rate": 1.6984994360411866e-05, + "loss": 0.9156, + "step": 4516 + }, + { + "epoch": 0.7745032899672075, + "grad_norm": 1.5546875, + "learning_rate": 1.6983701905161533e-05, + "loss": 0.9448, + "step": 4517 + }, + { + "epoch": 0.7746747540561973, + "grad_norm": 1.5, + "learning_rate": 1.6982409222146384e-05, + "loss": 0.9568, + "step": 4518 + }, + { + "epoch": 0.7748462181451872, + "grad_norm": 1.5234375, + "learning_rate": 1.698111631140858e-05, + "loss": 1.0083, + "step": 4519 + }, + { + "epoch": 0.7750176822341771, + "grad_norm": 1.578125, + "learning_rate": 1.697982317299028e-05, + "loss": 1.0531, + "step": 4520 + }, + { + "epoch": 0.7751891463231669, + "grad_norm": 1.5703125, + "learning_rate": 1.6978529806933665e-05, + "loss": 0.9687, + "step": 4521 + }, + { + "epoch": 0.7753606104121568, + "grad_norm": 1.46875, + "learning_rate": 1.6977236213280913e-05, + "loss": 0.9364, + "step": 4522 + }, + { + "epoch": 0.7755320745011467, + "grad_norm": 1.609375, + "learning_rate": 1.6975942392074217e-05, + "loss": 1.0638, + "step": 4523 + }, + { + "epoch": 0.7757035385901365, + "grad_norm": 1.5703125, + "learning_rate": 1.6974648343355765e-05, + "loss": 1.0329, + "step": 4524 + }, + { + "epoch": 0.7758750026791263, + "grad_norm": 1.5703125, + "learning_rate": 1.697335406716777e-05, + "loss": 0.9509, + "step": 4525 + }, + { + "epoch": 0.7760464667681163, + "grad_norm": 1.515625, + "learning_rate": 1.697205956355244e-05, + "loss": 0.9671, + "step": 4526 + }, + { + "epoch": 0.7762179308571061, + "grad_norm": 1.65625, + "learning_rate": 1.6970764832551996e-05, + "loss": 1.0382, + "step": 4527 + }, + { + "epoch": 0.7763893949460959, + "grad_norm": 1.5625, + "learning_rate": 1.6969469874208657e-05, + "loss": 0.992, + "step": 4528 + }, + { + "epoch": 0.7765608590350859, + "grad_norm": 1.5078125, + "learning_rate": 1.696817468856466e-05, + "loss": 0.9358, + "step": 4529 + }, + { + "epoch": 0.7767323231240757, + "grad_norm": 1.625, + "learning_rate": 1.6966879275662252e-05, + "loss": 1.0602, + "step": 4530 + }, + { + "epoch": 0.7769037872130655, + "grad_norm": 1.65625, + "learning_rate": 1.6965583635543673e-05, + "loss": 1.0713, + "step": 4531 + }, + { + "epoch": 0.7770752513020555, + "grad_norm": 1.5390625, + "learning_rate": 1.696428776825118e-05, + "loss": 1.0382, + "step": 4532 + }, + { + "epoch": 0.7772467153910453, + "grad_norm": 1.6875, + "learning_rate": 1.6962991673827038e-05, + "loss": 1.0826, + "step": 4533 + }, + { + "epoch": 0.7774181794800351, + "grad_norm": 1.6484375, + "learning_rate": 1.696169535231352e-05, + "loss": 1.0855, + "step": 4534 + }, + { + "epoch": 0.777589643569025, + "grad_norm": 1.59375, + "learning_rate": 1.69603988037529e-05, + "loss": 0.9902, + "step": 4535 + }, + { + 
"epoch": 0.7777611076580149, + "grad_norm": 1.609375, + "learning_rate": 1.695910202818746e-05, + "loss": 1.0447, + "step": 4536 + }, + { + "epoch": 0.7779325717470047, + "grad_norm": 1.6484375, + "learning_rate": 1.6957805025659504e-05, + "loss": 1.1, + "step": 4537 + }, + { + "epoch": 0.7781040358359946, + "grad_norm": 1.5390625, + "learning_rate": 1.6956507796211325e-05, + "loss": 0.9557, + "step": 4538 + }, + { + "epoch": 0.7782754999249845, + "grad_norm": 1.53125, + "learning_rate": 1.695521033988523e-05, + "loss": 0.9928, + "step": 4539 + }, + { + "epoch": 0.7784469640139743, + "grad_norm": 1.59375, + "learning_rate": 1.695391265672353e-05, + "loss": 0.9859, + "step": 4540 + }, + { + "epoch": 0.7786184281029642, + "grad_norm": 1.5390625, + "learning_rate": 1.6952614746768555e-05, + "loss": 1.0547, + "step": 4541 + }, + { + "epoch": 0.778789892191954, + "grad_norm": 1.625, + "learning_rate": 1.6951316610062634e-05, + "loss": 1.0959, + "step": 4542 + }, + { + "epoch": 0.7789613562809439, + "grad_norm": 1.59375, + "learning_rate": 1.6950018246648105e-05, + "loss": 1.0866, + "step": 4543 + }, + { + "epoch": 0.7791328203699338, + "grad_norm": 1.5546875, + "learning_rate": 1.6948719656567304e-05, + "loss": 0.9895, + "step": 4544 + }, + { + "epoch": 0.7793042844589236, + "grad_norm": 1.59375, + "learning_rate": 1.694742083986259e-05, + "loss": 0.9399, + "step": 4545 + }, + { + "epoch": 0.7794757485479135, + "grad_norm": 1.59375, + "learning_rate": 1.6946121796576324e-05, + "loss": 1.1096, + "step": 4546 + }, + { + "epoch": 0.7796472126369034, + "grad_norm": 1.5078125, + "learning_rate": 1.6944822526750865e-05, + "loss": 0.9154, + "step": 4547 + }, + { + "epoch": 0.7798186767258932, + "grad_norm": 1.5234375, + "learning_rate": 1.6943523030428592e-05, + "loss": 0.9304, + "step": 4548 + }, + { + "epoch": 0.779990140814883, + "grad_norm": 1.671875, + "learning_rate": 1.694222330765189e-05, + "loss": 1.0091, + "step": 4549 + }, + { + "epoch": 0.780161604903873, + "grad_norm": 1.5, + "learning_rate": 1.694092335846314e-05, + "loss": 1.0471, + "step": 4550 + }, + { + "epoch": 0.7803330689928628, + "grad_norm": 1.6484375, + "learning_rate": 1.6939623182904746e-05, + "loss": 1.0572, + "step": 4551 + }, + { + "epoch": 0.7805045330818526, + "grad_norm": 1.5078125, + "learning_rate": 1.6938322781019106e-05, + "loss": 0.8963, + "step": 4552 + }, + { + "epoch": 0.7806759971708426, + "grad_norm": 1.65625, + "learning_rate": 1.693702215284863e-05, + "loss": 1.0282, + "step": 4553 + }, + { + "epoch": 0.7808474612598324, + "grad_norm": 1.6484375, + "learning_rate": 1.693572129843574e-05, + "loss": 0.9697, + "step": 4554 + }, + { + "epoch": 0.7810189253488222, + "grad_norm": 1.6796875, + "learning_rate": 1.6934420217822864e-05, + "loss": 1.0408, + "step": 4555 + }, + { + "epoch": 0.7811903894378122, + "grad_norm": 1.515625, + "learning_rate": 1.693311891105243e-05, + "loss": 0.9593, + "step": 4556 + }, + { + "epoch": 0.781361853526802, + "grad_norm": 1.5625, + "learning_rate": 1.6931817378166885e-05, + "loss": 0.9947, + "step": 4557 + }, + { + "epoch": 0.7815333176157918, + "grad_norm": 1.734375, + "learning_rate": 1.6930515619208665e-05, + "loss": 1.0501, + "step": 4558 + }, + { + "epoch": 0.7817047817047817, + "grad_norm": 1.6171875, + "learning_rate": 1.6929213634220235e-05, + "loss": 1.0618, + "step": 4559 + }, + { + "epoch": 0.7818762457937716, + "grad_norm": 1.59375, + "learning_rate": 1.692791142324406e-05, + "loss": 1.0571, + "step": 4560 + }, + { + "epoch": 0.7820477098827614, + "grad_norm": 
1.5390625, + "learning_rate": 1.69266089863226e-05, + "loss": 0.9615, + "step": 4561 + }, + { + "epoch": 0.7822191739717513, + "grad_norm": 1.6640625, + "learning_rate": 1.692530632349834e-05, + "loss": 1.072, + "step": 4562 + }, + { + "epoch": 0.7823906380607412, + "grad_norm": 1.5546875, + "learning_rate": 1.6924003434813763e-05, + "loss": 0.9741, + "step": 4563 + }, + { + "epoch": 0.782562102149731, + "grad_norm": 1.6015625, + "learning_rate": 1.6922700320311357e-05, + "loss": 1.0453, + "step": 4564 + }, + { + "epoch": 0.7827335662387209, + "grad_norm": 1.5546875, + "learning_rate": 1.692139698003363e-05, + "loss": 1.0435, + "step": 4565 + }, + { + "epoch": 0.7829050303277107, + "grad_norm": 1.4765625, + "learning_rate": 1.6920093414023083e-05, + "loss": 1.017, + "step": 4566 + }, + { + "epoch": 0.7830764944167006, + "grad_norm": 1.578125, + "learning_rate": 1.691878962232223e-05, + "loss": 1.0075, + "step": 4567 + }, + { + "epoch": 0.7832479585056905, + "grad_norm": 1.6171875, + "learning_rate": 1.6917485604973595e-05, + "loss": 0.9461, + "step": 4568 + }, + { + "epoch": 0.7834194225946803, + "grad_norm": 1.640625, + "learning_rate": 1.6916181362019704e-05, + "loss": 1.0606, + "step": 4569 + }, + { + "epoch": 0.7835908866836702, + "grad_norm": 1.578125, + "learning_rate": 1.6914876893503093e-05, + "loss": 1.0641, + "step": 4570 + }, + { + "epoch": 0.7837623507726601, + "grad_norm": 1.5625, + "learning_rate": 1.6913572199466312e-05, + "loss": 0.9939, + "step": 4571 + }, + { + "epoch": 0.7839338148616499, + "grad_norm": 1.5703125, + "learning_rate": 1.6912267279951904e-05, + "loss": 0.9261, + "step": 4572 + }, + { + "epoch": 0.7841052789506398, + "grad_norm": 1.5546875, + "learning_rate": 1.6910962135002433e-05, + "loss": 0.9663, + "step": 4573 + }, + { + "epoch": 0.7842767430396297, + "grad_norm": 1.5703125, + "learning_rate": 1.690965676466046e-05, + "loss": 1.0686, + "step": 4574 + }, + { + "epoch": 0.7844482071286195, + "grad_norm": 1.53125, + "learning_rate": 1.6908351168968563e-05, + "loss": 1.02, + "step": 4575 + }, + { + "epoch": 0.7846196712176093, + "grad_norm": 1.6484375, + "learning_rate": 1.690704534796932e-05, + "loss": 1.0676, + "step": 4576 + }, + { + "epoch": 0.7847911353065993, + "grad_norm": 1.640625, + "learning_rate": 1.6905739301705316e-05, + "loss": 1.0048, + "step": 4577 + }, + { + "epoch": 0.7849625993955891, + "grad_norm": 1.609375, + "learning_rate": 1.690443303021915e-05, + "loss": 1.0309, + "step": 4578 + }, + { + "epoch": 0.7851340634845789, + "grad_norm": 1.46875, + "learning_rate": 1.6903126533553425e-05, + "loss": 1.0769, + "step": 4579 + }, + { + "epoch": 0.7853055275735689, + "grad_norm": 1.5, + "learning_rate": 1.690181981175075e-05, + "loss": 0.9724, + "step": 4580 + }, + { + "epoch": 0.7854769916625587, + "grad_norm": 1.453125, + "learning_rate": 1.690051286485374e-05, + "loss": 1.0019, + "step": 4581 + }, + { + "epoch": 0.7856484557515485, + "grad_norm": 1.609375, + "learning_rate": 1.6899205692905016e-05, + "loss": 0.9549, + "step": 4582 + }, + { + "epoch": 0.7858199198405385, + "grad_norm": 1.578125, + "learning_rate": 1.6897898295947217e-05, + "loss": 0.9591, + "step": 4583 + }, + { + "epoch": 0.7859913839295283, + "grad_norm": 1.5234375, + "learning_rate": 1.6896590674022977e-05, + "loss": 0.9805, + "step": 4584 + }, + { + "epoch": 0.7861628480185181, + "grad_norm": 1.546875, + "learning_rate": 1.6895282827174952e-05, + "loss": 0.9356, + "step": 4585 + }, + { + "epoch": 0.786334312107508, + "grad_norm": 1.5546875, + "learning_rate": 
1.6893974755445785e-05, + "loss": 1.0637, + "step": 4586 + }, + { + "epoch": 0.7865057761964979, + "grad_norm": 1.515625, + "learning_rate": 1.689266645887814e-05, + "loss": 0.9466, + "step": 4587 + }, + { + "epoch": 0.7866772402854877, + "grad_norm": 1.5859375, + "learning_rate": 1.6891357937514685e-05, + "loss": 0.9378, + "step": 4588 + }, + { + "epoch": 0.7868487043744776, + "grad_norm": 1.5546875, + "learning_rate": 1.6890049191398102e-05, + "loss": 0.9389, + "step": 4589 + }, + { + "epoch": 0.7870201684634675, + "grad_norm": 1.59375, + "learning_rate": 1.688874022057107e-05, + "loss": 1.0351, + "step": 4590 + }, + { + "epoch": 0.7871916325524573, + "grad_norm": 1.6328125, + "learning_rate": 1.688743102507627e-05, + "loss": 1.0766, + "step": 4591 + }, + { + "epoch": 0.7873630966414471, + "grad_norm": 1.609375, + "learning_rate": 1.6886121604956415e-05, + "loss": 1.1055, + "step": 4592 + }, + { + "epoch": 0.787534560730437, + "grad_norm": 1.5390625, + "learning_rate": 1.6884811960254203e-05, + "loss": 1.0666, + "step": 4593 + }, + { + "epoch": 0.7877060248194269, + "grad_norm": 1.6328125, + "learning_rate": 1.6883502091012346e-05, + "loss": 1.036, + "step": 4594 + }, + { + "epoch": 0.7878774889084167, + "grad_norm": 1.5625, + "learning_rate": 1.6882191997273567e-05, + "loss": 1.0623, + "step": 4595 + }, + { + "epoch": 0.7880489529974066, + "grad_norm": 1.578125, + "learning_rate": 1.6880881679080592e-05, + "loss": 0.9519, + "step": 4596 + }, + { + "epoch": 0.7882204170863965, + "grad_norm": 1.5546875, + "learning_rate": 1.687957113647615e-05, + "loss": 1.1319, + "step": 4597 + }, + { + "epoch": 0.7883918811753863, + "grad_norm": 1.5078125, + "learning_rate": 1.6878260369502993e-05, + "loss": 0.984, + "step": 4598 + }, + { + "epoch": 0.7885633452643762, + "grad_norm": 1.5703125, + "learning_rate": 1.6876949378203858e-05, + "loss": 1.0901, + "step": 4599 + }, + { + "epoch": 0.788734809353366, + "grad_norm": 1.5703125, + "learning_rate": 1.6875638162621512e-05, + "loss": 0.9269, + "step": 4600 + }, + { + "epoch": 0.7889062734423559, + "grad_norm": 1.5234375, + "learning_rate": 1.6874326722798713e-05, + "loss": 1.0038, + "step": 4601 + }, + { + "epoch": 0.7890777375313458, + "grad_norm": 1.6171875, + "learning_rate": 1.6873015058778232e-05, + "loss": 1.0358, + "step": 4602 + }, + { + "epoch": 0.7892492016203356, + "grad_norm": 1.6484375, + "learning_rate": 1.687170317060285e-05, + "loss": 1.0957, + "step": 4603 + }, + { + "epoch": 0.7894206657093255, + "grad_norm": 1.515625, + "learning_rate": 1.687039105831535e-05, + "loss": 0.9912, + "step": 4604 + }, + { + "epoch": 0.7895921297983154, + "grad_norm": 1.453125, + "learning_rate": 1.6869078721958528e-05, + "loss": 0.982, + "step": 4605 + }, + { + "epoch": 0.7897635938873052, + "grad_norm": 1.546875, + "learning_rate": 1.686776616157518e-05, + "loss": 0.9703, + "step": 4606 + }, + { + "epoch": 0.789935057976295, + "grad_norm": 1.5234375, + "learning_rate": 1.6866453377208115e-05, + "loss": 1.0642, + "step": 4607 + }, + { + "epoch": 0.790106522065285, + "grad_norm": 1.65625, + "learning_rate": 1.6865140368900153e-05, + "loss": 1.0153, + "step": 4608 + }, + { + "epoch": 0.7902779861542748, + "grad_norm": 1.59375, + "learning_rate": 1.686382713669411e-05, + "loss": 0.9728, + "step": 4609 + }, + { + "epoch": 0.7904494502432646, + "grad_norm": 1.578125, + "learning_rate": 1.6862513680632815e-05, + "loss": 0.9923, + "step": 4610 + }, + { + "epoch": 0.7906209143322546, + "grad_norm": 1.53125, + "learning_rate": 1.6861200000759108e-05, + "loss": 
1.0171, + "step": 4611 + }, + { + "epoch": 0.7907923784212444, + "grad_norm": 1.6953125, + "learning_rate": 1.6859886097115833e-05, + "loss": 1.075, + "step": 4612 + }, + { + "epoch": 0.7909638425102342, + "grad_norm": 1.671875, + "learning_rate": 1.685857196974584e-05, + "loss": 0.9838, + "step": 4613 + }, + { + "epoch": 0.7911353065992242, + "grad_norm": 1.6328125, + "learning_rate": 1.6857257618691992e-05, + "loss": 1.0337, + "step": 4614 + }, + { + "epoch": 0.791306770688214, + "grad_norm": 1.609375, + "learning_rate": 1.6855943043997144e-05, + "loss": 0.9971, + "step": 4615 + }, + { + "epoch": 0.7914782347772038, + "grad_norm": 1.5703125, + "learning_rate": 1.685462824570418e-05, + "loss": 1.028, + "step": 4616 + }, + { + "epoch": 0.7916496988661937, + "grad_norm": 1.6796875, + "learning_rate": 1.6853313223855977e-05, + "loss": 1.0616, + "step": 4617 + }, + { + "epoch": 0.7918211629551836, + "grad_norm": 1.53125, + "learning_rate": 1.685199797849542e-05, + "loss": 1.0246, + "step": 4618 + }, + { + "epoch": 0.7919926270441734, + "grad_norm": 1.578125, + "learning_rate": 1.685068250966541e-05, + "loss": 1.0009, + "step": 4619 + }, + { + "epoch": 0.7921640911331633, + "grad_norm": 1.6015625, + "learning_rate": 1.6849366817408846e-05, + "loss": 1.0326, + "step": 4620 + }, + { + "epoch": 0.7923355552221532, + "grad_norm": 1.5625, + "learning_rate": 1.684805090176864e-05, + "loss": 0.9854, + "step": 4621 + }, + { + "epoch": 0.792507019311143, + "grad_norm": 1.4921875, + "learning_rate": 1.68467347627877e-05, + "loss": 0.9683, + "step": 4622 + }, + { + "epoch": 0.7926784834001329, + "grad_norm": 1.515625, + "learning_rate": 1.684541840050896e-05, + "loss": 0.9217, + "step": 4623 + }, + { + "epoch": 0.7928499474891227, + "grad_norm": 1.53125, + "learning_rate": 1.6844101814975345e-05, + "loss": 0.9986, + "step": 4624 + }, + { + "epoch": 0.7930214115781126, + "grad_norm": 1.671875, + "learning_rate": 1.6842785006229805e-05, + "loss": 1.0345, + "step": 4625 + }, + { + "epoch": 0.7931928756671025, + "grad_norm": 1.609375, + "learning_rate": 1.6841467974315275e-05, + "loss": 1.059, + "step": 4626 + }, + { + "epoch": 0.7933643397560923, + "grad_norm": 1.6015625, + "learning_rate": 1.6840150719274712e-05, + "loss": 1.0887, + "step": 4627 + }, + { + "epoch": 0.7935358038450822, + "grad_norm": 1.609375, + "learning_rate": 1.6838833241151072e-05, + "loss": 0.9706, + "step": 4628 + }, + { + "epoch": 0.7937072679340721, + "grad_norm": 1.6328125, + "learning_rate": 1.6837515539987333e-05, + "loss": 1.0665, + "step": 4629 + }, + { + "epoch": 0.7938787320230619, + "grad_norm": 1.546875, + "learning_rate": 1.6836197615826463e-05, + "loss": 0.9952, + "step": 4630 + }, + { + "epoch": 0.7940501961120517, + "grad_norm": 1.5546875, + "learning_rate": 1.6834879468711444e-05, + "loss": 1.026, + "step": 4631 + }, + { + "epoch": 0.7942216602010417, + "grad_norm": 1.59375, + "learning_rate": 1.6833561098685272e-05, + "loss": 1.0303, + "step": 4632 + }, + { + "epoch": 0.7943931242900315, + "grad_norm": 1.5078125, + "learning_rate": 1.6832242505790938e-05, + "loss": 0.94, + "step": 4633 + }, + { + "epoch": 0.7945645883790213, + "grad_norm": 1.6328125, + "learning_rate": 1.6830923690071445e-05, + "loss": 1.047, + "step": 4634 + }, + { + "epoch": 0.7947360524680113, + "grad_norm": 1.6328125, + "learning_rate": 1.6829604651569805e-05, + "loss": 1.0085, + "step": 4635 + }, + { + "epoch": 0.7949075165570011, + "grad_norm": 1.640625, + "learning_rate": 1.6828285390329047e-05, + "loss": 0.9228, + "step": 4636 + }, + { + 
"epoch": 0.7950789806459909, + "grad_norm": 1.5546875, + "learning_rate": 1.6826965906392187e-05, + "loss": 1.0364, + "step": 4637 + }, + { + "epoch": 0.7952504447349809, + "grad_norm": 1.5078125, + "learning_rate": 1.682564619980226e-05, + "loss": 0.965, + "step": 4638 + }, + { + "epoch": 0.7954219088239707, + "grad_norm": 1.5546875, + "learning_rate": 1.682432627060231e-05, + "loss": 1.1089, + "step": 4639 + }, + { + "epoch": 0.7955933729129605, + "grad_norm": 1.578125, + "learning_rate": 1.682300611883538e-05, + "loss": 0.9678, + "step": 4640 + }, + { + "epoch": 0.7957648370019504, + "grad_norm": 1.59375, + "learning_rate": 1.6821685744544526e-05, + "loss": 0.9893, + "step": 4641 + }, + { + "epoch": 0.7959363010909403, + "grad_norm": 1.6640625, + "learning_rate": 1.6820365147772812e-05, + "loss": 1.008, + "step": 4642 + }, + { + "epoch": 0.7961077651799301, + "grad_norm": 1.6015625, + "learning_rate": 1.681904432856331e-05, + "loss": 1.0865, + "step": 4643 + }, + { + "epoch": 0.79627922926892, + "grad_norm": 1.5703125, + "learning_rate": 1.6817723286959092e-05, + "loss": 1.0863, + "step": 4644 + }, + { + "epoch": 0.7964506933579099, + "grad_norm": 1.5546875, + "learning_rate": 1.6816402023003246e-05, + "loss": 1.0579, + "step": 4645 + }, + { + "epoch": 0.7966221574468997, + "grad_norm": 1.609375, + "learning_rate": 1.6815080536738862e-05, + "loss": 1.0737, + "step": 4646 + }, + { + "epoch": 0.7967936215358896, + "grad_norm": 1.5078125, + "learning_rate": 1.6813758828209036e-05, + "loss": 1.0252, + "step": 4647 + }, + { + "epoch": 0.7969650856248794, + "grad_norm": 1.59375, + "learning_rate": 1.681243689745688e-05, + "loss": 1.0197, + "step": 4648 + }, + { + "epoch": 0.7971365497138693, + "grad_norm": 1.578125, + "learning_rate": 1.6811114744525502e-05, + "loss": 1.0342, + "step": 4649 + }, + { + "epoch": 0.7973080138028592, + "grad_norm": 1.5546875, + "learning_rate": 1.6809792369458022e-05, + "loss": 0.9819, + "step": 4650 + }, + { + "epoch": 0.797479477891849, + "grad_norm": 1.5859375, + "learning_rate": 1.6808469772297572e-05, + "loss": 1.0754, + "step": 4651 + }, + { + "epoch": 0.7976509419808389, + "grad_norm": 1.515625, + "learning_rate": 1.6807146953087282e-05, + "loss": 0.925, + "step": 4652 + }, + { + "epoch": 0.7978224060698288, + "grad_norm": 1.6015625, + "learning_rate": 1.6805823911870298e-05, + "loss": 1.0692, + "step": 4653 + }, + { + "epoch": 0.7979938701588186, + "grad_norm": 1.609375, + "learning_rate": 1.680450064868977e-05, + "loss": 1.0828, + "step": 4654 + }, + { + "epoch": 0.7981653342478084, + "grad_norm": 1.6015625, + "learning_rate": 1.6803177163588848e-05, + "loss": 1.0006, + "step": 4655 + }, + { + "epoch": 0.7983367983367984, + "grad_norm": 1.5234375, + "learning_rate": 1.6801853456610705e-05, + "loss": 0.9446, + "step": 4656 + }, + { + "epoch": 0.7985082624257882, + "grad_norm": 1.65625, + "learning_rate": 1.68005295277985e-05, + "loss": 0.9129, + "step": 4657 + }, + { + "epoch": 0.798679726514778, + "grad_norm": 1.578125, + "learning_rate": 1.6799205377195427e-05, + "loss": 0.971, + "step": 4658 + }, + { + "epoch": 0.798851190603768, + "grad_norm": 1.5546875, + "learning_rate": 1.6797881004844658e-05, + "loss": 0.9304, + "step": 4659 + }, + { + "epoch": 0.7990226546927578, + "grad_norm": 1.5859375, + "learning_rate": 1.6796556410789394e-05, + "loss": 1.0222, + "step": 4660 + }, + { + "epoch": 0.7991941187817476, + "grad_norm": 1.6640625, + "learning_rate": 1.6795231595072832e-05, + "loss": 0.969, + "step": 4661 + }, + { + "epoch": 0.7993655828707376, + 
"grad_norm": 1.59375, + "learning_rate": 1.6793906557738177e-05, + "loss": 0.9851, + "step": 4662 + }, + { + "epoch": 0.7995370469597274, + "grad_norm": 1.765625, + "learning_rate": 1.6792581298828644e-05, + "loss": 0.9575, + "step": 4663 + }, + { + "epoch": 0.7997085110487172, + "grad_norm": 1.625, + "learning_rate": 1.6791255818387462e-05, + "loss": 0.9878, + "step": 4664 + }, + { + "epoch": 0.7998799751377071, + "grad_norm": 1.5078125, + "learning_rate": 1.678993011645785e-05, + "loss": 0.9399, + "step": 4665 + }, + { + "epoch": 0.800051439226697, + "grad_norm": 1.6640625, + "learning_rate": 1.6788604193083052e-05, + "loss": 0.9622, + "step": 4666 + }, + { + "epoch": 0.8002229033156868, + "grad_norm": 1.5546875, + "learning_rate": 1.6787278048306307e-05, + "loss": 0.98, + "step": 4667 + }, + { + "epoch": 0.8003943674046767, + "grad_norm": 1.640625, + "learning_rate": 1.6785951682170863e-05, + "loss": 1.1163, + "step": 4668 + }, + { + "epoch": 0.8005658314936666, + "grad_norm": 1.546875, + "learning_rate": 1.678462509471998e-05, + "loss": 1.0644, + "step": 4669 + }, + { + "epoch": 0.8007372955826564, + "grad_norm": 1.53125, + "learning_rate": 1.6783298285996928e-05, + "loss": 0.9675, + "step": 4670 + }, + { + "epoch": 0.8009087596716463, + "grad_norm": 1.6640625, + "learning_rate": 1.6781971256044975e-05, + "loss": 0.9141, + "step": 4671 + }, + { + "epoch": 0.8010802237606361, + "grad_norm": 1.53125, + "learning_rate": 1.67806440049074e-05, + "loss": 0.971, + "step": 4672 + }, + { + "epoch": 0.801251687849626, + "grad_norm": 1.5625, + "learning_rate": 1.677931653262749e-05, + "loss": 0.9711, + "step": 4673 + }, + { + "epoch": 0.8014231519386159, + "grad_norm": 1.5703125, + "learning_rate": 1.677798883924854e-05, + "loss": 0.9369, + "step": 4674 + }, + { + "epoch": 0.8015946160276057, + "grad_norm": 1.5625, + "learning_rate": 1.6776660924813854e-05, + "loss": 0.9912, + "step": 4675 + }, + { + "epoch": 0.8017660801165956, + "grad_norm": 1.6328125, + "learning_rate": 1.677533278936673e-05, + "loss": 1.1182, + "step": 4676 + }, + { + "epoch": 0.8019375442055855, + "grad_norm": 1.6328125, + "learning_rate": 1.6774004432950496e-05, + "loss": 1.0211, + "step": 4677 + }, + { + "epoch": 0.8021090082945753, + "grad_norm": 1.671875, + "learning_rate": 1.6772675855608466e-05, + "loss": 0.9907, + "step": 4678 + }, + { + "epoch": 0.8022804723835651, + "grad_norm": 1.5625, + "learning_rate": 1.677134705738397e-05, + "loss": 0.9965, + "step": 4679 + }, + { + "epoch": 0.8024519364725551, + "grad_norm": 1.6171875, + "learning_rate": 1.6770018038320353e-05, + "loss": 1.0059, + "step": 4680 + }, + { + "epoch": 0.8026234005615449, + "grad_norm": 1.765625, + "learning_rate": 1.676868879846095e-05, + "loss": 1.092, + "step": 4681 + }, + { + "epoch": 0.8027948646505347, + "grad_norm": 1.625, + "learning_rate": 1.676735933784912e-05, + "loss": 1.0292, + "step": 4682 + }, + { + "epoch": 0.8029663287395247, + "grad_norm": 1.703125, + "learning_rate": 1.6766029656528218e-05, + "loss": 1.0185, + "step": 4683 + }, + { + "epoch": 0.8031377928285145, + "grad_norm": 2.484375, + "learning_rate": 1.6764699754541608e-05, + "loss": 1.0001, + "step": 4684 + }, + { + "epoch": 0.8033092569175043, + "grad_norm": 1.8125, + "learning_rate": 1.6763369631932665e-05, + "loss": 1.0169, + "step": 4685 + }, + { + "epoch": 0.8034807210064941, + "grad_norm": 1.5859375, + "learning_rate": 1.6762039288744777e-05, + "loss": 0.9109, + "step": 4686 + }, + { + "epoch": 0.8036521850954841, + "grad_norm": 3.171875, + "learning_rate": 
1.6760708725021316e-05, + "loss": 0.9967, + "step": 4687 + }, + { + "epoch": 0.8038236491844739, + "grad_norm": 1.5, + "learning_rate": 1.675937794080569e-05, + "loss": 0.963, + "step": 4688 + }, + { + "epoch": 0.8039951132734637, + "grad_norm": 1.6015625, + "learning_rate": 1.6758046936141295e-05, + "loss": 1.0488, + "step": 4689 + }, + { + "epoch": 0.8041665773624537, + "grad_norm": 1.5, + "learning_rate": 1.675671571107154e-05, + "loss": 0.9804, + "step": 4690 + }, + { + "epoch": 0.8043380414514435, + "grad_norm": 1.671875, + "learning_rate": 1.675538426563984e-05, + "loss": 1.0073, + "step": 4691 + }, + { + "epoch": 0.8045095055404333, + "grad_norm": 2.015625, + "learning_rate": 1.6754052599889626e-05, + "loss": 1.0464, + "step": 4692 + }, + { + "epoch": 0.8046809696294233, + "grad_norm": 1.59375, + "learning_rate": 1.675272071386432e-05, + "loss": 1.0297, + "step": 4693 + }, + { + "epoch": 0.8048524337184131, + "grad_norm": 1.5859375, + "learning_rate": 1.6751388607607363e-05, + "loss": 1.1082, + "step": 4694 + }, + { + "epoch": 0.8050238978074029, + "grad_norm": 1.5859375, + "learning_rate": 1.6750056281162203e-05, + "loss": 0.9478, + "step": 4695 + }, + { + "epoch": 0.8051953618963928, + "grad_norm": 1.671875, + "learning_rate": 1.6748723734572285e-05, + "loss": 1.0545, + "step": 4696 + }, + { + "epoch": 0.8053668259853827, + "grad_norm": 1.5703125, + "learning_rate": 1.6747390967881073e-05, + "loss": 1.0008, + "step": 4697 + }, + { + "epoch": 0.8055382900743725, + "grad_norm": 1.515625, + "learning_rate": 1.6746057981132037e-05, + "loss": 0.9503, + "step": 4698 + }, + { + "epoch": 0.8057097541633624, + "grad_norm": 1.671875, + "learning_rate": 1.6744724774368645e-05, + "loss": 1.0142, + "step": 4699 + }, + { + "epoch": 0.8058812182523523, + "grad_norm": 1.546875, + "learning_rate": 1.674339134763438e-05, + "loss": 0.9319, + "step": 4700 + }, + { + "epoch": 0.8060526823413421, + "grad_norm": 1.5703125, + "learning_rate": 1.674205770097273e-05, + "loss": 1.079, + "step": 4701 + }, + { + "epoch": 0.806224146430332, + "grad_norm": 2.1875, + "learning_rate": 1.674072383442719e-05, + "loss": 1.0402, + "step": 4702 + }, + { + "epoch": 0.8063956105193218, + "grad_norm": 1.7578125, + "learning_rate": 1.673938974804126e-05, + "loss": 1.0822, + "step": 4703 + }, + { + "epoch": 0.8065670746083117, + "grad_norm": 1.6328125, + "learning_rate": 1.6738055441858456e-05, + "loss": 1.075, + "step": 4704 + }, + { + "epoch": 0.8067385386973016, + "grad_norm": 1.65625, + "learning_rate": 1.673672091592229e-05, + "loss": 1.077, + "step": 4705 + }, + { + "epoch": 0.8069100027862914, + "grad_norm": 1.5078125, + "learning_rate": 1.6735386170276283e-05, + "loss": 1.0157, + "step": 4706 + }, + { + "epoch": 0.8070814668752813, + "grad_norm": 1.6640625, + "learning_rate": 1.6734051204963974e-05, + "loss": 1.1408, + "step": 4707 + }, + { + "epoch": 0.8072529309642712, + "grad_norm": 1.609375, + "learning_rate": 1.6732716020028897e-05, + "loss": 0.9205, + "step": 4708 + }, + { + "epoch": 0.807424395053261, + "grad_norm": 1.5078125, + "learning_rate": 1.6731380615514593e-05, + "loss": 0.9743, + "step": 4709 + }, + { + "epoch": 0.8075958591422508, + "grad_norm": 1.5703125, + "learning_rate": 1.6730044991464626e-05, + "loss": 1.0179, + "step": 4710 + }, + { + "epoch": 0.8077673232312408, + "grad_norm": 1.59375, + "learning_rate": 1.6728709147922543e-05, + "loss": 0.9116, + "step": 4711 + }, + { + "epoch": 0.8079387873202306, + "grad_norm": 1.6171875, + "learning_rate": 1.672737308493192e-05, + "loss": 0.9609, + 
"step": 4712 + }, + { + "epoch": 0.8081102514092204, + "grad_norm": 1.5625, + "learning_rate": 1.672603680253633e-05, + "loss": 0.9789, + "step": 4713 + }, + { + "epoch": 0.8082817154982104, + "grad_norm": 1.6640625, + "learning_rate": 1.6724700300779346e-05, + "loss": 1.0335, + "step": 4714 + }, + { + "epoch": 0.8084531795872002, + "grad_norm": 1.625, + "learning_rate": 1.6723363579704567e-05, + "loss": 1.0538, + "step": 4715 + }, + { + "epoch": 0.80862464367619, + "grad_norm": 1.640625, + "learning_rate": 1.672202663935558e-05, + "loss": 1.092, + "step": 4716 + }, + { + "epoch": 0.80879610776518, + "grad_norm": 1.53125, + "learning_rate": 1.6720689479775996e-05, + "loss": 0.9346, + "step": 4717 + }, + { + "epoch": 0.8089675718541698, + "grad_norm": 1.5546875, + "learning_rate": 1.6719352101009417e-05, + "loss": 0.9112, + "step": 4718 + }, + { + "epoch": 0.8091390359431596, + "grad_norm": 1.484375, + "learning_rate": 1.671801450309946e-05, + "loss": 0.9348, + "step": 4719 + }, + { + "epoch": 0.8093105000321495, + "grad_norm": 1.5, + "learning_rate": 1.671667668608976e-05, + "loss": 0.9716, + "step": 4720 + }, + { + "epoch": 0.8094819641211394, + "grad_norm": 1.53125, + "learning_rate": 1.6715338650023936e-05, + "loss": 0.9984, + "step": 4721 + }, + { + "epoch": 0.8096534282101292, + "grad_norm": 1.5703125, + "learning_rate": 1.6714000394945632e-05, + "loss": 0.9771, + "step": 4722 + }, + { + "epoch": 0.8098248922991191, + "grad_norm": 1.5859375, + "learning_rate": 1.6712661920898493e-05, + "loss": 0.9918, + "step": 4723 + }, + { + "epoch": 0.809996356388109, + "grad_norm": 1.4921875, + "learning_rate": 1.671132322792617e-05, + "loss": 0.9857, + "step": 4724 + }, + { + "epoch": 0.8101678204770988, + "grad_norm": 1.4921875, + "learning_rate": 1.6709984316072327e-05, + "loss": 0.9775, + "step": 4725 + }, + { + "epoch": 0.8103392845660887, + "grad_norm": 1.6015625, + "learning_rate": 1.6708645185380622e-05, + "loss": 1.0013, + "step": 4726 + }, + { + "epoch": 0.8105107486550786, + "grad_norm": 1.5, + "learning_rate": 1.670730583589474e-05, + "loss": 0.9176, + "step": 4727 + }, + { + "epoch": 0.8106822127440684, + "grad_norm": 1.6328125, + "learning_rate": 1.6705966267658355e-05, + "loss": 0.948, + "step": 4728 + }, + { + "epoch": 0.8108536768330583, + "grad_norm": 1.5078125, + "learning_rate": 1.6704626480715162e-05, + "loss": 0.9791, + "step": 4729 + }, + { + "epoch": 0.8110251409220481, + "grad_norm": 1.875, + "learning_rate": 1.670328647510885e-05, + "loss": 1.0124, + "step": 4730 + }, + { + "epoch": 0.811196605011038, + "grad_norm": 1.6640625, + "learning_rate": 1.670194625088312e-05, + "loss": 1.1446, + "step": 4731 + }, + { + "epoch": 0.8113680691000279, + "grad_norm": 1.546875, + "learning_rate": 1.6700605808081688e-05, + "loss": 1.0095, + "step": 4732 + }, + { + "epoch": 0.8115395331890177, + "grad_norm": 1.546875, + "learning_rate": 1.669926514674827e-05, + "loss": 0.9893, + "step": 4733 + }, + { + "epoch": 0.8117109972780076, + "grad_norm": 1.484375, + "learning_rate": 1.6697924266926588e-05, + "loss": 1.0897, + "step": 4734 + }, + { + "epoch": 0.8118824613669975, + "grad_norm": 1.6171875, + "learning_rate": 1.6696583168660373e-05, + "loss": 0.9558, + "step": 4735 + }, + { + "epoch": 0.8120539254559873, + "grad_norm": 1.515625, + "learning_rate": 1.6695241851993363e-05, + "loss": 0.952, + "step": 4736 + }, + { + "epoch": 0.8122253895449771, + "grad_norm": 1.9765625, + "learning_rate": 1.6693900316969303e-05, + "loss": 1.0141, + "step": 4737 + }, + { + "epoch": 0.8123968536339671, + 
"grad_norm": 1.5, + "learning_rate": 1.669255856363195e-05, + "loss": 0.9915, + "step": 4738 + }, + { + "epoch": 0.8125683177229569, + "grad_norm": 1.8828125, + "learning_rate": 1.669121659202506e-05, + "loss": 1.0528, + "step": 4739 + }, + { + "epoch": 0.8127397818119467, + "grad_norm": 1.546875, + "learning_rate": 1.66898744021924e-05, + "loss": 1.0535, + "step": 4740 + }, + { + "epoch": 0.8129112459009367, + "grad_norm": 1.5859375, + "learning_rate": 1.6688531994177745e-05, + "loss": 0.9725, + "step": 4741 + }, + { + "epoch": 0.8130827099899265, + "grad_norm": 1.578125, + "learning_rate": 1.6687189368024874e-05, + "loss": 1.026, + "step": 4742 + }, + { + "epoch": 0.8132541740789163, + "grad_norm": 1.609375, + "learning_rate": 1.6685846523777573e-05, + "loss": 1.0988, + "step": 4743 + }, + { + "epoch": 0.8134256381679063, + "grad_norm": 1.4765625, + "learning_rate": 1.668450346147964e-05, + "loss": 0.8971, + "step": 4744 + }, + { + "epoch": 0.8135971022568961, + "grad_norm": 1.6328125, + "learning_rate": 1.6683160181174886e-05, + "loss": 1.0287, + "step": 4745 + }, + { + "epoch": 0.8137685663458859, + "grad_norm": 1.53125, + "learning_rate": 1.6681816682907104e-05, + "loss": 1.0021, + "step": 4746 + }, + { + "epoch": 0.8139400304348758, + "grad_norm": 1.515625, + "learning_rate": 1.668047296672012e-05, + "loss": 0.9952, + "step": 4747 + }, + { + "epoch": 0.8141114945238657, + "grad_norm": 1.59375, + "learning_rate": 1.667912903265776e-05, + "loss": 0.9552, + "step": 4748 + }, + { + "epoch": 0.8142829586128555, + "grad_norm": 1.6171875, + "learning_rate": 1.6677784880763847e-05, + "loss": 1.0447, + "step": 4749 + }, + { + "epoch": 0.8144544227018454, + "grad_norm": 1.5546875, + "learning_rate": 1.6676440511082223e-05, + "loss": 1.0072, + "step": 4750 + }, + { + "epoch": 0.8146258867908353, + "grad_norm": 1.6328125, + "learning_rate": 1.6675095923656736e-05, + "loss": 1.0373, + "step": 4751 + }, + { + "epoch": 0.8147973508798251, + "grad_norm": 1.5390625, + "learning_rate": 1.667375111853123e-05, + "loss": 1.0665, + "step": 4752 + }, + { + "epoch": 0.814968814968815, + "grad_norm": 1.6171875, + "learning_rate": 1.6672406095749573e-05, + "loss": 0.9888, + "step": 4753 + }, + { + "epoch": 0.8151402790578048, + "grad_norm": 1.5546875, + "learning_rate": 1.6671060855355626e-05, + "loss": 1.0541, + "step": 4754 + }, + { + "epoch": 0.8153117431467947, + "grad_norm": 1.5703125, + "learning_rate": 1.666971539739326e-05, + "loss": 1.0444, + "step": 4755 + }, + { + "epoch": 0.8154832072357846, + "grad_norm": 1.6171875, + "learning_rate": 1.6668369721906367e-05, + "loss": 1.0401, + "step": 4756 + }, + { + "epoch": 0.8156546713247744, + "grad_norm": 1.578125, + "learning_rate": 1.666702382893882e-05, + "loss": 1.0007, + "step": 4757 + }, + { + "epoch": 0.8158261354137643, + "grad_norm": 1.5703125, + "learning_rate": 1.6665677718534525e-05, + "loss": 1.074, + "step": 4758 + }, + { + "epoch": 0.8159975995027542, + "grad_norm": 1.4609375, + "learning_rate": 1.6664331390737373e-05, + "loss": 0.9096, + "step": 4759 + }, + { + "epoch": 0.816169063591744, + "grad_norm": 1.5625, + "learning_rate": 1.6662984845591283e-05, + "loss": 1.0082, + "step": 4760 + }, + { + "epoch": 0.8163405276807338, + "grad_norm": 1.515625, + "learning_rate": 1.6661638083140167e-05, + "loss": 0.9651, + "step": 4761 + }, + { + "epoch": 0.8165119917697238, + "grad_norm": 1.5234375, + "learning_rate": 1.6660291103427946e-05, + "loss": 0.9039, + "step": 4762 + }, + { + "epoch": 0.8166834558587136, + "grad_norm": 1.6015625, + 
"learning_rate": 1.665894390649855e-05, + "loss": 0.9503, + "step": 4763 + }, + { + "epoch": 0.8168549199477034, + "grad_norm": 1.5234375, + "learning_rate": 1.6657596492395926e-05, + "loss": 0.9711, + "step": 4764 + }, + { + "epoch": 0.8170263840366934, + "grad_norm": 1.5859375, + "learning_rate": 1.6656248861164e-05, + "loss": 1.0314, + "step": 4765 + }, + { + "epoch": 0.8171978481256832, + "grad_norm": 1.53125, + "learning_rate": 1.6654901012846737e-05, + "loss": 0.948, + "step": 4766 + }, + { + "epoch": 0.817369312214673, + "grad_norm": 1.515625, + "learning_rate": 1.6653552947488095e-05, + "loss": 0.9148, + "step": 4767 + }, + { + "epoch": 0.817540776303663, + "grad_norm": 1.671875, + "learning_rate": 1.6652204665132036e-05, + "loss": 1.0365, + "step": 4768 + }, + { + "epoch": 0.8177122403926528, + "grad_norm": 1.4921875, + "learning_rate": 1.665085616582253e-05, + "loss": 0.9708, + "step": 4769 + }, + { + "epoch": 0.8178837044816426, + "grad_norm": 1.484375, + "learning_rate": 1.664950744960356e-05, + "loss": 1.0439, + "step": 4770 + }, + { + "epoch": 0.8180551685706325, + "grad_norm": 1.546875, + "learning_rate": 1.6648158516519115e-05, + "loss": 0.9592, + "step": 4771 + }, + { + "epoch": 0.8182266326596224, + "grad_norm": 1.6328125, + "learning_rate": 1.6646809366613186e-05, + "loss": 1.0561, + "step": 4772 + }, + { + "epoch": 0.8183980967486122, + "grad_norm": 1.59375, + "learning_rate": 1.664545999992977e-05, + "loss": 1.0024, + "step": 4773 + }, + { + "epoch": 0.8185695608376021, + "grad_norm": 1.5546875, + "learning_rate": 1.6644110416512883e-05, + "loss": 1.0269, + "step": 4774 + }, + { + "epoch": 0.818741024926592, + "grad_norm": 1.640625, + "learning_rate": 1.6642760616406536e-05, + "loss": 1.0261, + "step": 4775 + }, + { + "epoch": 0.8189124890155818, + "grad_norm": 1.5078125, + "learning_rate": 1.664141059965475e-05, + "loss": 1.0186, + "step": 4776 + }, + { + "epoch": 0.8190839531045717, + "grad_norm": 1.59375, + "learning_rate": 1.664006036630155e-05, + "loss": 0.9797, + "step": 4777 + }, + { + "epoch": 0.8192554171935615, + "grad_norm": 1.5078125, + "learning_rate": 1.663870991639098e-05, + "loss": 0.9059, + "step": 4778 + }, + { + "epoch": 0.8194268812825514, + "grad_norm": 1.5625, + "learning_rate": 1.6637359249967085e-05, + "loss": 0.9974, + "step": 4779 + }, + { + "epoch": 0.8195983453715412, + "grad_norm": 1.546875, + "learning_rate": 1.6636008367073908e-05, + "loss": 0.9284, + "step": 4780 + }, + { + "epoch": 0.8197698094605311, + "grad_norm": 1.546875, + "learning_rate": 1.6634657267755508e-05, + "loss": 0.9575, + "step": 4781 + }, + { + "epoch": 0.819941273549521, + "grad_norm": 1.53125, + "learning_rate": 1.6633305952055955e-05, + "loss": 1.0132, + "step": 4782 + }, + { + "epoch": 0.8201127376385108, + "grad_norm": 1.625, + "learning_rate": 1.6631954420019312e-05, + "loss": 1.03, + "step": 4783 + }, + { + "epoch": 0.8202842017275007, + "grad_norm": 1.578125, + "learning_rate": 1.6630602671689665e-05, + "loss": 0.9498, + "step": 4784 + }, + { + "epoch": 0.8204556658164905, + "grad_norm": 1.5703125, + "learning_rate": 1.662925070711109e-05, + "loss": 0.9973, + "step": 4785 + }, + { + "epoch": 0.8206271299054804, + "grad_norm": 1.5234375, + "learning_rate": 1.662789852632769e-05, + "loss": 1.1094, + "step": 4786 + }, + { + "epoch": 0.8207985939944703, + "grad_norm": 1.546875, + "learning_rate": 1.6626546129383564e-05, + "loss": 0.9227, + "step": 4787 + }, + { + "epoch": 0.8209700580834601, + "grad_norm": 1.625, + "learning_rate": 1.6625193516322813e-05, + "loss": 
1.1013, + "step": 4788 + }, + { + "epoch": 0.82114152217245, + "grad_norm": 1.53125, + "learning_rate": 1.6623840687189554e-05, + "loss": 0.9486, + "step": 4789 + }, + { + "epoch": 0.8213129862614399, + "grad_norm": 1.5390625, + "learning_rate": 1.6622487642027904e-05, + "loss": 0.9604, + "step": 4790 + }, + { + "epoch": 0.8214844503504297, + "grad_norm": 1.4765625, + "learning_rate": 1.6621134380881997e-05, + "loss": 0.955, + "step": 4791 + }, + { + "epoch": 0.8216559144394195, + "grad_norm": 1.625, + "learning_rate": 1.6619780903795964e-05, + "loss": 1.0258, + "step": 4792 + }, + { + "epoch": 0.8218273785284095, + "grad_norm": 1.6015625, + "learning_rate": 1.6618427210813948e-05, + "loss": 1.0394, + "step": 4793 + }, + { + "epoch": 0.8219988426173993, + "grad_norm": 1.5390625, + "learning_rate": 1.6617073301980097e-05, + "loss": 0.9275, + "step": 4794 + }, + { + "epoch": 0.8221703067063891, + "grad_norm": 1.5234375, + "learning_rate": 1.6615719177338568e-05, + "loss": 1.0212, + "step": 4795 + }, + { + "epoch": 0.8223417707953791, + "grad_norm": 1.4375, + "learning_rate": 1.6614364836933526e-05, + "loss": 0.9618, + "step": 4796 + }, + { + "epoch": 0.8225132348843689, + "grad_norm": 1.4765625, + "learning_rate": 1.6613010280809138e-05, + "loss": 1.0033, + "step": 4797 + }, + { + "epoch": 0.8226846989733587, + "grad_norm": 1.5234375, + "learning_rate": 1.661165550900958e-05, + "loss": 1.0089, + "step": 4798 + }, + { + "epoch": 0.8228561630623487, + "grad_norm": 1.515625, + "learning_rate": 1.6610300521579046e-05, + "loss": 1.0124, + "step": 4799 + }, + { + "epoch": 0.8230276271513385, + "grad_norm": 1.6171875, + "learning_rate": 1.6608945318561715e-05, + "loss": 0.9778, + "step": 4800 + }, + { + "epoch": 0.8231990912403283, + "grad_norm": 1.6015625, + "learning_rate": 1.6607589900001787e-05, + "loss": 0.9307, + "step": 4801 + }, + { + "epoch": 0.8233705553293182, + "grad_norm": 1.6484375, + "learning_rate": 1.6606234265943477e-05, + "loss": 0.9979, + "step": 4802 + }, + { + "epoch": 0.8235420194183081, + "grad_norm": 1.609375, + "learning_rate": 1.6604878416430986e-05, + "loss": 1.032, + "step": 4803 + }, + { + "epoch": 0.8237134835072979, + "grad_norm": 1.640625, + "learning_rate": 1.660352235150854e-05, + "loss": 0.9481, + "step": 4804 + }, + { + "epoch": 0.8238849475962878, + "grad_norm": 1.9921875, + "learning_rate": 1.6602166071220365e-05, + "loss": 0.9735, + "step": 4805 + }, + { + "epoch": 0.8240564116852777, + "grad_norm": 1.5625, + "learning_rate": 1.660080957561069e-05, + "loss": 1.0189, + "step": 4806 + }, + { + "epoch": 0.8242278757742675, + "grad_norm": 1.75, + "learning_rate": 1.659945286472376e-05, + "loss": 0.9874, + "step": 4807 + }, + { + "epoch": 0.8243993398632574, + "grad_norm": 1.7109375, + "learning_rate": 1.6598095938603818e-05, + "loss": 1.0096, + "step": 4808 + }, + { + "epoch": 0.8245708039522472, + "grad_norm": 1.6015625, + "learning_rate": 1.6596738797295126e-05, + "loss": 0.9903, + "step": 4809 + }, + { + "epoch": 0.8247422680412371, + "grad_norm": 1.5390625, + "learning_rate": 1.659538144084194e-05, + "loss": 1.0101, + "step": 4810 + }, + { + "epoch": 0.824913732130227, + "grad_norm": 1.65625, + "learning_rate": 1.6594023869288528e-05, + "loss": 0.928, + "step": 4811 + }, + { + "epoch": 0.8250851962192168, + "grad_norm": 1.6015625, + "learning_rate": 1.6592666082679165e-05, + "loss": 0.9943, + "step": 4812 + }, + { + "epoch": 0.8252566603082067, + "grad_norm": 1.671875, + "learning_rate": 1.6591308081058136e-05, + "loss": 1.0472, + "step": 4813 + }, + { + 
"epoch": 0.8254281243971966, + "grad_norm": 1.609375, + "learning_rate": 1.6589949864469733e-05, + "loss": 0.9844, + "step": 4814 + }, + { + "epoch": 0.8255995884861864, + "grad_norm": 1.6015625, + "learning_rate": 1.6588591432958246e-05, + "loss": 1.0661, + "step": 4815 + }, + { + "epoch": 0.8257710525751762, + "grad_norm": 1.5546875, + "learning_rate": 1.6587232786567985e-05, + "loss": 1.0042, + "step": 4816 + }, + { + "epoch": 0.8259425166641662, + "grad_norm": 1.5703125, + "learning_rate": 1.6585873925343257e-05, + "loss": 1.0843, + "step": 4817 + }, + { + "epoch": 0.826113980753156, + "grad_norm": 1.5625, + "learning_rate": 1.6584514849328378e-05, + "loss": 1.0532, + "step": 4818 + }, + { + "epoch": 0.8262854448421458, + "grad_norm": 2.015625, + "learning_rate": 1.658315555856768e-05, + "loss": 0.8471, + "step": 4819 + }, + { + "epoch": 0.8264569089311358, + "grad_norm": 1.5859375, + "learning_rate": 1.6581796053105483e-05, + "loss": 0.9945, + "step": 4820 + }, + { + "epoch": 0.8266283730201256, + "grad_norm": 1.578125, + "learning_rate": 1.6580436332986134e-05, + "loss": 0.9524, + "step": 4821 + }, + { + "epoch": 0.8267998371091154, + "grad_norm": 1.640625, + "learning_rate": 1.657907639825398e-05, + "loss": 1.1008, + "step": 4822 + }, + { + "epoch": 0.8269713011981054, + "grad_norm": 1.640625, + "learning_rate": 1.6577716248953364e-05, + "loss": 1.1223, + "step": 4823 + }, + { + "epoch": 0.8271427652870952, + "grad_norm": 1.6328125, + "learning_rate": 1.6576355885128657e-05, + "loss": 1.0223, + "step": 4824 + }, + { + "epoch": 0.827314229376085, + "grad_norm": 1.484375, + "learning_rate": 1.6574995306824217e-05, + "loss": 0.9231, + "step": 4825 + }, + { + "epoch": 0.827485693465075, + "grad_norm": 1.5390625, + "learning_rate": 1.657363451408442e-05, + "loss": 1.0, + "step": 4826 + }, + { + "epoch": 0.8276571575540648, + "grad_norm": 1.59375, + "learning_rate": 1.657227350695365e-05, + "loss": 1.0248, + "step": 4827 + }, + { + "epoch": 0.8278286216430546, + "grad_norm": 1.640625, + "learning_rate": 1.6570912285476294e-05, + "loss": 0.9989, + "step": 4828 + }, + { + "epoch": 0.8280000857320445, + "grad_norm": 1.515625, + "learning_rate": 1.6569550849696742e-05, + "loss": 1.0243, + "step": 4829 + }, + { + "epoch": 0.8281715498210344, + "grad_norm": 1.453125, + "learning_rate": 1.65681891996594e-05, + "loss": 0.9558, + "step": 4830 + }, + { + "epoch": 0.8283430139100242, + "grad_norm": 1.53125, + "learning_rate": 1.656682733540867e-05, + "loss": 1.0354, + "step": 4831 + }, + { + "epoch": 0.8285144779990141, + "grad_norm": 1.5546875, + "learning_rate": 1.6565465256988976e-05, + "loss": 1.0249, + "step": 4832 + }, + { + "epoch": 0.828685942088004, + "grad_norm": 1.484375, + "learning_rate": 1.6564102964444736e-05, + "loss": 0.8995, + "step": 4833 + }, + { + "epoch": 0.8288574061769938, + "grad_norm": 1.5625, + "learning_rate": 1.6562740457820382e-05, + "loss": 0.9886, + "step": 4834 + }, + { + "epoch": 0.8290288702659837, + "grad_norm": 1.59375, + "learning_rate": 1.6561377737160347e-05, + "loss": 1.084, + "step": 4835 + }, + { + "epoch": 0.8292003343549735, + "grad_norm": 1.6484375, + "learning_rate": 1.6560014802509076e-05, + "loss": 0.9952, + "step": 4836 + }, + { + "epoch": 0.8293717984439634, + "grad_norm": 1.5625, + "learning_rate": 1.6558651653911018e-05, + "loss": 0.981, + "step": 4837 + }, + { + "epoch": 0.8295432625329533, + "grad_norm": 1.6484375, + "learning_rate": 1.6557288291410632e-05, + "loss": 1.0717, + "step": 4838 + }, + { + "epoch": 0.8297147266219431, + "grad_norm": 
1.546875, + "learning_rate": 1.6555924715052387e-05, + "loss": 1.0198, + "step": 4839 + }, + { + "epoch": 0.829886190710933, + "grad_norm": 1.6015625, + "learning_rate": 1.6554560924880747e-05, + "loss": 1.0254, + "step": 4840 + }, + { + "epoch": 0.8300576547999229, + "grad_norm": 1.734375, + "learning_rate": 1.6553196920940192e-05, + "loss": 1.043, + "step": 4841 + }, + { + "epoch": 0.8302291188889127, + "grad_norm": 1.6875, + "learning_rate": 1.655183270327521e-05, + "loss": 1.0715, + "step": 4842 + }, + { + "epoch": 0.8304005829779025, + "grad_norm": 1.6015625, + "learning_rate": 1.6550468271930288e-05, + "loss": 1.0748, + "step": 4843 + }, + { + "epoch": 0.8305720470668925, + "grad_norm": 1.6015625, + "learning_rate": 1.6549103626949934e-05, + "loss": 0.9705, + "step": 4844 + }, + { + "epoch": 0.8307435111558823, + "grad_norm": 1.5546875, + "learning_rate": 1.6547738768378648e-05, + "loss": 0.9233, + "step": 4845 + }, + { + "epoch": 0.8309149752448721, + "grad_norm": 1.5234375, + "learning_rate": 1.654637369626094e-05, + "loss": 1.0039, + "step": 4846 + }, + { + "epoch": 0.8310864393338621, + "grad_norm": 1.6796875, + "learning_rate": 1.654500841064134e-05, + "loss": 1.0685, + "step": 4847 + }, + { + "epoch": 0.8312579034228519, + "grad_norm": 1.5703125, + "learning_rate": 1.6543642911564366e-05, + "loss": 0.9914, + "step": 4848 + }, + { + "epoch": 0.8314293675118417, + "grad_norm": 1.5625, + "learning_rate": 1.654227719907456e-05, + "loss": 1.0263, + "step": 4849 + }, + { + "epoch": 0.8316008316008316, + "grad_norm": 1.640625, + "learning_rate": 1.654091127321645e-05, + "loss": 0.9904, + "step": 4850 + }, + { + "epoch": 0.8317722956898215, + "grad_norm": 1.6171875, + "learning_rate": 1.65395451340346e-05, + "loss": 0.9814, + "step": 4851 + }, + { + "epoch": 0.8319437597788113, + "grad_norm": 1.546875, + "learning_rate": 1.6538178781573553e-05, + "loss": 1.0059, + "step": 4852 + }, + { + "epoch": 0.8321152238678012, + "grad_norm": 1.5546875, + "learning_rate": 1.6536812215877877e-05, + "loss": 1.0505, + "step": 4853 + }, + { + "epoch": 0.8322866879567911, + "grad_norm": 1.5390625, + "learning_rate": 1.6535445436992142e-05, + "loss": 0.9706, + "step": 4854 + }, + { + "epoch": 0.8324581520457809, + "grad_norm": 1.53125, + "learning_rate": 1.653407844496092e-05, + "loss": 0.9743, + "step": 4855 + }, + { + "epoch": 0.8326296161347708, + "grad_norm": 1.53125, + "learning_rate": 1.6532711239828792e-05, + "loss": 1.0592, + "step": 4856 + }, + { + "epoch": 0.8328010802237606, + "grad_norm": 1.671875, + "learning_rate": 1.6531343821640353e-05, + "loss": 1.1352, + "step": 4857 + }, + { + "epoch": 0.8329725443127505, + "grad_norm": 1.6171875, + "learning_rate": 1.6529976190440197e-05, + "loss": 1.0237, + "step": 4858 + }, + { + "epoch": 0.8331440084017404, + "grad_norm": 1.625, + "learning_rate": 1.652860834627293e-05, + "loss": 1.0109, + "step": 4859 + }, + { + "epoch": 0.8333154724907302, + "grad_norm": 1.5859375, + "learning_rate": 1.6527240289183156e-05, + "loss": 0.9852, + "step": 4860 + }, + { + "epoch": 0.8334869365797201, + "grad_norm": 1.5625, + "learning_rate": 1.6525872019215495e-05, + "loss": 1.0334, + "step": 4861 + }, + { + "epoch": 0.83365840066871, + "grad_norm": 1.515625, + "learning_rate": 1.652450353641458e-05, + "loss": 1.0522, + "step": 4862 + }, + { + "epoch": 0.8338298647576998, + "grad_norm": 1.7578125, + "learning_rate": 1.652313484082503e-05, + "loss": 1.0455, + "step": 4863 + }, + { + "epoch": 0.8340013288466896, + "grad_norm": 1.6328125, + "learning_rate": 
1.6521765932491493e-05, + "loss": 1.0399, + "step": 4864 + }, + { + "epoch": 0.8341727929356796, + "grad_norm": 1.5859375, + "learning_rate": 1.652039681145861e-05, + "loss": 0.9762, + "step": 4865 + }, + { + "epoch": 0.8343442570246694, + "grad_norm": 1.5859375, + "learning_rate": 1.6519027477771034e-05, + "loss": 0.9692, + "step": 4866 + }, + { + "epoch": 0.8345157211136592, + "grad_norm": 1.640625, + "learning_rate": 1.651765793147342e-05, + "loss": 1.0545, + "step": 4867 + }, + { + "epoch": 0.8346871852026492, + "grad_norm": 1.546875, + "learning_rate": 1.651628817261044e-05, + "loss": 1.0397, + "step": 4868 + }, + { + "epoch": 0.834858649291639, + "grad_norm": 1.609375, + "learning_rate": 1.651491820122676e-05, + "loss": 1.059, + "step": 4869 + }, + { + "epoch": 0.8350301133806288, + "grad_norm": 1.59375, + "learning_rate": 1.651354801736707e-05, + "loss": 1.0279, + "step": 4870 + }, + { + "epoch": 0.8352015774696188, + "grad_norm": 1.5078125, + "learning_rate": 1.651217762107605e-05, + "loss": 0.9188, + "step": 4871 + }, + { + "epoch": 0.8353730415586086, + "grad_norm": 1.5234375, + "learning_rate": 1.6510807012398394e-05, + "loss": 1.0087, + "step": 4872 + }, + { + "epoch": 0.8355445056475984, + "grad_norm": 1.625, + "learning_rate": 1.650943619137881e-05, + "loss": 1.0444, + "step": 4873 + }, + { + "epoch": 0.8357159697365882, + "grad_norm": 1.5078125, + "learning_rate": 1.6508065158061992e-05, + "loss": 1.052, + "step": 4874 + }, + { + "epoch": 0.8358874338255782, + "grad_norm": 1.53125, + "learning_rate": 1.6506693912492666e-05, + "loss": 1.0841, + "step": 4875 + }, + { + "epoch": 0.836058897914568, + "grad_norm": 1.515625, + "learning_rate": 1.650532245471555e-05, + "loss": 0.9297, + "step": 4876 + }, + { + "epoch": 0.8362303620035578, + "grad_norm": 1.5546875, + "learning_rate": 1.6503950784775368e-05, + "loss": 0.9995, + "step": 4877 + }, + { + "epoch": 0.8364018260925478, + "grad_norm": 1.703125, + "learning_rate": 1.6502578902716863e-05, + "loss": 0.9401, + "step": 4878 + }, + { + "epoch": 0.8365732901815376, + "grad_norm": 1.546875, + "learning_rate": 1.650120680858477e-05, + "loss": 1.0145, + "step": 4879 + }, + { + "epoch": 0.8367447542705274, + "grad_norm": 1.578125, + "learning_rate": 1.6499834502423848e-05, + "loss": 1.0129, + "step": 4880 + }, + { + "epoch": 0.8369162183595173, + "grad_norm": 1.6640625, + "learning_rate": 1.649846198427884e-05, + "loss": 1.1061, + "step": 4881 + }, + { + "epoch": 0.8370876824485072, + "grad_norm": 1.5234375, + "learning_rate": 1.6497089254194515e-05, + "loss": 1.017, + "step": 4882 + }, + { + "epoch": 0.837259146537497, + "grad_norm": 1.6328125, + "learning_rate": 1.649571631221565e-05, + "loss": 0.9935, + "step": 4883 + }, + { + "epoch": 0.8374306106264869, + "grad_norm": 1.6484375, + "learning_rate": 1.649434315838701e-05, + "loss": 1.0117, + "step": 4884 + }, + { + "epoch": 0.8376020747154768, + "grad_norm": 1.515625, + "learning_rate": 1.6492969792753387e-05, + "loss": 0.9968, + "step": 4885 + }, + { + "epoch": 0.8377735388044666, + "grad_norm": 1.5234375, + "learning_rate": 1.649159621535957e-05, + "loss": 1.0011, + "step": 4886 + }, + { + "epoch": 0.8379450028934565, + "grad_norm": 1.5546875, + "learning_rate": 1.6490222426250354e-05, + "loss": 1.019, + "step": 4887 + }, + { + "epoch": 0.8381164669824464, + "grad_norm": 1.578125, + "learning_rate": 1.6488848425470543e-05, + "loss": 1.0191, + "step": 4888 + }, + { + "epoch": 0.8382879310714362, + "grad_norm": 1.5703125, + "learning_rate": 1.648747421306495e-05, + "loss": 0.9839, 
+ "step": 4889 + }, + { + "epoch": 0.8384593951604261, + "grad_norm": 1.53125, + "learning_rate": 1.64860997890784e-05, + "loss": 0.9773, + "step": 4890 + }, + { + "epoch": 0.8386308592494159, + "grad_norm": 1.546875, + "learning_rate": 1.6484725153555703e-05, + "loss": 1.0045, + "step": 4891 + }, + { + "epoch": 0.8388023233384058, + "grad_norm": 1.6015625, + "learning_rate": 1.6483350306541703e-05, + "loss": 1.0592, + "step": 4892 + }, + { + "epoch": 0.8389737874273957, + "grad_norm": 1.5625, + "learning_rate": 1.6481975248081233e-05, + "loss": 0.9609, + "step": 4893 + }, + { + "epoch": 0.8391452515163855, + "grad_norm": 1.5078125, + "learning_rate": 1.6480599978219143e-05, + "loss": 0.9875, + "step": 4894 + }, + { + "epoch": 0.8393167156053754, + "grad_norm": 1.5703125, + "learning_rate": 1.6479224497000287e-05, + "loss": 1.0036, + "step": 4895 + }, + { + "epoch": 0.8394881796943653, + "grad_norm": 1.5625, + "learning_rate": 1.647784880446952e-05, + "loss": 0.9743, + "step": 4896 + }, + { + "epoch": 0.8396596437833551, + "grad_norm": 1.46875, + "learning_rate": 1.647647290067171e-05, + "loss": 0.9982, + "step": 4897 + }, + { + "epoch": 0.8398311078723449, + "grad_norm": 1.5234375, + "learning_rate": 1.647509678565173e-05, + "loss": 1.1047, + "step": 4898 + }, + { + "epoch": 0.8400025719613349, + "grad_norm": 1.6015625, + "learning_rate": 1.647372045945446e-05, + "loss": 1.0122, + "step": 4899 + }, + { + "epoch": 0.8401740360503247, + "grad_norm": 1.5546875, + "learning_rate": 1.647234392212479e-05, + "loss": 0.9983, + "step": 4900 + }, + { + "epoch": 0.8401740360503247, + "eval_loss": 0.8566790223121643, + "eval_runtime": 836.9111, + "eval_samples_per_second": 2.986, + "eval_steps_per_second": 2.986, + "step": 4900 + }, + { + "epoch": 0.8403455001393145, + "grad_norm": 1.5859375, + "learning_rate": 1.647096717370761e-05, + "loss": 1.0025, + "step": 4901 + }, + { + "epoch": 0.8405169642283045, + "grad_norm": 1.5625, + "learning_rate": 1.6469590214247828e-05, + "loss": 1.0119, + "step": 4902 + }, + { + "epoch": 0.8406884283172943, + "grad_norm": 1.578125, + "learning_rate": 1.646821304379034e-05, + "loss": 1.0166, + "step": 4903 + }, + { + "epoch": 0.8408598924062841, + "grad_norm": 1.7109375, + "learning_rate": 1.6466835662380072e-05, + "loss": 1.0238, + "step": 4904 + }, + { + "epoch": 0.841031356495274, + "grad_norm": 1.5078125, + "learning_rate": 1.646545807006194e-05, + "loss": 0.9224, + "step": 4905 + }, + { + "epoch": 0.8412028205842639, + "grad_norm": 1.6015625, + "learning_rate": 1.646408026688087e-05, + "loss": 1.0507, + "step": 4906 + }, + { + "epoch": 0.8413742846732537, + "grad_norm": 1.5703125, + "learning_rate": 1.6462702252881805e-05, + "loss": 1.0593, + "step": 4907 + }, + { + "epoch": 0.8415457487622436, + "grad_norm": 1.46875, + "learning_rate": 1.6461324028109683e-05, + "loss": 0.9803, + "step": 4908 + }, + { + "epoch": 0.8417172128512335, + "grad_norm": 1.4765625, + "learning_rate": 1.6459945592609455e-05, + "loss": 0.9086, + "step": 4909 + }, + { + "epoch": 0.8418886769402233, + "grad_norm": 1.578125, + "learning_rate": 1.6458566946426072e-05, + "loss": 1.0014, + "step": 4910 + }, + { + "epoch": 0.8420601410292132, + "grad_norm": 1.609375, + "learning_rate": 1.6457188089604505e-05, + "loss": 0.9632, + "step": 4911 + }, + { + "epoch": 0.842231605118203, + "grad_norm": 1.640625, + "learning_rate": 1.6455809022189716e-05, + "loss": 1.0341, + "step": 4912 + }, + { + "epoch": 0.8424030692071929, + "grad_norm": 1.6328125, + "learning_rate": 1.645442974422668e-05, + "loss": 
0.962, + "step": 4913 + }, + { + "epoch": 0.8425745332961828, + "grad_norm": 1.5625, + "learning_rate": 1.6453050255760392e-05, + "loss": 0.9729, + "step": 4914 + }, + { + "epoch": 0.8427459973851726, + "grad_norm": 1.671875, + "learning_rate": 1.645167055683583e-05, + "loss": 1.027, + "step": 4915 + }, + { + "epoch": 0.8429174614741625, + "grad_norm": 1.59375, + "learning_rate": 1.6450290647498e-05, + "loss": 1.1018, + "step": 4916 + }, + { + "epoch": 0.8430889255631524, + "grad_norm": 1.59375, + "learning_rate": 1.64489105277919e-05, + "loss": 0.9738, + "step": 4917 + }, + { + "epoch": 0.8432603896521422, + "grad_norm": 1.53125, + "learning_rate": 1.6447530197762545e-05, + "loss": 1.0227, + "step": 4918 + }, + { + "epoch": 0.843431853741132, + "grad_norm": 1.5390625, + "learning_rate": 1.644614965745495e-05, + "loss": 1.0394, + "step": 4919 + }, + { + "epoch": 0.843603317830122, + "grad_norm": 1.59375, + "learning_rate": 1.6444768906914143e-05, + "loss": 1.0459, + "step": 4920 + }, + { + "epoch": 0.8437747819191118, + "grad_norm": 1.5234375, + "learning_rate": 1.644338794618515e-05, + "loss": 0.9538, + "step": 4921 + }, + { + "epoch": 0.8439462460081016, + "grad_norm": 1.4296875, + "learning_rate": 1.6442006775313017e-05, + "loss": 0.9086, + "step": 4922 + }, + { + "epoch": 0.8441177100970916, + "grad_norm": 1.4609375, + "learning_rate": 1.6440625394342777e-05, + "loss": 0.9426, + "step": 4923 + }, + { + "epoch": 0.8442891741860814, + "grad_norm": 1.4921875, + "learning_rate": 1.6439243803319492e-05, + "loss": 0.908, + "step": 4924 + }, + { + "epoch": 0.8444606382750712, + "grad_norm": 1.5390625, + "learning_rate": 1.6437862002288226e-05, + "loss": 1.0451, + "step": 4925 + }, + { + "epoch": 0.8446321023640612, + "grad_norm": 1.5234375, + "learning_rate": 1.6436479991294032e-05, + "loss": 1.0059, + "step": 4926 + }, + { + "epoch": 0.844803566453051, + "grad_norm": 1.4921875, + "learning_rate": 1.6435097770381988e-05, + "loss": 1.0136, + "step": 4927 + }, + { + "epoch": 0.8449750305420408, + "grad_norm": 1.65625, + "learning_rate": 1.643371533959717e-05, + "loss": 1.049, + "step": 4928 + }, + { + "epoch": 0.8451464946310308, + "grad_norm": 1.5078125, + "learning_rate": 1.6432332698984672e-05, + "loss": 0.9991, + "step": 4929 + }, + { + "epoch": 0.8453179587200206, + "grad_norm": 1.5234375, + "learning_rate": 1.6430949848589583e-05, + "loss": 0.9708, + "step": 4930 + }, + { + "epoch": 0.8454894228090104, + "grad_norm": 1.5234375, + "learning_rate": 1.6429566788457007e-05, + "loss": 0.9411, + "step": 4931 + }, + { + "epoch": 0.8456608868980003, + "grad_norm": 1.7421875, + "learning_rate": 1.6428183518632036e-05, + "loss": 1.0671, + "step": 4932 + }, + { + "epoch": 0.8458323509869902, + "grad_norm": 1.5, + "learning_rate": 1.64268000391598e-05, + "loss": 0.9859, + "step": 4933 + }, + { + "epoch": 0.84600381507598, + "grad_norm": 1.46875, + "learning_rate": 1.6425416350085416e-05, + "loss": 0.9405, + "step": 4934 + }, + { + "epoch": 0.8461752791649699, + "grad_norm": 1.5, + "learning_rate": 1.6424032451454004e-05, + "loss": 0.9694, + "step": 4935 + }, + { + "epoch": 0.8463467432539598, + "grad_norm": 2.28125, + "learning_rate": 1.6422648343310706e-05, + "loss": 1.0205, + "step": 4936 + }, + { + "epoch": 0.8465182073429496, + "grad_norm": 1.546875, + "learning_rate": 1.642126402570066e-05, + "loss": 0.9597, + "step": 4937 + }, + { + "epoch": 0.8466896714319395, + "grad_norm": 1.484375, + "learning_rate": 1.6419879498669012e-05, + "loss": 1.0294, + "step": 4938 + }, + { + "epoch": 
0.8468611355209293, + "grad_norm": 1.578125, + "learning_rate": 1.641849476226092e-05, + "loss": 1.0367, + "step": 4939 + }, + { + "epoch": 0.8470325996099192, + "grad_norm": 1.5546875, + "learning_rate": 1.641710981652154e-05, + "loss": 1.0519, + "step": 4940 + }, + { + "epoch": 0.8472040636989091, + "grad_norm": 1.6015625, + "learning_rate": 1.641572466149605e-05, + "loss": 0.9819, + "step": 4941 + }, + { + "epoch": 0.8473755277878989, + "grad_norm": 1.5625, + "learning_rate": 1.6414339297229614e-05, + "loss": 0.9086, + "step": 4942 + }, + { + "epoch": 0.8475469918768888, + "grad_norm": 1.625, + "learning_rate": 1.6412953723767426e-05, + "loss": 1.0298, + "step": 4943 + }, + { + "epoch": 0.8477184559658787, + "grad_norm": 1.578125, + "learning_rate": 1.6411567941154666e-05, + "loss": 0.9991, + "step": 4944 + }, + { + "epoch": 0.8478899200548685, + "grad_norm": 1.6328125, + "learning_rate": 1.641018194943653e-05, + "loss": 1.0165, + "step": 4945 + }, + { + "epoch": 0.8480613841438583, + "grad_norm": 2.640625, + "learning_rate": 1.6408795748658223e-05, + "loss": 1.0844, + "step": 4946 + }, + { + "epoch": 0.8482328482328483, + "grad_norm": 1.5, + "learning_rate": 1.640740933886495e-05, + "loss": 0.9461, + "step": 4947 + }, + { + "epoch": 0.8484043123218381, + "grad_norm": 1.5703125, + "learning_rate": 1.6406022720101935e-05, + "loss": 0.9793, + "step": 4948 + }, + { + "epoch": 0.8485757764108279, + "grad_norm": 1.5703125, + "learning_rate": 1.6404635892414392e-05, + "loss": 1.0438, + "step": 4949 + }, + { + "epoch": 0.8487472404998179, + "grad_norm": 1.5234375, + "learning_rate": 1.640324885584756e-05, + "loss": 0.9791, + "step": 4950 + }, + { + "epoch": 0.8489187045888077, + "grad_norm": 1.5546875, + "learning_rate": 1.6401861610446667e-05, + "loss": 0.9802, + "step": 4951 + }, + { + "epoch": 0.8490901686777975, + "grad_norm": 1.5546875, + "learning_rate": 1.640047415625696e-05, + "loss": 0.9295, + "step": 4952 + }, + { + "epoch": 0.8492616327667875, + "grad_norm": 1.65625, + "learning_rate": 1.639908649332369e-05, + "loss": 1.0223, + "step": 4953 + }, + { + "epoch": 0.8494330968557773, + "grad_norm": 1.5546875, + "learning_rate": 1.6397698621692112e-05, + "loss": 0.9678, + "step": 4954 + }, + { + "epoch": 0.8496045609447671, + "grad_norm": 1.6171875, + "learning_rate": 1.639631054140749e-05, + "loss": 1.0231, + "step": 4955 + }, + { + "epoch": 0.849776025033757, + "grad_norm": 1.5703125, + "learning_rate": 1.63949222525151e-05, + "loss": 1.0565, + "step": 4956 + }, + { + "epoch": 0.8499474891227469, + "grad_norm": 1.71875, + "learning_rate": 1.6393533755060204e-05, + "loss": 1.0848, + "step": 4957 + }, + { + "epoch": 0.8501189532117367, + "grad_norm": 1.703125, + "learning_rate": 1.6392145049088105e-05, + "loss": 1.1402, + "step": 4958 + }, + { + "epoch": 0.8502904173007266, + "grad_norm": 1.5546875, + "learning_rate": 1.639075613464408e-05, + "loss": 0.9722, + "step": 4959 + }, + { + "epoch": 0.8504618813897165, + "grad_norm": 1.515625, + "learning_rate": 1.6389367011773435e-05, + "loss": 1.0092, + "step": 4960 + }, + { + "epoch": 0.8506333454787063, + "grad_norm": 1.46875, + "learning_rate": 1.6387977680521472e-05, + "loss": 0.9036, + "step": 4961 + }, + { + "epoch": 0.8508048095676962, + "grad_norm": 1.5703125, + "learning_rate": 1.6386588140933503e-05, + "loss": 1.0088, + "step": 4962 + }, + { + "epoch": 0.850976273656686, + "grad_norm": 1.515625, + "learning_rate": 1.6385198393054843e-05, + "loss": 0.9637, + "step": 4963 + }, + { + "epoch": 0.8511477377456759, + "grad_norm": 
1.546875, + "learning_rate": 1.638380843693082e-05, + "loss": 0.9548, + "step": 4964 + }, + { + "epoch": 0.8513192018346658, + "grad_norm": 1.5390625, + "learning_rate": 1.6382418272606763e-05, + "loss": 0.9837, + "step": 4965 + }, + { + "epoch": 0.8514906659236556, + "grad_norm": 1.578125, + "learning_rate": 1.6381027900128013e-05, + "loss": 1.1239, + "step": 4966 + }, + { + "epoch": 0.8516621300126455, + "grad_norm": 1.5390625, + "learning_rate": 1.6379637319539913e-05, + "loss": 0.9881, + "step": 4967 + }, + { + "epoch": 0.8518335941016354, + "grad_norm": 1.6015625, + "learning_rate": 1.637824653088782e-05, + "loss": 0.9896, + "step": 4968 + }, + { + "epoch": 0.8520050581906252, + "grad_norm": 1.6640625, + "learning_rate": 1.637685553421709e-05, + "loss": 1.053, + "step": 4969 + }, + { + "epoch": 0.852176522279615, + "grad_norm": 1.484375, + "learning_rate": 1.6375464329573084e-05, + "loss": 1.0277, + "step": 4970 + }, + { + "epoch": 0.8523479863686049, + "grad_norm": 1.6015625, + "learning_rate": 1.637407291700118e-05, + "loss": 0.9823, + "step": 4971 + }, + { + "epoch": 0.8525194504575948, + "grad_norm": 1.5859375, + "learning_rate": 1.6372681296546755e-05, + "loss": 0.9795, + "step": 4972 + }, + { + "epoch": 0.8526909145465846, + "grad_norm": 1.5703125, + "learning_rate": 1.6371289468255196e-05, + "loss": 1.042, + "step": 4973 + }, + { + "epoch": 0.8528623786355745, + "grad_norm": 1.5859375, + "learning_rate": 1.6369897432171896e-05, + "loss": 1.0476, + "step": 4974 + }, + { + "epoch": 0.8530338427245644, + "grad_norm": 1.671875, + "learning_rate": 1.6368505188342253e-05, + "loss": 1.0198, + "step": 4975 + }, + { + "epoch": 0.8532053068135542, + "grad_norm": 1.6796875, + "learning_rate": 1.636711273681167e-05, + "loss": 1.0091, + "step": 4976 + }, + { + "epoch": 0.853376770902544, + "grad_norm": 1.5234375, + "learning_rate": 1.6365720077625575e-05, + "loss": 1.0143, + "step": 4977 + }, + { + "epoch": 0.853548234991534, + "grad_norm": 1.5859375, + "learning_rate": 1.6364327210829373e-05, + "loss": 0.8933, + "step": 4978 + }, + { + "epoch": 0.8537196990805238, + "grad_norm": 1.5546875, + "learning_rate": 1.636293413646849e-05, + "loss": 0.9725, + "step": 4979 + }, + { + "epoch": 0.8538911631695136, + "grad_norm": 1.578125, + "learning_rate": 1.6361540854588364e-05, + "loss": 1.0688, + "step": 4980 + }, + { + "epoch": 0.8540626272585036, + "grad_norm": 1.6171875, + "learning_rate": 1.636014736523444e-05, + "loss": 1.0411, + "step": 4981 + }, + { + "epoch": 0.8542340913474934, + "grad_norm": 1.59375, + "learning_rate": 1.6358753668452162e-05, + "loss": 1.0243, + "step": 4982 + }, + { + "epoch": 0.8544055554364832, + "grad_norm": 1.6015625, + "learning_rate": 1.6357359764286978e-05, + "loss": 1.0091, + "step": 4983 + }, + { + "epoch": 0.8545770195254732, + "grad_norm": 1.5390625, + "learning_rate": 1.6355965652784353e-05, + "loss": 0.9835, + "step": 4984 + }, + { + "epoch": 0.854748483614463, + "grad_norm": 1.5859375, + "learning_rate": 1.635457133398975e-05, + "loss": 0.9444, + "step": 4985 + }, + { + "epoch": 0.8549199477034528, + "grad_norm": 1.5078125, + "learning_rate": 1.6353176807948654e-05, + "loss": 1.0489, + "step": 4986 + }, + { + "epoch": 0.8550914117924427, + "grad_norm": 1.4453125, + "learning_rate": 1.6351782074706536e-05, + "loss": 0.8774, + "step": 4987 + }, + { + "epoch": 0.8552628758814326, + "grad_norm": 1.5546875, + "learning_rate": 1.6350387134308884e-05, + "loss": 1.002, + "step": 4988 + }, + { + "epoch": 0.8554343399704224, + "grad_norm": 1.46875, + 
"learning_rate": 1.6348991986801197e-05, + "loss": 0.9584, + "step": 4989 + }, + { + "epoch": 0.8556058040594123, + "grad_norm": 1.484375, + "learning_rate": 1.634759663222897e-05, + "loss": 0.9562, + "step": 4990 + }, + { + "epoch": 0.8557772681484022, + "grad_norm": 1.7578125, + "learning_rate": 1.634620107063772e-05, + "loss": 0.9777, + "step": 4991 + }, + { + "epoch": 0.855948732237392, + "grad_norm": 1.5625, + "learning_rate": 1.634480530207295e-05, + "loss": 0.9741, + "step": 4992 + }, + { + "epoch": 0.8561201963263819, + "grad_norm": 1.59375, + "learning_rate": 1.6343409326580185e-05, + "loss": 1.0333, + "step": 4993 + }, + { + "epoch": 0.8562916604153717, + "grad_norm": 1.640625, + "learning_rate": 1.6342013144204956e-05, + "loss": 1.0337, + "step": 4994 + }, + { + "epoch": 0.8564631245043616, + "grad_norm": 1.625, + "learning_rate": 1.6340616754992803e-05, + "loss": 0.9508, + "step": 4995 + }, + { + "epoch": 0.8566345885933515, + "grad_norm": 1.4609375, + "learning_rate": 1.6339220158989256e-05, + "loss": 0.9595, + "step": 4996 + }, + { + "epoch": 0.8568060526823413, + "grad_norm": 1.5390625, + "learning_rate": 1.633782335623987e-05, + "loss": 0.9851, + "step": 4997 + }, + { + "epoch": 0.8569775167713312, + "grad_norm": 1.5703125, + "learning_rate": 1.6336426346790192e-05, + "loss": 1.0379, + "step": 4998 + }, + { + "epoch": 0.8571489808603211, + "grad_norm": 1.6015625, + "learning_rate": 1.6335029130685794e-05, + "loss": 1.1435, + "step": 4999 + }, + { + "epoch": 0.8573204449493109, + "grad_norm": 1.515625, + "learning_rate": 1.6333631707972245e-05, + "loss": 0.9131, + "step": 5000 + }, + { + "epoch": 0.8574919090383007, + "grad_norm": 1.4921875, + "learning_rate": 1.6332234078695113e-05, + "loss": 0.9825, + "step": 5001 + }, + { + "epoch": 0.8576633731272907, + "grad_norm": 1.625, + "learning_rate": 1.633083624289998e-05, + "loss": 1.008, + "step": 5002 + }, + { + "epoch": 0.8578348372162805, + "grad_norm": 1.671875, + "learning_rate": 1.6329438200632443e-05, + "loss": 1.1064, + "step": 5003 + }, + { + "epoch": 0.8580063013052703, + "grad_norm": 1.5078125, + "learning_rate": 1.6328039951938086e-05, + "loss": 0.9704, + "step": 5004 + }, + { + "epoch": 0.8581777653942603, + "grad_norm": 1.5234375, + "learning_rate": 1.632664149686252e-05, + "loss": 1.0046, + "step": 5005 + }, + { + "epoch": 0.8583492294832501, + "grad_norm": 1.4453125, + "learning_rate": 1.6325242835451353e-05, + "loss": 0.9914, + "step": 5006 + }, + { + "epoch": 0.8585206935722399, + "grad_norm": 1.5546875, + "learning_rate": 1.632384396775019e-05, + "loss": 1.0318, + "step": 5007 + }, + { + "epoch": 0.8586921576612299, + "grad_norm": 1.609375, + "learning_rate": 1.632244489380467e-05, + "loss": 1.0568, + "step": 5008 + }, + { + "epoch": 0.8588636217502197, + "grad_norm": 1.578125, + "learning_rate": 1.632104561366041e-05, + "loss": 1.0481, + "step": 5009 + }, + { + "epoch": 0.8590350858392095, + "grad_norm": 1.609375, + "learning_rate": 1.6319646127363053e-05, + "loss": 1.0812, + "step": 5010 + }, + { + "epoch": 0.8592065499281994, + "grad_norm": 1.5859375, + "learning_rate": 1.6318246434958234e-05, + "loss": 1.0274, + "step": 5011 + }, + { + "epoch": 0.8593780140171893, + "grad_norm": 1.5703125, + "learning_rate": 1.631684653649161e-05, + "loss": 0.9391, + "step": 5012 + }, + { + "epoch": 0.8595494781061791, + "grad_norm": 1.53125, + "learning_rate": 1.631544643200883e-05, + "loss": 0.9186, + "step": 5013 + }, + { + "epoch": 0.859720942195169, + "grad_norm": 1.671875, + "learning_rate": 1.6314046121555563e-05, 
+ "loss": 0.9996, + "step": 5014 + }, + { + "epoch": 0.8598924062841589, + "grad_norm": 1.5078125, + "learning_rate": 1.631264560517748e-05, + "loss": 0.9689, + "step": 5015 + }, + { + "epoch": 0.8600638703731487, + "grad_norm": 1.5390625, + "learning_rate": 1.6311244882920243e-05, + "loss": 1.0294, + "step": 5016 + }, + { + "epoch": 0.8602353344621386, + "grad_norm": 1.46875, + "learning_rate": 1.6309843954829547e-05, + "loss": 0.9928, + "step": 5017 + }, + { + "epoch": 0.8604067985511284, + "grad_norm": 1.5390625, + "learning_rate": 1.630844282095108e-05, + "loss": 0.968, + "step": 5018 + }, + { + "epoch": 0.8605782626401183, + "grad_norm": 1.5, + "learning_rate": 1.6307041481330542e-05, + "loss": 1.0326, + "step": 5019 + }, + { + "epoch": 0.8607497267291082, + "grad_norm": 1.5234375, + "learning_rate": 1.6305639936013625e-05, + "loss": 1.0035, + "step": 5020 + }, + { + "epoch": 0.860921190818098, + "grad_norm": 1.515625, + "learning_rate": 1.6304238185046046e-05, + "loss": 0.9567, + "step": 5021 + }, + { + "epoch": 0.8610926549070879, + "grad_norm": 1.53125, + "learning_rate": 1.6302836228473518e-05, + "loss": 1.0195, + "step": 5022 + }, + { + "epoch": 0.8612641189960778, + "grad_norm": 1.5, + "learning_rate": 1.6301434066341774e-05, + "loss": 1.0022, + "step": 5023 + }, + { + "epoch": 0.8614355830850676, + "grad_norm": 1.5546875, + "learning_rate": 1.630003169869653e-05, + "loss": 1.0037, + "step": 5024 + }, + { + "epoch": 0.8616070471740574, + "grad_norm": 1.5703125, + "learning_rate": 1.629862912558353e-05, + "loss": 0.91, + "step": 5025 + }, + { + "epoch": 0.8617785112630474, + "grad_norm": 1.5234375, + "learning_rate": 1.6297226347048516e-05, + "loss": 0.9843, + "step": 5026 + }, + { + "epoch": 0.8619499753520372, + "grad_norm": 1.6171875, + "learning_rate": 1.6295823363137238e-05, + "loss": 0.9593, + "step": 5027 + }, + { + "epoch": 0.862121439441027, + "grad_norm": 1.5625, + "learning_rate": 1.629442017389545e-05, + "loss": 0.9987, + "step": 5028 + }, + { + "epoch": 0.862292903530017, + "grad_norm": 1.5703125, + "learning_rate": 1.629301677936892e-05, + "loss": 0.9576, + "step": 5029 + }, + { + "epoch": 0.8624643676190068, + "grad_norm": 1.4921875, + "learning_rate": 1.6291613179603417e-05, + "loss": 0.9212, + "step": 5030 + }, + { + "epoch": 0.8626358317079966, + "grad_norm": 1.609375, + "learning_rate": 1.6290209374644712e-05, + "loss": 1.0911, + "step": 5031 + }, + { + "epoch": 0.8628072957969866, + "grad_norm": 1.6171875, + "learning_rate": 1.6288805364538592e-05, + "loss": 1.0831, + "step": 5032 + }, + { + "epoch": 0.8629787598859764, + "grad_norm": 1.5546875, + "learning_rate": 1.6287401149330854e-05, + "loss": 0.9477, + "step": 5033 + }, + { + "epoch": 0.8631502239749662, + "grad_norm": 1.6171875, + "learning_rate": 1.6285996729067284e-05, + "loss": 0.993, + "step": 5034 + }, + { + "epoch": 0.8633216880639561, + "grad_norm": 1.6171875, + "learning_rate": 1.628459210379369e-05, + "loss": 0.9876, + "step": 5035 + }, + { + "epoch": 0.863493152152946, + "grad_norm": 1.5390625, + "learning_rate": 1.628318727355588e-05, + "loss": 0.8755, + "step": 5036 + }, + { + "epoch": 0.8636646162419358, + "grad_norm": 1.640625, + "learning_rate": 1.6281782238399677e-05, + "loss": 1.0774, + "step": 5037 + }, + { + "epoch": 0.8638360803309257, + "grad_norm": 1.5703125, + "learning_rate": 1.6280376998370896e-05, + "loss": 0.9763, + "step": 5038 + }, + { + "epoch": 0.8640075444199156, + "grad_norm": 1.578125, + "learning_rate": 1.6278971553515375e-05, + "loss": 1.0475, + "step": 5039 + }, + { + 
"epoch": 0.8641790085089054, + "grad_norm": 1.6484375, + "learning_rate": 1.6277565903878947e-05, + "loss": 1.0331, + "step": 5040 + }, + { + "epoch": 0.8643504725978953, + "grad_norm": 1.5390625, + "learning_rate": 1.6276160049507455e-05, + "loss": 1.018, + "step": 5041 + }, + { + "epoch": 0.8645219366868852, + "grad_norm": 1.5625, + "learning_rate": 1.627475399044675e-05, + "loss": 0.9472, + "step": 5042 + }, + { + "epoch": 0.864693400775875, + "grad_norm": 1.609375, + "learning_rate": 1.6273347726742685e-05, + "loss": 1.0766, + "step": 5043 + }, + { + "epoch": 0.8648648648648649, + "grad_norm": 1.6171875, + "learning_rate": 1.6271941258441134e-05, + "loss": 0.9634, + "step": 5044 + }, + { + "epoch": 0.8650363289538547, + "grad_norm": 1.625, + "learning_rate": 1.6270534585587957e-05, + "loss": 0.971, + "step": 5045 + }, + { + "epoch": 0.8652077930428446, + "grad_norm": 1.4453125, + "learning_rate": 1.6269127708229032e-05, + "loss": 0.9035, + "step": 5046 + }, + { + "epoch": 0.8653792571318345, + "grad_norm": 1.5390625, + "learning_rate": 1.6267720626410248e-05, + "loss": 1.032, + "step": 5047 + }, + { + "epoch": 0.8655507212208243, + "grad_norm": 1.5546875, + "learning_rate": 1.6266313340177494e-05, + "loss": 1.0, + "step": 5048 + }, + { + "epoch": 0.8657221853098142, + "grad_norm": 1.5390625, + "learning_rate": 1.6264905849576663e-05, + "loss": 1.0141, + "step": 5049 + }, + { + "epoch": 0.8658936493988041, + "grad_norm": 1.5703125, + "learning_rate": 1.626349815465366e-05, + "loss": 0.9323, + "step": 5050 + }, + { + "epoch": 0.8660651134877939, + "grad_norm": 1.8125, + "learning_rate": 1.62620902554544e-05, + "loss": 1.0563, + "step": 5051 + }, + { + "epoch": 0.8662365775767837, + "grad_norm": 1.5625, + "learning_rate": 1.626068215202479e-05, + "loss": 1.0467, + "step": 5052 + }, + { + "epoch": 0.8664080416657737, + "grad_norm": 1.5859375, + "learning_rate": 1.6259273844410762e-05, + "loss": 0.9192, + "step": 5053 + }, + { + "epoch": 0.8665795057547635, + "grad_norm": 1.59375, + "learning_rate": 1.6257865332658248e-05, + "loss": 0.9773, + "step": 5054 + }, + { + "epoch": 0.8667509698437533, + "grad_norm": 1.5703125, + "learning_rate": 1.6256456616813176e-05, + "loss": 0.9858, + "step": 5055 + }, + { + "epoch": 0.8669224339327433, + "grad_norm": 1.5859375, + "learning_rate": 1.6255047696921492e-05, + "loss": 0.9744, + "step": 5056 + }, + { + "epoch": 0.8670938980217331, + "grad_norm": 1.59375, + "learning_rate": 1.6253638573029153e-05, + "loss": 1.067, + "step": 5057 + }, + { + "epoch": 0.8672653621107229, + "grad_norm": 1.5625, + "learning_rate": 1.625222924518211e-05, + "loss": 1.0363, + "step": 5058 + }, + { + "epoch": 0.8674368261997129, + "grad_norm": 1.4921875, + "learning_rate": 1.6250819713426326e-05, + "loss": 0.9379, + "step": 5059 + }, + { + "epoch": 0.8676082902887027, + "grad_norm": 1.53125, + "learning_rate": 1.6249409977807772e-05, + "loss": 1.0446, + "step": 5060 + }, + { + "epoch": 0.8677797543776925, + "grad_norm": 1.5234375, + "learning_rate": 1.624800003837243e-05, + "loss": 1.0111, + "step": 5061 + }, + { + "epoch": 0.8679512184666824, + "grad_norm": 1.5546875, + "learning_rate": 1.6246589895166277e-05, + "loss": 0.9468, + "step": 5062 + }, + { + "epoch": 0.8681226825556723, + "grad_norm": 1.6796875, + "learning_rate": 1.6245179548235303e-05, + "loss": 0.9303, + "step": 5063 + }, + { + "epoch": 0.8682941466446621, + "grad_norm": 1.59375, + "learning_rate": 1.624376899762551e-05, + "loss": 0.983, + "step": 5064 + }, + { + "epoch": 0.8684656107336519, + "grad_norm": 
1.5625, + "learning_rate": 1.6242358243382894e-05, + "loss": 0.9681, + "step": 5065 + }, + { + "epoch": 0.8686370748226419, + "grad_norm": 1.6015625, + "learning_rate": 1.6240947285553473e-05, + "loss": 1.0484, + "step": 5066 + }, + { + "epoch": 0.8688085389116317, + "grad_norm": 1.5234375, + "learning_rate": 1.6239536124183258e-05, + "loss": 1.0441, + "step": 5067 + }, + { + "epoch": 0.8689800030006215, + "grad_norm": 1.59375, + "learning_rate": 1.623812475931827e-05, + "loss": 0.9782, + "step": 5068 + }, + { + "epoch": 0.8691514670896114, + "grad_norm": 1.46875, + "learning_rate": 1.623671319100455e-05, + "loss": 0.9758, + "step": 5069 + }, + { + "epoch": 0.8693229311786013, + "grad_norm": 1.515625, + "learning_rate": 1.6235301419288125e-05, + "loss": 0.924, + "step": 5070 + }, + { + "epoch": 0.8694943952675911, + "grad_norm": 1.5390625, + "learning_rate": 1.6233889444215042e-05, + "loss": 0.9122, + "step": 5071 + }, + { + "epoch": 0.869665859356581, + "grad_norm": 1.515625, + "learning_rate": 1.6232477265831344e-05, + "loss": 1.0136, + "step": 5072 + }, + { + "epoch": 0.8698373234455709, + "grad_norm": 1.640625, + "learning_rate": 1.62310648841831e-05, + "loss": 1.0216, + "step": 5073 + }, + { + "epoch": 0.8700087875345607, + "grad_norm": 1.4765625, + "learning_rate": 1.622965229931636e-05, + "loss": 0.9828, + "step": 5074 + }, + { + "epoch": 0.8701802516235506, + "grad_norm": 1.671875, + "learning_rate": 1.6228239511277207e-05, + "loss": 0.9817, + "step": 5075 + }, + { + "epoch": 0.8703517157125404, + "grad_norm": 1.6328125, + "learning_rate": 1.6226826520111704e-05, + "loss": 1.0222, + "step": 5076 + }, + { + "epoch": 0.8705231798015303, + "grad_norm": 1.6171875, + "learning_rate": 1.622541332586594e-05, + "loss": 1.089, + "step": 5077 + }, + { + "epoch": 0.8706946438905202, + "grad_norm": 1.5859375, + "learning_rate": 1.622399992858601e-05, + "loss": 0.9925, + "step": 5078 + }, + { + "epoch": 0.87086610797951, + "grad_norm": 1.546875, + "learning_rate": 1.6222586328317998e-05, + "loss": 1.0387, + "step": 5079 + }, + { + "epoch": 0.8710375720684999, + "grad_norm": 1.640625, + "learning_rate": 1.622117252510802e-05, + "loss": 1.0318, + "step": 5080 + }, + { + "epoch": 0.8712090361574898, + "grad_norm": 1.65625, + "learning_rate": 1.6219758519002174e-05, + "loss": 1.0202, + "step": 5081 + }, + { + "epoch": 0.8713805002464796, + "grad_norm": 1.5625, + "learning_rate": 1.6218344310046585e-05, + "loss": 0.9705, + "step": 5082 + }, + { + "epoch": 0.8715519643354694, + "grad_norm": 1.5234375, + "learning_rate": 1.621692989828737e-05, + "loss": 1.0089, + "step": 5083 + }, + { + "epoch": 0.8717234284244594, + "grad_norm": 1.578125, + "learning_rate": 1.6215515283770666e-05, + "loss": 0.9512, + "step": 5084 + }, + { + "epoch": 0.8718948925134492, + "grad_norm": 1.515625, + "learning_rate": 1.6214100466542598e-05, + "loss": 1.0778, + "step": 5085 + }, + { + "epoch": 0.872066356602439, + "grad_norm": 1.5546875, + "learning_rate": 1.6212685446649313e-05, + "loss": 1.0466, + "step": 5086 + }, + { + "epoch": 0.872237820691429, + "grad_norm": 1.5078125, + "learning_rate": 1.6211270224136962e-05, + "loss": 0.9975, + "step": 5087 + }, + { + "epoch": 0.8724092847804188, + "grad_norm": 1.5, + "learning_rate": 1.6209854799051695e-05, + "loss": 0.9996, + "step": 5088 + }, + { + "epoch": 0.8725807488694086, + "grad_norm": 1.6484375, + "learning_rate": 1.6208439171439688e-05, + "loss": 1.0921, + "step": 5089 + }, + { + "epoch": 0.8727522129583986, + "grad_norm": 1.65625, + "learning_rate": 
1.6207023341347094e-05, + "loss": 0.9706, + "step": 5090 + }, + { + "epoch": 0.8729236770473884, + "grad_norm": 1.5703125, + "learning_rate": 1.6205607308820097e-05, + "loss": 1.0387, + "step": 5091 + }, + { + "epoch": 0.8730951411363782, + "grad_norm": 1.6015625, + "learning_rate": 1.6204191073904877e-05, + "loss": 1.0514, + "step": 5092 + }, + { + "epoch": 0.8732666052253681, + "grad_norm": 1.6875, + "learning_rate": 1.6202774636647624e-05, + "loss": 1.0053, + "step": 5093 + }, + { + "epoch": 0.873438069314358, + "grad_norm": 1.5859375, + "learning_rate": 1.6201357997094534e-05, + "loss": 0.9727, + "step": 5094 + }, + { + "epoch": 0.8736095334033478, + "grad_norm": 1.515625, + "learning_rate": 1.6199941155291805e-05, + "loss": 1.0132, + "step": 5095 + }, + { + "epoch": 0.8737809974923377, + "grad_norm": 1.609375, + "learning_rate": 1.6198524111285648e-05, + "loss": 1.0101, + "step": 5096 + }, + { + "epoch": 0.8739524615813276, + "grad_norm": 1.5390625, + "learning_rate": 1.6197106865122282e-05, + "loss": 1.0265, + "step": 5097 + }, + { + "epoch": 0.8741239256703174, + "grad_norm": 1.546875, + "learning_rate": 1.619568941684792e-05, + "loss": 1.0285, + "step": 5098 + }, + { + "epoch": 0.8742953897593073, + "grad_norm": 1.4765625, + "learning_rate": 1.61942717665088e-05, + "loss": 0.9578, + "step": 5099 + }, + { + "epoch": 0.8744668538482971, + "grad_norm": 1.5078125, + "learning_rate": 1.6192853914151147e-05, + "loss": 1.0192, + "step": 5100 + }, + { + "epoch": 0.874638317937287, + "grad_norm": 1.5390625, + "learning_rate": 1.6191435859821215e-05, + "loss": 0.9872, + "step": 5101 + }, + { + "epoch": 0.8748097820262769, + "grad_norm": 1.59375, + "learning_rate": 1.6190017603565238e-05, + "loss": 1.0022, + "step": 5102 + }, + { + "epoch": 0.8749812461152667, + "grad_norm": 1.5234375, + "learning_rate": 1.618859914542948e-05, + "loss": 0.9653, + "step": 5103 + }, + { + "epoch": 0.8751527102042566, + "grad_norm": 1.578125, + "learning_rate": 1.61871804854602e-05, + "loss": 1.0534, + "step": 5104 + }, + { + "epoch": 0.8753241742932465, + "grad_norm": 1.6171875, + "learning_rate": 1.6185761623703668e-05, + "loss": 0.9797, + "step": 5105 + }, + { + "epoch": 0.8754956383822363, + "grad_norm": 1.6015625, + "learning_rate": 1.6184342560206153e-05, + "loss": 1.0583, + "step": 5106 + }, + { + "epoch": 0.8756671024712261, + "grad_norm": 1.5703125, + "learning_rate": 1.6182923295013935e-05, + "loss": 1.0972, + "step": 5107 + }, + { + "epoch": 0.8758385665602161, + "grad_norm": 1.5859375, + "learning_rate": 1.6181503828173314e-05, + "loss": 0.9856, + "step": 5108 + }, + { + "epoch": 0.8760100306492059, + "grad_norm": 1.53125, + "learning_rate": 1.618008415973057e-05, + "loss": 1.0702, + "step": 5109 + }, + { + "epoch": 0.8761814947381957, + "grad_norm": 1.640625, + "learning_rate": 1.6178664289732014e-05, + "loss": 1.0591, + "step": 5110 + }, + { + "epoch": 0.8763529588271857, + "grad_norm": 1.6328125, + "learning_rate": 1.6177244218223943e-05, + "loss": 0.9781, + "step": 5111 + }, + { + "epoch": 0.8765244229161755, + "grad_norm": 1.640625, + "learning_rate": 1.6175823945252684e-05, + "loss": 0.9926, + "step": 5112 + }, + { + "epoch": 0.8766958870051653, + "grad_norm": 1.46875, + "learning_rate": 1.6174403470864545e-05, + "loss": 0.951, + "step": 5113 + }, + { + "epoch": 0.8768673510941553, + "grad_norm": 1.4375, + "learning_rate": 1.6172982795105857e-05, + "loss": 0.9442, + "step": 5114 + }, + { + "epoch": 0.8770388151831451, + "grad_norm": 1.5703125, + "learning_rate": 1.6171561918022954e-05, + 
"loss": 0.9973, + "step": 5115 + }, + { + "epoch": 0.8772102792721349, + "grad_norm": 1.7109375, + "learning_rate": 1.6170140839662184e-05, + "loss": 0.9902, + "step": 5116 + }, + { + "epoch": 0.8773817433611248, + "grad_norm": 1.5859375, + "learning_rate": 1.6168719560069882e-05, + "loss": 1.1159, + "step": 5117 + }, + { + "epoch": 0.8775532074501147, + "grad_norm": 1.578125, + "learning_rate": 1.616729807929241e-05, + "loss": 0.9024, + "step": 5118 + }, + { + "epoch": 0.8777246715391045, + "grad_norm": 1.5234375, + "learning_rate": 1.6165876397376117e-05, + "loss": 0.9801, + "step": 5119 + }, + { + "epoch": 0.8778961356280944, + "grad_norm": 1.6953125, + "learning_rate": 1.616445451436738e-05, + "loss": 1.106, + "step": 5120 + }, + { + "epoch": 0.8780675997170843, + "grad_norm": 1.6875, + "learning_rate": 1.616303243031257e-05, + "loss": 0.9608, + "step": 5121 + }, + { + "epoch": 0.8782390638060741, + "grad_norm": 1.5390625, + "learning_rate": 1.6161610145258058e-05, + "loss": 1.0416, + "step": 5122 + }, + { + "epoch": 0.878410527895064, + "grad_norm": 1.6484375, + "learning_rate": 1.616018765925024e-05, + "loss": 1.0368, + "step": 5123 + }, + { + "epoch": 0.8785819919840538, + "grad_norm": 1.6640625, + "learning_rate": 1.6158764972335507e-05, + "loss": 0.9011, + "step": 5124 + }, + { + "epoch": 0.8787534560730437, + "grad_norm": 1.53125, + "learning_rate": 1.6157342084560258e-05, + "loss": 0.9417, + "step": 5125 + }, + { + "epoch": 0.8789249201620336, + "grad_norm": 1.5546875, + "learning_rate": 1.6155918995970892e-05, + "loss": 1.0066, + "step": 5126 + }, + { + "epoch": 0.8790963842510234, + "grad_norm": 1.5859375, + "learning_rate": 1.615449570661383e-05, + "loss": 1.0111, + "step": 5127 + }, + { + "epoch": 0.8792678483400133, + "grad_norm": 1.5546875, + "learning_rate": 1.6153072216535485e-05, + "loss": 1.0012, + "step": 5128 + }, + { + "epoch": 0.8794393124290032, + "grad_norm": 1.6640625, + "learning_rate": 1.6151648525782286e-05, + "loss": 1.0302, + "step": 5129 + }, + { + "epoch": 0.879610776517993, + "grad_norm": 1.6328125, + "learning_rate": 1.6150224634400666e-05, + "loss": 1.0521, + "step": 5130 + }, + { + "epoch": 0.8797822406069828, + "grad_norm": 1.625, + "learning_rate": 1.614880054243706e-05, + "loss": 1.0379, + "step": 5131 + }, + { + "epoch": 0.8799537046959728, + "grad_norm": 1.484375, + "learning_rate": 1.614737624993791e-05, + "loss": 0.9418, + "step": 5132 + }, + { + "epoch": 0.8801251687849626, + "grad_norm": 1.6796875, + "learning_rate": 1.6145951756949676e-05, + "loss": 1.0277, + "step": 5133 + }, + { + "epoch": 0.8802966328739524, + "grad_norm": 1.5859375, + "learning_rate": 1.6144527063518813e-05, + "loss": 0.9721, + "step": 5134 + }, + { + "epoch": 0.8804680969629424, + "grad_norm": 1.515625, + "learning_rate": 1.614310216969178e-05, + "loss": 1.0545, + "step": 5135 + }, + { + "epoch": 0.8806395610519322, + "grad_norm": 1.5859375, + "learning_rate": 1.6141677075515053e-05, + "loss": 1.0213, + "step": 5136 + }, + { + "epoch": 0.880811025140922, + "grad_norm": 1.5625, + "learning_rate": 1.614025178103511e-05, + "loss": 0.9924, + "step": 5137 + }, + { + "epoch": 0.880982489229912, + "grad_norm": 1.5546875, + "learning_rate": 1.6138826286298434e-05, + "loss": 1.0368, + "step": 5138 + }, + { + "epoch": 0.8811539533189018, + "grad_norm": 1.6328125, + "learning_rate": 1.6137400591351518e-05, + "loss": 1.032, + "step": 5139 + }, + { + "epoch": 0.8813254174078916, + "grad_norm": 1.53125, + "learning_rate": 1.6135974696240854e-05, + "loss": 1.0255, + "step": 5140 + }, 
+ { + "epoch": 0.8814968814968815, + "grad_norm": 1.6015625, + "learning_rate": 1.6134548601012954e-05, + "loss": 0.9538, + "step": 5141 + }, + { + "epoch": 0.8816683455858714, + "grad_norm": 1.53125, + "learning_rate": 1.613312230571432e-05, + "loss": 0.9656, + "step": 5142 + }, + { + "epoch": 0.8818398096748612, + "grad_norm": 1.515625, + "learning_rate": 1.613169581039147e-05, + "loss": 1.0057, + "step": 5143 + }, + { + "epoch": 0.8820112737638511, + "grad_norm": 1.5859375, + "learning_rate": 1.6130269115090936e-05, + "loss": 1.0336, + "step": 5144 + }, + { + "epoch": 0.882182737852841, + "grad_norm": 1.6328125, + "learning_rate": 1.6128842219859238e-05, + "loss": 1.1094, + "step": 5145 + }, + { + "epoch": 0.8823542019418308, + "grad_norm": 1.609375, + "learning_rate": 1.6127415124742917e-05, + "loss": 1.0314, + "step": 5146 + }, + { + "epoch": 0.8825256660308207, + "grad_norm": 1.59375, + "learning_rate": 1.6125987829788514e-05, + "loss": 0.9689, + "step": 5147 + }, + { + "epoch": 0.8826971301198105, + "grad_norm": 1.5390625, + "learning_rate": 1.612456033504258e-05, + "loss": 0.9291, + "step": 5148 + }, + { + "epoch": 0.8828685942088004, + "grad_norm": 1.59375, + "learning_rate": 1.6123132640551674e-05, + "loss": 0.9894, + "step": 5149 + }, + { + "epoch": 0.8830400582977903, + "grad_norm": 1.5703125, + "learning_rate": 1.6121704746362352e-05, + "loss": 0.965, + "step": 5150 + }, + { + "epoch": 0.8832115223867801, + "grad_norm": 39.5, + "learning_rate": 1.6120276652521185e-05, + "loss": 1.0894, + "step": 5151 + }, + { + "epoch": 0.88338298647577, + "grad_norm": 1.5859375, + "learning_rate": 1.6118848359074753e-05, + "loss": 1.0246, + "step": 5152 + }, + { + "epoch": 0.8835544505647599, + "grad_norm": 1.5234375, + "learning_rate": 1.611741986606963e-05, + "loss": 0.974, + "step": 5153 + }, + { + "epoch": 0.8837259146537497, + "grad_norm": 1.65625, + "learning_rate": 1.611599117355241e-05, + "loss": 1.0874, + "step": 5154 + }, + { + "epoch": 0.8838973787427395, + "grad_norm": 1.640625, + "learning_rate": 1.611456228156969e-05, + "loss": 1.0439, + "step": 5155 + }, + { + "epoch": 0.8840688428317295, + "grad_norm": 1.65625, + "learning_rate": 1.611313319016807e-05, + "loss": 0.9967, + "step": 5156 + }, + { + "epoch": 0.8842403069207193, + "grad_norm": 1.5703125, + "learning_rate": 1.611170389939415e-05, + "loss": 0.9404, + "step": 5157 + }, + { + "epoch": 0.8844117710097091, + "grad_norm": 1.5703125, + "learning_rate": 1.6110274409294556e-05, + "loss": 0.9975, + "step": 5158 + }, + { + "epoch": 0.884583235098699, + "grad_norm": 1.625, + "learning_rate": 1.6108844719915905e-05, + "loss": 1.0937, + "step": 5159 + }, + { + "epoch": 0.8847546991876889, + "grad_norm": 1.53125, + "learning_rate": 1.610741483130482e-05, + "loss": 1.0137, + "step": 5160 + }, + { + "epoch": 0.8849261632766787, + "grad_norm": 1.59375, + "learning_rate": 1.6105984743507944e-05, + "loss": 1.0278, + "step": 5161 + }, + { + "epoch": 0.8850976273656685, + "grad_norm": 1.5078125, + "learning_rate": 1.610455445657191e-05, + "loss": 0.9889, + "step": 5162 + }, + { + "epoch": 0.8852690914546585, + "grad_norm": 1.6328125, + "learning_rate": 1.6103123970543366e-05, + "loss": 0.9304, + "step": 5163 + }, + { + "epoch": 0.8854405555436483, + "grad_norm": 1.6328125, + "learning_rate": 1.6101693285468968e-05, + "loss": 1.0058, + "step": 5164 + }, + { + "epoch": 0.8856120196326381, + "grad_norm": 1.4921875, + "learning_rate": 1.6100262401395376e-05, + "loss": 1.0408, + "step": 5165 + }, + { + "epoch": 0.8857834837216281, + 
"grad_norm": 1.5234375, + "learning_rate": 1.6098831318369253e-05, + "loss": 1.0108, + "step": 5166 + }, + { + "epoch": 0.8859549478106179, + "grad_norm": 1.4921875, + "learning_rate": 1.6097400036437276e-05, + "loss": 0.9301, + "step": 5167 + }, + { + "epoch": 0.8861264118996077, + "grad_norm": 1.53125, + "learning_rate": 1.6095968555646128e-05, + "loss": 0.9718, + "step": 5168 + }, + { + "epoch": 0.8862978759885977, + "grad_norm": 1.7421875, + "learning_rate": 1.6094536876042486e-05, + "loss": 1.1003, + "step": 5169 + }, + { + "epoch": 0.8864693400775875, + "grad_norm": 1.6015625, + "learning_rate": 1.6093104997673045e-05, + "loss": 1.0531, + "step": 5170 + }, + { + "epoch": 0.8866408041665773, + "grad_norm": 1.609375, + "learning_rate": 1.6091672920584508e-05, + "loss": 1.0346, + "step": 5171 + }, + { + "epoch": 0.8868122682555672, + "grad_norm": 1.5625, + "learning_rate": 1.6090240644823577e-05, + "loss": 0.9871, + "step": 5172 + }, + { + "epoch": 0.8869837323445571, + "grad_norm": 1.6015625, + "learning_rate": 1.6088808170436964e-05, + "loss": 0.9438, + "step": 5173 + }, + { + "epoch": 0.8871551964335469, + "grad_norm": 1.578125, + "learning_rate": 1.608737549747139e-05, + "loss": 1.0102, + "step": 5174 + }, + { + "epoch": 0.8873266605225368, + "grad_norm": 1.59375, + "learning_rate": 1.6085942625973577e-05, + "loss": 1.0018, + "step": 5175 + }, + { + "epoch": 0.8874981246115267, + "grad_norm": 1.4609375, + "learning_rate": 1.6084509555990258e-05, + "loss": 0.9204, + "step": 5176 + }, + { + "epoch": 0.8876695887005165, + "grad_norm": 1.59375, + "learning_rate": 1.608307628756817e-05, + "loss": 1.0178, + "step": 5177 + }, + { + "epoch": 0.8878410527895064, + "grad_norm": 1.5625, + "learning_rate": 1.608164282075406e-05, + "loss": 0.9534, + "step": 5178 + }, + { + "epoch": 0.8880125168784962, + "grad_norm": 1.4921875, + "learning_rate": 1.608020915559467e-05, + "loss": 1.0153, + "step": 5179 + }, + { + "epoch": 0.8881839809674861, + "grad_norm": 1.6953125, + "learning_rate": 1.607877529213677e-05, + "loss": 1.0366, + "step": 5180 + }, + { + "epoch": 0.888355445056476, + "grad_norm": 1.5703125, + "learning_rate": 1.607734123042711e-05, + "loss": 1.0342, + "step": 5181 + }, + { + "epoch": 0.8885269091454658, + "grad_norm": 1.484375, + "learning_rate": 1.6075906970512475e-05, + "loss": 0.9779, + "step": 5182 + }, + { + "epoch": 0.8886983732344557, + "grad_norm": 1.578125, + "learning_rate": 1.607447251243963e-05, + "loss": 0.994, + "step": 5183 + }, + { + "epoch": 0.8888698373234456, + "grad_norm": 1.59375, + "learning_rate": 1.6073037856255362e-05, + "loss": 0.9917, + "step": 5184 + }, + { + "epoch": 0.8890413014124354, + "grad_norm": 1.5703125, + "learning_rate": 1.607160300200646e-05, + "loss": 1.0062, + "step": 5185 + }, + { + "epoch": 0.8892127655014253, + "grad_norm": 1.609375, + "learning_rate": 1.6070167949739724e-05, + "loss": 1.039, + "step": 5186 + }, + { + "epoch": 0.8893842295904152, + "grad_norm": 1.5234375, + "learning_rate": 1.606873269950195e-05, + "loss": 0.9748, + "step": 5187 + }, + { + "epoch": 0.889555693679405, + "grad_norm": 1.59375, + "learning_rate": 1.606729725133995e-05, + "loss": 0.9604, + "step": 5188 + }, + { + "epoch": 0.8897271577683948, + "grad_norm": 1.5390625, + "learning_rate": 1.606586160530054e-05, + "loss": 0.9808, + "step": 5189 + }, + { + "epoch": 0.8898986218573848, + "grad_norm": 1.5703125, + "learning_rate": 1.606442576143054e-05, + "loss": 0.894, + "step": 5190 + }, + { + "epoch": 0.8900700859463746, + "grad_norm": 1.546875, + "learning_rate": 
1.6062989719776782e-05, + "loss": 0.9603, + "step": 5191 + }, + { + "epoch": 0.8902415500353644, + "grad_norm": 1.5, + "learning_rate": 1.60615534803861e-05, + "loss": 0.9645, + "step": 5192 + }, + { + "epoch": 0.8904130141243544, + "grad_norm": 1.5, + "learning_rate": 1.6060117043305327e-05, + "loss": 0.9169, + "step": 5193 + }, + { + "epoch": 0.8905844782133442, + "grad_norm": 1.4765625, + "learning_rate": 1.605868040858132e-05, + "loss": 0.9474, + "step": 5194 + }, + { + "epoch": 0.890755942302334, + "grad_norm": 1.625, + "learning_rate": 1.605724357626093e-05, + "loss": 1.0531, + "step": 5195 + }, + { + "epoch": 0.890927406391324, + "grad_norm": 1.6328125, + "learning_rate": 1.605580654639102e-05, + "loss": 1.035, + "step": 5196 + }, + { + "epoch": 0.8910988704803138, + "grad_norm": 1.53125, + "learning_rate": 1.605436931901845e-05, + "loss": 1.0524, + "step": 5197 + }, + { + "epoch": 0.8912703345693036, + "grad_norm": 1.578125, + "learning_rate": 1.60529318941901e-05, + "loss": 1.1239, + "step": 5198 + }, + { + "epoch": 0.8914417986582935, + "grad_norm": 1.5078125, + "learning_rate": 1.6051494271952844e-05, + "loss": 1.0065, + "step": 5199 + }, + { + "epoch": 0.8916132627472834, + "grad_norm": 1.578125, + "learning_rate": 1.605005645235358e-05, + "loss": 0.9343, + "step": 5200 + }, + { + "epoch": 0.8917847268362732, + "grad_norm": 1.484375, + "learning_rate": 1.6048618435439184e-05, + "loss": 0.9976, + "step": 5201 + }, + { + "epoch": 0.8919561909252631, + "grad_norm": 1.53125, + "learning_rate": 1.604718022125657e-05, + "loss": 0.986, + "step": 5202 + }, + { + "epoch": 0.892127655014253, + "grad_norm": 1.4921875, + "learning_rate": 1.6045741809852636e-05, + "loss": 0.9586, + "step": 5203 + }, + { + "epoch": 0.8922991191032428, + "grad_norm": 1.625, + "learning_rate": 1.6044303201274293e-05, + "loss": 1.0265, + "step": 5204 + }, + { + "epoch": 0.8924705831922327, + "grad_norm": 1.515625, + "learning_rate": 1.6042864395568466e-05, + "loss": 0.9714, + "step": 5205 + }, + { + "epoch": 0.8926420472812225, + "grad_norm": 1.59375, + "learning_rate": 1.6041425392782073e-05, + "loss": 1.0083, + "step": 5206 + }, + { + "epoch": 0.8928135113702124, + "grad_norm": 1.5859375, + "learning_rate": 1.6039986192962048e-05, + "loss": 1.0321, + "step": 5207 + }, + { + "epoch": 0.8929849754592023, + "grad_norm": 1.6640625, + "learning_rate": 1.603854679615533e-05, + "loss": 1.0479, + "step": 5208 + }, + { + "epoch": 0.8931564395481921, + "grad_norm": 1.6875, + "learning_rate": 1.6037107202408862e-05, + "loss": 1.0948, + "step": 5209 + }, + { + "epoch": 0.893327903637182, + "grad_norm": 1.6015625, + "learning_rate": 1.6035667411769593e-05, + "loss": 1.0308, + "step": 5210 + }, + { + "epoch": 0.8934993677261719, + "grad_norm": 1.7578125, + "learning_rate": 1.6034227424284482e-05, + "loss": 1.0654, + "step": 5211 + }, + { + "epoch": 0.8936708318151617, + "grad_norm": 1.609375, + "learning_rate": 1.603278724000049e-05, + "loss": 0.976, + "step": 5212 + }, + { + "epoch": 0.8938422959041515, + "grad_norm": 1.6015625, + "learning_rate": 1.603134685896459e-05, + "loss": 1.0447, + "step": 5213 + }, + { + "epoch": 0.8940137599931415, + "grad_norm": 1.5390625, + "learning_rate": 1.602990628122376e-05, + "loss": 1.0081, + "step": 5214 + }, + { + "epoch": 0.8941852240821313, + "grad_norm": 1.6796875, + "learning_rate": 1.6028465506824978e-05, + "loss": 1.143, + "step": 5215 + }, + { + "epoch": 0.8943566881711211, + "grad_norm": 1.625, + "learning_rate": 1.6027024535815234e-05, + "loss": 1.0891, + "step": 5216 + }, + 
{ + "epoch": 0.8945281522601111, + "grad_norm": 1.5078125, + "learning_rate": 1.6025583368241524e-05, + "loss": 0.9675, + "step": 5217 + }, + { + "epoch": 0.8946996163491009, + "grad_norm": 1.4609375, + "learning_rate": 1.602414200415085e-05, + "loss": 0.9697, + "step": 5218 + }, + { + "epoch": 0.8948710804380907, + "grad_norm": 1.5546875, + "learning_rate": 1.602270044359022e-05, + "loss": 1.0339, + "step": 5219 + }, + { + "epoch": 0.8950425445270807, + "grad_norm": 1.515625, + "learning_rate": 1.6021258686606652e-05, + "loss": 1.018, + "step": 5220 + }, + { + "epoch": 0.8952140086160705, + "grad_norm": 1.78125, + "learning_rate": 1.6019816733247158e-05, + "loss": 0.9383, + "step": 5221 + }, + { + "epoch": 0.8953854727050603, + "grad_norm": 1.53125, + "learning_rate": 1.601837458355878e-05, + "loss": 0.9684, + "step": 5222 + }, + { + "epoch": 0.8955569367940502, + "grad_norm": 1.453125, + "learning_rate": 1.6016932237588537e-05, + "loss": 1.0065, + "step": 5223 + }, + { + "epoch": 0.8957284008830401, + "grad_norm": 1.6171875, + "learning_rate": 1.601548969538348e-05, + "loss": 0.9799, + "step": 5224 + }, + { + "epoch": 0.8958998649720299, + "grad_norm": 1.5625, + "learning_rate": 1.6014046956990653e-05, + "loss": 1.0463, + "step": 5225 + }, + { + "epoch": 0.8960713290610198, + "grad_norm": 1.5390625, + "learning_rate": 1.6012604022457104e-05, + "loss": 1.0254, + "step": 5226 + }, + { + "epoch": 0.8962427931500097, + "grad_norm": 1.5703125, + "learning_rate": 1.6011160891829898e-05, + "loss": 1.0115, + "step": 5227 + }, + { + "epoch": 0.8964142572389995, + "grad_norm": 1.578125, + "learning_rate": 1.60097175651561e-05, + "loss": 0.982, + "step": 5228 + }, + { + "epoch": 0.8965857213279894, + "grad_norm": 1.5234375, + "learning_rate": 1.600827404248278e-05, + "loss": 0.9934, + "step": 5229 + }, + { + "epoch": 0.8967571854169792, + "grad_norm": 1.5859375, + "learning_rate": 1.600683032385702e-05, + "loss": 1.0717, + "step": 5230 + }, + { + "epoch": 0.8969286495059691, + "grad_norm": 1.578125, + "learning_rate": 1.6005386409325906e-05, + "loss": 1.0007, + "step": 5231 + }, + { + "epoch": 0.897100113594959, + "grad_norm": 1.53125, + "learning_rate": 1.6003942298936524e-05, + "loss": 0.9075, + "step": 5232 + }, + { + "epoch": 0.8972715776839488, + "grad_norm": 1.515625, + "learning_rate": 1.6002497992735973e-05, + "loss": 0.9794, + "step": 5233 + }, + { + "epoch": 0.8974430417729387, + "grad_norm": 1.4765625, + "learning_rate": 1.600105349077136e-05, + "loss": 0.9492, + "step": 5234 + }, + { + "epoch": 0.8976145058619286, + "grad_norm": 1.546875, + "learning_rate": 1.5999608793089797e-05, + "loss": 0.9647, + "step": 5235 + }, + { + "epoch": 0.8977859699509184, + "grad_norm": 1.640625, + "learning_rate": 1.5998163899738398e-05, + "loss": 1.0268, + "step": 5236 + }, + { + "epoch": 0.8979574340399082, + "grad_norm": 1.546875, + "learning_rate": 1.5996718810764285e-05, + "loss": 1.0403, + "step": 5237 + }, + { + "epoch": 0.8981288981288982, + "grad_norm": 1.6328125, + "learning_rate": 1.5995273526214596e-05, + "loss": 0.9826, + "step": 5238 + }, + { + "epoch": 0.898300362217888, + "grad_norm": 1.5234375, + "learning_rate": 1.5993828046136454e-05, + "loss": 0.9027, + "step": 5239 + }, + { + "epoch": 0.8984718263068778, + "grad_norm": 1.578125, + "learning_rate": 1.5992382370577013e-05, + "loss": 0.9631, + "step": 5240 + }, + { + "epoch": 0.8986432903958678, + "grad_norm": 1.578125, + "learning_rate": 1.5990936499583415e-05, + "loss": 1.037, + "step": 5241 + }, + { + "epoch": 0.8988147544848576, + 
"grad_norm": 1.53125, + "learning_rate": 1.598949043320282e-05, + "loss": 1.0115, + "step": 5242 + }, + { + "epoch": 0.8989862185738474, + "grad_norm": 1.578125, + "learning_rate": 1.598804417148239e-05, + "loss": 0.9891, + "step": 5243 + }, + { + "epoch": 0.8991576826628374, + "grad_norm": 1.4609375, + "learning_rate": 1.5986597714469288e-05, + "loss": 1.0014, + "step": 5244 + }, + { + "epoch": 0.8993291467518272, + "grad_norm": 1.53125, + "learning_rate": 1.5985151062210687e-05, + "loss": 0.9914, + "step": 5245 + }, + { + "epoch": 0.899500610840817, + "grad_norm": 1.625, + "learning_rate": 1.5983704214753777e-05, + "loss": 1.0197, + "step": 5246 + }, + { + "epoch": 0.8996720749298069, + "grad_norm": 1.5, + "learning_rate": 1.5982257172145738e-05, + "loss": 0.9477, + "step": 5247 + }, + { + "epoch": 0.8998435390187968, + "grad_norm": 1.5703125, + "learning_rate": 1.598080993443377e-05, + "loss": 0.9749, + "step": 5248 + }, + { + "epoch": 0.9000150031077866, + "grad_norm": 1.5234375, + "learning_rate": 1.5979362501665062e-05, + "loss": 0.9667, + "step": 5249 + }, + { + "epoch": 0.9001864671967765, + "grad_norm": 1.5078125, + "learning_rate": 1.597791487388683e-05, + "loss": 1.0182, + "step": 5250 + }, + { + "epoch": 0.9003579312857664, + "grad_norm": 1.5234375, + "learning_rate": 1.5976467051146284e-05, + "loss": 1.0425, + "step": 5251 + }, + { + "epoch": 0.9005293953747562, + "grad_norm": 1.546875, + "learning_rate": 1.597501903349064e-05, + "loss": 1.0029, + "step": 5252 + }, + { + "epoch": 0.900700859463746, + "grad_norm": 1.484375, + "learning_rate": 1.5973570820967125e-05, + "loss": 1.0055, + "step": 5253 + }, + { + "epoch": 0.9008723235527359, + "grad_norm": 1.640625, + "learning_rate": 1.5972122413622972e-05, + "loss": 1.0582, + "step": 5254 + }, + { + "epoch": 0.9010437876417258, + "grad_norm": 1.578125, + "learning_rate": 1.597067381150542e-05, + "loss": 1.0042, + "step": 5255 + }, + { + "epoch": 0.9012152517307156, + "grad_norm": 1.7421875, + "learning_rate": 1.5969225014661708e-05, + "loss": 1.101, + "step": 5256 + }, + { + "epoch": 0.9013867158197055, + "grad_norm": 1.484375, + "learning_rate": 1.5967776023139094e-05, + "loss": 0.9062, + "step": 5257 + }, + { + "epoch": 0.9015581799086954, + "grad_norm": 1.5859375, + "learning_rate": 1.596632683698483e-05, + "loss": 1.0323, + "step": 5258 + }, + { + "epoch": 0.9017296439976852, + "grad_norm": 1.6171875, + "learning_rate": 1.596487745624618e-05, + "loss": 1.0726, + "step": 5259 + }, + { + "epoch": 0.9019011080866751, + "grad_norm": 1.5859375, + "learning_rate": 1.5963427880970414e-05, + "loss": 1.0093, + "step": 5260 + }, + { + "epoch": 0.9020725721756649, + "grad_norm": 1.53125, + "learning_rate": 1.596197811120481e-05, + "loss": 0.9529, + "step": 5261 + }, + { + "epoch": 0.9022440362646548, + "grad_norm": 1.546875, + "learning_rate": 1.596052814699665e-05, + "loss": 0.9538, + "step": 5262 + }, + { + "epoch": 0.9024155003536447, + "grad_norm": 1.546875, + "learning_rate": 1.595907798839322e-05, + "loss": 1.0119, + "step": 5263 + }, + { + "epoch": 0.9025869644426345, + "grad_norm": 1.5390625, + "learning_rate": 1.5957627635441815e-05, + "loss": 0.9586, + "step": 5264 + }, + { + "epoch": 0.9027584285316244, + "grad_norm": 1.5625, + "learning_rate": 1.5956177088189742e-05, + "loss": 0.9865, + "step": 5265 + }, + { + "epoch": 0.9029298926206143, + "grad_norm": 1.6171875, + "learning_rate": 1.5954726346684303e-05, + "loss": 1.0195, + "step": 5266 + }, + { + "epoch": 0.9031013567096041, + "grad_norm": 1.5703125, + "learning_rate": 
1.5953275410972817e-05, + "loss": 1.0341, + "step": 5267 + }, + { + "epoch": 0.903272820798594, + "grad_norm": 1.4765625, + "learning_rate": 1.59518242811026e-05, + "loss": 0.974, + "step": 5268 + }, + { + "epoch": 0.9034442848875839, + "grad_norm": 1.5, + "learning_rate": 1.595037295712098e-05, + "loss": 0.9357, + "step": 5269 + }, + { + "epoch": 0.9036157489765737, + "grad_norm": 1.546875, + "learning_rate": 1.5948921439075293e-05, + "loss": 1.0212, + "step": 5270 + }, + { + "epoch": 0.9037872130655635, + "grad_norm": 1.578125, + "learning_rate": 1.5947469727012876e-05, + "loss": 0.9645, + "step": 5271 + }, + { + "epoch": 0.9039586771545535, + "grad_norm": 1.5078125, + "learning_rate": 1.5946017820981073e-05, + "loss": 0.9818, + "step": 5272 + }, + { + "epoch": 0.9041301412435433, + "grad_norm": 1.5390625, + "learning_rate": 1.5944565721027243e-05, + "loss": 1.0271, + "step": 5273 + }, + { + "epoch": 0.9043016053325331, + "grad_norm": 1.5234375, + "learning_rate": 1.5943113427198735e-05, + "loss": 0.9747, + "step": 5274 + }, + { + "epoch": 0.9044730694215231, + "grad_norm": 1.546875, + "learning_rate": 1.594166093954292e-05, + "loss": 0.9688, + "step": 5275 + }, + { + "epoch": 0.9046445335105129, + "grad_norm": 1.625, + "learning_rate": 1.5940208258107164e-05, + "loss": 0.9406, + "step": 5276 + }, + { + "epoch": 0.9048159975995027, + "grad_norm": 1.5546875, + "learning_rate": 1.5938755382938853e-05, + "loss": 0.9435, + "step": 5277 + }, + { + "epoch": 0.9049874616884926, + "grad_norm": 1.5390625, + "learning_rate": 1.5937302314085363e-05, + "loss": 0.9882, + "step": 5278 + }, + { + "epoch": 0.9051589257774825, + "grad_norm": 1.5546875, + "learning_rate": 1.5935849051594086e-05, + "loss": 0.9744, + "step": 5279 + }, + { + "epoch": 0.9053303898664723, + "grad_norm": 1.515625, + "learning_rate": 1.5934395595512423e-05, + "loss": 0.9818, + "step": 5280 + }, + { + "epoch": 0.9055018539554622, + "grad_norm": 1.671875, + "learning_rate": 1.593294194588777e-05, + "loss": 1.0195, + "step": 5281 + }, + { + "epoch": 0.9056733180444521, + "grad_norm": 1.6015625, + "learning_rate": 1.593148810276754e-05, + "loss": 1.0049, + "step": 5282 + }, + { + "epoch": 0.9058447821334419, + "grad_norm": 1.71875, + "learning_rate": 1.5930034066199146e-05, + "loss": 1.0413, + "step": 5283 + }, + { + "epoch": 0.9060162462224318, + "grad_norm": 1.4921875, + "learning_rate": 1.5928579836230012e-05, + "loss": 0.9894, + "step": 5284 + }, + { + "epoch": 0.9061877103114216, + "grad_norm": 1.546875, + "learning_rate": 1.5927125412907563e-05, + "loss": 0.9478, + "step": 5285 + }, + { + "epoch": 0.9063591744004115, + "grad_norm": 1.5234375, + "learning_rate": 1.5925670796279235e-05, + "loss": 0.9541, + "step": 5286 + }, + { + "epoch": 0.9065306384894014, + "grad_norm": 1.5703125, + "learning_rate": 1.592421598639247e-05, + "loss": 0.969, + "step": 5287 + }, + { + "epoch": 0.9067021025783912, + "grad_norm": 1.5078125, + "learning_rate": 1.592276098329471e-05, + "loss": 0.9962, + "step": 5288 + }, + { + "epoch": 0.9068735666673811, + "grad_norm": 1.5390625, + "learning_rate": 1.5921305787033416e-05, + "loss": 1.0632, + "step": 5289 + }, + { + "epoch": 0.907045030756371, + "grad_norm": 1.609375, + "learning_rate": 1.5919850397656036e-05, + "loss": 0.9742, + "step": 5290 + }, + { + "epoch": 0.9072164948453608, + "grad_norm": 1.515625, + "learning_rate": 1.591839481521005e-05, + "loss": 1.0608, + "step": 5291 + }, + { + "epoch": 0.9073879589343506, + "grad_norm": 1.609375, + "learning_rate": 1.5916939039742915e-05, + "loss": 
0.9504, + "step": 5292 + }, + { + "epoch": 0.9075594230233406, + "grad_norm": 1.609375, + "learning_rate": 1.591548307130212e-05, + "loss": 0.9848, + "step": 5293 + }, + { + "epoch": 0.9077308871123304, + "grad_norm": 1.578125, + "learning_rate": 1.591402690993515e-05, + "loss": 1.0109, + "step": 5294 + }, + { + "epoch": 0.9079023512013202, + "grad_norm": 1.5625, + "learning_rate": 1.591257055568949e-05, + "loss": 1.0148, + "step": 5295 + }, + { + "epoch": 0.9080738152903102, + "grad_norm": 1.5546875, + "learning_rate": 1.5911114008612637e-05, + "loss": 1.0152, + "step": 5296 + }, + { + "epoch": 0.9082452793793, + "grad_norm": 1.578125, + "learning_rate": 1.5909657268752096e-05, + "loss": 1.0008, + "step": 5297 + }, + { + "epoch": 0.9084167434682898, + "grad_norm": 1.65625, + "learning_rate": 1.590820033615538e-05, + "loss": 1.0524, + "step": 5298 + }, + { + "epoch": 0.9085882075572798, + "grad_norm": 1.546875, + "learning_rate": 1.5906743210870007e-05, + "loss": 0.9841, + "step": 5299 + }, + { + "epoch": 0.9087596716462696, + "grad_norm": 1.6171875, + "learning_rate": 1.590528589294349e-05, + "loss": 0.9474, + "step": 5300 + }, + { + "epoch": 0.9089311357352594, + "grad_norm": 1.8046875, + "learning_rate": 1.5903828382423366e-05, + "loss": 1.0609, + "step": 5301 + }, + { + "epoch": 0.9091025998242493, + "grad_norm": 1.546875, + "learning_rate": 1.5902370679357165e-05, + "loss": 0.9618, + "step": 5302 + }, + { + "epoch": 0.9092740639132392, + "grad_norm": 1.5, + "learning_rate": 1.5900912783792426e-05, + "loss": 0.9268, + "step": 5303 + }, + { + "epoch": 0.909445528002229, + "grad_norm": 1.578125, + "learning_rate": 1.5899454695776705e-05, + "loss": 1.0258, + "step": 5304 + }, + { + "epoch": 0.9096169920912189, + "grad_norm": 1.546875, + "learning_rate": 1.589799641535755e-05, + "loss": 0.9598, + "step": 5305 + }, + { + "epoch": 0.9097884561802088, + "grad_norm": 1.703125, + "learning_rate": 1.5896537942582523e-05, + "loss": 1.0599, + "step": 5306 + }, + { + "epoch": 0.9099599202691986, + "grad_norm": 1.5703125, + "learning_rate": 1.589507927749919e-05, + "loss": 0.9962, + "step": 5307 + }, + { + "epoch": 0.9101313843581885, + "grad_norm": 1.5703125, + "learning_rate": 1.589362042015512e-05, + "loss": 1.0258, + "step": 5308 + }, + { + "epoch": 0.9103028484471783, + "grad_norm": 1.5625, + "learning_rate": 1.5892161370597897e-05, + "loss": 0.9494, + "step": 5309 + }, + { + "epoch": 0.9104743125361682, + "grad_norm": 1.5625, + "learning_rate": 1.5890702128875103e-05, + "loss": 1.0038, + "step": 5310 + }, + { + "epoch": 0.9106457766251581, + "grad_norm": 1.6484375, + "learning_rate": 1.588924269503433e-05, + "loss": 0.9653, + "step": 5311 + }, + { + "epoch": 0.9108172407141479, + "grad_norm": 1.6640625, + "learning_rate": 1.5887783069123178e-05, + "loss": 1.0415, + "step": 5312 + }, + { + "epoch": 0.9109887048031378, + "grad_norm": 1.4921875, + "learning_rate": 1.5886323251189247e-05, + "loss": 0.9358, + "step": 5313 + }, + { + "epoch": 0.9111601688921277, + "grad_norm": 1.5546875, + "learning_rate": 1.5884863241280147e-05, + "loss": 0.9164, + "step": 5314 + }, + { + "epoch": 0.9113316329811175, + "grad_norm": 1.5234375, + "learning_rate": 1.5883403039443498e-05, + "loss": 0.9586, + "step": 5315 + }, + { + "epoch": 0.9115030970701073, + "grad_norm": 1.5546875, + "learning_rate": 1.5881942645726924e-05, + "loss": 1.0247, + "step": 5316 + }, + { + "epoch": 0.9116745611590973, + "grad_norm": 1.4765625, + "learning_rate": 1.5880482060178048e-05, + "loss": 0.9309, + "step": 5317 + }, + { + "epoch": 
0.9118460252480871, + "grad_norm": 1.53125, + "learning_rate": 1.587902128284451e-05, + "loss": 0.9456, + "step": 5318 + }, + { + "epoch": 0.9120174893370769, + "grad_norm": 1.6328125, + "learning_rate": 1.5877560313773946e-05, + "loss": 0.9501, + "step": 5319 + }, + { + "epoch": 0.9121889534260669, + "grad_norm": 1.609375, + "learning_rate": 1.5876099153014008e-05, + "loss": 0.9963, + "step": 5320 + }, + { + "epoch": 0.9123604175150567, + "grad_norm": 1.53125, + "learning_rate": 1.587463780061235e-05, + "loss": 0.9477, + "step": 5321 + }, + { + "epoch": 0.9125318816040465, + "grad_norm": 1.5078125, + "learning_rate": 1.5873176256616634e-05, + "loss": 0.9406, + "step": 5322 + }, + { + "epoch": 0.9127033456930365, + "grad_norm": 1.5625, + "learning_rate": 1.587171452107452e-05, + "loss": 0.9296, + "step": 5323 + }, + { + "epoch": 0.9128748097820263, + "grad_norm": 1.578125, + "learning_rate": 1.5870252594033687e-05, + "loss": 1.0263, + "step": 5324 + }, + { + "epoch": 0.9130462738710161, + "grad_norm": 1.4921875, + "learning_rate": 1.5868790475541812e-05, + "loss": 0.934, + "step": 5325 + }, + { + "epoch": 0.913217737960006, + "grad_norm": 1.59375, + "learning_rate": 1.5867328165646583e-05, + "loss": 1.0729, + "step": 5326 + }, + { + "epoch": 0.9133892020489959, + "grad_norm": 1.546875, + "learning_rate": 1.5865865664395684e-05, + "loss": 0.9704, + "step": 5327 + }, + { + "epoch": 0.9135606661379857, + "grad_norm": 1.65625, + "learning_rate": 1.5864402971836813e-05, + "loss": 1.025, + "step": 5328 + }, + { + "epoch": 0.9137321302269756, + "grad_norm": 1.5546875, + "learning_rate": 1.5862940088017683e-05, + "loss": 1.0173, + "step": 5329 + }, + { + "epoch": 0.9139035943159655, + "grad_norm": 1.5703125, + "learning_rate": 1.5861477012986002e-05, + "loss": 1.0344, + "step": 5330 + }, + { + "epoch": 0.9140750584049553, + "grad_norm": 1.6171875, + "learning_rate": 1.5860013746789477e-05, + "loss": 0.9556, + "step": 5331 + }, + { + "epoch": 0.9142465224939452, + "grad_norm": 1.6171875, + "learning_rate": 1.585855028947584e-05, + "loss": 0.9806, + "step": 5332 + }, + { + "epoch": 0.914417986582935, + "grad_norm": 1.625, + "learning_rate": 1.5857086641092817e-05, + "loss": 0.9937, + "step": 5333 + }, + { + "epoch": 0.9145894506719249, + "grad_norm": 1.6171875, + "learning_rate": 1.585562280168814e-05, + "loss": 1.0304, + "step": 5334 + }, + { + "epoch": 0.9147609147609148, + "grad_norm": 1.578125, + "learning_rate": 1.5854158771309557e-05, + "loss": 1.0163, + "step": 5335 + }, + { + "epoch": 0.9149323788499046, + "grad_norm": 1.5703125, + "learning_rate": 1.585269455000481e-05, + "loss": 0.9672, + "step": 5336 + }, + { + "epoch": 0.9151038429388945, + "grad_norm": 1.5625, + "learning_rate": 1.5851230137821656e-05, + "loss": 1.0051, + "step": 5337 + }, + { + "epoch": 0.9152753070278844, + "grad_norm": 1.625, + "learning_rate": 1.584976553480785e-05, + "loss": 1.0918, + "step": 5338 + }, + { + "epoch": 0.9154467711168742, + "grad_norm": 1.5390625, + "learning_rate": 1.5848300741011163e-05, + "loss": 0.9945, + "step": 5339 + }, + { + "epoch": 0.915618235205864, + "grad_norm": 1.5703125, + "learning_rate": 1.5846835756479367e-05, + "loss": 0.9911, + "step": 5340 + }, + { + "epoch": 0.915789699294854, + "grad_norm": 1.5859375, + "learning_rate": 1.5845370581260243e-05, + "loss": 1.0362, + "step": 5341 + }, + { + "epoch": 0.9159611633838438, + "grad_norm": 1.703125, + "learning_rate": 1.5843905215401566e-05, + "loss": 1.0358, + "step": 5342 + }, + { + "epoch": 0.9161326274728336, + "grad_norm": 
1.5234375, + "learning_rate": 1.5842439658951137e-05, + "loss": 1.0377, + "step": 5343 + }, + { + "epoch": 0.9163040915618236, + "grad_norm": 1.671875, + "learning_rate": 1.584097391195675e-05, + "loss": 1.104, + "step": 5344 + }, + { + "epoch": 0.9164755556508134, + "grad_norm": 1.515625, + "learning_rate": 1.583950797446621e-05, + "loss": 0.8953, + "step": 5345 + }, + { + "epoch": 0.9166470197398032, + "grad_norm": 1.6796875, + "learning_rate": 1.5838041846527325e-05, + "loss": 1.0442, + "step": 5346 + }, + { + "epoch": 0.9168184838287932, + "grad_norm": 1.5703125, + "learning_rate": 1.583657552818791e-05, + "loss": 0.9812, + "step": 5347 + }, + { + "epoch": 0.916989947917783, + "grad_norm": 1.640625, + "learning_rate": 1.583510901949579e-05, + "loss": 1.0907, + "step": 5348 + }, + { + "epoch": 0.9171614120067728, + "grad_norm": 1.6171875, + "learning_rate": 1.583364232049879e-05, + "loss": 0.9746, + "step": 5349 + }, + { + "epoch": 0.9173328760957626, + "grad_norm": 1.546875, + "learning_rate": 1.583217543124475e-05, + "loss": 1.0124, + "step": 5350 + }, + { + "epoch": 0.9175043401847526, + "grad_norm": 1.5546875, + "learning_rate": 1.5830708351781507e-05, + "loss": 1.044, + "step": 5351 + }, + { + "epoch": 0.9176758042737424, + "grad_norm": 1.5859375, + "learning_rate": 1.5829241082156907e-05, + "loss": 1.047, + "step": 5352 + }, + { + "epoch": 0.9178472683627322, + "grad_norm": 1.484375, + "learning_rate": 1.58277736224188e-05, + "loss": 0.946, + "step": 5353 + }, + { + "epoch": 0.9180187324517222, + "grad_norm": 1.6328125, + "learning_rate": 1.5826305972615053e-05, + "loss": 1.0134, + "step": 5354 + }, + { + "epoch": 0.918190196540712, + "grad_norm": 1.609375, + "learning_rate": 1.582483813279353e-05, + "loss": 1.0489, + "step": 5355 + }, + { + "epoch": 0.9183616606297018, + "grad_norm": 1.6796875, + "learning_rate": 1.58233701030021e-05, + "loss": 1.0232, + "step": 5356 + }, + { + "epoch": 0.9185331247186918, + "grad_norm": 1.5703125, + "learning_rate": 1.5821901883288642e-05, + "loss": 1.0153, + "step": 5357 + }, + { + "epoch": 0.9187045888076816, + "grad_norm": 1.625, + "learning_rate": 1.582043347370104e-05, + "loss": 1.0195, + "step": 5358 + }, + { + "epoch": 0.9188760528966714, + "grad_norm": 1.5859375, + "learning_rate": 1.5818964874287185e-05, + "loss": 1.0454, + "step": 5359 + }, + { + "epoch": 0.9190475169856613, + "grad_norm": 1.65625, + "learning_rate": 1.5817496085094974e-05, + "loss": 1.0176, + "step": 5360 + }, + { + "epoch": 0.9192189810746512, + "grad_norm": 1.6328125, + "learning_rate": 1.5816027106172307e-05, + "loss": 0.9667, + "step": 5361 + }, + { + "epoch": 0.919390445163641, + "grad_norm": 1.515625, + "learning_rate": 1.5814557937567097e-05, + "loss": 1.0203, + "step": 5362 + }, + { + "epoch": 0.9195619092526309, + "grad_norm": 1.59375, + "learning_rate": 1.5813088579327256e-05, + "loss": 1.0232, + "step": 5363 + }, + { + "epoch": 0.9197333733416208, + "grad_norm": 1.5234375, + "learning_rate": 1.5811619031500706e-05, + "loss": 0.9786, + "step": 5364 + }, + { + "epoch": 0.9199048374306106, + "grad_norm": 1.703125, + "learning_rate": 1.5810149294135376e-05, + "loss": 1.0583, + "step": 5365 + }, + { + "epoch": 0.9200763015196005, + "grad_norm": 1.625, + "learning_rate": 1.5808679367279197e-05, + "loss": 1.0685, + "step": 5366 + }, + { + "epoch": 0.9202477656085903, + "grad_norm": 1.6171875, + "learning_rate": 1.5807209250980112e-05, + "loss": 1.0277, + "step": 5367 + }, + { + "epoch": 0.9204192296975802, + "grad_norm": 1.5234375, + "learning_rate": 
1.5805738945286066e-05, + "loss": 0.9964, + "step": 5368 + }, + { + "epoch": 0.9205906937865701, + "grad_norm": 1.640625, + "learning_rate": 1.580426845024501e-05, + "loss": 1.0182, + "step": 5369 + }, + { + "epoch": 0.9207621578755599, + "grad_norm": 1.6015625, + "learning_rate": 1.5802797765904903e-05, + "loss": 1.051, + "step": 5370 + }, + { + "epoch": 0.9209336219645498, + "grad_norm": 1.515625, + "learning_rate": 1.5801326892313707e-05, + "loss": 1.0433, + "step": 5371 + }, + { + "epoch": 0.9211050860535397, + "grad_norm": 1.5625, + "learning_rate": 1.57998558295194e-05, + "loss": 0.9663, + "step": 5372 + }, + { + "epoch": 0.9212765501425295, + "grad_norm": 1.65625, + "learning_rate": 1.579838457756995e-05, + "loss": 1.055, + "step": 5373 + }, + { + "epoch": 0.9214480142315193, + "grad_norm": 1.4921875, + "learning_rate": 1.579691313651335e-05, + "loss": 1.0098, + "step": 5374 + }, + { + "epoch": 0.9216194783205093, + "grad_norm": 1.46875, + "learning_rate": 1.5795441506397574e-05, + "loss": 0.9559, + "step": 5375 + }, + { + "epoch": 0.9217909424094991, + "grad_norm": 1.5390625, + "learning_rate": 1.5793969687270634e-05, + "loss": 0.9153, + "step": 5376 + }, + { + "epoch": 0.9219624064984889, + "grad_norm": 1.4921875, + "learning_rate": 1.5792497679180524e-05, + "loss": 0.9999, + "step": 5377 + }, + { + "epoch": 0.9221338705874789, + "grad_norm": 1.5078125, + "learning_rate": 1.5791025482175247e-05, + "loss": 0.9207, + "step": 5378 + }, + { + "epoch": 0.9223053346764687, + "grad_norm": 1.59375, + "learning_rate": 1.578955309630283e-05, + "loss": 0.9692, + "step": 5379 + }, + { + "epoch": 0.9224767987654585, + "grad_norm": 1.609375, + "learning_rate": 1.578808052161128e-05, + "loss": 1.0302, + "step": 5380 + }, + { + "epoch": 0.9226482628544485, + "grad_norm": 1.65625, + "learning_rate": 1.5786607758148628e-05, + "loss": 1.007, + "step": 5381 + }, + { + "epoch": 0.9228197269434383, + "grad_norm": 1.546875, + "learning_rate": 1.5785134805962907e-05, + "loss": 0.9924, + "step": 5382 + }, + { + "epoch": 0.9229911910324281, + "grad_norm": 1.5625, + "learning_rate": 1.578366166510216e-05, + "loss": 1.0786, + "step": 5383 + }, + { + "epoch": 0.923162655121418, + "grad_norm": 1.6875, + "learning_rate": 1.578218833561442e-05, + "loss": 1.1266, + "step": 5384 + }, + { + "epoch": 0.9233341192104079, + "grad_norm": 1.609375, + "learning_rate": 1.5780714817547745e-05, + "loss": 0.9992, + "step": 5385 + }, + { + "epoch": 0.9235055832993977, + "grad_norm": 1.6015625, + "learning_rate": 1.5779241110950195e-05, + "loss": 1.0309, + "step": 5386 + }, + { + "epoch": 0.9236770473883876, + "grad_norm": 1.5, + "learning_rate": 1.577776721586983e-05, + "loss": 0.9404, + "step": 5387 + }, + { + "epoch": 0.9238485114773775, + "grad_norm": 1.671875, + "learning_rate": 1.577629313235472e-05, + "loss": 0.9784, + "step": 5388 + }, + { + "epoch": 0.9240199755663673, + "grad_norm": 1.546875, + "learning_rate": 1.5774818860452933e-05, + "loss": 0.9775, + "step": 5389 + }, + { + "epoch": 0.9241914396553572, + "grad_norm": 1.6640625, + "learning_rate": 1.577334440021256e-05, + "loss": 0.9955, + "step": 5390 + }, + { + "epoch": 0.924362903744347, + "grad_norm": 1.65625, + "learning_rate": 1.577186975168169e-05, + "loss": 0.9753, + "step": 5391 + }, + { + "epoch": 0.9245343678333369, + "grad_norm": 1.546875, + "learning_rate": 1.577039491490841e-05, + "loss": 1.0302, + "step": 5392 + }, + { + "epoch": 0.9247058319223268, + "grad_norm": 1.5234375, + "learning_rate": 1.576891988994082e-05, + "loss": 0.9805, + "step": 5393 
+ }, + { + "epoch": 0.9248772960113166, + "grad_norm": 1.640625, + "learning_rate": 1.576744467682703e-05, + "loss": 1.0489, + "step": 5394 + }, + { + "epoch": 0.9250487601003065, + "grad_norm": 1.640625, + "learning_rate": 1.5765969275615153e-05, + "loss": 0.9113, + "step": 5395 + }, + { + "epoch": 0.9252202241892964, + "grad_norm": 1.484375, + "learning_rate": 1.5764493686353307e-05, + "loss": 1.0295, + "step": 5396 + }, + { + "epoch": 0.9253916882782862, + "grad_norm": 1.53125, + "learning_rate": 1.5763017909089608e-05, + "loss": 0.9964, + "step": 5397 + }, + { + "epoch": 0.925563152367276, + "grad_norm": 1.65625, + "learning_rate": 1.57615419438722e-05, + "loss": 0.958, + "step": 5398 + }, + { + "epoch": 0.925734616456266, + "grad_norm": 1.5859375, + "learning_rate": 1.576006579074921e-05, + "loss": 0.9351, + "step": 5399 + }, + { + "epoch": 0.9259060805452558, + "grad_norm": 1.5703125, + "learning_rate": 1.5758589449768784e-05, + "loss": 0.9385, + "step": 5400 + }, + { + "epoch": 0.9260775446342456, + "grad_norm": 1.578125, + "learning_rate": 1.575711292097907e-05, + "loss": 1.0104, + "step": 5401 + }, + { + "epoch": 0.9262490087232356, + "grad_norm": 1.71875, + "learning_rate": 1.5755636204428228e-05, + "loss": 1.0488, + "step": 5402 + }, + { + "epoch": 0.9264204728122254, + "grad_norm": 1.5390625, + "learning_rate": 1.5754159300164414e-05, + "loss": 0.9892, + "step": 5403 + }, + { + "epoch": 0.9265919369012152, + "grad_norm": 1.546875, + "learning_rate": 1.57526822082358e-05, + "loss": 1.0528, + "step": 5404 + }, + { + "epoch": 0.9267634009902052, + "grad_norm": 1.6640625, + "learning_rate": 1.5751204928690552e-05, + "loss": 1.0059, + "step": 5405 + }, + { + "epoch": 0.926934865079195, + "grad_norm": 1.4140625, + "learning_rate": 1.5749727461576852e-05, + "loss": 0.9006, + "step": 5406 + }, + { + "epoch": 0.9271063291681848, + "grad_norm": 1.578125, + "learning_rate": 1.5748249806942895e-05, + "loss": 1.0689, + "step": 5407 + }, + { + "epoch": 0.9272777932571747, + "grad_norm": 1.7734375, + "learning_rate": 1.5746771964836864e-05, + "loss": 0.9228, + "step": 5408 + }, + { + "epoch": 0.9274492573461646, + "grad_norm": 1.6328125, + "learning_rate": 1.5745293935306954e-05, + "loss": 1.0405, + "step": 5409 + }, + { + "epoch": 0.9276207214351544, + "grad_norm": 1.640625, + "learning_rate": 1.574381571840138e-05, + "loss": 1.0065, + "step": 5410 + }, + { + "epoch": 0.9277921855241443, + "grad_norm": 1.5546875, + "learning_rate": 1.5742337314168343e-05, + "loss": 1.0412, + "step": 5411 + }, + { + "epoch": 0.9279636496131342, + "grad_norm": 1.5234375, + "learning_rate": 1.574085872265606e-05, + "loss": 1.0683, + "step": 5412 + }, + { + "epoch": 0.928135113702124, + "grad_norm": 1.5625, + "learning_rate": 1.573937994391276e-05, + "loss": 1.0458, + "step": 5413 + }, + { + "epoch": 0.9283065777911139, + "grad_norm": 1.5390625, + "learning_rate": 1.5737900977986667e-05, + "loss": 0.9001, + "step": 5414 + }, + { + "epoch": 0.9284780418801037, + "grad_norm": 1.59375, + "learning_rate": 1.5736421824926016e-05, + "loss": 1.0702, + "step": 5415 + }, + { + "epoch": 0.9286495059690936, + "grad_norm": 1.46875, + "learning_rate": 1.573494248477905e-05, + "loss": 0.9452, + "step": 5416 + }, + { + "epoch": 0.9288209700580835, + "grad_norm": 1.5703125, + "learning_rate": 1.573346295759401e-05, + "loss": 0.9938, + "step": 5417 + }, + { + "epoch": 0.9289924341470733, + "grad_norm": 1.640625, + "learning_rate": 1.5731983243419154e-05, + "loss": 1.0848, + "step": 5418 + }, + { + "epoch": 0.9291638982360632, + 
"grad_norm": 1.5390625, + "learning_rate": 1.573050334230274e-05, + "loss": 1.0467, + "step": 5419 + }, + { + "epoch": 0.9293353623250531, + "grad_norm": 1.625, + "learning_rate": 1.5729023254293034e-05, + "loss": 0.9948, + "step": 5420 + }, + { + "epoch": 0.9295068264140429, + "grad_norm": 1.6171875, + "learning_rate": 1.5727542979438305e-05, + "loss": 0.968, + "step": 5421 + }, + { + "epoch": 0.9296782905030327, + "grad_norm": 1.5625, + "learning_rate": 1.5726062517786834e-05, + "loss": 1.0831, + "step": 5422 + }, + { + "epoch": 0.9298497545920227, + "grad_norm": 1.5234375, + "learning_rate": 1.57245818693869e-05, + "loss": 1.0289, + "step": 5423 + }, + { + "epoch": 0.9300212186810125, + "grad_norm": 1.5546875, + "learning_rate": 1.5723101034286794e-05, + "loss": 1.0607, + "step": 5424 + }, + { + "epoch": 0.9301926827700023, + "grad_norm": 1.5234375, + "learning_rate": 1.572162001253481e-05, + "loss": 1.0166, + "step": 5425 + }, + { + "epoch": 0.9303641468589923, + "grad_norm": 1.6640625, + "learning_rate": 1.5720138804179256e-05, + "loss": 1.0684, + "step": 5426 + }, + { + "epoch": 0.9305356109479821, + "grad_norm": 1.625, + "learning_rate": 1.5718657409268436e-05, + "loss": 1.0084, + "step": 5427 + }, + { + "epoch": 0.9307070750369719, + "grad_norm": 1.46875, + "learning_rate": 1.5717175827850657e-05, + "loss": 1.0336, + "step": 5428 + }, + { + "epoch": 0.9308785391259619, + "grad_norm": 1.5234375, + "learning_rate": 1.571569405997425e-05, + "loss": 1.0188, + "step": 5429 + }, + { + "epoch": 0.9310500032149517, + "grad_norm": 1.5, + "learning_rate": 1.5714212105687535e-05, + "loss": 0.9354, + "step": 5430 + }, + { + "epoch": 0.9312214673039415, + "grad_norm": 1.5390625, + "learning_rate": 1.571272996503885e-05, + "loss": 0.9608, + "step": 5431 + }, + { + "epoch": 0.9313929313929314, + "grad_norm": 1.5859375, + "learning_rate": 1.571124763807652e-05, + "loss": 1.0545, + "step": 5432 + }, + { + "epoch": 0.9315643954819213, + "grad_norm": 1.5546875, + "learning_rate": 1.5709765124848907e-05, + "loss": 0.9436, + "step": 5433 + }, + { + "epoch": 0.9317358595709111, + "grad_norm": 1.5390625, + "learning_rate": 1.5708282425404345e-05, + "loss": 0.9691, + "step": 5434 + }, + { + "epoch": 0.931907323659901, + "grad_norm": 1.515625, + "learning_rate": 1.5706799539791204e-05, + "loss": 0.9409, + "step": 5435 + }, + { + "epoch": 0.9320787877488909, + "grad_norm": 1.5859375, + "learning_rate": 1.5705316468057837e-05, + "loss": 1.0345, + "step": 5436 + }, + { + "epoch": 0.9322502518378807, + "grad_norm": 1.671875, + "learning_rate": 1.5703833210252613e-05, + "loss": 0.9729, + "step": 5437 + }, + { + "epoch": 0.9324217159268706, + "grad_norm": 1.4921875, + "learning_rate": 1.5702349766423913e-05, + "loss": 0.9774, + "step": 5438 + }, + { + "epoch": 0.9325931800158604, + "grad_norm": 1.5234375, + "learning_rate": 1.5700866136620114e-05, + "loss": 0.9663, + "step": 5439 + }, + { + "epoch": 0.9327646441048503, + "grad_norm": 1.5625, + "learning_rate": 1.56993823208896e-05, + "loss": 0.9804, + "step": 5440 + }, + { + "epoch": 0.9329361081938402, + "grad_norm": 1.5078125, + "learning_rate": 1.5697898319280767e-05, + "loss": 0.9341, + "step": 5441 + }, + { + "epoch": 0.93310757228283, + "grad_norm": 1.546875, + "learning_rate": 1.569641413184202e-05, + "loss": 1.0012, + "step": 5442 + }, + { + "epoch": 0.9332790363718199, + "grad_norm": 1.46875, + "learning_rate": 1.569492975862175e-05, + "loss": 1.0137, + "step": 5443 + }, + { + "epoch": 0.9334505004608097, + "grad_norm": 1.7109375, + "learning_rate": 
1.569344519966838e-05, + "loss": 1.0661, + "step": 5444 + }, + { + "epoch": 0.9336219645497996, + "grad_norm": 1.4921875, + "learning_rate": 1.569196045503032e-05, + "loss": 0.9675, + "step": 5445 + }, + { + "epoch": 0.9337934286387894, + "grad_norm": 1.609375, + "learning_rate": 1.5690475524755996e-05, + "loss": 1.1199, + "step": 5446 + }, + { + "epoch": 0.9339648927277793, + "grad_norm": 1.5078125, + "learning_rate": 1.5688990408893837e-05, + "loss": 0.973, + "step": 5447 + }, + { + "epoch": 0.9341363568167692, + "grad_norm": 1.546875, + "learning_rate": 1.5687505107492278e-05, + "loss": 1.0056, + "step": 5448 + }, + { + "epoch": 0.934307820905759, + "grad_norm": 1.6171875, + "learning_rate": 1.568601962059976e-05, + "loss": 1.0917, + "step": 5449 + }, + { + "epoch": 0.9344792849947489, + "grad_norm": 1.5, + "learning_rate": 1.5684533948264734e-05, + "loss": 0.9369, + "step": 5450 + }, + { + "epoch": 0.9346507490837388, + "grad_norm": 1.5625, + "learning_rate": 1.5683048090535645e-05, + "loss": 1.0045, + "step": 5451 + }, + { + "epoch": 0.9348222131727286, + "grad_norm": 1.5546875, + "learning_rate": 1.568156204746096e-05, + "loss": 0.9499, + "step": 5452 + }, + { + "epoch": 0.9349936772617184, + "grad_norm": 1.6328125, + "learning_rate": 1.568007581908914e-05, + "loss": 1.0495, + "step": 5453 + }, + { + "epoch": 0.9351651413507084, + "grad_norm": 1.5625, + "learning_rate": 1.567858940546866e-05, + "loss": 1.0209, + "step": 5454 + }, + { + "epoch": 0.9353366054396982, + "grad_norm": 1.5234375, + "learning_rate": 1.5677102806647993e-05, + "loss": 1.0253, + "step": 5455 + }, + { + "epoch": 0.935508069528688, + "grad_norm": 1.625, + "learning_rate": 1.567561602267563e-05, + "loss": 1.0378, + "step": 5456 + }, + { + "epoch": 0.935679533617678, + "grad_norm": 1.5390625, + "learning_rate": 1.5674129053600054e-05, + "loss": 1.0162, + "step": 5457 + }, + { + "epoch": 0.9358509977066678, + "grad_norm": 1.59375, + "learning_rate": 1.5672641899469764e-05, + "loss": 0.9873, + "step": 5458 + }, + { + "epoch": 0.9360224617956576, + "grad_norm": 1.5703125, + "learning_rate": 1.5671154560333258e-05, + "loss": 1.0778, + "step": 5459 + }, + { + "epoch": 0.9361939258846476, + "grad_norm": 1.546875, + "learning_rate": 1.5669667036239047e-05, + "loss": 1.0814, + "step": 5460 + }, + { + "epoch": 0.9363653899736374, + "grad_norm": 1.5078125, + "learning_rate": 1.5668179327235644e-05, + "loss": 1.015, + "step": 5461 + }, + { + "epoch": 0.9365368540626272, + "grad_norm": 1.5546875, + "learning_rate": 1.566669143337157e-05, + "loss": 0.9743, + "step": 5462 + }, + { + "epoch": 0.9367083181516171, + "grad_norm": 1.5546875, + "learning_rate": 1.566520335469535e-05, + "loss": 0.9868, + "step": 5463 + }, + { + "epoch": 0.936879782240607, + "grad_norm": 1.625, + "learning_rate": 1.5663715091255513e-05, + "loss": 1.0205, + "step": 5464 + }, + { + "epoch": 0.9370512463295968, + "grad_norm": 1.4609375, + "learning_rate": 1.56622266431006e-05, + "loss": 0.9398, + "step": 5465 + }, + { + "epoch": 0.9372227104185867, + "grad_norm": 1.53125, + "learning_rate": 1.5660738010279156e-05, + "loss": 0.9776, + "step": 5466 + }, + { + "epoch": 0.9373941745075766, + "grad_norm": 1.6484375, + "learning_rate": 1.5659249192839724e-05, + "loss": 0.958, + "step": 5467 + }, + { + "epoch": 0.9375656385965664, + "grad_norm": 1.7265625, + "learning_rate": 1.565776019083087e-05, + "loss": 0.9683, + "step": 5468 + }, + { + "epoch": 0.9377371026855563, + "grad_norm": 1.703125, + "learning_rate": 1.565627100430115e-05, + "loss": 1.0691, + "step": 
5469 + }, + { + "epoch": 0.9379085667745461, + "grad_norm": 1.546875, + "learning_rate": 1.5654781633299134e-05, + "loss": 0.9959, + "step": 5470 + }, + { + "epoch": 0.938080030863536, + "grad_norm": 1.640625, + "learning_rate": 1.5653292077873393e-05, + "loss": 1.0553, + "step": 5471 + }, + { + "epoch": 0.9382514949525259, + "grad_norm": 1.4765625, + "learning_rate": 1.565180233807251e-05, + "loss": 1.0085, + "step": 5472 + }, + { + "epoch": 0.9384229590415157, + "grad_norm": 1.609375, + "learning_rate": 1.565031241394507e-05, + "loss": 1.0366, + "step": 5473 + }, + { + "epoch": 0.9385944231305056, + "grad_norm": 1.4765625, + "learning_rate": 1.5648822305539667e-05, + "loss": 0.8648, + "step": 5474 + }, + { + "epoch": 0.9387658872194955, + "grad_norm": 1.5703125, + "learning_rate": 1.5647332012904892e-05, + "loss": 0.9774, + "step": 5475 + }, + { + "epoch": 0.9389373513084853, + "grad_norm": 1.5625, + "learning_rate": 1.564584153608936e-05, + "loss": 0.9816, + "step": 5476 + }, + { + "epoch": 0.9391088153974751, + "grad_norm": 1.609375, + "learning_rate": 1.5644350875141673e-05, + "loss": 1.0061, + "step": 5477 + }, + { + "epoch": 0.9392802794864651, + "grad_norm": 1.484375, + "learning_rate": 1.564286003011045e-05, + "loss": 0.9747, + "step": 5478 + }, + { + "epoch": 0.9394517435754549, + "grad_norm": 1.4921875, + "learning_rate": 1.5641369001044308e-05, + "loss": 0.8715, + "step": 5479 + }, + { + "epoch": 0.9396232076644447, + "grad_norm": 1.5, + "learning_rate": 1.5639877787991884e-05, + "loss": 0.9639, + "step": 5480 + }, + { + "epoch": 0.9397946717534347, + "grad_norm": 1.5390625, + "learning_rate": 1.5638386391001804e-05, + "loss": 1.0736, + "step": 5481 + }, + { + "epoch": 0.9399661358424245, + "grad_norm": 1.515625, + "learning_rate": 1.5636894810122717e-05, + "loss": 1.0084, + "step": 5482 + }, + { + "epoch": 0.9401375999314143, + "grad_norm": 1.609375, + "learning_rate": 1.5635403045403257e-05, + "loss": 1.0358, + "step": 5483 + }, + { + "epoch": 0.9403090640204043, + "grad_norm": 1.4609375, + "learning_rate": 1.5633911096892088e-05, + "loss": 0.9374, + "step": 5484 + }, + { + "epoch": 0.9404805281093941, + "grad_norm": 1.5859375, + "learning_rate": 1.563241896463786e-05, + "loss": 1.019, + "step": 5485 + }, + { + "epoch": 0.9406519921983839, + "grad_norm": 1.578125, + "learning_rate": 1.5630926648689243e-05, + "loss": 1.0141, + "step": 5486 + }, + { + "epoch": 0.9408234562873738, + "grad_norm": 1.546875, + "learning_rate": 1.5629434149094898e-05, + "loss": 1.0172, + "step": 5487 + }, + { + "epoch": 0.9409949203763637, + "grad_norm": 1.484375, + "learning_rate": 1.5627941465903512e-05, + "loss": 0.941, + "step": 5488 + }, + { + "epoch": 0.9411663844653535, + "grad_norm": 1.625, + "learning_rate": 1.562644859916376e-05, + "loss": 1.0477, + "step": 5489 + }, + { + "epoch": 0.9413378485543434, + "grad_norm": 1.59375, + "learning_rate": 1.5624955548924334e-05, + "loss": 0.9985, + "step": 5490 + }, + { + "epoch": 0.9415093126433333, + "grad_norm": 1.5, + "learning_rate": 1.5623462315233923e-05, + "loss": 0.9092, + "step": 5491 + }, + { + "epoch": 0.9416807767323231, + "grad_norm": 1.5859375, + "learning_rate": 1.562196889814123e-05, + "loss": 1.0168, + "step": 5492 + }, + { + "epoch": 0.941852240821313, + "grad_norm": 1.6796875, + "learning_rate": 1.5620475297694963e-05, + "loss": 1.0219, + "step": 5493 + }, + { + "epoch": 0.9420237049103029, + "grad_norm": 1.796875, + "learning_rate": 1.5618981513943833e-05, + "loss": 1.0164, + "step": 5494 + }, + { + "epoch": 0.9421951689992927, + 
"grad_norm": 1.640625, + "learning_rate": 1.5617487546936555e-05, + "loss": 1.0886, + "step": 5495 + }, + { + "epoch": 0.9423666330882826, + "grad_norm": 1.5078125, + "learning_rate": 1.5615993396721852e-05, + "loss": 0.9554, + "step": 5496 + }, + { + "epoch": 0.9425380971772724, + "grad_norm": 1.5859375, + "learning_rate": 1.5614499063348457e-05, + "loss": 1.0361, + "step": 5497 + }, + { + "epoch": 0.9427095612662623, + "grad_norm": 1.578125, + "learning_rate": 1.561300454686511e-05, + "loss": 1.0378, + "step": 5498 + }, + { + "epoch": 0.9428810253552522, + "grad_norm": 1.5390625, + "learning_rate": 1.561150984732054e-05, + "loss": 0.975, + "step": 5499 + }, + { + "epoch": 0.943052489444242, + "grad_norm": 1.546875, + "learning_rate": 1.561001496476351e-05, + "loss": 0.9535, + "step": 5500 + }, + { + "epoch": 0.9432239535332319, + "grad_norm": 1.53125, + "learning_rate": 1.5608519899242764e-05, + "loss": 1.0368, + "step": 5501 + }, + { + "epoch": 0.9433954176222218, + "grad_norm": 1.609375, + "learning_rate": 1.560702465080707e-05, + "loss": 0.9941, + "step": 5502 + }, + { + "epoch": 0.9435668817112116, + "grad_norm": 1.5234375, + "learning_rate": 1.560552921950518e-05, + "loss": 0.8931, + "step": 5503 + }, + { + "epoch": 0.9437383458002014, + "grad_norm": 1.5234375, + "learning_rate": 1.5604033605385874e-05, + "loss": 1.0116, + "step": 5504 + }, + { + "epoch": 0.9439098098891914, + "grad_norm": 1.3984375, + "learning_rate": 1.5602537808497935e-05, + "loss": 0.9978, + "step": 5505 + }, + { + "epoch": 0.9440812739781812, + "grad_norm": 1.5546875, + "learning_rate": 1.560104182889014e-05, + "loss": 0.941, + "step": 5506 + }, + { + "epoch": 0.944252738067171, + "grad_norm": 1.6875, + "learning_rate": 1.5599545666611272e-05, + "loss": 1.0056, + "step": 5507 + }, + { + "epoch": 0.944424202156161, + "grad_norm": 1.578125, + "learning_rate": 1.5598049321710143e-05, + "loss": 0.9915, + "step": 5508 + }, + { + "epoch": 0.9445956662451508, + "grad_norm": 1.546875, + "learning_rate": 1.559655279423554e-05, + "loss": 0.9526, + "step": 5509 + }, + { + "epoch": 0.9447671303341406, + "grad_norm": 1.671875, + "learning_rate": 1.5595056084236277e-05, + "loss": 1.0734, + "step": 5510 + }, + { + "epoch": 0.9449385944231306, + "grad_norm": 1.5390625, + "learning_rate": 1.5593559191761165e-05, + "loss": 0.9538, + "step": 5511 + }, + { + "epoch": 0.9451100585121204, + "grad_norm": 1.6015625, + "learning_rate": 1.5592062116859026e-05, + "loss": 1.0745, + "step": 5512 + }, + { + "epoch": 0.9452815226011102, + "grad_norm": 1.546875, + "learning_rate": 1.5590564859578682e-05, + "loss": 0.9885, + "step": 5513 + }, + { + "epoch": 0.9454529866901001, + "grad_norm": 1.5859375, + "learning_rate": 1.5589067419968967e-05, + "loss": 0.9589, + "step": 5514 + }, + { + "epoch": 0.94562445077909, + "grad_norm": 1.5390625, + "learning_rate": 1.558756979807872e-05, + "loss": 1.0317, + "step": 5515 + }, + { + "epoch": 0.9457959148680798, + "grad_norm": 1.4453125, + "learning_rate": 1.558607199395678e-05, + "loss": 0.959, + "step": 5516 + }, + { + "epoch": 0.9459673789570697, + "grad_norm": 1.5703125, + "learning_rate": 1.5584574007651993e-05, + "loss": 1.029, + "step": 5517 + }, + { + "epoch": 0.9461388430460596, + "grad_norm": 1.5546875, + "learning_rate": 1.558307583921322e-05, + "loss": 0.9865, + "step": 5518 + }, + { + "epoch": 0.9463103071350494, + "grad_norm": 1.5625, + "learning_rate": 1.558157748868932e-05, + "loss": 1.0517, + "step": 5519 + }, + { + "epoch": 0.9464817712240393, + "grad_norm": 1.5703125, + 
"learning_rate": 1.5580078956129158e-05, + "loss": 1.0287, + "step": 5520 + }, + { + "epoch": 0.9466532353130291, + "grad_norm": 1.5625, + "learning_rate": 1.5578580241581612e-05, + "loss": 0.966, + "step": 5521 + }, + { + "epoch": 0.946824699402019, + "grad_norm": 1.703125, + "learning_rate": 1.5577081345095556e-05, + "loss": 1.0418, + "step": 5522 + }, + { + "epoch": 0.9469961634910089, + "grad_norm": 1.6796875, + "learning_rate": 1.557558226671987e-05, + "loss": 0.9901, + "step": 5523 + }, + { + "epoch": 0.9471676275799987, + "grad_norm": 1.578125, + "learning_rate": 1.5574083006503462e-05, + "loss": 1.0459, + "step": 5524 + }, + { + "epoch": 0.9473390916689886, + "grad_norm": 1.46875, + "learning_rate": 1.5572583564495212e-05, + "loss": 0.9744, + "step": 5525 + }, + { + "epoch": 0.9475105557579785, + "grad_norm": 1.59375, + "learning_rate": 1.5571083940744025e-05, + "loss": 0.9907, + "step": 5526 + }, + { + "epoch": 0.9476820198469683, + "grad_norm": 1.4375, + "learning_rate": 1.5569584135298813e-05, + "loss": 0.8529, + "step": 5527 + }, + { + "epoch": 0.9478534839359581, + "grad_norm": 1.5, + "learning_rate": 1.5568084148208493e-05, + "loss": 0.9824, + "step": 5528 + }, + { + "epoch": 0.9480249480249481, + "grad_norm": 1.671875, + "learning_rate": 1.5566583979521976e-05, + "loss": 0.9589, + "step": 5529 + }, + { + "epoch": 0.9481964121139379, + "grad_norm": 1.546875, + "learning_rate": 1.5565083629288195e-05, + "loss": 0.9606, + "step": 5530 + }, + { + "epoch": 0.9483678762029277, + "grad_norm": 1.5703125, + "learning_rate": 1.5563583097556086e-05, + "loss": 1.0618, + "step": 5531 + }, + { + "epoch": 0.9485393402919177, + "grad_norm": 1.5859375, + "learning_rate": 1.556208238437458e-05, + "loss": 0.965, + "step": 5532 + }, + { + "epoch": 0.9487108043809075, + "grad_norm": 1.546875, + "learning_rate": 1.5560581489792617e-05, + "loss": 0.992, + "step": 5533 + }, + { + "epoch": 0.9488822684698973, + "grad_norm": 1.4453125, + "learning_rate": 1.5559080413859153e-05, + "loss": 0.9641, + "step": 5534 + }, + { + "epoch": 0.9490537325588873, + "grad_norm": 1.578125, + "learning_rate": 1.555757915662315e-05, + "loss": 0.9974, + "step": 5535 + }, + { + "epoch": 0.9492251966478771, + "grad_norm": 1.5078125, + "learning_rate": 1.5556077718133556e-05, + "loss": 0.9771, + "step": 5536 + }, + { + "epoch": 0.9493966607368669, + "grad_norm": 1.5703125, + "learning_rate": 1.555457609843935e-05, + "loss": 0.9597, + "step": 5537 + }, + { + "epoch": 0.9495681248258567, + "grad_norm": 1.6484375, + "learning_rate": 1.55530742975895e-05, + "loss": 1.0468, + "step": 5538 + }, + { + "epoch": 0.9497395889148467, + "grad_norm": 1.5859375, + "learning_rate": 1.5551572315632983e-05, + "loss": 1.0865, + "step": 5539 + }, + { + "epoch": 0.9499110530038365, + "grad_norm": 1.5703125, + "learning_rate": 1.555007015261879e-05, + "loss": 1.0402, + "step": 5540 + }, + { + "epoch": 0.9500825170928263, + "grad_norm": 1.5703125, + "learning_rate": 1.5548567808595905e-05, + "loss": 0.9962, + "step": 5541 + }, + { + "epoch": 0.9502539811818163, + "grad_norm": 1.515625, + "learning_rate": 1.5547065283613332e-05, + "loss": 1.0879, + "step": 5542 + }, + { + "epoch": 0.9504254452708061, + "grad_norm": 1.625, + "learning_rate": 1.5545562577720077e-05, + "loss": 1.0379, + "step": 5543 + }, + { + "epoch": 0.9505969093597959, + "grad_norm": 1.53125, + "learning_rate": 1.554405969096514e-05, + "loss": 1.0111, + "step": 5544 + }, + { + "epoch": 0.9507683734487858, + "grad_norm": 1.53125, + "learning_rate": 1.5542556623397542e-05, + 
"loss": 0.9819, + "step": 5545 + }, + { + "epoch": 0.9509398375377757, + "grad_norm": 1.5078125, + "learning_rate": 1.5541053375066298e-05, + "loss": 0.9977, + "step": 5546 + }, + { + "epoch": 0.9511113016267655, + "grad_norm": 1.5625, + "learning_rate": 1.553954994602044e-05, + "loss": 1.124, + "step": 5547 + }, + { + "epoch": 0.9512827657157554, + "grad_norm": 1.4765625, + "learning_rate": 1.5538046336308996e-05, + "loss": 0.9165, + "step": 5548 + }, + { + "epoch": 0.9514542298047453, + "grad_norm": 1.5625, + "learning_rate": 1.553654254598101e-05, + "loss": 0.9962, + "step": 5549 + }, + { + "epoch": 0.9516256938937351, + "grad_norm": 1.609375, + "learning_rate": 1.553503857508552e-05, + "loss": 1.0265, + "step": 5550 + }, + { + "epoch": 0.951797157982725, + "grad_norm": 1.625, + "learning_rate": 1.5533534423671578e-05, + "loss": 1.0501, + "step": 5551 + }, + { + "epoch": 0.9519686220717148, + "grad_norm": 1.484375, + "learning_rate": 1.5532030091788243e-05, + "loss": 0.8984, + "step": 5552 + }, + { + "epoch": 0.9521400861607047, + "grad_norm": 1.5703125, + "learning_rate": 1.5530525579484575e-05, + "loss": 1.0077, + "step": 5553 + }, + { + "epoch": 0.9523115502496946, + "grad_norm": 1.578125, + "learning_rate": 1.552902088680964e-05, + "loss": 1.0534, + "step": 5554 + }, + { + "epoch": 0.9524830143386844, + "grad_norm": 1.5390625, + "learning_rate": 1.552751601381252e-05, + "loss": 0.9774, + "step": 5555 + }, + { + "epoch": 0.9526544784276743, + "grad_norm": 1.5859375, + "learning_rate": 1.5526010960542278e-05, + "loss": 1.0769, + "step": 5556 + }, + { + "epoch": 0.9528259425166642, + "grad_norm": 1.6328125, + "learning_rate": 1.5524505727048017e-05, + "loss": 1.0191, + "step": 5557 + }, + { + "epoch": 0.952997406605654, + "grad_norm": 1.5078125, + "learning_rate": 1.5523000313378816e-05, + "loss": 0.9485, + "step": 5558 + }, + { + "epoch": 0.9531688706946438, + "grad_norm": 1.5546875, + "learning_rate": 1.552149471958378e-05, + "loss": 1.0232, + "step": 5559 + }, + { + "epoch": 0.9533403347836338, + "grad_norm": 1.6015625, + "learning_rate": 1.551998894571201e-05, + "loss": 1.0259, + "step": 5560 + }, + { + "epoch": 0.9535117988726236, + "grad_norm": 1.5859375, + "learning_rate": 1.5518482991812614e-05, + "loss": 0.9999, + "step": 5561 + }, + { + "epoch": 0.9536832629616134, + "grad_norm": 1.5, + "learning_rate": 1.5516976857934703e-05, + "loss": 1.0025, + "step": 5562 + }, + { + "epoch": 0.9538547270506034, + "grad_norm": 1.5390625, + "learning_rate": 1.5515470544127405e-05, + "loss": 1.0224, + "step": 5563 + }, + { + "epoch": 0.9540261911395932, + "grad_norm": 1.5078125, + "learning_rate": 1.5513964050439842e-05, + "loss": 1.0007, + "step": 5564 + }, + { + "epoch": 0.954197655228583, + "grad_norm": 1.5234375, + "learning_rate": 1.5512457376921147e-05, + "loss": 0.9864, + "step": 5565 + }, + { + "epoch": 0.954369119317573, + "grad_norm": 1.5234375, + "learning_rate": 1.5510950523620465e-05, + "loss": 1.0098, + "step": 5566 + }, + { + "epoch": 0.9545405834065628, + "grad_norm": 1.6328125, + "learning_rate": 1.550944349058693e-05, + "loss": 0.9862, + "step": 5567 + }, + { + "epoch": 0.9547120474955526, + "grad_norm": 1.5, + "learning_rate": 1.5507936277869693e-05, + "loss": 1.0668, + "step": 5568 + }, + { + "epoch": 0.9548835115845425, + "grad_norm": 1.5625, + "learning_rate": 1.5506428885517917e-05, + "loss": 0.9649, + "step": 5569 + }, + { + "epoch": 0.9550549756735324, + "grad_norm": 1.4921875, + "learning_rate": 1.5504921313580757e-05, + "loss": 0.9597, + "step": 5570 + }, + { + 
"epoch": 0.9552264397625222, + "grad_norm": 1.5234375, + "learning_rate": 1.5503413562107385e-05, + "loss": 0.9652, + "step": 5571 + }, + { + "epoch": 0.9553979038515121, + "grad_norm": 1.5, + "learning_rate": 1.5501905631146975e-05, + "loss": 0.9575, + "step": 5572 + }, + { + "epoch": 0.955569367940502, + "grad_norm": 1.515625, + "learning_rate": 1.55003975207487e-05, + "loss": 0.9843, + "step": 5573 + }, + { + "epoch": 0.9557408320294918, + "grad_norm": 1.6796875, + "learning_rate": 1.5498889230961753e-05, + "loss": 1.0935, + "step": 5574 + }, + { + "epoch": 0.9559122961184817, + "grad_norm": 1.5546875, + "learning_rate": 1.5497380761835318e-05, + "loss": 0.9589, + "step": 5575 + }, + { + "epoch": 0.9560837602074715, + "grad_norm": 1.609375, + "learning_rate": 1.5495872113418594e-05, + "loss": 0.9689, + "step": 5576 + }, + { + "epoch": 0.9562552242964614, + "grad_norm": 1.625, + "learning_rate": 1.549436328576079e-05, + "loss": 0.9995, + "step": 5577 + }, + { + "epoch": 0.9564266883854513, + "grad_norm": 1.53125, + "learning_rate": 1.5492854278911103e-05, + "loss": 0.9451, + "step": 5578 + }, + { + "epoch": 0.9565981524744411, + "grad_norm": 1.5546875, + "learning_rate": 1.549134509291876e-05, + "loss": 0.974, + "step": 5579 + }, + { + "epoch": 0.956769616563431, + "grad_norm": 1.578125, + "learning_rate": 1.5489835727832973e-05, + "loss": 0.9706, + "step": 5580 + }, + { + "epoch": 0.9569410806524209, + "grad_norm": 1.4921875, + "learning_rate": 1.5488326183702967e-05, + "loss": 0.9371, + "step": 5581 + }, + { + "epoch": 0.9571125447414107, + "grad_norm": 1.6875, + "learning_rate": 1.548681646057798e-05, + "loss": 1.0169, + "step": 5582 + }, + { + "epoch": 0.9572840088304005, + "grad_norm": 1.6015625, + "learning_rate": 1.548530655850725e-05, + "loss": 1.0346, + "step": 5583 + }, + { + "epoch": 0.9574554729193905, + "grad_norm": 1.578125, + "learning_rate": 1.548379647754001e-05, + "loss": 0.9309, + "step": 5584 + }, + { + "epoch": 0.9576269370083803, + "grad_norm": 1.59375, + "learning_rate": 1.548228621772552e-05, + "loss": 0.9644, + "step": 5585 + }, + { + "epoch": 0.9577984010973701, + "grad_norm": 1.546875, + "learning_rate": 1.5480775779113032e-05, + "loss": 0.9528, + "step": 5586 + }, + { + "epoch": 0.9579698651863601, + "grad_norm": 1.5, + "learning_rate": 1.5479265161751807e-05, + "loss": 0.9366, + "step": 5587 + }, + { + "epoch": 0.9581413292753499, + "grad_norm": 1.578125, + "learning_rate": 1.5477754365691113e-05, + "loss": 0.9291, + "step": 5588 + }, + { + "epoch": 0.9583127933643397, + "grad_norm": 1.53125, + "learning_rate": 1.5476243390980224e-05, + "loss": 1.0644, + "step": 5589 + }, + { + "epoch": 0.9584842574533297, + "grad_norm": 1.515625, + "learning_rate": 1.5474732237668413e-05, + "loss": 0.9302, + "step": 5590 + }, + { + "epoch": 0.9586557215423195, + "grad_norm": 1.5546875, + "learning_rate": 1.547322090580497e-05, + "loss": 0.956, + "step": 5591 + }, + { + "epoch": 0.9588271856313093, + "grad_norm": 1.59375, + "learning_rate": 1.5471709395439182e-05, + "loss": 1.0559, + "step": 5592 + }, + { + "epoch": 0.9589986497202992, + "grad_norm": 1.5234375, + "learning_rate": 1.5470197706620345e-05, + "loss": 0.9851, + "step": 5593 + }, + { + "epoch": 0.9591701138092891, + "grad_norm": 1.4296875, + "learning_rate": 1.5468685839397764e-05, + "loss": 0.9395, + "step": 5594 + }, + { + "epoch": 0.9593415778982789, + "grad_norm": 1.640625, + "learning_rate": 1.546717379382074e-05, + "loss": 1.025, + "step": 5595 + }, + { + "epoch": 0.9595130419872688, + "grad_norm": 1.59375, + 
"learning_rate": 1.5465661569938597e-05, + "loss": 1.0304, + "step": 5596 + }, + { + "epoch": 0.9596845060762587, + "grad_norm": 1.5546875, + "learning_rate": 1.5464149167800643e-05, + "loss": 0.9483, + "step": 5597 + }, + { + "epoch": 0.9598559701652485, + "grad_norm": 1.546875, + "learning_rate": 1.5462636587456216e-05, + "loss": 0.9549, + "step": 5598 + }, + { + "epoch": 0.9600274342542384, + "grad_norm": 1.546875, + "learning_rate": 1.5461123828954635e-05, + "loss": 0.9554, + "step": 5599 + }, + { + "epoch": 0.9601988983432282, + "grad_norm": 1.65625, + "learning_rate": 1.545961089234524e-05, + "loss": 1.1681, + "step": 5600 + }, + { + "epoch": 0.9601988983432282, + "eval_loss": 0.8496836423873901, + "eval_runtime": 836.7427, + "eval_samples_per_second": 2.987, + "eval_steps_per_second": 2.987, + "step": 5600 + }, + { + "epoch": 0.9603703624322181, + "grad_norm": 1.5078125, + "learning_rate": 1.5458097777677377e-05, + "loss": 1.0117, + "step": 5601 + }, + { + "epoch": 0.960541826521208, + "grad_norm": 1.6015625, + "learning_rate": 1.545658448500039e-05, + "loss": 1.0554, + "step": 5602 + }, + { + "epoch": 0.9607132906101978, + "grad_norm": 1.5859375, + "learning_rate": 1.5455071014363637e-05, + "loss": 0.9915, + "step": 5603 + }, + { + "epoch": 0.9608847546991877, + "grad_norm": 1.6328125, + "learning_rate": 1.5453557365816477e-05, + "loss": 1.0956, + "step": 5604 + }, + { + "epoch": 0.9610562187881776, + "grad_norm": 1.53125, + "learning_rate": 1.545204353940827e-05, + "loss": 0.9851, + "step": 5605 + }, + { + "epoch": 0.9612276828771674, + "grad_norm": 1.5234375, + "learning_rate": 1.54505295351884e-05, + "loss": 0.9669, + "step": 5606 + }, + { + "epoch": 0.9613991469661572, + "grad_norm": 1.515625, + "learning_rate": 1.5449015353206232e-05, + "loss": 0.9688, + "step": 5607 + }, + { + "epoch": 0.9615706110551472, + "grad_norm": 1.5625, + "learning_rate": 1.5447500993511155e-05, + "loss": 0.9418, + "step": 5608 + }, + { + "epoch": 0.961742075144137, + "grad_norm": 1.453125, + "learning_rate": 1.5445986456152557e-05, + "loss": 1.0225, + "step": 5609 + }, + { + "epoch": 0.9619135392331268, + "grad_norm": 1.5390625, + "learning_rate": 1.5444471741179838e-05, + "loss": 1.0441, + "step": 5610 + }, + { + "epoch": 0.9620850033221168, + "grad_norm": 1.6484375, + "learning_rate": 1.544295684864239e-05, + "loss": 1.0412, + "step": 5611 + }, + { + "epoch": 0.9622564674111066, + "grad_norm": 1.5859375, + "learning_rate": 1.5441441778589622e-05, + "loss": 1.0482, + "step": 5612 + }, + { + "epoch": 0.9624279315000964, + "grad_norm": 1.53125, + "learning_rate": 1.5439926531070944e-05, + "loss": 1.0165, + "step": 5613 + }, + { + "epoch": 0.9625993955890864, + "grad_norm": 1.5078125, + "learning_rate": 1.5438411106135784e-05, + "loss": 0.9765, + "step": 5614 + }, + { + "epoch": 0.9627708596780762, + "grad_norm": 1.6015625, + "learning_rate": 1.5436895503833555e-05, + "loss": 1.0007, + "step": 5615 + }, + { + "epoch": 0.962942323767066, + "grad_norm": 1.6640625, + "learning_rate": 1.543537972421369e-05, + "loss": 0.9622, + "step": 5616 + }, + { + "epoch": 0.963113787856056, + "grad_norm": 1.59375, + "learning_rate": 1.5433863767325626e-05, + "loss": 1.1012, + "step": 5617 + }, + { + "epoch": 0.9632852519450458, + "grad_norm": 1.609375, + "learning_rate": 1.5432347633218802e-05, + "loss": 1.0813, + "step": 5618 + }, + { + "epoch": 0.9634567160340356, + "grad_norm": 1.796875, + "learning_rate": 1.5430831321942664e-05, + "loss": 1.0686, + "step": 5619 + }, + { + "epoch": 0.9636281801230255, + "grad_norm": 
1.625, + "learning_rate": 1.5429314833546665e-05, + "loss": 1.0027, + "step": 5620 + }, + { + "epoch": 0.9637996442120154, + "grad_norm": 1.6328125, + "learning_rate": 1.5427798168080267e-05, + "loss": 1.0444, + "step": 5621 + }, + { + "epoch": 0.9639711083010052, + "grad_norm": 1.5078125, + "learning_rate": 1.5426281325592932e-05, + "loss": 0.9338, + "step": 5622 + }, + { + "epoch": 0.9641425723899951, + "grad_norm": 1.640625, + "learning_rate": 1.542476430613413e-05, + "loss": 1.0205, + "step": 5623 + }, + { + "epoch": 0.964314036478985, + "grad_norm": 1.515625, + "learning_rate": 1.5423247109753332e-05, + "loss": 0.9084, + "step": 5624 + }, + { + "epoch": 0.9644855005679748, + "grad_norm": 1.609375, + "learning_rate": 1.5421729736500024e-05, + "loss": 1.0294, + "step": 5625 + }, + { + "epoch": 0.9646569646569647, + "grad_norm": 1.53125, + "learning_rate": 1.5420212186423696e-05, + "loss": 0.9791, + "step": 5626 + }, + { + "epoch": 0.9648284287459545, + "grad_norm": 1.59375, + "learning_rate": 1.541869445957384e-05, + "loss": 1.0413, + "step": 5627 + }, + { + "epoch": 0.9649998928349444, + "grad_norm": 1.5859375, + "learning_rate": 1.5417176555999948e-05, + "loss": 0.9642, + "step": 5628 + }, + { + "epoch": 0.9651713569239343, + "grad_norm": 1.6015625, + "learning_rate": 1.541565847575153e-05, + "loss": 0.9477, + "step": 5629 + }, + { + "epoch": 0.9653428210129241, + "grad_norm": 1.5078125, + "learning_rate": 1.5414140218878096e-05, + "loss": 0.9937, + "step": 5630 + }, + { + "epoch": 0.965514285101914, + "grad_norm": 1.7578125, + "learning_rate": 1.5412621785429162e-05, + "loss": 1.0819, + "step": 5631 + }, + { + "epoch": 0.9656857491909038, + "grad_norm": 1.515625, + "learning_rate": 1.541110317545425e-05, + "loss": 0.9953, + "step": 5632 + }, + { + "epoch": 0.9658572132798937, + "grad_norm": 1.5546875, + "learning_rate": 1.5409584389002885e-05, + "loss": 0.9715, + "step": 5633 + }, + { + "epoch": 0.9660286773688835, + "grad_norm": 1.484375, + "learning_rate": 1.5408065426124607e-05, + "loss": 0.916, + "step": 5634 + }, + { + "epoch": 0.9662001414578734, + "grad_norm": 1.578125, + "learning_rate": 1.5406546286868946e-05, + "loss": 0.9839, + "step": 5635 + }, + { + "epoch": 0.9663716055468633, + "grad_norm": 1.484375, + "learning_rate": 1.5405026971285454e-05, + "loss": 0.9272, + "step": 5636 + }, + { + "epoch": 0.9665430696358531, + "grad_norm": 1.546875, + "learning_rate": 1.540350747942368e-05, + "loss": 1.017, + "step": 5637 + }, + { + "epoch": 0.966714533724843, + "grad_norm": 1.6171875, + "learning_rate": 1.540198781133318e-05, + "loss": 1.0335, + "step": 5638 + }, + { + "epoch": 0.9668859978138329, + "grad_norm": 1.5, + "learning_rate": 1.5400467967063512e-05, + "loss": 1.0387, + "step": 5639 + }, + { + "epoch": 0.9670574619028227, + "grad_norm": 1.6015625, + "learning_rate": 1.5398947946664247e-05, + "loss": 1.0063, + "step": 5640 + }, + { + "epoch": 0.9672289259918125, + "grad_norm": 1.5390625, + "learning_rate": 1.5397427750184962e-05, + "loss": 0.9206, + "step": 5641 + }, + { + "epoch": 0.9674003900808025, + "grad_norm": 1.578125, + "learning_rate": 1.539590737767523e-05, + "loss": 1.0224, + "step": 5642 + }, + { + "epoch": 0.9675718541697923, + "grad_norm": 1.5625, + "learning_rate": 1.5394386829184643e-05, + "loss": 1.0046, + "step": 5643 + }, + { + "epoch": 0.9677433182587821, + "grad_norm": 1.578125, + "learning_rate": 1.5392866104762783e-05, + "loss": 1.0165, + "step": 5644 + }, + { + "epoch": 0.9679147823477721, + "grad_norm": 1.546875, + "learning_rate": 
1.5391345204459255e-05, + "loss": 1.051, + "step": 5645 + }, + { + "epoch": 0.9680862464367619, + "grad_norm": 1.578125, + "learning_rate": 1.538982412832366e-05, + "loss": 0.9658, + "step": 5646 + }, + { + "epoch": 0.9682577105257517, + "grad_norm": 1.609375, + "learning_rate": 1.5388302876405602e-05, + "loss": 0.9937, + "step": 5647 + }, + { + "epoch": 0.9684291746147417, + "grad_norm": 1.65625, + "learning_rate": 1.5386781448754696e-05, + "loss": 0.9847, + "step": 5648 + }, + { + "epoch": 0.9686006387037315, + "grad_norm": 1.4765625, + "learning_rate": 1.538525984542056e-05, + "loss": 0.9305, + "step": 5649 + }, + { + "epoch": 0.9687721027927213, + "grad_norm": 1.4609375, + "learning_rate": 1.5383738066452825e-05, + "loss": 0.9352, + "step": 5650 + }, + { + "epoch": 0.9689435668817112, + "grad_norm": 1.7265625, + "learning_rate": 1.5382216111901116e-05, + "loss": 0.9158, + "step": 5651 + }, + { + "epoch": 0.9691150309707011, + "grad_norm": 1.6328125, + "learning_rate": 1.5380693981815077e-05, + "loss": 0.9998, + "step": 5652 + }, + { + "epoch": 0.9692864950596909, + "grad_norm": 1.6015625, + "learning_rate": 1.5379171676244343e-05, + "loss": 1.1242, + "step": 5653 + }, + { + "epoch": 0.9694579591486808, + "grad_norm": 1.4765625, + "learning_rate": 1.537764919523856e-05, + "loss": 0.9611, + "step": 5654 + }, + { + "epoch": 0.9696294232376707, + "grad_norm": 1.609375, + "learning_rate": 1.537612653884739e-05, + "loss": 0.9709, + "step": 5655 + }, + { + "epoch": 0.9698008873266605, + "grad_norm": 1.609375, + "learning_rate": 1.537460370712049e-05, + "loss": 1.0086, + "step": 5656 + }, + { + "epoch": 0.9699723514156504, + "grad_norm": 1.6015625, + "learning_rate": 1.5373080700107522e-05, + "loss": 0.9748, + "step": 5657 + }, + { + "epoch": 0.9701438155046402, + "grad_norm": 1.625, + "learning_rate": 1.5371557517858162e-05, + "loss": 0.9927, + "step": 5658 + }, + { + "epoch": 0.9703152795936301, + "grad_norm": 1.5078125, + "learning_rate": 1.5370034160422084e-05, + "loss": 1.0601, + "step": 5659 + }, + { + "epoch": 0.97048674368262, + "grad_norm": 1.6875, + "learning_rate": 1.5368510627848963e-05, + "loss": 1.0856, + "step": 5660 + }, + { + "epoch": 0.9706582077716098, + "grad_norm": 1.59375, + "learning_rate": 1.53669869201885e-05, + "loss": 1.0443, + "step": 5661 + }, + { + "epoch": 0.9708296718605997, + "grad_norm": 1.5, + "learning_rate": 1.5365463037490386e-05, + "loss": 0.9502, + "step": 5662 + }, + { + "epoch": 0.9710011359495896, + "grad_norm": 1.6171875, + "learning_rate": 1.5363938979804312e-05, + "loss": 0.9061, + "step": 5663 + }, + { + "epoch": 0.9711726000385794, + "grad_norm": 1.6953125, + "learning_rate": 1.5362414747179996e-05, + "loss": 0.9255, + "step": 5664 + }, + { + "epoch": 0.9713440641275692, + "grad_norm": 1.59375, + "learning_rate": 1.536089033966714e-05, + "loss": 1.0111, + "step": 5665 + }, + { + "epoch": 0.9715155282165592, + "grad_norm": 1.59375, + "learning_rate": 1.535936575731546e-05, + "loss": 0.9452, + "step": 5666 + }, + { + "epoch": 0.971686992305549, + "grad_norm": 1.4609375, + "learning_rate": 1.535784100017468e-05, + "loss": 0.9667, + "step": 5667 + }, + { + "epoch": 0.9718584563945388, + "grad_norm": 1.5859375, + "learning_rate": 1.5356316068294533e-05, + "loss": 1.0044, + "step": 5668 + }, + { + "epoch": 0.9720299204835288, + "grad_norm": 1.6171875, + "learning_rate": 1.535479096172475e-05, + "loss": 1.051, + "step": 5669 + }, + { + "epoch": 0.9722013845725186, + "grad_norm": 1.59375, + "learning_rate": 1.5353265680515067e-05, + "loss": 1.0812, + 
"step": 5670 + }, + { + "epoch": 0.9723728486615084, + "grad_norm": 1.578125, + "learning_rate": 1.535174022471523e-05, + "loss": 0.9584, + "step": 5671 + }, + { + "epoch": 0.9725443127504984, + "grad_norm": 1.6328125, + "learning_rate": 1.5350214594374995e-05, + "loss": 0.9947, + "step": 5672 + }, + { + "epoch": 0.9727157768394882, + "grad_norm": 1.5859375, + "learning_rate": 1.5348688789544114e-05, + "loss": 0.9427, + "step": 5673 + }, + { + "epoch": 0.972887240928478, + "grad_norm": 1.53125, + "learning_rate": 1.5347162810272348e-05, + "loss": 0.974, + "step": 5674 + }, + { + "epoch": 0.9730587050174679, + "grad_norm": 1.4921875, + "learning_rate": 1.534563665660947e-05, + "loss": 1.0238, + "step": 5675 + }, + { + "epoch": 0.9732301691064578, + "grad_norm": 1.4921875, + "learning_rate": 1.5344110328605248e-05, + "loss": 0.9584, + "step": 5676 + }, + { + "epoch": 0.9734016331954476, + "grad_norm": 1.578125, + "learning_rate": 1.5342583826309464e-05, + "loss": 1.0112, + "step": 5677 + }, + { + "epoch": 0.9735730972844375, + "grad_norm": 1.65625, + "learning_rate": 1.534105714977191e-05, + "loss": 1.0449, + "step": 5678 + }, + { + "epoch": 0.9737445613734274, + "grad_norm": 1.46875, + "learning_rate": 1.533953029904236e-05, + "loss": 0.9855, + "step": 5679 + }, + { + "epoch": 0.9739160254624172, + "grad_norm": 1.53125, + "learning_rate": 1.5338003274170626e-05, + "loss": 0.9667, + "step": 5680 + }, + { + "epoch": 0.9740874895514071, + "grad_norm": 1.59375, + "learning_rate": 1.5336476075206506e-05, + "loss": 1.02, + "step": 5681 + }, + { + "epoch": 0.9742589536403969, + "grad_norm": 1.46875, + "learning_rate": 1.5334948702199803e-05, + "loss": 1.0029, + "step": 5682 + }, + { + "epoch": 0.9744304177293868, + "grad_norm": 1.5546875, + "learning_rate": 1.533342115520033e-05, + "loss": 0.9316, + "step": 5683 + }, + { + "epoch": 0.9746018818183767, + "grad_norm": 1.5625, + "learning_rate": 1.533189343425791e-05, + "loss": 0.8841, + "step": 5684 + }, + { + "epoch": 0.9747733459073665, + "grad_norm": 1.546875, + "learning_rate": 1.533036553942237e-05, + "loss": 0.9879, + "step": 5685 + }, + { + "epoch": 0.9749448099963564, + "grad_norm": 1.5390625, + "learning_rate": 1.532883747074354e-05, + "loss": 1.0564, + "step": 5686 + }, + { + "epoch": 0.9751162740853463, + "grad_norm": 1.5546875, + "learning_rate": 1.532730922827125e-05, + "loss": 0.9616, + "step": 5687 + }, + { + "epoch": 0.9752877381743361, + "grad_norm": 1.5625, + "learning_rate": 1.5325780812055345e-05, + "loss": 0.9662, + "step": 5688 + }, + { + "epoch": 0.9754592022633259, + "grad_norm": 1.6953125, + "learning_rate": 1.5324252222145673e-05, + "loss": 0.9497, + "step": 5689 + }, + { + "epoch": 0.9756306663523159, + "grad_norm": 1.5234375, + "learning_rate": 1.532272345859209e-05, + "loss": 1.0239, + "step": 5690 + }, + { + "epoch": 0.9758021304413057, + "grad_norm": 1.578125, + "learning_rate": 1.5321194521444445e-05, + "loss": 0.9976, + "step": 5691 + }, + { + "epoch": 0.9759735945302955, + "grad_norm": 1.5546875, + "learning_rate": 1.5319665410752615e-05, + "loss": 1.005, + "step": 5692 + }, + { + "epoch": 0.9761450586192855, + "grad_norm": 1.515625, + "learning_rate": 1.531813612656646e-05, + "loss": 0.9203, + "step": 5693 + }, + { + "epoch": 0.9763165227082753, + "grad_norm": 1.6796875, + "learning_rate": 1.5316606668935862e-05, + "loss": 1.037, + "step": 5694 + }, + { + "epoch": 0.9764879867972651, + "grad_norm": 1.4921875, + "learning_rate": 1.53150770379107e-05, + "loss": 0.929, + "step": 5695 + }, + { + "epoch": 
0.976659450886255, + "grad_norm": 1.5703125, + "learning_rate": 1.5313547233540858e-05, + "loss": 1.0949, + "step": 5696 + }, + { + "epoch": 0.9768309149752449, + "grad_norm": 1.5625, + "learning_rate": 1.5312017255876234e-05, + "loss": 0.9868, + "step": 5697 + }, + { + "epoch": 0.9770023790642347, + "grad_norm": 1.5078125, + "learning_rate": 1.5310487104966725e-05, + "loss": 0.933, + "step": 5698 + }, + { + "epoch": 0.9771738431532246, + "grad_norm": 1.5625, + "learning_rate": 1.5308956780862234e-05, + "loss": 0.9512, + "step": 5699 + }, + { + "epoch": 0.9773453072422145, + "grad_norm": 1.5234375, + "learning_rate": 1.5307426283612668e-05, + "loss": 0.9092, + "step": 5700 + }, + { + "epoch": 0.9775167713312043, + "grad_norm": 1.5703125, + "learning_rate": 1.530589561326795e-05, + "loss": 0.9629, + "step": 5701 + }, + { + "epoch": 0.9776882354201942, + "grad_norm": 1.6796875, + "learning_rate": 1.530436476987799e-05, + "loss": 1.0289, + "step": 5702 + }, + { + "epoch": 0.977859699509184, + "grad_norm": 1.5, + "learning_rate": 1.5302833753492726e-05, + "loss": 0.8838, + "step": 5703 + }, + { + "epoch": 0.9780311635981739, + "grad_norm": 1.5390625, + "learning_rate": 1.530130256416208e-05, + "loss": 0.9919, + "step": 5704 + }, + { + "epoch": 0.9782026276871638, + "grad_norm": 1.5390625, + "learning_rate": 1.5299771201935998e-05, + "loss": 0.9795, + "step": 5705 + }, + { + "epoch": 0.9783740917761536, + "grad_norm": 1.6328125, + "learning_rate": 1.5298239666864417e-05, + "loss": 0.9597, + "step": 5706 + }, + { + "epoch": 0.9785455558651435, + "grad_norm": 1.5859375, + "learning_rate": 1.529670795899729e-05, + "loss": 0.9669, + "step": 5707 + }, + { + "epoch": 0.9787170199541334, + "grad_norm": 1.546875, + "learning_rate": 1.529517607838457e-05, + "loss": 1.0354, + "step": 5708 + }, + { + "epoch": 0.9788884840431232, + "grad_norm": 1.5625, + "learning_rate": 1.5293644025076223e-05, + "loss": 1.0429, + "step": 5709 + }, + { + "epoch": 0.9790599481321131, + "grad_norm": 1.5625, + "learning_rate": 1.5292111799122208e-05, + "loss": 0.98, + "step": 5710 + }, + { + "epoch": 0.979231412221103, + "grad_norm": 1.65625, + "learning_rate": 1.5290579400572497e-05, + "loss": 1.0995, + "step": 5711 + }, + { + "epoch": 0.9794028763100928, + "grad_norm": 1.515625, + "learning_rate": 1.528904682947707e-05, + "loss": 0.9783, + "step": 5712 + }, + { + "epoch": 0.9795743403990826, + "grad_norm": 1.4921875, + "learning_rate": 1.528751408588591e-05, + "loss": 0.9386, + "step": 5713 + }, + { + "epoch": 0.9797458044880726, + "grad_norm": 1.578125, + "learning_rate": 1.5285981169849002e-05, + "loss": 1.0191, + "step": 5714 + }, + { + "epoch": 0.9799172685770624, + "grad_norm": 1.5, + "learning_rate": 1.5284448081416346e-05, + "loss": 0.8976, + "step": 5715 + }, + { + "epoch": 0.9800887326660522, + "grad_norm": 1.4921875, + "learning_rate": 1.5282914820637938e-05, + "loss": 0.8958, + "step": 5716 + }, + { + "epoch": 0.9802601967550422, + "grad_norm": 1.59375, + "learning_rate": 1.5281381387563785e-05, + "loss": 1.0732, + "step": 5717 + }, + { + "epoch": 0.980431660844032, + "grad_norm": 1.53125, + "learning_rate": 1.5279847782243896e-05, + "loss": 0.9982, + "step": 5718 + }, + { + "epoch": 0.9806031249330218, + "grad_norm": 1.625, + "learning_rate": 1.5278314004728288e-05, + "loss": 1.0392, + "step": 5719 + }, + { + "epoch": 0.9807745890220118, + "grad_norm": 1.5546875, + "learning_rate": 1.5276780055066985e-05, + "loss": 1.02, + "step": 5720 + }, + { + "epoch": 0.9809460531110016, + "grad_norm": 1.578125, + 
"learning_rate": 1.5275245933310014e-05, + "loss": 0.9646, + "step": 5721 + }, + { + "epoch": 0.9811175171999914, + "grad_norm": 1.546875, + "learning_rate": 1.5273711639507408e-05, + "loss": 0.9817, + "step": 5722 + }, + { + "epoch": 0.9812889812889813, + "grad_norm": 1.5625, + "learning_rate": 1.5272177173709205e-05, + "loss": 1.0012, + "step": 5723 + }, + { + "epoch": 0.9814604453779712, + "grad_norm": 1.7265625, + "learning_rate": 1.5270642535965455e-05, + "loss": 1.0409, + "step": 5724 + }, + { + "epoch": 0.981631909466961, + "grad_norm": 1.484375, + "learning_rate": 1.52691077263262e-05, + "loss": 0.8934, + "step": 5725 + }, + { + "epoch": 0.9818033735559508, + "grad_norm": 1.46875, + "learning_rate": 1.52675727448415e-05, + "loss": 0.9786, + "step": 5726 + }, + { + "epoch": 0.9819748376449408, + "grad_norm": 1.515625, + "learning_rate": 1.526603759156142e-05, + "loss": 1.0649, + "step": 5727 + }, + { + "epoch": 0.9821463017339306, + "grad_norm": 1.4765625, + "learning_rate": 1.526450226653603e-05, + "loss": 0.9111, + "step": 5728 + }, + { + "epoch": 0.9823177658229204, + "grad_norm": 1.6015625, + "learning_rate": 1.5262966769815387e-05, + "loss": 1.0624, + "step": 5729 + }, + { + "epoch": 0.9824892299119103, + "grad_norm": 1.46875, + "learning_rate": 1.5261431101449584e-05, + "loss": 0.9538, + "step": 5730 + }, + { + "epoch": 0.9826606940009002, + "grad_norm": 1.546875, + "learning_rate": 1.52598952614887e-05, + "loss": 1.0416, + "step": 5731 + }, + { + "epoch": 0.98283215808989, + "grad_norm": 1.4921875, + "learning_rate": 1.5258359249982818e-05, + "loss": 0.9478, + "step": 5732 + }, + { + "epoch": 0.9830036221788799, + "grad_norm": 1.5703125, + "learning_rate": 1.5256823066982046e-05, + "loss": 1.006, + "step": 5733 + }, + { + "epoch": 0.9831750862678698, + "grad_norm": 1.515625, + "learning_rate": 1.525528671253648e-05, + "loss": 1.0836, + "step": 5734 + }, + { + "epoch": 0.9833465503568596, + "grad_norm": 1.546875, + "learning_rate": 1.5253750186696219e-05, + "loss": 1.029, + "step": 5735 + }, + { + "epoch": 0.9835180144458495, + "grad_norm": 1.5390625, + "learning_rate": 1.5252213489511386e-05, + "loss": 1.0368, + "step": 5736 + }, + { + "epoch": 0.9836894785348393, + "grad_norm": 1.6171875, + "learning_rate": 1.5250676621032091e-05, + "loss": 0.9558, + "step": 5737 + }, + { + "epoch": 0.9838609426238292, + "grad_norm": 1.546875, + "learning_rate": 1.5249139581308457e-05, + "loss": 1.0056, + "step": 5738 + }, + { + "epoch": 0.9840324067128191, + "grad_norm": 1.5078125, + "learning_rate": 1.524760237039062e-05, + "loss": 0.8954, + "step": 5739 + }, + { + "epoch": 0.9842038708018089, + "grad_norm": 1.5859375, + "learning_rate": 1.5246064988328706e-05, + "loss": 1.0487, + "step": 5740 + }, + { + "epoch": 0.9843753348907988, + "grad_norm": 1.5234375, + "learning_rate": 1.5244527435172858e-05, + "loss": 0.9816, + "step": 5741 + }, + { + "epoch": 0.9845467989797887, + "grad_norm": 1.5625, + "learning_rate": 1.5242989710973218e-05, + "loss": 0.9851, + "step": 5742 + }, + { + "epoch": 0.9847182630687785, + "grad_norm": 1.5703125, + "learning_rate": 1.5241451815779944e-05, + "loss": 1.0613, + "step": 5743 + }, + { + "epoch": 0.9848897271577683, + "grad_norm": 1.6328125, + "learning_rate": 1.523991374964319e-05, + "loss": 1.0254, + "step": 5744 + }, + { + "epoch": 0.9850611912467583, + "grad_norm": 1.5390625, + "learning_rate": 1.5238375512613113e-05, + "loss": 0.9928, + "step": 5745 + }, + { + "epoch": 0.9852326553357481, + "grad_norm": 1.6015625, + "learning_rate": 
1.5236837104739887e-05, + "loss": 1.0757, + "step": 5746 + }, + { + "epoch": 0.9854041194247379, + "grad_norm": 1.5234375, + "learning_rate": 1.5235298526073684e-05, + "loss": 0.9682, + "step": 5747 + }, + { + "epoch": 0.9855755835137279, + "grad_norm": 1.65625, + "learning_rate": 1.523375977666468e-05, + "loss": 1.0222, + "step": 5748 + }, + { + "epoch": 0.9857470476027177, + "grad_norm": 1.65625, + "learning_rate": 1.523222085656306e-05, + "loss": 1.0489, + "step": 5749 + }, + { + "epoch": 0.9859185116917075, + "grad_norm": 1.5703125, + "learning_rate": 1.523068176581902e-05, + "loss": 1.0337, + "step": 5750 + }, + { + "epoch": 0.9860899757806975, + "grad_norm": 1.5703125, + "learning_rate": 1.5229142504482743e-05, + "loss": 0.987, + "step": 5751 + }, + { + "epoch": 0.9862614398696873, + "grad_norm": 1.5625, + "learning_rate": 1.5227603072604442e-05, + "loss": 1.0087, + "step": 5752 + }, + { + "epoch": 0.9864329039586771, + "grad_norm": 1.453125, + "learning_rate": 1.522606347023432e-05, + "loss": 0.9602, + "step": 5753 + }, + { + "epoch": 0.986604368047667, + "grad_norm": 1.6015625, + "learning_rate": 1.5224523697422588e-05, + "loss": 1.0332, + "step": 5754 + }, + { + "epoch": 0.9867758321366569, + "grad_norm": 1.4765625, + "learning_rate": 1.5222983754219466e-05, + "loss": 0.9192, + "step": 5755 + }, + { + "epoch": 0.9869472962256467, + "grad_norm": 1.5390625, + "learning_rate": 1.5221443640675175e-05, + "loss": 0.9666, + "step": 5756 + }, + { + "epoch": 0.9871187603146366, + "grad_norm": 1.53125, + "learning_rate": 1.5219903356839947e-05, + "loss": 0.9095, + "step": 5757 + }, + { + "epoch": 0.9872902244036265, + "grad_norm": 1.53125, + "learning_rate": 1.521836290276401e-05, + "loss": 0.9543, + "step": 5758 + }, + { + "epoch": 0.9874616884926163, + "grad_norm": 1.5703125, + "learning_rate": 1.521682227849761e-05, + "loss": 1.002, + "step": 5759 + }, + { + "epoch": 0.9876331525816062, + "grad_norm": 1.5, + "learning_rate": 1.5215281484090989e-05, + "loss": 0.9872, + "step": 5760 + }, + { + "epoch": 0.987804616670596, + "grad_norm": 1.4765625, + "learning_rate": 1.5213740519594402e-05, + "loss": 0.9745, + "step": 5761 + }, + { + "epoch": 0.9879760807595859, + "grad_norm": 2.0, + "learning_rate": 1.5212199385058104e-05, + "loss": 0.9453, + "step": 5762 + }, + { + "epoch": 0.9881475448485758, + "grad_norm": 1.484375, + "learning_rate": 1.5210658080532354e-05, + "loss": 0.9648, + "step": 5763 + }, + { + "epoch": 0.9883190089375656, + "grad_norm": 1.640625, + "learning_rate": 1.5209116606067424e-05, + "loss": 1.0302, + "step": 5764 + }, + { + "epoch": 0.9884904730265555, + "grad_norm": 1.5703125, + "learning_rate": 1.5207574961713585e-05, + "loss": 1.0159, + "step": 5765 + }, + { + "epoch": 0.9886619371155454, + "grad_norm": 1.578125, + "learning_rate": 1.520603314752112e-05, + "loss": 0.9792, + "step": 5766 + }, + { + "epoch": 0.9888334012045352, + "grad_norm": 1.6015625, + "learning_rate": 1.5204491163540307e-05, + "loss": 0.9606, + "step": 5767 + }, + { + "epoch": 0.989004865293525, + "grad_norm": 1.6171875, + "learning_rate": 1.5202949009821439e-05, + "loss": 1.0483, + "step": 5768 + }, + { + "epoch": 0.989176329382515, + "grad_norm": 1.5390625, + "learning_rate": 1.5201406686414812e-05, + "loss": 1.0426, + "step": 5769 + }, + { + "epoch": 0.9893477934715048, + "grad_norm": 1.46875, + "learning_rate": 1.5199864193370725e-05, + "loss": 0.9885, + "step": 5770 + }, + { + "epoch": 0.9895192575604946, + "grad_norm": 1.5390625, + "learning_rate": 1.5198321530739487e-05, + "loss": 1.0593, + 
"step": 5771 + }, + { + "epoch": 0.9896907216494846, + "grad_norm": 1.5625, + "learning_rate": 1.5196778698571409e-05, + "loss": 1.0358, + "step": 5772 + }, + { + "epoch": 0.9898621857384744, + "grad_norm": 1.6640625, + "learning_rate": 1.5195235696916809e-05, + "loss": 0.9692, + "step": 5773 + }, + { + "epoch": 0.9900336498274642, + "grad_norm": 1.5703125, + "learning_rate": 1.5193692525826007e-05, + "loss": 0.8914, + "step": 5774 + }, + { + "epoch": 0.9902051139164542, + "grad_norm": 1.5625, + "learning_rate": 1.5192149185349334e-05, + "loss": 0.9372, + "step": 5775 + }, + { + "epoch": 0.990376578005444, + "grad_norm": 1.4921875, + "learning_rate": 1.5190605675537128e-05, + "loss": 0.8688, + "step": 5776 + }, + { + "epoch": 0.9905480420944338, + "grad_norm": 1.5, + "learning_rate": 1.518906199643972e-05, + "loss": 0.9517, + "step": 5777 + }, + { + "epoch": 0.9907195061834237, + "grad_norm": 1.46875, + "learning_rate": 1.5187518148107466e-05, + "loss": 0.8902, + "step": 5778 + }, + { + "epoch": 0.9908909702724136, + "grad_norm": 1.5703125, + "learning_rate": 1.5185974130590704e-05, + "loss": 1.0075, + "step": 5779 + }, + { + "epoch": 0.9910624343614034, + "grad_norm": 1.625, + "learning_rate": 1.51844299439398e-05, + "loss": 1.0397, + "step": 5780 + }, + { + "epoch": 0.9912338984503933, + "grad_norm": 1.5546875, + "learning_rate": 1.5182885588205114e-05, + "loss": 0.9898, + "step": 5781 + }, + { + "epoch": 0.9914053625393832, + "grad_norm": 1.53125, + "learning_rate": 1.5181341063437012e-05, + "loss": 0.9545, + "step": 5782 + }, + { + "epoch": 0.991576826628373, + "grad_norm": 1.65625, + "learning_rate": 1.5179796369685867e-05, + "loss": 1.1139, + "step": 5783 + }, + { + "epoch": 0.9917482907173629, + "grad_norm": 1.46875, + "learning_rate": 1.5178251507002056e-05, + "loss": 0.883, + "step": 5784 + }, + { + "epoch": 0.9919197548063527, + "grad_norm": 1.5234375, + "learning_rate": 1.5176706475435964e-05, + "loss": 0.9783, + "step": 5785 + }, + { + "epoch": 0.9920912188953426, + "grad_norm": 1.640625, + "learning_rate": 1.517516127503798e-05, + "loss": 1.1041, + "step": 5786 + }, + { + "epoch": 0.9922626829843325, + "grad_norm": 1.453125, + "learning_rate": 1.5173615905858499e-05, + "loss": 0.9326, + "step": 5787 + }, + { + "epoch": 0.9924341470733223, + "grad_norm": 1.640625, + "learning_rate": 1.5172070367947922e-05, + "loss": 1.0264, + "step": 5788 + }, + { + "epoch": 0.9926056111623122, + "grad_norm": 1.53125, + "learning_rate": 1.5170524661356654e-05, + "loss": 0.9666, + "step": 5789 + }, + { + "epoch": 0.9927770752513021, + "grad_norm": 1.5078125, + "learning_rate": 1.5168978786135102e-05, + "loss": 0.9852, + "step": 5790 + }, + { + "epoch": 0.9929485393402919, + "grad_norm": 1.6015625, + "learning_rate": 1.5167432742333694e-05, + "loss": 1.0273, + "step": 5791 + }, + { + "epoch": 0.9931200034292818, + "grad_norm": 1.6015625, + "learning_rate": 1.5165886530002842e-05, + "loss": 1.0046, + "step": 5792 + }, + { + "epoch": 0.9932914675182717, + "grad_norm": 1.578125, + "learning_rate": 1.516434014919298e-05, + "loss": 0.9921, + "step": 5793 + }, + { + "epoch": 0.9934629316072615, + "grad_norm": 1.59375, + "learning_rate": 1.5162793599954535e-05, + "loss": 1.0535, + "step": 5794 + }, + { + "epoch": 0.9936343956962513, + "grad_norm": 1.59375, + "learning_rate": 1.5161246882337952e-05, + "loss": 1.0442, + "step": 5795 + }, + { + "epoch": 0.9938058597852413, + "grad_norm": 1.453125, + "learning_rate": 1.5159699996393672e-05, + "loss": 0.94, + "step": 5796 + }, + { + "epoch": 
0.9939773238742311, + "grad_norm": 1.59375, + "learning_rate": 1.5158152942172144e-05, + "loss": 1.0225, + "step": 5797 + }, + { + "epoch": 0.9941487879632209, + "grad_norm": 1.546875, + "learning_rate": 1.5156605719723824e-05, + "loss": 0.9897, + "step": 5798 + }, + { + "epoch": 0.9943202520522109, + "grad_norm": 1.5703125, + "learning_rate": 1.5155058329099176e-05, + "loss": 1.0447, + "step": 5799 + }, + { + "epoch": 0.9944917161412007, + "grad_norm": 1.578125, + "learning_rate": 1.5153510770348665e-05, + "loss": 1.0434, + "step": 5800 + }, + { + "epoch": 0.9946631802301905, + "grad_norm": 1.6015625, + "learning_rate": 1.5151963043522759e-05, + "loss": 0.934, + "step": 5801 + }, + { + "epoch": 0.9948346443191805, + "grad_norm": 1.46875, + "learning_rate": 1.515041514867194e-05, + "loss": 0.9631, + "step": 5802 + }, + { + "epoch": 0.9950061084081703, + "grad_norm": 1.546875, + "learning_rate": 1.5148867085846686e-05, + "loss": 0.9318, + "step": 5803 + }, + { + "epoch": 0.9951775724971601, + "grad_norm": 1.5, + "learning_rate": 1.5147318855097491e-05, + "loss": 0.954, + "step": 5804 + }, + { + "epoch": 0.99534903658615, + "grad_norm": 1.53125, + "learning_rate": 1.5145770456474842e-05, + "loss": 1.044, + "step": 5805 + }, + { + "epoch": 0.9955205006751399, + "grad_norm": 1.625, + "learning_rate": 1.514422189002924e-05, + "loss": 1.0305, + "step": 5806 + }, + { + "epoch": 0.9956919647641297, + "grad_norm": 1.59375, + "learning_rate": 1.5142673155811192e-05, + "loss": 1.0378, + "step": 5807 + }, + { + "epoch": 0.9958634288531196, + "grad_norm": 1.5625, + "learning_rate": 1.5141124253871206e-05, + "loss": 1.0143, + "step": 5808 + }, + { + "epoch": 0.9960348929421095, + "grad_norm": 1.7578125, + "learning_rate": 1.51395751842598e-05, + "loss": 1.0365, + "step": 5809 + }, + { + "epoch": 0.9962063570310993, + "grad_norm": 1.6171875, + "learning_rate": 1.5138025947027495e-05, + "loss": 0.9591, + "step": 5810 + }, + { + "epoch": 0.9963778211200892, + "grad_norm": 1.6171875, + "learning_rate": 1.5136476542224813e-05, + "loss": 1.0616, + "step": 5811 + }, + { + "epoch": 0.996549285209079, + "grad_norm": 1.5546875, + "learning_rate": 1.513492696990229e-05, + "loss": 0.9995, + "step": 5812 + }, + { + "epoch": 0.9967207492980689, + "grad_norm": 1.5234375, + "learning_rate": 1.5133377230110461e-05, + "loss": 0.9262, + "step": 5813 + }, + { + "epoch": 0.9968922133870588, + "grad_norm": 1.6171875, + "learning_rate": 1.5131827322899873e-05, + "loss": 1.0377, + "step": 5814 + }, + { + "epoch": 0.9970636774760486, + "grad_norm": 1.5859375, + "learning_rate": 1.5130277248321068e-05, + "loss": 1.0408, + "step": 5815 + }, + { + "epoch": 0.9972351415650385, + "grad_norm": 1.6953125, + "learning_rate": 1.5128727006424605e-05, + "loss": 0.9927, + "step": 5816 + }, + { + "epoch": 0.9974066056540284, + "grad_norm": 1.59375, + "learning_rate": 1.512717659726104e-05, + "loss": 0.9785, + "step": 5817 + }, + { + "epoch": 0.9975780697430182, + "grad_norm": 1.6015625, + "learning_rate": 1.5125626020880937e-05, + "loss": 1.0031, + "step": 5818 + }, + { + "epoch": 0.997749533832008, + "grad_norm": 1.6171875, + "learning_rate": 1.5124075277334873e-05, + "loss": 1.0179, + "step": 5819 + }, + { + "epoch": 0.997920997920998, + "grad_norm": 1.5, + "learning_rate": 1.5122524366673416e-05, + "loss": 1.0134, + "step": 5820 + }, + { + "epoch": 0.9980924620099878, + "grad_norm": 1.5390625, + "learning_rate": 1.5120973288947149e-05, + "loss": 1.0051, + "step": 5821 + }, + { + "epoch": 0.9982639260989776, + "grad_norm": 1.53125, + 
"learning_rate": 1.5119422044206661e-05, + "loss": 1.0071, + "step": 5822 + }, + { + "epoch": 0.9984353901879675, + "grad_norm": 1.578125, + "learning_rate": 1.5117870632502542e-05, + "loss": 0.9875, + "step": 5823 + }, + { + "epoch": 0.9986068542769574, + "grad_norm": 1.6171875, + "learning_rate": 1.5116319053885388e-05, + "loss": 1.0501, + "step": 5824 + }, + { + "epoch": 0.9987783183659472, + "grad_norm": 1.53125, + "learning_rate": 1.5114767308405807e-05, + "loss": 0.9723, + "step": 5825 + }, + { + "epoch": 0.998949782454937, + "grad_norm": 1.5703125, + "learning_rate": 1.51132153961144e-05, + "loss": 1.0194, + "step": 5826 + }, + { + "epoch": 0.999121246543927, + "grad_norm": 1.4296875, + "learning_rate": 1.5111663317061785e-05, + "loss": 1.0033, + "step": 5827 + }, + { + "epoch": 0.9992927106329168, + "grad_norm": 1.6328125, + "learning_rate": 1.5110111071298581e-05, + "loss": 0.9697, + "step": 5828 + }, + { + "epoch": 0.9994641747219066, + "grad_norm": 1.4765625, + "learning_rate": 1.5108558658875411e-05, + "loss": 0.9253, + "step": 5829 + }, + { + "epoch": 0.9996356388108966, + "grad_norm": 1.4765625, + "learning_rate": 1.510700607984291e-05, + "loss": 0.9752, + "step": 5830 + }, + { + "epoch": 0.9998071028998864, + "grad_norm": 1.6015625, + "learning_rate": 1.5105453334251708e-05, + "loss": 0.9819, + "step": 5831 + }, + { + "epoch": 0.9999785669888762, + "grad_norm": 1.5078125, + "learning_rate": 1.5103900422152445e-05, + "loss": 0.9919, + "step": 5832 + }, + { + "epoch": 1.0001500310778662, + "grad_norm": 1.5390625, + "learning_rate": 1.510234734359577e-05, + "loss": 0.9498, + "step": 5833 + }, + { + "epoch": 1.000321495166856, + "grad_norm": 1.5390625, + "learning_rate": 1.5100794098632336e-05, + "loss": 0.9506, + "step": 5834 + }, + { + "epoch": 1.0004929592558458, + "grad_norm": 1.5546875, + "learning_rate": 1.5099240687312797e-05, + "loss": 1.128, + "step": 5835 + }, + { + "epoch": 1.0006644233448356, + "grad_norm": 1.46875, + "learning_rate": 1.5097687109687818e-05, + "loss": 0.9603, + "step": 5836 + }, + { + "epoch": 1.0008358874338257, + "grad_norm": 1.625, + "learning_rate": 1.5096133365808068e-05, + "loss": 0.935, + "step": 5837 + }, + { + "epoch": 1.0010073515228155, + "grad_norm": 1.5625, + "learning_rate": 1.5094579455724216e-05, + "loss": 1.0625, + "step": 5838 + }, + { + "epoch": 1.0011788156118053, + "grad_norm": 1.5703125, + "learning_rate": 1.5093025379486945e-05, + "loss": 1.0261, + "step": 5839 + }, + { + "epoch": 1.0013502797007952, + "grad_norm": 1.65625, + "learning_rate": 1.5091471137146938e-05, + "loss": 1.0069, + "step": 5840 + }, + { + "epoch": 1.001521743789785, + "grad_norm": 1.5859375, + "learning_rate": 1.5089916728754882e-05, + "loss": 0.9623, + "step": 5841 + }, + { + "epoch": 1.0016932078787748, + "grad_norm": 1.5390625, + "learning_rate": 1.5088362154361477e-05, + "loss": 0.9904, + "step": 5842 + }, + { + "epoch": 1.0018646719677649, + "grad_norm": 1.4296875, + "learning_rate": 1.5086807414017424e-05, + "loss": 0.8465, + "step": 5843 + }, + { + "epoch": 1.0020361360567547, + "grad_norm": 1.6171875, + "learning_rate": 1.508525250777342e-05, + "loss": 1.0136, + "step": 5844 + }, + { + "epoch": 1.0022076001457445, + "grad_norm": 1.6015625, + "learning_rate": 1.5083697435680185e-05, + "loss": 1.001, + "step": 5845 + }, + { + "epoch": 1.0023790642347343, + "grad_norm": 1.5703125, + "learning_rate": 1.5082142197788435e-05, + "loss": 1.0401, + "step": 5846 + }, + { + "epoch": 1.0025505283237242, + "grad_norm": 1.5078125, + "learning_rate": 
1.5080586794148887e-05, + "loss": 1.0249, + "step": 5847 + }, + { + "epoch": 1.002721992412714, + "grad_norm": 1.5859375, + "learning_rate": 1.5079031224812276e-05, + "loss": 0.9805, + "step": 5848 + }, + { + "epoch": 1.002893456501704, + "grad_norm": 1.484375, + "learning_rate": 1.5077475489829327e-05, + "loss": 0.9111, + "step": 5849 + }, + { + "epoch": 1.0030649205906939, + "grad_norm": 1.5078125, + "learning_rate": 1.5075919589250785e-05, + "loss": 0.9759, + "step": 5850 + }, + { + "epoch": 1.0032363846796837, + "grad_norm": 1.546875, + "learning_rate": 1.5074363523127389e-05, + "loss": 0.9997, + "step": 5851 + }, + { + "epoch": 1.0034078487686735, + "grad_norm": 1.5546875, + "learning_rate": 1.5072807291509892e-05, + "loss": 0.9388, + "step": 5852 + }, + { + "epoch": 1.0035793128576633, + "grad_norm": 1.453125, + "learning_rate": 1.5071250894449046e-05, + "loss": 0.9081, + "step": 5853 + }, + { + "epoch": 1.0037507769466532, + "grad_norm": 1.5, + "learning_rate": 1.506969433199561e-05, + "loss": 0.9728, + "step": 5854 + }, + { + "epoch": 1.0039222410356432, + "grad_norm": 1.5078125, + "learning_rate": 1.5068137604200353e-05, + "loss": 0.9584, + "step": 5855 + }, + { + "epoch": 1.004093705124633, + "grad_norm": 1.546875, + "learning_rate": 1.5066580711114045e-05, + "loss": 1.0177, + "step": 5856 + }, + { + "epoch": 1.0042651692136229, + "grad_norm": 1.5546875, + "learning_rate": 1.5065023652787459e-05, + "loss": 1.0031, + "step": 5857 + }, + { + "epoch": 1.0044366333026127, + "grad_norm": 1.6171875, + "learning_rate": 1.506346642927138e-05, + "loss": 0.9219, + "step": 5858 + }, + { + "epoch": 1.0046080973916025, + "grad_norm": 1.546875, + "learning_rate": 1.5061909040616592e-05, + "loss": 0.9689, + "step": 5859 + }, + { + "epoch": 1.0047795614805923, + "grad_norm": 1.6171875, + "learning_rate": 1.5060351486873893e-05, + "loss": 1.0111, + "step": 5860 + }, + { + "epoch": 1.0049510255695824, + "grad_norm": 1.6484375, + "learning_rate": 1.505879376809407e-05, + "loss": 0.9455, + "step": 5861 + }, + { + "epoch": 1.0051224896585722, + "grad_norm": 1.65625, + "learning_rate": 1.5057235884327938e-05, + "loss": 0.9665, + "step": 5862 + }, + { + "epoch": 1.005293953747562, + "grad_norm": 1.53125, + "learning_rate": 1.5055677835626296e-05, + "loss": 0.9387, + "step": 5863 + }, + { + "epoch": 1.0054654178365519, + "grad_norm": 1.5546875, + "learning_rate": 1.5054119622039962e-05, + "loss": 0.998, + "step": 5864 + }, + { + "epoch": 1.0056368819255417, + "grad_norm": 1.578125, + "learning_rate": 1.5052561243619756e-05, + "loss": 1.047, + "step": 5865 + }, + { + "epoch": 1.0058083460145315, + "grad_norm": 1.5390625, + "learning_rate": 1.5051002700416505e-05, + "loss": 0.9357, + "step": 5866 + }, + { + "epoch": 1.0059798101035216, + "grad_norm": 1.6171875, + "learning_rate": 1.5049443992481031e-05, + "loss": 1.1065, + "step": 5867 + }, + { + "epoch": 1.0061512741925114, + "grad_norm": 1.5390625, + "learning_rate": 1.5047885119864179e-05, + "loss": 1.032, + "step": 5868 + }, + { + "epoch": 1.0063227382815012, + "grad_norm": 2.53125, + "learning_rate": 1.504632608261678e-05, + "loss": 0.9967, + "step": 5869 + }, + { + "epoch": 1.006494202370491, + "grad_norm": 1.6015625, + "learning_rate": 1.5044766880789686e-05, + "loss": 0.9772, + "step": 5870 + }, + { + "epoch": 1.0066656664594809, + "grad_norm": 1.5234375, + "learning_rate": 1.504320751443375e-05, + "loss": 0.9998, + "step": 5871 + }, + { + "epoch": 1.0068371305484707, + "grad_norm": 1.46875, + "learning_rate": 1.504164798359982e-05, + "loss": 
0.9344, + "step": 5872 + }, + { + "epoch": 1.0070085946374605, + "grad_norm": 1.578125, + "learning_rate": 1.504008828833877e-05, + "loss": 1.1216, + "step": 5873 + }, + { + "epoch": 1.0071800587264506, + "grad_norm": 1.59375, + "learning_rate": 1.503852842870146e-05, + "loss": 1.004, + "step": 5874 + }, + { + "epoch": 1.0073515228154404, + "grad_norm": 1.5078125, + "learning_rate": 1.503696840473876e-05, + "loss": 0.9362, + "step": 5875 + }, + { + "epoch": 1.0075229869044302, + "grad_norm": 1.46875, + "learning_rate": 1.5035408216501559e-05, + "loss": 0.9073, + "step": 5876 + }, + { + "epoch": 1.00769445099342, + "grad_norm": 1.4609375, + "learning_rate": 1.5033847864040734e-05, + "loss": 0.9681, + "step": 5877 + }, + { + "epoch": 1.0078659150824099, + "grad_norm": 1.5625, + "learning_rate": 1.5032287347407173e-05, + "loss": 0.9663, + "step": 5878 + }, + { + "epoch": 1.0080373791713997, + "grad_norm": 1.5234375, + "learning_rate": 1.5030726666651771e-05, + "loss": 0.9741, + "step": 5879 + }, + { + "epoch": 1.0082088432603897, + "grad_norm": 1.53125, + "learning_rate": 1.5029165821825429e-05, + "loss": 1.0372, + "step": 5880 + }, + { + "epoch": 1.0083803073493796, + "grad_norm": 1.5625, + "learning_rate": 1.502760481297905e-05, + "loss": 1.0015, + "step": 5881 + }, + { + "epoch": 1.0085517714383694, + "grad_norm": 1.6953125, + "learning_rate": 1.5026043640163548e-05, + "loss": 1.0011, + "step": 5882 + }, + { + "epoch": 1.0087232355273592, + "grad_norm": 1.6640625, + "learning_rate": 1.5024482303429837e-05, + "loss": 1.0835, + "step": 5883 + }, + { + "epoch": 1.008894699616349, + "grad_norm": 1.5390625, + "learning_rate": 1.5022920802828835e-05, + "loss": 0.9673, + "step": 5884 + }, + { + "epoch": 1.0090661637053389, + "grad_norm": 2.34375, + "learning_rate": 1.5021359138411473e-05, + "loss": 1.0423, + "step": 5885 + }, + { + "epoch": 1.009237627794329, + "grad_norm": 1.53125, + "learning_rate": 1.5019797310228681e-05, + "loss": 0.9883, + "step": 5886 + }, + { + "epoch": 1.0094090918833187, + "grad_norm": 1.6484375, + "learning_rate": 1.5018235318331394e-05, + "loss": 1.0063, + "step": 5887 + }, + { + "epoch": 1.0095805559723086, + "grad_norm": 1.53125, + "learning_rate": 1.5016673162770556e-05, + "loss": 1.0686, + "step": 5888 + }, + { + "epoch": 1.0097520200612984, + "grad_norm": 1.5625, + "learning_rate": 1.5015110843597121e-05, + "loss": 0.997, + "step": 5889 + }, + { + "epoch": 1.0099234841502882, + "grad_norm": 1.6328125, + "learning_rate": 1.5013548360862032e-05, + "loss": 0.9651, + "step": 5890 + }, + { + "epoch": 1.010094948239278, + "grad_norm": 1.796875, + "learning_rate": 1.5011985714616248e-05, + "loss": 1.0072, + "step": 5891 + }, + { + "epoch": 1.010266412328268, + "grad_norm": 1.6328125, + "learning_rate": 1.501042290491074e-05, + "loss": 1.0448, + "step": 5892 + }, + { + "epoch": 1.010437876417258, + "grad_norm": 1.640625, + "learning_rate": 1.5008859931796474e-05, + "loss": 1.0269, + "step": 5893 + }, + { + "epoch": 1.0106093405062477, + "grad_norm": 1.5078125, + "learning_rate": 1.5007296795324427e-05, + "loss": 1.0092, + "step": 5894 + }, + { + "epoch": 1.0107808045952376, + "grad_norm": 1.609375, + "learning_rate": 1.5005733495545572e-05, + "loss": 0.9801, + "step": 5895 + }, + { + "epoch": 1.0109522686842274, + "grad_norm": 1.53125, + "learning_rate": 1.5004170032510896e-05, + "loss": 0.9324, + "step": 5896 + }, + { + "epoch": 1.0111237327732172, + "grad_norm": 1.625, + "learning_rate": 1.5002606406271393e-05, + "loss": 1.0021, + "step": 5897 + }, + { + "epoch": 
1.0112951968622073, + "grad_norm": 1.5078125, + "learning_rate": 1.5001042616878057e-05, + "loss": 0.9742, + "step": 5898 + }, + { + "epoch": 1.011466660951197, + "grad_norm": 1.5625, + "learning_rate": 1.499947866438189e-05, + "loss": 0.9153, + "step": 5899 + }, + { + "epoch": 1.011638125040187, + "grad_norm": 1.5625, + "learning_rate": 1.4997914548833898e-05, + "loss": 1.026, + "step": 5900 + }, + { + "epoch": 1.0118095891291767, + "grad_norm": 1.625, + "learning_rate": 1.4996350270285089e-05, + "loss": 0.9569, + "step": 5901 + }, + { + "epoch": 1.0119810532181666, + "grad_norm": 1.8359375, + "learning_rate": 1.4994785828786487e-05, + "loss": 0.9064, + "step": 5902 + }, + { + "epoch": 1.0001714640889898, + "grad_norm": 1.75, + "learning_rate": 1.4993221224389109e-05, + "loss": 0.9062, + "step": 5903 + }, + { + "epoch": 1.0003429281779797, + "grad_norm": 1.7109375, + "learning_rate": 1.4991656457143984e-05, + "loss": 0.8624, + "step": 5904 + }, + { + "epoch": 1.0005143922669695, + "grad_norm": 1.515625, + "learning_rate": 1.4990091527102145e-05, + "loss": 0.8074, + "step": 5905 + }, + { + "epoch": 1.0006858563559595, + "grad_norm": 1.625, + "learning_rate": 1.4988526434314633e-05, + "loss": 0.8797, + "step": 5906 + }, + { + "epoch": 1.0008573204449493, + "grad_norm": 1.5625, + "learning_rate": 1.4986961178832485e-05, + "loss": 0.9083, + "step": 5907 + }, + { + "epoch": 1.0010287845339392, + "grad_norm": 1.6171875, + "learning_rate": 1.4985395760706756e-05, + "loss": 0.8394, + "step": 5908 + }, + { + "epoch": 1.001200248622929, + "grad_norm": 1.7109375, + "learning_rate": 1.4983830179988497e-05, + "loss": 0.7831, + "step": 5909 + }, + { + "epoch": 1.0013717127119188, + "grad_norm": 1.8359375, + "learning_rate": 1.4982264436728768e-05, + "loss": 0.9046, + "step": 5910 + }, + { + "epoch": 1.0015431768009087, + "grad_norm": 1.84375, + "learning_rate": 1.4980698530978632e-05, + "loss": 0.8875, + "step": 5911 + }, + { + "epoch": 1.0017146408898987, + "grad_norm": 1.703125, + "learning_rate": 1.4979132462789168e-05, + "loss": 0.7999, + "step": 5912 + }, + { + "epoch": 1.0018861049788885, + "grad_norm": 1.6015625, + "learning_rate": 1.497756623221144e-05, + "loss": 0.8818, + "step": 5913 + }, + { + "epoch": 1.0020575690678784, + "grad_norm": 1.609375, + "learning_rate": 1.4975999839296534e-05, + "loss": 0.8981, + "step": 5914 + }, + { + "epoch": 1.0022290331568682, + "grad_norm": 1.71875, + "learning_rate": 1.4974433284095535e-05, + "loss": 0.9054, + "step": 5915 + }, + { + "epoch": 1.002400497245858, + "grad_norm": 1.6953125, + "learning_rate": 1.4972866566659537e-05, + "loss": 0.8543, + "step": 5916 + }, + { + "epoch": 1.0025719613348478, + "grad_norm": 2.0, + "learning_rate": 1.4971299687039634e-05, + "loss": 0.9116, + "step": 5917 + }, + { + "epoch": 1.0027434254238379, + "grad_norm": 1.6015625, + "learning_rate": 1.4969732645286925e-05, + "loss": 0.8365, + "step": 5918 + }, + { + "epoch": 1.0029148895128277, + "grad_norm": 1.71875, + "learning_rate": 1.496816544145252e-05, + "loss": 0.9019, + "step": 5919 + }, + { + "epoch": 1.0030863536018175, + "grad_norm": 1.609375, + "learning_rate": 1.4966598075587532e-05, + "loss": 0.8775, + "step": 5920 + }, + { + "epoch": 1.0032578176908074, + "grad_norm": 1.59375, + "learning_rate": 1.496503054774308e-05, + "loss": 0.7859, + "step": 5921 + }, + { + "epoch": 1.0034292817797972, + "grad_norm": 1.703125, + "learning_rate": 1.4963462857970285e-05, + "loss": 0.8872, + "step": 5922 + }, + { + "epoch": 1.003600745868787, + "grad_norm": 1.7109375, + 
"learning_rate": 1.4961895006320276e-05, + "loss": 0.9129, + "step": 5923 + }, + { + "epoch": 1.003772209957777, + "grad_norm": 1.5703125, + "learning_rate": 1.4960326992844188e-05, + "loss": 0.8566, + "step": 5924 + }, + { + "epoch": 1.0039436740467669, + "grad_norm": 1.7578125, + "learning_rate": 1.4958758817593153e-05, + "loss": 0.9195, + "step": 5925 + }, + { + "epoch": 1.0041151381357567, + "grad_norm": 1.75, + "learning_rate": 1.4957190480618323e-05, + "loss": 0.9396, + "step": 5926 + }, + { + "epoch": 1.0042866022247465, + "grad_norm": 1.6953125, + "learning_rate": 1.4955621981970844e-05, + "loss": 0.8806, + "step": 5927 + }, + { + "epoch": 1.0044580663137364, + "grad_norm": 1.578125, + "learning_rate": 1.4954053321701871e-05, + "loss": 0.8617, + "step": 5928 + }, + { + "epoch": 1.0046295304027262, + "grad_norm": 1.609375, + "learning_rate": 1.4952484499862563e-05, + "loss": 0.8603, + "step": 5929 + }, + { + "epoch": 1.0048009944917162, + "grad_norm": 1.6953125, + "learning_rate": 1.4950915516504088e-05, + "loss": 0.8906, + "step": 5930 + }, + { + "epoch": 1.004972458580706, + "grad_norm": 1.6953125, + "learning_rate": 1.4949346371677612e-05, + "loss": 0.9146, + "step": 5931 + }, + { + "epoch": 1.0051439226696959, + "grad_norm": 1.625, + "learning_rate": 1.4947777065434315e-05, + "loss": 0.9114, + "step": 5932 + }, + { + "epoch": 1.0053153867586857, + "grad_norm": 1.71875, + "learning_rate": 1.4946207597825375e-05, + "loss": 0.8995, + "step": 5933 + }, + { + "epoch": 1.0054868508476755, + "grad_norm": 1.671875, + "learning_rate": 1.494463796890198e-05, + "loss": 0.8308, + "step": 5934 + }, + { + "epoch": 1.0056583149366654, + "grad_norm": 1.65625, + "learning_rate": 1.4943068178715322e-05, + "loss": 0.8965, + "step": 5935 + }, + { + "epoch": 1.0058297790256554, + "grad_norm": 1.7109375, + "learning_rate": 1.49414982273166e-05, + "loss": 0.8661, + "step": 5936 + }, + { + "epoch": 1.0060012431146452, + "grad_norm": 1.671875, + "learning_rate": 1.4939928114757007e-05, + "loss": 0.863, + "step": 5937 + }, + { + "epoch": 1.006172707203635, + "grad_norm": 1.71875, + "learning_rate": 1.4938357841087757e-05, + "loss": 0.8777, + "step": 5938 + }, + { + "epoch": 1.0063441712926249, + "grad_norm": 1.6640625, + "learning_rate": 1.4936787406360062e-05, + "loss": 0.8211, + "step": 5939 + }, + { + "epoch": 1.0065156353816147, + "grad_norm": 1.6484375, + "learning_rate": 1.4935216810625141e-05, + "loss": 0.9032, + "step": 5940 + }, + { + "epoch": 1.0066870994706045, + "grad_norm": 1.59375, + "learning_rate": 1.4933646053934216e-05, + "loss": 0.8251, + "step": 5941 + }, + { + "epoch": 1.0068585635595946, + "grad_norm": 1.6796875, + "learning_rate": 1.4932075136338515e-05, + "loss": 0.8489, + "step": 5942 + }, + { + "epoch": 1.0070300276485844, + "grad_norm": 1.6015625, + "learning_rate": 1.493050405788927e-05, + "loss": 0.8285, + "step": 5943 + }, + { + "epoch": 1.0072014917375742, + "grad_norm": 1.4921875, + "learning_rate": 1.492893281863772e-05, + "loss": 0.8343, + "step": 5944 + }, + { + "epoch": 1.007372955826564, + "grad_norm": 1.5703125, + "learning_rate": 1.4927361418635111e-05, + "loss": 0.8195, + "step": 5945 + }, + { + "epoch": 1.0075444199155539, + "grad_norm": 1.609375, + "learning_rate": 1.4925789857932693e-05, + "loss": 0.8629, + "step": 5946 + }, + { + "epoch": 1.0077158840045437, + "grad_norm": 1.5859375, + "learning_rate": 1.492421813658172e-05, + "loss": 0.7921, + "step": 5947 + }, + { + "epoch": 1.0078873480935338, + "grad_norm": 1.53125, + "learning_rate": 
1.4922646254633446e-05, + "loss": 0.8804, + "step": 5948 + }, + { + "epoch": 1.0080588121825236, + "grad_norm": 1.640625, + "learning_rate": 1.4921074212139141e-05, + "loss": 0.8934, + "step": 5949 + }, + { + "epoch": 1.0082302762715134, + "grad_norm": 1.6171875, + "learning_rate": 1.4919502009150077e-05, + "loss": 0.8651, + "step": 5950 + }, + { + "epoch": 1.0084017403605032, + "grad_norm": 1.5546875, + "learning_rate": 1.4917929645717528e-05, + "loss": 0.8723, + "step": 5951 + }, + { + "epoch": 1.008573204449493, + "grad_norm": 1.6328125, + "learning_rate": 1.4916357121892775e-05, + "loss": 0.8532, + "step": 5952 + }, + { + "epoch": 1.0087446685384829, + "grad_norm": 1.609375, + "learning_rate": 1.49147844377271e-05, + "loss": 0.8857, + "step": 5953 + }, + { + "epoch": 1.008916132627473, + "grad_norm": 1.65625, + "learning_rate": 1.4913211593271802e-05, + "loss": 0.8886, + "step": 5954 + }, + { + "epoch": 1.0090875967164628, + "grad_norm": 1.7734375, + "learning_rate": 1.4911638588578167e-05, + "loss": 0.8719, + "step": 5955 + }, + { + "epoch": 1.0092590608054526, + "grad_norm": 1.65625, + "learning_rate": 1.4910065423697504e-05, + "loss": 0.9048, + "step": 5956 + }, + { + "epoch": 1.0094305248944424, + "grad_norm": 1.5859375, + "learning_rate": 1.4908492098681118e-05, + "loss": 0.7576, + "step": 5957 + }, + { + "epoch": 1.0096019889834322, + "grad_norm": 1.671875, + "learning_rate": 1.4906918613580322e-05, + "loss": 0.8747, + "step": 5958 + }, + { + "epoch": 1.009773453072422, + "grad_norm": 1.546875, + "learning_rate": 1.490534496844643e-05, + "loss": 0.8154, + "step": 5959 + }, + { + "epoch": 1.009944917161412, + "grad_norm": 1.6640625, + "learning_rate": 1.4903771163330766e-05, + "loss": 0.7974, + "step": 5960 + }, + { + "epoch": 1.010116381250402, + "grad_norm": 1.7890625, + "learning_rate": 1.4902197198284663e-05, + "loss": 0.8984, + "step": 5961 + }, + { + "epoch": 1.0102878453393918, + "grad_norm": 1.5234375, + "learning_rate": 1.4900623073359445e-05, + "loss": 0.8233, + "step": 5962 + }, + { + "epoch": 1.0104593094283816, + "grad_norm": 1.5234375, + "learning_rate": 1.4899048788606456e-05, + "loss": 0.8013, + "step": 5963 + }, + { + "epoch": 1.0106307735173714, + "grad_norm": 1.6015625, + "learning_rate": 1.4897474344077041e-05, + "loss": 0.8674, + "step": 5964 + }, + { + "epoch": 1.0108022376063612, + "grad_norm": 1.6171875, + "learning_rate": 1.4895899739822542e-05, + "loss": 0.8388, + "step": 5965 + }, + { + "epoch": 1.0109737016953513, + "grad_norm": 1.5234375, + "learning_rate": 1.4894324975894314e-05, + "loss": 0.8599, + "step": 5966 + }, + { + "epoch": 1.011145165784341, + "grad_norm": 1.6796875, + "learning_rate": 1.4892750052343723e-05, + "loss": 0.8238, + "step": 5967 + }, + { + "epoch": 1.011316629873331, + "grad_norm": 1.5625, + "learning_rate": 1.4891174969222127e-05, + "loss": 0.8735, + "step": 5968 + }, + { + "epoch": 1.0114880939623208, + "grad_norm": 1.546875, + "learning_rate": 1.4889599726580898e-05, + "loss": 0.7963, + "step": 5969 + }, + { + "epoch": 1.0116595580513106, + "grad_norm": 1.5703125, + "learning_rate": 1.4888024324471406e-05, + "loss": 0.9024, + "step": 5970 + }, + { + "epoch": 1.0118310221403004, + "grad_norm": 1.546875, + "learning_rate": 1.4886448762945035e-05, + "loss": 0.8448, + "step": 5971 + }, + { + "epoch": 1.0120024862292905, + "grad_norm": 1.578125, + "learning_rate": 1.4884873042053171e-05, + "loss": 0.8464, + "step": 5972 + }, + { + "epoch": 1.0121739503182803, + "grad_norm": 1.5546875, + "learning_rate": 1.4883297161847204e-05, + 
"loss": 0.8066, + "step": 5973 + }, + { + "epoch": 1.01234541440727, + "grad_norm": 1.625, + "learning_rate": 1.4881721122378523e-05, + "loss": 0.8588, + "step": 5974 + }, + { + "epoch": 1.01251687849626, + "grad_norm": 1.640625, + "learning_rate": 1.4880144923698537e-05, + "loss": 0.8857, + "step": 5975 + }, + { + "epoch": 1.0126883425852498, + "grad_norm": 1.6171875, + "learning_rate": 1.4878568565858644e-05, + "loss": 0.8952, + "step": 5976 + }, + { + "epoch": 1.0128598066742396, + "grad_norm": 1.6875, + "learning_rate": 1.4876992048910264e-05, + "loss": 0.8991, + "step": 5977 + }, + { + "epoch": 1.0130312707632296, + "grad_norm": 1.6171875, + "learning_rate": 1.4875415372904804e-05, + "loss": 0.842, + "step": 5978 + }, + { + "epoch": 1.0132027348522195, + "grad_norm": 1.5625, + "learning_rate": 1.4873838537893693e-05, + "loss": 0.9149, + "step": 5979 + }, + { + "epoch": 1.0133741989412093, + "grad_norm": 1.609375, + "learning_rate": 1.4872261543928353e-05, + "loss": 0.8517, + "step": 5980 + }, + { + "epoch": 1.013545663030199, + "grad_norm": 1.6015625, + "learning_rate": 1.4870684391060217e-05, + "loss": 0.8273, + "step": 5981 + }, + { + "epoch": 1.013717127119189, + "grad_norm": 1.5703125, + "learning_rate": 1.4869107079340721e-05, + "loss": 0.8642, + "step": 5982 + }, + { + "epoch": 1.0138885912081788, + "grad_norm": 1.625, + "learning_rate": 1.486752960882131e-05, + "loss": 0.8797, + "step": 5983 + }, + { + "epoch": 1.0140600552971688, + "grad_norm": 1.6328125, + "learning_rate": 1.4865951979553426e-05, + "loss": 0.8991, + "step": 5984 + }, + { + "epoch": 1.0142315193861586, + "grad_norm": 1.65625, + "learning_rate": 1.4864374191588522e-05, + "loss": 0.8193, + "step": 5985 + }, + { + "epoch": 1.0144029834751485, + "grad_norm": 1.6640625, + "learning_rate": 1.4862796244978061e-05, + "loss": 0.8228, + "step": 5986 + }, + { + "epoch": 1.0145744475641383, + "grad_norm": 1.5546875, + "learning_rate": 1.4861218139773506e-05, + "loss": 0.7662, + "step": 5987 + }, + { + "epoch": 1.014745911653128, + "grad_norm": 1.609375, + "learning_rate": 1.485963987602632e-05, + "loss": 0.9135, + "step": 5988 + }, + { + "epoch": 1.014917375742118, + "grad_norm": 1.6015625, + "learning_rate": 1.4858061453787977e-05, + "loss": 0.8718, + "step": 5989 + }, + { + "epoch": 1.015088839831108, + "grad_norm": 1.640625, + "learning_rate": 1.4856482873109956e-05, + "loss": 0.8989, + "step": 5990 + }, + { + "epoch": 1.0152603039200978, + "grad_norm": 1.6484375, + "learning_rate": 1.4854904134043741e-05, + "loss": 0.7878, + "step": 5991 + }, + { + "epoch": 1.0154317680090876, + "grad_norm": 1.7265625, + "learning_rate": 1.4853325236640821e-05, + "loss": 0.886, + "step": 5992 + }, + { + "epoch": 1.0156032320980775, + "grad_norm": 1.6796875, + "learning_rate": 1.4851746180952687e-05, + "loss": 0.849, + "step": 5993 + }, + { + "epoch": 1.0157746961870673, + "grad_norm": 1.5859375, + "learning_rate": 1.4850166967030842e-05, + "loss": 0.8621, + "step": 5994 + }, + { + "epoch": 1.015946160276057, + "grad_norm": 1.6484375, + "learning_rate": 1.4848587594926787e-05, + "loss": 0.9694, + "step": 5995 + }, + { + "epoch": 1.0161176243650472, + "grad_norm": 1.6328125, + "learning_rate": 1.4847008064692035e-05, + "loss": 0.932, + "step": 5996 + }, + { + "epoch": 1.016289088454037, + "grad_norm": 1.6796875, + "learning_rate": 1.4845428376378094e-05, + "loss": 0.8344, + "step": 5997 + }, + { + "epoch": 1.0164605525430268, + "grad_norm": 1.59375, + "learning_rate": 1.484384853003649e-05, + "loss": 0.8891, + "step": 5998 + }, + { + 
"epoch": 1.0166320166320166, + "grad_norm": 1.6640625, + "learning_rate": 1.4842268525718747e-05, + "loss": 0.9256, + "step": 5999 + }, + { + "epoch": 1.0168034807210065, + "grad_norm": 1.6484375, + "learning_rate": 1.484068836347639e-05, + "loss": 0.822, + "step": 6000 + }, + { + "epoch": 1.0169749448099963, + "grad_norm": 1.5859375, + "learning_rate": 1.4839108043360958e-05, + "loss": 0.8274, + "step": 6001 + }, + { + "epoch": 1.017146408898986, + "grad_norm": 1.6328125, + "learning_rate": 1.483752756542399e-05, + "loss": 0.8475, + "step": 6002 + }, + { + "epoch": 1.0173178729879762, + "grad_norm": 1.546875, + "learning_rate": 1.4835946929717031e-05, + "loss": 0.797, + "step": 6003 + }, + { + "epoch": 1.017489337076966, + "grad_norm": 1.6640625, + "learning_rate": 1.4834366136291633e-05, + "loss": 0.8124, + "step": 6004 + }, + { + "epoch": 1.0176608011659558, + "grad_norm": 1.7265625, + "learning_rate": 1.483278518519935e-05, + "loss": 0.8464, + "step": 6005 + }, + { + "epoch": 1.0178322652549456, + "grad_norm": 1.6796875, + "learning_rate": 1.4831204076491744e-05, + "loss": 0.8662, + "step": 6006 + }, + { + "epoch": 1.0180037293439355, + "grad_norm": 1.6953125, + "learning_rate": 1.4829622810220383e-05, + "loss": 0.8657, + "step": 6007 + }, + { + "epoch": 1.0181751934329253, + "grad_norm": 1.625, + "learning_rate": 1.482804138643683e-05, + "loss": 0.8258, + "step": 6008 + }, + { + "epoch": 1.0183466575219153, + "grad_norm": 1.6875, + "learning_rate": 1.4826459805192673e-05, + "loss": 0.8241, + "step": 6009 + }, + { + "epoch": 1.0185181216109052, + "grad_norm": 1.703125, + "learning_rate": 1.4824878066539485e-05, + "loss": 0.8901, + "step": 6010 + }, + { + "epoch": 1.018689585699895, + "grad_norm": 1.7578125, + "learning_rate": 1.4823296170528854e-05, + "loss": 0.8794, + "step": 6011 + }, + { + "epoch": 1.0188610497888848, + "grad_norm": 1.6015625, + "learning_rate": 1.4821714117212369e-05, + "loss": 0.8573, + "step": 6012 + }, + { + "epoch": 1.0190325138778746, + "grad_norm": 1.625, + "learning_rate": 1.4820131906641631e-05, + "loss": 0.7691, + "step": 6013 + }, + { + "epoch": 1.0192039779668645, + "grad_norm": 1.8125, + "learning_rate": 1.481854953886824e-05, + "loss": 0.9168, + "step": 6014 + }, + { + "epoch": 1.0193754420558545, + "grad_norm": 1.65625, + "learning_rate": 1.4816967013943808e-05, + "loss": 0.8843, + "step": 6015 + }, + { + "epoch": 1.0195469061448443, + "grad_norm": 1.546875, + "learning_rate": 1.481538433191994e-05, + "loss": 0.8351, + "step": 6016 + }, + { + "epoch": 1.0197183702338342, + "grad_norm": 1.578125, + "learning_rate": 1.4813801492848256e-05, + "loss": 0.8354, + "step": 6017 + }, + { + "epoch": 1.019889834322824, + "grad_norm": 1.65625, + "learning_rate": 1.4812218496780378e-05, + "loss": 1.0101, + "step": 6018 + }, + { + "epoch": 1.0200612984118138, + "grad_norm": 1.6640625, + "learning_rate": 1.4810635343767935e-05, + "loss": 0.8163, + "step": 6019 + }, + { + "epoch": 1.0202327625008036, + "grad_norm": 1.5703125, + "learning_rate": 1.4809052033862555e-05, + "loss": 0.781, + "step": 6020 + }, + { + "epoch": 1.0204042265897937, + "grad_norm": 1.609375, + "learning_rate": 1.480746856711588e-05, + "loss": 0.8783, + "step": 6021 + }, + { + "epoch": 1.0205756906787835, + "grad_norm": 1.5859375, + "learning_rate": 1.4805884943579551e-05, + "loss": 0.8485, + "step": 6022 + }, + { + "epoch": 1.0207471547677733, + "grad_norm": 1.7265625, + "learning_rate": 1.4804301163305219e-05, + "loss": 0.9251, + "step": 6023 + }, + { + "epoch": 1.0209186188567632, + 
"grad_norm": 1.6953125, + "learning_rate": 1.4802717226344533e-05, + "loss": 0.8207, + "step": 6024 + }, + { + "epoch": 1.021090082945753, + "grad_norm": 1.6953125, + "learning_rate": 1.4801133132749157e-05, + "loss": 0.8953, + "step": 6025 + }, + { + "epoch": 1.0212615470347428, + "grad_norm": 2.15625, + "learning_rate": 1.4799548882570744e-05, + "loss": 0.8914, + "step": 6026 + }, + { + "epoch": 1.0214330111237329, + "grad_norm": 1.6796875, + "learning_rate": 1.4797964475860975e-05, + "loss": 0.8625, + "step": 6027 + }, + { + "epoch": 1.0216044752127227, + "grad_norm": 1.6328125, + "learning_rate": 1.4796379912671514e-05, + "loss": 0.8708, + "step": 6028 + }, + { + "epoch": 1.0217759393017125, + "grad_norm": 1.6875, + "learning_rate": 1.479479519305404e-05, + "loss": 0.8928, + "step": 6029 + }, + { + "epoch": 1.0219474033907023, + "grad_norm": 1.5078125, + "learning_rate": 1.4793210317060243e-05, + "loss": 0.8637, + "step": 6030 + }, + { + "epoch": 1.0221188674796922, + "grad_norm": 1.625, + "learning_rate": 1.4791625284741807e-05, + "loss": 0.8429, + "step": 6031 + }, + { + "epoch": 1.022290331568682, + "grad_norm": 1.53125, + "learning_rate": 1.4790040096150425e-05, + "loss": 0.7881, + "step": 6032 + }, + { + "epoch": 1.022461795657672, + "grad_norm": 1.6484375, + "learning_rate": 1.4788454751337799e-05, + "loss": 0.9203, + "step": 6033 + }, + { + "epoch": 1.0226332597466619, + "grad_norm": 1.6328125, + "learning_rate": 1.4786869250355632e-05, + "loss": 0.8146, + "step": 6034 + }, + { + "epoch": 1.0228047238356517, + "grad_norm": 1.5859375, + "learning_rate": 1.4785283593255633e-05, + "loss": 0.7602, + "step": 6035 + }, + { + "epoch": 1.0229761879246415, + "grad_norm": 1.6484375, + "learning_rate": 1.4783697780089517e-05, + "loss": 0.8346, + "step": 6036 + }, + { + "epoch": 1.0231476520136313, + "grad_norm": 1.671875, + "learning_rate": 1.4782111810909002e-05, + "loss": 0.9469, + "step": 6037 + }, + { + "epoch": 1.0233191161026212, + "grad_norm": 1.6640625, + "learning_rate": 1.4780525685765813e-05, + "loss": 0.9122, + "step": 6038 + }, + { + "epoch": 1.0234905801916112, + "grad_norm": 1.734375, + "learning_rate": 1.4778939404711682e-05, + "loss": 0.8712, + "step": 6039 + }, + { + "epoch": 1.023662044280601, + "grad_norm": 1.7734375, + "learning_rate": 1.4777352967798335e-05, + "loss": 0.9272, + "step": 6040 + }, + { + "epoch": 1.0238335083695909, + "grad_norm": 1.6796875, + "learning_rate": 1.4775766375077523e-05, + "loss": 0.885, + "step": 6041 + }, + { + "epoch": 1.0240049724585807, + "grad_norm": 1.6953125, + "learning_rate": 1.4774179626600984e-05, + "loss": 0.8927, + "step": 6042 + }, + { + "epoch": 1.0241764365475705, + "grad_norm": 1.578125, + "learning_rate": 1.4772592722420468e-05, + "loss": 0.7589, + "step": 6043 + }, + { + "epoch": 1.0243479006365603, + "grad_norm": 1.7890625, + "learning_rate": 1.4771005662587737e-05, + "loss": 0.7592, + "step": 6044 + }, + { + "epoch": 1.0245193647255504, + "grad_norm": 1.609375, + "learning_rate": 1.4769418447154541e-05, + "loss": 0.7909, + "step": 6045 + }, + { + "epoch": 1.0246908288145402, + "grad_norm": 1.609375, + "learning_rate": 1.4767831076172649e-05, + "loss": 0.8449, + "step": 6046 + }, + { + "epoch": 1.02486229290353, + "grad_norm": 1.5390625, + "learning_rate": 1.4766243549693836e-05, + "loss": 0.7959, + "step": 6047 + }, + { + "epoch": 1.0250337569925199, + "grad_norm": 1.6796875, + "learning_rate": 1.4764655867769869e-05, + "loss": 0.8639, + "step": 6048 + }, + { + "epoch": 1.0252052210815097, + "grad_norm": 1.6328125, + 
"learning_rate": 1.4763068030452532e-05, + "loss": 0.8417, + "step": 6049 + }, + { + "epoch": 1.0253766851704995, + "grad_norm": 1.546875, + "learning_rate": 1.476148003779361e-05, + "loss": 0.8112, + "step": 6050 + }, + { + "epoch": 1.0255481492594896, + "grad_norm": 1.6640625, + "learning_rate": 1.4759891889844895e-05, + "loss": 0.845, + "step": 6051 + }, + { + "epoch": 1.0257196133484794, + "grad_norm": 1.5703125, + "learning_rate": 1.4758303586658183e-05, + "loss": 0.7849, + "step": 6052 + }, + { + "epoch": 1.0258910774374692, + "grad_norm": 1.609375, + "learning_rate": 1.4756715128285271e-05, + "loss": 0.7499, + "step": 6053 + }, + { + "epoch": 1.026062541526459, + "grad_norm": 1.609375, + "learning_rate": 1.4755126514777965e-05, + "loss": 0.8702, + "step": 6054 + }, + { + "epoch": 1.0262340056154489, + "grad_norm": 1.7578125, + "learning_rate": 1.4753537746188081e-05, + "loss": 0.8347, + "step": 6055 + }, + { + "epoch": 1.0264054697044387, + "grad_norm": 1.59375, + "learning_rate": 1.475194882256743e-05, + "loss": 0.8415, + "step": 6056 + }, + { + "epoch": 1.0265769337934287, + "grad_norm": 1.6640625, + "learning_rate": 1.4750359743967833e-05, + "loss": 0.8358, + "step": 6057 + }, + { + "epoch": 1.0267483978824186, + "grad_norm": 1.6484375, + "learning_rate": 1.4748770510441115e-05, + "loss": 0.8432, + "step": 6058 + }, + { + "epoch": 1.0269198619714084, + "grad_norm": 1.6328125, + "learning_rate": 1.4747181122039109e-05, + "loss": 0.9186, + "step": 6059 + }, + { + "epoch": 1.0270913260603982, + "grad_norm": 1.671875, + "learning_rate": 1.4745591578813651e-05, + "loss": 0.8913, + "step": 6060 + }, + { + "epoch": 1.027262790149388, + "grad_norm": 1.671875, + "learning_rate": 1.4744001880816581e-05, + "loss": 0.8778, + "step": 6061 + }, + { + "epoch": 1.0274342542383779, + "grad_norm": 1.78125, + "learning_rate": 1.4742412028099748e-05, + "loss": 0.9405, + "step": 6062 + }, + { + "epoch": 1.027605718327368, + "grad_norm": 1.6953125, + "learning_rate": 1.4740822020714999e-05, + "loss": 0.8893, + "step": 6063 + }, + { + "epoch": 1.0277771824163577, + "grad_norm": 1.6328125, + "learning_rate": 1.473923185871419e-05, + "loss": 0.8459, + "step": 6064 + }, + { + "epoch": 1.0279486465053476, + "grad_norm": 1.6953125, + "learning_rate": 1.4737641542149184e-05, + "loss": 0.867, + "step": 6065 + }, + { + "epoch": 1.0281201105943374, + "grad_norm": 1.6875, + "learning_rate": 1.4736051071071848e-05, + "loss": 0.8569, + "step": 6066 + }, + { + "epoch": 1.0282915746833272, + "grad_norm": 1.6640625, + "learning_rate": 1.4734460445534053e-05, + "loss": 0.8905, + "step": 6067 + }, + { + "epoch": 1.028463038772317, + "grad_norm": 1.5546875, + "learning_rate": 1.4732869665587673e-05, + "loss": 0.8276, + "step": 6068 + }, + { + "epoch": 1.028634502861307, + "grad_norm": 1.7265625, + "learning_rate": 1.4731278731284591e-05, + "loss": 0.904, + "step": 6069 + }, + { + "epoch": 1.028805966950297, + "grad_norm": 1.6171875, + "learning_rate": 1.4729687642676693e-05, + "loss": 0.8826, + "step": 6070 + }, + { + "epoch": 1.0289774310392867, + "grad_norm": 1.6875, + "learning_rate": 1.4728096399815873e-05, + "loss": 0.8896, + "step": 6071 + }, + { + "epoch": 1.0291488951282766, + "grad_norm": 1.8046875, + "learning_rate": 1.4726505002754023e-05, + "loss": 0.8389, + "step": 6072 + }, + { + "epoch": 1.0293203592172664, + "grad_norm": 1.6171875, + "learning_rate": 1.4724913451543048e-05, + "loss": 0.8424, + "step": 6073 + }, + { + "epoch": 1.0294918233062562, + "grad_norm": 1.7421875, + "learning_rate": 
1.4723321746234852e-05, + "loss": 0.9061, + "step": 6074 + }, + { + "epoch": 1.0296632873952463, + "grad_norm": 1.59375, + "learning_rate": 1.4721729886881346e-05, + "loss": 0.8773, + "step": 6075 + }, + { + "epoch": 1.029834751484236, + "grad_norm": 1.640625, + "learning_rate": 1.4720137873534451e-05, + "loss": 0.8438, + "step": 6076 + }, + { + "epoch": 1.030006215573226, + "grad_norm": 1.6640625, + "learning_rate": 1.4718545706246083e-05, + "loss": 0.8368, + "step": 6077 + }, + { + "epoch": 1.0301776796622157, + "grad_norm": 1.6171875, + "learning_rate": 1.471695338506817e-05, + "loss": 0.9028, + "step": 6078 + }, + { + "epoch": 1.0303491437512056, + "grad_norm": 1.625, + "learning_rate": 1.4715360910052648e-05, + "loss": 0.8155, + "step": 6079 + }, + { + "epoch": 1.0305206078401954, + "grad_norm": 1.6640625, + "learning_rate": 1.4713768281251448e-05, + "loss": 0.9443, + "step": 6080 + }, + { + "epoch": 1.0306920719291854, + "grad_norm": 1.609375, + "learning_rate": 1.4712175498716517e-05, + "loss": 0.8505, + "step": 6081 + }, + { + "epoch": 1.0308635360181753, + "grad_norm": 1.5625, + "learning_rate": 1.4710582562499796e-05, + "loss": 0.8387, + "step": 6082 + }, + { + "epoch": 1.031035000107165, + "grad_norm": 1.5703125, + "learning_rate": 1.4708989472653239e-05, + "loss": 0.7899, + "step": 6083 + }, + { + "epoch": 1.031206464196155, + "grad_norm": 1.546875, + "learning_rate": 1.4707396229228803e-05, + "loss": 0.8112, + "step": 6084 + }, + { + "epoch": 1.0313779282851447, + "grad_norm": 1.625, + "learning_rate": 1.4705802832278453e-05, + "loss": 0.9203, + "step": 6085 + }, + { + "epoch": 1.0315493923741346, + "grad_norm": 1.625, + "learning_rate": 1.4704209281854149e-05, + "loss": 0.8632, + "step": 6086 + }, + { + "epoch": 1.0317208564631244, + "grad_norm": 1.703125, + "learning_rate": 1.4702615578007863e-05, + "loss": 0.9001, + "step": 6087 + }, + { + "epoch": 1.0318923205521144, + "grad_norm": 1.625, + "learning_rate": 1.470102172079158e-05, + "loss": 0.7713, + "step": 6088 + }, + { + "epoch": 1.0320637846411043, + "grad_norm": 1.625, + "learning_rate": 1.4699427710257275e-05, + "loss": 0.8201, + "step": 6089 + }, + { + "epoch": 1.032235248730094, + "grad_norm": 1.671875, + "learning_rate": 1.4697833546456935e-05, + "loss": 0.915, + "step": 6090 + }, + { + "epoch": 1.032406712819084, + "grad_norm": 1.6015625, + "learning_rate": 1.4696239229442553e-05, + "loss": 0.8307, + "step": 6091 + }, + { + "epoch": 1.0325781769080737, + "grad_norm": 1.6171875, + "learning_rate": 1.4694644759266126e-05, + "loss": 0.8525, + "step": 6092 + }, + { + "epoch": 1.0327496409970638, + "grad_norm": 1.625, + "learning_rate": 1.4693050135979654e-05, + "loss": 0.784, + "step": 6093 + }, + { + "epoch": 1.0329211050860536, + "grad_norm": 1.578125, + "learning_rate": 1.4691455359635143e-05, + "loss": 0.8912, + "step": 6094 + }, + { + "epoch": 1.0330925691750434, + "grad_norm": 1.6796875, + "learning_rate": 1.4689860430284608e-05, + "loss": 0.8415, + "step": 6095 + }, + { + "epoch": 1.0332640332640333, + "grad_norm": 1.640625, + "learning_rate": 1.4688265347980065e-05, + "loss": 0.9884, + "step": 6096 + }, + { + "epoch": 1.033435497353023, + "grad_norm": 1.609375, + "learning_rate": 1.4686670112773534e-05, + "loss": 0.8707, + "step": 6097 + }, + { + "epoch": 1.033606961442013, + "grad_norm": 1.4765625, + "learning_rate": 1.4685074724717041e-05, + "loss": 0.7489, + "step": 6098 + }, + { + "epoch": 1.0337784255310027, + "grad_norm": 1.640625, + "learning_rate": 1.468347918386262e-05, + "loss": 0.866, + "step": 6099 
+ }, + { + "epoch": 1.0339498896199928, + "grad_norm": 1.6328125, + "learning_rate": 1.4681883490262308e-05, + "loss": 0.8207, + "step": 6100 + }, + { + "epoch": 1.0341213537089826, + "grad_norm": 1.609375, + "learning_rate": 1.4680287643968143e-05, + "loss": 0.7684, + "step": 6101 + }, + { + "epoch": 1.0342928177979724, + "grad_norm": 1.7890625, + "learning_rate": 1.4678691645032175e-05, + "loss": 0.9001, + "step": 6102 + }, + { + "epoch": 1.0344642818869623, + "grad_norm": 1.6484375, + "learning_rate": 1.4677095493506452e-05, + "loss": 0.9167, + "step": 6103 + }, + { + "epoch": 1.034635745975952, + "grad_norm": 1.640625, + "learning_rate": 1.4675499189443035e-05, + "loss": 0.8546, + "step": 6104 + }, + { + "epoch": 1.034807210064942, + "grad_norm": 1.6015625, + "learning_rate": 1.4673902732893981e-05, + "loss": 0.924, + "step": 6105 + }, + { + "epoch": 1.034978674153932, + "grad_norm": 1.703125, + "learning_rate": 1.4672306123911359e-05, + "loss": 0.8848, + "step": 6106 + }, + { + "epoch": 1.0351501382429218, + "grad_norm": 1.6640625, + "learning_rate": 1.467070936254724e-05, + "loss": 0.8747, + "step": 6107 + }, + { + "epoch": 1.0353216023319116, + "grad_norm": 1.65625, + "learning_rate": 1.4669112448853702e-05, + "loss": 0.8458, + "step": 6108 + }, + { + "epoch": 1.0354930664209014, + "grad_norm": 1.6015625, + "learning_rate": 1.4667515382882825e-05, + "loss": 0.899, + "step": 6109 + }, + { + "epoch": 1.0356645305098913, + "grad_norm": 1.5078125, + "learning_rate": 1.4665918164686692e-05, + "loss": 0.8748, + "step": 6110 + }, + { + "epoch": 1.035835994598881, + "grad_norm": 1.6875, + "learning_rate": 1.4664320794317399e-05, + "loss": 0.8575, + "step": 6111 + }, + { + "epoch": 1.0360074586878711, + "grad_norm": 1.6875, + "learning_rate": 1.4662723271827043e-05, + "loss": 0.8943, + "step": 6112 + }, + { + "epoch": 1.036178922776861, + "grad_norm": 1.640625, + "learning_rate": 1.466112559726772e-05, + "loss": 0.8982, + "step": 6113 + }, + { + "epoch": 1.0363503868658508, + "grad_norm": 1.703125, + "learning_rate": 1.4659527770691541e-05, + "loss": 0.9443, + "step": 6114 + }, + { + "epoch": 1.0365218509548406, + "grad_norm": 1.71875, + "learning_rate": 1.4657929792150611e-05, + "loss": 0.8187, + "step": 6115 + }, + { + "epoch": 1.0366933150438304, + "grad_norm": 1.6015625, + "learning_rate": 1.4656331661697055e-05, + "loss": 0.8568, + "step": 6116 + }, + { + "epoch": 1.0368647791328203, + "grad_norm": 1.5703125, + "learning_rate": 1.4654733379382986e-05, + "loss": 0.8306, + "step": 6117 + }, + { + "epoch": 1.0370362432218103, + "grad_norm": 1.6953125, + "learning_rate": 1.4653134945260536e-05, + "loss": 0.9314, + "step": 6118 + }, + { + "epoch": 1.0372077073108001, + "grad_norm": 1.71875, + "learning_rate": 1.4651536359381835e-05, + "loss": 0.8781, + "step": 6119 + }, + { + "epoch": 1.03737917139979, + "grad_norm": 1.7109375, + "learning_rate": 1.4649937621799016e-05, + "loss": 0.9768, + "step": 6120 + }, + { + "epoch": 1.0375506354887798, + "grad_norm": 1.59375, + "learning_rate": 1.464833873256422e-05, + "loss": 0.7535, + "step": 6121 + }, + { + "epoch": 1.0377220995777696, + "grad_norm": 1.546875, + "learning_rate": 1.4646739691729592e-05, + "loss": 0.7748, + "step": 6122 + }, + { + "epoch": 1.0378935636667594, + "grad_norm": 1.671875, + "learning_rate": 1.4645140499347287e-05, + "loss": 0.8655, + "step": 6123 + }, + { + "epoch": 1.0380650277557495, + "grad_norm": 1.5625, + "learning_rate": 1.4643541155469459e-05, + "loss": 0.8241, + "step": 6124 + }, + { + "epoch": 1.0382364918447393, 
+ "grad_norm": 1.515625, + "learning_rate": 1.4641941660148267e-05, + "loss": 0.8163, + "step": 6125 + }, + { + "epoch": 1.0384079559337291, + "grad_norm": 1.71875, + "learning_rate": 1.4640342013435875e-05, + "loss": 0.9213, + "step": 6126 + }, + { + "epoch": 1.038579420022719, + "grad_norm": 1.625, + "learning_rate": 1.463874221538446e-05, + "loss": 0.7863, + "step": 6127 + }, + { + "epoch": 1.0387508841117088, + "grad_norm": 1.7890625, + "learning_rate": 1.4637142266046192e-05, + "loss": 0.8414, + "step": 6128 + }, + { + "epoch": 1.0389223482006986, + "grad_norm": 1.6640625, + "learning_rate": 1.4635542165473254e-05, + "loss": 0.9, + "step": 6129 + }, + { + "epoch": 1.0390938122896887, + "grad_norm": 1.703125, + "learning_rate": 1.4633941913717826e-05, + "loss": 0.8092, + "step": 6130 + }, + { + "epoch": 1.0392652763786785, + "grad_norm": 1.6484375, + "learning_rate": 1.4632341510832109e-05, + "loss": 0.9552, + "step": 6131 + }, + { + "epoch": 1.0394367404676683, + "grad_norm": 1.6015625, + "learning_rate": 1.4630740956868289e-05, + "loss": 0.7918, + "step": 6132 + }, + { + "epoch": 1.0396082045566581, + "grad_norm": 1.640625, + "learning_rate": 1.4629140251878567e-05, + "loss": 0.8943, + "step": 6133 + }, + { + "epoch": 1.039779668645648, + "grad_norm": 1.6640625, + "learning_rate": 1.4627539395915146e-05, + "loss": 0.8804, + "step": 6134 + }, + { + "epoch": 1.0399511327346378, + "grad_norm": 1.7578125, + "learning_rate": 1.4625938389030246e-05, + "loss": 0.8774, + "step": 6135 + }, + { + "epoch": 1.0401225968236278, + "grad_norm": 1.6171875, + "learning_rate": 1.4624337231276071e-05, + "loss": 0.7932, + "step": 6136 + }, + { + "epoch": 1.0402940609126177, + "grad_norm": 1.59375, + "learning_rate": 1.4622735922704849e-05, + "loss": 0.8117, + "step": 6137 + }, + { + "epoch": 1.0404655250016075, + "grad_norm": 1.703125, + "learning_rate": 1.46211344633688e-05, + "loss": 0.9195, + "step": 6138 + }, + { + "epoch": 1.0406369890905973, + "grad_norm": 1.734375, + "learning_rate": 1.4619532853320153e-05, + "loss": 0.8734, + "step": 6139 + }, + { + "epoch": 1.0408084531795871, + "grad_norm": 1.5859375, + "learning_rate": 1.4617931092611146e-05, + "loss": 0.7903, + "step": 6140 + }, + { + "epoch": 1.040979917268577, + "grad_norm": 1.8359375, + "learning_rate": 1.4616329181294016e-05, + "loss": 0.8192, + "step": 6141 + }, + { + "epoch": 1.041151381357567, + "grad_norm": 1.609375, + "learning_rate": 1.4614727119421007e-05, + "loss": 0.8648, + "step": 6142 + }, + { + "epoch": 1.0413228454465568, + "grad_norm": 1.7265625, + "learning_rate": 1.4613124907044369e-05, + "loss": 0.8881, + "step": 6143 + }, + { + "epoch": 1.0414943095355467, + "grad_norm": 1.71875, + "learning_rate": 1.4611522544216357e-05, + "loss": 0.8722, + "step": 6144 + }, + { + "epoch": 1.0416657736245365, + "grad_norm": 1.59375, + "learning_rate": 1.460992003098923e-05, + "loss": 0.81, + "step": 6145 + }, + { + "epoch": 1.0418372377135263, + "grad_norm": 1.6484375, + "learning_rate": 1.460831736741525e-05, + "loss": 0.8484, + "step": 6146 + }, + { + "epoch": 1.0420087018025161, + "grad_norm": 1.6953125, + "learning_rate": 1.4606714553546689e-05, + "loss": 0.8397, + "step": 6147 + }, + { + "epoch": 1.0421801658915062, + "grad_norm": 1.671875, + "learning_rate": 1.460511158943582e-05, + "loss": 0.8013, + "step": 6148 + }, + { + "epoch": 1.042351629980496, + "grad_norm": 1.734375, + "learning_rate": 1.460350847513492e-05, + "loss": 0.9249, + "step": 6149 + }, + { + "epoch": 1.0425230940694858, + "grad_norm": 1.5625, + "learning_rate": 
1.4601905210696273e-05, + "loss": 0.843, + "step": 6150 + }, + { + "epoch": 1.0426945581584757, + "grad_norm": 1.8984375, + "learning_rate": 1.4600301796172167e-05, + "loss": 0.9169, + "step": 6151 + }, + { + "epoch": 1.0428660222474655, + "grad_norm": 1.6640625, + "learning_rate": 1.4598698231614896e-05, + "loss": 0.8661, + "step": 6152 + }, + { + "epoch": 1.0430374863364553, + "grad_norm": 1.625, + "learning_rate": 1.459709451707676e-05, + "loss": 0.8222, + "step": 6153 + }, + { + "epoch": 1.0432089504254454, + "grad_norm": 1.578125, + "learning_rate": 1.459549065261006e-05, + "loss": 0.8195, + "step": 6154 + }, + { + "epoch": 1.0433804145144352, + "grad_norm": 1.6328125, + "learning_rate": 1.4593886638267104e-05, + "loss": 0.9094, + "step": 6155 + }, + { + "epoch": 1.043551878603425, + "grad_norm": 1.5859375, + "learning_rate": 1.4592282474100207e-05, + "loss": 0.8695, + "step": 6156 + }, + { + "epoch": 1.0437233426924148, + "grad_norm": 1.609375, + "learning_rate": 1.4590678160161686e-05, + "loss": 0.8661, + "step": 6157 + }, + { + "epoch": 1.0438948067814047, + "grad_norm": 1.5625, + "learning_rate": 1.4589073696503864e-05, + "loss": 0.8281, + "step": 6158 + }, + { + "epoch": 1.0440662708703945, + "grad_norm": 1.75, + "learning_rate": 1.4587469083179065e-05, + "loss": 0.9914, + "step": 6159 + }, + { + "epoch": 1.0442377349593845, + "grad_norm": 1.6796875, + "learning_rate": 1.4585864320239629e-05, + "loss": 0.8431, + "step": 6160 + }, + { + "epoch": 1.0444091990483744, + "grad_norm": 1.6953125, + "learning_rate": 1.4584259407737884e-05, + "loss": 0.8542, + "step": 6161 + }, + { + "epoch": 1.0445806631373642, + "grad_norm": 1.609375, + "learning_rate": 1.4582654345726177e-05, + "loss": 0.8523, + "step": 6162 + }, + { + "epoch": 1.044752127226354, + "grad_norm": 1.625, + "learning_rate": 1.4581049134256857e-05, + "loss": 0.9015, + "step": 6163 + }, + { + "epoch": 1.0449235913153438, + "grad_norm": 1.640625, + "learning_rate": 1.4579443773382273e-05, + "loss": 0.8365, + "step": 6164 + }, + { + "epoch": 1.0450950554043337, + "grad_norm": 1.609375, + "learning_rate": 1.4577838263154787e-05, + "loss": 0.8479, + "step": 6165 + }, + { + "epoch": 1.0452665194933237, + "grad_norm": 1.65625, + "learning_rate": 1.4576232603626754e-05, + "loss": 0.8198, + "step": 6166 + }, + { + "epoch": 1.0454379835823135, + "grad_norm": 1.65625, + "learning_rate": 1.4574626794850541e-05, + "loss": 0.8248, + "step": 6167 + }, + { + "epoch": 1.0456094476713034, + "grad_norm": 1.625, + "learning_rate": 1.4573020836878524e-05, + "loss": 0.8216, + "step": 6168 + }, + { + "epoch": 1.0457809117602932, + "grad_norm": 1.609375, + "learning_rate": 1.4571414729763076e-05, + "loss": 0.8822, + "step": 6169 + }, + { + "epoch": 1.045952375849283, + "grad_norm": 1.6484375, + "learning_rate": 1.456980847355658e-05, + "loss": 0.9406, + "step": 6170 + }, + { + "epoch": 1.0461238399382728, + "grad_norm": 1.5234375, + "learning_rate": 1.4568202068311421e-05, + "loss": 0.7995, + "step": 6171 + }, + { + "epoch": 1.046295304027263, + "grad_norm": 1.6640625, + "learning_rate": 1.4566595514079991e-05, + "loss": 0.7696, + "step": 6172 + }, + { + "epoch": 1.0464667681162527, + "grad_norm": 1.515625, + "learning_rate": 1.4564988810914686e-05, + "loss": 0.8146, + "step": 6173 + }, + { + "epoch": 1.0466382322052425, + "grad_norm": 1.6171875, + "learning_rate": 1.4563381958867903e-05, + "loss": 0.8455, + "step": 6174 + }, + { + "epoch": 1.0468096962942324, + "grad_norm": 1.59375, + "learning_rate": 1.4561774957992052e-05, + "loss": 0.8198, + 
"step": 6175 + }, + { + "epoch": 1.0469811603832222, + "grad_norm": 1.609375, + "learning_rate": 1.456016780833954e-05, + "loss": 0.8871, + "step": 6176 + }, + { + "epoch": 1.047152624472212, + "grad_norm": 1.6640625, + "learning_rate": 1.455856050996279e-05, + "loss": 0.8185, + "step": 6177 + }, + { + "epoch": 1.047324088561202, + "grad_norm": 1.546875, + "learning_rate": 1.455695306291421e-05, + "loss": 0.8382, + "step": 6178 + }, + { + "epoch": 1.047495552650192, + "grad_norm": 1.625, + "learning_rate": 1.4555345467246231e-05, + "loss": 0.9098, + "step": 6179 + }, + { + "epoch": 1.0476670167391817, + "grad_norm": 1.609375, + "learning_rate": 1.4553737723011283e-05, + "loss": 0.89, + "step": 6180 + }, + { + "epoch": 1.0478384808281715, + "grad_norm": 1.671875, + "learning_rate": 1.4552129830261797e-05, + "loss": 0.8876, + "step": 6181 + }, + { + "epoch": 1.0480099449171614, + "grad_norm": 1.671875, + "learning_rate": 1.4550521789050218e-05, + "loss": 0.9873, + "step": 6182 + }, + { + "epoch": 1.0481814090061512, + "grad_norm": 1.6015625, + "learning_rate": 1.454891359942899e-05, + "loss": 0.8154, + "step": 6183 + }, + { + "epoch": 1.048352873095141, + "grad_norm": 1.59375, + "learning_rate": 1.4547305261450559e-05, + "loss": 0.9423, + "step": 6184 + }, + { + "epoch": 1.048524337184131, + "grad_norm": 1.671875, + "learning_rate": 1.454569677516738e-05, + "loss": 0.8704, + "step": 6185 + }, + { + "epoch": 1.048695801273121, + "grad_norm": 1.6796875, + "learning_rate": 1.4544088140631912e-05, + "loss": 0.8998, + "step": 6186 + }, + { + "epoch": 1.0488672653621107, + "grad_norm": 1.6328125, + "learning_rate": 1.4542479357896615e-05, + "loss": 0.9563, + "step": 6187 + }, + { + "epoch": 1.0490387294511005, + "grad_norm": 1.6015625, + "learning_rate": 1.4540870427013962e-05, + "loss": 0.8739, + "step": 6188 + }, + { + "epoch": 1.0492101935400904, + "grad_norm": 1.6953125, + "learning_rate": 1.4539261348036426e-05, + "loss": 0.8912, + "step": 6189 + }, + { + "epoch": 1.0493816576290804, + "grad_norm": 1.671875, + "learning_rate": 1.4537652121016485e-05, + "loss": 0.8597, + "step": 6190 + }, + { + "epoch": 1.0495531217180702, + "grad_norm": 1.671875, + "learning_rate": 1.453604274600662e-05, + "loss": 0.9169, + "step": 6191 + }, + { + "epoch": 1.04972458580706, + "grad_norm": 1.796875, + "learning_rate": 1.453443322305932e-05, + "loss": 0.8305, + "step": 6192 + }, + { + "epoch": 1.04989604989605, + "grad_norm": 1.6015625, + "learning_rate": 1.4532823552227078e-05, + "loss": 0.832, + "step": 6193 + }, + { + "epoch": 1.0500675139850397, + "grad_norm": 1.7890625, + "learning_rate": 1.4531213733562392e-05, + "loss": 0.9607, + "step": 6194 + }, + { + "epoch": 1.0502389780740295, + "grad_norm": 1.6640625, + "learning_rate": 1.4529603767117761e-05, + "loss": 0.8243, + "step": 6195 + }, + { + "epoch": 1.0504104421630194, + "grad_norm": 1.625, + "learning_rate": 1.4527993652945696e-05, + "loss": 0.8031, + "step": 6196 + }, + { + "epoch": 1.0505819062520094, + "grad_norm": 1.625, + "learning_rate": 1.4526383391098704e-05, + "loss": 0.9309, + "step": 6197 + }, + { + "epoch": 1.0507533703409992, + "grad_norm": 1.640625, + "learning_rate": 1.4524772981629309e-05, + "loss": 0.8646, + "step": 6198 + }, + { + "epoch": 1.050924834429989, + "grad_norm": 1.734375, + "learning_rate": 1.4523162424590025e-05, + "loss": 0.9192, + "step": 6199 + }, + { + "epoch": 1.051096298518979, + "grad_norm": 1.5, + "learning_rate": 1.4521551720033382e-05, + "loss": 0.8055, + "step": 6200 + }, + { + "epoch": 1.0512677626079687, + 
"grad_norm": 1.6640625, + "learning_rate": 1.4519940868011913e-05, + "loss": 0.8371, + "step": 6201 + }, + { + "epoch": 1.0514392266969586, + "grad_norm": 1.546875, + "learning_rate": 1.4518329868578149e-05, + "loss": 0.8334, + "step": 6202 + }, + { + "epoch": 1.0516106907859486, + "grad_norm": 1.671875, + "learning_rate": 1.4516718721784635e-05, + "loss": 0.8649, + "step": 6203 + }, + { + "epoch": 1.0517821548749384, + "grad_norm": 1.703125, + "learning_rate": 1.4515107427683917e-05, + "loss": 0.9058, + "step": 6204 + }, + { + "epoch": 1.0519536189639282, + "grad_norm": 1.6328125, + "learning_rate": 1.4513495986328541e-05, + "loss": 0.8435, + "step": 6205 + }, + { + "epoch": 1.052125083052918, + "grad_norm": 1.734375, + "learning_rate": 1.4511884397771065e-05, + "loss": 0.846, + "step": 6206 + }, + { + "epoch": 1.052296547141908, + "grad_norm": 1.6875, + "learning_rate": 1.451027266206405e-05, + "loss": 0.8909, + "step": 6207 + }, + { + "epoch": 1.0524680112308977, + "grad_norm": 1.6328125, + "learning_rate": 1.4508660779260057e-05, + "loss": 0.9091, + "step": 6208 + }, + { + "epoch": 1.0526394753198878, + "grad_norm": 1.5859375, + "learning_rate": 1.4507048749411658e-05, + "loss": 0.8144, + "step": 6209 + }, + { + "epoch": 1.0528109394088776, + "grad_norm": 1.71875, + "learning_rate": 1.4505436572571428e-05, + "loss": 0.8372, + "step": 6210 + }, + { + "epoch": 1.0529824034978674, + "grad_norm": 1.5703125, + "learning_rate": 1.4503824248791946e-05, + "loss": 0.8087, + "step": 6211 + }, + { + "epoch": 1.0531538675868573, + "grad_norm": 1.671875, + "learning_rate": 1.4502211778125799e-05, + "loss": 0.9126, + "step": 6212 + }, + { + "epoch": 1.053325331675847, + "grad_norm": 1.7421875, + "learning_rate": 1.4500599160625565e-05, + "loss": 0.8595, + "step": 6213 + }, + { + "epoch": 1.053496795764837, + "grad_norm": 1.578125, + "learning_rate": 1.4498986396343851e-05, + "loss": 0.8395, + "step": 6214 + }, + { + "epoch": 1.053668259853827, + "grad_norm": 1.6015625, + "learning_rate": 1.4497373485333245e-05, + "loss": 0.8365, + "step": 6215 + }, + { + "epoch": 1.0538397239428168, + "grad_norm": 1.703125, + "learning_rate": 1.4495760427646355e-05, + "loss": 0.9233, + "step": 6216 + }, + { + "epoch": 1.0540111880318066, + "grad_norm": 1.640625, + "learning_rate": 1.4494147223335789e-05, + "loss": 0.7995, + "step": 6217 + }, + { + "epoch": 1.0541826521207964, + "grad_norm": 1.8125, + "learning_rate": 1.4492533872454157e-05, + "loss": 0.9579, + "step": 6218 + }, + { + "epoch": 1.0543541162097863, + "grad_norm": 1.7734375, + "learning_rate": 1.449092037505408e-05, + "loss": 0.9061, + "step": 6219 + }, + { + "epoch": 1.054525580298776, + "grad_norm": 1.78125, + "learning_rate": 1.4489306731188178e-05, + "loss": 0.8286, + "step": 6220 + }, + { + "epoch": 1.0546970443877661, + "grad_norm": 1.734375, + "learning_rate": 1.4487692940909077e-05, + "loss": 0.9238, + "step": 6221 + }, + { + "epoch": 1.054868508476756, + "grad_norm": 1.7265625, + "learning_rate": 1.4486079004269411e-05, + "loss": 0.8596, + "step": 6222 + }, + { + "epoch": 1.0550399725657458, + "grad_norm": 1.6796875, + "learning_rate": 1.4484464921321818e-05, + "loss": 0.8859, + "step": 6223 + }, + { + "epoch": 1.0552114366547356, + "grad_norm": 1.640625, + "learning_rate": 1.4482850692118932e-05, + "loss": 0.8388, + "step": 6224 + }, + { + "epoch": 1.0553829007437254, + "grad_norm": 1.625, + "learning_rate": 1.4481236316713408e-05, + "loss": 0.8139, + "step": 6225 + }, + { + "epoch": 1.0555543648327153, + "grad_norm": 1.7265625, + 
"learning_rate": 1.447962179515789e-05, + "loss": 0.9119, + "step": 6226 + }, + { + "epoch": 1.0557258289217053, + "grad_norm": 1.6875, + "learning_rate": 1.4478007127505039e-05, + "loss": 0.8191, + "step": 6227 + }, + { + "epoch": 1.0558972930106951, + "grad_norm": 1.703125, + "learning_rate": 1.4476392313807508e-05, + "loss": 0.8268, + "step": 6228 + }, + { + "epoch": 1.056068757099685, + "grad_norm": 1.703125, + "learning_rate": 1.4474777354117974e-05, + "loss": 0.8519, + "step": 6229 + }, + { + "epoch": 1.0562402211886748, + "grad_norm": 1.6015625, + "learning_rate": 1.4473162248489097e-05, + "loss": 0.8713, + "step": 6230 + }, + { + "epoch": 1.0564116852776646, + "grad_norm": 1.703125, + "learning_rate": 1.4471546996973555e-05, + "loss": 0.8951, + "step": 6231 + }, + { + "epoch": 1.0565831493666544, + "grad_norm": 1.6171875, + "learning_rate": 1.4469931599624027e-05, + "loss": 0.806, + "step": 6232 + }, + { + "epoch": 1.0567546134556445, + "grad_norm": 1.6328125, + "learning_rate": 1.4468316056493197e-05, + "loss": 0.8072, + "step": 6233 + }, + { + "epoch": 1.0569260775446343, + "grad_norm": 1.6875, + "learning_rate": 1.4466700367633754e-05, + "loss": 0.8183, + "step": 6234 + }, + { + "epoch": 1.0570975416336241, + "grad_norm": 1.6328125, + "learning_rate": 1.4465084533098394e-05, + "loss": 0.7764, + "step": 6235 + }, + { + "epoch": 1.057269005722614, + "grad_norm": 1.6875, + "learning_rate": 1.446346855293981e-05, + "loss": 0.9423, + "step": 6236 + }, + { + "epoch": 1.0574404698116038, + "grad_norm": 1.5078125, + "learning_rate": 1.4461852427210711e-05, + "loss": 0.7811, + "step": 6237 + }, + { + "epoch": 1.0576119339005936, + "grad_norm": 1.6953125, + "learning_rate": 1.4460236155963803e-05, + "loss": 0.8399, + "step": 6238 + }, + { + "epoch": 1.0577833979895837, + "grad_norm": 1.6953125, + "learning_rate": 1.4458619739251795e-05, + "loss": 0.8339, + "step": 6239 + }, + { + "epoch": 1.0579548620785735, + "grad_norm": 1.59375, + "learning_rate": 1.4457003177127414e-05, + "loss": 0.8646, + "step": 6240 + }, + { + "epoch": 1.0581263261675633, + "grad_norm": 1.609375, + "learning_rate": 1.4455386469643371e-05, + "loss": 0.7481, + "step": 6241 + }, + { + "epoch": 1.0582977902565531, + "grad_norm": 1.6640625, + "learning_rate": 1.44537696168524e-05, + "loss": 0.8451, + "step": 6242 + }, + { + "epoch": 1.058469254345543, + "grad_norm": 1.609375, + "learning_rate": 1.4452152618807228e-05, + "loss": 0.8559, + "step": 6243 + }, + { + "epoch": 1.0586407184345328, + "grad_norm": 1.6171875, + "learning_rate": 1.4450535475560594e-05, + "loss": 0.8876, + "step": 6244 + }, + { + "epoch": 1.0588121825235228, + "grad_norm": 1.6171875, + "learning_rate": 1.4448918187165242e-05, + "loss": 0.8205, + "step": 6245 + }, + { + "epoch": 1.0589836466125127, + "grad_norm": 1.7109375, + "learning_rate": 1.4447300753673912e-05, + "loss": 0.8484, + "step": 6246 + }, + { + "epoch": 1.0591551107015025, + "grad_norm": 1.6015625, + "learning_rate": 1.4445683175139357e-05, + "loss": 0.8203, + "step": 6247 + }, + { + "epoch": 1.0593265747904923, + "grad_norm": 1.609375, + "learning_rate": 1.4444065451614336e-05, + "loss": 0.8436, + "step": 6248 + }, + { + "epoch": 1.0594980388794821, + "grad_norm": 1.65625, + "learning_rate": 1.4442447583151604e-05, + "loss": 0.8511, + "step": 6249 + }, + { + "epoch": 1.059669502968472, + "grad_norm": 1.6328125, + "learning_rate": 1.4440829569803927e-05, + "loss": 0.8101, + "step": 6250 + }, + { + "epoch": 1.059840967057462, + "grad_norm": 1.6328125, + "learning_rate": 
1.4439211411624074e-05, + "loss": 0.8623, + "step": 6251 + }, + { + "epoch": 1.0600124311464518, + "grad_norm": 1.7421875, + "learning_rate": 1.4437593108664825e-05, + "loss": 0.8703, + "step": 6252 + }, + { + "epoch": 1.0601838952354417, + "grad_norm": 1.6328125, + "learning_rate": 1.4435974660978951e-05, + "loss": 0.8501, + "step": 6253 + }, + { + "epoch": 1.0603553593244315, + "grad_norm": 1.71875, + "learning_rate": 1.443435606861924e-05, + "loss": 0.9815, + "step": 6254 + }, + { + "epoch": 1.0605268234134213, + "grad_norm": 1.6015625, + "learning_rate": 1.4432737331638477e-05, + "loss": 0.8542, + "step": 6255 + }, + { + "epoch": 1.0606982875024111, + "grad_norm": 1.6171875, + "learning_rate": 1.4431118450089458e-05, + "loss": 0.8357, + "step": 6256 + }, + { + "epoch": 1.0608697515914012, + "grad_norm": 1.5546875, + "learning_rate": 1.4429499424024982e-05, + "loss": 0.8416, + "step": 6257 + }, + { + "epoch": 1.061041215680391, + "grad_norm": 1.625, + "learning_rate": 1.442788025349785e-05, + "loss": 0.8575, + "step": 6258 + }, + { + "epoch": 1.0612126797693808, + "grad_norm": 1.59375, + "learning_rate": 1.4426260938560868e-05, + "loss": 0.8339, + "step": 6259 + }, + { + "epoch": 1.0613841438583707, + "grad_norm": 1.75, + "learning_rate": 1.442464147926685e-05, + "loss": 0.8274, + "step": 6260 + }, + { + "epoch": 1.0615556079473605, + "grad_norm": 1.578125, + "learning_rate": 1.442302187566861e-05, + "loss": 0.796, + "step": 6261 + }, + { + "epoch": 1.0617270720363503, + "grad_norm": 1.6640625, + "learning_rate": 1.4421402127818974e-05, + "loss": 0.8236, + "step": 6262 + }, + { + "epoch": 1.0618985361253404, + "grad_norm": 1.6875, + "learning_rate": 1.4419782235770763e-05, + "loss": 0.8753, + "step": 6263 + }, + { + "epoch": 1.0620700002143302, + "grad_norm": 1.6953125, + "learning_rate": 1.4418162199576808e-05, + "loss": 0.9048, + "step": 6264 + }, + { + "epoch": 1.06224146430332, + "grad_norm": 1.703125, + "learning_rate": 1.441654201928995e-05, + "loss": 0.8329, + "step": 6265 + }, + { + "epoch": 1.0624129283923098, + "grad_norm": 1.6875, + "learning_rate": 1.4414921694963024e-05, + "loss": 0.8477, + "step": 6266 + }, + { + "epoch": 1.0625843924812997, + "grad_norm": 1.6171875, + "learning_rate": 1.4413301226648877e-05, + "loss": 0.8548, + "step": 6267 + }, + { + "epoch": 1.0627558565702895, + "grad_norm": 1.75, + "learning_rate": 1.4411680614400356e-05, + "loss": 0.861, + "step": 6268 + }, + { + "epoch": 1.0629273206592793, + "grad_norm": 1.625, + "learning_rate": 1.4410059858270322e-05, + "loss": 0.7642, + "step": 6269 + }, + { + "epoch": 1.0630987847482694, + "grad_norm": 1.53125, + "learning_rate": 1.4408438958311626e-05, + "loss": 0.7788, + "step": 6270 + }, + { + "epoch": 1.0632702488372592, + "grad_norm": 1.578125, + "learning_rate": 1.4406817914577135e-05, + "loss": 0.799, + "step": 6271 + }, + { + "epoch": 1.063441712926249, + "grad_norm": 1.6640625, + "learning_rate": 1.4405196727119717e-05, + "loss": 0.8563, + "step": 6272 + }, + { + "epoch": 1.0636131770152388, + "grad_norm": 1.578125, + "learning_rate": 1.4403575395992247e-05, + "loss": 0.7594, + "step": 6273 + }, + { + "epoch": 1.0637846411042287, + "grad_norm": 1.671875, + "learning_rate": 1.44019539212476e-05, + "loss": 0.8161, + "step": 6274 + }, + { + "epoch": 1.0639561051932187, + "grad_norm": 1.6640625, + "learning_rate": 1.4400332302938658e-05, + "loss": 0.8292, + "step": 6275 + }, + { + "epoch": 1.0641275692822085, + "grad_norm": 1.7109375, + "learning_rate": 1.439871054111831e-05, + "loss": 0.808, + "step": 
6276 + }, + { + "epoch": 1.0642990333711984, + "grad_norm": 1.6171875, + "learning_rate": 1.439708863583945e-05, + "loss": 0.8312, + "step": 6277 + }, + { + "epoch": 1.0644704974601882, + "grad_norm": 1.6484375, + "learning_rate": 1.4395466587154969e-05, + "loss": 0.8997, + "step": 6278 + }, + { + "epoch": 1.064641961549178, + "grad_norm": 1.609375, + "learning_rate": 1.4393844395117771e-05, + "loss": 0.7794, + "step": 6279 + }, + { + "epoch": 1.0648134256381678, + "grad_norm": 1.6328125, + "learning_rate": 1.439222205978076e-05, + "loss": 0.9294, + "step": 6280 + }, + { + "epoch": 1.0649848897271577, + "grad_norm": 1.5625, + "learning_rate": 1.4390599581196854e-05, + "loss": 0.8536, + "step": 6281 + }, + { + "epoch": 1.0651563538161477, + "grad_norm": 1.7265625, + "learning_rate": 1.4388976959418956e-05, + "loss": 0.8836, + "step": 6282 + }, + { + "epoch": 1.0653278179051375, + "grad_norm": 1.6953125, + "learning_rate": 1.4387354194499994e-05, + "loss": 0.8734, + "step": 6283 + }, + { + "epoch": 1.0654992819941274, + "grad_norm": 1.5625, + "learning_rate": 1.438573128649289e-05, + "loss": 0.7896, + "step": 6284 + }, + { + "epoch": 1.0656707460831172, + "grad_norm": 1.734375, + "learning_rate": 1.4384108235450574e-05, + "loss": 0.9422, + "step": 6285 + }, + { + "epoch": 1.065842210172107, + "grad_norm": 1.6015625, + "learning_rate": 1.438248504142598e-05, + "loss": 0.8169, + "step": 6286 + }, + { + "epoch": 1.066013674261097, + "grad_norm": 1.7421875, + "learning_rate": 1.4380861704472047e-05, + "loss": 0.8534, + "step": 6287 + }, + { + "epoch": 1.0661851383500869, + "grad_norm": 1.6171875, + "learning_rate": 1.4379238224641716e-05, + "loss": 0.8926, + "step": 6288 + }, + { + "epoch": 1.0663566024390767, + "grad_norm": 1.5546875, + "learning_rate": 1.4377614601987934e-05, + "loss": 0.8646, + "step": 6289 + }, + { + "epoch": 1.0665280665280665, + "grad_norm": 1.546875, + "learning_rate": 1.4375990836563658e-05, + "loss": 0.8611, + "step": 6290 + }, + { + "epoch": 1.0666995306170564, + "grad_norm": 1.6875, + "learning_rate": 1.437436692842184e-05, + "loss": 0.7749, + "step": 6291 + }, + { + "epoch": 1.0668709947060462, + "grad_norm": 1.671875, + "learning_rate": 1.4372742877615447e-05, + "loss": 0.8207, + "step": 6292 + }, + { + "epoch": 1.067042458795036, + "grad_norm": 1.7265625, + "learning_rate": 1.437111868419744e-05, + "loss": 0.9227, + "step": 6293 + }, + { + "epoch": 1.067213922884026, + "grad_norm": 1.6015625, + "learning_rate": 1.4369494348220791e-05, + "loss": 0.8164, + "step": 6294 + }, + { + "epoch": 1.0673853869730159, + "grad_norm": 1.578125, + "learning_rate": 1.4367869869738482e-05, + "loss": 0.8078, + "step": 6295 + }, + { + "epoch": 1.0675568510620057, + "grad_norm": 1.703125, + "learning_rate": 1.4366245248803485e-05, + "loss": 0.8588, + "step": 6296 + }, + { + "epoch": 1.0677283151509955, + "grad_norm": 1.734375, + "learning_rate": 1.436462048546879e-05, + "loss": 0.9065, + "step": 6297 + }, + { + "epoch": 1.0678997792399854, + "grad_norm": 1.734375, + "learning_rate": 1.4362995579787389e-05, + "loss": 0.8279, + "step": 6298 + }, + { + "epoch": 1.0680712433289752, + "grad_norm": 1.6328125, + "learning_rate": 1.4361370531812266e-05, + "loss": 0.8336, + "step": 6299 + }, + { + "epoch": 1.0682427074179652, + "grad_norm": 1.7421875, + "learning_rate": 1.4359745341596431e-05, + "loss": 0.9098, + "step": 6300 + }, + { + "epoch": 1.0682427074179652, + "eval_loss": 0.8528460264205933, + "eval_runtime": 837.1005, + "eval_samples_per_second": 2.985, + "eval_steps_per_second": 
2.985, + "step": 6300 + }, + { + "epoch": 1.068414171506955, + "grad_norm": 1.6640625, + "learning_rate": 1.4358120009192881e-05, + "loss": 0.8324, + "step": 6301 + }, + { + "epoch": 1.0685856355959449, + "grad_norm": 1.7109375, + "learning_rate": 1.4356494534654627e-05, + "loss": 0.9125, + "step": 6302 + }, + { + "epoch": 1.0687570996849347, + "grad_norm": 1.921875, + "learning_rate": 1.4354868918034679e-05, + "loss": 0.7998, + "step": 6303 + }, + { + "epoch": 1.0689285637739245, + "grad_norm": 1.7265625, + "learning_rate": 1.435324315938606e-05, + "loss": 0.9459, + "step": 6304 + }, + { + "epoch": 1.0691000278629144, + "grad_norm": 1.6953125, + "learning_rate": 1.4351617258761787e-05, + "loss": 0.874, + "step": 6305 + }, + { + "epoch": 1.0692714919519044, + "grad_norm": 1.65625, + "learning_rate": 1.434999121621489e-05, + "loss": 0.8602, + "step": 6306 + }, + { + "epoch": 1.0694429560408942, + "grad_norm": 1.6796875, + "learning_rate": 1.4348365031798398e-05, + "loss": 0.881, + "step": 6307 + }, + { + "epoch": 1.069614420129884, + "grad_norm": 1.734375, + "learning_rate": 1.4346738705565348e-05, + "loss": 0.8294, + "step": 6308 + }, + { + "epoch": 1.0697858842188739, + "grad_norm": 1.7265625, + "learning_rate": 1.4345112237568781e-05, + "loss": 0.8579, + "step": 6309 + }, + { + "epoch": 1.0699573483078637, + "grad_norm": 1.6171875, + "learning_rate": 1.4343485627861742e-05, + "loss": 0.8984, + "step": 6310 + }, + { + "epoch": 1.0701288123968535, + "grad_norm": 1.640625, + "learning_rate": 1.4341858876497279e-05, + "loss": 0.7913, + "step": 6311 + }, + { + "epoch": 1.0703002764858436, + "grad_norm": 1.703125, + "learning_rate": 1.4340231983528448e-05, + "loss": 0.8667, + "step": 6312 + }, + { + "epoch": 1.0704717405748334, + "grad_norm": 1.65625, + "learning_rate": 1.433860494900831e-05, + "loss": 0.8287, + "step": 6313 + }, + { + "epoch": 1.0706432046638232, + "grad_norm": 1.6484375, + "learning_rate": 1.4336977772989924e-05, + "loss": 0.8595, + "step": 6314 + }, + { + "epoch": 1.070814668752813, + "grad_norm": 1.640625, + "learning_rate": 1.4335350455526367e-05, + "loss": 0.8163, + "step": 6315 + }, + { + "epoch": 1.0709861328418029, + "grad_norm": 1.734375, + "learning_rate": 1.4333722996670702e-05, + "loss": 0.8464, + "step": 6316 + }, + { + "epoch": 1.0711575969307927, + "grad_norm": 1.7265625, + "learning_rate": 1.4332095396476012e-05, + "loss": 0.8942, + "step": 6317 + }, + { + "epoch": 1.0713290610197828, + "grad_norm": 1.640625, + "learning_rate": 1.4330467654995376e-05, + "loss": 0.8218, + "step": 6318 + }, + { + "epoch": 1.0715005251087726, + "grad_norm": 1.6953125, + "learning_rate": 1.4328839772281884e-05, + "loss": 0.8912, + "step": 6319 + }, + { + "epoch": 1.0716719891977624, + "grad_norm": 1.6328125, + "learning_rate": 1.4327211748388626e-05, + "loss": 0.8496, + "step": 6320 + }, + { + "epoch": 1.0718434532867522, + "grad_norm": 1.7734375, + "learning_rate": 1.4325583583368698e-05, + "loss": 0.9215, + "step": 6321 + }, + { + "epoch": 1.072014917375742, + "grad_norm": 1.59375, + "learning_rate": 1.4323955277275201e-05, + "loss": 0.824, + "step": 6322 + }, + { + "epoch": 1.0721863814647319, + "grad_norm": 1.6015625, + "learning_rate": 1.432232683016124e-05, + "loss": 0.903, + "step": 6323 + }, + { + "epoch": 1.072357845553722, + "grad_norm": 1.8046875, + "learning_rate": 1.4320698242079925e-05, + "loss": 0.8151, + "step": 6324 + }, + { + "epoch": 1.0725293096427118, + "grad_norm": 1.5234375, + "learning_rate": 1.431906951308437e-05, + "loss": 0.7505, + "step": 6325 + }, + { 
+ "epoch": 1.0727007737317016, + "grad_norm": 1.59375, + "learning_rate": 1.4317440643227693e-05, + "loss": 0.84, + "step": 6326 + }, + { + "epoch": 1.0728722378206914, + "grad_norm": 1.6640625, + "learning_rate": 1.4315811632563022e-05, + "loss": 0.8941, + "step": 6327 + }, + { + "epoch": 1.0730437019096812, + "grad_norm": 1.6171875, + "learning_rate": 1.4314182481143478e-05, + "loss": 0.9537, + "step": 6328 + }, + { + "epoch": 1.073215165998671, + "grad_norm": 1.5546875, + "learning_rate": 1.4312553189022201e-05, + "loss": 0.8015, + "step": 6329 + }, + { + "epoch": 1.073386630087661, + "grad_norm": 1.7109375, + "learning_rate": 1.431092375625232e-05, + "loss": 0.8483, + "step": 6330 + }, + { + "epoch": 1.073558094176651, + "grad_norm": 1.7265625, + "learning_rate": 1.4309294182886984e-05, + "loss": 0.874, + "step": 6331 + }, + { + "epoch": 1.0737295582656408, + "grad_norm": 1.6171875, + "learning_rate": 1.4307664468979336e-05, + "loss": 0.9018, + "step": 6332 + }, + { + "epoch": 1.0739010223546306, + "grad_norm": 1.5625, + "learning_rate": 1.4306034614582534e-05, + "loss": 0.7619, + "step": 6333 + }, + { + "epoch": 1.0740724864436204, + "grad_norm": 1.6484375, + "learning_rate": 1.4304404619749724e-05, + "loss": 0.7982, + "step": 6334 + }, + { + "epoch": 1.0742439505326102, + "grad_norm": 1.578125, + "learning_rate": 1.4302774484534073e-05, + "loss": 0.8727, + "step": 6335 + }, + { + "epoch": 1.0744154146216003, + "grad_norm": 1.8125, + "learning_rate": 1.430114420898874e-05, + "loss": 0.8913, + "step": 6336 + }, + { + "epoch": 1.07458687871059, + "grad_norm": 1.6171875, + "learning_rate": 1.4299513793166896e-05, + "loss": 0.9844, + "step": 6337 + }, + { + "epoch": 1.07475834279958, + "grad_norm": 1.671875, + "learning_rate": 1.4297883237121721e-05, + "loss": 0.8231, + "step": 6338 + }, + { + "epoch": 1.0749298068885698, + "grad_norm": 1.640625, + "learning_rate": 1.429625254090639e-05, + "loss": 0.878, + "step": 6339 + }, + { + "epoch": 1.0751012709775596, + "grad_norm": 1.703125, + "learning_rate": 1.4294621704574084e-05, + "loss": 0.8635, + "step": 6340 + }, + { + "epoch": 1.0752727350665494, + "grad_norm": 1.734375, + "learning_rate": 1.4292990728177992e-05, + "loss": 0.8971, + "step": 6341 + }, + { + "epoch": 1.0754441991555395, + "grad_norm": 1.59375, + "learning_rate": 1.4291359611771304e-05, + "loss": 0.8481, + "step": 6342 + }, + { + "epoch": 1.0756156632445293, + "grad_norm": 1.6953125, + "learning_rate": 1.4289728355407221e-05, + "loss": 0.8761, + "step": 6343 + }, + { + "epoch": 1.075787127333519, + "grad_norm": 1.5078125, + "learning_rate": 1.4288096959138946e-05, + "loss": 0.7794, + "step": 6344 + }, + { + "epoch": 1.075958591422509, + "grad_norm": 1.78125, + "learning_rate": 1.4286465423019679e-05, + "loss": 0.9239, + "step": 6345 + }, + { + "epoch": 1.0761300555114988, + "grad_norm": 1.6484375, + "learning_rate": 1.4284833747102634e-05, + "loss": 0.8863, + "step": 6346 + }, + { + "epoch": 1.0763015196004886, + "grad_norm": 1.6953125, + "learning_rate": 1.4283201931441024e-05, + "loss": 0.9214, + "step": 6347 + }, + { + "epoch": 1.0764729836894786, + "grad_norm": 1.671875, + "learning_rate": 1.428156997608807e-05, + "loss": 0.8686, + "step": 6348 + }, + { + "epoch": 1.0766444477784685, + "grad_norm": 1.703125, + "learning_rate": 1.4279937881096997e-05, + "loss": 0.8189, + "step": 6349 + }, + { + "epoch": 1.0768159118674583, + "grad_norm": 1.65625, + "learning_rate": 1.4278305646521032e-05, + "loss": 0.8565, + "step": 6350 + }, + { + "epoch": 1.0769873759564481, + 
"grad_norm": 1.6015625, + "learning_rate": 1.427667327241341e-05, + "loss": 0.7881, + "step": 6351 + }, + { + "epoch": 1.077158840045438, + "grad_norm": 1.5390625, + "learning_rate": 1.4275040758827367e-05, + "loss": 0.8265, + "step": 6352 + }, + { + "epoch": 1.0773303041344278, + "grad_norm": 1.71875, + "learning_rate": 1.4273408105816148e-05, + "loss": 0.8923, + "step": 6353 + }, + { + "epoch": 1.0775017682234178, + "grad_norm": 1.6484375, + "learning_rate": 1.4271775313432998e-05, + "loss": 0.8301, + "step": 6354 + }, + { + "epoch": 1.0776732323124076, + "grad_norm": 1.6640625, + "learning_rate": 1.4270142381731168e-05, + "loss": 0.7962, + "step": 6355 + }, + { + "epoch": 1.0778446964013975, + "grad_norm": 1.65625, + "learning_rate": 1.426850931076392e-05, + "loss": 0.856, + "step": 6356 + }, + { + "epoch": 1.0780161604903873, + "grad_norm": 1.7109375, + "learning_rate": 1.4266876100584505e-05, + "loss": 0.9114, + "step": 6357 + }, + { + "epoch": 1.0781876245793771, + "grad_norm": 1.6953125, + "learning_rate": 1.4265242751246191e-05, + "loss": 0.8902, + "step": 6358 + }, + { + "epoch": 1.078359088668367, + "grad_norm": 1.6953125, + "learning_rate": 1.4263609262802255e-05, + "loss": 0.9172, + "step": 6359 + }, + { + "epoch": 1.078530552757357, + "grad_norm": 1.6484375, + "learning_rate": 1.426197563530596e-05, + "loss": 0.9066, + "step": 6360 + }, + { + "epoch": 1.0787020168463468, + "grad_norm": 1.6171875, + "learning_rate": 1.4260341868810591e-05, + "loss": 0.7365, + "step": 6361 + }, + { + "epoch": 1.0788734809353366, + "grad_norm": 1.7109375, + "learning_rate": 1.4258707963369438e-05, + "loss": 0.8886, + "step": 6362 + }, + { + "epoch": 1.0790449450243265, + "grad_norm": 1.640625, + "learning_rate": 1.4257073919035775e-05, + "loss": 0.8961, + "step": 6363 + }, + { + "epoch": 1.0792164091133163, + "grad_norm": 1.609375, + "learning_rate": 1.4255439735862901e-05, + "loss": 0.782, + "step": 6364 + }, + { + "epoch": 1.0793878732023061, + "grad_norm": 1.671875, + "learning_rate": 1.4253805413904114e-05, + "loss": 0.8904, + "step": 6365 + }, + { + "epoch": 1.079559337291296, + "grad_norm": 1.75, + "learning_rate": 1.4252170953212713e-05, + "loss": 0.8601, + "step": 6366 + }, + { + "epoch": 1.079730801380286, + "grad_norm": 1.78125, + "learning_rate": 1.4250536353842009e-05, + "loss": 0.8834, + "step": 6367 + }, + { + "epoch": 1.0799022654692758, + "grad_norm": 1.625, + "learning_rate": 1.4248901615845304e-05, + "loss": 0.8273, + "step": 6368 + }, + { + "epoch": 1.0800737295582656, + "grad_norm": 1.5859375, + "learning_rate": 1.4247266739275918e-05, + "loss": 0.8262, + "step": 6369 + }, + { + "epoch": 1.0802451936472555, + "grad_norm": 1.6328125, + "learning_rate": 1.4245631724187172e-05, + "loss": 0.8552, + "step": 6370 + }, + { + "epoch": 1.0804166577362453, + "grad_norm": 1.578125, + "learning_rate": 1.4243996570632385e-05, + "loss": 0.7704, + "step": 6371 + }, + { + "epoch": 1.0805881218252353, + "grad_norm": 1.59375, + "learning_rate": 1.4242361278664891e-05, + "loss": 0.8284, + "step": 6372 + }, + { + "epoch": 1.0807595859142252, + "grad_norm": 1.765625, + "learning_rate": 1.4240725848338023e-05, + "loss": 0.8723, + "step": 6373 + }, + { + "epoch": 1.080931050003215, + "grad_norm": 1.7265625, + "learning_rate": 1.423909027970511e-05, + "loss": 0.8555, + "step": 6374 + }, + { + "epoch": 1.0811025140922048, + "grad_norm": 1.6796875, + "learning_rate": 1.4237454572819505e-05, + "loss": 0.9163, + "step": 6375 + }, + { + "epoch": 1.0812739781811946, + "grad_norm": 1.6953125, + 
"learning_rate": 1.4235818727734544e-05, + "loss": 0.8108, + "step": 6376 + }, + { + "epoch": 1.0814454422701845, + "grad_norm": 1.6015625, + "learning_rate": 1.4234182744503586e-05, + "loss": 0.8376, + "step": 6377 + }, + { + "epoch": 1.0816169063591743, + "grad_norm": 1.640625, + "learning_rate": 1.4232546623179985e-05, + "loss": 0.7456, + "step": 6378 + }, + { + "epoch": 1.0817883704481643, + "grad_norm": 1.7265625, + "learning_rate": 1.4230910363817104e-05, + "loss": 0.8937, + "step": 6379 + }, + { + "epoch": 1.0819598345371542, + "grad_norm": 1.6484375, + "learning_rate": 1.4229273966468298e-05, + "loss": 0.8592, + "step": 6380 + }, + { + "epoch": 1.082131298626144, + "grad_norm": 1.6171875, + "learning_rate": 1.4227637431186945e-05, + "loss": 0.8733, + "step": 6381 + }, + { + "epoch": 1.0823027627151338, + "grad_norm": 1.7265625, + "learning_rate": 1.4226000758026414e-05, + "loss": 0.9073, + "step": 6382 + }, + { + "epoch": 1.0824742268041236, + "grad_norm": 1.59375, + "learning_rate": 1.4224363947040085e-05, + "loss": 0.7889, + "step": 6383 + }, + { + "epoch": 1.0826456908931137, + "grad_norm": 1.71875, + "learning_rate": 1.4222726998281342e-05, + "loss": 0.8612, + "step": 6384 + }, + { + "epoch": 1.0828171549821035, + "grad_norm": 1.6796875, + "learning_rate": 1.422108991180357e-05, + "loss": 0.8388, + "step": 6385 + }, + { + "epoch": 1.0829886190710933, + "grad_norm": 1.6875, + "learning_rate": 1.4219452687660158e-05, + "loss": 0.8867, + "step": 6386 + }, + { + "epoch": 1.0831600831600832, + "grad_norm": 1.640625, + "learning_rate": 1.4217815325904508e-05, + "loss": 0.8568, + "step": 6387 + }, + { + "epoch": 1.083331547249073, + "grad_norm": 1.71875, + "learning_rate": 1.4216177826590017e-05, + "loss": 0.9192, + "step": 6388 + }, + { + "epoch": 1.0835030113380628, + "grad_norm": 1.6875, + "learning_rate": 1.4214540189770087e-05, + "loss": 0.9198, + "step": 6389 + }, + { + "epoch": 1.0836744754270526, + "grad_norm": 1.6015625, + "learning_rate": 1.4212902415498136e-05, + "loss": 0.8886, + "step": 6390 + }, + { + "epoch": 1.0838459395160427, + "grad_norm": 1.640625, + "learning_rate": 1.4211264503827571e-05, + "loss": 0.8625, + "step": 6391 + }, + { + "epoch": 1.0840174036050325, + "grad_norm": 1.6484375, + "learning_rate": 1.4209626454811812e-05, + "loss": 0.7927, + "step": 6392 + }, + { + "epoch": 1.0841888676940223, + "grad_norm": 1.6640625, + "learning_rate": 1.420798826850428e-05, + "loss": 0.8718, + "step": 6393 + }, + { + "epoch": 1.0843603317830122, + "grad_norm": 1.75, + "learning_rate": 1.4206349944958407e-05, + "loss": 0.9271, + "step": 6394 + }, + { + "epoch": 1.084531795872002, + "grad_norm": 1.7265625, + "learning_rate": 1.4204711484227623e-05, + "loss": 0.8981, + "step": 6395 + }, + { + "epoch": 1.0847032599609918, + "grad_norm": 1.7109375, + "learning_rate": 1.4203072886365364e-05, + "loss": 0.8554, + "step": 6396 + }, + { + "epoch": 1.0848747240499819, + "grad_norm": 1.7265625, + "learning_rate": 1.4201434151425072e-05, + "loss": 0.9236, + "step": 6397 + }, + { + "epoch": 1.0850461881389717, + "grad_norm": 1.6953125, + "learning_rate": 1.4199795279460187e-05, + "loss": 0.8479, + "step": 6398 + }, + { + "epoch": 1.0852176522279615, + "grad_norm": 1.671875, + "learning_rate": 1.4198156270524167e-05, + "loss": 0.8149, + "step": 6399 + }, + { + "epoch": 1.0853891163169513, + "grad_norm": 1.65625, + "learning_rate": 1.419651712467046e-05, + "loss": 0.8815, + "step": 6400 + }, + { + "epoch": 1.0855605804059412, + "grad_norm": 1.7109375, + "learning_rate": 
1.4194877841952526e-05, + "loss": 0.9174, + "step": 6401 + }, + { + "epoch": 1.085732044494931, + "grad_norm": 1.609375, + "learning_rate": 1.4193238422423834e-05, + "loss": 0.8742, + "step": 6402 + }, + { + "epoch": 1.085903508583921, + "grad_norm": 1.765625, + "learning_rate": 1.4191598866137844e-05, + "loss": 0.902, + "step": 6403 + }, + { + "epoch": 1.0860749726729109, + "grad_norm": 1.7265625, + "learning_rate": 1.418995917314803e-05, + "loss": 0.9654, + "step": 6404 + }, + { + "epoch": 1.0862464367619007, + "grad_norm": 1.75, + "learning_rate": 1.4188319343507865e-05, + "loss": 0.8984, + "step": 6405 + }, + { + "epoch": 1.0864179008508905, + "grad_norm": 1.6640625, + "learning_rate": 1.4186679377270841e-05, + "loss": 0.9152, + "step": 6406 + }, + { + "epoch": 1.0865893649398803, + "grad_norm": 1.59375, + "learning_rate": 1.4185039274490436e-05, + "loss": 0.8611, + "step": 6407 + }, + { + "epoch": 1.0867608290288702, + "grad_norm": 1.671875, + "learning_rate": 1.4183399035220143e-05, + "loss": 0.8521, + "step": 6408 + }, + { + "epoch": 1.0869322931178602, + "grad_norm": 1.765625, + "learning_rate": 1.418175865951345e-05, + "loss": 0.8818, + "step": 6409 + }, + { + "epoch": 1.08710375720685, + "grad_norm": 1.6015625, + "learning_rate": 1.4180118147423861e-05, + "loss": 0.8461, + "step": 6410 + }, + { + "epoch": 1.0872752212958399, + "grad_norm": 1.7109375, + "learning_rate": 1.4178477499004879e-05, + "loss": 0.7814, + "step": 6411 + }, + { + "epoch": 1.0874466853848297, + "grad_norm": 1.703125, + "learning_rate": 1.417683671431001e-05, + "loss": 0.9004, + "step": 6412 + }, + { + "epoch": 1.0876181494738195, + "grad_norm": 1.6484375, + "learning_rate": 1.4175195793392769e-05, + "loss": 0.851, + "step": 6413 + }, + { + "epoch": 1.0877896135628093, + "grad_norm": 1.65625, + "learning_rate": 1.417355473630667e-05, + "loss": 0.825, + "step": 6414 + }, + { + "epoch": 1.0879610776517994, + "grad_norm": 1.7109375, + "learning_rate": 1.4171913543105233e-05, + "loss": 0.932, + "step": 6415 + }, + { + "epoch": 1.0881325417407892, + "grad_norm": 1.7734375, + "learning_rate": 1.4170272213841988e-05, + "loss": 0.8789, + "step": 6416 + }, + { + "epoch": 1.088304005829779, + "grad_norm": 1.625, + "learning_rate": 1.4168630748570462e-05, + "loss": 0.8809, + "step": 6417 + }, + { + "epoch": 1.0884754699187689, + "grad_norm": 1.6875, + "learning_rate": 1.4166989147344188e-05, + "loss": 0.8888, + "step": 6418 + }, + { + "epoch": 1.0886469340077587, + "grad_norm": 1.6015625, + "learning_rate": 1.416534741021671e-05, + "loss": 0.8732, + "step": 6419 + }, + { + "epoch": 1.0888183980967485, + "grad_norm": 1.5703125, + "learning_rate": 1.4163705537241565e-05, + "loss": 0.7691, + "step": 6420 + }, + { + "epoch": 1.0889898621857386, + "grad_norm": 1.6328125, + "learning_rate": 1.4162063528472302e-05, + "loss": 0.864, + "step": 6421 + }, + { + "epoch": 1.0891613262747284, + "grad_norm": 1.6328125, + "learning_rate": 1.4160421383962478e-05, + "loss": 0.793, + "step": 6422 + }, + { + "epoch": 1.0893327903637182, + "grad_norm": 1.6015625, + "learning_rate": 1.4158779103765642e-05, + "loss": 0.8254, + "step": 6423 + }, + { + "epoch": 1.089504254452708, + "grad_norm": 1.7578125, + "learning_rate": 1.415713668793536e-05, + "loss": 0.8618, + "step": 6424 + }, + { + "epoch": 1.0896757185416979, + "grad_norm": 1.640625, + "learning_rate": 1.4155494136525198e-05, + "loss": 0.8551, + "step": 6425 + }, + { + "epoch": 1.0898471826306877, + "grad_norm": 1.7109375, + "learning_rate": 1.4153851449588725e-05, + "loss": 0.9366, + 
"step": 6426 + }, + { + "epoch": 1.0900186467196777, + "grad_norm": 1.640625, + "learning_rate": 1.4152208627179513e-05, + "loss": 0.8696, + "step": 6427 + }, + { + "epoch": 1.0901901108086676, + "grad_norm": 1.6015625, + "learning_rate": 1.4150565669351141e-05, + "loss": 0.889, + "step": 6428 + }, + { + "epoch": 1.0903615748976574, + "grad_norm": 1.7265625, + "learning_rate": 1.4148922576157194e-05, + "loss": 0.9755, + "step": 6429 + }, + { + "epoch": 1.0905330389866472, + "grad_norm": 1.703125, + "learning_rate": 1.4147279347651256e-05, + "loss": 0.9041, + "step": 6430 + }, + { + "epoch": 1.090704503075637, + "grad_norm": 1.546875, + "learning_rate": 1.4145635983886927e-05, + "loss": 0.7879, + "step": 6431 + }, + { + "epoch": 1.0908759671646269, + "grad_norm": 1.65625, + "learning_rate": 1.4143992484917792e-05, + "loss": 0.7683, + "step": 6432 + }, + { + "epoch": 1.091047431253617, + "grad_norm": 1.609375, + "learning_rate": 1.4142348850797458e-05, + "loss": 0.7695, + "step": 6433 + }, + { + "epoch": 1.0912188953426067, + "grad_norm": 1.6171875, + "learning_rate": 1.414070508157953e-05, + "loss": 0.8802, + "step": 6434 + }, + { + "epoch": 1.0913903594315966, + "grad_norm": 1.671875, + "learning_rate": 1.4139061177317616e-05, + "loss": 0.9043, + "step": 6435 + }, + { + "epoch": 1.0915618235205864, + "grad_norm": 1.6796875, + "learning_rate": 1.4137417138065333e-05, + "loss": 0.8847, + "step": 6436 + }, + { + "epoch": 1.0917332876095762, + "grad_norm": 1.640625, + "learning_rate": 1.4135772963876297e-05, + "loss": 0.8592, + "step": 6437 + }, + { + "epoch": 1.091904751698566, + "grad_norm": 1.625, + "learning_rate": 1.4134128654804131e-05, + "loss": 0.7798, + "step": 6438 + }, + { + "epoch": 1.092076215787556, + "grad_norm": 1.65625, + "learning_rate": 1.413248421090246e-05, + "loss": 0.8814, + "step": 6439 + }, + { + "epoch": 1.092247679876546, + "grad_norm": 1.5625, + "learning_rate": 1.4130839632224918e-05, + "loss": 0.8163, + "step": 6440 + }, + { + "epoch": 1.0924191439655357, + "grad_norm": 1.65625, + "learning_rate": 1.412919491882514e-05, + "loss": 0.9243, + "step": 6441 + }, + { + "epoch": 1.0925906080545256, + "grad_norm": 1.6171875, + "learning_rate": 1.4127550070756768e-05, + "loss": 0.8714, + "step": 6442 + }, + { + "epoch": 1.0927620721435154, + "grad_norm": 1.640625, + "learning_rate": 1.4125905088073442e-05, + "loss": 0.9572, + "step": 6443 + }, + { + "epoch": 1.0929335362325052, + "grad_norm": 1.703125, + "learning_rate": 1.4124259970828817e-05, + "loss": 0.8775, + "step": 6444 + }, + { + "epoch": 1.0931050003214953, + "grad_norm": 1.703125, + "learning_rate": 1.4122614719076544e-05, + "loss": 0.9437, + "step": 6445 + }, + { + "epoch": 1.093276464410485, + "grad_norm": 1.6875, + "learning_rate": 1.412096933287028e-05, + "loss": 0.907, + "step": 6446 + }, + { + "epoch": 1.093447928499475, + "grad_norm": 1.5703125, + "learning_rate": 1.4119323812263688e-05, + "loss": 0.807, + "step": 6447 + }, + { + "epoch": 1.0936193925884647, + "grad_norm": 1.5625, + "learning_rate": 1.4117678157310436e-05, + "loss": 0.8288, + "step": 6448 + }, + { + "epoch": 1.0937908566774546, + "grad_norm": 1.6875, + "learning_rate": 1.4116032368064192e-05, + "loss": 0.8559, + "step": 6449 + }, + { + "epoch": 1.0939623207664444, + "grad_norm": 1.6484375, + "learning_rate": 1.4114386444578632e-05, + "loss": 0.8325, + "step": 6450 + }, + { + "epoch": 1.0941337848554344, + "grad_norm": 1.6875, + "learning_rate": 1.4112740386907438e-05, + "loss": 0.8723, + "step": 6451 + }, + { + "epoch": 1.0943052489444243, 
+ "grad_norm": 1.6640625, + "learning_rate": 1.4111094195104291e-05, + "loss": 0.8334, + "step": 6452 + }, + { + "epoch": 1.094476713033414, + "grad_norm": 1.7421875, + "learning_rate": 1.410944786922288e-05, + "loss": 1.0709, + "step": 6453 + }, + { + "epoch": 1.094648177122404, + "grad_norm": 1.84375, + "learning_rate": 1.4107801409316907e-05, + "loss": 0.907, + "step": 6454 + }, + { + "epoch": 1.0948196412113937, + "grad_norm": 1.65625, + "learning_rate": 1.4106154815440056e-05, + "loss": 0.8274, + "step": 6455 + }, + { + "epoch": 1.0949911053003836, + "grad_norm": 1.59375, + "learning_rate": 1.4104508087646033e-05, + "loss": 0.7924, + "step": 6456 + }, + { + "epoch": 1.0951625693893736, + "grad_norm": 1.578125, + "learning_rate": 1.410286122598855e-05, + "loss": 0.8505, + "step": 6457 + }, + { + "epoch": 1.0953340334783634, + "grad_norm": 1.6953125, + "learning_rate": 1.4101214230521307e-05, + "loss": 0.9114, + "step": 6458 + }, + { + "epoch": 1.0955054975673533, + "grad_norm": 1.734375, + "learning_rate": 1.4099567101298026e-05, + "loss": 0.8981, + "step": 6459 + }, + { + "epoch": 1.095676961656343, + "grad_norm": 1.7109375, + "learning_rate": 1.4097919838372425e-05, + "loss": 0.8765, + "step": 6460 + }, + { + "epoch": 1.095848425745333, + "grad_norm": 1.6953125, + "learning_rate": 1.4096272441798227e-05, + "loss": 0.9022, + "step": 6461 + }, + { + "epoch": 1.0960198898343227, + "grad_norm": 2.046875, + "learning_rate": 1.4094624911629158e-05, + "loss": 0.8844, + "step": 6462 + }, + { + "epoch": 1.0961913539233126, + "grad_norm": 1.546875, + "learning_rate": 1.4092977247918953e-05, + "loss": 0.7989, + "step": 6463 + }, + { + "epoch": 1.0963628180123026, + "grad_norm": 1.625, + "learning_rate": 1.4091329450721347e-05, + "loss": 0.8971, + "step": 6464 + }, + { + "epoch": 1.0965342821012924, + "grad_norm": 1.6875, + "learning_rate": 1.4089681520090084e-05, + "loss": 0.8169, + "step": 6465 + }, + { + "epoch": 1.0967057461902823, + "grad_norm": 1.6640625, + "learning_rate": 1.4088033456078902e-05, + "loss": 0.8382, + "step": 6466 + }, + { + "epoch": 1.096877210279272, + "grad_norm": 1.6640625, + "learning_rate": 1.4086385258741557e-05, + "loss": 0.8602, + "step": 6467 + }, + { + "epoch": 1.097048674368262, + "grad_norm": 1.6875, + "learning_rate": 1.40847369281318e-05, + "loss": 0.8289, + "step": 6468 + }, + { + "epoch": 1.097220138457252, + "grad_norm": 1.640625, + "learning_rate": 1.408308846430339e-05, + "loss": 0.868, + "step": 6469 + }, + { + "epoch": 1.0973916025462418, + "grad_norm": 1.609375, + "learning_rate": 1.408143986731009e-05, + "loss": 0.8289, + "step": 6470 + }, + { + "epoch": 1.0975630666352316, + "grad_norm": 1.6796875, + "learning_rate": 1.4079791137205665e-05, + "loss": 0.8554, + "step": 6471 + }, + { + "epoch": 1.0977345307242214, + "grad_norm": 1.703125, + "learning_rate": 1.407814227404389e-05, + "loss": 0.9221, + "step": 6472 + }, + { + "epoch": 1.0979059948132113, + "grad_norm": 1.7265625, + "learning_rate": 1.4076493277878537e-05, + "loss": 0.8511, + "step": 6473 + }, + { + "epoch": 1.098077458902201, + "grad_norm": 1.609375, + "learning_rate": 1.4074844148763388e-05, + "loss": 0.8883, + "step": 6474 + }, + { + "epoch": 1.098248922991191, + "grad_norm": 1.578125, + "learning_rate": 1.4073194886752228e-05, + "loss": 0.8379, + "step": 6475 + }, + { + "epoch": 1.098420387080181, + "grad_norm": 1.59375, + "learning_rate": 1.4071545491898843e-05, + "loss": 0.8347, + "step": 6476 + }, + { + "epoch": 1.0985918511691708, + "grad_norm": 1.59375, + "learning_rate": 
1.4069895964257032e-05, + "loss": 0.9084, + "step": 6477 + }, + { + "epoch": 1.0987633152581606, + "grad_norm": 1.6953125, + "learning_rate": 1.4068246303880584e-05, + "loss": 0.9504, + "step": 6478 + }, + { + "epoch": 1.0989347793471504, + "grad_norm": 1.6015625, + "learning_rate": 1.4066596510823303e-05, + "loss": 0.8512, + "step": 6479 + }, + { + "epoch": 1.0991062434361403, + "grad_norm": 1.5859375, + "learning_rate": 1.4064946585138997e-05, + "loss": 0.8665, + "step": 6480 + }, + { + "epoch": 1.0992777075251303, + "grad_norm": 1.71875, + "learning_rate": 1.4063296526881477e-05, + "loss": 0.8951, + "step": 6481 + }, + { + "epoch": 1.0994491716141201, + "grad_norm": 1.71875, + "learning_rate": 1.4061646336104556e-05, + "loss": 0.8214, + "step": 6482 + }, + { + "epoch": 1.09962063570311, + "grad_norm": 1.6484375, + "learning_rate": 1.4059996012862055e-05, + "loss": 0.9158, + "step": 6483 + }, + { + "epoch": 1.0997920997920998, + "grad_norm": 1.640625, + "learning_rate": 1.4058345557207794e-05, + "loss": 0.8622, + "step": 6484 + }, + { + "epoch": 1.0999635638810896, + "grad_norm": 1.6640625, + "learning_rate": 1.40566949691956e-05, + "loss": 0.8542, + "step": 6485 + }, + { + "epoch": 1.1001350279700794, + "grad_norm": 1.671875, + "learning_rate": 1.405504424887931e-05, + "loss": 0.9208, + "step": 6486 + }, + { + "epoch": 1.1003064920590693, + "grad_norm": 1.7421875, + "learning_rate": 1.4053393396312756e-05, + "loss": 0.8273, + "step": 6487 + }, + { + "epoch": 1.1004779561480593, + "grad_norm": 1.640625, + "learning_rate": 1.405174241154978e-05, + "loss": 0.8275, + "step": 6488 + }, + { + "epoch": 1.1006494202370491, + "grad_norm": 1.6640625, + "learning_rate": 1.4050091294644226e-05, + "loss": 0.8867, + "step": 6489 + }, + { + "epoch": 1.100820884326039, + "grad_norm": 1.671875, + "learning_rate": 1.4048440045649943e-05, + "loss": 0.8546, + "step": 6490 + }, + { + "epoch": 1.1009923484150288, + "grad_norm": 1.7890625, + "learning_rate": 1.4046788664620785e-05, + "loss": 0.8363, + "step": 6491 + }, + { + "epoch": 1.1011638125040186, + "grad_norm": 1.703125, + "learning_rate": 1.4045137151610614e-05, + "loss": 0.9141, + "step": 6492 + }, + { + "epoch": 1.1013352765930084, + "grad_norm": 1.78125, + "learning_rate": 1.4043485506673282e-05, + "loss": 0.9679, + "step": 6493 + }, + { + "epoch": 1.1015067406819985, + "grad_norm": 1.640625, + "learning_rate": 1.4041833729862666e-05, + "loss": 0.8664, + "step": 6494 + }, + { + "epoch": 1.1016782047709883, + "grad_norm": 1.6953125, + "learning_rate": 1.404018182123263e-05, + "loss": 0.8076, + "step": 6495 + }, + { + "epoch": 1.1018496688599781, + "grad_norm": 1.65625, + "learning_rate": 1.4038529780837049e-05, + "loss": 0.9172, + "step": 6496 + }, + { + "epoch": 1.102021132948968, + "grad_norm": 1.6328125, + "learning_rate": 1.4036877608729806e-05, + "loss": 0.8456, + "step": 6497 + }, + { + "epoch": 1.1021925970379578, + "grad_norm": 1.71875, + "learning_rate": 1.4035225304964781e-05, + "loss": 0.8858, + "step": 6498 + }, + { + "epoch": 1.1023640611269476, + "grad_norm": 1.6640625, + "learning_rate": 1.4033572869595866e-05, + "loss": 0.8169, + "step": 6499 + }, + { + "epoch": 1.1025355252159377, + "grad_norm": 1.5859375, + "learning_rate": 1.4031920302676951e-05, + "loss": 0.7992, + "step": 6500 + }, + { + "epoch": 1.1027069893049275, + "grad_norm": 1.7265625, + "learning_rate": 1.403026760426193e-05, + "loss": 0.9074, + "step": 6501 + }, + { + "epoch": 1.1028784533939173, + "grad_norm": 1.65625, + "learning_rate": 1.4028614774404707e-05, + 
"loss": 0.8155, + "step": 6502 + }, + { + "epoch": 1.1030499174829071, + "grad_norm": 1.5703125, + "learning_rate": 1.4026961813159186e-05, + "loss": 0.8348, + "step": 6503 + }, + { + "epoch": 1.103221381571897, + "grad_norm": 1.6953125, + "learning_rate": 1.4025308720579276e-05, + "loss": 0.8808, + "step": 6504 + }, + { + "epoch": 1.1033928456608868, + "grad_norm": 1.640625, + "learning_rate": 1.4023655496718892e-05, + "loss": 0.9387, + "step": 6505 + }, + { + "epoch": 1.1035643097498768, + "grad_norm": 1.734375, + "learning_rate": 1.402200214163195e-05, + "loss": 0.9147, + "step": 6506 + }, + { + "epoch": 1.1037357738388667, + "grad_norm": 1.6796875, + "learning_rate": 1.402034865537237e-05, + "loss": 0.8228, + "step": 6507 + }, + { + "epoch": 1.1039072379278565, + "grad_norm": 1.7109375, + "learning_rate": 1.4018695037994081e-05, + "loss": 0.8541, + "step": 6508 + }, + { + "epoch": 1.1040787020168463, + "grad_norm": 1.6171875, + "learning_rate": 1.4017041289551017e-05, + "loss": 0.8786, + "step": 6509 + }, + { + "epoch": 1.1042501661058362, + "grad_norm": 1.609375, + "learning_rate": 1.4015387410097108e-05, + "loss": 0.734, + "step": 6510 + }, + { + "epoch": 1.104421630194826, + "grad_norm": 1.625, + "learning_rate": 1.4013733399686296e-05, + "loss": 0.8647, + "step": 6511 + }, + { + "epoch": 1.104593094283816, + "grad_norm": 1.6015625, + "learning_rate": 1.4012079258372523e-05, + "loss": 0.871, + "step": 6512 + }, + { + "epoch": 1.1047645583728058, + "grad_norm": 1.6875, + "learning_rate": 1.4010424986209738e-05, + "loss": 0.9104, + "step": 6513 + }, + { + "epoch": 1.1049360224617957, + "grad_norm": 1.71875, + "learning_rate": 1.4008770583251892e-05, + "loss": 0.8532, + "step": 6514 + }, + { + "epoch": 1.1051074865507855, + "grad_norm": 1.6171875, + "learning_rate": 1.4007116049552942e-05, + "loss": 0.8285, + "step": 6515 + }, + { + "epoch": 1.1052789506397753, + "grad_norm": 1.71875, + "learning_rate": 1.4005461385166847e-05, + "loss": 0.8479, + "step": 6516 + }, + { + "epoch": 1.1054504147287652, + "grad_norm": 1.609375, + "learning_rate": 1.4003806590147573e-05, + "loss": 0.8497, + "step": 6517 + }, + { + "epoch": 1.1056218788177552, + "grad_norm": 1.7109375, + "learning_rate": 1.4002151664549088e-05, + "loss": 0.8999, + "step": 6518 + }, + { + "epoch": 1.105793342906745, + "grad_norm": 1.59375, + "learning_rate": 1.4000496608425369e-05, + "loss": 0.9195, + "step": 6519 + }, + { + "epoch": 1.1059648069957349, + "grad_norm": 1.5859375, + "learning_rate": 1.3998841421830391e-05, + "loss": 0.7609, + "step": 6520 + }, + { + "epoch": 1.1061362710847247, + "grad_norm": 1.6484375, + "learning_rate": 1.3997186104818134e-05, + "loss": 0.8273, + "step": 6521 + }, + { + "epoch": 1.1063077351737145, + "grad_norm": 1.6015625, + "learning_rate": 1.3995530657442588e-05, + "loss": 0.8517, + "step": 6522 + }, + { + "epoch": 1.1064791992627043, + "grad_norm": 1.6796875, + "learning_rate": 1.3993875079757744e-05, + "loss": 0.8764, + "step": 6523 + }, + { + "epoch": 1.1066506633516944, + "grad_norm": 1.6640625, + "learning_rate": 1.399221937181759e-05, + "loss": 0.8312, + "step": 6524 + }, + { + "epoch": 1.1068221274406842, + "grad_norm": 1.5859375, + "learning_rate": 1.3990563533676129e-05, + "loss": 0.8393, + "step": 6525 + }, + { + "epoch": 1.106993591529674, + "grad_norm": 1.625, + "learning_rate": 1.3988907565387364e-05, + "loss": 0.9126, + "step": 6526 + }, + { + "epoch": 1.1071650556186639, + "grad_norm": 1.7578125, + "learning_rate": 1.3987251467005303e-05, + "loss": 0.8461, + "step": 6527 + }, 
+ { + "epoch": 1.1073365197076537, + "grad_norm": 1.640625, + "learning_rate": 1.3985595238583958e-05, + "loss": 0.889, + "step": 6528 + }, + { + "epoch": 1.1075079837966435, + "grad_norm": 1.75, + "learning_rate": 1.3983938880177345e-05, + "loss": 0.8923, + "step": 6529 + }, + { + "epoch": 1.1076794478856336, + "grad_norm": 1.6796875, + "learning_rate": 1.3982282391839483e-05, + "loss": 0.8516, + "step": 6530 + }, + { + "epoch": 1.1078509119746234, + "grad_norm": 1.7578125, + "learning_rate": 1.3980625773624394e-05, + "loss": 0.9837, + "step": 6531 + }, + { + "epoch": 1.1080223760636132, + "grad_norm": 1.7734375, + "learning_rate": 1.3978969025586109e-05, + "loss": 0.8922, + "step": 6532 + }, + { + "epoch": 1.108193840152603, + "grad_norm": 1.7421875, + "learning_rate": 1.397731214777866e-05, + "loss": 0.8567, + "step": 6533 + }, + { + "epoch": 1.1083653042415929, + "grad_norm": 1.546875, + "learning_rate": 1.3975655140256089e-05, + "loss": 0.7654, + "step": 6534 + }, + { + "epoch": 1.1085367683305827, + "grad_norm": 1.640625, + "learning_rate": 1.3973998003072429e-05, + "loss": 0.8481, + "step": 6535 + }, + { + "epoch": 1.1087082324195727, + "grad_norm": 1.5625, + "learning_rate": 1.397234073628173e-05, + "loss": 0.8526, + "step": 6536 + }, + { + "epoch": 1.1088796965085626, + "grad_norm": 1.6640625, + "learning_rate": 1.3970683339938042e-05, + "loss": 0.886, + "step": 6537 + }, + { + "epoch": 1.1090511605975524, + "grad_norm": 1.6640625, + "learning_rate": 1.3969025814095419e-05, + "loss": 0.9093, + "step": 6538 + }, + { + "epoch": 1.1092226246865422, + "grad_norm": 1.640625, + "learning_rate": 1.3967368158807915e-05, + "loss": 0.8276, + "step": 6539 + }, + { + "epoch": 1.109394088775532, + "grad_norm": 1.6484375, + "learning_rate": 1.3965710374129599e-05, + "loss": 0.8418, + "step": 6540 + }, + { + "epoch": 1.1095655528645219, + "grad_norm": 1.625, + "learning_rate": 1.3964052460114532e-05, + "loss": 0.837, + "step": 6541 + }, + { + "epoch": 1.109737016953512, + "grad_norm": 1.609375, + "learning_rate": 1.396239441681679e-05, + "loss": 0.8559, + "step": 6542 + }, + { + "epoch": 1.1099084810425017, + "grad_norm": 1.703125, + "learning_rate": 1.396073624429044e-05, + "loss": 0.96, + "step": 6543 + }, + { + "epoch": 1.1100799451314916, + "grad_norm": 1.6796875, + "learning_rate": 1.395907794258957e-05, + "loss": 0.894, + "step": 6544 + }, + { + "epoch": 1.1102514092204814, + "grad_norm": 1.6015625, + "learning_rate": 1.3957419511768257e-05, + "loss": 0.8482, + "step": 6545 + }, + { + "epoch": 1.1104228733094712, + "grad_norm": 1.640625, + "learning_rate": 1.3955760951880595e-05, + "loss": 0.8397, + "step": 6546 + }, + { + "epoch": 1.110594337398461, + "grad_norm": 1.75, + "learning_rate": 1.3954102262980672e-05, + "loss": 0.9021, + "step": 6547 + }, + { + "epoch": 1.1107658014874509, + "grad_norm": 1.703125, + "learning_rate": 1.3952443445122583e-05, + "loss": 0.8697, + "step": 6548 + }, + { + "epoch": 1.110937265576441, + "grad_norm": 1.6328125, + "learning_rate": 1.395078449836043e-05, + "loss": 0.8267, + "step": 6549 + }, + { + "epoch": 1.1111087296654307, + "grad_norm": 1.6953125, + "learning_rate": 1.3949125422748315e-05, + "loss": 0.8296, + "step": 6550 + }, + { + "epoch": 1.1112801937544206, + "grad_norm": 1.671875, + "learning_rate": 1.394746621834035e-05, + "loss": 0.9149, + "step": 6551 + }, + { + "epoch": 1.1114516578434104, + "grad_norm": 1.703125, + "learning_rate": 1.3945806885190651e-05, + "loss": 0.7707, + "step": 6552 + }, + { + "epoch": 1.1116231219324002, + "grad_norm": 
1.6640625, + "learning_rate": 1.394414742335333e-05, + "loss": 0.8369, + "step": 6553 + }, + { + "epoch": 1.1117945860213903, + "grad_norm": 1.578125, + "learning_rate": 1.3942487832882503e-05, + "loss": 0.8169, + "step": 6554 + }, + { + "epoch": 1.11196605011038, + "grad_norm": 1.65625, + "learning_rate": 1.3940828113832306e-05, + "loss": 0.8029, + "step": 6555 + }, + { + "epoch": 1.11213751419937, + "grad_norm": 1.7578125, + "learning_rate": 1.3939168266256865e-05, + "loss": 0.8772, + "step": 6556 + }, + { + "epoch": 1.1123089782883597, + "grad_norm": 1.6640625, + "learning_rate": 1.3937508290210313e-05, + "loss": 0.7688, + "step": 6557 + }, + { + "epoch": 1.1124804423773496, + "grad_norm": 1.6875, + "learning_rate": 1.393584818574679e-05, + "loss": 0.8588, + "step": 6558 + }, + { + "epoch": 1.1126519064663394, + "grad_norm": 1.65625, + "learning_rate": 1.3934187952920433e-05, + "loss": 0.8125, + "step": 6559 + }, + { + "epoch": 1.1128233705553292, + "grad_norm": 1.6484375, + "learning_rate": 1.3932527591785394e-05, + "loss": 0.9402, + "step": 6560 + }, + { + "epoch": 1.1129948346443193, + "grad_norm": 1.7109375, + "learning_rate": 1.3930867102395821e-05, + "loss": 0.891, + "step": 6561 + }, + { + "epoch": 1.113166298733309, + "grad_norm": 1.65625, + "learning_rate": 1.3929206484805869e-05, + "loss": 0.8461, + "step": 6562 + }, + { + "epoch": 1.113337762822299, + "grad_norm": 1.6171875, + "learning_rate": 1.3927545739069699e-05, + "loss": 0.7938, + "step": 6563 + }, + { + "epoch": 1.1135092269112887, + "grad_norm": 22.125, + "learning_rate": 1.3925884865241472e-05, + "loss": 1.3668, + "step": 6564 + }, + { + "epoch": 1.1136806910002786, + "grad_norm": 1.6953125, + "learning_rate": 1.3924223863375356e-05, + "loss": 0.8056, + "step": 6565 + }, + { + "epoch": 1.1138521550892686, + "grad_norm": 1.6171875, + "learning_rate": 1.3922562733525522e-05, + "loss": 0.8781, + "step": 6566 + }, + { + "epoch": 1.1140236191782584, + "grad_norm": 1.671875, + "learning_rate": 1.3920901475746147e-05, + "loss": 0.7755, + "step": 6567 + }, + { + "epoch": 1.1141950832672483, + "grad_norm": 1.7421875, + "learning_rate": 1.391924009009141e-05, + "loss": 0.8503, + "step": 6568 + }, + { + "epoch": 1.114366547356238, + "grad_norm": 1.671875, + "learning_rate": 1.3917578576615495e-05, + "loss": 0.8424, + "step": 6569 + }, + { + "epoch": 1.114538011445228, + "grad_norm": 1.6640625, + "learning_rate": 1.3915916935372594e-05, + "loss": 0.8289, + "step": 6570 + }, + { + "epoch": 1.1147094755342177, + "grad_norm": 1.703125, + "learning_rate": 1.391425516641689e-05, + "loss": 0.8466, + "step": 6571 + }, + { + "epoch": 1.1148809396232076, + "grad_norm": 1.734375, + "learning_rate": 1.3912593269802588e-05, + "loss": 0.9011, + "step": 6572 + }, + { + "epoch": 1.1150524037121976, + "grad_norm": 1.6640625, + "learning_rate": 1.3910931245583883e-05, + "loss": 0.8765, + "step": 6573 + }, + { + "epoch": 1.1152238678011874, + "grad_norm": 1.7109375, + "learning_rate": 1.3909269093814985e-05, + "loss": 0.8353, + "step": 6574 + }, + { + "epoch": 1.1153953318901773, + "grad_norm": 1.6953125, + "learning_rate": 1.3907606814550105e-05, + "loss": 0.9173, + "step": 6575 + }, + { + "epoch": 1.115566795979167, + "grad_norm": 1.546875, + "learning_rate": 1.3905944407843447e-05, + "loss": 0.8287, + "step": 6576 + }, + { + "epoch": 1.115738260068157, + "grad_norm": 1.65625, + "learning_rate": 1.3904281873749237e-05, + "loss": 0.857, + "step": 6577 + }, + { + "epoch": 1.115909724157147, + "grad_norm": 1.75, + "learning_rate": 
1.3902619212321692e-05, + "loss": 0.8939, + "step": 6578 + }, + { + "epoch": 1.1160811882461368, + "grad_norm": 1.7734375, + "learning_rate": 1.3900956423615035e-05, + "loss": 0.8707, + "step": 6579 + }, + { + "epoch": 1.1162526523351266, + "grad_norm": 1.6796875, + "learning_rate": 1.3899293507683502e-05, + "loss": 0.7755, + "step": 6580 + }, + { + "epoch": 1.1164241164241164, + "grad_norm": 1.640625, + "learning_rate": 1.3897630464581325e-05, + "loss": 0.8604, + "step": 6581 + }, + { + "epoch": 1.1165955805131063, + "grad_norm": 1.6484375, + "learning_rate": 1.389596729436274e-05, + "loss": 0.8777, + "step": 6582 + }, + { + "epoch": 1.116767044602096, + "grad_norm": 1.625, + "learning_rate": 1.3894303997081991e-05, + "loss": 0.8515, + "step": 6583 + }, + { + "epoch": 1.116938508691086, + "grad_norm": 1.6171875, + "learning_rate": 1.3892640572793323e-05, + "loss": 0.9125, + "step": 6584 + }, + { + "epoch": 1.117109972780076, + "grad_norm": 1.703125, + "learning_rate": 1.3890977021550988e-05, + "loss": 0.8635, + "step": 6585 + }, + { + "epoch": 1.1172814368690658, + "grad_norm": 1.6953125, + "learning_rate": 1.3889313343409244e-05, + "loss": 0.8413, + "step": 6586 + }, + { + "epoch": 1.1174529009580556, + "grad_norm": 1.6171875, + "learning_rate": 1.3887649538422344e-05, + "loss": 0.8536, + "step": 6587 + }, + { + "epoch": 1.1176243650470454, + "grad_norm": 1.78125, + "learning_rate": 1.3885985606644552e-05, + "loss": 0.872, + "step": 6588 + }, + { + "epoch": 1.1177958291360353, + "grad_norm": 1.7109375, + "learning_rate": 1.3884321548130136e-05, + "loss": 0.8876, + "step": 6589 + }, + { + "epoch": 1.117967293225025, + "grad_norm": 1.625, + "learning_rate": 1.3882657362933368e-05, + "loss": 0.9244, + "step": 6590 + }, + { + "epoch": 1.1181387573140151, + "grad_norm": 1.625, + "learning_rate": 1.3880993051108522e-05, + "loss": 0.868, + "step": 6591 + }, + { + "epoch": 1.118310221403005, + "grad_norm": 1.7109375, + "learning_rate": 1.3879328612709879e-05, + "loss": 0.9156, + "step": 6592 + }, + { + "epoch": 1.1184816854919948, + "grad_norm": 1.765625, + "learning_rate": 1.387766404779172e-05, + "loss": 0.8918, + "step": 6593 + }, + { + "epoch": 1.1186531495809846, + "grad_norm": 1.671875, + "learning_rate": 1.3875999356408336e-05, + "loss": 0.9196, + "step": 6594 + }, + { + "epoch": 1.1188246136699744, + "grad_norm": 1.625, + "learning_rate": 1.3874334538614016e-05, + "loss": 0.8413, + "step": 6595 + }, + { + "epoch": 1.1189960777589643, + "grad_norm": 1.7734375, + "learning_rate": 1.3872669594463059e-05, + "loss": 0.8718, + "step": 6596 + }, + { + "epoch": 1.1191675418479543, + "grad_norm": 1.6796875, + "learning_rate": 1.3871004524009764e-05, + "loss": 0.8636, + "step": 6597 + }, + { + "epoch": 1.1193390059369441, + "grad_norm": 1.75, + "learning_rate": 1.3869339327308433e-05, + "loss": 0.9268, + "step": 6598 + }, + { + "epoch": 1.119510470025934, + "grad_norm": 1.6328125, + "learning_rate": 1.3867674004413379e-05, + "loss": 0.8645, + "step": 6599 + }, + { + "epoch": 1.1196819341149238, + "grad_norm": 1.671875, + "learning_rate": 1.3866008555378906e-05, + "loss": 0.8711, + "step": 6600 + }, + { + "epoch": 1.1198533982039136, + "grad_norm": 1.6015625, + "learning_rate": 1.3864342980259338e-05, + "loss": 0.7778, + "step": 6601 + }, + { + "epoch": 1.1200248622929034, + "grad_norm": 1.6953125, + "learning_rate": 1.3862677279108993e-05, + "loss": 0.9177, + "step": 6602 + }, + { + "epoch": 1.1201963263818935, + "grad_norm": 1.6484375, + "learning_rate": 1.3861011451982198e-05, + "loss": 0.8764, 
+ "step": 6603 + }, + { + "epoch": 1.1203677904708833, + "grad_norm": 1.6484375, + "learning_rate": 1.3859345498933283e-05, + "loss": 0.7912, + "step": 6604 + }, + { + "epoch": 1.1205392545598731, + "grad_norm": 1.59375, + "learning_rate": 1.3857679420016575e-05, + "loss": 0.8659, + "step": 6605 + }, + { + "epoch": 1.120710718648863, + "grad_norm": 1.6484375, + "learning_rate": 1.3856013215286415e-05, + "loss": 0.832, + "step": 6606 + }, + { + "epoch": 1.1208821827378528, + "grad_norm": 1.6875, + "learning_rate": 1.3854346884797143e-05, + "loss": 0.8807, + "step": 6607 + }, + { + "epoch": 1.1210536468268426, + "grad_norm": 1.6953125, + "learning_rate": 1.3852680428603105e-05, + "loss": 0.8882, + "step": 6608 + }, + { + "epoch": 1.1212251109158327, + "grad_norm": 1.7109375, + "learning_rate": 1.3851013846758651e-05, + "loss": 0.9261, + "step": 6609 + }, + { + "epoch": 1.1213965750048225, + "grad_norm": 1.609375, + "learning_rate": 1.3849347139318134e-05, + "loss": 0.8339, + "step": 6610 + }, + { + "epoch": 1.1215680390938123, + "grad_norm": 1.6015625, + "learning_rate": 1.3847680306335911e-05, + "loss": 0.8992, + "step": 6611 + }, + { + "epoch": 1.1217395031828021, + "grad_norm": 1.6171875, + "learning_rate": 1.3846013347866345e-05, + "loss": 0.8608, + "step": 6612 + }, + { + "epoch": 1.121910967271792, + "grad_norm": 1.609375, + "learning_rate": 1.38443462639638e-05, + "loss": 0.922, + "step": 6613 + }, + { + "epoch": 1.1220824313607818, + "grad_norm": 1.65625, + "learning_rate": 1.3842679054682646e-05, + "loss": 0.7879, + "step": 6614 + }, + { + "epoch": 1.1222538954497718, + "grad_norm": 1.640625, + "learning_rate": 1.3841011720077262e-05, + "loss": 0.8357, + "step": 6615 + }, + { + "epoch": 1.1224253595387617, + "grad_norm": 1.6640625, + "learning_rate": 1.383934426020202e-05, + "loss": 0.814, + "step": 6616 + }, + { + "epoch": 1.1225968236277515, + "grad_norm": 1.625, + "learning_rate": 1.3837676675111303e-05, + "loss": 0.8158, + "step": 6617 + }, + { + "epoch": 1.1227682877167413, + "grad_norm": 1.65625, + "learning_rate": 1.3836008964859497e-05, + "loss": 0.8426, + "step": 6618 + }, + { + "epoch": 1.1229397518057311, + "grad_norm": 1.75, + "learning_rate": 1.3834341129500995e-05, + "loss": 0.9509, + "step": 6619 + }, + { + "epoch": 1.123111215894721, + "grad_norm": 1.6484375, + "learning_rate": 1.3832673169090187e-05, + "loss": 0.8784, + "step": 6620 + }, + { + "epoch": 1.123282679983711, + "grad_norm": 1.625, + "learning_rate": 1.3831005083681479e-05, + "loss": 0.8034, + "step": 6621 + }, + { + "epoch": 1.1234541440727008, + "grad_norm": 1.6484375, + "learning_rate": 1.3829336873329266e-05, + "loss": 0.9271, + "step": 6622 + }, + { + "epoch": 1.1236256081616907, + "grad_norm": 1.578125, + "learning_rate": 1.382766853808796e-05, + "loss": 0.8658, + "step": 6623 + }, + { + "epoch": 1.1237970722506805, + "grad_norm": 1.6640625, + "learning_rate": 1.3826000078011968e-05, + "loss": 0.8444, + "step": 6624 + }, + { + "epoch": 1.1239685363396703, + "grad_norm": 1.7265625, + "learning_rate": 1.3824331493155706e-05, + "loss": 0.9042, + "step": 6625 + }, + { + "epoch": 1.1241400004286601, + "grad_norm": 1.6640625, + "learning_rate": 1.3822662783573595e-05, + "loss": 0.7856, + "step": 6626 + }, + { + "epoch": 1.1243114645176502, + "grad_norm": 1.6171875, + "learning_rate": 1.3820993949320054e-05, + "loss": 0.8483, + "step": 6627 + }, + { + "epoch": 1.12448292860664, + "grad_norm": 1.59375, + "learning_rate": 1.381932499044951e-05, + "loss": 0.8497, + "step": 6628 + }, + { + "epoch": 
1.1246543926956298, + "grad_norm": 1.6328125, + "learning_rate": 1.3817655907016399e-05, + "loss": 0.896, + "step": 6629 + }, + { + "epoch": 1.1248258567846197, + "grad_norm": 1.7265625, + "learning_rate": 1.381598669907515e-05, + "loss": 0.8452, + "step": 6630 + }, + { + "epoch": 1.1249973208736095, + "grad_norm": 1.6953125, + "learning_rate": 1.3814317366680206e-05, + "loss": 0.8716, + "step": 6631 + }, + { + "epoch": 1.1251687849625993, + "grad_norm": 1.7265625, + "learning_rate": 1.381264790988601e-05, + "loss": 0.882, + "step": 6632 + }, + { + "epoch": 1.1253402490515891, + "grad_norm": 1.6640625, + "learning_rate": 1.381097832874701e-05, + "loss": 0.8736, + "step": 6633 + }, + { + "epoch": 1.1255117131405792, + "grad_norm": 1.703125, + "learning_rate": 1.3809308623317655e-05, + "loss": 0.8683, + "step": 6634 + }, + { + "epoch": 1.125683177229569, + "grad_norm": 1.734375, + "learning_rate": 1.3807638793652401e-05, + "loss": 0.908, + "step": 6635 + }, + { + "epoch": 1.1258546413185588, + "grad_norm": 1.6484375, + "learning_rate": 1.3805968839805709e-05, + "loss": 0.8239, + "step": 6636 + }, + { + "epoch": 1.1260261054075487, + "grad_norm": 1.703125, + "learning_rate": 1.3804298761832038e-05, + "loss": 0.9048, + "step": 6637 + }, + { + "epoch": 1.1261975694965385, + "grad_norm": 1.703125, + "learning_rate": 1.380262855978586e-05, + "loss": 0.8182, + "step": 6638 + }, + { + "epoch": 1.1263690335855285, + "grad_norm": 1.6484375, + "learning_rate": 1.3800958233721644e-05, + "loss": 0.9245, + "step": 6639 + }, + { + "epoch": 1.1265404976745184, + "grad_norm": 1.65625, + "learning_rate": 1.379928778369387e-05, + "loss": 0.8484, + "step": 6640 + }, + { + "epoch": 1.1267119617635082, + "grad_norm": 1.6171875, + "learning_rate": 1.3797617209757013e-05, + "loss": 0.8341, + "step": 6641 + }, + { + "epoch": 1.126883425852498, + "grad_norm": 1.65625, + "learning_rate": 1.3795946511965557e-05, + "loss": 0.8392, + "step": 6642 + }, + { + "epoch": 1.1270548899414878, + "grad_norm": 1.6171875, + "learning_rate": 1.3794275690373993e-05, + "loss": 0.9658, + "step": 6643 + }, + { + "epoch": 1.1272263540304777, + "grad_norm": 1.609375, + "learning_rate": 1.379260474503681e-05, + "loss": 0.8144, + "step": 6644 + }, + { + "epoch": 1.1273978181194675, + "grad_norm": 1.7421875, + "learning_rate": 1.3790933676008505e-05, + "loss": 0.8777, + "step": 6645 + }, + { + "epoch": 1.1275692822084575, + "grad_norm": 1.6953125, + "learning_rate": 1.3789262483343578e-05, + "loss": 0.8284, + "step": 6646 + }, + { + "epoch": 1.1277407462974474, + "grad_norm": 2.03125, + "learning_rate": 1.3787591167096528e-05, + "loss": 0.846, + "step": 6647 + }, + { + "epoch": 1.1279122103864372, + "grad_norm": 1.609375, + "learning_rate": 1.3785919727321872e-05, + "loss": 0.8, + "step": 6648 + }, + { + "epoch": 1.128083674475427, + "grad_norm": 1.609375, + "learning_rate": 1.3784248164074116e-05, + "loss": 0.9209, + "step": 6649 + }, + { + "epoch": 1.1282551385644168, + "grad_norm": 1.625, + "learning_rate": 1.3782576477407778e-05, + "loss": 0.8437, + "step": 6650 + }, + { + "epoch": 1.1284266026534069, + "grad_norm": 1.7109375, + "learning_rate": 1.3780904667377377e-05, + "loss": 0.9282, + "step": 6651 + }, + { + "epoch": 1.1285980667423967, + "grad_norm": 1.6328125, + "learning_rate": 1.3779232734037437e-05, + "loss": 0.8401, + "step": 6652 + }, + { + "epoch": 1.1287695308313865, + "grad_norm": 1.7890625, + "learning_rate": 1.3777560677442485e-05, + "loss": 0.986, + "step": 6653 + }, + { + "epoch": 1.1289409949203764, + "grad_norm": 
1.7265625, + "learning_rate": 1.3775888497647056e-05, + "loss": 0.8463, + "step": 6654 + }, + { + "epoch": 1.1291124590093662, + "grad_norm": 1.71875, + "learning_rate": 1.3774216194705683e-05, + "loss": 0.8735, + "step": 6655 + }, + { + "epoch": 1.129283923098356, + "grad_norm": 1.671875, + "learning_rate": 1.377254376867291e-05, + "loss": 0.8773, + "step": 6656 + }, + { + "epoch": 1.1294553871873458, + "grad_norm": 1.640625, + "learning_rate": 1.3770871219603276e-05, + "loss": 0.9427, + "step": 6657 + }, + { + "epoch": 1.1296268512763359, + "grad_norm": 1.640625, + "learning_rate": 1.3769198547551333e-05, + "loss": 0.8364, + "step": 6658 + }, + { + "epoch": 1.1297983153653257, + "grad_norm": 1.65625, + "learning_rate": 1.3767525752571631e-05, + "loss": 0.835, + "step": 6659 + }, + { + "epoch": 1.1299697794543155, + "grad_norm": 1.640625, + "learning_rate": 1.3765852834718727e-05, + "loss": 0.9108, + "step": 6660 + }, + { + "epoch": 1.1301412435433054, + "grad_norm": 1.609375, + "learning_rate": 1.3764179794047183e-05, + "loss": 0.8486, + "step": 6661 + }, + { + "epoch": 1.1303127076322952, + "grad_norm": 1.59375, + "learning_rate": 1.376250663061156e-05, + "loss": 0.8227, + "step": 6662 + }, + { + "epoch": 1.1304841717212852, + "grad_norm": 1.5859375, + "learning_rate": 1.3760833344466428e-05, + "loss": 0.8379, + "step": 6663 + }, + { + "epoch": 1.130655635810275, + "grad_norm": 1.7578125, + "learning_rate": 1.3759159935666358e-05, + "loss": 0.8692, + "step": 6664 + }, + { + "epoch": 1.1308270998992649, + "grad_norm": 1.671875, + "learning_rate": 1.3757486404265927e-05, + "loss": 0.8368, + "step": 6665 + }, + { + "epoch": 1.1309985639882547, + "grad_norm": 1.625, + "learning_rate": 1.3755812750319716e-05, + "loss": 0.793, + "step": 6666 + }, + { + "epoch": 1.1311700280772445, + "grad_norm": 1.65625, + "learning_rate": 1.3754138973882304e-05, + "loss": 0.8666, + "step": 6667 + }, + { + "epoch": 1.1313414921662344, + "grad_norm": 1.75, + "learning_rate": 1.3752465075008288e-05, + "loss": 0.909, + "step": 6668 + }, + { + "epoch": 1.1315129562552242, + "grad_norm": 1.703125, + "learning_rate": 1.3750791053752254e-05, + "loss": 0.9476, + "step": 6669 + }, + { + "epoch": 1.1316844203442142, + "grad_norm": 1.6171875, + "learning_rate": 1.3749116910168798e-05, + "loss": 0.8361, + "step": 6670 + }, + { + "epoch": 1.131855884433204, + "grad_norm": 1.6875, + "learning_rate": 1.3747442644312523e-05, + "loss": 0.9592, + "step": 6671 + }, + { + "epoch": 1.1320273485221939, + "grad_norm": 1.671875, + "learning_rate": 1.3745768256238033e-05, + "loss": 0.8749, + "step": 6672 + }, + { + "epoch": 1.1321988126111837, + "grad_norm": 1.6015625, + "learning_rate": 1.3744093745999935e-05, + "loss": 0.7972, + "step": 6673 + }, + { + "epoch": 1.1323702767001735, + "grad_norm": 1.6796875, + "learning_rate": 1.374241911365284e-05, + "loss": 0.8684, + "step": 6674 + }, + { + "epoch": 1.1325417407891636, + "grad_norm": 1.6640625, + "learning_rate": 1.3740744359251363e-05, + "loss": 0.7984, + "step": 6675 + }, + { + "epoch": 1.1327132048781534, + "grad_norm": 1.7421875, + "learning_rate": 1.3739069482850128e-05, + "loss": 0.8367, + "step": 6676 + }, + { + "epoch": 1.1328846689671432, + "grad_norm": 1.703125, + "learning_rate": 1.3737394484503756e-05, + "loss": 0.8666, + "step": 6677 + }, + { + "epoch": 1.133056133056133, + "grad_norm": 1.65625, + "learning_rate": 1.3735719364266878e-05, + "loss": 0.9206, + "step": 6678 + }, + { + "epoch": 1.133227597145123, + "grad_norm": 1.671875, + "learning_rate": 
1.3734044122194126e-05, + "loss": 0.8831, + "step": 6679 + }, + { + "epoch": 1.1333990612341127, + "grad_norm": 1.6171875, + "learning_rate": 1.373236875834013e-05, + "loss": 0.8538, + "step": 6680 + }, + { + "epoch": 1.1335705253231025, + "grad_norm": 1.734375, + "learning_rate": 1.3730693272759537e-05, + "loss": 0.95, + "step": 6681 + }, + { + "epoch": 1.1337419894120926, + "grad_norm": 1.7265625, + "learning_rate": 1.3729017665506985e-05, + "loss": 0.8747, + "step": 6682 + }, + { + "epoch": 1.1339134535010824, + "grad_norm": 1.7421875, + "learning_rate": 1.3727341936637126e-05, + "loss": 0.8905, + "step": 6683 + }, + { + "epoch": 1.1340849175900722, + "grad_norm": 1.6796875, + "learning_rate": 1.372566608620461e-05, + "loss": 0.8614, + "step": 6684 + }, + { + "epoch": 1.134256381679062, + "grad_norm": 1.5546875, + "learning_rate": 1.3723990114264094e-05, + "loss": 0.8072, + "step": 6685 + }, + { + "epoch": 1.134427845768052, + "grad_norm": 1.6484375, + "learning_rate": 1.3722314020870237e-05, + "loss": 0.798, + "step": 6686 + }, + { + "epoch": 1.134599309857042, + "grad_norm": 1.7578125, + "learning_rate": 1.3720637806077703e-05, + "loss": 0.9451, + "step": 6687 + }, + { + "epoch": 1.1347707739460318, + "grad_norm": 1.71875, + "learning_rate": 1.3718961469941159e-05, + "loss": 0.8844, + "step": 6688 + }, + { + "epoch": 1.1349422380350216, + "grad_norm": 1.671875, + "learning_rate": 1.3717285012515278e-05, + "loss": 0.7849, + "step": 6689 + }, + { + "epoch": 1.1351137021240114, + "grad_norm": 1.65625, + "learning_rate": 1.3715608433854738e-05, + "loss": 0.9024, + "step": 6690 + }, + { + "epoch": 1.1352851662130012, + "grad_norm": 1.6875, + "learning_rate": 1.371393173401421e-05, + "loss": 0.8926, + "step": 6691 + }, + { + "epoch": 1.135456630301991, + "grad_norm": 1.7734375, + "learning_rate": 1.3712254913048384e-05, + "loss": 0.8341, + "step": 6692 + }, + { + "epoch": 1.135628094390981, + "grad_norm": 1.7109375, + "learning_rate": 1.3710577971011948e-05, + "loss": 0.934, + "step": 6693 + }, + { + "epoch": 1.135799558479971, + "grad_norm": 1.671875, + "learning_rate": 1.3708900907959585e-05, + "loss": 0.847, + "step": 6694 + }, + { + "epoch": 1.1359710225689608, + "grad_norm": 1.5703125, + "learning_rate": 1.3707223723946003e-05, + "loss": 0.827, + "step": 6695 + }, + { + "epoch": 1.1361424866579506, + "grad_norm": 1.65625, + "learning_rate": 1.3705546419025897e-05, + "loss": 0.8121, + "step": 6696 + }, + { + "epoch": 1.1363139507469404, + "grad_norm": 1.578125, + "learning_rate": 1.3703868993253967e-05, + "loss": 0.7692, + "step": 6697 + }, + { + "epoch": 1.1364854148359302, + "grad_norm": 1.6328125, + "learning_rate": 1.370219144668492e-05, + "loss": 0.8743, + "step": 6698 + }, + { + "epoch": 1.13665687892492, + "grad_norm": 1.6484375, + "learning_rate": 1.3700513779373467e-05, + "loss": 0.9635, + "step": 6699 + }, + { + "epoch": 1.1368283430139101, + "grad_norm": 1.7265625, + "learning_rate": 1.3698835991374328e-05, + "loss": 0.9609, + "step": 6700 + }, + { + "epoch": 1.1369998071029, + "grad_norm": 1.6640625, + "learning_rate": 1.3697158082742219e-05, + "loss": 0.8923, + "step": 6701 + }, + { + "epoch": 1.1371712711918898, + "grad_norm": 1.65625, + "learning_rate": 1.3695480053531864e-05, + "loss": 0.809, + "step": 6702 + }, + { + "epoch": 1.1373427352808796, + "grad_norm": 1.625, + "learning_rate": 1.3693801903797984e-05, + "loss": 0.8371, + "step": 6703 + }, + { + "epoch": 1.1375141993698694, + "grad_norm": 1.6875, + "learning_rate": 1.3692123633595315e-05, + "loss": 0.9109, + 
"step": 6704 + }, + { + "epoch": 1.1376856634588592, + "grad_norm": 1.7109375, + "learning_rate": 1.3690445242978594e-05, + "loss": 0.9534, + "step": 6705 + }, + { + "epoch": 1.1378571275478493, + "grad_norm": 1.640625, + "learning_rate": 1.3688766732002555e-05, + "loss": 0.8689, + "step": 6706 + }, + { + "epoch": 1.1380285916368391, + "grad_norm": 1.7109375, + "learning_rate": 1.3687088100721942e-05, + "loss": 0.8972, + "step": 6707 + }, + { + "epoch": 1.138200055725829, + "grad_norm": 1.71875, + "learning_rate": 1.3685409349191505e-05, + "loss": 0.8928, + "step": 6708 + }, + { + "epoch": 1.1383715198148188, + "grad_norm": 1.5859375, + "learning_rate": 1.3683730477465989e-05, + "loss": 0.882, + "step": 6709 + }, + { + "epoch": 1.1385429839038086, + "grad_norm": 1.75, + "learning_rate": 1.3682051485600148e-05, + "loss": 0.9091, + "step": 6710 + }, + { + "epoch": 1.1387144479927984, + "grad_norm": 1.6015625, + "learning_rate": 1.3680372373648744e-05, + "loss": 0.795, + "step": 6711 + }, + { + "epoch": 1.1388859120817885, + "grad_norm": 1.6953125, + "learning_rate": 1.3678693141666539e-05, + "loss": 0.9273, + "step": 6712 + }, + { + "epoch": 1.1390573761707783, + "grad_norm": 1.6640625, + "learning_rate": 1.3677013789708297e-05, + "loss": 0.9238, + "step": 6713 + }, + { + "epoch": 1.1392288402597681, + "grad_norm": 1.6328125, + "learning_rate": 1.367533431782879e-05, + "loss": 0.8532, + "step": 6714 + }, + { + "epoch": 1.139400304348758, + "grad_norm": 1.65625, + "learning_rate": 1.367365472608279e-05, + "loss": 0.8438, + "step": 6715 + }, + { + "epoch": 1.1395717684377478, + "grad_norm": 1.6875, + "learning_rate": 1.3671975014525076e-05, + "loss": 0.8949, + "step": 6716 + }, + { + "epoch": 1.1397432325267376, + "grad_norm": 1.7578125, + "learning_rate": 1.3670295183210431e-05, + "loss": 0.8876, + "step": 6717 + }, + { + "epoch": 1.1399146966157276, + "grad_norm": 1.6015625, + "learning_rate": 1.3668615232193636e-05, + "loss": 0.864, + "step": 6718 + }, + { + "epoch": 1.1400861607047175, + "grad_norm": 1.6328125, + "learning_rate": 1.3666935161529487e-05, + "loss": 0.8316, + "step": 6719 + }, + { + "epoch": 1.1402576247937073, + "grad_norm": 1.7578125, + "learning_rate": 1.3665254971272772e-05, + "loss": 0.9238, + "step": 6720 + }, + { + "epoch": 1.1404290888826971, + "grad_norm": 1.7890625, + "learning_rate": 1.3663574661478291e-05, + "loss": 0.9234, + "step": 6721 + }, + { + "epoch": 1.140600552971687, + "grad_norm": 1.796875, + "learning_rate": 1.366189423220084e-05, + "loss": 0.9048, + "step": 6722 + }, + { + "epoch": 1.1407720170606768, + "grad_norm": 1.7421875, + "learning_rate": 1.3660213683495233e-05, + "loss": 0.9594, + "step": 6723 + }, + { + "epoch": 1.1409434811496668, + "grad_norm": 1.6328125, + "learning_rate": 1.3658533015416276e-05, + "loss": 0.8342, + "step": 6724 + }, + { + "epoch": 1.1411149452386566, + "grad_norm": 1.6796875, + "learning_rate": 1.3656852228018779e-05, + "loss": 0.8728, + "step": 6725 + }, + { + "epoch": 1.1412864093276465, + "grad_norm": 1.640625, + "learning_rate": 1.3655171321357561e-05, + "loss": 0.8813, + "step": 6726 + }, + { + "epoch": 1.1414578734166363, + "grad_norm": 1.7109375, + "learning_rate": 1.3653490295487442e-05, + "loss": 0.8986, + "step": 6727 + }, + { + "epoch": 1.1416293375056261, + "grad_norm": 1.75, + "learning_rate": 1.3651809150463246e-05, + "loss": 0.9289, + "step": 6728 + }, + { + "epoch": 1.141800801594616, + "grad_norm": 1.6640625, + "learning_rate": 1.3650127886339801e-05, + "loss": 0.8557, + "step": 6729 + }, + { + "epoch": 
1.1419722656836058, + "grad_norm": 1.640625, + "learning_rate": 1.3648446503171942e-05, + "loss": 0.9425, + "step": 6730 + }, + { + "epoch": 1.1421437297725958, + "grad_norm": 1.609375, + "learning_rate": 1.3646765001014504e-05, + "loss": 0.8632, + "step": 6731 + }, + { + "epoch": 1.1423151938615856, + "grad_norm": 1.625, + "learning_rate": 1.3645083379922327e-05, + "loss": 0.7903, + "step": 6732 + }, + { + "epoch": 1.1424866579505755, + "grad_norm": 1.71875, + "learning_rate": 1.3643401639950253e-05, + "loss": 0.9099, + "step": 6733 + }, + { + "epoch": 1.1426581220395653, + "grad_norm": 1.6953125, + "learning_rate": 1.3641719781153132e-05, + "loss": 0.8892, + "step": 6734 + }, + { + "epoch": 1.1428295861285551, + "grad_norm": 1.671875, + "learning_rate": 1.3640037803585818e-05, + "loss": 0.843, + "step": 6735 + }, + { + "epoch": 1.1430010502175452, + "grad_norm": 1.7265625, + "learning_rate": 1.3638355707303163e-05, + "loss": 0.8157, + "step": 6736 + }, + { + "epoch": 1.143172514306535, + "grad_norm": 1.5703125, + "learning_rate": 1.3636673492360029e-05, + "loss": 0.8234, + "step": 6737 + }, + { + "epoch": 1.1433439783955248, + "grad_norm": 1.6796875, + "learning_rate": 1.3634991158811276e-05, + "loss": 0.8297, + "step": 6738 + }, + { + "epoch": 1.1435154424845146, + "grad_norm": 1.7578125, + "learning_rate": 1.3633308706711772e-05, + "loss": 0.9657, + "step": 6739 + }, + { + "epoch": 1.1436869065735045, + "grad_norm": 1.5390625, + "learning_rate": 1.3631626136116392e-05, + "loss": 0.7461, + "step": 6740 + }, + { + "epoch": 1.1438583706624943, + "grad_norm": 1.6484375, + "learning_rate": 1.3629943447080003e-05, + "loss": 0.8086, + "step": 6741 + }, + { + "epoch": 1.1440298347514841, + "grad_norm": 1.7265625, + "learning_rate": 1.3628260639657496e-05, + "loss": 0.8388, + "step": 6742 + }, + { + "epoch": 1.1442012988404742, + "grad_norm": 1.6953125, + "learning_rate": 1.3626577713903744e-05, + "loss": 0.9211, + "step": 6743 + }, + { + "epoch": 1.144372762929464, + "grad_norm": 1.640625, + "learning_rate": 1.3624894669873636e-05, + "loss": 0.8763, + "step": 6744 + }, + { + "epoch": 1.1445442270184538, + "grad_norm": 1.84375, + "learning_rate": 1.3623211507622063e-05, + "loss": 0.9707, + "step": 6745 + }, + { + "epoch": 1.1447156911074436, + "grad_norm": 1.5859375, + "learning_rate": 1.3621528227203918e-05, + "loss": 0.841, + "step": 6746 + }, + { + "epoch": 1.1448871551964335, + "grad_norm": 1.609375, + "learning_rate": 1.3619844828674101e-05, + "loss": 0.8323, + "step": 6747 + }, + { + "epoch": 1.1450586192854235, + "grad_norm": 1.7109375, + "learning_rate": 1.3618161312087515e-05, + "loss": 0.838, + "step": 6748 + }, + { + "epoch": 1.1452300833744133, + "grad_norm": 1.6953125, + "learning_rate": 1.361647767749906e-05, + "loss": 0.8694, + "step": 6749 + }, + { + "epoch": 1.1454015474634032, + "grad_norm": 1.6953125, + "learning_rate": 1.3614793924963649e-05, + "loss": 0.8692, + "step": 6750 + }, + { + "epoch": 1.145573011552393, + "grad_norm": 1.6953125, + "learning_rate": 1.36131100545362e-05, + "loss": 0.8843, + "step": 6751 + }, + { + "epoch": 1.1457444756413828, + "grad_norm": 1.765625, + "learning_rate": 1.3611426066271625e-05, + "loss": 0.9358, + "step": 6752 + }, + { + "epoch": 1.1459159397303726, + "grad_norm": 1.625, + "learning_rate": 1.3609741960224847e-05, + "loss": 0.7755, + "step": 6753 + }, + { + "epoch": 1.1460874038193625, + "grad_norm": 1.6953125, + "learning_rate": 1.3608057736450792e-05, + "loss": 0.9001, + "step": 6754 + }, + { + "epoch": 1.1462588679083525, + 
"grad_norm": 1.671875, + "learning_rate": 1.3606373395004384e-05, + "loss": 0.9319, + "step": 6755 + }, + { + "epoch": 1.1464303319973423, + "grad_norm": 1.6171875, + "learning_rate": 1.3604688935940562e-05, + "loss": 0.8448, + "step": 6756 + }, + { + "epoch": 1.1466017960863322, + "grad_norm": 1.6484375, + "learning_rate": 1.3603004359314259e-05, + "loss": 0.8968, + "step": 6757 + }, + { + "epoch": 1.146773260175322, + "grad_norm": 1.703125, + "learning_rate": 1.3601319665180415e-05, + "loss": 0.8989, + "step": 6758 + }, + { + "epoch": 1.1469447242643118, + "grad_norm": 1.640625, + "learning_rate": 1.3599634853593977e-05, + "loss": 0.896, + "step": 6759 + }, + { + "epoch": 1.1471161883533019, + "grad_norm": 1.7734375, + "learning_rate": 1.359794992460989e-05, + "loss": 0.9144, + "step": 6760 + }, + { + "epoch": 1.1472876524422917, + "grad_norm": 1.8203125, + "learning_rate": 1.3596264878283107e-05, + "loss": 0.8946, + "step": 6761 + }, + { + "epoch": 1.1474591165312815, + "grad_norm": 1.640625, + "learning_rate": 1.3594579714668585e-05, + "loss": 0.8703, + "step": 6762 + }, + { + "epoch": 1.1476305806202713, + "grad_norm": 1.7265625, + "learning_rate": 1.3592894433821284e-05, + "loss": 0.9466, + "step": 6763 + }, + { + "epoch": 1.1478020447092612, + "grad_norm": 1.75, + "learning_rate": 1.3591209035796164e-05, + "loss": 0.8969, + "step": 6764 + }, + { + "epoch": 1.147973508798251, + "grad_norm": 1.640625, + "learning_rate": 1.3589523520648195e-05, + "loss": 0.9025, + "step": 6765 + }, + { + "epoch": 1.1481449728872408, + "grad_norm": 1.578125, + "learning_rate": 1.3587837888432348e-05, + "loss": 0.8256, + "step": 6766 + }, + { + "epoch": 1.1483164369762309, + "grad_norm": 1.6015625, + "learning_rate": 1.3586152139203596e-05, + "loss": 0.8648, + "step": 6767 + }, + { + "epoch": 1.1484879010652207, + "grad_norm": 1.703125, + "learning_rate": 1.3584466273016919e-05, + "loss": 0.8814, + "step": 6768 + }, + { + "epoch": 1.1486593651542105, + "grad_norm": 1.6796875, + "learning_rate": 1.3582780289927296e-05, + "loss": 0.9134, + "step": 6769 + }, + { + "epoch": 1.1488308292432003, + "grad_norm": 1.703125, + "learning_rate": 1.3581094189989718e-05, + "loss": 0.8339, + "step": 6770 + }, + { + "epoch": 1.1490022933321902, + "grad_norm": 1.71875, + "learning_rate": 1.3579407973259176e-05, + "loss": 0.8392, + "step": 6771 + }, + { + "epoch": 1.1491737574211802, + "grad_norm": 1.6171875, + "learning_rate": 1.3577721639790658e-05, + "loss": 0.8614, + "step": 6772 + }, + { + "epoch": 1.14934522151017, + "grad_norm": 1.625, + "learning_rate": 1.3576035189639165e-05, + "loss": 0.7583, + "step": 6773 + }, + { + "epoch": 1.1495166855991599, + "grad_norm": 1.6171875, + "learning_rate": 1.35743486228597e-05, + "loss": 0.8958, + "step": 6774 + }, + { + "epoch": 1.1496881496881497, + "grad_norm": 1.6640625, + "learning_rate": 1.3572661939507266e-05, + "loss": 0.8706, + "step": 6775 + }, + { + "epoch": 1.1498596137771395, + "grad_norm": 1.65625, + "learning_rate": 1.3570975139636872e-05, + "loss": 0.8905, + "step": 6776 + }, + { + "epoch": 1.1500310778661293, + "grad_norm": 1.6953125, + "learning_rate": 1.3569288223303531e-05, + "loss": 0.876, + "step": 6777 + }, + { + "epoch": 1.1502025419551192, + "grad_norm": 1.6953125, + "learning_rate": 1.3567601190562264e-05, + "loss": 0.818, + "step": 6778 + }, + { + "epoch": 1.1503740060441092, + "grad_norm": 1.7109375, + "learning_rate": 1.3565914041468085e-05, + "loss": 0.9179, + "step": 6779 + }, + { + "epoch": 1.150545470133099, + "grad_norm": 1.71875, + 
"learning_rate": 1.3564226776076022e-05, + "loss": 0.8875, + "step": 6780 + }, + { + "epoch": 1.1507169342220889, + "grad_norm": 1.578125, + "learning_rate": 1.3562539394441101e-05, + "loss": 0.875, + "step": 6781 + }, + { + "epoch": 1.1508883983110787, + "grad_norm": 1.6328125, + "learning_rate": 1.3560851896618354e-05, + "loss": 0.8554, + "step": 6782 + }, + { + "epoch": 1.1510598624000685, + "grad_norm": 1.78125, + "learning_rate": 1.3559164282662821e-05, + "loss": 0.8489, + "step": 6783 + }, + { + "epoch": 1.1512313264890586, + "grad_norm": 1.609375, + "learning_rate": 1.3557476552629538e-05, + "loss": 0.8179, + "step": 6784 + }, + { + "epoch": 1.1514027905780484, + "grad_norm": 1.640625, + "learning_rate": 1.3555788706573547e-05, + "loss": 0.9496, + "step": 6785 + }, + { + "epoch": 1.1515742546670382, + "grad_norm": 1.625, + "learning_rate": 1.3554100744549899e-05, + "loss": 0.8999, + "step": 6786 + }, + { + "epoch": 1.151745718756028, + "grad_norm": 1.6640625, + "learning_rate": 1.3552412666613637e-05, + "loss": 0.816, + "step": 6787 + }, + { + "epoch": 1.1519171828450179, + "grad_norm": 1.6796875, + "learning_rate": 1.3550724472819825e-05, + "loss": 0.8914, + "step": 6788 + }, + { + "epoch": 1.1520886469340077, + "grad_norm": 1.6796875, + "learning_rate": 1.3549036163223515e-05, + "loss": 0.8921, + "step": 6789 + }, + { + "epoch": 1.1522601110229975, + "grad_norm": 1.6171875, + "learning_rate": 1.3547347737879772e-05, + "loss": 0.8957, + "step": 6790 + }, + { + "epoch": 1.1524315751119876, + "grad_norm": 1.578125, + "learning_rate": 1.3545659196843661e-05, + "loss": 0.8737, + "step": 6791 + }, + { + "epoch": 1.1526030392009774, + "grad_norm": 1.671875, + "learning_rate": 1.3543970540170253e-05, + "loss": 0.8306, + "step": 6792 + }, + { + "epoch": 1.1527745032899672, + "grad_norm": 1.765625, + "learning_rate": 1.3542281767914617e-05, + "loss": 0.9172, + "step": 6793 + }, + { + "epoch": 1.152945967378957, + "grad_norm": 1.703125, + "learning_rate": 1.3540592880131839e-05, + "loss": 0.8589, + "step": 6794 + }, + { + "epoch": 1.1531174314679469, + "grad_norm": 1.625, + "learning_rate": 1.3538903876876993e-05, + "loss": 0.8532, + "step": 6795 + }, + { + "epoch": 1.1532888955569367, + "grad_norm": 1.78125, + "learning_rate": 1.353721475820516e-05, + "loss": 1.0322, + "step": 6796 + }, + { + "epoch": 1.1534603596459267, + "grad_norm": 1.6875, + "learning_rate": 1.3535525524171438e-05, + "loss": 0.8166, + "step": 6797 + }, + { + "epoch": 1.1536318237349166, + "grad_norm": 1.6328125, + "learning_rate": 1.3533836174830915e-05, + "loss": 0.8826, + "step": 6798 + }, + { + "epoch": 1.1538032878239064, + "grad_norm": 1.765625, + "learning_rate": 1.3532146710238684e-05, + "loss": 0.8866, + "step": 6799 + }, + { + "epoch": 1.1539747519128962, + "grad_norm": 1.609375, + "learning_rate": 1.3530457130449855e-05, + "loss": 0.9088, + "step": 6800 + }, + { + "epoch": 1.154146216001886, + "grad_norm": 1.6640625, + "learning_rate": 1.3528767435519521e-05, + "loss": 0.8938, + "step": 6801 + }, + { + "epoch": 1.1543176800908759, + "grad_norm": 1.6328125, + "learning_rate": 1.3527077625502791e-05, + "loss": 0.9035, + "step": 6802 + }, + { + "epoch": 1.154489144179866, + "grad_norm": 1.59375, + "learning_rate": 1.3525387700454779e-05, + "loss": 0.8444, + "step": 6803 + }, + { + "epoch": 1.1546606082688557, + "grad_norm": 1.625, + "learning_rate": 1.3523697660430599e-05, + "loss": 0.9083, + "step": 6804 + }, + { + "epoch": 1.1548320723578456, + "grad_norm": 1.734375, + "learning_rate": 1.3522007505485368e-05, + 
"loss": 0.8714, + "step": 6805 + }, + { + "epoch": 1.1550035364468354, + "grad_norm": 1.640625, + "learning_rate": 1.3520317235674212e-05, + "loss": 0.8453, + "step": 6806 + }, + { + "epoch": 1.1551750005358252, + "grad_norm": 1.7421875, + "learning_rate": 1.3518626851052251e-05, + "loss": 0.8262, + "step": 6807 + }, + { + "epoch": 1.155346464624815, + "grad_norm": 1.75, + "learning_rate": 1.3516936351674623e-05, + "loss": 0.8558, + "step": 6808 + }, + { + "epoch": 1.155517928713805, + "grad_norm": 1.6875, + "learning_rate": 1.3515245737596453e-05, + "loss": 0.8869, + "step": 6809 + }, + { + "epoch": 1.155689392802795, + "grad_norm": 1.6640625, + "learning_rate": 1.3513555008872884e-05, + "loss": 0.8404, + "step": 6810 + }, + { + "epoch": 1.1558608568917847, + "grad_norm": 1.7421875, + "learning_rate": 1.3511864165559056e-05, + "loss": 0.8585, + "step": 6811 + }, + { + "epoch": 1.1560323209807746, + "grad_norm": 1.6875, + "learning_rate": 1.3510173207710113e-05, + "loss": 0.8694, + "step": 6812 + }, + { + "epoch": 1.1562037850697644, + "grad_norm": 1.8046875, + "learning_rate": 1.3508482135381205e-05, + "loss": 0.9258, + "step": 6813 + }, + { + "epoch": 1.1563752491587542, + "grad_norm": 1.75, + "learning_rate": 1.350679094862748e-05, + "loss": 0.8493, + "step": 6814 + }, + { + "epoch": 1.156546713247744, + "grad_norm": 1.6640625, + "learning_rate": 1.3505099647504097e-05, + "loss": 0.8316, + "step": 6815 + }, + { + "epoch": 1.156718177336734, + "grad_norm": 1.609375, + "learning_rate": 1.3503408232066215e-05, + "loss": 0.8276, + "step": 6816 + }, + { + "epoch": 1.156889641425724, + "grad_norm": 1.671875, + "learning_rate": 1.3501716702369e-05, + "loss": 0.8804, + "step": 6817 + }, + { + "epoch": 1.1570611055147138, + "grad_norm": 1.6875, + "learning_rate": 1.3500025058467618e-05, + "loss": 0.8201, + "step": 6818 + }, + { + "epoch": 1.1572325696037036, + "grad_norm": 1.6953125, + "learning_rate": 1.3498333300417238e-05, + "loss": 0.8295, + "step": 6819 + }, + { + "epoch": 1.1574040336926934, + "grad_norm": 1.6484375, + "learning_rate": 1.3496641428273032e-05, + "loss": 0.907, + "step": 6820 + }, + { + "epoch": 1.1575754977816834, + "grad_norm": 1.640625, + "learning_rate": 1.3494949442090186e-05, + "loss": 0.8573, + "step": 6821 + }, + { + "epoch": 1.1577469618706733, + "grad_norm": 1.5546875, + "learning_rate": 1.3493257341923876e-05, + "loss": 0.8617, + "step": 6822 + }, + { + "epoch": 1.157918425959663, + "grad_norm": 1.78125, + "learning_rate": 1.3491565127829293e-05, + "loss": 0.9322, + "step": 6823 + }, + { + "epoch": 1.158089890048653, + "grad_norm": 1.6875, + "learning_rate": 1.3489872799861616e-05, + "loss": 0.8675, + "step": 6824 + }, + { + "epoch": 1.1582613541376428, + "grad_norm": 1.7421875, + "learning_rate": 1.348818035807605e-05, + "loss": 0.8458, + "step": 6825 + }, + { + "epoch": 1.1584328182266326, + "grad_norm": 1.6484375, + "learning_rate": 1.3486487802527788e-05, + "loss": 0.8289, + "step": 6826 + }, + { + "epoch": 1.1586042823156224, + "grad_norm": 1.7109375, + "learning_rate": 1.3484795133272028e-05, + "loss": 0.9288, + "step": 6827 + }, + { + "epoch": 1.1587757464046125, + "grad_norm": 1.7890625, + "learning_rate": 1.348310235036398e-05, + "loss": 0.917, + "step": 6828 + }, + { + "epoch": 1.1589472104936023, + "grad_norm": 1.6796875, + "learning_rate": 1.3481409453858846e-05, + "loss": 0.8495, + "step": 6829 + }, + { + "epoch": 1.159118674582592, + "grad_norm": 1.640625, + "learning_rate": 1.3479716443811838e-05, + "loss": 0.8831, + "step": 6830 + }, + { + "epoch": 
1.159290138671582, + "grad_norm": 1.609375, + "learning_rate": 1.3478023320278175e-05, + "loss": 0.8112, + "step": 6831 + }, + { + "epoch": 1.1594616027605718, + "grad_norm": 1.7421875, + "learning_rate": 1.3476330083313074e-05, + "loss": 0.9331, + "step": 6832 + }, + { + "epoch": 1.1596330668495618, + "grad_norm": 1.6796875, + "learning_rate": 1.3474636732971758e-05, + "loss": 0.8153, + "step": 6833 + }, + { + "epoch": 1.1598045309385516, + "grad_norm": 1.75, + "learning_rate": 1.3472943269309455e-05, + "loss": 0.8215, + "step": 6834 + }, + { + "epoch": 1.1599759950275415, + "grad_norm": 1.625, + "learning_rate": 1.3471249692381394e-05, + "loss": 0.8797, + "step": 6835 + }, + { + "epoch": 1.1601474591165313, + "grad_norm": 1.8125, + "learning_rate": 1.3469556002242808e-05, + "loss": 0.846, + "step": 6836 + }, + { + "epoch": 1.160318923205521, + "grad_norm": 1.71875, + "learning_rate": 1.3467862198948935e-05, + "loss": 0.9078, + "step": 6837 + }, + { + "epoch": 1.160490387294511, + "grad_norm": 1.6015625, + "learning_rate": 1.3466168282555018e-05, + "loss": 0.772, + "step": 6838 + }, + { + "epoch": 1.1606618513835008, + "grad_norm": 1.703125, + "learning_rate": 1.3464474253116303e-05, + "loss": 0.8907, + "step": 6839 + }, + { + "epoch": 1.1608333154724908, + "grad_norm": 1.671875, + "learning_rate": 1.3462780110688036e-05, + "loss": 0.7626, + "step": 6840 + }, + { + "epoch": 1.1610047795614806, + "grad_norm": 1.734375, + "learning_rate": 1.3461085855325467e-05, + "loss": 0.8413, + "step": 6841 + }, + { + "epoch": 1.1611762436504705, + "grad_norm": 1.7890625, + "learning_rate": 1.3459391487083858e-05, + "loss": 0.8411, + "step": 6842 + }, + { + "epoch": 1.1613477077394603, + "grad_norm": 1.6171875, + "learning_rate": 1.3457697006018462e-05, + "loss": 0.8852, + "step": 6843 + }, + { + "epoch": 1.16151917182845, + "grad_norm": 1.625, + "learning_rate": 1.3456002412184548e-05, + "loss": 0.8167, + "step": 6844 + }, + { + "epoch": 1.1616906359174402, + "grad_norm": 1.6484375, + "learning_rate": 1.3454307705637382e-05, + "loss": 0.8421, + "step": 6845 + }, + { + "epoch": 1.16186210000643, + "grad_norm": 1.65625, + "learning_rate": 1.3452612886432234e-05, + "loss": 0.8503, + "step": 6846 + }, + { + "epoch": 1.1620335640954198, + "grad_norm": 1.625, + "learning_rate": 1.3450917954624378e-05, + "loss": 0.854, + "step": 6847 + }, + { + "epoch": 1.1622050281844096, + "grad_norm": 1.7578125, + "learning_rate": 1.3449222910269093e-05, + "loss": 0.8682, + "step": 6848 + }, + { + "epoch": 1.1623764922733995, + "grad_norm": 1.6328125, + "learning_rate": 1.3447527753421661e-05, + "loss": 0.976, + "step": 6849 + }, + { + "epoch": 1.1625479563623893, + "grad_norm": 1.640625, + "learning_rate": 1.3445832484137365e-05, + "loss": 0.8742, + "step": 6850 + }, + { + "epoch": 1.162719420451379, + "grad_norm": 1.6875, + "learning_rate": 1.3444137102471495e-05, + "loss": 0.8685, + "step": 6851 + }, + { + "epoch": 1.1628908845403692, + "grad_norm": 1.625, + "learning_rate": 1.3442441608479349e-05, + "loss": 0.7899, + "step": 6852 + }, + { + "epoch": 1.163062348629359, + "grad_norm": 1.734375, + "learning_rate": 1.3440746002216213e-05, + "loss": 0.8784, + "step": 6853 + }, + { + "epoch": 1.1632338127183488, + "grad_norm": 1.6875, + "learning_rate": 1.3439050283737399e-05, + "loss": 0.8723, + "step": 6854 + }, + { + "epoch": 1.1634052768073386, + "grad_norm": 1.6484375, + "learning_rate": 1.3437354453098202e-05, + "loss": 0.8222, + "step": 6855 + }, + { + "epoch": 1.1635767408963285, + "grad_norm": 1.6875, + 
"learning_rate": 1.3435658510353933e-05, + "loss": 0.8552, + "step": 6856 + }, + { + "epoch": 1.1637482049853185, + "grad_norm": 1.640625, + "learning_rate": 1.3433962455559901e-05, + "loss": 0.8358, + "step": 6857 + }, + { + "epoch": 1.1639196690743083, + "grad_norm": 1.7265625, + "learning_rate": 1.3432266288771427e-05, + "loss": 0.8489, + "step": 6858 + }, + { + "epoch": 1.1640911331632982, + "grad_norm": 1.578125, + "learning_rate": 1.3430570010043821e-05, + "loss": 0.7495, + "step": 6859 + }, + { + "epoch": 1.164262597252288, + "grad_norm": 1.6875, + "learning_rate": 1.342887361943241e-05, + "loss": 0.8586, + "step": 6860 + }, + { + "epoch": 1.1644340613412778, + "grad_norm": 1.6953125, + "learning_rate": 1.3427177116992515e-05, + "loss": 0.8377, + "step": 6861 + }, + { + "epoch": 1.1646055254302676, + "grad_norm": 1.6875, + "learning_rate": 1.3425480502779471e-05, + "loss": 0.7789, + "step": 6862 + }, + { + "epoch": 1.1647769895192575, + "grad_norm": 1.7109375, + "learning_rate": 1.3423783776848609e-05, + "loss": 0.8138, + "step": 6863 + }, + { + "epoch": 1.1649484536082475, + "grad_norm": 1.75, + "learning_rate": 1.3422086939255265e-05, + "loss": 0.9288, + "step": 6864 + }, + { + "epoch": 1.1651199176972373, + "grad_norm": 1.640625, + "learning_rate": 1.342038999005478e-05, + "loss": 0.8544, + "step": 6865 + }, + { + "epoch": 1.1652913817862272, + "grad_norm": 1.828125, + "learning_rate": 1.3418692929302497e-05, + "loss": 0.9111, + "step": 6866 + }, + { + "epoch": 1.165462845875217, + "grad_norm": 1.7109375, + "learning_rate": 1.3416995757053764e-05, + "loss": 0.9236, + "step": 6867 + }, + { + "epoch": 1.1656343099642068, + "grad_norm": 1.5859375, + "learning_rate": 1.3415298473363932e-05, + "loss": 0.8341, + "step": 6868 + }, + { + "epoch": 1.1658057740531969, + "grad_norm": 1.7109375, + "learning_rate": 1.3413601078288356e-05, + "loss": 0.8754, + "step": 6869 + }, + { + "epoch": 1.1659772381421867, + "grad_norm": 1.8125, + "learning_rate": 1.3411903571882395e-05, + "loss": 0.9529, + "step": 6870 + }, + { + "epoch": 1.1661487022311765, + "grad_norm": 1.6328125, + "learning_rate": 1.3410205954201407e-05, + "loss": 0.8552, + "step": 6871 + }, + { + "epoch": 1.1663201663201663, + "grad_norm": 1.6796875, + "learning_rate": 1.3408508225300765e-05, + "loss": 0.8721, + "step": 6872 + }, + { + "epoch": 1.1664916304091562, + "grad_norm": 2.015625, + "learning_rate": 1.3406810385235833e-05, + "loss": 0.8574, + "step": 6873 + }, + { + "epoch": 1.166663094498146, + "grad_norm": 1.78125, + "learning_rate": 1.3405112434061986e-05, + "loss": 0.893, + "step": 6874 + }, + { + "epoch": 1.1668345585871358, + "grad_norm": 1.7421875, + "learning_rate": 1.3403414371834602e-05, + "loss": 0.8774, + "step": 6875 + }, + { + "epoch": 1.1670060226761259, + "grad_norm": 1.6796875, + "learning_rate": 1.3401716198609056e-05, + "loss": 0.9101, + "step": 6876 + }, + { + "epoch": 1.1671774867651157, + "grad_norm": 1.703125, + "learning_rate": 1.3400017914440738e-05, + "loss": 0.883, + "step": 6877 + }, + { + "epoch": 1.1673489508541055, + "grad_norm": 1.65625, + "learning_rate": 1.339831951938503e-05, + "loss": 0.8582, + "step": 6878 + }, + { + "epoch": 1.1675204149430953, + "grad_norm": 1.6484375, + "learning_rate": 1.3396621013497327e-05, + "loss": 0.8527, + "step": 6879 + }, + { + "epoch": 1.1676918790320852, + "grad_norm": 1.7578125, + "learning_rate": 1.3394922396833021e-05, + "loss": 0.8982, + "step": 6880 + }, + { + "epoch": 1.1678633431210752, + "grad_norm": 1.703125, + "learning_rate": 
1.3393223669447513e-05, + "loss": 0.9034, + "step": 6881 + }, + { + "epoch": 1.168034807210065, + "grad_norm": 1.671875, + "learning_rate": 1.3391524831396202e-05, + "loss": 0.945, + "step": 6882 + }, + { + "epoch": 1.1682062712990549, + "grad_norm": 1.6953125, + "learning_rate": 1.3389825882734495e-05, + "loss": 0.8772, + "step": 6883 + }, + { + "epoch": 1.1683777353880447, + "grad_norm": 1.6171875, + "learning_rate": 1.3388126823517802e-05, + "loss": 0.8102, + "step": 6884 + }, + { + "epoch": 1.1685491994770345, + "grad_norm": 1.6171875, + "learning_rate": 1.3386427653801535e-05, + "loss": 0.8419, + "step": 6885 + }, + { + "epoch": 1.1687206635660243, + "grad_norm": 1.703125, + "learning_rate": 1.3384728373641111e-05, + "loss": 0.9198, + "step": 6886 + }, + { + "epoch": 1.1688921276550142, + "grad_norm": 1.671875, + "learning_rate": 1.3383028983091948e-05, + "loss": 0.8829, + "step": 6887 + }, + { + "epoch": 1.1690635917440042, + "grad_norm": 1.625, + "learning_rate": 1.3381329482209471e-05, + "loss": 0.8461, + "step": 6888 + }, + { + "epoch": 1.169235055832994, + "grad_norm": 1.625, + "learning_rate": 1.3379629871049105e-05, + "loss": 0.8655, + "step": 6889 + }, + { + "epoch": 1.1694065199219839, + "grad_norm": 1.8203125, + "learning_rate": 1.337793014966628e-05, + "loss": 0.8972, + "step": 6890 + }, + { + "epoch": 1.1695779840109737, + "grad_norm": 1.6953125, + "learning_rate": 1.3376230318116437e-05, + "loss": 0.8264, + "step": 6891 + }, + { + "epoch": 1.1697494480999635, + "grad_norm": 1.65625, + "learning_rate": 1.337453037645501e-05, + "loss": 0.9174, + "step": 6892 + }, + { + "epoch": 1.1699209121889533, + "grad_norm": 1.609375, + "learning_rate": 1.3372830324737438e-05, + "loss": 0.7695, + "step": 6893 + }, + { + "epoch": 1.1700923762779434, + "grad_norm": 1.7421875, + "learning_rate": 1.3371130163019168e-05, + "loss": 0.8772, + "step": 6894 + }, + { + "epoch": 1.1702638403669332, + "grad_norm": 1.7109375, + "learning_rate": 1.3369429891355653e-05, + "loss": 0.9167, + "step": 6895 + }, + { + "epoch": 1.170435304455923, + "grad_norm": 1.640625, + "learning_rate": 1.3367729509802336e-05, + "loss": 0.8204, + "step": 6896 + }, + { + "epoch": 1.1706067685449129, + "grad_norm": 1.671875, + "learning_rate": 1.3366029018414679e-05, + "loss": 0.7964, + "step": 6897 + }, + { + "epoch": 1.1707782326339027, + "grad_norm": 1.640625, + "learning_rate": 1.3364328417248142e-05, + "loss": 0.8479, + "step": 6898 + }, + { + "epoch": 1.1709496967228925, + "grad_norm": 1.59375, + "learning_rate": 1.3362627706358187e-05, + "loss": 0.8342, + "step": 6899 + }, + { + "epoch": 1.1711211608118826, + "grad_norm": 1.7265625, + "learning_rate": 1.3360926885800279e-05, + "loss": 0.866, + "step": 6900 + }, + { + "epoch": 1.1712926249008724, + "grad_norm": 1.6953125, + "learning_rate": 1.3359225955629889e-05, + "loss": 0.9075, + "step": 6901 + }, + { + "epoch": 1.1714640889898622, + "grad_norm": 1.71875, + "learning_rate": 1.3357524915902488e-05, + "loss": 0.9485, + "step": 6902 + }, + { + "epoch": 1.171635553078852, + "grad_norm": 1.6640625, + "learning_rate": 1.335582376667356e-05, + "loss": 0.8911, + "step": 6903 + }, + { + "epoch": 1.1718070171678419, + "grad_norm": 1.625, + "learning_rate": 1.3354122507998584e-05, + "loss": 0.8715, + "step": 6904 + }, + { + "epoch": 1.1719784812568317, + "grad_norm": 1.6484375, + "learning_rate": 1.3352421139933038e-05, + "loss": 0.8291, + "step": 6905 + }, + { + "epoch": 1.1721499453458217, + "grad_norm": 1.625, + "learning_rate": 1.3350719662532416e-05, + "loss": 0.8443, 
+ "step": 6906 + }, + { + "epoch": 1.1723214094348116, + "grad_norm": 1.6171875, + "learning_rate": 1.334901807585221e-05, + "loss": 0.8684, + "step": 6907 + }, + { + "epoch": 1.1724928735238014, + "grad_norm": 1.6796875, + "learning_rate": 1.3347316379947912e-05, + "loss": 0.8409, + "step": 6908 + }, + { + "epoch": 1.1726643376127912, + "grad_norm": 1.703125, + "learning_rate": 1.3345614574875022e-05, + "loss": 0.9154, + "step": 6909 + }, + { + "epoch": 1.172835801701781, + "grad_norm": 1.671875, + "learning_rate": 1.334391266068904e-05, + "loss": 0.8695, + "step": 6910 + }, + { + "epoch": 1.1730072657907709, + "grad_norm": 1.7109375, + "learning_rate": 1.3342210637445478e-05, + "loss": 0.8835, + "step": 6911 + }, + { + "epoch": 1.1731787298797607, + "grad_norm": 1.6875, + "learning_rate": 1.3340508505199839e-05, + "loss": 0.8492, + "step": 6912 + }, + { + "epoch": 1.1733501939687507, + "grad_norm": 1.6484375, + "learning_rate": 1.333880626400764e-05, + "loss": 0.793, + "step": 6913 + }, + { + "epoch": 1.1735216580577406, + "grad_norm": 1.59375, + "learning_rate": 1.3337103913924394e-05, + "loss": 0.8493, + "step": 6914 + }, + { + "epoch": 1.1736931221467304, + "grad_norm": 1.734375, + "learning_rate": 1.3335401455005625e-05, + "loss": 0.8716, + "step": 6915 + }, + { + "epoch": 1.1738645862357202, + "grad_norm": 1.6484375, + "learning_rate": 1.3333698887306854e-05, + "loss": 0.8844, + "step": 6916 + }, + { + "epoch": 1.17403605032471, + "grad_norm": 1.7734375, + "learning_rate": 1.3331996210883609e-05, + "loss": 0.9383, + "step": 6917 + }, + { + "epoch": 1.1742075144137, + "grad_norm": 1.6328125, + "learning_rate": 1.3330293425791419e-05, + "loss": 0.7867, + "step": 6918 + }, + { + "epoch": 1.17437897850269, + "grad_norm": 1.6171875, + "learning_rate": 1.3328590532085822e-05, + "loss": 0.8331, + "step": 6919 + }, + { + "epoch": 1.1745504425916797, + "grad_norm": 1.640625, + "learning_rate": 1.332688752982235e-05, + "loss": 0.8586, + "step": 6920 + }, + { + "epoch": 1.1747219066806696, + "grad_norm": 1.6171875, + "learning_rate": 1.3325184419056552e-05, + "loss": 0.8463, + "step": 6921 + }, + { + "epoch": 1.1748933707696594, + "grad_norm": 1.6953125, + "learning_rate": 1.3323481199843966e-05, + "loss": 0.8227, + "step": 6922 + }, + { + "epoch": 1.1750648348586492, + "grad_norm": 1.578125, + "learning_rate": 1.3321777872240142e-05, + "loss": 0.8337, + "step": 6923 + }, + { + "epoch": 1.175236298947639, + "grad_norm": 1.6328125, + "learning_rate": 1.3320074436300635e-05, + "loss": 0.9401, + "step": 6924 + }, + { + "epoch": 1.175407763036629, + "grad_norm": 1.609375, + "learning_rate": 1.3318370892080998e-05, + "loss": 0.8901, + "step": 6925 + }, + { + "epoch": 1.175579227125619, + "grad_norm": 1.5625, + "learning_rate": 1.3316667239636792e-05, + "loss": 0.7857, + "step": 6926 + }, + { + "epoch": 1.1757506912146087, + "grad_norm": 1.625, + "learning_rate": 1.3314963479023575e-05, + "loss": 0.8501, + "step": 6927 + }, + { + "epoch": 1.1759221553035986, + "grad_norm": 1.6640625, + "learning_rate": 1.3313259610296916e-05, + "loss": 0.9171, + "step": 6928 + }, + { + "epoch": 1.1760936193925884, + "grad_norm": 1.6796875, + "learning_rate": 1.3311555633512386e-05, + "loss": 0.8805, + "step": 6929 + }, + { + "epoch": 1.1762650834815784, + "grad_norm": 1.703125, + "learning_rate": 1.3309851548725553e-05, + "loss": 0.8211, + "step": 6930 + }, + { + "epoch": 1.1764365475705683, + "grad_norm": 1.6171875, + "learning_rate": 1.3308147355992002e-05, + "loss": 0.8142, + "step": 6931 + }, + { + "epoch": 
1.176608011659558, + "grad_norm": 1.7421875, + "learning_rate": 1.3306443055367306e-05, + "loss": 0.9, + "step": 6932 + }, + { + "epoch": 1.176779475748548, + "grad_norm": 1.6796875, + "learning_rate": 1.3304738646907057e-05, + "loss": 0.8249, + "step": 6933 + }, + { + "epoch": 1.1769509398375377, + "grad_norm": 1.7421875, + "learning_rate": 1.330303413066683e-05, + "loss": 0.8355, + "step": 6934 + }, + { + "epoch": 1.1771224039265276, + "grad_norm": 1.78125, + "learning_rate": 1.3301329506702221e-05, + "loss": 0.8734, + "step": 6935 + }, + { + "epoch": 1.1772938680155174, + "grad_norm": 1.6640625, + "learning_rate": 1.3299624775068826e-05, + "loss": 0.9342, + "step": 6936 + }, + { + "epoch": 1.1774653321045074, + "grad_norm": 1.6484375, + "learning_rate": 1.3297919935822243e-05, + "loss": 0.8546, + "step": 6937 + }, + { + "epoch": 1.1776367961934973, + "grad_norm": 1.671875, + "learning_rate": 1.3296214989018075e-05, + "loss": 0.8482, + "step": 6938 + }, + { + "epoch": 1.177808260282487, + "grad_norm": 1.703125, + "learning_rate": 1.3294509934711919e-05, + "loss": 0.8412, + "step": 6939 + }, + { + "epoch": 1.177979724371477, + "grad_norm": 1.671875, + "learning_rate": 1.3292804772959391e-05, + "loss": 0.8311, + "step": 6940 + }, + { + "epoch": 1.1781511884604667, + "grad_norm": 1.6875, + "learning_rate": 1.3291099503816098e-05, + "loss": 0.8382, + "step": 6941 + }, + { + "epoch": 1.1783226525494568, + "grad_norm": 1.6171875, + "learning_rate": 1.3289394127337658e-05, + "loss": 0.8111, + "step": 6942 + }, + { + "epoch": 1.1784941166384466, + "grad_norm": 1.640625, + "learning_rate": 1.3287688643579688e-05, + "loss": 0.8764, + "step": 6943 + }, + { + "epoch": 1.1786655807274364, + "grad_norm": 1.6328125, + "learning_rate": 1.3285983052597813e-05, + "loss": 0.8151, + "step": 6944 + }, + { + "epoch": 1.1788370448164263, + "grad_norm": 1.6171875, + "learning_rate": 1.3284277354447655e-05, + "loss": 0.8074, + "step": 6945 + }, + { + "epoch": 1.179008508905416, + "grad_norm": 1.671875, + "learning_rate": 1.3282571549184844e-05, + "loss": 0.8439, + "step": 6946 + }, + { + "epoch": 1.179179972994406, + "grad_norm": 1.7265625, + "learning_rate": 1.3280865636865014e-05, + "loss": 0.9015, + "step": 6947 + }, + { + "epoch": 1.1793514370833957, + "grad_norm": 1.6015625, + "learning_rate": 1.3279159617543801e-05, + "loss": 0.8248, + "step": 6948 + }, + { + "epoch": 1.1795229011723858, + "grad_norm": 1.640625, + "learning_rate": 1.3277453491276849e-05, + "loss": 0.9074, + "step": 6949 + }, + { + "epoch": 1.1796943652613756, + "grad_norm": 1.6328125, + "learning_rate": 1.3275747258119793e-05, + "loss": 0.81, + "step": 6950 + }, + { + "epoch": 1.1798658293503654, + "grad_norm": 1.59375, + "learning_rate": 1.3274040918128286e-05, + "loss": 0.8182, + "step": 6951 + }, + { + "epoch": 1.1800372934393553, + "grad_norm": 1.625, + "learning_rate": 1.3272334471357975e-05, + "loss": 0.8618, + "step": 6952 + }, + { + "epoch": 1.180208757528345, + "grad_norm": 1.7265625, + "learning_rate": 1.327062791786451e-05, + "loss": 0.9241, + "step": 6953 + }, + { + "epoch": 1.1803802216173351, + "grad_norm": 1.671875, + "learning_rate": 1.3268921257703557e-05, + "loss": 0.7983, + "step": 6954 + }, + { + "epoch": 1.180551685706325, + "grad_norm": 1.671875, + "learning_rate": 1.3267214490930771e-05, + "loss": 0.8148, + "step": 6955 + }, + { + "epoch": 1.1807231497953148, + "grad_norm": 1.65625, + "learning_rate": 1.3265507617601818e-05, + "loss": 0.7872, + "step": 6956 + }, + { + "epoch": 1.1808946138843046, + "grad_norm": 
1.71875, + "learning_rate": 1.3263800637772363e-05, + "loss": 0.8317, + "step": 6957 + }, + { + "epoch": 1.1810660779732944, + "grad_norm": 1.640625, + "learning_rate": 1.3262093551498084e-05, + "loss": 0.7933, + "step": 6958 + }, + { + "epoch": 1.1812375420622843, + "grad_norm": 1.71875, + "learning_rate": 1.3260386358834645e-05, + "loss": 0.8589, + "step": 6959 + }, + { + "epoch": 1.181409006151274, + "grad_norm": 1.703125, + "learning_rate": 1.3258679059837731e-05, + "loss": 0.8669, + "step": 6960 + }, + { + "epoch": 1.1815804702402641, + "grad_norm": 1.796875, + "learning_rate": 1.3256971654563022e-05, + "loss": 0.8813, + "step": 6961 + }, + { + "epoch": 1.181751934329254, + "grad_norm": 1.7109375, + "learning_rate": 1.3255264143066202e-05, + "loss": 0.8569, + "step": 6962 + }, + { + "epoch": 1.1819233984182438, + "grad_norm": 1.65625, + "learning_rate": 1.3253556525402963e-05, + "loss": 0.8348, + "step": 6963 + }, + { + "epoch": 1.1820948625072336, + "grad_norm": 1.65625, + "learning_rate": 1.325184880162899e-05, + "loss": 0.8558, + "step": 6964 + }, + { + "epoch": 1.1822663265962234, + "grad_norm": 1.6171875, + "learning_rate": 1.3250140971799984e-05, + "loss": 0.8327, + "step": 6965 + }, + { + "epoch": 1.1824377906852135, + "grad_norm": 1.703125, + "learning_rate": 1.324843303597164e-05, + "loss": 0.9319, + "step": 6966 + }, + { + "epoch": 1.1826092547742033, + "grad_norm": 1.6328125, + "learning_rate": 1.3246724994199664e-05, + "loss": 0.8491, + "step": 6967 + }, + { + "epoch": 1.1827807188631931, + "grad_norm": 1.734375, + "learning_rate": 1.324501684653976e-05, + "loss": 0.8954, + "step": 6968 + }, + { + "epoch": 1.182952182952183, + "grad_norm": 1.6875, + "learning_rate": 1.3243308593047637e-05, + "loss": 0.8383, + "step": 6969 + }, + { + "epoch": 1.1831236470411728, + "grad_norm": 1.734375, + "learning_rate": 1.3241600233779008e-05, + "loss": 0.8032, + "step": 6970 + }, + { + "epoch": 1.1832951111301626, + "grad_norm": 1.6328125, + "learning_rate": 1.3239891768789587e-05, + "loss": 0.8594, + "step": 6971 + }, + { + "epoch": 1.1834665752191524, + "grad_norm": 1.7265625, + "learning_rate": 1.3238183198135098e-05, + "loss": 0.8261, + "step": 6972 + }, + { + "epoch": 1.1836380393081425, + "grad_norm": 1.6875, + "learning_rate": 1.323647452187126e-05, + "loss": 0.9486, + "step": 6973 + }, + { + "epoch": 1.1838095033971323, + "grad_norm": 1.6328125, + "learning_rate": 1.3234765740053799e-05, + "loss": 0.8751, + "step": 6974 + }, + { + "epoch": 1.1839809674861221, + "grad_norm": 1.796875, + "learning_rate": 1.3233056852738446e-05, + "loss": 0.8476, + "step": 6975 + }, + { + "epoch": 1.184152431575112, + "grad_norm": 1.6796875, + "learning_rate": 1.3231347859980937e-05, + "loss": 0.88, + "step": 6976 + }, + { + "epoch": 1.1843238956641018, + "grad_norm": 1.7265625, + "learning_rate": 1.3229638761837003e-05, + "loss": 0.8816, + "step": 6977 + }, + { + "epoch": 1.1844953597530918, + "grad_norm": 1.6484375, + "learning_rate": 1.322792955836239e-05, + "loss": 0.8436, + "step": 6978 + }, + { + "epoch": 1.1846668238420817, + "grad_norm": 1.7265625, + "learning_rate": 1.3226220249612837e-05, + "loss": 0.8455, + "step": 6979 + }, + { + "epoch": 1.1848382879310715, + "grad_norm": 1.7109375, + "learning_rate": 1.3224510835644095e-05, + "loss": 0.833, + "step": 6980 + }, + { + "epoch": 1.1850097520200613, + "grad_norm": 1.6484375, + "learning_rate": 1.322280131651191e-05, + "loss": 0.885, + "step": 6981 + }, + { + "epoch": 1.1851812161090511, + "grad_norm": 1.6640625, + "learning_rate": 
1.3221091692272042e-05, + "loss": 0.8174, + "step": 6982 + }, + { + "epoch": 1.185352680198041, + "grad_norm": 1.7109375, + "learning_rate": 1.321938196298024e-05, + "loss": 0.9002, + "step": 6983 + }, + { + "epoch": 1.1855241442870308, + "grad_norm": 1.71875, + "learning_rate": 1.321767212869227e-05, + "loss": 0.8802, + "step": 6984 + }, + { + "epoch": 1.1856956083760208, + "grad_norm": 1.5859375, + "learning_rate": 1.3215962189463896e-05, + "loss": 0.8047, + "step": 6985 + }, + { + "epoch": 1.1858670724650107, + "grad_norm": 1.5859375, + "learning_rate": 1.3214252145350883e-05, + "loss": 0.8862, + "step": 6986 + }, + { + "epoch": 1.1860385365540005, + "grad_norm": 1.625, + "learning_rate": 1.3212541996409005e-05, + "loss": 0.8768, + "step": 6987 + }, + { + "epoch": 1.1862100006429903, + "grad_norm": 1.6015625, + "learning_rate": 1.3210831742694037e-05, + "loss": 0.8169, + "step": 6988 + }, + { + "epoch": 1.1863814647319801, + "grad_norm": 1.6875, + "learning_rate": 1.3209121384261754e-05, + "loss": 0.8902, + "step": 6989 + }, + { + "epoch": 1.18655292882097, + "grad_norm": 1.796875, + "learning_rate": 1.3207410921167938e-05, + "loss": 0.8861, + "step": 6990 + }, + { + "epoch": 1.18672439290996, + "grad_norm": 1.7109375, + "learning_rate": 1.3205700353468373e-05, + "loss": 0.8325, + "step": 6991 + }, + { + "epoch": 1.1868958569989498, + "grad_norm": 1.6015625, + "learning_rate": 1.3203989681218847e-05, + "loss": 0.8863, + "step": 6992 + }, + { + "epoch": 1.1870673210879397, + "grad_norm": 1.703125, + "learning_rate": 1.3202278904475154e-05, + "loss": 0.8174, + "step": 6993 + }, + { + "epoch": 1.1872387851769295, + "grad_norm": 1.703125, + "learning_rate": 1.3200568023293085e-05, + "loss": 0.8864, + "step": 6994 + }, + { + "epoch": 1.1874102492659193, + "grad_norm": 1.65625, + "learning_rate": 1.3198857037728446e-05, + "loss": 0.8373, + "step": 6995 + }, + { + "epoch": 1.1875817133549091, + "grad_norm": 1.703125, + "learning_rate": 1.3197145947837031e-05, + "loss": 0.9123, + "step": 6996 + }, + { + "epoch": 1.1877531774438992, + "grad_norm": 1.6171875, + "learning_rate": 1.3195434753674645e-05, + "loss": 0.9354, + "step": 6997 + }, + { + "epoch": 1.187924641532889, + "grad_norm": 1.6171875, + "learning_rate": 1.31937234552971e-05, + "loss": 0.7199, + "step": 6998 + }, + { + "epoch": 1.1880961056218788, + "grad_norm": 1.578125, + "learning_rate": 1.3192012052760208e-05, + "loss": 0.7695, + "step": 6999 + }, + { + "epoch": 1.1882675697108687, + "grad_norm": 1.6640625, + "learning_rate": 1.3190300546119781e-05, + "loss": 0.8186, + "step": 7000 + }, + { + "epoch": 1.1882675697108687, + "eval_loss": 0.8492981195449829, + "eval_runtime": 836.9828, + "eval_samples_per_second": 2.986, + "eval_steps_per_second": 2.986, + "step": 7000 + }, + { + "epoch": 1.1884390337998585, + "grad_norm": 1.7421875, + "learning_rate": 1.3188588935431642e-05, + "loss": 0.9399, + "step": 7001 + }, + { + "epoch": 1.1886104978888483, + "grad_norm": 1.703125, + "learning_rate": 1.3186877220751605e-05, + "loss": 0.8776, + "step": 7002 + }, + { + "epoch": 1.1887819619778384, + "grad_norm": 11.1875, + "learning_rate": 1.3185165402135507e-05, + "loss": 0.8575, + "step": 7003 + }, + { + "epoch": 1.1889534260668282, + "grad_norm": 1.6875, + "learning_rate": 1.3183453479639167e-05, + "loss": 0.8264, + "step": 7004 + }, + { + "epoch": 1.189124890155818, + "grad_norm": 1.6171875, + "learning_rate": 1.3181741453318427e-05, + "loss": 0.8503, + "step": 7005 + }, + { + "epoch": 1.1892963542448078, + "grad_norm": 1.6875, + 
"learning_rate": 1.3180029323229111e-05, + "loss": 0.8506, + "step": 7006 + }, + { + "epoch": 1.1894678183337977, + "grad_norm": 1.78125, + "learning_rate": 1.3178317089427066e-05, + "loss": 0.9127, + "step": 7007 + }, + { + "epoch": 1.1896392824227875, + "grad_norm": 1.640625, + "learning_rate": 1.3176604751968133e-05, + "loss": 0.8574, + "step": 7008 + }, + { + "epoch": 1.1898107465117773, + "grad_norm": 1.6015625, + "learning_rate": 1.3174892310908158e-05, + "loss": 0.8767, + "step": 7009 + }, + { + "epoch": 1.1899822106007674, + "grad_norm": 1.7578125, + "learning_rate": 1.3173179766302988e-05, + "loss": 0.8028, + "step": 7010 + }, + { + "epoch": 1.1901536746897572, + "grad_norm": 1.671875, + "learning_rate": 1.3171467118208476e-05, + "loss": 0.8199, + "step": 7011 + }, + { + "epoch": 1.190325138778747, + "grad_norm": 1.6484375, + "learning_rate": 1.3169754366680476e-05, + "loss": 0.842, + "step": 7012 + }, + { + "epoch": 1.1904966028677368, + "grad_norm": 1.7109375, + "learning_rate": 1.3168041511774856e-05, + "loss": 0.9031, + "step": 7013 + }, + { + "epoch": 1.1906680669567267, + "grad_norm": 1.6796875, + "learning_rate": 1.3166328553547469e-05, + "loss": 0.7974, + "step": 7014 + }, + { + "epoch": 1.1908395310457167, + "grad_norm": 1.6875, + "learning_rate": 1.3164615492054184e-05, + "loss": 0.8576, + "step": 7015 + }, + { + "epoch": 1.1910109951347065, + "grad_norm": 1.71875, + "learning_rate": 1.3162902327350873e-05, + "loss": 0.9097, + "step": 7016 + }, + { + "epoch": 1.1911824592236964, + "grad_norm": 1.6328125, + "learning_rate": 1.3161189059493407e-05, + "loss": 0.8865, + "step": 7017 + }, + { + "epoch": 1.1913539233126862, + "grad_norm": 1.65625, + "learning_rate": 1.315947568853766e-05, + "loss": 0.8532, + "step": 7018 + }, + { + "epoch": 1.191525387401676, + "grad_norm": 1.7578125, + "learning_rate": 1.3157762214539516e-05, + "loss": 0.9153, + "step": 7019 + }, + { + "epoch": 1.1916968514906658, + "grad_norm": 1.6796875, + "learning_rate": 1.315604863755485e-05, + "loss": 0.7673, + "step": 7020 + }, + { + "epoch": 1.1918683155796557, + "grad_norm": 1.640625, + "learning_rate": 1.3154334957639557e-05, + "loss": 0.8389, + "step": 7021 + }, + { + "epoch": 1.1920397796686457, + "grad_norm": 1.625, + "learning_rate": 1.3152621174849522e-05, + "loss": 0.7504, + "step": 7022 + }, + { + "epoch": 1.1922112437576355, + "grad_norm": 1.71875, + "learning_rate": 1.3150907289240639e-05, + "loss": 0.8595, + "step": 7023 + }, + { + "epoch": 1.1923827078466254, + "grad_norm": 1.5625, + "learning_rate": 1.3149193300868803e-05, + "loss": 0.7952, + "step": 7024 + }, + { + "epoch": 1.1925541719356152, + "grad_norm": 1.6953125, + "learning_rate": 1.314747920978992e-05, + "loss": 0.8829, + "step": 7025 + }, + { + "epoch": 1.192725636024605, + "grad_norm": 1.703125, + "learning_rate": 1.3145765016059882e-05, + "loss": 0.9387, + "step": 7026 + }, + { + "epoch": 1.192897100113595, + "grad_norm": 1.953125, + "learning_rate": 1.3144050719734602e-05, + "loss": 0.8143, + "step": 7027 + }, + { + "epoch": 1.193068564202585, + "grad_norm": 1.71875, + "learning_rate": 1.314233632086999e-05, + "loss": 0.8377, + "step": 7028 + }, + { + "epoch": 1.1932400282915747, + "grad_norm": 1.7578125, + "learning_rate": 1.3140621819521957e-05, + "loss": 0.8073, + "step": 7029 + }, + { + "epoch": 1.1934114923805645, + "grad_norm": 1.7109375, + "learning_rate": 1.313890721574642e-05, + "loss": 0.8738, + "step": 7030 + }, + { + "epoch": 1.1935829564695544, + "grad_norm": 1.8203125, + "learning_rate": 1.3137192509599297e-05, 
+ "loss": 0.8952, + "step": 7031 + }, + { + "epoch": 1.1937544205585442, + "grad_norm": 1.703125, + "learning_rate": 1.3135477701136515e-05, + "loss": 0.8685, + "step": 7032 + }, + { + "epoch": 1.193925884647534, + "grad_norm": 1.59375, + "learning_rate": 1.3133762790413998e-05, + "loss": 0.8827, + "step": 7033 + }, + { + "epoch": 1.194097348736524, + "grad_norm": 1.7109375, + "learning_rate": 1.3132047777487676e-05, + "loss": 0.8834, + "step": 7034 + }, + { + "epoch": 1.194268812825514, + "grad_norm": 1.609375, + "learning_rate": 1.3130332662413478e-05, + "loss": 0.8182, + "step": 7035 + }, + { + "epoch": 1.1944402769145037, + "grad_norm": 1.7265625, + "learning_rate": 1.312861744524735e-05, + "loss": 0.9155, + "step": 7036 + }, + { + "epoch": 1.1946117410034935, + "grad_norm": 1.765625, + "learning_rate": 1.3126902126045223e-05, + "loss": 0.9201, + "step": 7037 + }, + { + "epoch": 1.1947832050924834, + "grad_norm": 1.734375, + "learning_rate": 1.3125186704863042e-05, + "loss": 0.8421, + "step": 7038 + }, + { + "epoch": 1.1949546691814734, + "grad_norm": 1.6484375, + "learning_rate": 1.3123471181756753e-05, + "loss": 0.8264, + "step": 7039 + }, + { + "epoch": 1.1951261332704632, + "grad_norm": 1.703125, + "learning_rate": 1.3121755556782307e-05, + "loss": 0.8418, + "step": 7040 + }, + { + "epoch": 1.195297597359453, + "grad_norm": 1.6328125, + "learning_rate": 1.3120039829995659e-05, + "loss": 0.7933, + "step": 7041 + }, + { + "epoch": 1.195469061448443, + "grad_norm": 1.71875, + "learning_rate": 1.3118324001452765e-05, + "loss": 0.8414, + "step": 7042 + }, + { + "epoch": 1.1956405255374327, + "grad_norm": 1.65625, + "learning_rate": 1.311660807120958e-05, + "loss": 0.8003, + "step": 7043 + }, + { + "epoch": 1.1958119896264225, + "grad_norm": 1.6640625, + "learning_rate": 1.3114892039322072e-05, + "loss": 0.8051, + "step": 7044 + }, + { + "epoch": 1.1959834537154124, + "grad_norm": 1.6328125, + "learning_rate": 1.31131759058462e-05, + "loss": 0.8998, + "step": 7045 + }, + { + "epoch": 1.1961549178044024, + "grad_norm": 4.8125, + "learning_rate": 1.3111459670837942e-05, + "loss": 0.9592, + "step": 7046 + }, + { + "epoch": 1.1963263818933922, + "grad_norm": 1.671875, + "learning_rate": 1.310974333435327e-05, + "loss": 0.8949, + "step": 7047 + }, + { + "epoch": 1.196497845982382, + "grad_norm": 1.75, + "learning_rate": 1.3108026896448153e-05, + "loss": 0.8741, + "step": 7048 + }, + { + "epoch": 1.196669310071372, + "grad_norm": 1.8359375, + "learning_rate": 1.3106310357178575e-05, + "loss": 0.8458, + "step": 7049 + }, + { + "epoch": 1.1968407741603617, + "grad_norm": 1.609375, + "learning_rate": 1.3104593716600521e-05, + "loss": 0.8015, + "step": 7050 + }, + { + "epoch": 1.1970122382493518, + "grad_norm": 1.640625, + "learning_rate": 1.3102876974769975e-05, + "loss": 0.8168, + "step": 7051 + }, + { + "epoch": 1.1971837023383416, + "grad_norm": 1.7265625, + "learning_rate": 1.3101160131742927e-05, + "loss": 0.8824, + "step": 7052 + }, + { + "epoch": 1.1973551664273314, + "grad_norm": 1.65625, + "learning_rate": 1.309944318757537e-05, + "loss": 0.7913, + "step": 7053 + }, + { + "epoch": 1.1975266305163212, + "grad_norm": 1.6171875, + "learning_rate": 1.30977261423233e-05, + "loss": 0.792, + "step": 7054 + }, + { + "epoch": 1.197698094605311, + "grad_norm": 1.7734375, + "learning_rate": 1.3096008996042712e-05, + "loss": 0.805, + "step": 7055 + }, + { + "epoch": 1.197869558694301, + "grad_norm": 1.5625, + "learning_rate": 1.3094291748789614e-05, + "loss": 0.7919, + "step": 7056 + }, + { + 
"epoch": 1.1980410227832907, + "grad_norm": 1.6640625, + "learning_rate": 1.309257440062001e-05, + "loss": 0.8871, + "step": 7057 + }, + { + "epoch": 1.1982124868722808, + "grad_norm": 1.6171875, + "learning_rate": 1.309085695158991e-05, + "loss": 0.8488, + "step": 7058 + }, + { + "epoch": 1.1983839509612706, + "grad_norm": 1.6484375, + "learning_rate": 1.3089139401755325e-05, + "loss": 0.8912, + "step": 7059 + }, + { + "epoch": 1.1985554150502604, + "grad_norm": 1.8046875, + "learning_rate": 1.3087421751172269e-05, + "loss": 0.8928, + "step": 7060 + }, + { + "epoch": 1.1987268791392502, + "grad_norm": 1.6328125, + "learning_rate": 1.3085703999896765e-05, + "loss": 0.8323, + "step": 7061 + }, + { + "epoch": 1.19889834322824, + "grad_norm": 1.625, + "learning_rate": 1.3083986147984835e-05, + "loss": 0.8322, + "step": 7062 + }, + { + "epoch": 1.1990698073172301, + "grad_norm": 1.6171875, + "learning_rate": 1.3082268195492505e-05, + "loss": 0.8949, + "step": 7063 + }, + { + "epoch": 1.19924127140622, + "grad_norm": 1.71875, + "learning_rate": 1.3080550142475798e-05, + "loss": 0.9076, + "step": 7064 + }, + { + "epoch": 1.1994127354952098, + "grad_norm": 1.7890625, + "learning_rate": 1.3078831988990757e-05, + "loss": 0.9873, + "step": 7065 + }, + { + "epoch": 1.1995841995841996, + "grad_norm": 1.7421875, + "learning_rate": 1.3077113735093407e-05, + "loss": 0.8335, + "step": 7066 + }, + { + "epoch": 1.1997556636731894, + "grad_norm": 1.7578125, + "learning_rate": 1.3075395380839787e-05, + "loss": 0.9648, + "step": 7067 + }, + { + "epoch": 1.1999271277621792, + "grad_norm": 1.75, + "learning_rate": 1.3073676926285947e-05, + "loss": 0.8638, + "step": 7068 + }, + { + "epoch": 1.200098591851169, + "grad_norm": 1.609375, + "learning_rate": 1.3071958371487927e-05, + "loss": 0.7763, + "step": 7069 + }, + { + "epoch": 1.2002700559401591, + "grad_norm": 1.59375, + "learning_rate": 1.3070239716501778e-05, + "loss": 0.7944, + "step": 7070 + }, + { + "epoch": 1.200441520029149, + "grad_norm": 1.671875, + "learning_rate": 1.3068520961383552e-05, + "loss": 0.894, + "step": 7071 + }, + { + "epoch": 1.2006129841181388, + "grad_norm": 1.6796875, + "learning_rate": 1.30668021061893e-05, + "loss": 0.9565, + "step": 7072 + }, + { + "epoch": 1.2007844482071286, + "grad_norm": 1.703125, + "learning_rate": 1.3065083150975081e-05, + "loss": 0.8892, + "step": 7073 + }, + { + "epoch": 1.2009559122961184, + "grad_norm": 1.6953125, + "learning_rate": 1.3063364095796962e-05, + "loss": 0.7983, + "step": 7074 + }, + { + "epoch": 1.2011273763851082, + "grad_norm": 1.59375, + "learning_rate": 1.3061644940711002e-05, + "loss": 0.8658, + "step": 7075 + }, + { + "epoch": 1.2012988404740983, + "grad_norm": 1.703125, + "learning_rate": 1.3059925685773273e-05, + "loss": 0.8818, + "step": 7076 + }, + { + "epoch": 1.2014703045630881, + "grad_norm": 1.8203125, + "learning_rate": 1.3058206331039842e-05, + "loss": 0.875, + "step": 7077 + }, + { + "epoch": 1.201641768652078, + "grad_norm": 1.53125, + "learning_rate": 1.305648687656679e-05, + "loss": 0.792, + "step": 7078 + }, + { + "epoch": 1.2018132327410678, + "grad_norm": 1.6796875, + "learning_rate": 1.3054767322410188e-05, + "loss": 0.7946, + "step": 7079 + }, + { + "epoch": 1.2019846968300576, + "grad_norm": 1.6484375, + "learning_rate": 1.3053047668626122e-05, + "loss": 0.812, + "step": 7080 + }, + { + "epoch": 1.2021561609190474, + "grad_norm": 1.6875, + "learning_rate": 1.3051327915270676e-05, + "loss": 0.8986, + "step": 7081 + }, + { + "epoch": 1.2023276250080375, + "grad_norm": 
1.703125, + "learning_rate": 1.3049608062399934e-05, + "loss": 0.8617, + "step": 7082 + }, + { + "epoch": 1.2024990890970273, + "grad_norm": 1.6875, + "learning_rate": 1.3047888110069993e-05, + "loss": 0.8427, + "step": 7083 + }, + { + "epoch": 1.2026705531860171, + "grad_norm": 1.703125, + "learning_rate": 1.3046168058336941e-05, + "loss": 0.8435, + "step": 7084 + }, + { + "epoch": 1.202842017275007, + "grad_norm": 1.609375, + "learning_rate": 1.3044447907256877e-05, + "loss": 0.8159, + "step": 7085 + }, + { + "epoch": 1.2030134813639968, + "grad_norm": 1.65625, + "learning_rate": 1.30427276568859e-05, + "loss": 0.8177, + "step": 7086 + }, + { + "epoch": 1.2031849454529866, + "grad_norm": 1.640625, + "learning_rate": 1.304100730728012e-05, + "loss": 0.8673, + "step": 7087 + }, + { + "epoch": 1.2033564095419766, + "grad_norm": 1.6640625, + "learning_rate": 1.3039286858495642e-05, + "loss": 0.7668, + "step": 7088 + }, + { + "epoch": 1.2035278736309665, + "grad_norm": 1.734375, + "learning_rate": 1.3037566310588571e-05, + "loss": 0.9049, + "step": 7089 + }, + { + "epoch": 1.2036993377199563, + "grad_norm": 1.5546875, + "learning_rate": 1.3035845663615025e-05, + "loss": 0.8266, + "step": 7090 + }, + { + "epoch": 1.2038708018089461, + "grad_norm": 1.6484375, + "learning_rate": 1.3034124917631124e-05, + "loss": 0.7289, + "step": 7091 + }, + { + "epoch": 1.204042265897936, + "grad_norm": 1.703125, + "learning_rate": 1.3032404072692979e-05, + "loss": 0.8224, + "step": 7092 + }, + { + "epoch": 1.2042137299869258, + "grad_norm": 1.59375, + "learning_rate": 1.3030683128856719e-05, + "loss": 0.879, + "step": 7093 + }, + { + "epoch": 1.2043851940759158, + "grad_norm": 1.640625, + "learning_rate": 1.3028962086178472e-05, + "loss": 0.7933, + "step": 7094 + }, + { + "epoch": 1.2045566581649056, + "grad_norm": 1.65625, + "learning_rate": 1.3027240944714361e-05, + "loss": 0.8349, + "step": 7095 + }, + { + "epoch": 1.2047281222538955, + "grad_norm": 1.7265625, + "learning_rate": 1.3025519704520528e-05, + "loss": 0.8045, + "step": 7096 + }, + { + "epoch": 1.2048995863428853, + "grad_norm": 1.7109375, + "learning_rate": 1.3023798365653102e-05, + "loss": 0.8647, + "step": 7097 + }, + { + "epoch": 1.2050710504318751, + "grad_norm": 1.7734375, + "learning_rate": 1.3022076928168224e-05, + "loss": 0.848, + "step": 7098 + }, + { + "epoch": 1.205242514520865, + "grad_norm": 1.7421875, + "learning_rate": 1.302035539212204e-05, + "loss": 0.9455, + "step": 7099 + }, + { + "epoch": 1.205413978609855, + "grad_norm": 1.703125, + "learning_rate": 1.301863375757069e-05, + "loss": 0.8987, + "step": 7100 + }, + { + "epoch": 1.2055854426988448, + "grad_norm": 1.640625, + "learning_rate": 1.3016912024570329e-05, + "loss": 0.8748, + "step": 7101 + }, + { + "epoch": 1.2057569067878346, + "grad_norm": 1.6015625, + "learning_rate": 1.3015190193177105e-05, + "loss": 0.8823, + "step": 7102 + }, + { + "epoch": 1.2059283708768245, + "grad_norm": 1.625, + "learning_rate": 1.3013468263447175e-05, + "loss": 0.8718, + "step": 7103 + }, + { + "epoch": 1.2060998349658143, + "grad_norm": 1.7109375, + "learning_rate": 1.3011746235436698e-05, + "loss": 0.9014, + "step": 7104 + }, + { + "epoch": 1.2062712990548041, + "grad_norm": 1.65625, + "learning_rate": 1.3010024109201834e-05, + "loss": 0.8978, + "step": 7105 + }, + { + "epoch": 1.206442763143794, + "grad_norm": 1.6171875, + "learning_rate": 1.3008301884798746e-05, + "loss": 0.8466, + "step": 7106 + }, + { + "epoch": 1.206614227232784, + "grad_norm": 1.671875, + "learning_rate": 
1.3006579562283607e-05, + "loss": 0.9149, + "step": 7107 + }, + { + "epoch": 1.2067856913217738, + "grad_norm": 1.703125, + "learning_rate": 1.300485714171259e-05, + "loss": 0.8793, + "step": 7108 + }, + { + "epoch": 1.2069571554107636, + "grad_norm": 1.5625, + "learning_rate": 1.3003134623141864e-05, + "loss": 0.8049, + "step": 7109 + }, + { + "epoch": 1.2071286194997535, + "grad_norm": 1.625, + "learning_rate": 1.300141200662761e-05, + "loss": 0.8108, + "step": 7110 + }, + { + "epoch": 1.2073000835887433, + "grad_norm": 1.5703125, + "learning_rate": 1.299968929222601e-05, + "loss": 0.7865, + "step": 7111 + }, + { + "epoch": 1.2074715476777333, + "grad_norm": 1.6640625, + "learning_rate": 1.2997966479993243e-05, + "loss": 0.8582, + "step": 7112 + }, + { + "epoch": 1.2076430117667232, + "grad_norm": 1.65625, + "learning_rate": 1.2996243569985501e-05, + "loss": 0.8529, + "step": 7113 + }, + { + "epoch": 1.207814475855713, + "grad_norm": 1.71875, + "learning_rate": 1.299452056225897e-05, + "loss": 0.8457, + "step": 7114 + }, + { + "epoch": 1.2079859399447028, + "grad_norm": 1.671875, + "learning_rate": 1.299279745686985e-05, + "loss": 0.8492, + "step": 7115 + }, + { + "epoch": 1.2081574040336927, + "grad_norm": 1.578125, + "learning_rate": 1.2991074253874337e-05, + "loss": 0.7936, + "step": 7116 + }, + { + "epoch": 1.2083288681226825, + "grad_norm": 1.625, + "learning_rate": 1.2989350953328628e-05, + "loss": 0.861, + "step": 7117 + }, + { + "epoch": 1.2085003322116723, + "grad_norm": 1.6328125, + "learning_rate": 1.2987627555288928e-05, + "loss": 0.7935, + "step": 7118 + }, + { + "epoch": 1.2086717963006623, + "grad_norm": 1.6953125, + "learning_rate": 1.2985904059811442e-05, + "loss": 0.8859, + "step": 7119 + }, + { + "epoch": 1.2088432603896522, + "grad_norm": 1.6015625, + "learning_rate": 1.2984180466952381e-05, + "loss": 0.8689, + "step": 7120 + }, + { + "epoch": 1.209014724478642, + "grad_norm": 1.625, + "learning_rate": 1.2982456776767957e-05, + "loss": 0.8682, + "step": 7121 + }, + { + "epoch": 1.2091861885676318, + "grad_norm": 1.6796875, + "learning_rate": 1.298073298931439e-05, + "loss": 0.803, + "step": 7122 + }, + { + "epoch": 1.2093576526566217, + "grad_norm": 1.71875, + "learning_rate": 1.297900910464789e-05, + "loss": 0.7887, + "step": 7123 + }, + { + "epoch": 1.2095291167456117, + "grad_norm": 1.8046875, + "learning_rate": 1.2977285122824688e-05, + "loss": 0.8895, + "step": 7124 + }, + { + "epoch": 1.2097005808346015, + "grad_norm": 1.6796875, + "learning_rate": 1.2975561043901008e-05, + "loss": 0.8874, + "step": 7125 + }, + { + "epoch": 1.2098720449235913, + "grad_norm": 1.7890625, + "learning_rate": 1.2973836867933076e-05, + "loss": 0.885, + "step": 7126 + }, + { + "epoch": 1.2100435090125812, + "grad_norm": 1.6328125, + "learning_rate": 1.2972112594977127e-05, + "loss": 0.8098, + "step": 7127 + }, + { + "epoch": 1.210214973101571, + "grad_norm": 1.7578125, + "learning_rate": 1.2970388225089393e-05, + "loss": 0.8832, + "step": 7128 + }, + { + "epoch": 1.2103864371905608, + "grad_norm": 1.8359375, + "learning_rate": 1.2968663758326115e-05, + "loss": 0.9134, + "step": 7129 + }, + { + "epoch": 1.2105579012795507, + "grad_norm": 1.6328125, + "learning_rate": 1.2966939194743531e-05, + "loss": 0.8345, + "step": 7130 + }, + { + "epoch": 1.2107293653685407, + "grad_norm": 1.78125, + "learning_rate": 1.2965214534397888e-05, + "loss": 0.8522, + "step": 7131 + }, + { + "epoch": 1.2109008294575305, + "grad_norm": 1.6484375, + "learning_rate": 1.2963489777345433e-05, + "loss": 0.8498, + 
"step": 7132 + }, + { + "epoch": 1.2110722935465204, + "grad_norm": 1.7109375, + "learning_rate": 1.2961764923642415e-05, + "loss": 0.8472, + "step": 7133 + }, + { + "epoch": 1.2112437576355102, + "grad_norm": 1.703125, + "learning_rate": 1.2960039973345094e-05, + "loss": 0.9188, + "step": 7134 + }, + { + "epoch": 1.2114152217245, + "grad_norm": 1.625, + "learning_rate": 1.295831492650972e-05, + "loss": 0.8582, + "step": 7135 + }, + { + "epoch": 1.21158668581349, + "grad_norm": 1.78125, + "learning_rate": 1.2956589783192557e-05, + "loss": 0.902, + "step": 7136 + }, + { + "epoch": 1.2117581499024799, + "grad_norm": 1.671875, + "learning_rate": 1.2954864543449866e-05, + "loss": 0.8307, + "step": 7137 + }, + { + "epoch": 1.2119296139914697, + "grad_norm": 1.703125, + "learning_rate": 1.2953139207337917e-05, + "loss": 0.9602, + "step": 7138 + }, + { + "epoch": 1.2121010780804595, + "grad_norm": 1.6328125, + "learning_rate": 1.2951413774912977e-05, + "loss": 0.8885, + "step": 7139 + }, + { + "epoch": 1.2122725421694494, + "grad_norm": 1.7578125, + "learning_rate": 1.2949688246231324e-05, + "loss": 0.9143, + "step": 7140 + }, + { + "epoch": 1.2124440062584392, + "grad_norm": 1.6796875, + "learning_rate": 1.2947962621349224e-05, + "loss": 0.955, + "step": 7141 + }, + { + "epoch": 1.212615470347429, + "grad_norm": 1.7265625, + "learning_rate": 1.2946236900322965e-05, + "loss": 0.8418, + "step": 7142 + }, + { + "epoch": 1.212786934436419, + "grad_norm": 1.71875, + "learning_rate": 1.2944511083208827e-05, + "loss": 0.8636, + "step": 7143 + }, + { + "epoch": 1.2129583985254089, + "grad_norm": 1.578125, + "learning_rate": 1.294278517006309e-05, + "loss": 0.8267, + "step": 7144 + }, + { + "epoch": 1.2131298626143987, + "grad_norm": 1.6328125, + "learning_rate": 1.294105916094205e-05, + "loss": 0.8761, + "step": 7145 + }, + { + "epoch": 1.2133013267033885, + "grad_norm": 1.640625, + "learning_rate": 1.2939333055902e-05, + "loss": 0.7752, + "step": 7146 + }, + { + "epoch": 1.2134727907923784, + "grad_norm": 1.671875, + "learning_rate": 1.2937606854999225e-05, + "loss": 0.8522, + "step": 7147 + }, + { + "epoch": 1.2136442548813684, + "grad_norm": 1.65625, + "learning_rate": 1.293588055829003e-05, + "loss": 0.8549, + "step": 7148 + }, + { + "epoch": 1.2138157189703582, + "grad_norm": 1.6328125, + "learning_rate": 1.2934154165830714e-05, + "loss": 0.8497, + "step": 7149 + }, + { + "epoch": 1.213987183059348, + "grad_norm": 1.65625, + "learning_rate": 1.2932427677677582e-05, + "loss": 0.8336, + "step": 7150 + }, + { + "epoch": 1.2141586471483379, + "grad_norm": 1.6953125, + "learning_rate": 1.2930701093886942e-05, + "loss": 0.8674, + "step": 7151 + }, + { + "epoch": 1.2143301112373277, + "grad_norm": 1.6875, + "learning_rate": 1.29289744145151e-05, + "loss": 0.9201, + "step": 7152 + }, + { + "epoch": 1.2145015753263175, + "grad_norm": 1.6953125, + "learning_rate": 1.2927247639618376e-05, + "loss": 0.8189, + "step": 7153 + }, + { + "epoch": 1.2146730394153074, + "grad_norm": 1.6953125, + "learning_rate": 1.2925520769253085e-05, + "loss": 0.9168, + "step": 7154 + }, + { + "epoch": 1.2148445035042974, + "grad_norm": 1.6640625, + "learning_rate": 1.2923793803475542e-05, + "loss": 0.8362, + "step": 7155 + }, + { + "epoch": 1.2150159675932872, + "grad_norm": 1.7421875, + "learning_rate": 1.2922066742342074e-05, + "loss": 0.8896, + "step": 7156 + }, + { + "epoch": 1.215187431682277, + "grad_norm": 1.6640625, + "learning_rate": 1.2920339585909006e-05, + "loss": 0.8406, + "step": 7157 + }, + { + "epoch": 
1.2153588957712669, + "grad_norm": 1.6171875, + "learning_rate": 1.291861233423267e-05, + "loss": 0.8135, + "step": 7158 + }, + { + "epoch": 1.2155303598602567, + "grad_norm": 1.6640625, + "learning_rate": 1.2916884987369391e-05, + "loss": 0.8631, + "step": 7159 + }, + { + "epoch": 1.2157018239492468, + "grad_norm": 1.6875, + "learning_rate": 1.291515754537551e-05, + "loss": 0.8561, + "step": 7160 + }, + { + "epoch": 1.2158732880382366, + "grad_norm": 1.703125, + "learning_rate": 1.2913430008307361e-05, + "loss": 0.8446, + "step": 7161 + }, + { + "epoch": 1.2160447521272264, + "grad_norm": 1.5859375, + "learning_rate": 1.2911702376221294e-05, + "loss": 0.8365, + "step": 7162 + }, + { + "epoch": 1.2162162162162162, + "grad_norm": 1.6875, + "learning_rate": 1.2909974649173646e-05, + "loss": 0.8221, + "step": 7163 + }, + { + "epoch": 1.216387680305206, + "grad_norm": 1.6640625, + "learning_rate": 1.2908246827220766e-05, + "loss": 0.8318, + "step": 7164 + }, + { + "epoch": 1.2165591443941959, + "grad_norm": 1.71875, + "learning_rate": 1.2906518910419006e-05, + "loss": 0.8276, + "step": 7165 + }, + { + "epoch": 1.2167306084831857, + "grad_norm": 1.671875, + "learning_rate": 1.2904790898824719e-05, + "loss": 0.9029, + "step": 7166 + }, + { + "epoch": 1.2169020725721758, + "grad_norm": 1.640625, + "learning_rate": 1.290306279249426e-05, + "loss": 0.7511, + "step": 7167 + }, + { + "epoch": 1.2170735366611656, + "grad_norm": 1.625, + "learning_rate": 1.2901334591483996e-05, + "loss": 0.8239, + "step": 7168 + }, + { + "epoch": 1.2172450007501554, + "grad_norm": 1.71875, + "learning_rate": 1.2899606295850284e-05, + "loss": 0.8241, + "step": 7169 + }, + { + "epoch": 1.2174164648391452, + "grad_norm": 1.625, + "learning_rate": 1.2897877905649492e-05, + "loss": 0.76, + "step": 7170 + }, + { + "epoch": 1.217587928928135, + "grad_norm": 1.6640625, + "learning_rate": 1.2896149420937988e-05, + "loss": 0.876, + "step": 7171 + }, + { + "epoch": 1.2177593930171249, + "grad_norm": 1.7109375, + "learning_rate": 1.2894420841772144e-05, + "loss": 0.911, + "step": 7172 + }, + { + "epoch": 1.217930857106115, + "grad_norm": 1.78125, + "learning_rate": 1.289269216820834e-05, + "loss": 0.9441, + "step": 7173 + }, + { + "epoch": 1.2181023211951048, + "grad_norm": 1.734375, + "learning_rate": 1.2890963400302949e-05, + "loss": 0.8377, + "step": 7174 + }, + { + "epoch": 1.2182737852840946, + "grad_norm": 1.7578125, + "learning_rate": 1.2889234538112359e-05, + "loss": 0.8068, + "step": 7175 + }, + { + "epoch": 1.2184452493730844, + "grad_norm": 1.7265625, + "learning_rate": 1.288750558169295e-05, + "loss": 0.8127, + "step": 7176 + }, + { + "epoch": 1.2186167134620742, + "grad_norm": 1.6640625, + "learning_rate": 1.2885776531101109e-05, + "loss": 0.8431, + "step": 7177 + }, + { + "epoch": 1.218788177551064, + "grad_norm": 1.6328125, + "learning_rate": 1.2884047386393228e-05, + "loss": 0.8766, + "step": 7178 + }, + { + "epoch": 1.218959641640054, + "grad_norm": 1.765625, + "learning_rate": 1.2882318147625701e-05, + "loss": 1.0066, + "step": 7179 + }, + { + "epoch": 1.219131105729044, + "grad_norm": 1.7734375, + "learning_rate": 1.2880588814854923e-05, + "loss": 0.8912, + "step": 7180 + }, + { + "epoch": 1.2193025698180338, + "grad_norm": 1.6953125, + "learning_rate": 1.2878859388137304e-05, + "loss": 0.9064, + "step": 7181 + }, + { + "epoch": 1.2194740339070236, + "grad_norm": 1.6953125, + "learning_rate": 1.2877129867529234e-05, + "loss": 0.9499, + "step": 7182 + }, + { + "epoch": 1.2196454979960134, + "grad_norm": 1.8125, + 
"learning_rate": 1.2875400253087126e-05, + "loss": 0.8005, + "step": 7183 + }, + { + "epoch": 1.2198169620850032, + "grad_norm": 1.59375, + "learning_rate": 1.2873670544867389e-05, + "loss": 0.8108, + "step": 7184 + }, + { + "epoch": 1.2199884261739933, + "grad_norm": 1.6328125, + "learning_rate": 1.2871940742926432e-05, + "loss": 0.8299, + "step": 7185 + }, + { + "epoch": 1.220159890262983, + "grad_norm": 1.6875, + "learning_rate": 1.2870210847320676e-05, + "loss": 0.8547, + "step": 7186 + }, + { + "epoch": 1.220331354351973, + "grad_norm": 1.7421875, + "learning_rate": 1.2868480858106535e-05, + "loss": 0.9192, + "step": 7187 + }, + { + "epoch": 1.2205028184409628, + "grad_norm": 1.640625, + "learning_rate": 1.286675077534043e-05, + "loss": 0.8247, + "step": 7188 + }, + { + "epoch": 1.2206742825299526, + "grad_norm": 1.7734375, + "learning_rate": 1.2865020599078786e-05, + "loss": 0.9666, + "step": 7189 + }, + { + "epoch": 1.2208457466189424, + "grad_norm": 1.6875, + "learning_rate": 1.2863290329378033e-05, + "loss": 0.8251, + "step": 7190 + }, + { + "epoch": 1.2210172107079325, + "grad_norm": 1.703125, + "learning_rate": 1.2861559966294602e-05, + "loss": 0.8475, + "step": 7191 + }, + { + "epoch": 1.2211886747969223, + "grad_norm": 1.6640625, + "learning_rate": 1.2859829509884924e-05, + "loss": 0.8498, + "step": 7192 + }, + { + "epoch": 1.221360138885912, + "grad_norm": 1.71875, + "learning_rate": 1.2858098960205437e-05, + "loss": 0.9615, + "step": 7193 + }, + { + "epoch": 1.221531602974902, + "grad_norm": 1.75, + "learning_rate": 1.285636831731258e-05, + "loss": 0.8848, + "step": 7194 + }, + { + "epoch": 1.2217030670638918, + "grad_norm": 1.5859375, + "learning_rate": 1.2854637581262794e-05, + "loss": 0.8595, + "step": 7195 + }, + { + "epoch": 1.2218745311528816, + "grad_norm": 1.59375, + "learning_rate": 1.2852906752112528e-05, + "loss": 0.8858, + "step": 7196 + }, + { + "epoch": 1.2220459952418716, + "grad_norm": 1.734375, + "learning_rate": 1.285117582991823e-05, + "loss": 0.9023, + "step": 7197 + }, + { + "epoch": 1.2222174593308615, + "grad_norm": 1.640625, + "learning_rate": 1.2849444814736351e-05, + "loss": 0.8564, + "step": 7198 + }, + { + "epoch": 1.2223889234198513, + "grad_norm": 1.625, + "learning_rate": 1.2847713706623348e-05, + "loss": 0.9097, + "step": 7199 + }, + { + "epoch": 1.222560387508841, + "grad_norm": 1.640625, + "learning_rate": 1.2845982505635677e-05, + "loss": 0.8507, + "step": 7200 + }, + { + "epoch": 1.222731851597831, + "grad_norm": 1.6953125, + "learning_rate": 1.2844251211829799e-05, + "loss": 0.7985, + "step": 7201 + }, + { + "epoch": 1.2229033156868208, + "grad_norm": 1.71875, + "learning_rate": 1.284251982526218e-05, + "loss": 0.863, + "step": 7202 + }, + { + "epoch": 1.2230747797758106, + "grad_norm": 1.6640625, + "learning_rate": 1.284078834598928e-05, + "loss": 0.8775, + "step": 7203 + }, + { + "epoch": 1.2232462438648006, + "grad_norm": 1.6953125, + "learning_rate": 1.2839056774067583e-05, + "loss": 0.8685, + "step": 7204 + }, + { + "epoch": 1.2234177079537905, + "grad_norm": 1.71875, + "learning_rate": 1.2837325109553549e-05, + "loss": 0.8441, + "step": 7205 + }, + { + "epoch": 1.2235891720427803, + "grad_norm": 1.796875, + "learning_rate": 1.283559335250366e-05, + "loss": 0.8704, + "step": 7206 + }, + { + "epoch": 1.22376063613177, + "grad_norm": 1.6328125, + "learning_rate": 1.2833861502974392e-05, + "loss": 0.901, + "step": 7207 + }, + { + "epoch": 1.22393210022076, + "grad_norm": 1.6484375, + "learning_rate": 1.2832129561022228e-05, + "loss": 
0.7814, + "step": 7208 + }, + { + "epoch": 1.22410356430975, + "grad_norm": 1.578125, + "learning_rate": 1.2830397526703659e-05, + "loss": 0.789, + "step": 7209 + }, + { + "epoch": 1.2242750283987398, + "grad_norm": 1.6953125, + "learning_rate": 1.2828665400075166e-05, + "loss": 0.9164, + "step": 7210 + }, + { + "epoch": 1.2244464924877296, + "grad_norm": 1.7734375, + "learning_rate": 1.2826933181193244e-05, + "loss": 0.9192, + "step": 7211 + }, + { + "epoch": 1.2246179565767195, + "grad_norm": 1.6640625, + "learning_rate": 1.2825200870114382e-05, + "loss": 0.8172, + "step": 7212 + }, + { + "epoch": 1.2247894206657093, + "grad_norm": 1.625, + "learning_rate": 1.2823468466895083e-05, + "loss": 0.8955, + "step": 7213 + }, + { + "epoch": 1.224960884754699, + "grad_norm": 1.625, + "learning_rate": 1.2821735971591846e-05, + "loss": 0.8579, + "step": 7214 + }, + { + "epoch": 1.225132348843689, + "grad_norm": 1.5859375, + "learning_rate": 1.2820003384261175e-05, + "loss": 0.8119, + "step": 7215 + }, + { + "epoch": 1.225303812932679, + "grad_norm": 1.6953125, + "learning_rate": 1.281827070495957e-05, + "loss": 0.9551, + "step": 7216 + }, + { + "epoch": 1.2254752770216688, + "grad_norm": 1.8203125, + "learning_rate": 1.2816537933743547e-05, + "loss": 0.9452, + "step": 7217 + }, + { + "epoch": 1.2256467411106586, + "grad_norm": 1.7421875, + "learning_rate": 1.2814805070669616e-05, + "loss": 0.8825, + "step": 7218 + }, + { + "epoch": 1.2258182051996485, + "grad_norm": 1.6328125, + "learning_rate": 1.2813072115794292e-05, + "loss": 0.84, + "step": 7219 + }, + { + "epoch": 1.2259896692886383, + "grad_norm": 1.765625, + "learning_rate": 1.2811339069174091e-05, + "loss": 0.9508, + "step": 7220 + }, + { + "epoch": 1.2261611333776283, + "grad_norm": 1.640625, + "learning_rate": 1.280960593086554e-05, + "loss": 0.8388, + "step": 7221 + }, + { + "epoch": 1.2263325974666182, + "grad_norm": 1.8125, + "learning_rate": 1.280787270092516e-05, + "loss": 0.8698, + "step": 7222 + }, + { + "epoch": 1.226504061555608, + "grad_norm": 1.6953125, + "learning_rate": 1.2806139379409475e-05, + "loss": 0.8033, + "step": 7223 + }, + { + "epoch": 1.2266755256445978, + "grad_norm": 1.8203125, + "learning_rate": 1.2804405966375018e-05, + "loss": 0.9809, + "step": 7224 + }, + { + "epoch": 1.2268469897335876, + "grad_norm": 1.7265625, + "learning_rate": 1.2802672461878323e-05, + "loss": 0.857, + "step": 7225 + }, + { + "epoch": 1.2270184538225775, + "grad_norm": 1.546875, + "learning_rate": 1.2800938865975923e-05, + "loss": 0.7538, + "step": 7226 + }, + { + "epoch": 1.2271899179115673, + "grad_norm": 1.609375, + "learning_rate": 1.2799205178724362e-05, + "loss": 0.8331, + "step": 7227 + }, + { + "epoch": 1.2273613820005573, + "grad_norm": 1.7578125, + "learning_rate": 1.2797471400180177e-05, + "loss": 0.9054, + "step": 7228 + }, + { + "epoch": 1.2275328460895472, + "grad_norm": 1.703125, + "learning_rate": 1.2795737530399919e-05, + "loss": 0.8271, + "step": 7229 + }, + { + "epoch": 1.227704310178537, + "grad_norm": 1.6328125, + "learning_rate": 1.2794003569440128e-05, + "loss": 0.836, + "step": 7230 + }, + { + "epoch": 1.2278757742675268, + "grad_norm": 1.703125, + "learning_rate": 1.2792269517357361e-05, + "loss": 0.9147, + "step": 7231 + }, + { + "epoch": 1.2280472383565166, + "grad_norm": 1.7109375, + "learning_rate": 1.279053537420817e-05, + "loss": 0.8463, + "step": 7232 + }, + { + "epoch": 1.2282187024455067, + "grad_norm": 1.640625, + "learning_rate": 1.2788801140049117e-05, + "loss": 0.7947, + "step": 7233 + }, + { + 
"epoch": 1.2283901665344965, + "grad_norm": 1.6640625, + "learning_rate": 1.2787066814936753e-05, + "loss": 0.8187, + "step": 7234 + }, + { + "epoch": 1.2285616306234863, + "grad_norm": 1.703125, + "learning_rate": 1.2785332398927641e-05, + "loss": 0.8147, + "step": 7235 + }, + { + "epoch": 1.2287330947124762, + "grad_norm": 1.703125, + "learning_rate": 1.2783597892078357e-05, + "loss": 0.8814, + "step": 7236 + }, + { + "epoch": 1.228904558801466, + "grad_norm": 1.609375, + "learning_rate": 1.278186329444546e-05, + "loss": 0.8836, + "step": 7237 + }, + { + "epoch": 1.2290760228904558, + "grad_norm": 1.640625, + "learning_rate": 1.278012860608553e-05, + "loss": 0.8651, + "step": 7238 + }, + { + "epoch": 1.2292474869794456, + "grad_norm": 1.6640625, + "learning_rate": 1.2778393827055133e-05, + "loss": 0.8794, + "step": 7239 + }, + { + "epoch": 1.2294189510684357, + "grad_norm": 1.65625, + "learning_rate": 1.2776658957410852e-05, + "loss": 0.7917, + "step": 7240 + }, + { + "epoch": 1.2295904151574255, + "grad_norm": 1.609375, + "learning_rate": 1.2774923997209268e-05, + "loss": 0.8094, + "step": 7241 + }, + { + "epoch": 1.2297618792464153, + "grad_norm": 1.609375, + "learning_rate": 1.277318894650696e-05, + "loss": 0.867, + "step": 7242 + }, + { + "epoch": 1.2299333433354052, + "grad_norm": 1.703125, + "learning_rate": 1.2771453805360521e-05, + "loss": 0.8815, + "step": 7243 + }, + { + "epoch": 1.230104807424395, + "grad_norm": 1.7421875, + "learning_rate": 1.2769718573826536e-05, + "loss": 0.8418, + "step": 7244 + }, + { + "epoch": 1.230276271513385, + "grad_norm": 1.6953125, + "learning_rate": 1.2767983251961596e-05, + "loss": 0.9079, + "step": 7245 + }, + { + "epoch": 1.2304477356023749, + "grad_norm": 1.703125, + "learning_rate": 1.2766247839822302e-05, + "loss": 0.8975, + "step": 7246 + }, + { + "epoch": 1.2306191996913647, + "grad_norm": 1.640625, + "learning_rate": 1.2764512337465247e-05, + "loss": 0.9299, + "step": 7247 + }, + { + "epoch": 1.2307906637803545, + "grad_norm": 1.703125, + "learning_rate": 1.2762776744947034e-05, + "loss": 0.8715, + "step": 7248 + }, + { + "epoch": 1.2309621278693443, + "grad_norm": 1.6171875, + "learning_rate": 1.2761041062324268e-05, + "loss": 0.8434, + "step": 7249 + }, + { + "epoch": 1.2311335919583342, + "grad_norm": 1.6953125, + "learning_rate": 1.275930528965356e-05, + "loss": 0.9107, + "step": 7250 + }, + { + "epoch": 1.231305056047324, + "grad_norm": 1.7265625, + "learning_rate": 1.2757569426991514e-05, + "loss": 0.9146, + "step": 7251 + }, + { + "epoch": 1.231476520136314, + "grad_norm": 1.6328125, + "learning_rate": 1.2755833474394744e-05, + "loss": 0.8134, + "step": 7252 + }, + { + "epoch": 1.2316479842253039, + "grad_norm": 1.5390625, + "learning_rate": 1.2754097431919864e-05, + "loss": 0.8832, + "step": 7253 + }, + { + "epoch": 1.2318194483142937, + "grad_norm": 1.703125, + "learning_rate": 1.2752361299623499e-05, + "loss": 0.9229, + "step": 7254 + }, + { + "epoch": 1.2319909124032835, + "grad_norm": 1.7109375, + "learning_rate": 1.2750625077562266e-05, + "loss": 0.9005, + "step": 7255 + }, + { + "epoch": 1.2321623764922733, + "grad_norm": 1.6640625, + "learning_rate": 1.2748888765792792e-05, + "loss": 0.902, + "step": 7256 + }, + { + "epoch": 1.2323338405812634, + "grad_norm": 1.6171875, + "learning_rate": 1.2747152364371706e-05, + "loss": 0.8608, + "step": 7257 + }, + { + "epoch": 1.2325053046702532, + "grad_norm": 1.7421875, + "learning_rate": 1.2745415873355636e-05, + "loss": 0.7951, + "step": 7258 + }, + { + "epoch": 1.232676768759243, 
+ "grad_norm": 1.6796875, + "learning_rate": 1.2743679292801215e-05, + "loss": 0.8875, + "step": 7259 + }, + { + "epoch": 1.2328482328482329, + "grad_norm": 1.6640625, + "learning_rate": 1.274194262276508e-05, + "loss": 0.8669, + "step": 7260 + }, + { + "epoch": 1.2330196969372227, + "grad_norm": 1.703125, + "learning_rate": 1.2740205863303873e-05, + "loss": 0.942, + "step": 7261 + }, + { + "epoch": 1.2331911610262125, + "grad_norm": 1.625, + "learning_rate": 1.2738469014474233e-05, + "loss": 0.8201, + "step": 7262 + }, + { + "epoch": 1.2333626251152023, + "grad_norm": 1.6484375, + "learning_rate": 1.2736732076332802e-05, + "loss": 0.8711, + "step": 7263 + }, + { + "epoch": 1.2335340892041924, + "grad_norm": 1.71875, + "learning_rate": 1.2734995048936236e-05, + "loss": 1.0015, + "step": 7264 + }, + { + "epoch": 1.2337055532931822, + "grad_norm": 1.5546875, + "learning_rate": 1.2733257932341182e-05, + "loss": 0.7611, + "step": 7265 + }, + { + "epoch": 1.233877017382172, + "grad_norm": 1.78125, + "learning_rate": 1.2731520726604298e-05, + "loss": 0.8811, + "step": 7266 + }, + { + "epoch": 1.2340484814711619, + "grad_norm": 1.6953125, + "learning_rate": 1.2729783431782234e-05, + "loss": 0.9193, + "step": 7267 + }, + { + "epoch": 1.2342199455601517, + "grad_norm": 1.6875, + "learning_rate": 1.2728046047931655e-05, + "loss": 0.8882, + "step": 7268 + }, + { + "epoch": 1.2343914096491415, + "grad_norm": 1.65625, + "learning_rate": 1.2726308575109219e-05, + "loss": 0.8562, + "step": 7269 + }, + { + "epoch": 1.2345628737381316, + "grad_norm": 1.59375, + "learning_rate": 1.2724571013371593e-05, + "loss": 0.865, + "step": 7270 + }, + { + "epoch": 1.2347343378271214, + "grad_norm": 1.6875, + "learning_rate": 1.2722833362775448e-05, + "loss": 0.8359, + "step": 7271 + }, + { + "epoch": 1.2349058019161112, + "grad_norm": 1.6796875, + "learning_rate": 1.2721095623377456e-05, + "loss": 0.9229, + "step": 7272 + }, + { + "epoch": 1.235077266005101, + "grad_norm": 1.8515625, + "learning_rate": 1.2719357795234287e-05, + "loss": 0.9621, + "step": 7273 + }, + { + "epoch": 1.2352487300940909, + "grad_norm": 1.6171875, + "learning_rate": 1.2717619878402618e-05, + "loss": 0.834, + "step": 7274 + }, + { + "epoch": 1.2354201941830807, + "grad_norm": 1.546875, + "learning_rate": 1.2715881872939133e-05, + "loss": 0.7711, + "step": 7275 + }, + { + "epoch": 1.2355916582720707, + "grad_norm": 1.6484375, + "learning_rate": 1.2714143778900514e-05, + "loss": 0.8419, + "step": 7276 + }, + { + "epoch": 1.2357631223610606, + "grad_norm": 1.6328125, + "learning_rate": 1.2712405596343444e-05, + "loss": 0.8519, + "step": 7277 + }, + { + "epoch": 1.2359345864500504, + "grad_norm": 1.7109375, + "learning_rate": 1.2710667325324618e-05, + "loss": 0.9297, + "step": 7278 + }, + { + "epoch": 1.2361060505390402, + "grad_norm": 1.7109375, + "learning_rate": 1.2708928965900719e-05, + "loss": 0.9045, + "step": 7279 + }, + { + "epoch": 1.23627751462803, + "grad_norm": 1.7265625, + "learning_rate": 1.2707190518128446e-05, + "loss": 0.9259, + "step": 7280 + }, + { + "epoch": 1.2364489787170199, + "grad_norm": 1.6171875, + "learning_rate": 1.2705451982064497e-05, + "loss": 0.8243, + "step": 7281 + }, + { + "epoch": 1.23662044280601, + "grad_norm": 1.6796875, + "learning_rate": 1.2703713357765569e-05, + "loss": 0.8922, + "step": 7282 + }, + { + "epoch": 1.2367919068949997, + "grad_norm": 1.828125, + "learning_rate": 1.2701974645288366e-05, + "loss": 0.8736, + "step": 7283 + }, + { + "epoch": 1.2369633709839896, + "grad_norm": 1.7265625, + 
"learning_rate": 1.2700235844689602e-05, + "loss": 0.9215, + "step": 7284 + }, + { + "epoch": 1.2371348350729794, + "grad_norm": 1.6953125, + "learning_rate": 1.2698496956025972e-05, + "loss": 0.9459, + "step": 7285 + }, + { + "epoch": 1.2373062991619692, + "grad_norm": 1.7265625, + "learning_rate": 1.2696757979354199e-05, + "loss": 0.9388, + "step": 7286 + }, + { + "epoch": 1.237477763250959, + "grad_norm": 1.640625, + "learning_rate": 1.269501891473099e-05, + "loss": 0.8158, + "step": 7287 + }, + { + "epoch": 1.2376492273399489, + "grad_norm": 1.6796875, + "learning_rate": 1.2693279762213067e-05, + "loss": 0.8949, + "step": 7288 + }, + { + "epoch": 1.237820691428939, + "grad_norm": 1.6484375, + "learning_rate": 1.269154052185715e-05, + "loss": 0.8572, + "step": 7289 + }, + { + "epoch": 1.2379921555179287, + "grad_norm": 1.7109375, + "learning_rate": 1.268980119371996e-05, + "loss": 0.7945, + "step": 7290 + }, + { + "epoch": 1.2381636196069186, + "grad_norm": 1.765625, + "learning_rate": 1.2688061777858223e-05, + "loss": 0.9051, + "step": 7291 + }, + { + "epoch": 1.2383350836959084, + "grad_norm": 1.84375, + "learning_rate": 1.268632227432867e-05, + "loss": 0.9626, + "step": 7292 + }, + { + "epoch": 1.2385065477848982, + "grad_norm": 1.765625, + "learning_rate": 1.2684582683188033e-05, + "loss": 0.8529, + "step": 7293 + }, + { + "epoch": 1.2386780118738883, + "grad_norm": 1.6875, + "learning_rate": 1.2682843004493047e-05, + "loss": 0.8847, + "step": 7294 + }, + { + "epoch": 1.238849475962878, + "grad_norm": 1.7265625, + "learning_rate": 1.2681103238300446e-05, + "loss": 0.8428, + "step": 7295 + }, + { + "epoch": 1.239020940051868, + "grad_norm": 1.6953125, + "learning_rate": 1.2679363384666972e-05, + "loss": 0.8807, + "step": 7296 + }, + { + "epoch": 1.2391924041408577, + "grad_norm": 1.7109375, + "learning_rate": 1.267762344364937e-05, + "loss": 0.8548, + "step": 7297 + }, + { + "epoch": 1.2393638682298476, + "grad_norm": 1.625, + "learning_rate": 1.2675883415304383e-05, + "loss": 0.8498, + "step": 7298 + }, + { + "epoch": 1.2395353323188374, + "grad_norm": 1.75, + "learning_rate": 1.2674143299688761e-05, + "loss": 0.8959, + "step": 7299 + }, + { + "epoch": 1.2397067964078272, + "grad_norm": 1.6484375, + "learning_rate": 1.2672403096859257e-05, + "loss": 0.8864, + "step": 7300 + }, + { + "epoch": 1.2398782604968173, + "grad_norm": 1.7421875, + "learning_rate": 1.2670662806872625e-05, + "loss": 0.8839, + "step": 7301 + }, + { + "epoch": 1.240049724585807, + "grad_norm": 1.6328125, + "learning_rate": 1.2668922429785619e-05, + "loss": 0.8581, + "step": 7302 + }, + { + "epoch": 1.240221188674797, + "grad_norm": 1.6484375, + "learning_rate": 1.2667181965655006e-05, + "loss": 0.8189, + "step": 7303 + }, + { + "epoch": 1.2403926527637867, + "grad_norm": 1.6640625, + "learning_rate": 1.2665441414537544e-05, + "loss": 0.8233, + "step": 7304 + }, + { + "epoch": 1.2405641168527766, + "grad_norm": 1.6328125, + "learning_rate": 1.266370077649e-05, + "loss": 0.8569, + "step": 7305 + }, + { + "epoch": 1.2407355809417666, + "grad_norm": 1.6796875, + "learning_rate": 1.2661960051569144e-05, + "loss": 0.8426, + "step": 7306 + }, + { + "epoch": 1.2409070450307564, + "grad_norm": 1.59375, + "learning_rate": 1.2660219239831748e-05, + "loss": 0.842, + "step": 7307 + }, + { + "epoch": 1.2410785091197463, + "grad_norm": 1.6953125, + "learning_rate": 1.2658478341334583e-05, + "loss": 0.943, + "step": 7308 + }, + { + "epoch": 1.241249973208736, + "grad_norm": 1.6328125, + "learning_rate": 1.265673735613443e-05, + 
"loss": 0.8756, + "step": 7309 + }, + { + "epoch": 1.241421437297726, + "grad_norm": 1.5546875, + "learning_rate": 1.2654996284288063e-05, + "loss": 0.7903, + "step": 7310 + }, + { + "epoch": 1.2415929013867157, + "grad_norm": 1.6796875, + "learning_rate": 1.2653255125852272e-05, + "loss": 0.8261, + "step": 7311 + }, + { + "epoch": 1.2417643654757056, + "grad_norm": 1.6484375, + "learning_rate": 1.2651513880883841e-05, + "loss": 0.8329, + "step": 7312 + }, + { + "epoch": 1.2419358295646956, + "grad_norm": 1.7578125, + "learning_rate": 1.264977254943956e-05, + "loss": 0.9497, + "step": 7313 + }, + { + "epoch": 1.2421072936536854, + "grad_norm": 1.71875, + "learning_rate": 1.2648031131576217e-05, + "loss": 0.9085, + "step": 7314 + }, + { + "epoch": 1.2422787577426753, + "grad_norm": 1.7421875, + "learning_rate": 1.2646289627350605e-05, + "loss": 0.8737, + "step": 7315 + }, + { + "epoch": 1.242450221831665, + "grad_norm": 1.6953125, + "learning_rate": 1.2644548036819525e-05, + "loss": 0.8706, + "step": 7316 + }, + { + "epoch": 1.242621685920655, + "grad_norm": 1.65625, + "learning_rate": 1.2642806360039774e-05, + "loss": 0.7986, + "step": 7317 + }, + { + "epoch": 1.242793150009645, + "grad_norm": 1.6484375, + "learning_rate": 1.2641064597068156e-05, + "loss": 0.9261, + "step": 7318 + }, + { + "epoch": 1.2429646140986348, + "grad_norm": 1.734375, + "learning_rate": 1.2639322747961476e-05, + "loss": 0.8666, + "step": 7319 + }, + { + "epoch": 1.2431360781876246, + "grad_norm": 1.640625, + "learning_rate": 1.2637580812776542e-05, + "loss": 0.8477, + "step": 7320 + }, + { + "epoch": 1.2433075422766144, + "grad_norm": 1.6953125, + "learning_rate": 1.2635838791570166e-05, + "loss": 0.8551, + "step": 7321 + }, + { + "epoch": 1.2434790063656043, + "grad_norm": 1.71875, + "learning_rate": 1.263409668439916e-05, + "loss": 0.954, + "step": 7322 + }, + { + "epoch": 1.243650470454594, + "grad_norm": 1.7421875, + "learning_rate": 1.2632354491320342e-05, + "loss": 0.8462, + "step": 7323 + }, + { + "epoch": 1.243821934543584, + "grad_norm": 1.734375, + "learning_rate": 1.2630612212390534e-05, + "loss": 0.8168, + "step": 7324 + }, + { + "epoch": 1.243993398632574, + "grad_norm": 1.578125, + "learning_rate": 1.2628869847666556e-05, + "loss": 0.7898, + "step": 7325 + }, + { + "epoch": 1.2441648627215638, + "grad_norm": 1.6328125, + "learning_rate": 1.2627127397205232e-05, + "loss": 0.803, + "step": 7326 + }, + { + "epoch": 1.2443363268105536, + "grad_norm": 1.7578125, + "learning_rate": 1.262538486106339e-05, + "loss": 0.8628, + "step": 7327 + }, + { + "epoch": 1.2445077908995434, + "grad_norm": 1.703125, + "learning_rate": 1.2623642239297862e-05, + "loss": 0.8727, + "step": 7328 + }, + { + "epoch": 1.2446792549885333, + "grad_norm": 1.7265625, + "learning_rate": 1.2621899531965477e-05, + "loss": 0.9508, + "step": 7329 + }, + { + "epoch": 1.2448507190775233, + "grad_norm": 1.6796875, + "learning_rate": 1.2620156739123079e-05, + "loss": 0.8475, + "step": 7330 + }, + { + "epoch": 1.2450221831665131, + "grad_norm": 1.6796875, + "learning_rate": 1.2618413860827506e-05, + "loss": 0.8502, + "step": 7331 + }, + { + "epoch": 1.245193647255503, + "grad_norm": 1.609375, + "learning_rate": 1.2616670897135592e-05, + "loss": 0.7271, + "step": 7332 + }, + { + "epoch": 1.2453651113444928, + "grad_norm": 1.6875, + "learning_rate": 1.2614927848104189e-05, + "loss": 0.851, + "step": 7333 + }, + { + "epoch": 1.2455365754334826, + "grad_norm": 1.59375, + "learning_rate": 1.261318471379014e-05, + "loss": 0.7724, + "step": 7334 + }, 
+ { + "epoch": 1.2457080395224724, + "grad_norm": 1.71875, + "learning_rate": 1.2611441494250299e-05, + "loss": 0.8478, + "step": 7335 + }, + { + "epoch": 1.2458795036114623, + "grad_norm": 1.6640625, + "learning_rate": 1.260969818954152e-05, + "loss": 0.8933, + "step": 7336 + }, + { + "epoch": 1.2460509677004523, + "grad_norm": 1.609375, + "learning_rate": 1.2607954799720647e-05, + "loss": 0.8304, + "step": 7337 + }, + { + "epoch": 1.2462224317894421, + "grad_norm": 1.6796875, + "learning_rate": 1.2606211324844555e-05, + "loss": 0.8571, + "step": 7338 + }, + { + "epoch": 1.246393895878432, + "grad_norm": 1.6875, + "learning_rate": 1.2604467764970094e-05, + "loss": 0.8683, + "step": 7339 + }, + { + "epoch": 1.2465653599674218, + "grad_norm": 1.671875, + "learning_rate": 1.2602724120154134e-05, + "loss": 0.8936, + "step": 7340 + }, + { + "epoch": 1.2467368240564116, + "grad_norm": 1.7109375, + "learning_rate": 1.2600980390453537e-05, + "loss": 0.9597, + "step": 7341 + }, + { + "epoch": 1.2469082881454017, + "grad_norm": 1.71875, + "learning_rate": 1.2599236575925178e-05, + "loss": 0.8553, + "step": 7342 + }, + { + "epoch": 1.2470797522343915, + "grad_norm": 1.6328125, + "learning_rate": 1.2597492676625925e-05, + "loss": 0.806, + "step": 7343 + }, + { + "epoch": 1.2472512163233813, + "grad_norm": 1.8125, + "learning_rate": 1.2595748692612654e-05, + "loss": 0.81, + "step": 7344 + }, + { + "epoch": 1.2474226804123711, + "grad_norm": 1.6953125, + "learning_rate": 1.2594004623942244e-05, + "loss": 0.8586, + "step": 7345 + }, + { + "epoch": 1.247594144501361, + "grad_norm": 1.6640625, + "learning_rate": 1.2592260470671576e-05, + "loss": 0.8506, + "step": 7346 + }, + { + "epoch": 1.2477656085903508, + "grad_norm": 1.703125, + "learning_rate": 1.259051623285753e-05, + "loss": 0.9173, + "step": 7347 + }, + { + "epoch": 1.2479370726793406, + "grad_norm": 1.7265625, + "learning_rate": 1.2588771910556996e-05, + "loss": 0.8443, + "step": 7348 + }, + { + "epoch": 1.2481085367683307, + "grad_norm": 1.7109375, + "learning_rate": 1.2587027503826863e-05, + "loss": 0.8387, + "step": 7349 + }, + { + "epoch": 1.2482800008573205, + "grad_norm": 1.59375, + "learning_rate": 1.2585283012724019e-05, + "loss": 0.852, + "step": 7350 + }, + { + "epoch": 1.2484514649463103, + "grad_norm": 1.5625, + "learning_rate": 1.2583538437305363e-05, + "loss": 0.8271, + "step": 7351 + }, + { + "epoch": 1.2486229290353001, + "grad_norm": 1.609375, + "learning_rate": 1.2581793777627787e-05, + "loss": 0.9076, + "step": 7352 + }, + { + "epoch": 1.24879439312429, + "grad_norm": 1.609375, + "learning_rate": 1.2580049033748195e-05, + "loss": 0.9031, + "step": 7353 + }, + { + "epoch": 1.24896585721328, + "grad_norm": 1.703125, + "learning_rate": 1.257830420572349e-05, + "loss": 0.9212, + "step": 7354 + }, + { + "epoch": 1.2491373213022698, + "grad_norm": 1.671875, + "learning_rate": 1.2576559293610575e-05, + "loss": 0.8859, + "step": 7355 + }, + { + "epoch": 1.2493087853912597, + "grad_norm": 1.7109375, + "learning_rate": 1.257481429746636e-05, + "loss": 0.815, + "step": 7356 + }, + { + "epoch": 1.2494802494802495, + "grad_norm": 1.734375, + "learning_rate": 1.2573069217347751e-05, + "loss": 0.9199, + "step": 7357 + }, + { + "epoch": 1.2496517135692393, + "grad_norm": 1.6875, + "learning_rate": 1.257132405331167e-05, + "loss": 0.7968, + "step": 7358 + }, + { + "epoch": 1.2498231776582291, + "grad_norm": 1.6875, + "learning_rate": 1.2569578805415026e-05, + "loss": 0.8123, + "step": 7359 + }, + { + "epoch": 1.249994641747219, + "grad_norm": 
1.5625, + "learning_rate": 1.2567833473714743e-05, + "loss": 0.8134, + "step": 7360 + }, + { + "epoch": 1.2501661058362088, + "grad_norm": 1.796875, + "learning_rate": 1.2566088058267737e-05, + "loss": 0.8872, + "step": 7361 + }, + { + "epoch": 1.2503375699251988, + "grad_norm": 1.671875, + "learning_rate": 1.256434255913094e-05, + "loss": 0.8235, + "step": 7362 + }, + { + "epoch": 1.2505090340141887, + "grad_norm": 1.59375, + "learning_rate": 1.2562596976361276e-05, + "loss": 0.8185, + "step": 7363 + }, + { + "epoch": 1.2506804981031785, + "grad_norm": 1.609375, + "learning_rate": 1.2560851310015674e-05, + "loss": 0.861, + "step": 7364 + }, + { + "epoch": 1.2508519621921683, + "grad_norm": 1.6953125, + "learning_rate": 1.2559105560151065e-05, + "loss": 0.8713, + "step": 7365 + }, + { + "epoch": 1.2510234262811584, + "grad_norm": 1.6640625, + "learning_rate": 1.2557359726824392e-05, + "loss": 0.8448, + "step": 7366 + }, + { + "epoch": 1.2511948903701482, + "grad_norm": 1.6875, + "learning_rate": 1.2555613810092585e-05, + "loss": 0.8949, + "step": 7367 + }, + { + "epoch": 1.251366354459138, + "grad_norm": 1.6328125, + "learning_rate": 1.2553867810012588e-05, + "loss": 0.8536, + "step": 7368 + }, + { + "epoch": 1.2515378185481278, + "grad_norm": 1.703125, + "learning_rate": 1.2552121726641344e-05, + "loss": 0.8656, + "step": 7369 + }, + { + "epoch": 1.2517092826371177, + "grad_norm": 1.7734375, + "learning_rate": 1.2550375560035804e-05, + "loss": 0.8059, + "step": 7370 + }, + { + "epoch": 1.2518807467261075, + "grad_norm": 1.6015625, + "learning_rate": 1.2548629310252912e-05, + "loss": 0.8736, + "step": 7371 + }, + { + "epoch": 1.2520522108150973, + "grad_norm": 1.7421875, + "learning_rate": 1.254688297734962e-05, + "loss": 0.9381, + "step": 7372 + }, + { + "epoch": 1.2522236749040871, + "grad_norm": 1.65625, + "learning_rate": 1.254513656138288e-05, + "loss": 0.8456, + "step": 7373 + }, + { + "epoch": 1.2523951389930772, + "grad_norm": 1.609375, + "learning_rate": 1.2543390062409655e-05, + "loss": 0.8379, + "step": 7374 + }, + { + "epoch": 1.252566603082067, + "grad_norm": 1.6796875, + "learning_rate": 1.2541643480486905e-05, + "loss": 0.868, + "step": 7375 + }, + { + "epoch": 1.2527380671710568, + "grad_norm": 1.71875, + "learning_rate": 1.2539896815671585e-05, + "loss": 0.8134, + "step": 7376 + }, + { + "epoch": 1.2529095312600467, + "grad_norm": 1.7578125, + "learning_rate": 1.2538150068020672e-05, + "loss": 0.8924, + "step": 7377 + }, + { + "epoch": 1.2530809953490367, + "grad_norm": 1.8046875, + "learning_rate": 1.2536403237591125e-05, + "loss": 0.9193, + "step": 7378 + }, + { + "epoch": 1.2532524594380265, + "grad_norm": 1.8046875, + "learning_rate": 1.2534656324439912e-05, + "loss": 0.8861, + "step": 7379 + }, + { + "epoch": 1.2534239235270164, + "grad_norm": 1.6328125, + "learning_rate": 1.253290932862402e-05, + "loss": 0.8459, + "step": 7380 + }, + { + "epoch": 1.2535953876160062, + "grad_norm": 1.7109375, + "learning_rate": 1.2531162250200411e-05, + "loss": 0.9036, + "step": 7381 + }, + { + "epoch": 1.253766851704996, + "grad_norm": 1.6875, + "learning_rate": 1.2529415089226073e-05, + "loss": 0.9044, + "step": 7382 + }, + { + "epoch": 1.2539383157939858, + "grad_norm": 1.7421875, + "learning_rate": 1.2527667845757983e-05, + "loss": 0.8205, + "step": 7383 + }, + { + "epoch": 1.2541097798829757, + "grad_norm": 1.7265625, + "learning_rate": 1.2525920519853123e-05, + "loss": 0.8355, + "step": 7384 + }, + { + "epoch": 1.2542812439719655, + "grad_norm": 1.625, + "learning_rate": 
1.2524173111568485e-05, + "loss": 0.8957, + "step": 7385 + }, + { + "epoch": 1.2544527080609555, + "grad_norm": 1.6875, + "learning_rate": 1.2522425620961059e-05, + "loss": 0.8491, + "step": 7386 + }, + { + "epoch": 1.2546241721499454, + "grad_norm": 1.6640625, + "learning_rate": 1.252067804808783e-05, + "loss": 0.7969, + "step": 7387 + }, + { + "epoch": 1.2547956362389352, + "grad_norm": 1.7265625, + "learning_rate": 1.2518930393005807e-05, + "loss": 0.8935, + "step": 7388 + }, + { + "epoch": 1.254967100327925, + "grad_norm": 1.65625, + "learning_rate": 1.2517182655771971e-05, + "loss": 0.814, + "step": 7389 + }, + { + "epoch": 1.255138564416915, + "grad_norm": 1.6484375, + "learning_rate": 1.2515434836443331e-05, + "loss": 0.8395, + "step": 7390 + }, + { + "epoch": 1.255310028505905, + "grad_norm": 1.640625, + "learning_rate": 1.251368693507689e-05, + "loss": 0.8807, + "step": 7391 + }, + { + "epoch": 1.2554814925948947, + "grad_norm": 1.7109375, + "learning_rate": 1.251193895172965e-05, + "loss": 0.9083, + "step": 7392 + }, + { + "epoch": 1.2556529566838845, + "grad_norm": 1.6796875, + "learning_rate": 1.2510190886458625e-05, + "loss": 0.9081, + "step": 7393 + }, + { + "epoch": 1.2558244207728744, + "grad_norm": 1.6015625, + "learning_rate": 1.250844273932082e-05, + "loss": 0.8703, + "step": 7394 + }, + { + "epoch": 1.2559958848618642, + "grad_norm": 1.5625, + "learning_rate": 1.2506694510373252e-05, + "loss": 0.8511, + "step": 7395 + }, + { + "epoch": 1.256167348950854, + "grad_norm": 1.671875, + "learning_rate": 1.2504946199672935e-05, + "loss": 0.8408, + "step": 7396 + }, + { + "epoch": 1.2563388130398438, + "grad_norm": 1.75, + "learning_rate": 1.2503197807276894e-05, + "loss": 0.9484, + "step": 7397 + }, + { + "epoch": 1.256510277128834, + "grad_norm": 1.6171875, + "learning_rate": 1.2501449333242144e-05, + "loss": 0.8141, + "step": 7398 + }, + { + "epoch": 1.2566817412178237, + "grad_norm": 1.7109375, + "learning_rate": 1.2499700777625709e-05, + "loss": 0.9525, + "step": 7399 + }, + { + "epoch": 1.2568532053068135, + "grad_norm": 1.671875, + "learning_rate": 1.2497952140484624e-05, + "loss": 0.9133, + "step": 7400 + }, + { + "epoch": 1.2570246693958034, + "grad_norm": 1.71875, + "learning_rate": 1.2496203421875912e-05, + "loss": 0.8465, + "step": 7401 + }, + { + "epoch": 1.2571961334847932, + "grad_norm": 1.71875, + "learning_rate": 1.2494454621856604e-05, + "loss": 0.8892, + "step": 7402 + }, + { + "epoch": 1.2573675975737832, + "grad_norm": 1.6875, + "learning_rate": 1.249270574048374e-05, + "loss": 0.8771, + "step": 7403 + }, + { + "epoch": 1.257539061662773, + "grad_norm": 1.625, + "learning_rate": 1.249095677781435e-05, + "loss": 0.9029, + "step": 7404 + }, + { + "epoch": 1.257710525751763, + "grad_norm": 1.6640625, + "learning_rate": 1.2489207733905485e-05, + "loss": 0.8047, + "step": 7405 + }, + { + "epoch": 1.2578819898407527, + "grad_norm": 1.703125, + "learning_rate": 1.248745860881418e-05, + "loss": 0.8067, + "step": 7406 + }, + { + "epoch": 1.2580534539297425, + "grad_norm": 1.5859375, + "learning_rate": 1.2485709402597483e-05, + "loss": 0.8371, + "step": 7407 + }, + { + "epoch": 1.2582249180187324, + "grad_norm": 1.78125, + "learning_rate": 1.2483960115312443e-05, + "loss": 0.9358, + "step": 7408 + }, + { + "epoch": 1.2583963821077222, + "grad_norm": 1.6015625, + "learning_rate": 1.248221074701611e-05, + "loss": 0.8565, + "step": 7409 + }, + { + "epoch": 1.2585678461967122, + "grad_norm": 1.8359375, + "learning_rate": 1.2480461297765536e-05, + "loss": 0.8881, + 
"step": 7410 + }, + { + "epoch": 1.258739310285702, + "grad_norm": 1.671875, + "learning_rate": 1.2478711767617782e-05, + "loss": 0.8481, + "step": 7411 + }, + { + "epoch": 1.258910774374692, + "grad_norm": 1.796875, + "learning_rate": 1.2476962156629897e-05, + "loss": 0.8353, + "step": 7412 + }, + { + "epoch": 1.2590822384636817, + "grad_norm": 1.7421875, + "learning_rate": 1.2475212464858952e-05, + "loss": 0.851, + "step": 7413 + }, + { + "epoch": 1.2592537025526715, + "grad_norm": 1.7734375, + "learning_rate": 1.2473462692362008e-05, + "loss": 0.8653, + "step": 7414 + }, + { + "epoch": 1.2594251666416616, + "grad_norm": 1.6328125, + "learning_rate": 1.2471712839196133e-05, + "loss": 0.7982, + "step": 7415 + }, + { + "epoch": 1.2595966307306514, + "grad_norm": 1.6328125, + "learning_rate": 1.2469962905418393e-05, + "loss": 0.8258, + "step": 7416 + }, + { + "epoch": 1.2597680948196412, + "grad_norm": 1.640625, + "learning_rate": 1.2468212891085862e-05, + "loss": 0.8672, + "step": 7417 + }, + { + "epoch": 1.259939558908631, + "grad_norm": 1.671875, + "learning_rate": 1.2466462796255617e-05, + "loss": 0.7899, + "step": 7418 + }, + { + "epoch": 1.260111022997621, + "grad_norm": 1.6484375, + "learning_rate": 1.2464712620984727e-05, + "loss": 0.829, + "step": 7419 + }, + { + "epoch": 1.2602824870866107, + "grad_norm": 1.703125, + "learning_rate": 1.246296236533028e-05, + "loss": 0.8621, + "step": 7420 + }, + { + "epoch": 1.2604539511756006, + "grad_norm": 1.671875, + "learning_rate": 1.2461212029349354e-05, + "loss": 0.8306, + "step": 7421 + }, + { + "epoch": 1.2606254152645906, + "grad_norm": 1.7890625, + "learning_rate": 1.2459461613099037e-05, + "loss": 0.8217, + "step": 7422 + }, + { + "epoch": 1.2607968793535804, + "grad_norm": 1.6484375, + "learning_rate": 1.2457711116636416e-05, + "loss": 0.8073, + "step": 7423 + }, + { + "epoch": 1.2609683434425702, + "grad_norm": 1.671875, + "learning_rate": 1.2455960540018579e-05, + "loss": 0.8963, + "step": 7424 + }, + { + "epoch": 1.26113980753156, + "grad_norm": 1.6640625, + "learning_rate": 1.2454209883302622e-05, + "loss": 0.8579, + "step": 7425 + }, + { + "epoch": 1.26131127162055, + "grad_norm": 1.7265625, + "learning_rate": 1.2452459146545637e-05, + "loss": 0.8723, + "step": 7426 + }, + { + "epoch": 1.26148273570954, + "grad_norm": 1.7734375, + "learning_rate": 1.2450708329804724e-05, + "loss": 0.833, + "step": 7427 + }, + { + "epoch": 1.2616541997985298, + "grad_norm": 1.6328125, + "learning_rate": 1.2448957433136985e-05, + "loss": 0.8809, + "step": 7428 + }, + { + "epoch": 1.2618256638875196, + "grad_norm": 1.6796875, + "learning_rate": 1.2447206456599525e-05, + "loss": 0.8621, + "step": 7429 + }, + { + "epoch": 1.2619971279765094, + "grad_norm": 1.6796875, + "learning_rate": 1.2445455400249444e-05, + "loss": 0.8685, + "step": 7430 + }, + { + "epoch": 1.2621685920654993, + "grad_norm": 1.6875, + "learning_rate": 1.2443704264143852e-05, + "loss": 0.8792, + "step": 7431 + }, + { + "epoch": 1.262340056154489, + "grad_norm": 1.75, + "learning_rate": 1.2441953048339866e-05, + "loss": 0.8524, + "step": 7432 + }, + { + "epoch": 1.262511520243479, + "grad_norm": 1.703125, + "learning_rate": 1.2440201752894592e-05, + "loss": 0.832, + "step": 7433 + }, + { + "epoch": 1.262682984332469, + "grad_norm": 1.703125, + "learning_rate": 1.2438450377865153e-05, + "loss": 0.8459, + "step": 7434 + }, + { + "epoch": 1.2628544484214588, + "grad_norm": 1.5546875, + "learning_rate": 1.2436698923308662e-05, + "loss": 0.7745, + "step": 7435 + }, + { + "epoch": 
1.2630259125104486, + "grad_norm": 1.703125, + "learning_rate": 1.2434947389282247e-05, + "loss": 0.8945, + "step": 7436 + }, + { + "epoch": 1.2631973765994384, + "grad_norm": 1.6875, + "learning_rate": 1.2433195775843026e-05, + "loss": 0.8573, + "step": 7437 + }, + { + "epoch": 1.2633688406884283, + "grad_norm": 1.6953125, + "learning_rate": 1.2431444083048128e-05, + "loss": 0.8573, + "step": 7438 + }, + { + "epoch": 1.2635403047774183, + "grad_norm": 1.765625, + "learning_rate": 1.2429692310954682e-05, + "loss": 0.8684, + "step": 7439 + }, + { + "epoch": 1.2637117688664081, + "grad_norm": 1.6484375, + "learning_rate": 1.2427940459619821e-05, + "loss": 0.8501, + "step": 7440 + }, + { + "epoch": 1.263883232955398, + "grad_norm": 1.6953125, + "learning_rate": 1.2426188529100677e-05, + "loss": 0.9016, + "step": 7441 + }, + { + "epoch": 1.2640546970443878, + "grad_norm": 1.7109375, + "learning_rate": 1.2424436519454393e-05, + "loss": 0.8824, + "step": 7442 + }, + { + "epoch": 1.2642261611333776, + "grad_norm": 1.734375, + "learning_rate": 1.24226844307381e-05, + "loss": 0.9148, + "step": 7443 + }, + { + "epoch": 1.2643976252223674, + "grad_norm": 1.71875, + "learning_rate": 1.2420932263008947e-05, + "loss": 0.8463, + "step": 7444 + }, + { + "epoch": 1.2645690893113573, + "grad_norm": 1.5859375, + "learning_rate": 1.2419180016324076e-05, + "loss": 0.8036, + "step": 7445 + }, + { + "epoch": 1.264740553400347, + "grad_norm": 1.703125, + "learning_rate": 1.241742769074064e-05, + "loss": 0.92, + "step": 7446 + }, + { + "epoch": 1.2649120174893371, + "grad_norm": 1.609375, + "learning_rate": 1.2415675286315778e-05, + "loss": 0.862, + "step": 7447 + }, + { + "epoch": 1.265083481578327, + "grad_norm": 1.5703125, + "learning_rate": 1.2413922803106649e-05, + "loss": 0.7859, + "step": 7448 + }, + { + "epoch": 1.2652549456673168, + "grad_norm": 1.6015625, + "learning_rate": 1.2412170241170406e-05, + "loss": 0.8329, + "step": 7449 + }, + { + "epoch": 1.2654264097563066, + "grad_norm": 1.6171875, + "learning_rate": 1.2410417600564212e-05, + "loss": 0.8083, + "step": 7450 + }, + { + "epoch": 1.2655978738452967, + "grad_norm": 1.734375, + "learning_rate": 1.2408664881345219e-05, + "loss": 0.9278, + "step": 7451 + }, + { + "epoch": 1.2657693379342865, + "grad_norm": 1.75, + "learning_rate": 1.2406912083570599e-05, + "loss": 0.8668, + "step": 7452 + }, + { + "epoch": 1.2659408020232763, + "grad_norm": 1.71875, + "learning_rate": 1.2405159207297508e-05, + "loss": 0.9055, + "step": 7453 + }, + { + "epoch": 1.2661122661122661, + "grad_norm": 1.6640625, + "learning_rate": 1.2403406252583121e-05, + "loss": 0.874, + "step": 7454 + }, + { + "epoch": 1.266283730201256, + "grad_norm": 1.6796875, + "learning_rate": 1.2401653219484604e-05, + "loss": 0.9812, + "step": 7455 + }, + { + "epoch": 1.2664551942902458, + "grad_norm": 1.65625, + "learning_rate": 1.2399900108059134e-05, + "loss": 0.86, + "step": 7456 + }, + { + "epoch": 1.2666266583792356, + "grad_norm": 1.5625, + "learning_rate": 1.2398146918363882e-05, + "loss": 0.7916, + "step": 7457 + }, + { + "epoch": 1.2667981224682254, + "grad_norm": 1.734375, + "learning_rate": 1.239639365045603e-05, + "loss": 0.9295, + "step": 7458 + }, + { + "epoch": 1.2669695865572155, + "grad_norm": 1.6875, + "learning_rate": 1.2394640304392753e-05, + "loss": 0.8695, + "step": 7459 + }, + { + "epoch": 1.2671410506462053, + "grad_norm": 1.6875, + "learning_rate": 1.2392886880231243e-05, + "loss": 0.8425, + "step": 7460 + }, + { + "epoch": 1.2673125147351951, + "grad_norm": 1.5703125, + 
"learning_rate": 1.239113337802868e-05, + "loss": 0.7969, + "step": 7461 + }, + { + "epoch": 1.267483978824185, + "grad_norm": 1.6796875, + "learning_rate": 1.2389379797842252e-05, + "loss": 0.8745, + "step": 7462 + }, + { + "epoch": 1.267655442913175, + "grad_norm": 1.7109375, + "learning_rate": 1.2387626139729155e-05, + "loss": 0.8732, + "step": 7463 + }, + { + "epoch": 1.2678269070021648, + "grad_norm": 1.7109375, + "learning_rate": 1.2385872403746575e-05, + "loss": 0.8747, + "step": 7464 + }, + { + "epoch": 1.2679983710911547, + "grad_norm": 1.6796875, + "learning_rate": 1.238411858995171e-05, + "loss": 0.8893, + "step": 7465 + }, + { + "epoch": 1.2681698351801445, + "grad_norm": 1.625, + "learning_rate": 1.2382364698401764e-05, + "loss": 0.7905, + "step": 7466 + }, + { + "epoch": 1.2683412992691343, + "grad_norm": 1.7734375, + "learning_rate": 1.238061072915393e-05, + "loss": 0.8982, + "step": 7467 + }, + { + "epoch": 1.2685127633581241, + "grad_norm": 1.671875, + "learning_rate": 1.2378856682265419e-05, + "loss": 0.7652, + "step": 7468 + }, + { + "epoch": 1.268684227447114, + "grad_norm": 1.7421875, + "learning_rate": 1.2377102557793433e-05, + "loss": 0.8618, + "step": 7469 + }, + { + "epoch": 1.2688556915361038, + "grad_norm": 1.671875, + "learning_rate": 1.2375348355795181e-05, + "loss": 0.8758, + "step": 7470 + }, + { + "epoch": 1.2690271556250938, + "grad_norm": 1.71875, + "learning_rate": 1.2373594076327873e-05, + "loss": 0.8996, + "step": 7471 + }, + { + "epoch": 1.2691986197140837, + "grad_norm": 1.6328125, + "learning_rate": 1.2371839719448725e-05, + "loss": 0.8519, + "step": 7472 + }, + { + "epoch": 1.2693700838030735, + "grad_norm": 1.6640625, + "learning_rate": 1.2370085285214953e-05, + "loss": 0.8879, + "step": 7473 + }, + { + "epoch": 1.2695415478920633, + "grad_norm": 1.6953125, + "learning_rate": 1.2368330773683774e-05, + "loss": 0.8703, + "step": 7474 + }, + { + "epoch": 1.2697130119810534, + "grad_norm": 1.6484375, + "learning_rate": 1.236657618491241e-05, + "loss": 0.8536, + "step": 7475 + }, + { + "epoch": 1.2698844760700432, + "grad_norm": 1.75, + "learning_rate": 1.2364821518958088e-05, + "loss": 0.8663, + "step": 7476 + }, + { + "epoch": 1.270055940159033, + "grad_norm": 1.546875, + "learning_rate": 1.2363066775878028e-05, + "loss": 0.8806, + "step": 7477 + }, + { + "epoch": 1.2702274042480228, + "grad_norm": 1.625, + "learning_rate": 1.236131195572946e-05, + "loss": 0.7836, + "step": 7478 + }, + { + "epoch": 1.2703988683370127, + "grad_norm": 1.6328125, + "learning_rate": 1.235955705856962e-05, + "loss": 0.7843, + "step": 7479 + }, + { + "epoch": 1.2705703324260025, + "grad_norm": 1.5703125, + "learning_rate": 1.2357802084455738e-05, + "loss": 0.8683, + "step": 7480 + }, + { + "epoch": 1.2707417965149923, + "grad_norm": 1.65625, + "learning_rate": 1.2356047033445055e-05, + "loss": 0.9125, + "step": 7481 + }, + { + "epoch": 1.2709132606039821, + "grad_norm": 1.6875, + "learning_rate": 1.2354291905594801e-05, + "loss": 0.8758, + "step": 7482 + }, + { + "epoch": 1.2710847246929722, + "grad_norm": 1.6171875, + "learning_rate": 1.2352536700962228e-05, + "loss": 0.8316, + "step": 7483 + }, + { + "epoch": 1.271256188781962, + "grad_norm": 1.609375, + "learning_rate": 1.2350781419604569e-05, + "loss": 0.8318, + "step": 7484 + }, + { + "epoch": 1.2714276528709518, + "grad_norm": 1.75, + "learning_rate": 1.234902606157908e-05, + "loss": 0.9431, + "step": 7485 + }, + { + "epoch": 1.2715991169599417, + "grad_norm": 1.7890625, + "learning_rate": 1.2347270626943002e-05, + 
"loss": 0.9024, + "step": 7486 + }, + { + "epoch": 1.2717705810489317, + "grad_norm": 1.7890625, + "learning_rate": 1.234551511575359e-05, + "loss": 0.945, + "step": 7487 + }, + { + "epoch": 1.2719420451379215, + "grad_norm": 1.6328125, + "learning_rate": 1.23437595280681e-05, + "loss": 0.8238, + "step": 7488 + }, + { + "epoch": 1.2721135092269114, + "grad_norm": 1.71875, + "learning_rate": 1.2342003863943785e-05, + "loss": 0.8964, + "step": 7489 + }, + { + "epoch": 1.2722849733159012, + "grad_norm": 1.640625, + "learning_rate": 1.2340248123437904e-05, + "loss": 0.8467, + "step": 7490 + }, + { + "epoch": 1.272456437404891, + "grad_norm": 1.7265625, + "learning_rate": 1.2338492306607721e-05, + "loss": 0.8657, + "step": 7491 + }, + { + "epoch": 1.2726279014938808, + "grad_norm": 1.75, + "learning_rate": 1.2336736413510497e-05, + "loss": 0.8956, + "step": 7492 + }, + { + "epoch": 1.2727993655828707, + "grad_norm": 1.609375, + "learning_rate": 1.23349804442035e-05, + "loss": 0.8019, + "step": 7493 + }, + { + "epoch": 1.2729708296718605, + "grad_norm": 1.6796875, + "learning_rate": 1.2333224398743997e-05, + "loss": 0.8398, + "step": 7494 + }, + { + "epoch": 1.2731422937608505, + "grad_norm": 1.65625, + "learning_rate": 1.233146827718926e-05, + "loss": 0.8467, + "step": 7495 + }, + { + "epoch": 1.2733137578498404, + "grad_norm": 1.6953125, + "learning_rate": 1.2329712079596563e-05, + "loss": 0.9317, + "step": 7496 + }, + { + "epoch": 1.2734852219388302, + "grad_norm": 1.71875, + "learning_rate": 1.2327955806023181e-05, + "loss": 0.8621, + "step": 7497 + }, + { + "epoch": 1.27365668602782, + "grad_norm": 1.625, + "learning_rate": 1.2326199456526397e-05, + "loss": 0.8108, + "step": 7498 + }, + { + "epoch": 1.2738281501168098, + "grad_norm": 1.6875, + "learning_rate": 1.2324443031163487e-05, + "loss": 0.8021, + "step": 7499 + }, + { + "epoch": 1.2739996142057999, + "grad_norm": 1.734375, + "learning_rate": 1.2322686529991739e-05, + "loss": 0.8839, + "step": 7500 + }, + { + "epoch": 1.2741710782947897, + "grad_norm": 1.7421875, + "learning_rate": 1.2320929953068435e-05, + "loss": 0.883, + "step": 7501 + }, + { + "epoch": 1.2743425423837795, + "grad_norm": 1.625, + "learning_rate": 1.2319173300450864e-05, + "loss": 0.8737, + "step": 7502 + }, + { + "epoch": 1.2745140064727694, + "grad_norm": 1.53125, + "learning_rate": 1.231741657219632e-05, + "loss": 0.797, + "step": 7503 + }, + { + "epoch": 1.2746854705617592, + "grad_norm": 1.6015625, + "learning_rate": 1.2315659768362097e-05, + "loss": 0.812, + "step": 7504 + }, + { + "epoch": 1.274856934650749, + "grad_norm": 1.65625, + "learning_rate": 1.2313902889005486e-05, + "loss": 0.9194, + "step": 7505 + }, + { + "epoch": 1.2750283987397388, + "grad_norm": 1.6640625, + "learning_rate": 1.2312145934183788e-05, + "loss": 0.9029, + "step": 7506 + }, + { + "epoch": 1.2751998628287289, + "grad_norm": 1.6640625, + "learning_rate": 1.2310388903954304e-05, + "loss": 0.8317, + "step": 7507 + }, + { + "epoch": 1.2753713269177187, + "grad_norm": 1.578125, + "learning_rate": 1.2308631798374339e-05, + "loss": 0.8674, + "step": 7508 + }, + { + "epoch": 1.2755427910067085, + "grad_norm": 1.625, + "learning_rate": 1.23068746175012e-05, + "loss": 0.7913, + "step": 7509 + }, + { + "epoch": 1.2757142550956984, + "grad_norm": 1.671875, + "learning_rate": 1.2305117361392192e-05, + "loss": 0.8466, + "step": 7510 + }, + { + "epoch": 1.2758857191846882, + "grad_norm": 1.625, + "learning_rate": 1.2303360030104625e-05, + "loss": 0.8082, + "step": 7511 + }, + { + "epoch": 
1.2760571832736782, + "grad_norm": 1.8125, + "learning_rate": 1.2301602623695814e-05, + "loss": 0.9373, + "step": 7512 + }, + { + "epoch": 1.276228647362668, + "grad_norm": 1.6484375, + "learning_rate": 1.2299845142223075e-05, + "loss": 0.8069, + "step": 7513 + }, + { + "epoch": 1.2764001114516579, + "grad_norm": 1.65625, + "learning_rate": 1.2298087585743725e-05, + "loss": 0.8812, + "step": 7514 + }, + { + "epoch": 1.2765715755406477, + "grad_norm": 1.71875, + "learning_rate": 1.2296329954315085e-05, + "loss": 0.8012, + "step": 7515 + }, + { + "epoch": 1.2767430396296375, + "grad_norm": 1.6875, + "learning_rate": 1.2294572247994478e-05, + "loss": 0.8418, + "step": 7516 + }, + { + "epoch": 1.2769145037186274, + "grad_norm": 1.7265625, + "learning_rate": 1.229281446683923e-05, + "loss": 0.9418, + "step": 7517 + }, + { + "epoch": 1.2770859678076172, + "grad_norm": 1.71875, + "learning_rate": 1.2291056610906666e-05, + "loss": 0.9058, + "step": 7518 + }, + { + "epoch": 1.2772574318966072, + "grad_norm": 1.625, + "learning_rate": 1.228929868025412e-05, + "loss": 0.8718, + "step": 7519 + }, + { + "epoch": 1.277428895985597, + "grad_norm": 1.7421875, + "learning_rate": 1.2287540674938925e-05, + "loss": 0.8379, + "step": 7520 + }, + { + "epoch": 1.2776003600745869, + "grad_norm": 1.578125, + "learning_rate": 1.2285782595018417e-05, + "loss": 0.8615, + "step": 7521 + }, + { + "epoch": 1.2777718241635767, + "grad_norm": 1.7109375, + "learning_rate": 1.2284024440549924e-05, + "loss": 0.9499, + "step": 7522 + }, + { + "epoch": 1.2779432882525665, + "grad_norm": 1.59375, + "learning_rate": 1.2282266211590798e-05, + "loss": 0.8202, + "step": 7523 + }, + { + "epoch": 1.2781147523415566, + "grad_norm": 1.703125, + "learning_rate": 1.2280507908198375e-05, + "loss": 0.9373, + "step": 7524 + }, + { + "epoch": 1.2782862164305464, + "grad_norm": 1.6953125, + "learning_rate": 1.227874953043e-05, + "loss": 0.8337, + "step": 7525 + }, + { + "epoch": 1.2784576805195362, + "grad_norm": 1.65625, + "learning_rate": 1.2276991078343024e-05, + "loss": 0.8789, + "step": 7526 + }, + { + "epoch": 1.278629144608526, + "grad_norm": 1.625, + "learning_rate": 1.2275232551994795e-05, + "loss": 0.8343, + "step": 7527 + }, + { + "epoch": 1.2788006086975159, + "grad_norm": 1.6640625, + "learning_rate": 1.2273473951442663e-05, + "loss": 0.8961, + "step": 7528 + }, + { + "epoch": 1.2789720727865057, + "grad_norm": 1.7265625, + "learning_rate": 1.2271715276743984e-05, + "loss": 0.8509, + "step": 7529 + }, + { + "epoch": 1.2791435368754955, + "grad_norm": 1.625, + "learning_rate": 1.2269956527956118e-05, + "loss": 0.8277, + "step": 7530 + }, + { + "epoch": 1.2793150009644856, + "grad_norm": 1.7421875, + "learning_rate": 1.226819770513642e-05, + "loss": 0.8797, + "step": 7531 + }, + { + "epoch": 1.2794864650534754, + "grad_norm": 1.75, + "learning_rate": 1.2266438808342257e-05, + "loss": 0.8979, + "step": 7532 + }, + { + "epoch": 1.2796579291424652, + "grad_norm": 1.8359375, + "learning_rate": 1.2264679837630984e-05, + "loss": 0.8799, + "step": 7533 + }, + { + "epoch": 1.279829393231455, + "grad_norm": 1.640625, + "learning_rate": 1.2262920793059976e-05, + "loss": 0.8473, + "step": 7534 + }, + { + "epoch": 1.2800008573204449, + "grad_norm": 1.6640625, + "learning_rate": 1.22611616746866e-05, + "loss": 0.7805, + "step": 7535 + }, + { + "epoch": 1.280172321409435, + "grad_norm": 1.6328125, + "learning_rate": 1.2259402482568226e-05, + "loss": 0.8182, + "step": 7536 + }, + { + "epoch": 1.2803437854984248, + "grad_norm": 1.7421875, + 
"learning_rate": 1.225764321676223e-05, + "loss": 0.8051, + "step": 7537 + }, + { + "epoch": 1.2805152495874146, + "grad_norm": 1.765625, + "learning_rate": 1.2255883877325989e-05, + "loss": 0.7994, + "step": 7538 + }, + { + "epoch": 1.2806867136764044, + "grad_norm": 1.8125, + "learning_rate": 1.2254124464316876e-05, + "loss": 0.9064, + "step": 7539 + }, + { + "epoch": 1.2808581777653942, + "grad_norm": 1.7578125, + "learning_rate": 1.225236497779228e-05, + "loss": 0.9003, + "step": 7540 + }, + { + "epoch": 1.281029641854384, + "grad_norm": 1.765625, + "learning_rate": 1.2250605417809579e-05, + "loss": 0.9243, + "step": 7541 + }, + { + "epoch": 1.2812011059433739, + "grad_norm": 1.65625, + "learning_rate": 1.2248845784426157e-05, + "loss": 0.7572, + "step": 7542 + }, + { + "epoch": 1.2813725700323637, + "grad_norm": 1.640625, + "learning_rate": 1.2247086077699408e-05, + "loss": 0.9036, + "step": 7543 + }, + { + "epoch": 1.2815440341213538, + "grad_norm": 1.703125, + "learning_rate": 1.2245326297686721e-05, + "loss": 0.8296, + "step": 7544 + }, + { + "epoch": 1.2817154982103436, + "grad_norm": 1.71875, + "learning_rate": 1.2243566444445487e-05, + "loss": 0.8512, + "step": 7545 + }, + { + "epoch": 1.2818869622993334, + "grad_norm": 1.671875, + "learning_rate": 1.2241806518033104e-05, + "loss": 0.8552, + "step": 7546 + }, + { + "epoch": 1.2820584263883232, + "grad_norm": 1.734375, + "learning_rate": 1.2240046518506967e-05, + "loss": 0.8765, + "step": 7547 + }, + { + "epoch": 1.2822298904773133, + "grad_norm": 1.515625, + "learning_rate": 1.2238286445924479e-05, + "loss": 0.808, + "step": 7548 + }, + { + "epoch": 1.282401354566303, + "grad_norm": 1.71875, + "learning_rate": 1.2236526300343041e-05, + "loss": 0.8427, + "step": 7549 + }, + { + "epoch": 1.282572818655293, + "grad_norm": 1.6640625, + "learning_rate": 1.2234766081820063e-05, + "loss": 0.8815, + "step": 7550 + }, + { + "epoch": 1.2827442827442828, + "grad_norm": 1.6328125, + "learning_rate": 1.2233005790412942e-05, + "loss": 0.846, + "step": 7551 + }, + { + "epoch": 1.2829157468332726, + "grad_norm": 1.703125, + "learning_rate": 1.2231245426179095e-05, + "loss": 0.8415, + "step": 7552 + }, + { + "epoch": 1.2830872109222624, + "grad_norm": 1.6484375, + "learning_rate": 1.222948498917593e-05, + "loss": 0.8797, + "step": 7553 + }, + { + "epoch": 1.2832586750112522, + "grad_norm": 1.7578125, + "learning_rate": 1.2227724479460866e-05, + "loss": 0.8813, + "step": 7554 + }, + { + "epoch": 1.283430139100242, + "grad_norm": 1.6484375, + "learning_rate": 1.2225963897091317e-05, + "loss": 0.8223, + "step": 7555 + }, + { + "epoch": 1.283601603189232, + "grad_norm": 1.703125, + "learning_rate": 1.2224203242124707e-05, + "loss": 0.8077, + "step": 7556 + }, + { + "epoch": 1.283773067278222, + "grad_norm": 1.703125, + "learning_rate": 1.2222442514618452e-05, + "loss": 0.9232, + "step": 7557 + }, + { + "epoch": 1.2839445313672118, + "grad_norm": 1.6796875, + "learning_rate": 1.2220681714629976e-05, + "loss": 0.8709, + "step": 7558 + }, + { + "epoch": 1.2841159954562016, + "grad_norm": 1.71875, + "learning_rate": 1.2218920842216708e-05, + "loss": 0.877, + "step": 7559 + }, + { + "epoch": 1.2842874595451916, + "grad_norm": 1.609375, + "learning_rate": 1.2217159897436075e-05, + "loss": 0.9244, + "step": 7560 + }, + { + "epoch": 1.2844589236341815, + "grad_norm": 1.6796875, + "learning_rate": 1.2215398880345507e-05, + "loss": 0.9621, + "step": 7561 + }, + { + "epoch": 1.2846303877231713, + "grad_norm": 1.765625, + "learning_rate": 
1.2213637791002443e-05, + "loss": 0.9206, + "step": 7562 + }, + { + "epoch": 1.284801851812161, + "grad_norm": 1.640625, + "learning_rate": 1.2211876629464314e-05, + "loss": 0.86, + "step": 7563 + }, + { + "epoch": 1.284973315901151, + "grad_norm": 1.6484375, + "learning_rate": 1.2210115395788558e-05, + "loss": 0.8369, + "step": 7564 + }, + { + "epoch": 1.2851447799901408, + "grad_norm": 1.625, + "learning_rate": 1.2208354090032615e-05, + "loss": 0.8893, + "step": 7565 + }, + { + "epoch": 1.2853162440791306, + "grad_norm": 1.75, + "learning_rate": 1.2206592712253931e-05, + "loss": 0.8371, + "step": 7566 + }, + { + "epoch": 1.2854877081681204, + "grad_norm": 1.71875, + "learning_rate": 1.220483126250995e-05, + "loss": 0.8877, + "step": 7567 + }, + { + "epoch": 1.2856591722571105, + "grad_norm": 1.7109375, + "learning_rate": 1.220306974085812e-05, + "loss": 0.8343, + "step": 7568 + }, + { + "epoch": 1.2858306363461003, + "grad_norm": 1.7421875, + "learning_rate": 1.2201308147355886e-05, + "loss": 0.927, + "step": 7569 + }, + { + "epoch": 1.2860021004350901, + "grad_norm": 1.734375, + "learning_rate": 1.2199546482060707e-05, + "loss": 0.8311, + "step": 7570 + }, + { + "epoch": 1.28617356452408, + "grad_norm": 1.6328125, + "learning_rate": 1.2197784745030033e-05, + "loss": 0.8742, + "step": 7571 + }, + { + "epoch": 1.28634502861307, + "grad_norm": 1.703125, + "learning_rate": 1.219602293632132e-05, + "loss": 0.8421, + "step": 7572 + }, + { + "epoch": 1.2865164927020598, + "grad_norm": 1.75, + "learning_rate": 1.2194261055992033e-05, + "loss": 0.9014, + "step": 7573 + }, + { + "epoch": 1.2866879567910496, + "grad_norm": 1.6953125, + "learning_rate": 1.219249910409963e-05, + "loss": 0.8843, + "step": 7574 + }, + { + "epoch": 1.2868594208800395, + "grad_norm": 1.625, + "learning_rate": 1.2190737080701574e-05, + "loss": 0.8677, + "step": 7575 + }, + { + "epoch": 1.2870308849690293, + "grad_norm": 1.6875, + "learning_rate": 1.2188974985855334e-05, + "loss": 0.8641, + "step": 7576 + }, + { + "epoch": 1.2872023490580191, + "grad_norm": 1.7421875, + "learning_rate": 1.2187212819618376e-05, + "loss": 0.8851, + "step": 7577 + }, + { + "epoch": 1.287373813147009, + "grad_norm": 1.8046875, + "learning_rate": 1.2185450582048168e-05, + "loss": 0.826, + "step": 7578 + }, + { + "epoch": 1.2875452772359988, + "grad_norm": 1.7265625, + "learning_rate": 1.2183688273202192e-05, + "loss": 0.9094, + "step": 7579 + }, + { + "epoch": 1.2877167413249888, + "grad_norm": 1.6796875, + "learning_rate": 1.2181925893137914e-05, + "loss": 0.853, + "step": 7580 + }, + { + "epoch": 1.2878882054139786, + "grad_norm": 1.75, + "learning_rate": 1.2180163441912817e-05, + "loss": 0.9173, + "step": 7581 + }, + { + "epoch": 1.2880596695029685, + "grad_norm": 1.75, + "learning_rate": 1.2178400919584384e-05, + "loss": 0.8648, + "step": 7582 + }, + { + "epoch": 1.2882311335919583, + "grad_norm": 1.6875, + "learning_rate": 1.2176638326210091e-05, + "loss": 0.8913, + "step": 7583 + }, + { + "epoch": 1.2884025976809481, + "grad_norm": 1.734375, + "learning_rate": 1.217487566184743e-05, + "loss": 0.8438, + "step": 7584 + }, + { + "epoch": 1.2885740617699382, + "grad_norm": 1.71875, + "learning_rate": 1.217311292655388e-05, + "loss": 0.8476, + "step": 7585 + }, + { + "epoch": 1.288745525858928, + "grad_norm": 1.75, + "learning_rate": 1.2171350120386933e-05, + "loss": 0.8477, + "step": 7586 + }, + { + "epoch": 1.2889169899479178, + "grad_norm": 1.765625, + "learning_rate": 1.2169587243404085e-05, + "loss": 0.8612, + "step": 7587 + }, + { + 
"epoch": 1.2890884540369076, + "grad_norm": 1.640625, + "learning_rate": 1.2167824295662825e-05, + "loss": 0.8484, + "step": 7588 + }, + { + "epoch": 1.2892599181258975, + "grad_norm": 1.7578125, + "learning_rate": 1.2166061277220653e-05, + "loss": 0.9121, + "step": 7589 + }, + { + "epoch": 1.2894313822148873, + "grad_norm": 1.671875, + "learning_rate": 1.2164298188135065e-05, + "loss": 0.8612, + "step": 7590 + }, + { + "epoch": 1.2896028463038771, + "grad_norm": 1.765625, + "learning_rate": 1.2162535028463563e-05, + "loss": 0.8444, + "step": 7591 + }, + { + "epoch": 1.2897743103928672, + "grad_norm": 1.6328125, + "learning_rate": 1.2160771798263651e-05, + "loss": 0.7786, + "step": 7592 + }, + { + "epoch": 1.289945774481857, + "grad_norm": 1.71875, + "learning_rate": 1.2159008497592834e-05, + "loss": 0.8248, + "step": 7593 + }, + { + "epoch": 1.2901172385708468, + "grad_norm": 1.6640625, + "learning_rate": 1.2157245126508619e-05, + "loss": 0.8333, + "step": 7594 + }, + { + "epoch": 1.2902887026598366, + "grad_norm": 1.7109375, + "learning_rate": 1.2155481685068517e-05, + "loss": 0.8895, + "step": 7595 + }, + { + "epoch": 1.2904601667488265, + "grad_norm": 1.7109375, + "learning_rate": 1.2153718173330045e-05, + "loss": 0.902, + "step": 7596 + }, + { + "epoch": 1.2906316308378165, + "grad_norm": 1.6796875, + "learning_rate": 1.2151954591350708e-05, + "loss": 0.8519, + "step": 7597 + }, + { + "epoch": 1.2908030949268063, + "grad_norm": 1.703125, + "learning_rate": 1.215019093918803e-05, + "loss": 0.8967, + "step": 7598 + }, + { + "epoch": 1.2909745590157962, + "grad_norm": 1.6796875, + "learning_rate": 1.2148427216899528e-05, + "loss": 0.8826, + "step": 7599 + }, + { + "epoch": 1.291146023104786, + "grad_norm": 1.5625, + "learning_rate": 1.2146663424542722e-05, + "loss": 0.8378, + "step": 7600 + }, + { + "epoch": 1.2913174871937758, + "grad_norm": 1.609375, + "learning_rate": 1.214489956217514e-05, + "loss": 0.8425, + "step": 7601 + }, + { + "epoch": 1.2914889512827656, + "grad_norm": 1.6171875, + "learning_rate": 1.2143135629854307e-05, + "loss": 0.8659, + "step": 7602 + }, + { + "epoch": 1.2916604153717555, + "grad_norm": 1.6640625, + "learning_rate": 1.2141371627637752e-05, + "loss": 0.8331, + "step": 7603 + }, + { + "epoch": 1.2918318794607455, + "grad_norm": 1.703125, + "learning_rate": 1.2139607555583004e-05, + "loss": 0.8546, + "step": 7604 + }, + { + "epoch": 1.2920033435497353, + "grad_norm": 1.71875, + "learning_rate": 1.2137843413747594e-05, + "loss": 0.9113, + "step": 7605 + }, + { + "epoch": 1.2921748076387252, + "grad_norm": 1.625, + "learning_rate": 1.2136079202189061e-05, + "loss": 0.8261, + "step": 7606 + }, + { + "epoch": 1.292346271727715, + "grad_norm": 1.6953125, + "learning_rate": 1.2134314920964947e-05, + "loss": 0.901, + "step": 7607 + }, + { + "epoch": 1.2925177358167048, + "grad_norm": 1.8125, + "learning_rate": 1.2132550570132779e-05, + "loss": 0.8526, + "step": 7608 + }, + { + "epoch": 1.2926891999056949, + "grad_norm": 1.6953125, + "learning_rate": 1.2130786149750108e-05, + "loss": 0.8381, + "step": 7609 + }, + { + "epoch": 1.2928606639946847, + "grad_norm": 1.7109375, + "learning_rate": 1.2129021659874478e-05, + "loss": 0.8988, + "step": 7610 + }, + { + "epoch": 1.2930321280836745, + "grad_norm": 1.703125, + "learning_rate": 1.2127257100563435e-05, + "loss": 0.866, + "step": 7611 + }, + { + "epoch": 1.2932035921726643, + "grad_norm": 1.6953125, + "learning_rate": 1.2125492471874526e-05, + "loss": 0.9133, + "step": 7612 + }, + { + "epoch": 1.2933750562616542, + 
"grad_norm": 1.8125, + "learning_rate": 1.2123727773865305e-05, + "loss": 0.8806, + "step": 7613 + }, + { + "epoch": 1.293546520350644, + "grad_norm": 1.734375, + "learning_rate": 1.2121963006593325e-05, + "loss": 0.9313, + "step": 7614 + }, + { + "epoch": 1.2937179844396338, + "grad_norm": 1.6953125, + "learning_rate": 1.212019817011614e-05, + "loss": 0.8593, + "step": 7615 + }, + { + "epoch": 1.2938894485286239, + "grad_norm": 1.6796875, + "learning_rate": 1.2118433264491307e-05, + "loss": 0.8664, + "step": 7616 + }, + { + "epoch": 1.2940609126176137, + "grad_norm": 1.7578125, + "learning_rate": 1.211666828977639e-05, + "loss": 0.8287, + "step": 7617 + }, + { + "epoch": 1.2942323767066035, + "grad_norm": 1.7265625, + "learning_rate": 1.2114903246028949e-05, + "loss": 0.9472, + "step": 7618 + }, + { + "epoch": 1.2944038407955933, + "grad_norm": 1.6953125, + "learning_rate": 1.2113138133306545e-05, + "loss": 0.807, + "step": 7619 + }, + { + "epoch": 1.2945753048845832, + "grad_norm": 1.7890625, + "learning_rate": 1.2111372951666755e-05, + "loss": 0.7895, + "step": 7620 + }, + { + "epoch": 1.2947467689735732, + "grad_norm": 1.6953125, + "learning_rate": 1.2109607701167139e-05, + "loss": 0.822, + "step": 7621 + }, + { + "epoch": 1.294918233062563, + "grad_norm": 1.75, + "learning_rate": 1.2107842381865273e-05, + "loss": 0.9415, + "step": 7622 + }, + { + "epoch": 1.2950896971515529, + "grad_norm": 1.671875, + "learning_rate": 1.210607699381873e-05, + "loss": 0.8165, + "step": 7623 + }, + { + "epoch": 1.2952611612405427, + "grad_norm": 1.671875, + "learning_rate": 1.2104311537085085e-05, + "loss": 0.793, + "step": 7624 + }, + { + "epoch": 1.2954326253295325, + "grad_norm": 1.6796875, + "learning_rate": 1.2102546011721918e-05, + "loss": 0.8712, + "step": 7625 + }, + { + "epoch": 1.2956040894185223, + "grad_norm": 1.6171875, + "learning_rate": 1.2100780417786805e-05, + "loss": 0.7509, + "step": 7626 + }, + { + "epoch": 1.2957755535075122, + "grad_norm": 1.7421875, + "learning_rate": 1.2099014755337333e-05, + "loss": 0.8279, + "step": 7627 + }, + { + "epoch": 1.2959470175965022, + "grad_norm": 1.7109375, + "learning_rate": 1.2097249024431086e-05, + "loss": 0.8896, + "step": 7628 + }, + { + "epoch": 1.296118481685492, + "grad_norm": 1.734375, + "learning_rate": 1.209548322512565e-05, + "loss": 0.8632, + "step": 7629 + }, + { + "epoch": 1.2962899457744819, + "grad_norm": 1.671875, + "learning_rate": 1.2093717357478618e-05, + "loss": 0.9059, + "step": 7630 + }, + { + "epoch": 1.2964614098634717, + "grad_norm": 1.75, + "learning_rate": 1.2091951421547579e-05, + "loss": 0.8138, + "step": 7631 + }, + { + "epoch": 1.2966328739524615, + "grad_norm": 1.65625, + "learning_rate": 1.2090185417390124e-05, + "loss": 0.8017, + "step": 7632 + }, + { + "epoch": 1.2968043380414516, + "grad_norm": 1.7109375, + "learning_rate": 1.2088419345063851e-05, + "loss": 0.8845, + "step": 7633 + }, + { + "epoch": 1.2969758021304414, + "grad_norm": 1.640625, + "learning_rate": 1.208665320462636e-05, + "loss": 0.8646, + "step": 7634 + }, + { + "epoch": 1.2971472662194312, + "grad_norm": 1.6875, + "learning_rate": 1.208488699613525e-05, + "loss": 0.8522, + "step": 7635 + }, + { + "epoch": 1.297318730308421, + "grad_norm": 1.71875, + "learning_rate": 1.2083120719648124e-05, + "loss": 0.8765, + "step": 7636 + }, + { + "epoch": 1.2974901943974109, + "grad_norm": 1.71875, + "learning_rate": 1.2081354375222589e-05, + "loss": 0.9103, + "step": 7637 + }, + { + "epoch": 1.2976616584864007, + "grad_norm": 1.6953125, + "learning_rate": 
1.2079587962916248e-05, + "loss": 0.8577, + "step": 7638 + }, + { + "epoch": 1.2978331225753905, + "grad_norm": 1.75, + "learning_rate": 1.2077821482786714e-05, + "loss": 0.9572, + "step": 7639 + }, + { + "epoch": 1.2980045866643803, + "grad_norm": 1.546875, + "learning_rate": 1.2076054934891594e-05, + "loss": 0.8155, + "step": 7640 + }, + { + "epoch": 1.2981760507533704, + "grad_norm": 1.7265625, + "learning_rate": 1.2074288319288508e-05, + "loss": 0.8119, + "step": 7641 + }, + { + "epoch": 1.2983475148423602, + "grad_norm": 1.6875, + "learning_rate": 1.207252163603507e-05, + "loss": 0.8288, + "step": 7642 + }, + { + "epoch": 1.29851897893135, + "grad_norm": 1.7265625, + "learning_rate": 1.2070754885188895e-05, + "loss": 0.8586, + "step": 7643 + }, + { + "epoch": 1.2986904430203399, + "grad_norm": 1.8125, + "learning_rate": 1.2068988066807606e-05, + "loss": 0.9835, + "step": 7644 + }, + { + "epoch": 1.29886190710933, + "grad_norm": 1.734375, + "learning_rate": 1.2067221180948826e-05, + "loss": 0.8761, + "step": 7645 + }, + { + "epoch": 1.2990333711983197, + "grad_norm": 1.7421875, + "learning_rate": 1.2065454227670173e-05, + "loss": 0.8515, + "step": 7646 + }, + { + "epoch": 1.2992048352873096, + "grad_norm": 1.7265625, + "learning_rate": 1.2063687207029285e-05, + "loss": 0.9168, + "step": 7647 + }, + { + "epoch": 1.2993762993762994, + "grad_norm": 1.6015625, + "learning_rate": 1.2061920119083788e-05, + "loss": 0.7479, + "step": 7648 + }, + { + "epoch": 1.2995477634652892, + "grad_norm": 1.6875, + "learning_rate": 1.2060152963891307e-05, + "loss": 0.9024, + "step": 7649 + }, + { + "epoch": 1.299719227554279, + "grad_norm": 1.734375, + "learning_rate": 1.2058385741509481e-05, + "loss": 0.7964, + "step": 7650 + }, + { + "epoch": 1.2998906916432689, + "grad_norm": 1.5703125, + "learning_rate": 1.2056618451995947e-05, + "loss": 0.8361, + "step": 7651 + }, + { + "epoch": 1.3000621557322587, + "grad_norm": 1.7109375, + "learning_rate": 1.2054851095408339e-05, + "loss": 0.8796, + "step": 7652 + }, + { + "epoch": 1.3002336198212487, + "grad_norm": 1.640625, + "learning_rate": 1.20530836718043e-05, + "loss": 0.8677, + "step": 7653 + }, + { + "epoch": 1.3004050839102386, + "grad_norm": 1.6953125, + "learning_rate": 1.2051316181241472e-05, + "loss": 0.878, + "step": 7654 + }, + { + "epoch": 1.3005765479992284, + "grad_norm": 1.7734375, + "learning_rate": 1.2049548623777494e-05, + "loss": 0.9416, + "step": 7655 + }, + { + "epoch": 1.3007480120882182, + "grad_norm": 1.65625, + "learning_rate": 1.2047780999470023e-05, + "loss": 0.8483, + "step": 7656 + }, + { + "epoch": 1.3009194761772083, + "grad_norm": 1.71875, + "learning_rate": 1.2046013308376698e-05, + "loss": 0.9298, + "step": 7657 + }, + { + "epoch": 1.301090940266198, + "grad_norm": 1.625, + "learning_rate": 1.2044245550555179e-05, + "loss": 0.8033, + "step": 7658 + }, + { + "epoch": 1.301262404355188, + "grad_norm": 1.6328125, + "learning_rate": 1.2042477726063113e-05, + "loss": 0.8801, + "step": 7659 + }, + { + "epoch": 1.3014338684441777, + "grad_norm": 1.625, + "learning_rate": 1.2040709834958155e-05, + "loss": 0.8041, + "step": 7660 + }, + { + "epoch": 1.3016053325331676, + "grad_norm": 1.78125, + "learning_rate": 1.2038941877297968e-05, + "loss": 0.8988, + "step": 7661 + }, + { + "epoch": 1.3017767966221574, + "grad_norm": 1.75, + "learning_rate": 1.2037173853140206e-05, + "loss": 0.89, + "step": 7662 + }, + { + "epoch": 1.3019482607111472, + "grad_norm": 1.59375, + "learning_rate": 1.2035405762542534e-05, + "loss": 0.8521, + "step": 7663 
+ }, + { + "epoch": 1.302119724800137, + "grad_norm": 1.65625, + "learning_rate": 1.2033637605562616e-05, + "loss": 0.896, + "step": 7664 + }, + { + "epoch": 1.302291188889127, + "grad_norm": 1.8515625, + "learning_rate": 1.2031869382258116e-05, + "loss": 0.8838, + "step": 7665 + }, + { + "epoch": 1.302462652978117, + "grad_norm": 1.8359375, + "learning_rate": 1.2030101092686705e-05, + "loss": 0.9619, + "step": 7666 + }, + { + "epoch": 1.3026341170671067, + "grad_norm": 1.6875, + "learning_rate": 1.2028332736906052e-05, + "loss": 0.8727, + "step": 7667 + }, + { + "epoch": 1.3028055811560966, + "grad_norm": 1.59375, + "learning_rate": 1.2026564314973832e-05, + "loss": 0.8851, + "step": 7668 + }, + { + "epoch": 1.3029770452450866, + "grad_norm": 1.640625, + "learning_rate": 1.2024795826947718e-05, + "loss": 0.9059, + "step": 7669 + }, + { + "epoch": 1.3031485093340764, + "grad_norm": 1.7265625, + "learning_rate": 1.2023027272885388e-05, + "loss": 0.8688, + "step": 7670 + }, + { + "epoch": 1.3033199734230663, + "grad_norm": 1.8046875, + "learning_rate": 1.2021258652844521e-05, + "loss": 0.9054, + "step": 7671 + }, + { + "epoch": 1.303491437512056, + "grad_norm": 1.75, + "learning_rate": 1.2019489966882796e-05, + "loss": 0.8857, + "step": 7672 + }, + { + "epoch": 1.303662901601046, + "grad_norm": 1.6484375, + "learning_rate": 1.20177212150579e-05, + "loss": 0.8302, + "step": 7673 + }, + { + "epoch": 1.3038343656900357, + "grad_norm": 1.640625, + "learning_rate": 1.2015952397427513e-05, + "loss": 0.7771, + "step": 7674 + }, + { + "epoch": 1.3040058297790256, + "grad_norm": 1.71875, + "learning_rate": 1.2014183514049331e-05, + "loss": 0.8096, + "step": 7675 + }, + { + "epoch": 1.3041772938680154, + "grad_norm": 1.6796875, + "learning_rate": 1.2012414564981039e-05, + "loss": 0.8404, + "step": 7676 + }, + { + "epoch": 1.3043487579570054, + "grad_norm": 1.765625, + "learning_rate": 1.2010645550280332e-05, + "loss": 0.9386, + "step": 7677 + }, + { + "epoch": 1.3045202220459953, + "grad_norm": 1.625, + "learning_rate": 1.20088764700049e-05, + "loss": 0.8149, + "step": 7678 + }, + { + "epoch": 1.304691686134985, + "grad_norm": 1.65625, + "learning_rate": 1.2007107324212441e-05, + "loss": 0.8008, + "step": 7679 + }, + { + "epoch": 1.304863150223975, + "grad_norm": 1.609375, + "learning_rate": 1.2005338112960657e-05, + "loss": 0.8362, + "step": 7680 + }, + { + "epoch": 1.3050346143129647, + "grad_norm": 1.6796875, + "learning_rate": 1.2003568836307244e-05, + "loss": 0.838, + "step": 7681 + }, + { + "epoch": 1.3052060784019548, + "grad_norm": 1.6640625, + "learning_rate": 1.2001799494309905e-05, + "loss": 0.7765, + "step": 7682 + }, + { + "epoch": 1.3053775424909446, + "grad_norm": 1.625, + "learning_rate": 1.200003008702635e-05, + "loss": 0.8431, + "step": 7683 + }, + { + "epoch": 1.3055490065799344, + "grad_norm": 1.7734375, + "learning_rate": 1.1998260614514279e-05, + "loss": 0.8863, + "step": 7684 + }, + { + "epoch": 1.3057204706689243, + "grad_norm": 1.78125, + "learning_rate": 1.1996491076831404e-05, + "loss": 0.8998, + "step": 7685 + }, + { + "epoch": 1.305891934757914, + "grad_norm": 1.6953125, + "learning_rate": 1.199472147403544e-05, + "loss": 0.8527, + "step": 7686 + }, + { + "epoch": 1.306063398846904, + "grad_norm": 1.8125, + "learning_rate": 1.1992951806184095e-05, + "loss": 0.8908, + "step": 7687 + }, + { + "epoch": 1.3062348629358937, + "grad_norm": 1.578125, + "learning_rate": 1.199118207333509e-05, + "loss": 0.8322, + "step": 7688 + }, + { + "epoch": 1.3064063270248838, + "grad_norm": 
1.6328125, + "learning_rate": 1.1989412275546137e-05, + "loss": 0.7763, + "step": 7689 + }, + { + "epoch": 1.3065777911138736, + "grad_norm": 1.6015625, + "learning_rate": 1.198764241287496e-05, + "loss": 0.8284, + "step": 7690 + }, + { + "epoch": 1.3067492552028634, + "grad_norm": 1.7109375, + "learning_rate": 1.1985872485379278e-05, + "loss": 0.8426, + "step": 7691 + }, + { + "epoch": 1.3069207192918533, + "grad_norm": 1.6328125, + "learning_rate": 1.1984102493116813e-05, + "loss": 0.8513, + "step": 7692 + }, + { + "epoch": 1.307092183380843, + "grad_norm": 1.6484375, + "learning_rate": 1.1982332436145296e-05, + "loss": 0.7398, + "step": 7693 + }, + { + "epoch": 1.3072636474698331, + "grad_norm": 1.78125, + "learning_rate": 1.1980562314522457e-05, + "loss": 0.8287, + "step": 7694 + }, + { + "epoch": 1.307435111558823, + "grad_norm": 1.703125, + "learning_rate": 1.1978792128306021e-05, + "loss": 0.82, + "step": 7695 + }, + { + "epoch": 1.3076065756478128, + "grad_norm": 1.7421875, + "learning_rate": 1.1977021877553721e-05, + "loss": 0.8936, + "step": 7696 + }, + { + "epoch": 1.3077780397368026, + "grad_norm": 1.7109375, + "learning_rate": 1.1975251562323293e-05, + "loss": 0.9517, + "step": 7697 + }, + { + "epoch": 1.3079495038257924, + "grad_norm": 1.671875, + "learning_rate": 1.1973481182672475e-05, + "loss": 0.8354, + "step": 7698 + }, + { + "epoch": 1.3081209679147823, + "grad_norm": 1.6328125, + "learning_rate": 1.1971710738659002e-05, + "loss": 0.7502, + "step": 7699 + }, + { + "epoch": 1.308292432003772, + "grad_norm": 1.640625, + "learning_rate": 1.196994023034062e-05, + "loss": 0.8179, + "step": 7700 + }, + { + "epoch": 1.308292432003772, + "eval_loss": 0.8456034660339355, + "eval_runtime": 836.8754, + "eval_samples_per_second": 2.986, + "eval_steps_per_second": 2.986, + "step": 7700 + }, + { + "epoch": 1.3084638960927621, + "grad_norm": 1.65625, + "learning_rate": 1.1968169657775069e-05, + "loss": 0.8484, + "step": 7701 + }, + { + "epoch": 1.308635360181752, + "grad_norm": 1.7265625, + "learning_rate": 1.196639902102009e-05, + "loss": 0.8856, + "step": 7702 + }, + { + "epoch": 1.3088068242707418, + "grad_norm": 1.7890625, + "learning_rate": 1.1964628320133436e-05, + "loss": 0.8913, + "step": 7703 + }, + { + "epoch": 1.3089782883597316, + "grad_norm": 1.671875, + "learning_rate": 1.1962857555172854e-05, + "loss": 0.8323, + "step": 7704 + }, + { + "epoch": 1.3091497524487214, + "grad_norm": 1.6640625, + "learning_rate": 1.1961086726196097e-05, + "loss": 0.7931, + "step": 7705 + }, + { + "epoch": 1.3093212165377115, + "grad_norm": 1.703125, + "learning_rate": 1.1959315833260918e-05, + "loss": 0.9209, + "step": 7706 + }, + { + "epoch": 1.3094926806267013, + "grad_norm": 1.6640625, + "learning_rate": 1.1957544876425069e-05, + "loss": 0.8677, + "step": 7707 + }, + { + "epoch": 1.3096641447156911, + "grad_norm": 1.6015625, + "learning_rate": 1.195577385574631e-05, + "loss": 0.7754, + "step": 7708 + }, + { + "epoch": 1.309835608804681, + "grad_norm": 1.78125, + "learning_rate": 1.1954002771282403e-05, + "loss": 0.887, + "step": 7709 + }, + { + "epoch": 1.3100070728936708, + "grad_norm": 1.7421875, + "learning_rate": 1.1952231623091106e-05, + "loss": 0.8784, + "step": 7710 + }, + { + "epoch": 1.3101785369826606, + "grad_norm": 1.75, + "learning_rate": 1.1950460411230185e-05, + "loss": 0.8745, + "step": 7711 + }, + { + "epoch": 1.3103500010716504, + "grad_norm": 1.6875, + "learning_rate": 1.1948689135757406e-05, + "loss": 0.8325, + "step": 7712 + }, + { + "epoch": 1.3105214651606405, + 
"grad_norm": 1.6796875, + "learning_rate": 1.1946917796730534e-05, + "loss": 0.8219, + "step": 7713 + }, + { + "epoch": 1.3106929292496303, + "grad_norm": 1.6953125, + "learning_rate": 1.1945146394207342e-05, + "loss": 0.8945, + "step": 7714 + }, + { + "epoch": 1.3108643933386201, + "grad_norm": 1.6484375, + "learning_rate": 1.1943374928245603e-05, + "loss": 0.8552, + "step": 7715 + }, + { + "epoch": 1.31103585742761, + "grad_norm": 1.609375, + "learning_rate": 1.1941603398903088e-05, + "loss": 0.8237, + "step": 7716 + }, + { + "epoch": 1.3112073215165998, + "grad_norm": 1.71875, + "learning_rate": 1.193983180623758e-05, + "loss": 0.8898, + "step": 7717 + }, + { + "epoch": 1.3113787856055898, + "grad_norm": 1.734375, + "learning_rate": 1.1938060150306844e-05, + "loss": 0.8595, + "step": 7718 + }, + { + "epoch": 1.3115502496945797, + "grad_norm": 1.7109375, + "learning_rate": 1.1936288431168673e-05, + "loss": 0.8116, + "step": 7719 + }, + { + "epoch": 1.3117217137835695, + "grad_norm": 1.6796875, + "learning_rate": 1.1934516648880842e-05, + "loss": 0.8697, + "step": 7720 + }, + { + "epoch": 1.3118931778725593, + "grad_norm": 1.703125, + "learning_rate": 1.193274480350114e-05, + "loss": 0.8571, + "step": 7721 + }, + { + "epoch": 1.3120646419615491, + "grad_norm": 1.7265625, + "learning_rate": 1.1930972895087349e-05, + "loss": 0.8991, + "step": 7722 + }, + { + "epoch": 1.312236106050539, + "grad_norm": 1.578125, + "learning_rate": 1.1929200923697267e-05, + "loss": 0.798, + "step": 7723 + }, + { + "epoch": 1.3124075701395288, + "grad_norm": 1.6953125, + "learning_rate": 1.1927428889388674e-05, + "loss": 0.8255, + "step": 7724 + }, + { + "epoch": 1.3125790342285186, + "grad_norm": 1.71875, + "learning_rate": 1.1925656792219366e-05, + "loss": 0.9045, + "step": 7725 + }, + { + "epoch": 1.3127504983175087, + "grad_norm": 1.6796875, + "learning_rate": 1.1923884632247143e-05, + "loss": 0.9382, + "step": 7726 + }, + { + "epoch": 1.3129219624064985, + "grad_norm": 1.65625, + "learning_rate": 1.1922112409529793e-05, + "loss": 0.86, + "step": 7727 + }, + { + "epoch": 1.3130934264954883, + "grad_norm": 1.6875, + "learning_rate": 1.192034012412512e-05, + "loss": 0.8747, + "step": 7728 + }, + { + "epoch": 1.3132648905844782, + "grad_norm": 1.6328125, + "learning_rate": 1.1918567776090925e-05, + "loss": 0.871, + "step": 7729 + }, + { + "epoch": 1.3134363546734682, + "grad_norm": 1.6484375, + "learning_rate": 1.191679536548501e-05, + "loss": 0.887, + "step": 7730 + }, + { + "epoch": 1.313607818762458, + "grad_norm": 1.5546875, + "learning_rate": 1.1915022892365182e-05, + "loss": 0.7853, + "step": 7731 + }, + { + "epoch": 1.3137792828514478, + "grad_norm": 1.7109375, + "learning_rate": 1.1913250356789244e-05, + "loss": 0.8789, + "step": 7732 + }, + { + "epoch": 1.3139507469404377, + "grad_norm": 1.6953125, + "learning_rate": 1.1911477758815006e-05, + "loss": 0.9076, + "step": 7733 + }, + { + "epoch": 1.3141222110294275, + "grad_norm": 1.6953125, + "learning_rate": 1.1909705098500287e-05, + "loss": 0.8747, + "step": 7734 + }, + { + "epoch": 1.3142936751184173, + "grad_norm": 1.7109375, + "learning_rate": 1.1907932375902886e-05, + "loss": 0.9129, + "step": 7735 + }, + { + "epoch": 1.3144651392074072, + "grad_norm": 1.6015625, + "learning_rate": 1.1906159591080628e-05, + "loss": 0.8726, + "step": 7736 + }, + { + "epoch": 1.314636603296397, + "grad_norm": 1.6328125, + "learning_rate": 1.190438674409133e-05, + "loss": 0.753, + "step": 7737 + }, + { + "epoch": 1.314808067385387, + "grad_norm": 1.6328125, + 
"learning_rate": 1.1902613834992807e-05, + "loss": 0.8014, + "step": 7738 + }, + { + "epoch": 1.3149795314743769, + "grad_norm": 1.71875, + "learning_rate": 1.1900840863842883e-05, + "loss": 0.8932, + "step": 7739 + }, + { + "epoch": 1.3151509955633667, + "grad_norm": 1.7421875, + "learning_rate": 1.1899067830699378e-05, + "loss": 0.8219, + "step": 7740 + }, + { + "epoch": 1.3153224596523565, + "grad_norm": 1.6796875, + "learning_rate": 1.1897294735620123e-05, + "loss": 0.9269, + "step": 7741 + }, + { + "epoch": 1.3154939237413465, + "grad_norm": 1.8046875, + "learning_rate": 1.1895521578662939e-05, + "loss": 0.9126, + "step": 7742 + }, + { + "epoch": 1.3156653878303364, + "grad_norm": 1.796875, + "learning_rate": 1.189374835988566e-05, + "loss": 0.8141, + "step": 7743 + }, + { + "epoch": 1.3158368519193262, + "grad_norm": 1.671875, + "learning_rate": 1.1891975079346117e-05, + "loss": 0.8801, + "step": 7744 + }, + { + "epoch": 1.316008316008316, + "grad_norm": 1.8125, + "learning_rate": 1.1890201737102141e-05, + "loss": 0.9298, + "step": 7745 + }, + { + "epoch": 1.3161797800973059, + "grad_norm": 1.8203125, + "learning_rate": 1.188842833321157e-05, + "loss": 0.9065, + "step": 7746 + }, + { + "epoch": 1.3163512441862957, + "grad_norm": 1.7109375, + "learning_rate": 1.1886654867732239e-05, + "loss": 0.8665, + "step": 7747 + }, + { + "epoch": 1.3165227082752855, + "grad_norm": 1.6875, + "learning_rate": 1.1884881340721985e-05, + "loss": 0.8517, + "step": 7748 + }, + { + "epoch": 1.3166941723642753, + "grad_norm": 1.765625, + "learning_rate": 1.1883107752238657e-05, + "loss": 0.8489, + "step": 7749 + }, + { + "epoch": 1.3168656364532654, + "grad_norm": 1.7421875, + "learning_rate": 1.188133410234009e-05, + "loss": 0.8792, + "step": 7750 + }, + { + "epoch": 1.3170371005422552, + "grad_norm": 1.703125, + "learning_rate": 1.1879560391084137e-05, + "loss": 0.8721, + "step": 7751 + }, + { + "epoch": 1.317208564631245, + "grad_norm": 1.6875, + "learning_rate": 1.1877786618528642e-05, + "loss": 0.874, + "step": 7752 + }, + { + "epoch": 1.3173800287202349, + "grad_norm": 1.734375, + "learning_rate": 1.1876012784731454e-05, + "loss": 0.8498, + "step": 7753 + }, + { + "epoch": 1.317551492809225, + "grad_norm": 1.6328125, + "learning_rate": 1.1874238889750423e-05, + "loss": 0.7978, + "step": 7754 + }, + { + "epoch": 1.3177229568982147, + "grad_norm": 1.6875, + "learning_rate": 1.1872464933643406e-05, + "loss": 0.834, + "step": 7755 + }, + { + "epoch": 1.3178944209872046, + "grad_norm": 1.6953125, + "learning_rate": 1.1870690916468255e-05, + "loss": 0.8322, + "step": 7756 + }, + { + "epoch": 1.3180658850761944, + "grad_norm": 1.7265625, + "learning_rate": 1.1868916838282829e-05, + "loss": 0.9117, + "step": 7757 + }, + { + "epoch": 1.3182373491651842, + "grad_norm": 1.6328125, + "learning_rate": 1.1867142699144986e-05, + "loss": 0.7738, + "step": 7758 + }, + { + "epoch": 1.318408813254174, + "grad_norm": 1.6875, + "learning_rate": 1.1865368499112591e-05, + "loss": 0.8178, + "step": 7759 + }, + { + "epoch": 1.3185802773431639, + "grad_norm": 1.6796875, + "learning_rate": 1.1863594238243503e-05, + "loss": 0.8666, + "step": 7760 + }, + { + "epoch": 1.3187517414321537, + "grad_norm": 1.640625, + "learning_rate": 1.1861819916595592e-05, + "loss": 0.8319, + "step": 7761 + }, + { + "epoch": 1.3189232055211437, + "grad_norm": 1.703125, + "learning_rate": 1.1860045534226719e-05, + "loss": 0.8273, + "step": 7762 + }, + { + "epoch": 1.3190946696101336, + "grad_norm": 1.671875, + "learning_rate": 
1.1858271091194762e-05, + "loss": 0.8214, + "step": 7763 + }, + { + "epoch": 1.3192661336991234, + "grad_norm": 1.6796875, + "learning_rate": 1.1856496587557582e-05, + "loss": 0.8312, + "step": 7764 + }, + { + "epoch": 1.3194375977881132, + "grad_norm": 1.65625, + "learning_rate": 1.185472202337306e-05, + "loss": 0.8638, + "step": 7765 + }, + { + "epoch": 1.3196090618771033, + "grad_norm": 1.6484375, + "learning_rate": 1.1852947398699065e-05, + "loss": 0.8527, + "step": 7766 + }, + { + "epoch": 1.319780525966093, + "grad_norm": 1.671875, + "learning_rate": 1.1851172713593482e-05, + "loss": 0.8765, + "step": 7767 + }, + { + "epoch": 1.319951990055083, + "grad_norm": 6.34375, + "learning_rate": 1.1849397968114183e-05, + "loss": 0.9896, + "step": 7768 + }, + { + "epoch": 1.3201234541440727, + "grad_norm": 1.6484375, + "learning_rate": 1.1847623162319056e-05, + "loss": 0.7787, + "step": 7769 + }, + { + "epoch": 1.3202949182330626, + "grad_norm": 1.7890625, + "learning_rate": 1.1845848296265976e-05, + "loss": 0.8828, + "step": 7770 + }, + { + "epoch": 1.3204663823220524, + "grad_norm": 1.65625, + "learning_rate": 1.1844073370012835e-05, + "loss": 0.8136, + "step": 7771 + }, + { + "epoch": 1.3206378464110422, + "grad_norm": 1.7578125, + "learning_rate": 1.1842298383617515e-05, + "loss": 0.8852, + "step": 7772 + }, + { + "epoch": 1.320809310500032, + "grad_norm": 1.7734375, + "learning_rate": 1.1840523337137908e-05, + "loss": 0.8697, + "step": 7773 + }, + { + "epoch": 1.320980774589022, + "grad_norm": 1.7109375, + "learning_rate": 1.1838748230631905e-05, + "loss": 0.8943, + "step": 7774 + }, + { + "epoch": 1.321152238678012, + "grad_norm": 1.6796875, + "learning_rate": 1.1836973064157399e-05, + "loss": 0.8774, + "step": 7775 + }, + { + "epoch": 1.3213237027670017, + "grad_norm": 1.8125, + "learning_rate": 1.1835197837772282e-05, + "loss": 0.841, + "step": 7776 + }, + { + "epoch": 1.3214951668559916, + "grad_norm": 1.6953125, + "learning_rate": 1.1833422551534453e-05, + "loss": 0.9008, + "step": 7777 + }, + { + "epoch": 1.3216666309449814, + "grad_norm": 1.53125, + "learning_rate": 1.183164720550181e-05, + "loss": 0.8287, + "step": 7778 + }, + { + "epoch": 1.3218380950339714, + "grad_norm": 1.7109375, + "learning_rate": 1.1829871799732255e-05, + "loss": 0.8227, + "step": 7779 + }, + { + "epoch": 1.3220095591229613, + "grad_norm": 1.65625, + "learning_rate": 1.1828096334283692e-05, + "loss": 0.9078, + "step": 7780 + }, + { + "epoch": 1.322181023211951, + "grad_norm": 1.625, + "learning_rate": 1.1826320809214023e-05, + "loss": 0.8535, + "step": 7781 + }, + { + "epoch": 1.322352487300941, + "grad_norm": 1.71875, + "learning_rate": 1.1824545224581155e-05, + "loss": 0.8968, + "step": 7782 + }, + { + "epoch": 1.3225239513899307, + "grad_norm": 1.703125, + "learning_rate": 1.1822769580442995e-05, + "loss": 0.8102, + "step": 7783 + }, + { + "epoch": 1.3226954154789206, + "grad_norm": 1.7265625, + "learning_rate": 1.1820993876857453e-05, + "loss": 0.8553, + "step": 7784 + }, + { + "epoch": 1.3228668795679104, + "grad_norm": 1.6640625, + "learning_rate": 1.1819218113882448e-05, + "loss": 0.8806, + "step": 7785 + }, + { + "epoch": 1.3230383436569004, + "grad_norm": 1.609375, + "learning_rate": 1.1817442291575887e-05, + "loss": 0.7205, + "step": 7786 + }, + { + "epoch": 1.3232098077458903, + "grad_norm": 1.625, + "learning_rate": 1.181566640999569e-05, + "loss": 0.9008, + "step": 7787 + }, + { + "epoch": 1.32338127183488, + "grad_norm": 1.75, + "learning_rate": 1.1813890469199774e-05, + "loss": 0.8797, + 
"step": 7788 + }, + { + "epoch": 1.32355273592387, + "grad_norm": 1.7109375, + "learning_rate": 1.181211446924606e-05, + "loss": 0.9086, + "step": 7789 + }, + { + "epoch": 1.3237242000128597, + "grad_norm": 1.6015625, + "learning_rate": 1.1810338410192468e-05, + "loss": 0.8851, + "step": 7790 + }, + { + "epoch": 1.3238956641018498, + "grad_norm": 1.703125, + "learning_rate": 1.1808562292096923e-05, + "loss": 0.8203, + "step": 7791 + }, + { + "epoch": 1.3240671281908396, + "grad_norm": 1.7421875, + "learning_rate": 1.1806786115017354e-05, + "loss": 0.8045, + "step": 7792 + }, + { + "epoch": 1.3242385922798294, + "grad_norm": 1.671875, + "learning_rate": 1.1805009879011686e-05, + "loss": 0.8182, + "step": 7793 + }, + { + "epoch": 1.3244100563688193, + "grad_norm": 1.625, + "learning_rate": 1.1803233584137847e-05, + "loss": 0.9296, + "step": 7794 + }, + { + "epoch": 1.324581520457809, + "grad_norm": 1.734375, + "learning_rate": 1.1801457230453768e-05, + "loss": 0.9271, + "step": 7795 + }, + { + "epoch": 1.324752984546799, + "grad_norm": 1.765625, + "learning_rate": 1.1799680818017387e-05, + "loss": 0.8048, + "step": 7796 + }, + { + "epoch": 1.3249244486357887, + "grad_norm": 1.703125, + "learning_rate": 1.179790434688664e-05, + "loss": 0.8658, + "step": 7797 + }, + { + "epoch": 1.3250959127247788, + "grad_norm": 1.578125, + "learning_rate": 1.179612781711946e-05, + "loss": 0.8503, + "step": 7798 + }, + { + "epoch": 1.3252673768137686, + "grad_norm": 1.6484375, + "learning_rate": 1.179435122877379e-05, + "loss": 0.8722, + "step": 7799 + }, + { + "epoch": 1.3254388409027584, + "grad_norm": 1.5625, + "learning_rate": 1.1792574581907567e-05, + "loss": 0.7956, + "step": 7800 + }, + { + "epoch": 1.3256103049917483, + "grad_norm": 1.6875, + "learning_rate": 1.179079787657874e-05, + "loss": 0.8823, + "step": 7801 + }, + { + "epoch": 1.325781769080738, + "grad_norm": 1.75, + "learning_rate": 1.1789021112845247e-05, + "loss": 0.9627, + "step": 7802 + }, + { + "epoch": 1.3259532331697281, + "grad_norm": 1.59375, + "learning_rate": 1.178724429076504e-05, + "loss": 0.8166, + "step": 7803 + }, + { + "epoch": 1.326124697258718, + "grad_norm": 1.734375, + "learning_rate": 1.1785467410396066e-05, + "loss": 0.7876, + "step": 7804 + }, + { + "epoch": 1.3262961613477078, + "grad_norm": 1.7109375, + "learning_rate": 1.1783690471796276e-05, + "loss": 0.8619, + "step": 7805 + }, + { + "epoch": 1.3264676254366976, + "grad_norm": 1.6328125, + "learning_rate": 1.1781913475023626e-05, + "loss": 0.8014, + "step": 7806 + }, + { + "epoch": 1.3266390895256874, + "grad_norm": 1.6015625, + "learning_rate": 1.1780136420136063e-05, + "loss": 0.8473, + "step": 7807 + }, + { + "epoch": 1.3268105536146773, + "grad_norm": 1.6484375, + "learning_rate": 1.1778359307191552e-05, + "loss": 0.8229, + "step": 7808 + }, + { + "epoch": 1.326982017703667, + "grad_norm": 1.8203125, + "learning_rate": 1.1776582136248047e-05, + "loss": 0.8526, + "step": 7809 + }, + { + "epoch": 1.3271534817926571, + "grad_norm": 1.640625, + "learning_rate": 1.1774804907363508e-05, + "loss": 0.8695, + "step": 7810 + }, + { + "epoch": 1.327324945881647, + "grad_norm": 1.7109375, + "learning_rate": 1.1773027620595897e-05, + "loss": 0.8008, + "step": 7811 + }, + { + "epoch": 1.3274964099706368, + "grad_norm": 1.7421875, + "learning_rate": 1.1771250276003179e-05, + "loss": 0.9204, + "step": 7812 + }, + { + "epoch": 1.3276678740596266, + "grad_norm": 1.7109375, + "learning_rate": 1.1769472873643317e-05, + "loss": 0.8668, + "step": 7813 + }, + { + "epoch": 
1.3278393381486164, + "grad_norm": 1.6953125, + "learning_rate": 1.1767695413574284e-05, + "loss": 0.8277, + "step": 7814 + }, + { + "epoch": 1.3280108022376065, + "grad_norm": 1.65625, + "learning_rate": 1.1765917895854046e-05, + "loss": 0.7791, + "step": 7815 + }, + { + "epoch": 1.3281822663265963, + "grad_norm": 1.7109375, + "learning_rate": 1.1764140320540578e-05, + "loss": 0.8521, + "step": 7816 + }, + { + "epoch": 1.3283537304155861, + "grad_norm": 1.734375, + "learning_rate": 1.1762362687691849e-05, + "loss": 0.9123, + "step": 7817 + }, + { + "epoch": 1.328525194504576, + "grad_norm": 1.546875, + "learning_rate": 1.1760584997365835e-05, + "loss": 0.8249, + "step": 7818 + }, + { + "epoch": 1.3286966585935658, + "grad_norm": 1.6796875, + "learning_rate": 1.1758807249620517e-05, + "loss": 0.7786, + "step": 7819 + }, + { + "epoch": 1.3288681226825556, + "grad_norm": 1.640625, + "learning_rate": 1.175702944451387e-05, + "loss": 0.8714, + "step": 7820 + }, + { + "epoch": 1.3290395867715454, + "grad_norm": 1.640625, + "learning_rate": 1.1755251582103879e-05, + "loss": 0.8321, + "step": 7821 + }, + { + "epoch": 1.3292110508605353, + "grad_norm": 1.6484375, + "learning_rate": 1.1753473662448523e-05, + "loss": 0.8579, + "step": 7822 + }, + { + "epoch": 1.3293825149495253, + "grad_norm": 1.6640625, + "learning_rate": 1.1751695685605784e-05, + "loss": 0.8547, + "step": 7823 + }, + { + "epoch": 1.3295539790385151, + "grad_norm": 1.7265625, + "learning_rate": 1.1749917651633655e-05, + "loss": 0.8092, + "step": 7824 + }, + { + "epoch": 1.329725443127505, + "grad_norm": 1.6953125, + "learning_rate": 1.1748139560590123e-05, + "loss": 0.8526, + "step": 7825 + }, + { + "epoch": 1.3298969072164948, + "grad_norm": 1.71875, + "learning_rate": 1.1746361412533175e-05, + "loss": 0.841, + "step": 7826 + }, + { + "epoch": 1.3300683713054848, + "grad_norm": 1.734375, + "learning_rate": 1.174458320752081e-05, + "loss": 0.8476, + "step": 7827 + }, + { + "epoch": 1.3302398353944747, + "grad_norm": 1.6484375, + "learning_rate": 1.1742804945611014e-05, + "loss": 0.932, + "step": 7828 + }, + { + "epoch": 1.3304112994834645, + "grad_norm": 1.5703125, + "learning_rate": 1.1741026626861784e-05, + "loss": 0.786, + "step": 7829 + }, + { + "epoch": 1.3305827635724543, + "grad_norm": 1.6875, + "learning_rate": 1.1739248251331124e-05, + "loss": 0.9036, + "step": 7830 + }, + { + "epoch": 1.3307542276614441, + "grad_norm": 1.7421875, + "learning_rate": 1.1737469819077026e-05, + "loss": 0.7656, + "step": 7831 + }, + { + "epoch": 1.330925691750434, + "grad_norm": 1.7265625, + "learning_rate": 1.1735691330157492e-05, + "loss": 0.9576, + "step": 7832 + }, + { + "epoch": 1.3310971558394238, + "grad_norm": 1.5703125, + "learning_rate": 1.1733912784630532e-05, + "loss": 0.7358, + "step": 7833 + }, + { + "epoch": 1.3312686199284136, + "grad_norm": 1.703125, + "learning_rate": 1.1732134182554144e-05, + "loss": 0.8518, + "step": 7834 + }, + { + "epoch": 1.3314400840174037, + "grad_norm": 1.734375, + "learning_rate": 1.173035552398634e-05, + "loss": 0.8398, + "step": 7835 + }, + { + "epoch": 1.3316115481063935, + "grad_norm": 1.6484375, + "learning_rate": 1.1728576808985126e-05, + "loss": 0.8388, + "step": 7836 + }, + { + "epoch": 1.3317830121953833, + "grad_norm": 1.7265625, + "learning_rate": 1.1726798037608514e-05, + "loss": 0.9044, + "step": 7837 + }, + { + "epoch": 1.3319544762843731, + "grad_norm": 1.7421875, + "learning_rate": 1.1725019209914514e-05, + "loss": 0.9309, + "step": 7838 + }, + { + "epoch": 1.3321259403733632, + 
"grad_norm": 1.65625, + "learning_rate": 1.1723240325961142e-05, + "loss": 0.8207, + "step": 7839 + }, + { + "epoch": 1.332297404462353, + "grad_norm": 1.671875, + "learning_rate": 1.1721461385806414e-05, + "loss": 0.846, + "step": 7840 + }, + { + "epoch": 1.3324688685513428, + "grad_norm": 1.6875, + "learning_rate": 1.1719682389508346e-05, + "loss": 0.8934, + "step": 7841 + }, + { + "epoch": 1.3326403326403327, + "grad_norm": 1.7578125, + "learning_rate": 1.171790333712496e-05, + "loss": 0.8431, + "step": 7842 + }, + { + "epoch": 1.3328117967293225, + "grad_norm": 1.7421875, + "learning_rate": 1.1716124228714278e-05, + "loss": 0.8962, + "step": 7843 + }, + { + "epoch": 1.3329832608183123, + "grad_norm": 1.71875, + "learning_rate": 1.1714345064334325e-05, + "loss": 0.923, + "step": 7844 + }, + { + "epoch": 1.3331547249073021, + "grad_norm": 1.734375, + "learning_rate": 1.171256584404312e-05, + "loss": 0.8543, + "step": 7845 + }, + { + "epoch": 1.333326188996292, + "grad_norm": 1.6328125, + "learning_rate": 1.1710786567898696e-05, + "loss": 0.8548, + "step": 7846 + }, + { + "epoch": 1.333497653085282, + "grad_norm": 1.796875, + "learning_rate": 1.1709007235959078e-05, + "loss": 0.8992, + "step": 7847 + }, + { + "epoch": 1.3336691171742718, + "grad_norm": 1.6796875, + "learning_rate": 1.1707227848282301e-05, + "loss": 0.8639, + "step": 7848 + }, + { + "epoch": 1.3338405812632617, + "grad_norm": 1.75, + "learning_rate": 1.1705448404926392e-05, + "loss": 0.8901, + "step": 7849 + }, + { + "epoch": 1.3340120453522515, + "grad_norm": 1.625, + "learning_rate": 1.1703668905949393e-05, + "loss": 0.8759, + "step": 7850 + }, + { + "epoch": 1.3341835094412415, + "grad_norm": 1.65625, + "learning_rate": 1.170188935140933e-05, + "loss": 0.8358, + "step": 7851 + }, + { + "epoch": 1.3343549735302314, + "grad_norm": 1.7578125, + "learning_rate": 1.170010974136425e-05, + "loss": 0.8509, + "step": 7852 + }, + { + "epoch": 1.3345264376192212, + "grad_norm": 1.7265625, + "learning_rate": 1.1698330075872188e-05, + "loss": 0.8816, + "step": 7853 + }, + { + "epoch": 1.334697901708211, + "grad_norm": 1.625, + "learning_rate": 1.1696550354991187e-05, + "loss": 0.8541, + "step": 7854 + }, + { + "epoch": 1.3348693657972008, + "grad_norm": 1.7578125, + "learning_rate": 1.1694770578779293e-05, + "loss": 0.887, + "step": 7855 + }, + { + "epoch": 1.3350408298861907, + "grad_norm": 1.703125, + "learning_rate": 1.1692990747294546e-05, + "loss": 0.8772, + "step": 7856 + }, + { + "epoch": 1.3352122939751805, + "grad_norm": 1.578125, + "learning_rate": 1.1691210860594997e-05, + "loss": 0.8723, + "step": 7857 + }, + { + "epoch": 1.3353837580641703, + "grad_norm": 1.6875, + "learning_rate": 1.1689430918738691e-05, + "loss": 0.901, + "step": 7858 + }, + { + "epoch": 1.3355552221531604, + "grad_norm": 1.640625, + "learning_rate": 1.1687650921783683e-05, + "loss": 0.8865, + "step": 7859 + }, + { + "epoch": 1.3357266862421502, + "grad_norm": 1.6796875, + "learning_rate": 1.168587086978802e-05, + "loss": 0.8233, + "step": 7860 + }, + { + "epoch": 1.33589815033114, + "grad_norm": 1.703125, + "learning_rate": 1.1684090762809762e-05, + "loss": 0.9395, + "step": 7861 + }, + { + "epoch": 1.3360696144201298, + "grad_norm": 1.640625, + "learning_rate": 1.1682310600906962e-05, + "loss": 0.8309, + "step": 7862 + }, + { + "epoch": 1.3362410785091199, + "grad_norm": 1.65625, + "learning_rate": 1.1680530384137681e-05, + "loss": 0.9013, + "step": 7863 + }, + { + "epoch": 1.3364125425981097, + "grad_norm": 1.6640625, + "learning_rate": 
1.1678750112559971e-05, + "loss": 0.8431, + "step": 7864 + }, + { + "epoch": 1.3365840066870995, + "grad_norm": 1.78125, + "learning_rate": 1.1676969786231903e-05, + "loss": 0.9223, + "step": 7865 + }, + { + "epoch": 1.3367554707760894, + "grad_norm": 1.65625, + "learning_rate": 1.167518940521153e-05, + "loss": 0.7831, + "step": 7866 + }, + { + "epoch": 1.3369269348650792, + "grad_norm": 1.84375, + "learning_rate": 1.167340896955693e-05, + "loss": 0.8798, + "step": 7867 + }, + { + "epoch": 1.337098398954069, + "grad_norm": 1.7109375, + "learning_rate": 1.1671628479326157e-05, + "loss": 0.8683, + "step": 7868 + }, + { + "epoch": 1.3372698630430588, + "grad_norm": 1.6875, + "learning_rate": 1.1669847934577285e-05, + "loss": 0.8294, + "step": 7869 + }, + { + "epoch": 1.3374413271320487, + "grad_norm": 1.765625, + "learning_rate": 1.1668067335368383e-05, + "loss": 0.8769, + "step": 7870 + }, + { + "epoch": 1.3376127912210387, + "grad_norm": 1.6484375, + "learning_rate": 1.1666286681757524e-05, + "loss": 0.8865, + "step": 7871 + }, + { + "epoch": 1.3377842553100285, + "grad_norm": 1.6796875, + "learning_rate": 1.1664505973802782e-05, + "loss": 0.8907, + "step": 7872 + }, + { + "epoch": 1.3379557193990184, + "grad_norm": 1.65625, + "learning_rate": 1.1662725211562233e-05, + "loss": 0.8434, + "step": 7873 + }, + { + "epoch": 1.3381271834880082, + "grad_norm": 1.75, + "learning_rate": 1.1660944395093954e-05, + "loss": 0.8884, + "step": 7874 + }, + { + "epoch": 1.338298647576998, + "grad_norm": 1.6328125, + "learning_rate": 1.1659163524456023e-05, + "loss": 0.8571, + "step": 7875 + }, + { + "epoch": 1.338470111665988, + "grad_norm": 1.71875, + "learning_rate": 1.1657382599706522e-05, + "loss": 0.8605, + "step": 7876 + }, + { + "epoch": 1.3386415757549779, + "grad_norm": 1.671875, + "learning_rate": 1.1655601620903531e-05, + "loss": 0.9084, + "step": 7877 + }, + { + "epoch": 1.3388130398439677, + "grad_norm": 1.6953125, + "learning_rate": 1.165382058810514e-05, + "loss": 0.8482, + "step": 7878 + }, + { + "epoch": 1.3389845039329575, + "grad_norm": 1.7421875, + "learning_rate": 1.165203950136943e-05, + "loss": 0.8411, + "step": 7879 + }, + { + "epoch": 1.3391559680219474, + "grad_norm": 1.6953125, + "learning_rate": 1.1650258360754491e-05, + "loss": 0.8961, + "step": 7880 + }, + { + "epoch": 1.3393274321109372, + "grad_norm": 1.75, + "learning_rate": 1.1648477166318413e-05, + "loss": 0.8305, + "step": 7881 + }, + { + "epoch": 1.339498896199927, + "grad_norm": 1.6640625, + "learning_rate": 1.1646695918119289e-05, + "loss": 0.8248, + "step": 7882 + }, + { + "epoch": 1.339670360288917, + "grad_norm": 1.765625, + "learning_rate": 1.1644914616215206e-05, + "loss": 0.9079, + "step": 7883 + }, + { + "epoch": 1.3398418243779069, + "grad_norm": 1.7109375, + "learning_rate": 1.1643133260664268e-05, + "loss": 0.9124, + "step": 7884 + }, + { + "epoch": 1.3400132884668967, + "grad_norm": 1.7265625, + "learning_rate": 1.1641351851524564e-05, + "loss": 0.8732, + "step": 7885 + }, + { + "epoch": 1.3401847525558865, + "grad_norm": 1.796875, + "learning_rate": 1.1639570388854196e-05, + "loss": 0.8122, + "step": 7886 + }, + { + "epoch": 1.3403562166448764, + "grad_norm": 1.7734375, + "learning_rate": 1.1637788872711262e-05, + "loss": 0.8699, + "step": 7887 + }, + { + "epoch": 1.3405276807338664, + "grad_norm": 1.75, + "learning_rate": 1.1636007303153867e-05, + "loss": 0.9392, + "step": 7888 + }, + { + "epoch": 1.3406991448228562, + "grad_norm": 1.7265625, + "learning_rate": 1.163422568024011e-05, + "loss": 0.9003, + 
"step": 7889 + }, + { + "epoch": 1.340870608911846, + "grad_norm": 1.6875, + "learning_rate": 1.1632444004028105e-05, + "loss": 0.88, + "step": 7890 + }, + { + "epoch": 1.3410420730008359, + "grad_norm": 1.75, + "learning_rate": 1.1630662274575952e-05, + "loss": 0.8422, + "step": 7891 + }, + { + "epoch": 1.3412135370898257, + "grad_norm": 1.7421875, + "learning_rate": 1.162888049194176e-05, + "loss": 0.9335, + "step": 7892 + }, + { + "epoch": 1.3413850011788155, + "grad_norm": 1.703125, + "learning_rate": 1.1627098656183645e-05, + "loss": 0.8748, + "step": 7893 + }, + { + "epoch": 1.3415564652678054, + "grad_norm": 1.578125, + "learning_rate": 1.1625316767359711e-05, + "loss": 0.7939, + "step": 7894 + }, + { + "epoch": 1.3417279293567954, + "grad_norm": 1.7578125, + "learning_rate": 1.162353482552808e-05, + "loss": 0.8575, + "step": 7895 + }, + { + "epoch": 1.3418993934457852, + "grad_norm": 1.703125, + "learning_rate": 1.1621752830746863e-05, + "loss": 0.8959, + "step": 7896 + }, + { + "epoch": 1.342070857534775, + "grad_norm": 1.6328125, + "learning_rate": 1.1619970783074179e-05, + "loss": 0.8454, + "step": 7897 + }, + { + "epoch": 1.342242321623765, + "grad_norm": 1.7109375, + "learning_rate": 1.1618188682568148e-05, + "loss": 0.9552, + "step": 7898 + }, + { + "epoch": 1.3424137857127547, + "grad_norm": 1.71875, + "learning_rate": 1.1616406529286891e-05, + "loss": 0.858, + "step": 7899 + }, + { + "epoch": 1.3425852498017448, + "grad_norm": 1.625, + "learning_rate": 1.161462432328853e-05, + "loss": 0.9018, + "step": 7900 + }, + { + "epoch": 1.3427567138907346, + "grad_norm": 1.7109375, + "learning_rate": 1.161284206463119e-05, + "loss": 0.8469, + "step": 7901 + }, + { + "epoch": 1.3429281779797244, + "grad_norm": 1.6015625, + "learning_rate": 1.1611059753373e-05, + "loss": 0.8626, + "step": 7902 + }, + { + "epoch": 1.3430996420687142, + "grad_norm": 1.6953125, + "learning_rate": 1.1609277389572082e-05, + "loss": 0.8517, + "step": 7903 + }, + { + "epoch": 1.343271106157704, + "grad_norm": 1.828125, + "learning_rate": 1.1607494973286566e-05, + "loss": 0.888, + "step": 7904 + }, + { + "epoch": 1.343442570246694, + "grad_norm": 1.7578125, + "learning_rate": 1.1605712504574588e-05, + "loss": 0.993, + "step": 7905 + }, + { + "epoch": 1.3436140343356837, + "grad_norm": 1.8203125, + "learning_rate": 1.160392998349428e-05, + "loss": 0.9164, + "step": 7906 + }, + { + "epoch": 1.3437854984246738, + "grad_norm": 1.6640625, + "learning_rate": 1.1602147410103772e-05, + "loss": 0.7978, + "step": 7907 + }, + { + "epoch": 1.3439569625136636, + "grad_norm": 1.6171875, + "learning_rate": 1.1600364784461207e-05, + "loss": 0.8315, + "step": 7908 + }, + { + "epoch": 1.3441284266026534, + "grad_norm": 1.6328125, + "learning_rate": 1.1598582106624716e-05, + "loss": 0.8325, + "step": 7909 + }, + { + "epoch": 1.3442998906916432, + "grad_norm": 1.6796875, + "learning_rate": 1.1596799376652446e-05, + "loss": 0.8375, + "step": 7910 + }, + { + "epoch": 1.344471354780633, + "grad_norm": 1.8125, + "learning_rate": 1.1595016594602535e-05, + "loss": 0.8738, + "step": 7911 + }, + { + "epoch": 1.3446428188696231, + "grad_norm": 1.703125, + "learning_rate": 1.1593233760533127e-05, + "loss": 0.8572, + "step": 7912 + }, + { + "epoch": 1.344814282958613, + "grad_norm": 1.671875, + "learning_rate": 1.1591450874502365e-05, + "loss": 0.8272, + "step": 7913 + }, + { + "epoch": 1.3449857470476028, + "grad_norm": 1.6796875, + "learning_rate": 1.1589667936568399e-05, + "loss": 0.8233, + "step": 7914 + }, + { + "epoch": 
1.3451572111365926, + "grad_norm": 1.7421875, + "learning_rate": 1.1587884946789374e-05, + "loss": 0.8449, + "step": 7915 + }, + { + "epoch": 1.3453286752255824, + "grad_norm": 1.6171875, + "learning_rate": 1.1586101905223442e-05, + "loss": 0.7674, + "step": 7916 + }, + { + "epoch": 1.3455001393145722, + "grad_norm": 1.7109375, + "learning_rate": 1.1584318811928752e-05, + "loss": 0.8283, + "step": 7917 + }, + { + "epoch": 1.345671603403562, + "grad_norm": 1.7421875, + "learning_rate": 1.1582535666963462e-05, + "loss": 0.8261, + "step": 7918 + }, + { + "epoch": 1.345843067492552, + "grad_norm": 1.625, + "learning_rate": 1.1580752470385725e-05, + "loss": 0.9092, + "step": 7919 + }, + { + "epoch": 1.346014531581542, + "grad_norm": 1.7734375, + "learning_rate": 1.1578969222253696e-05, + "loss": 0.8821, + "step": 7920 + }, + { + "epoch": 1.3461859956705318, + "grad_norm": 1.6953125, + "learning_rate": 1.1577185922625536e-05, + "loss": 0.8901, + "step": 7921 + }, + { + "epoch": 1.3463574597595216, + "grad_norm": 1.6640625, + "learning_rate": 1.1575402571559403e-05, + "loss": 0.8937, + "step": 7922 + }, + { + "epoch": 1.3465289238485114, + "grad_norm": 1.8046875, + "learning_rate": 1.157361916911346e-05, + "loss": 0.8723, + "step": 7923 + }, + { + "epoch": 1.3467003879375015, + "grad_norm": 1.6796875, + "learning_rate": 1.1571835715345869e-05, + "loss": 0.885, + "step": 7924 + }, + { + "epoch": 1.3468718520264913, + "grad_norm": 1.6640625, + "learning_rate": 1.1570052210314798e-05, + "loss": 0.8569, + "step": 7925 + }, + { + "epoch": 1.3470433161154811, + "grad_norm": 1.6953125, + "learning_rate": 1.1568268654078411e-05, + "loss": 0.9641, + "step": 7926 + }, + { + "epoch": 1.347214780204471, + "grad_norm": 1.65625, + "learning_rate": 1.1566485046694877e-05, + "loss": 0.8032, + "step": 7927 + }, + { + "epoch": 1.3473862442934608, + "grad_norm": 1.5859375, + "learning_rate": 1.156470138822237e-05, + "loss": 0.8226, + "step": 7928 + }, + { + "epoch": 1.3475577083824506, + "grad_norm": 1.640625, + "learning_rate": 1.1562917678719056e-05, + "loss": 0.8646, + "step": 7929 + }, + { + "epoch": 1.3477291724714404, + "grad_norm": 1.7109375, + "learning_rate": 1.1561133918243113e-05, + "loss": 0.8501, + "step": 7930 + }, + { + "epoch": 1.3479006365604302, + "grad_norm": 1.6640625, + "learning_rate": 1.1559350106852713e-05, + "loss": 0.735, + "step": 7931 + }, + { + "epoch": 1.3480721006494203, + "grad_norm": 1.640625, + "learning_rate": 1.1557566244606035e-05, + "loss": 0.9369, + "step": 7932 + }, + { + "epoch": 1.3482435647384101, + "grad_norm": 1.765625, + "learning_rate": 1.1555782331561253e-05, + "loss": 0.9249, + "step": 7933 + }, + { + "epoch": 1.3484150288274, + "grad_norm": 1.671875, + "learning_rate": 1.1553998367776555e-05, + "loss": 0.9296, + "step": 7934 + }, + { + "epoch": 1.3485864929163898, + "grad_norm": 1.65625, + "learning_rate": 1.1552214353310117e-05, + "loss": 0.8557, + "step": 7935 + }, + { + "epoch": 1.3487579570053798, + "grad_norm": 1.6015625, + "learning_rate": 1.1550430288220122e-05, + "loss": 0.8573, + "step": 7936 + }, + { + "epoch": 1.3489294210943696, + "grad_norm": 1.734375, + "learning_rate": 1.154864617256476e-05, + "loss": 0.832, + "step": 7937 + }, + { + "epoch": 1.3491008851833595, + "grad_norm": 1.703125, + "learning_rate": 1.1546862006402211e-05, + "loss": 0.8838, + "step": 7938 + }, + { + "epoch": 1.3492723492723493, + "grad_norm": 1.65625, + "learning_rate": 1.1545077789790668e-05, + "loss": 0.8245, + "step": 7939 + }, + { + "epoch": 1.3494438133613391, + 
"grad_norm": 1.7421875, + "learning_rate": 1.1543293522788321e-05, + "loss": 0.8864, + "step": 7940 + }, + { + "epoch": 1.349615277450329, + "grad_norm": 1.703125, + "learning_rate": 1.1541509205453363e-05, + "loss": 0.9193, + "step": 7941 + }, + { + "epoch": 1.3497867415393188, + "grad_norm": 1.75, + "learning_rate": 1.1539724837843984e-05, + "loss": 0.8673, + "step": 7942 + }, + { + "epoch": 1.3499582056283086, + "grad_norm": 1.6875, + "learning_rate": 1.1537940420018378e-05, + "loss": 0.8974, + "step": 7943 + }, + { + "epoch": 1.3501296697172986, + "grad_norm": 1.6640625, + "learning_rate": 1.1536155952034743e-05, + "loss": 0.8876, + "step": 7944 + }, + { + "epoch": 1.3503011338062885, + "grad_norm": 1.671875, + "learning_rate": 1.1534371433951279e-05, + "loss": 0.8667, + "step": 7945 + }, + { + "epoch": 1.3504725978952783, + "grad_norm": 1.7578125, + "learning_rate": 1.1532586865826185e-05, + "loss": 0.9604, + "step": 7946 + }, + { + "epoch": 1.3506440619842681, + "grad_norm": 1.546875, + "learning_rate": 1.153080224771766e-05, + "loss": 0.7985, + "step": 7947 + }, + { + "epoch": 1.3508155260732582, + "grad_norm": 1.671875, + "learning_rate": 1.1529017579683915e-05, + "loss": 0.8312, + "step": 7948 + }, + { + "epoch": 1.350986990162248, + "grad_norm": 1.6875, + "learning_rate": 1.1527232861783145e-05, + "loss": 0.9297, + "step": 7949 + }, + { + "epoch": 1.3511584542512378, + "grad_norm": 1.7265625, + "learning_rate": 1.1525448094073559e-05, + "loss": 0.8972, + "step": 7950 + }, + { + "epoch": 1.3513299183402276, + "grad_norm": 1.7109375, + "learning_rate": 1.1523663276613368e-05, + "loss": 0.8746, + "step": 7951 + }, + { + "epoch": 1.3515013824292175, + "grad_norm": 1.65625, + "learning_rate": 1.152187840946078e-05, + "loss": 0.8506, + "step": 7952 + }, + { + "epoch": 1.3516728465182073, + "grad_norm": 1.671875, + "learning_rate": 1.1520093492674004e-05, + "loss": 0.9189, + "step": 7953 + }, + { + "epoch": 1.3518443106071971, + "grad_norm": 1.71875, + "learning_rate": 1.1518308526311257e-05, + "loss": 0.829, + "step": 7954 + }, + { + "epoch": 1.352015774696187, + "grad_norm": 1.6875, + "learning_rate": 1.1516523510430753e-05, + "loss": 0.8295, + "step": 7955 + }, + { + "epoch": 1.352187238785177, + "grad_norm": 1.6796875, + "learning_rate": 1.1514738445090705e-05, + "loss": 0.8435, + "step": 7956 + }, + { + "epoch": 1.3523587028741668, + "grad_norm": 1.6015625, + "learning_rate": 1.1512953330349332e-05, + "loss": 0.8204, + "step": 7957 + }, + { + "epoch": 1.3525301669631566, + "grad_norm": 1.6953125, + "learning_rate": 1.1511168166264854e-05, + "loss": 0.8189, + "step": 7958 + }, + { + "epoch": 1.3527016310521465, + "grad_norm": 1.6640625, + "learning_rate": 1.1509382952895494e-05, + "loss": 0.794, + "step": 7959 + }, + { + "epoch": 1.3528730951411365, + "grad_norm": 1.6328125, + "learning_rate": 1.150759769029947e-05, + "loss": 0.7831, + "step": 7960 + }, + { + "epoch": 1.3530445592301263, + "grad_norm": 1.734375, + "learning_rate": 1.1505812378535008e-05, + "loss": 0.8546, + "step": 7961 + }, + { + "epoch": 1.3532160233191162, + "grad_norm": 1.671875, + "learning_rate": 1.1504027017660333e-05, + "loss": 0.9, + "step": 7962 + }, + { + "epoch": 1.353387487408106, + "grad_norm": 1.5703125, + "learning_rate": 1.1502241607733675e-05, + "loss": 0.8458, + "step": 7963 + }, + { + "epoch": 1.3535589514970958, + "grad_norm": 1.75, + "learning_rate": 1.1500456148813258e-05, + "loss": 0.8764, + "step": 7964 + }, + { + "epoch": 1.3537304155860856, + "grad_norm": 1.71875, + "learning_rate": 
1.1498670640957319e-05, + "loss": 0.8472, + "step": 7965 + }, + { + "epoch": 1.3539018796750755, + "grad_norm": 1.6640625, + "learning_rate": 1.1496885084224088e-05, + "loss": 0.7855, + "step": 7966 + }, + { + "epoch": 1.3540733437640653, + "grad_norm": 1.765625, + "learning_rate": 1.1495099478671797e-05, + "loss": 0.962, + "step": 7967 + }, + { + "epoch": 1.3542448078530553, + "grad_norm": 1.625, + "learning_rate": 1.1493313824358678e-05, + "loss": 0.8217, + "step": 7968 + }, + { + "epoch": 1.3544162719420452, + "grad_norm": 1.6328125, + "learning_rate": 1.1491528121342977e-05, + "loss": 0.846, + "step": 7969 + }, + { + "epoch": 1.354587736031035, + "grad_norm": 1.6640625, + "learning_rate": 1.1489742369682923e-05, + "loss": 0.8412, + "step": 7970 + }, + { + "epoch": 1.3547592001200248, + "grad_norm": 1.671875, + "learning_rate": 1.1487956569436766e-05, + "loss": 0.9254, + "step": 7971 + }, + { + "epoch": 1.3549306642090146, + "grad_norm": 1.7734375, + "learning_rate": 1.1486170720662736e-05, + "loss": 0.8846, + "step": 7972 + }, + { + "epoch": 1.3551021282980047, + "grad_norm": 1.9296875, + "learning_rate": 1.1484384823419086e-05, + "loss": 0.8607, + "step": 7973 + }, + { + "epoch": 1.3552735923869945, + "grad_norm": 1.78125, + "learning_rate": 1.1482598877764055e-05, + "loss": 0.8201, + "step": 7974 + }, + { + "epoch": 1.3554450564759843, + "grad_norm": 1.6640625, + "learning_rate": 1.1480812883755894e-05, + "loss": 0.8109, + "step": 7975 + }, + { + "epoch": 1.3556165205649742, + "grad_norm": 1.640625, + "learning_rate": 1.1479026841452848e-05, + "loss": 0.9103, + "step": 7976 + }, + { + "epoch": 1.355787984653964, + "grad_norm": 1.7109375, + "learning_rate": 1.1477240750913168e-05, + "loss": 0.8906, + "step": 7977 + }, + { + "epoch": 1.3559594487429538, + "grad_norm": 1.734375, + "learning_rate": 1.1475454612195102e-05, + "loss": 0.8033, + "step": 7978 + }, + { + "epoch": 1.3561309128319436, + "grad_norm": 1.6953125, + "learning_rate": 1.1473668425356908e-05, + "loss": 0.8839, + "step": 7979 + }, + { + "epoch": 1.3563023769209337, + "grad_norm": 1.5625, + "learning_rate": 1.1471882190456836e-05, + "loss": 0.7843, + "step": 7980 + }, + { + "epoch": 1.3564738410099235, + "grad_norm": 1.7109375, + "learning_rate": 1.1470095907553143e-05, + "loss": 0.8963, + "step": 7981 + }, + { + "epoch": 1.3566453050989133, + "grad_norm": 1.734375, + "learning_rate": 1.1468309576704084e-05, + "loss": 0.9226, + "step": 7982 + }, + { + "epoch": 1.3568167691879032, + "grad_norm": 1.6484375, + "learning_rate": 1.1466523197967923e-05, + "loss": 0.8045, + "step": 7983 + }, + { + "epoch": 1.356988233276893, + "grad_norm": 1.6875, + "learning_rate": 1.1464736771402918e-05, + "loss": 0.8812, + "step": 7984 + }, + { + "epoch": 1.357159697365883, + "grad_norm": 1.6875, + "learning_rate": 1.1462950297067333e-05, + "loss": 0.8684, + "step": 7985 + }, + { + "epoch": 1.3573311614548729, + "grad_norm": 1.7890625, + "learning_rate": 1.1461163775019426e-05, + "loss": 0.9137, + "step": 7986 + }, + { + "epoch": 1.3575026255438627, + "grad_norm": 1.65625, + "learning_rate": 1.1459377205317467e-05, + "loss": 0.8363, + "step": 7987 + }, + { + "epoch": 1.3576740896328525, + "grad_norm": 1.703125, + "learning_rate": 1.1457590588019726e-05, + "loss": 0.8861, + "step": 7988 + }, + { + "epoch": 1.3578455537218423, + "grad_norm": 1.6015625, + "learning_rate": 1.1455803923184465e-05, + "loss": 0.8179, + "step": 7989 + }, + { + "epoch": 1.3580170178108322, + "grad_norm": 1.671875, + "learning_rate": 1.1454017210869954e-05, + "loss": 
0.8174, + "step": 7990 + }, + { + "epoch": 1.358188481899822, + "grad_norm": 1.671875, + "learning_rate": 1.1452230451134465e-05, + "loss": 0.8978, + "step": 7991 + }, + { + "epoch": 1.358359945988812, + "grad_norm": 1.640625, + "learning_rate": 1.1450443644036276e-05, + "loss": 0.8766, + "step": 7992 + }, + { + "epoch": 1.3585314100778019, + "grad_norm": 1.734375, + "learning_rate": 1.1448656789633657e-05, + "loss": 0.9489, + "step": 7993 + }, + { + "epoch": 1.3587028741667917, + "grad_norm": 1.71875, + "learning_rate": 1.1446869887984887e-05, + "loss": 0.9773, + "step": 7994 + }, + { + "epoch": 1.3588743382557815, + "grad_norm": 1.734375, + "learning_rate": 1.144508293914824e-05, + "loss": 0.9206, + "step": 7995 + }, + { + "epoch": 1.3590458023447713, + "grad_norm": 1.859375, + "learning_rate": 1.1443295943181996e-05, + "loss": 0.8688, + "step": 7996 + }, + { + "epoch": 1.3592172664337614, + "grad_norm": 1.65625, + "learning_rate": 1.1441508900144436e-05, + "loss": 0.8146, + "step": 7997 + }, + { + "epoch": 1.3593887305227512, + "grad_norm": 1.6953125, + "learning_rate": 1.1439721810093843e-05, + "loss": 0.878, + "step": 7998 + }, + { + "epoch": 1.359560194611741, + "grad_norm": 1.7578125, + "learning_rate": 1.14379346730885e-05, + "loss": 0.8769, + "step": 7999 + }, + { + "epoch": 1.3597316587007309, + "grad_norm": 1.7421875, + "learning_rate": 1.1436147489186693e-05, + "loss": 0.8759, + "step": 8000 + }, + { + "epoch": 1.3599031227897207, + "grad_norm": 1.640625, + "learning_rate": 1.1434360258446705e-05, + "loss": 0.8362, + "step": 8001 + }, + { + "epoch": 1.3600745868787105, + "grad_norm": 1.7265625, + "learning_rate": 1.1432572980926833e-05, + "loss": 0.8737, + "step": 8002 + }, + { + "epoch": 1.3602460509677003, + "grad_norm": 1.6171875, + "learning_rate": 1.1430785656685358e-05, + "loss": 0.8076, + "step": 8003 + }, + { + "epoch": 1.3604175150566904, + "grad_norm": 1.609375, + "learning_rate": 1.1428998285780576e-05, + "loss": 0.8479, + "step": 8004 + }, + { + "epoch": 1.3605889791456802, + "grad_norm": 1.6796875, + "learning_rate": 1.142721086827078e-05, + "loss": 0.8055, + "step": 8005 + }, + { + "epoch": 1.36076044323467, + "grad_norm": 1.7421875, + "learning_rate": 1.1425423404214263e-05, + "loss": 0.847, + "step": 8006 + }, + { + "epoch": 1.3609319073236599, + "grad_norm": 1.765625, + "learning_rate": 1.142363589366932e-05, + "loss": 0.8638, + "step": 8007 + }, + { + "epoch": 1.3611033714126497, + "grad_norm": 1.6640625, + "learning_rate": 1.142184833669425e-05, + "loss": 0.859, + "step": 8008 + }, + { + "epoch": 1.3612748355016397, + "grad_norm": 1.703125, + "learning_rate": 1.1420060733347353e-05, + "loss": 0.7812, + "step": 8009 + }, + { + "epoch": 1.3614462995906296, + "grad_norm": 1.8359375, + "learning_rate": 1.1418273083686926e-05, + "loss": 0.8876, + "step": 8010 + }, + { + "epoch": 1.3616177636796194, + "grad_norm": 1.71875, + "learning_rate": 1.1416485387771274e-05, + "loss": 0.8667, + "step": 8011 + }, + { + "epoch": 1.3617892277686092, + "grad_norm": 1.6484375, + "learning_rate": 1.1414697645658699e-05, + "loss": 0.8258, + "step": 8012 + }, + { + "epoch": 1.361960691857599, + "grad_norm": 1.7578125, + "learning_rate": 1.1412909857407505e-05, + "loss": 0.8949, + "step": 8013 + }, + { + "epoch": 1.3621321559465889, + "grad_norm": 1.6640625, + "learning_rate": 1.1411122023076002e-05, + "loss": 0.8998, + "step": 8014 + }, + { + "epoch": 1.3623036200355787, + "grad_norm": 1.59375, + "learning_rate": 1.1409334142722494e-05, + "loss": 0.8753, + "step": 8015 + }, + { + 
"epoch": 1.3624750841245685, + "grad_norm": 1.671875, + "learning_rate": 1.1407546216405296e-05, + "loss": 0.8763, + "step": 8016 + }, + { + "epoch": 1.3626465482135586, + "grad_norm": 1.8671875, + "learning_rate": 1.1405758244182716e-05, + "loss": 0.9637, + "step": 8017 + }, + { + "epoch": 1.3628180123025484, + "grad_norm": 1.7734375, + "learning_rate": 1.1403970226113064e-05, + "loss": 0.9156, + "step": 8018 + }, + { + "epoch": 1.3629894763915382, + "grad_norm": 1.8125, + "learning_rate": 1.1402182162254653e-05, + "loss": 0.8782, + "step": 8019 + }, + { + "epoch": 1.363160940480528, + "grad_norm": 1.609375, + "learning_rate": 1.1400394052665804e-05, + "loss": 0.7742, + "step": 8020 + }, + { + "epoch": 1.363332404569518, + "grad_norm": 1.765625, + "learning_rate": 1.1398605897404833e-05, + "loss": 0.9132, + "step": 8021 + }, + { + "epoch": 1.363503868658508, + "grad_norm": 1.703125, + "learning_rate": 1.1396817696530055e-05, + "loss": 0.8553, + "step": 8022 + }, + { + "epoch": 1.3636753327474977, + "grad_norm": 1.640625, + "learning_rate": 1.1395029450099794e-05, + "loss": 0.8345, + "step": 8023 + }, + { + "epoch": 1.3638467968364876, + "grad_norm": 1.6953125, + "learning_rate": 1.139324115817237e-05, + "loss": 0.8422, + "step": 8024 + }, + { + "epoch": 1.3640182609254774, + "grad_norm": 1.7421875, + "learning_rate": 1.1391452820806103e-05, + "loss": 0.9355, + "step": 8025 + }, + { + "epoch": 1.3641897250144672, + "grad_norm": 1.7734375, + "learning_rate": 1.138966443805932e-05, + "loss": 0.9624, + "step": 8026 + }, + { + "epoch": 1.364361189103457, + "grad_norm": 1.7109375, + "learning_rate": 1.1387876009990348e-05, + "loss": 0.8817, + "step": 8027 + }, + { + "epoch": 1.3645326531924469, + "grad_norm": 1.7109375, + "learning_rate": 1.1386087536657513e-05, + "loss": 0.7908, + "step": 8028 + }, + { + "epoch": 1.364704117281437, + "grad_norm": 1.703125, + "learning_rate": 1.1384299018119142e-05, + "loss": 0.862, + "step": 8029 + }, + { + "epoch": 1.3648755813704267, + "grad_norm": 1.703125, + "learning_rate": 1.1382510454433568e-05, + "loss": 0.8278, + "step": 8030 + }, + { + "epoch": 1.3650470454594166, + "grad_norm": 1.734375, + "learning_rate": 1.1380721845659123e-05, + "loss": 0.8649, + "step": 8031 + }, + { + "epoch": 1.3652185095484064, + "grad_norm": 1.65625, + "learning_rate": 1.1378933191854137e-05, + "loss": 0.8856, + "step": 8032 + }, + { + "epoch": 1.3653899736373964, + "grad_norm": 1.6796875, + "learning_rate": 1.137714449307695e-05, + "loss": 0.9367, + "step": 8033 + }, + { + "epoch": 1.3655614377263863, + "grad_norm": 1.5703125, + "learning_rate": 1.1375355749385896e-05, + "loss": 0.7769, + "step": 8034 + }, + { + "epoch": 1.365732901815376, + "grad_norm": 1.578125, + "learning_rate": 1.137356696083931e-05, + "loss": 0.8368, + "step": 8035 + }, + { + "epoch": 1.365904365904366, + "grad_norm": 1.6875, + "learning_rate": 1.1371778127495535e-05, + "loss": 0.8367, + "step": 8036 + }, + { + "epoch": 1.3660758299933558, + "grad_norm": 1.6640625, + "learning_rate": 1.1369989249412905e-05, + "loss": 0.8095, + "step": 8037 + }, + { + "epoch": 1.3662472940823456, + "grad_norm": 1.625, + "learning_rate": 1.1368200326649768e-05, + "loss": 0.8846, + "step": 8038 + }, + { + "epoch": 1.3664187581713354, + "grad_norm": 1.7109375, + "learning_rate": 1.1366411359264466e-05, + "loss": 0.9167, + "step": 8039 + }, + { + "epoch": 1.3665902222603252, + "grad_norm": 1.6484375, + "learning_rate": 1.1364622347315348e-05, + "loss": 0.853, + "step": 8040 + }, + { + "epoch": 1.3667616863493153, + 
"grad_norm": 1.625, + "learning_rate": 1.1362833290860756e-05, + "loss": 0.8976, + "step": 8041 + }, + { + "epoch": 1.366933150438305, + "grad_norm": 1.6796875, + "learning_rate": 1.1361044189959035e-05, + "loss": 0.8774, + "step": 8042 + }, + { + "epoch": 1.367104614527295, + "grad_norm": 1.7265625, + "learning_rate": 1.135925504466854e-05, + "loss": 0.8337, + "step": 8043 + }, + { + "epoch": 1.3672760786162848, + "grad_norm": 1.7109375, + "learning_rate": 1.1357465855047617e-05, + "loss": 0.9059, + "step": 8044 + }, + { + "epoch": 1.3674475427052748, + "grad_norm": 1.8203125, + "learning_rate": 1.1355676621154624e-05, + "loss": 0.8321, + "step": 8045 + }, + { + "epoch": 1.3676190067942646, + "grad_norm": 1.7421875, + "learning_rate": 1.135388734304791e-05, + "loss": 0.8401, + "step": 8046 + }, + { + "epoch": 1.3677904708832545, + "grad_norm": 1.6484375, + "learning_rate": 1.1352098020785832e-05, + "loss": 0.8627, + "step": 8047 + }, + { + "epoch": 1.3679619349722443, + "grad_norm": 1.7265625, + "learning_rate": 1.1350308654426744e-05, + "loss": 0.8864, + "step": 8048 + }, + { + "epoch": 1.368133399061234, + "grad_norm": 1.796875, + "learning_rate": 1.1348519244029008e-05, + "loss": 0.8816, + "step": 8049 + }, + { + "epoch": 1.368304863150224, + "grad_norm": 1.734375, + "learning_rate": 1.1346729789650982e-05, + "loss": 0.8284, + "step": 8050 + }, + { + "epoch": 1.3684763272392138, + "grad_norm": 1.6875, + "learning_rate": 1.1344940291351022e-05, + "loss": 0.8998, + "step": 8051 + }, + { + "epoch": 1.3686477913282036, + "grad_norm": 1.59375, + "learning_rate": 1.1343150749187501e-05, + "loss": 0.8313, + "step": 8052 + }, + { + "epoch": 1.3688192554171936, + "grad_norm": 1.6953125, + "learning_rate": 1.1341361163218771e-05, + "loss": 0.9093, + "step": 8053 + }, + { + "epoch": 1.3689907195061835, + "grad_norm": 1.734375, + "learning_rate": 1.1339571533503207e-05, + "loss": 0.9283, + "step": 8054 + }, + { + "epoch": 1.3691621835951733, + "grad_norm": 1.7265625, + "learning_rate": 1.1337781860099165e-05, + "loss": 0.957, + "step": 8055 + }, + { + "epoch": 1.369333647684163, + "grad_norm": 1.6640625, + "learning_rate": 1.1335992143065023e-05, + "loss": 0.8692, + "step": 8056 + }, + { + "epoch": 1.369505111773153, + "grad_norm": 1.7109375, + "learning_rate": 1.1334202382459145e-05, + "loss": 0.8551, + "step": 8057 + }, + { + "epoch": 1.369676575862143, + "grad_norm": 1.703125, + "learning_rate": 1.1332412578339905e-05, + "loss": 0.8644, + "step": 8058 + }, + { + "epoch": 1.3698480399511328, + "grad_norm": 1.640625, + "learning_rate": 1.133062273076567e-05, + "loss": 0.848, + "step": 8059 + }, + { + "epoch": 1.3700195040401226, + "grad_norm": 1.75, + "learning_rate": 1.1328832839794819e-05, + "loss": 0.7959, + "step": 8060 + }, + { + "epoch": 1.3701909681291125, + "grad_norm": 1.6796875, + "learning_rate": 1.1327042905485725e-05, + "loss": 0.8229, + "step": 8061 + }, + { + "epoch": 1.3703624322181023, + "grad_norm": 1.7265625, + "learning_rate": 1.1325252927896765e-05, + "loss": 0.8541, + "step": 8062 + }, + { + "epoch": 1.370533896307092, + "grad_norm": 1.734375, + "learning_rate": 1.132346290708632e-05, + "loss": 0.8699, + "step": 8063 + }, + { + "epoch": 1.370705360396082, + "grad_norm": 1.734375, + "learning_rate": 1.132167284311276e-05, + "loss": 0.8418, + "step": 8064 + }, + { + "epoch": 1.370876824485072, + "grad_norm": 1.765625, + "learning_rate": 1.1319882736034476e-05, + "loss": 0.8203, + "step": 8065 + }, + { + "epoch": 1.3710482885740618, + "grad_norm": 1.5703125, + "learning_rate": 
1.1318092585909841e-05, + "loss": 0.8273, + "step": 8066 + }, + { + "epoch": 1.3712197526630516, + "grad_norm": 1.640625, + "learning_rate": 1.1316302392797244e-05, + "loss": 0.8852, + "step": 8067 + }, + { + "epoch": 1.3713912167520415, + "grad_norm": 1.6953125, + "learning_rate": 1.1314512156755073e-05, + "loss": 0.8474, + "step": 8068 + }, + { + "epoch": 1.3715626808410313, + "grad_norm": 1.6640625, + "learning_rate": 1.131272187784171e-05, + "loss": 0.7935, + "step": 8069 + }, + { + "epoch": 1.3717341449300213, + "grad_norm": 1.765625, + "learning_rate": 1.1310931556115543e-05, + "loss": 0.9003, + "step": 8070 + }, + { + "epoch": 1.3719056090190112, + "grad_norm": 1.609375, + "learning_rate": 1.130914119163496e-05, + "loss": 0.8306, + "step": 8071 + }, + { + "epoch": 1.372077073108001, + "grad_norm": 1.703125, + "learning_rate": 1.1307350784458355e-05, + "loss": 0.9435, + "step": 8072 + }, + { + "epoch": 1.3722485371969908, + "grad_norm": 1.7421875, + "learning_rate": 1.1305560334644116e-05, + "loss": 0.9552, + "step": 8073 + }, + { + "epoch": 1.3724200012859806, + "grad_norm": 1.7109375, + "learning_rate": 1.1303769842250638e-05, + "loss": 0.8818, + "step": 8074 + }, + { + "epoch": 1.3725914653749705, + "grad_norm": 1.65625, + "learning_rate": 1.1301979307336317e-05, + "loss": 0.8798, + "step": 8075 + }, + { + "epoch": 1.3727629294639603, + "grad_norm": 1.71875, + "learning_rate": 1.1300188729959548e-05, + "loss": 0.8477, + "step": 8076 + }, + { + "epoch": 1.3729343935529503, + "grad_norm": 1.640625, + "learning_rate": 1.1298398110178728e-05, + "loss": 0.8337, + "step": 8077 + }, + { + "epoch": 1.3731058576419402, + "grad_norm": 1.6875, + "learning_rate": 1.1296607448052255e-05, + "loss": 0.865, + "step": 8078 + }, + { + "epoch": 1.37327732173093, + "grad_norm": 1.640625, + "learning_rate": 1.1294816743638533e-05, + "loss": 0.8704, + "step": 8079 + }, + { + "epoch": 1.3734487858199198, + "grad_norm": 1.65625, + "learning_rate": 1.1293025996995962e-05, + "loss": 0.8725, + "step": 8080 + }, + { + "epoch": 1.3736202499089096, + "grad_norm": 1.796875, + "learning_rate": 1.1291235208182942e-05, + "loss": 0.898, + "step": 8081 + }, + { + "epoch": 1.3737917139978997, + "grad_norm": 1.640625, + "learning_rate": 1.128944437725788e-05, + "loss": 0.814, + "step": 8082 + }, + { + "epoch": 1.3739631780868895, + "grad_norm": 1.6015625, + "learning_rate": 1.128765350427918e-05, + "loss": 0.9043, + "step": 8083 + }, + { + "epoch": 1.3741346421758793, + "grad_norm": 1.6171875, + "learning_rate": 1.1285862589305252e-05, + "loss": 0.8093, + "step": 8084 + }, + { + "epoch": 1.3743061062648692, + "grad_norm": 1.6796875, + "learning_rate": 1.12840716323945e-05, + "loss": 0.9416, + "step": 8085 + }, + { + "epoch": 1.374477570353859, + "grad_norm": 1.6484375, + "learning_rate": 1.128228063360534e-05, + "loss": 0.8444, + "step": 8086 + }, + { + "epoch": 1.3746490344428488, + "grad_norm": 1.7109375, + "learning_rate": 1.1280489592996177e-05, + "loss": 0.817, + "step": 8087 + }, + { + "epoch": 1.3748204985318386, + "grad_norm": 1.6953125, + "learning_rate": 1.127869851062543e-05, + "loss": 0.8625, + "step": 8088 + }, + { + "epoch": 1.3749919626208287, + "grad_norm": 1.7734375, + "learning_rate": 1.1276907386551508e-05, + "loss": 0.8986, + "step": 8089 + }, + { + "epoch": 1.3751634267098185, + "grad_norm": 1.6875, + "learning_rate": 1.1275116220832826e-05, + "loss": 0.8341, + "step": 8090 + }, + { + "epoch": 1.3753348907988083, + "grad_norm": 1.6171875, + "learning_rate": 1.1273325013527805e-05, + "loss": 
0.8414, + "step": 8091 + }, + { + "epoch": 1.3755063548877982, + "grad_norm": 1.6796875, + "learning_rate": 1.1271533764694862e-05, + "loss": 0.8472, + "step": 8092 + }, + { + "epoch": 1.375677818976788, + "grad_norm": 1.6875, + "learning_rate": 1.126974247439241e-05, + "loss": 0.7282, + "step": 8093 + }, + { + "epoch": 1.375849283065778, + "grad_norm": 1.6484375, + "learning_rate": 1.1267951142678877e-05, + "loss": 0.8236, + "step": 8094 + }, + { + "epoch": 1.3760207471547679, + "grad_norm": 1.671875, + "learning_rate": 1.126615976961268e-05, + "loss": 0.8156, + "step": 8095 + }, + { + "epoch": 1.3761922112437577, + "grad_norm": 1.734375, + "learning_rate": 1.1264368355252248e-05, + "loss": 0.8662, + "step": 8096 + }, + { + "epoch": 1.3763636753327475, + "grad_norm": 1.71875, + "learning_rate": 1.1262576899656001e-05, + "loss": 0.8275, + "step": 8097 + }, + { + "epoch": 1.3765351394217373, + "grad_norm": 1.6640625, + "learning_rate": 1.1260785402882371e-05, + "loss": 0.9154, + "step": 8098 + }, + { + "epoch": 1.3767066035107272, + "grad_norm": 1.703125, + "learning_rate": 1.1258993864989777e-05, + "loss": 0.8768, + "step": 8099 + }, + { + "epoch": 1.376878067599717, + "grad_norm": 1.8671875, + "learning_rate": 1.1257202286036653e-05, + "loss": 0.9332, + "step": 8100 + }, + { + "epoch": 1.377049531688707, + "grad_norm": 1.6328125, + "learning_rate": 1.125541066608143e-05, + "loss": 0.8837, + "step": 8101 + }, + { + "epoch": 1.3772209957776969, + "grad_norm": 1.6953125, + "learning_rate": 1.1253619005182534e-05, + "loss": 0.8344, + "step": 8102 + }, + { + "epoch": 1.3773924598666867, + "grad_norm": 1.7734375, + "learning_rate": 1.1251827303398406e-05, + "loss": 0.9364, + "step": 8103 + }, + { + "epoch": 1.3775639239556765, + "grad_norm": 1.703125, + "learning_rate": 1.1250035560787473e-05, + "loss": 0.9229, + "step": 8104 + }, + { + "epoch": 1.3777353880446663, + "grad_norm": 1.6484375, + "learning_rate": 1.1248243777408172e-05, + "loss": 0.8675, + "step": 8105 + }, + { + "epoch": 1.3779068521336564, + "grad_norm": 1.6328125, + "learning_rate": 1.1246451953318944e-05, + "loss": 0.8326, + "step": 8106 + }, + { + "epoch": 1.3780783162226462, + "grad_norm": 1.6953125, + "learning_rate": 1.1244660088578223e-05, + "loss": 0.9618, + "step": 8107 + }, + { + "epoch": 1.378249780311636, + "grad_norm": 1.7265625, + "learning_rate": 1.124286818324445e-05, + "loss": 0.8605, + "step": 8108 + }, + { + "epoch": 1.3784212444006259, + "grad_norm": 1.6953125, + "learning_rate": 1.1241076237376065e-05, + "loss": 0.8746, + "step": 8109 + }, + { + "epoch": 1.3785927084896157, + "grad_norm": 1.78125, + "learning_rate": 1.1239284251031511e-05, + "loss": 0.904, + "step": 8110 + }, + { + "epoch": 1.3787641725786055, + "grad_norm": 1.6953125, + "learning_rate": 1.1237492224269229e-05, + "loss": 0.892, + "step": 8111 + }, + { + "epoch": 1.3789356366675953, + "grad_norm": 1.71875, + "learning_rate": 1.1235700157147666e-05, + "loss": 0.8849, + "step": 8112 + }, + { + "epoch": 1.3791071007565852, + "grad_norm": 1.78125, + "learning_rate": 1.1233908049725267e-05, + "loss": 0.8826, + "step": 8113 + }, + { + "epoch": 1.3792785648455752, + "grad_norm": 1.6484375, + "learning_rate": 1.1232115902060481e-05, + "loss": 0.835, + "step": 8114 + }, + { + "epoch": 1.379450028934565, + "grad_norm": 1.6484375, + "learning_rate": 1.1230323714211757e-05, + "loss": 0.837, + "step": 8115 + }, + { + "epoch": 1.3796214930235549, + "grad_norm": 1.546875, + "learning_rate": 1.1228531486237545e-05, + "loss": 0.8524, + "step": 8116 + }, + { + 
"epoch": 1.3797929571125447, + "grad_norm": 1.65625, + "learning_rate": 1.1226739218196289e-05, + "loss": 0.7318, + "step": 8117 + }, + { + "epoch": 1.3799644212015347, + "grad_norm": 1.703125, + "learning_rate": 1.1224946910146452e-05, + "loss": 0.8689, + "step": 8118 + }, + { + "epoch": 1.3801358852905246, + "grad_norm": 1.640625, + "learning_rate": 1.1223154562146484e-05, + "loss": 0.841, + "step": 8119 + }, + { + "epoch": 1.3803073493795144, + "grad_norm": 1.7109375, + "learning_rate": 1.1221362174254837e-05, + "loss": 0.8204, + "step": 8120 + }, + { + "epoch": 1.3804788134685042, + "grad_norm": 1.7890625, + "learning_rate": 1.1219569746529973e-05, + "loss": 0.9147, + "step": 8121 + }, + { + "epoch": 1.380650277557494, + "grad_norm": 1.5859375, + "learning_rate": 1.1217777279030346e-05, + "loss": 0.785, + "step": 8122 + }, + { + "epoch": 1.3808217416464839, + "grad_norm": 1.734375, + "learning_rate": 1.1215984771814416e-05, + "loss": 0.8118, + "step": 8123 + }, + { + "epoch": 1.3809932057354737, + "grad_norm": 1.640625, + "learning_rate": 1.1214192224940643e-05, + "loss": 0.8752, + "step": 8124 + }, + { + "epoch": 1.3811646698244635, + "grad_norm": 1.640625, + "learning_rate": 1.1212399638467491e-05, + "loss": 0.7907, + "step": 8125 + }, + { + "epoch": 1.3813361339134536, + "grad_norm": 1.671875, + "learning_rate": 1.121060701245342e-05, + "loss": 0.8421, + "step": 8126 + }, + { + "epoch": 1.3815075980024434, + "grad_norm": 1.7109375, + "learning_rate": 1.1208814346956898e-05, + "loss": 0.849, + "step": 8127 + }, + { + "epoch": 1.3816790620914332, + "grad_norm": 1.6484375, + "learning_rate": 1.1207021642036387e-05, + "loss": 0.8065, + "step": 8128 + }, + { + "epoch": 1.381850526180423, + "grad_norm": 1.671875, + "learning_rate": 1.1205228897750353e-05, + "loss": 0.8164, + "step": 8129 + }, + { + "epoch": 1.382021990269413, + "grad_norm": 1.6640625, + "learning_rate": 1.1203436114157267e-05, + "loss": 0.8265, + "step": 8130 + }, + { + "epoch": 1.382193454358403, + "grad_norm": 1.6875, + "learning_rate": 1.1201643291315599e-05, + "loss": 0.8398, + "step": 8131 + }, + { + "epoch": 1.3823649184473927, + "grad_norm": 1.8359375, + "learning_rate": 1.1199850429283817e-05, + "loss": 0.833, + "step": 8132 + }, + { + "epoch": 1.3825363825363826, + "grad_norm": 1.84375, + "learning_rate": 1.1198057528120394e-05, + "loss": 0.946, + "step": 8133 + }, + { + "epoch": 1.3827078466253724, + "grad_norm": 1.671875, + "learning_rate": 1.1196264587883803e-05, + "loss": 0.8525, + "step": 8134 + }, + { + "epoch": 1.3828793107143622, + "grad_norm": 1.8046875, + "learning_rate": 1.119447160863252e-05, + "loss": 0.9194, + "step": 8135 + }, + { + "epoch": 1.383050774803352, + "grad_norm": 1.640625, + "learning_rate": 1.1192678590425021e-05, + "loss": 0.8202, + "step": 8136 + }, + { + "epoch": 1.3832222388923419, + "grad_norm": 1.71875, + "learning_rate": 1.119088553331978e-05, + "loss": 0.9056, + "step": 8137 + }, + { + "epoch": 1.383393702981332, + "grad_norm": 1.7265625, + "learning_rate": 1.1189092437375282e-05, + "loss": 0.8426, + "step": 8138 + }, + { + "epoch": 1.3835651670703217, + "grad_norm": 1.6875, + "learning_rate": 1.1187299302649996e-05, + "loss": 0.9339, + "step": 8139 + }, + { + "epoch": 1.3837366311593116, + "grad_norm": 1.6953125, + "learning_rate": 1.118550612920241e-05, + "loss": 0.7956, + "step": 8140 + }, + { + "epoch": 1.3839080952483014, + "grad_norm": 1.6953125, + "learning_rate": 1.1183712917091006e-05, + "loss": 0.9118, + "step": 8141 + }, + { + "epoch": 1.3840795593372914, + 
"grad_norm": 1.703125, + "learning_rate": 1.1181919666374266e-05, + "loss": 0.8234, + "step": 8142 + }, + { + "epoch": 1.3842510234262813, + "grad_norm": 1.7109375, + "learning_rate": 1.1180126377110674e-05, + "loss": 0.8845, + "step": 8143 + }, + { + "epoch": 1.384422487515271, + "grad_norm": 1.703125, + "learning_rate": 1.117833304935872e-05, + "loss": 0.8255, + "step": 8144 + }, + { + "epoch": 1.384593951604261, + "grad_norm": 1.625, + "learning_rate": 1.1176539683176887e-05, + "loss": 0.8834, + "step": 8145 + }, + { + "epoch": 1.3847654156932507, + "grad_norm": 1.671875, + "learning_rate": 1.1174746278623662e-05, + "loss": 0.7859, + "step": 8146 + }, + { + "epoch": 1.3849368797822406, + "grad_norm": 1.65625, + "learning_rate": 1.117295283575754e-05, + "loss": 0.9149, + "step": 8147 + }, + { + "epoch": 1.3851083438712304, + "grad_norm": 1.6640625, + "learning_rate": 1.1171159354637008e-05, + "loss": 0.8333, + "step": 8148 + }, + { + "epoch": 1.3852798079602202, + "grad_norm": 1.734375, + "learning_rate": 1.116936583532056e-05, + "loss": 0.8805, + "step": 8149 + }, + { + "epoch": 1.3854512720492103, + "grad_norm": 1.7421875, + "learning_rate": 1.1167572277866688e-05, + "loss": 0.8682, + "step": 8150 + }, + { + "epoch": 1.3856227361382, + "grad_norm": 1.625, + "learning_rate": 1.1165778682333888e-05, + "loss": 0.8281, + "step": 8151 + }, + { + "epoch": 1.38579420022719, + "grad_norm": 1.59375, + "learning_rate": 1.1163985048780652e-05, + "loss": 0.8823, + "step": 8152 + }, + { + "epoch": 1.3859656643161797, + "grad_norm": 1.7421875, + "learning_rate": 1.1162191377265483e-05, + "loss": 0.8931, + "step": 8153 + }, + { + "epoch": 1.3861371284051696, + "grad_norm": 1.671875, + "learning_rate": 1.1160397667846875e-05, + "loss": 0.8454, + "step": 8154 + }, + { + "epoch": 1.3863085924941596, + "grad_norm": 1.6484375, + "learning_rate": 1.1158603920583336e-05, + "loss": 0.8052, + "step": 8155 + }, + { + "epoch": 1.3864800565831494, + "grad_norm": 1.7265625, + "learning_rate": 1.1156810135533354e-05, + "loss": 0.8256, + "step": 8156 + }, + { + "epoch": 1.3866515206721393, + "grad_norm": 1.7265625, + "learning_rate": 1.115501631275544e-05, + "loss": 0.8202, + "step": 8157 + }, + { + "epoch": 1.386822984761129, + "grad_norm": 1.5390625, + "learning_rate": 1.1153222452308093e-05, + "loss": 0.7897, + "step": 8158 + }, + { + "epoch": 1.386994448850119, + "grad_norm": 1.7890625, + "learning_rate": 1.1151428554249818e-05, + "loss": 0.931, + "step": 8159 + }, + { + "epoch": 1.3871659129391087, + "grad_norm": 1.609375, + "learning_rate": 1.1149634618639123e-05, + "loss": 0.7929, + "step": 8160 + }, + { + "epoch": 1.3873373770280986, + "grad_norm": 1.8125, + "learning_rate": 1.1147840645534515e-05, + "loss": 0.9386, + "step": 8161 + }, + { + "epoch": 1.3875088411170886, + "grad_norm": 1.6484375, + "learning_rate": 1.11460466349945e-05, + "loss": 0.8324, + "step": 8162 + }, + { + "epoch": 1.3876803052060784, + "grad_norm": 1.640625, + "learning_rate": 1.114425258707759e-05, + "loss": 0.8598, + "step": 8163 + }, + { + "epoch": 1.3878517692950683, + "grad_norm": 2.015625, + "learning_rate": 1.1142458501842292e-05, + "loss": 0.901, + "step": 8164 + }, + { + "epoch": 1.388023233384058, + "grad_norm": 1.84375, + "learning_rate": 1.1140664379347124e-05, + "loss": 0.8722, + "step": 8165 + }, + { + "epoch": 1.388194697473048, + "grad_norm": 1.6953125, + "learning_rate": 1.1138870219650592e-05, + "loss": 0.845, + "step": 8166 + }, + { + "epoch": 1.388366161562038, + "grad_norm": 2.015625, + "learning_rate": 
1.1137076022811217e-05, + "loss": 0.8876, + "step": 8167 + }, + { + "epoch": 1.3885376256510278, + "grad_norm": 1.7109375, + "learning_rate": 1.1135281788887506e-05, + "loss": 0.8479, + "step": 8168 + }, + { + "epoch": 1.3887090897400176, + "grad_norm": 1.6171875, + "learning_rate": 1.1133487517937985e-05, + "loss": 0.8327, + "step": 8169 + }, + { + "epoch": 1.3888805538290074, + "grad_norm": 1.765625, + "learning_rate": 1.1131693210021163e-05, + "loss": 0.8874, + "step": 8170 + }, + { + "epoch": 1.3890520179179973, + "grad_norm": 1.703125, + "learning_rate": 1.1129898865195568e-05, + "loss": 0.8704, + "step": 8171 + }, + { + "epoch": 1.389223482006987, + "grad_norm": 1.59375, + "learning_rate": 1.1128104483519715e-05, + "loss": 0.8239, + "step": 8172 + }, + { + "epoch": 1.389394946095977, + "grad_norm": 1.6953125, + "learning_rate": 1.1126310065052128e-05, + "loss": 0.8521, + "step": 8173 + }, + { + "epoch": 1.389566410184967, + "grad_norm": 1.6484375, + "learning_rate": 1.1124515609851328e-05, + "loss": 0.8451, + "step": 8174 + }, + { + "epoch": 1.3897378742739568, + "grad_norm": 1.75, + "learning_rate": 1.1122721117975838e-05, + "loss": 0.8913, + "step": 8175 + }, + { + "epoch": 1.3899093383629466, + "grad_norm": 1.640625, + "learning_rate": 1.1120926589484187e-05, + "loss": 0.8485, + "step": 8176 + }, + { + "epoch": 1.3900808024519364, + "grad_norm": 1.6484375, + "learning_rate": 1.1119132024434896e-05, + "loss": 0.8612, + "step": 8177 + }, + { + "epoch": 1.3902522665409263, + "grad_norm": 1.609375, + "learning_rate": 1.1117337422886497e-05, + "loss": 0.8926, + "step": 8178 + }, + { + "epoch": 1.3904237306299163, + "grad_norm": 1.65625, + "learning_rate": 1.1115542784897518e-05, + "loss": 0.8277, + "step": 8179 + }, + { + "epoch": 1.3905951947189061, + "grad_norm": 1.7734375, + "learning_rate": 1.1113748110526486e-05, + "loss": 0.9561, + "step": 8180 + }, + { + "epoch": 1.390766658807896, + "grad_norm": 1.6484375, + "learning_rate": 1.1111953399831934e-05, + "loss": 0.7891, + "step": 8181 + }, + { + "epoch": 1.3909381228968858, + "grad_norm": 1.65625, + "learning_rate": 1.1110158652872395e-05, + "loss": 0.849, + "step": 8182 + }, + { + "epoch": 1.3911095869858756, + "grad_norm": 1.640625, + "learning_rate": 1.1108363869706402e-05, + "loss": 0.7991, + "step": 8183 + }, + { + "epoch": 1.3912810510748654, + "grad_norm": 1.671875, + "learning_rate": 1.1106569050392492e-05, + "loss": 0.8487, + "step": 8184 + }, + { + "epoch": 1.3914525151638553, + "grad_norm": 1.640625, + "learning_rate": 1.1104774194989197e-05, + "loss": 0.8098, + "step": 8185 + }, + { + "epoch": 1.3916239792528453, + "grad_norm": 1.75, + "learning_rate": 1.1102979303555052e-05, + "loss": 0.8735, + "step": 8186 + }, + { + "epoch": 1.3917954433418351, + "grad_norm": 1.6328125, + "learning_rate": 1.11011843761486e-05, + "loss": 0.7936, + "step": 8187 + }, + { + "epoch": 1.391966907430825, + "grad_norm": 1.6796875, + "learning_rate": 1.1099389412828379e-05, + "loss": 0.9163, + "step": 8188 + }, + { + "epoch": 1.3921383715198148, + "grad_norm": 1.7578125, + "learning_rate": 1.109759441365293e-05, + "loss": 0.9175, + "step": 8189 + }, + { + "epoch": 1.3923098356088046, + "grad_norm": 1.640625, + "learning_rate": 1.1095799378680796e-05, + "loss": 0.8568, + "step": 8190 + }, + { + "epoch": 1.3924812996977947, + "grad_norm": 1.703125, + "learning_rate": 1.1094004307970516e-05, + "loss": 0.8124, + "step": 8191 + }, + { + "epoch": 1.3926527637867845, + "grad_norm": 1.703125, + "learning_rate": 1.1092209201580634e-05, + "loss": 
0.9517, + "step": 8192 + }, + { + "epoch": 1.3928242278757743, + "grad_norm": 1.7421875, + "learning_rate": 1.10904140595697e-05, + "loss": 0.83, + "step": 8193 + }, + { + "epoch": 1.3929956919647641, + "grad_norm": 1.6640625, + "learning_rate": 1.1088618881996257e-05, + "loss": 0.8657, + "step": 8194 + }, + { + "epoch": 1.393167156053754, + "grad_norm": 1.6484375, + "learning_rate": 1.108682366891885e-05, + "loss": 0.7925, + "step": 8195 + }, + { + "epoch": 1.3933386201427438, + "grad_norm": 1.578125, + "learning_rate": 1.1085028420396033e-05, + "loss": 0.6944, + "step": 8196 + }, + { + "epoch": 1.3935100842317336, + "grad_norm": 1.578125, + "learning_rate": 1.1083233136486354e-05, + "loss": 0.8304, + "step": 8197 + }, + { + "epoch": 1.3936815483207234, + "grad_norm": 1.7421875, + "learning_rate": 1.1081437817248362e-05, + "loss": 0.8687, + "step": 8198 + }, + { + "epoch": 1.3938530124097135, + "grad_norm": 1.5859375, + "learning_rate": 1.1079642462740612e-05, + "loss": 0.7692, + "step": 8199 + }, + { + "epoch": 1.3940244764987033, + "grad_norm": 1.671875, + "learning_rate": 1.1077847073021653e-05, + "loss": 0.8898, + "step": 8200 + }, + { + "epoch": 1.3941959405876931, + "grad_norm": 1.703125, + "learning_rate": 1.1076051648150043e-05, + "loss": 0.8604, + "step": 8201 + }, + { + "epoch": 1.394367404676683, + "grad_norm": 1.6328125, + "learning_rate": 1.107425618818434e-05, + "loss": 0.793, + "step": 8202 + }, + { + "epoch": 1.394538868765673, + "grad_norm": 1.7578125, + "learning_rate": 1.1072460693183095e-05, + "loss": 0.9108, + "step": 8203 + }, + { + "epoch": 1.3947103328546628, + "grad_norm": 1.7890625, + "learning_rate": 1.1070665163204868e-05, + "loss": 0.8878, + "step": 8204 + }, + { + "epoch": 1.3948817969436527, + "grad_norm": 1.6484375, + "learning_rate": 1.1068869598308218e-05, + "loss": 0.8829, + "step": 8205 + }, + { + "epoch": 1.3950532610326425, + "grad_norm": 1.7421875, + "learning_rate": 1.1067073998551706e-05, + "loss": 0.9368, + "step": 8206 + }, + { + "epoch": 1.3952247251216323, + "grad_norm": 1.625, + "learning_rate": 1.1065278363993892e-05, + "loss": 0.8212, + "step": 8207 + }, + { + "epoch": 1.3953961892106221, + "grad_norm": 1.734375, + "learning_rate": 1.1063482694693339e-05, + "loss": 0.892, + "step": 8208 + }, + { + "epoch": 1.395567653299612, + "grad_norm": 1.7265625, + "learning_rate": 1.1061686990708612e-05, + "loss": 0.8342, + "step": 8209 + }, + { + "epoch": 1.3957391173886018, + "grad_norm": 1.6875, + "learning_rate": 1.1059891252098273e-05, + "loss": 0.8443, + "step": 8210 + }, + { + "epoch": 1.3959105814775918, + "grad_norm": 1.6796875, + "learning_rate": 1.105809547892089e-05, + "loss": 0.8837, + "step": 8211 + }, + { + "epoch": 1.3960820455665817, + "grad_norm": 1.5859375, + "learning_rate": 1.1056299671235028e-05, + "loss": 0.8203, + "step": 8212 + }, + { + "epoch": 1.3962535096555715, + "grad_norm": 1.765625, + "learning_rate": 1.105450382909926e-05, + "loss": 0.8863, + "step": 8213 + }, + { + "epoch": 1.3964249737445613, + "grad_norm": 1.7421875, + "learning_rate": 1.1052707952572149e-05, + "loss": 0.8789, + "step": 8214 + }, + { + "epoch": 1.3965964378335514, + "grad_norm": 1.6875, + "learning_rate": 1.1050912041712264e-05, + "loss": 0.8143, + "step": 8215 + }, + { + "epoch": 1.3967679019225412, + "grad_norm": 1.734375, + "learning_rate": 1.1049116096578181e-05, + "loss": 0.8136, + "step": 8216 + }, + { + "epoch": 1.396939366011531, + "grad_norm": 1.8203125, + "learning_rate": 1.1047320117228472e-05, + "loss": 0.9049, + "step": 8217 + }, + { + 
"epoch": 1.3971108301005208, + "grad_norm": 1.6171875, + "learning_rate": 1.1045524103721712e-05, + "loss": 0.806, + "step": 8218 + }, + { + "epoch": 1.3972822941895107, + "grad_norm": 1.734375, + "learning_rate": 1.1043728056116474e-05, + "loss": 0.8712, + "step": 8219 + }, + { + "epoch": 1.3974537582785005, + "grad_norm": 1.5625, + "learning_rate": 1.1041931974471331e-05, + "loss": 0.8647, + "step": 8220 + }, + { + "epoch": 1.3976252223674903, + "grad_norm": 1.765625, + "learning_rate": 1.1040135858844864e-05, + "loss": 0.9376, + "step": 8221 + }, + { + "epoch": 1.3977966864564801, + "grad_norm": 1.6484375, + "learning_rate": 1.1038339709295652e-05, + "loss": 0.8598, + "step": 8222 + }, + { + "epoch": 1.3979681505454702, + "grad_norm": 1.71875, + "learning_rate": 1.103654352588227e-05, + "loss": 0.8858, + "step": 8223 + }, + { + "epoch": 1.39813961463446, + "grad_norm": 1.75, + "learning_rate": 1.10347473086633e-05, + "loss": 0.9499, + "step": 8224 + }, + { + "epoch": 1.3983110787234498, + "grad_norm": 1.765625, + "learning_rate": 1.1032951057697325e-05, + "loss": 0.8693, + "step": 8225 + }, + { + "epoch": 1.3984825428124397, + "grad_norm": 1.703125, + "learning_rate": 1.1031154773042925e-05, + "loss": 0.8588, + "step": 8226 + }, + { + "epoch": 1.3986540069014297, + "grad_norm": 1.65625, + "learning_rate": 1.1029358454758687e-05, + "loss": 0.8084, + "step": 8227 + }, + { + "epoch": 1.3988254709904195, + "grad_norm": 1.6640625, + "learning_rate": 1.102756210290319e-05, + "loss": 0.819, + "step": 8228 + }, + { + "epoch": 1.3989969350794094, + "grad_norm": 1.671875, + "learning_rate": 1.1025765717535026e-05, + "loss": 0.8563, + "step": 8229 + }, + { + "epoch": 1.3991683991683992, + "grad_norm": 1.7109375, + "learning_rate": 1.1023969298712783e-05, + "loss": 0.8326, + "step": 8230 + }, + { + "epoch": 1.399339863257389, + "grad_norm": 1.7890625, + "learning_rate": 1.102217284649504e-05, + "loss": 0.9333, + "step": 8231 + }, + { + "epoch": 1.3995113273463788, + "grad_norm": 1.734375, + "learning_rate": 1.1020376360940393e-05, + "loss": 0.8824, + "step": 8232 + }, + { + "epoch": 1.3996827914353687, + "grad_norm": 1.6953125, + "learning_rate": 1.101857984210743e-05, + "loss": 0.8249, + "step": 8233 + }, + { + "epoch": 1.3998542555243585, + "grad_norm": 1.8046875, + "learning_rate": 1.101678329005474e-05, + "loss": 0.8813, + "step": 8234 + }, + { + "epoch": 1.4000257196133485, + "grad_norm": 1.7734375, + "learning_rate": 1.101498670484092e-05, + "loss": 0.8005, + "step": 8235 + }, + { + "epoch": 1.4001971837023384, + "grad_norm": 1.7109375, + "learning_rate": 1.1013190086524564e-05, + "loss": 0.8701, + "step": 8236 + }, + { + "epoch": 1.4003686477913282, + "grad_norm": 1.6953125, + "learning_rate": 1.1011393435164261e-05, + "loss": 0.894, + "step": 8237 + }, + { + "epoch": 1.400540111880318, + "grad_norm": 1.6875, + "learning_rate": 1.1009596750818611e-05, + "loss": 0.843, + "step": 8238 + }, + { + "epoch": 1.400711575969308, + "grad_norm": 1.59375, + "learning_rate": 1.100780003354621e-05, + "loss": 0.844, + "step": 8239 + }, + { + "epoch": 1.400883040058298, + "grad_norm": 1.78125, + "learning_rate": 1.1006003283405653e-05, + "loss": 0.9017, + "step": 8240 + }, + { + "epoch": 1.4010545041472877, + "grad_norm": 1.703125, + "learning_rate": 1.1004206500455541e-05, + "loss": 0.8942, + "step": 8241 + }, + { + "epoch": 1.4012259682362775, + "grad_norm": 1.59375, + "learning_rate": 1.1002409684754473e-05, + "loss": 0.886, + "step": 8242 + }, + { + "epoch": 1.4013974323252674, + "grad_norm": 
1.7578125, + "learning_rate": 1.1000612836361053e-05, + "loss": 0.9032, + "step": 8243 + }, + { + "epoch": 1.4015688964142572, + "grad_norm": 1.875, + "learning_rate": 1.0998815955333878e-05, + "loss": 0.8702, + "step": 8244 + }, + { + "epoch": 1.401740360503247, + "grad_norm": 1.6796875, + "learning_rate": 1.0997019041731553e-05, + "loss": 0.9045, + "step": 8245 + }, + { + "epoch": 1.4019118245922368, + "grad_norm": 1.6875, + "learning_rate": 1.0995222095612686e-05, + "loss": 0.942, + "step": 8246 + }, + { + "epoch": 1.402083288681227, + "grad_norm": 1.734375, + "learning_rate": 1.0993425117035876e-05, + "loss": 0.8191, + "step": 8247 + }, + { + "epoch": 1.4022547527702167, + "grad_norm": 1.7421875, + "learning_rate": 1.0991628106059734e-05, + "loss": 0.8259, + "step": 8248 + }, + { + "epoch": 1.4024262168592065, + "grad_norm": 1.6640625, + "learning_rate": 1.0989831062742867e-05, + "loss": 0.7985, + "step": 8249 + }, + { + "epoch": 1.4025976809481964, + "grad_norm": 1.6484375, + "learning_rate": 1.0988033987143876e-05, + "loss": 0.8193, + "step": 8250 + }, + { + "epoch": 1.4027691450371862, + "grad_norm": 1.7265625, + "learning_rate": 1.0986236879321379e-05, + "loss": 0.9145, + "step": 8251 + }, + { + "epoch": 1.4029406091261762, + "grad_norm": 1.6796875, + "learning_rate": 1.0984439739333983e-05, + "loss": 0.8936, + "step": 8252 + }, + { + "epoch": 1.403112073215166, + "grad_norm": 1.6328125, + "learning_rate": 1.09826425672403e-05, + "loss": 0.8913, + "step": 8253 + }, + { + "epoch": 1.403283537304156, + "grad_norm": 1.6875, + "learning_rate": 1.0980845363098945e-05, + "loss": 0.8355, + "step": 8254 + }, + { + "epoch": 1.4034550013931457, + "grad_norm": 1.671875, + "learning_rate": 1.0979048126968525e-05, + "loss": 0.8765, + "step": 8255 + }, + { + "epoch": 1.4036264654821355, + "grad_norm": 1.703125, + "learning_rate": 1.0977250858907662e-05, + "loss": 0.8471, + "step": 8256 + }, + { + "epoch": 1.4037979295711254, + "grad_norm": 1.6953125, + "learning_rate": 1.0975453558974968e-05, + "loss": 0.8599, + "step": 8257 + }, + { + "epoch": 1.4039693936601152, + "grad_norm": 1.6875, + "learning_rate": 1.0973656227229059e-05, + "loss": 0.8168, + "step": 8258 + }, + { + "epoch": 1.4041408577491052, + "grad_norm": 1.71875, + "learning_rate": 1.0971858863728557e-05, + "loss": 0.8604, + "step": 8259 + }, + { + "epoch": 1.404312321838095, + "grad_norm": 1.6484375, + "learning_rate": 1.0970061468532075e-05, + "loss": 0.836, + "step": 8260 + }, + { + "epoch": 1.404483785927085, + "grad_norm": 1.6875, + "learning_rate": 1.0968264041698237e-05, + "loss": 0.8421, + "step": 8261 + }, + { + "epoch": 1.4046552500160747, + "grad_norm": 1.703125, + "learning_rate": 1.096646658328566e-05, + "loss": 0.8652, + "step": 8262 + }, + { + "epoch": 1.4048267141050645, + "grad_norm": 1.7734375, + "learning_rate": 1.0964669093352969e-05, + "loss": 0.9318, + "step": 8263 + }, + { + "epoch": 1.4049981781940546, + "grad_norm": 1.6484375, + "learning_rate": 1.0962871571958788e-05, + "loss": 0.8129, + "step": 8264 + }, + { + "epoch": 1.4051696422830444, + "grad_norm": 1.671875, + "learning_rate": 1.096107401916174e-05, + "loss": 0.9011, + "step": 8265 + }, + { + "epoch": 1.4053411063720342, + "grad_norm": 1.78125, + "learning_rate": 1.0959276435020448e-05, + "loss": 0.9159, + "step": 8266 + }, + { + "epoch": 1.405512570461024, + "grad_norm": 1.6640625, + "learning_rate": 1.0957478819593538e-05, + "loss": 0.8199, + "step": 8267 + }, + { + "epoch": 1.405684034550014, + "grad_norm": 1.640625, + "learning_rate": 
1.0955681172939639e-05, + "loss": 0.8743, + "step": 8268 + }, + { + "epoch": 1.4058554986390037, + "grad_norm": 1.6796875, + "learning_rate": 1.0953883495117379e-05, + "loss": 0.7695, + "step": 8269 + }, + { + "epoch": 1.4060269627279935, + "grad_norm": 1.640625, + "learning_rate": 1.0952085786185385e-05, + "loss": 0.8522, + "step": 8270 + }, + { + "epoch": 1.4061984268169836, + "grad_norm": 1.7421875, + "learning_rate": 1.095028804620229e-05, + "loss": 0.8782, + "step": 8271 + }, + { + "epoch": 1.4063698909059734, + "grad_norm": 1.75, + "learning_rate": 1.094849027522672e-05, + "loss": 1.0023, + "step": 8272 + }, + { + "epoch": 1.4065413549949632, + "grad_norm": 1.703125, + "learning_rate": 1.0946692473317313e-05, + "loss": 0.8444, + "step": 8273 + }, + { + "epoch": 1.406712819083953, + "grad_norm": 1.7578125, + "learning_rate": 1.0944894640532697e-05, + "loss": 0.825, + "step": 8274 + }, + { + "epoch": 1.406884283172943, + "grad_norm": 1.8125, + "learning_rate": 1.0943096776931512e-05, + "loss": 0.931, + "step": 8275 + }, + { + "epoch": 1.407055747261933, + "grad_norm": 1.7421875, + "learning_rate": 1.0941298882572387e-05, + "loss": 0.8311, + "step": 8276 + }, + { + "epoch": 1.4072272113509228, + "grad_norm": 1.7578125, + "learning_rate": 1.0939500957513964e-05, + "loss": 0.9291, + "step": 8277 + }, + { + "epoch": 1.4073986754399126, + "grad_norm": 1.6796875, + "learning_rate": 1.0937703001814877e-05, + "loss": 0.8732, + "step": 8278 + }, + { + "epoch": 1.4075701395289024, + "grad_norm": 1.6796875, + "learning_rate": 1.0935905015533761e-05, + "loss": 0.7694, + "step": 8279 + }, + { + "epoch": 1.4077416036178922, + "grad_norm": 1.6875, + "learning_rate": 1.093410699872926e-05, + "loss": 0.8506, + "step": 8280 + }, + { + "epoch": 1.407913067706882, + "grad_norm": 1.6171875, + "learning_rate": 1.093230895146001e-05, + "loss": 0.839, + "step": 8281 + }, + { + "epoch": 1.408084531795872, + "grad_norm": 1.6953125, + "learning_rate": 1.0930510873784657e-05, + "loss": 0.8417, + "step": 8282 + }, + { + "epoch": 1.408255995884862, + "grad_norm": 1.7734375, + "learning_rate": 1.0928712765761837e-05, + "loss": 0.8868, + "step": 8283 + }, + { + "epoch": 1.4084274599738518, + "grad_norm": 1.734375, + "learning_rate": 1.09269146274502e-05, + "loss": 0.8386, + "step": 8284 + }, + { + "epoch": 1.4085989240628416, + "grad_norm": 1.6171875, + "learning_rate": 1.0925116458908385e-05, + "loss": 0.7894, + "step": 8285 + }, + { + "epoch": 1.4087703881518314, + "grad_norm": 1.75, + "learning_rate": 1.092331826019504e-05, + "loss": 0.8861, + "step": 8286 + }, + { + "epoch": 1.4089418522408212, + "grad_norm": 1.765625, + "learning_rate": 1.0921520031368808e-05, + "loss": 0.8584, + "step": 8287 + }, + { + "epoch": 1.4091133163298113, + "grad_norm": 1.7578125, + "learning_rate": 1.0919721772488341e-05, + "loss": 0.869, + "step": 8288 + }, + { + "epoch": 1.4092847804188011, + "grad_norm": 1.5859375, + "learning_rate": 1.0917923483612278e-05, + "loss": 0.8477, + "step": 8289 + }, + { + "epoch": 1.409456244507791, + "grad_norm": 1.6484375, + "learning_rate": 1.0916125164799276e-05, + "loss": 0.8251, + "step": 8290 + }, + { + "epoch": 1.4096277085967808, + "grad_norm": 1.8046875, + "learning_rate": 1.0914326816107984e-05, + "loss": 0.8932, + "step": 8291 + }, + { + "epoch": 1.4097991726857706, + "grad_norm": 1.6796875, + "learning_rate": 1.091252843759705e-05, + "loss": 0.8542, + "step": 8292 + }, + { + "epoch": 1.4099706367747604, + "grad_norm": 1.71875, + "learning_rate": 1.0910730029325127e-05, + "loss": 0.9365, + 
"step": 8293 + }, + { + "epoch": 1.4101421008637502, + "grad_norm": 1.5546875, + "learning_rate": 1.0908931591350871e-05, + "loss": 0.873, + "step": 8294 + }, + { + "epoch": 1.41031356495274, + "grad_norm": 1.6953125, + "learning_rate": 1.0907133123732931e-05, + "loss": 0.849, + "step": 8295 + }, + { + "epoch": 1.4104850290417301, + "grad_norm": 1.6484375, + "learning_rate": 1.0905334626529964e-05, + "loss": 0.8907, + "step": 8296 + }, + { + "epoch": 1.41065649313072, + "grad_norm": 1.7109375, + "learning_rate": 1.0903536099800627e-05, + "loss": 0.8811, + "step": 8297 + }, + { + "epoch": 1.4108279572197098, + "grad_norm": 1.6953125, + "learning_rate": 1.0901737543603574e-05, + "loss": 0.8517, + "step": 8298 + }, + { + "epoch": 1.4109994213086996, + "grad_norm": 1.7265625, + "learning_rate": 1.0899938957997465e-05, + "loss": 0.8915, + "step": 8299 + }, + { + "epoch": 1.4111708853976896, + "grad_norm": 1.7421875, + "learning_rate": 1.0898140343040957e-05, + "loss": 0.8506, + "step": 8300 + }, + { + "epoch": 1.4113423494866795, + "grad_norm": 1.6875, + "learning_rate": 1.089634169879271e-05, + "loss": 0.8708, + "step": 8301 + }, + { + "epoch": 1.4115138135756693, + "grad_norm": 1.7734375, + "learning_rate": 1.0894543025311384e-05, + "loss": 0.9046, + "step": 8302 + }, + { + "epoch": 1.4116852776646591, + "grad_norm": 1.8046875, + "learning_rate": 1.0892744322655645e-05, + "loss": 0.8828, + "step": 8303 + }, + { + "epoch": 1.411856741753649, + "grad_norm": 1.625, + "learning_rate": 1.0890945590884147e-05, + "loss": 0.824, + "step": 8304 + }, + { + "epoch": 1.4120282058426388, + "grad_norm": 1.609375, + "learning_rate": 1.0889146830055564e-05, + "loss": 0.8713, + "step": 8305 + }, + { + "epoch": 1.4121996699316286, + "grad_norm": 1.703125, + "learning_rate": 1.0887348040228552e-05, + "loss": 0.8932, + "step": 8306 + }, + { + "epoch": 1.4123711340206184, + "grad_norm": 1.71875, + "learning_rate": 1.0885549221461777e-05, + "loss": 0.8144, + "step": 8307 + }, + { + "epoch": 1.4125425981096085, + "grad_norm": 1.6953125, + "learning_rate": 1.0883750373813909e-05, + "loss": 0.8233, + "step": 8308 + }, + { + "epoch": 1.4127140621985983, + "grad_norm": 1.703125, + "learning_rate": 1.088195149734361e-05, + "loss": 0.945, + "step": 8309 + }, + { + "epoch": 1.4128855262875881, + "grad_norm": 1.6640625, + "learning_rate": 1.0880152592109554e-05, + "loss": 0.8226, + "step": 8310 + }, + { + "epoch": 1.413056990376578, + "grad_norm": 1.6953125, + "learning_rate": 1.0878353658170411e-05, + "loss": 1.0318, + "step": 8311 + }, + { + "epoch": 1.413228454465568, + "grad_norm": 1.640625, + "learning_rate": 1.0876554695584845e-05, + "loss": 0.8666, + "step": 8312 + }, + { + "epoch": 1.4133999185545578, + "grad_norm": 1.6171875, + "learning_rate": 1.087475570441153e-05, + "loss": 0.8516, + "step": 8313 + }, + { + "epoch": 1.4135713826435476, + "grad_norm": 1.671875, + "learning_rate": 1.0872956684709138e-05, + "loss": 0.8655, + "step": 8314 + }, + { + "epoch": 1.4137428467325375, + "grad_norm": 1.7421875, + "learning_rate": 1.0871157636536338e-05, + "loss": 0.8286, + "step": 8315 + }, + { + "epoch": 1.4139143108215273, + "grad_norm": 1.6875, + "learning_rate": 1.086935855995181e-05, + "loss": 0.904, + "step": 8316 + }, + { + "epoch": 1.4140857749105171, + "grad_norm": 1.75, + "learning_rate": 1.0867559455014224e-05, + "loss": 0.9264, + "step": 8317 + }, + { + "epoch": 1.414257238999507, + "grad_norm": 1.671875, + "learning_rate": 1.086576032178226e-05, + "loss": 0.8874, + "step": 8318 + }, + { + "epoch": 
1.4144287030884968, + "grad_norm": 1.6640625, + "learning_rate": 1.0863961160314588e-05, + "loss": 0.8748, + "step": 8319 + }, + { + "epoch": 1.4146001671774868, + "grad_norm": 1.7109375, + "learning_rate": 1.086216197066989e-05, + "loss": 0.8369, + "step": 8320 + }, + { + "epoch": 1.4147716312664766, + "grad_norm": 1.6875, + "learning_rate": 1.0860362752906846e-05, + "loss": 0.8304, + "step": 8321 + }, + { + "epoch": 1.4149430953554665, + "grad_norm": 1.7578125, + "learning_rate": 1.085856350708413e-05, + "loss": 0.8489, + "step": 8322 + }, + { + "epoch": 1.4151145594444563, + "grad_norm": 1.625, + "learning_rate": 1.0856764233260428e-05, + "loss": 0.7751, + "step": 8323 + }, + { + "epoch": 1.4152860235334463, + "grad_norm": 1.6640625, + "learning_rate": 1.0854964931494418e-05, + "loss": 0.8466, + "step": 8324 + }, + { + "epoch": 1.4154574876224362, + "grad_norm": 1.6484375, + "learning_rate": 1.0853165601844779e-05, + "loss": 0.8576, + "step": 8325 + }, + { + "epoch": 1.415628951711426, + "grad_norm": 1.7734375, + "learning_rate": 1.0851366244370199e-05, + "loss": 0.8965, + "step": 8326 + }, + { + "epoch": 1.4158004158004158, + "grad_norm": 1.765625, + "learning_rate": 1.084956685912936e-05, + "loss": 0.9559, + "step": 8327 + }, + { + "epoch": 1.4159718798894056, + "grad_norm": 1.59375, + "learning_rate": 1.0847767446180945e-05, + "loss": 0.8144, + "step": 8328 + }, + { + "epoch": 1.4161433439783955, + "grad_norm": 1.765625, + "learning_rate": 1.0845968005583643e-05, + "loss": 0.8764, + "step": 8329 + }, + { + "epoch": 1.4163148080673853, + "grad_norm": 1.6484375, + "learning_rate": 1.0844168537396138e-05, + "loss": 0.8322, + "step": 8330 + }, + { + "epoch": 1.4164862721563751, + "grad_norm": 1.6796875, + "learning_rate": 1.0842369041677116e-05, + "loss": 0.8158, + "step": 8331 + }, + { + "epoch": 1.4166577362453652, + "grad_norm": 1.796875, + "learning_rate": 1.0840569518485272e-05, + "loss": 0.9187, + "step": 8332 + }, + { + "epoch": 1.416829200334355, + "grad_norm": 1.65625, + "learning_rate": 1.0838769967879286e-05, + "loss": 0.7694, + "step": 8333 + }, + { + "epoch": 1.4170006644233448, + "grad_norm": 1.734375, + "learning_rate": 1.0836970389917857e-05, + "loss": 0.8279, + "step": 8334 + }, + { + "epoch": 1.4171721285123347, + "grad_norm": 1.6171875, + "learning_rate": 1.083517078465967e-05, + "loss": 0.814, + "step": 8335 + }, + { + "epoch": 1.4173435926013247, + "grad_norm": 1.65625, + "learning_rate": 1.0833371152163417e-05, + "loss": 0.7892, + "step": 8336 + }, + { + "epoch": 1.4175150566903145, + "grad_norm": 1.75, + "learning_rate": 1.0831571492487794e-05, + "loss": 0.8923, + "step": 8337 + }, + { + "epoch": 1.4176865207793043, + "grad_norm": 1.84375, + "learning_rate": 1.0829771805691493e-05, + "loss": 0.9074, + "step": 8338 + }, + { + "epoch": 1.4178579848682942, + "grad_norm": 1.6796875, + "learning_rate": 1.0827972091833207e-05, + "loss": 0.8024, + "step": 8339 + }, + { + "epoch": 1.418029448957284, + "grad_norm": 1.6953125, + "learning_rate": 1.082617235097164e-05, + "loss": 0.8634, + "step": 8340 + }, + { + "epoch": 1.4182009130462738, + "grad_norm": 1.6953125, + "learning_rate": 1.0824372583165477e-05, + "loss": 0.8486, + "step": 8341 + }, + { + "epoch": 1.4183723771352637, + "grad_norm": 1.6015625, + "learning_rate": 1.0822572788473418e-05, + "loss": 0.8686, + "step": 8342 + }, + { + "epoch": 1.4185438412242535, + "grad_norm": 1.6015625, + "learning_rate": 1.0820772966954164e-05, + "loss": 0.8334, + "step": 8343 + }, + { + "epoch": 1.4187153053132435, + "grad_norm": 
1.7265625, + "learning_rate": 1.0818973118666414e-05, + "loss": 0.9122, + "step": 8344 + }, + { + "epoch": 1.4188867694022333, + "grad_norm": 1.796875, + "learning_rate": 1.0817173243668866e-05, + "loss": 0.888, + "step": 8345 + }, + { + "epoch": 1.4190582334912232, + "grad_norm": 1.6640625, + "learning_rate": 1.0815373342020221e-05, + "loss": 0.8482, + "step": 8346 + }, + { + "epoch": 1.419229697580213, + "grad_norm": 1.640625, + "learning_rate": 1.0813573413779181e-05, + "loss": 0.8716, + "step": 8347 + }, + { + "epoch": 1.4194011616692028, + "grad_norm": 1.625, + "learning_rate": 1.0811773459004449e-05, + "loss": 0.8647, + "step": 8348 + }, + { + "epoch": 1.4195726257581929, + "grad_norm": 1.5625, + "learning_rate": 1.0809973477754727e-05, + "loss": 0.7912, + "step": 8349 + }, + { + "epoch": 1.4197440898471827, + "grad_norm": 1.8203125, + "learning_rate": 1.0808173470088719e-05, + "loss": 0.9811, + "step": 8350 + }, + { + "epoch": 1.4199155539361725, + "grad_norm": 1.6640625, + "learning_rate": 1.0806373436065134e-05, + "loss": 0.8185, + "step": 8351 + }, + { + "epoch": 1.4200870180251624, + "grad_norm": 1.6953125, + "learning_rate": 1.0804573375742673e-05, + "loss": 0.9047, + "step": 8352 + }, + { + "epoch": 1.4202584821141522, + "grad_norm": 1.65625, + "learning_rate": 1.0802773289180044e-05, + "loss": 0.7983, + "step": 8353 + }, + { + "epoch": 1.420429946203142, + "grad_norm": 1.703125, + "learning_rate": 1.0800973176435953e-05, + "loss": 0.8938, + "step": 8354 + }, + { + "epoch": 1.4206014102921318, + "grad_norm": 1.75, + "learning_rate": 1.0799173037569116e-05, + "loss": 0.8996, + "step": 8355 + }, + { + "epoch": 1.4207728743811219, + "grad_norm": 1.7109375, + "learning_rate": 1.079737287263823e-05, + "loss": 0.86, + "step": 8356 + }, + { + "epoch": 1.4209443384701117, + "grad_norm": 1.78125, + "learning_rate": 1.0795572681702018e-05, + "loss": 0.8385, + "step": 8357 + }, + { + "epoch": 1.4211158025591015, + "grad_norm": 1.7421875, + "learning_rate": 1.079377246481918e-05, + "loss": 0.8656, + "step": 8358 + }, + { + "epoch": 1.4212872666480914, + "grad_norm": 1.7734375, + "learning_rate": 1.0791972222048436e-05, + "loss": 0.8572, + "step": 8359 + }, + { + "epoch": 1.4214587307370812, + "grad_norm": 1.7734375, + "learning_rate": 1.0790171953448496e-05, + "loss": 0.8884, + "step": 8360 + }, + { + "epoch": 1.4216301948260712, + "grad_norm": 1.625, + "learning_rate": 1.0788371659078072e-05, + "loss": 0.8412, + "step": 8361 + }, + { + "epoch": 1.421801658915061, + "grad_norm": 1.703125, + "learning_rate": 1.0786571338995879e-05, + "loss": 0.8863, + "step": 8362 + }, + { + "epoch": 1.4219731230040509, + "grad_norm": 1.7109375, + "learning_rate": 1.0784770993260634e-05, + "loss": 0.8877, + "step": 8363 + }, + { + "epoch": 1.4221445870930407, + "grad_norm": 1.6875, + "learning_rate": 1.0782970621931048e-05, + "loss": 0.8147, + "step": 8364 + }, + { + "epoch": 1.4223160511820305, + "grad_norm": 1.859375, + "learning_rate": 1.0781170225065845e-05, + "loss": 0.8803, + "step": 8365 + }, + { + "epoch": 1.4224875152710204, + "grad_norm": 1.7890625, + "learning_rate": 1.0779369802723738e-05, + "loss": 0.8713, + "step": 8366 + }, + { + "epoch": 1.4226589793600102, + "grad_norm": 1.65625, + "learning_rate": 1.0777569354963448e-05, + "loss": 0.8642, + "step": 8367 + }, + { + "epoch": 1.4228304434490002, + "grad_norm": 1.7109375, + "learning_rate": 1.0775768881843693e-05, + "loss": 0.8819, + "step": 8368 + }, + { + "epoch": 1.42300190753799, + "grad_norm": 1.703125, + "learning_rate": 
1.0773968383423197e-05, + "loss": 0.8153, + "step": 8369 + }, + { + "epoch": 1.4231733716269799, + "grad_norm": 1.734375, + "learning_rate": 1.0772167859760674e-05, + "loss": 0.9, + "step": 8370 + }, + { + "epoch": 1.4233448357159697, + "grad_norm": 1.7265625, + "learning_rate": 1.0770367310914849e-05, + "loss": 0.8695, + "step": 8371 + }, + { + "epoch": 1.4235162998049595, + "grad_norm": 1.7734375, + "learning_rate": 1.0768566736944445e-05, + "loss": 0.9497, + "step": 8372 + }, + { + "epoch": 1.4236877638939496, + "grad_norm": 1.71875, + "learning_rate": 1.0766766137908187e-05, + "loss": 0.8198, + "step": 8373 + }, + { + "epoch": 1.4238592279829394, + "grad_norm": 1.7265625, + "learning_rate": 1.0764965513864796e-05, + "loss": 0.9214, + "step": 8374 + }, + { + "epoch": 1.4240306920719292, + "grad_norm": 1.6875, + "learning_rate": 1.0763164864873e-05, + "loss": 0.8708, + "step": 8375 + }, + { + "epoch": 1.424202156160919, + "grad_norm": 1.6796875, + "learning_rate": 1.0761364190991523e-05, + "loss": 0.8654, + "step": 8376 + }, + { + "epoch": 1.4243736202499089, + "grad_norm": 1.6171875, + "learning_rate": 1.0759563492279093e-05, + "loss": 0.8355, + "step": 8377 + }, + { + "epoch": 1.4245450843388987, + "grad_norm": 1.671875, + "learning_rate": 1.0757762768794438e-05, + "loss": 0.8843, + "step": 8378 + }, + { + "epoch": 1.4247165484278885, + "grad_norm": 1.6953125, + "learning_rate": 1.0755962020596285e-05, + "loss": 0.7986, + "step": 8379 + }, + { + "epoch": 1.4248880125168786, + "grad_norm": 1.6171875, + "learning_rate": 1.0754161247743367e-05, + "loss": 0.8253, + "step": 8380 + }, + { + "epoch": 1.4250594766058684, + "grad_norm": 1.6953125, + "learning_rate": 1.0752360450294408e-05, + "loss": 0.8199, + "step": 8381 + }, + { + "epoch": 1.4252309406948582, + "grad_norm": 1.6328125, + "learning_rate": 1.0750559628308142e-05, + "loss": 0.8519, + "step": 8382 + }, + { + "epoch": 1.425402404783848, + "grad_norm": 1.65625, + "learning_rate": 1.0748758781843296e-05, + "loss": 0.8915, + "step": 8383 + }, + { + "epoch": 1.4255738688728379, + "grad_norm": 1.703125, + "learning_rate": 1.0746957910958612e-05, + "loss": 0.8812, + "step": 8384 + }, + { + "epoch": 1.425745332961828, + "grad_norm": 1.7890625, + "learning_rate": 1.0745157015712814e-05, + "loss": 0.8603, + "step": 8385 + }, + { + "epoch": 1.4259167970508178, + "grad_norm": 1.78125, + "learning_rate": 1.0743356096164646e-05, + "loss": 0.8808, + "step": 8386 + }, + { + "epoch": 1.4260882611398076, + "grad_norm": 1.8046875, + "learning_rate": 1.074155515237283e-05, + "loss": 0.9764, + "step": 8387 + }, + { + "epoch": 1.4262597252287974, + "grad_norm": 1.640625, + "learning_rate": 1.0739754184396112e-05, + "loss": 0.8728, + "step": 8388 + }, + { + "epoch": 1.4264311893177872, + "grad_norm": 1.8671875, + "learning_rate": 1.0737953192293222e-05, + "loss": 0.8995, + "step": 8389 + }, + { + "epoch": 1.426602653406777, + "grad_norm": 1.671875, + "learning_rate": 1.0736152176122901e-05, + "loss": 0.7569, + "step": 8390 + }, + { + "epoch": 1.4267741174957669, + "grad_norm": 1.671875, + "learning_rate": 1.0734351135943883e-05, + "loss": 0.9227, + "step": 8391 + }, + { + "epoch": 1.4269455815847567, + "grad_norm": 1.65625, + "learning_rate": 1.0732550071814912e-05, + "loss": 0.8322, + "step": 8392 + }, + { + "epoch": 1.4271170456737468, + "grad_norm": 1.6796875, + "learning_rate": 1.0730748983794723e-05, + "loss": 0.884, + "step": 8393 + }, + { + "epoch": 1.4272885097627366, + "grad_norm": 1.6796875, + "learning_rate": 1.0728947871942057e-05, + "loss": 
0.8518, + "step": 8394 + }, + { + "epoch": 1.4274599738517264, + "grad_norm": 1.6640625, + "learning_rate": 1.0727146736315656e-05, + "loss": 0.791, + "step": 8395 + }, + { + "epoch": 1.4276314379407162, + "grad_norm": 1.6328125, + "learning_rate": 1.0725345576974265e-05, + "loss": 0.7869, + "step": 8396 + }, + { + "epoch": 1.4278029020297063, + "grad_norm": 1.5859375, + "learning_rate": 1.0723544393976622e-05, + "loss": 0.812, + "step": 8397 + }, + { + "epoch": 1.427974366118696, + "grad_norm": 1.703125, + "learning_rate": 1.0721743187381473e-05, + "loss": 0.888, + "step": 8398 + }, + { + "epoch": 1.428145830207686, + "grad_norm": 1.75, + "learning_rate": 1.0719941957247557e-05, + "loss": 0.9114, + "step": 8399 + }, + { + "epoch": 1.4283172942966758, + "grad_norm": 1.6484375, + "learning_rate": 1.0718140703633626e-05, + "loss": 0.8384, + "step": 8400 + }, + { + "epoch": 1.4283172942966758, + "eval_loss": 0.8431193828582764, + "eval_runtime": 836.8951, + "eval_samples_per_second": 2.986, + "eval_steps_per_second": 2.986, + "step": 8400 + }, + { + "epoch": 1.4284887583856656, + "grad_norm": 1.6484375, + "learning_rate": 1.0716339426598421e-05, + "loss": 0.8651, + "step": 8401 + }, + { + "epoch": 1.4286602224746554, + "grad_norm": 1.765625, + "learning_rate": 1.0714538126200688e-05, + "loss": 0.8133, + "step": 8402 + }, + { + "epoch": 1.4288316865636452, + "grad_norm": 1.75, + "learning_rate": 1.0712736802499179e-05, + "loss": 0.8836, + "step": 8403 + }, + { + "epoch": 1.429003150652635, + "grad_norm": 1.6328125, + "learning_rate": 1.0710935455552637e-05, + "loss": 0.8841, + "step": 8404 + }, + { + "epoch": 1.429174614741625, + "grad_norm": 1.75, + "learning_rate": 1.0709134085419814e-05, + "loss": 0.8592, + "step": 8405 + }, + { + "epoch": 1.429346078830615, + "grad_norm": 1.78125, + "learning_rate": 1.0707332692159459e-05, + "loss": 0.8613, + "step": 8406 + }, + { + "epoch": 1.4295175429196048, + "grad_norm": 1.75, + "learning_rate": 1.0705531275830319e-05, + "loss": 0.8449, + "step": 8407 + }, + { + "epoch": 1.4296890070085946, + "grad_norm": 1.6796875, + "learning_rate": 1.0703729836491148e-05, + "loss": 0.8013, + "step": 8408 + }, + { + "epoch": 1.4298604710975846, + "grad_norm": 1.65625, + "learning_rate": 1.0701928374200699e-05, + "loss": 0.8001, + "step": 8409 + }, + { + "epoch": 1.4300319351865745, + "grad_norm": 1.7109375, + "learning_rate": 1.070012688901772e-05, + "loss": 0.8349, + "step": 8410 + }, + { + "epoch": 1.4302033992755643, + "grad_norm": 1.6953125, + "learning_rate": 1.0698325381000965e-05, + "loss": 0.803, + "step": 8411 + }, + { + "epoch": 1.430374863364554, + "grad_norm": 1.7265625, + "learning_rate": 1.0696523850209193e-05, + "loss": 0.8824, + "step": 8412 + }, + { + "epoch": 1.430546327453544, + "grad_norm": 1.6796875, + "learning_rate": 1.0694722296701155e-05, + "loss": 0.8719, + "step": 8413 + }, + { + "epoch": 1.4307177915425338, + "grad_norm": 1.71875, + "learning_rate": 1.0692920720535604e-05, + "loss": 0.8876, + "step": 8414 + }, + { + "epoch": 1.4308892556315236, + "grad_norm": 1.6171875, + "learning_rate": 1.0691119121771305e-05, + "loss": 0.8705, + "step": 8415 + }, + { + "epoch": 1.4310607197205134, + "grad_norm": 1.7109375, + "learning_rate": 1.0689317500467006e-05, + "loss": 0.792, + "step": 8416 + }, + { + "epoch": 1.4312321838095035, + "grad_norm": 1.671875, + "learning_rate": 1.0687515856681466e-05, + "loss": 0.8669, + "step": 8417 + }, + { + "epoch": 1.4314036478984933, + "grad_norm": 1.59375, + "learning_rate": 1.0685714190473444e-05, + "loss": 
0.8029, + "step": 8418 + }, + { + "epoch": 1.431575111987483, + "grad_norm": 1.625, + "learning_rate": 1.0683912501901703e-05, + "loss": 0.8535, + "step": 8419 + }, + { + "epoch": 1.431746576076473, + "grad_norm": 1.703125, + "learning_rate": 1.0682110791024997e-05, + "loss": 0.9202, + "step": 8420 + }, + { + "epoch": 1.431918040165463, + "grad_norm": 1.6171875, + "learning_rate": 1.0680309057902091e-05, + "loss": 0.92, + "step": 8421 + }, + { + "epoch": 1.4320895042544528, + "grad_norm": 1.625, + "learning_rate": 1.0678507302591748e-05, + "loss": 0.8744, + "step": 8422 + }, + { + "epoch": 1.4322609683434426, + "grad_norm": 1.59375, + "learning_rate": 1.0676705525152722e-05, + "loss": 0.7187, + "step": 8423 + }, + { + "epoch": 1.4324324324324325, + "grad_norm": 1.5859375, + "learning_rate": 1.0674903725643783e-05, + "loss": 0.8218, + "step": 8424 + }, + { + "epoch": 1.4326038965214223, + "grad_norm": 1.7265625, + "learning_rate": 1.067310190412369e-05, + "loss": 0.8625, + "step": 8425 + }, + { + "epoch": 1.432775360610412, + "grad_norm": 1.6484375, + "learning_rate": 1.0671300060651215e-05, + "loss": 0.8802, + "step": 8426 + }, + { + "epoch": 1.432946824699402, + "grad_norm": 1.640625, + "learning_rate": 1.0669498195285113e-05, + "loss": 0.9051, + "step": 8427 + }, + { + "epoch": 1.4331182887883918, + "grad_norm": 1.609375, + "learning_rate": 1.0667696308084152e-05, + "loss": 0.795, + "step": 8428 + }, + { + "epoch": 1.4332897528773818, + "grad_norm": 1.734375, + "learning_rate": 1.0665894399107103e-05, + "loss": 0.8164, + "step": 8429 + }, + { + "epoch": 1.4334612169663716, + "grad_norm": 1.7265625, + "learning_rate": 1.0664092468412726e-05, + "loss": 0.8222, + "step": 8430 + }, + { + "epoch": 1.4336326810553615, + "grad_norm": 1.6953125, + "learning_rate": 1.0662290516059797e-05, + "loss": 0.8728, + "step": 8431 + }, + { + "epoch": 1.4338041451443513, + "grad_norm": 1.8046875, + "learning_rate": 1.0660488542107081e-05, + "loss": 0.8853, + "step": 8432 + }, + { + "epoch": 1.4339756092333413, + "grad_norm": 1.65625, + "learning_rate": 1.0658686546613344e-05, + "loss": 0.9017, + "step": 8433 + }, + { + "epoch": 1.4341470733223312, + "grad_norm": 1.734375, + "learning_rate": 1.065688452963736e-05, + "loss": 0.8426, + "step": 8434 + }, + { + "epoch": 1.434318537411321, + "grad_norm": 1.7734375, + "learning_rate": 1.0655082491237896e-05, + "loss": 0.8915, + "step": 8435 + }, + { + "epoch": 1.4344900015003108, + "grad_norm": 1.65625, + "learning_rate": 1.0653280431473725e-05, + "loss": 0.8554, + "step": 8436 + }, + { + "epoch": 1.4346614655893006, + "grad_norm": 1.75, + "learning_rate": 1.0651478350403621e-05, + "loss": 0.9187, + "step": 8437 + }, + { + "epoch": 1.4348329296782905, + "grad_norm": 1.703125, + "learning_rate": 1.0649676248086353e-05, + "loss": 0.8988, + "step": 8438 + }, + { + "epoch": 1.4350043937672803, + "grad_norm": 1.65625, + "learning_rate": 1.0647874124580697e-05, + "loss": 0.9059, + "step": 8439 + }, + { + "epoch": 1.43517585785627, + "grad_norm": 1.65625, + "learning_rate": 1.0646071979945427e-05, + "loss": 0.883, + "step": 8440 + }, + { + "epoch": 1.4353473219452602, + "grad_norm": 1.8984375, + "learning_rate": 1.0644269814239314e-05, + "loss": 0.9435, + "step": 8441 + }, + { + "epoch": 1.43551878603425, + "grad_norm": 1.71875, + "learning_rate": 1.0642467627521138e-05, + "loss": 0.9301, + "step": 8442 + }, + { + "epoch": 1.4356902501232398, + "grad_norm": 1.6796875, + "learning_rate": 1.0640665419849674e-05, + "loss": 0.8366, + "step": 8443 + }, + { + "epoch": 
1.4358617142122296, + "grad_norm": 1.6484375, + "learning_rate": 1.06388631912837e-05, + "loss": 0.8886, + "step": 8444 + }, + { + "epoch": 1.4360331783012195, + "grad_norm": 1.75, + "learning_rate": 1.063706094188199e-05, + "loss": 0.8579, + "step": 8445 + }, + { + "epoch": 1.4362046423902095, + "grad_norm": 1.6796875, + "learning_rate": 1.0635258671703324e-05, + "loss": 0.874, + "step": 8446 + }, + { + "epoch": 1.4363761064791993, + "grad_norm": 1.671875, + "learning_rate": 1.0633456380806478e-05, + "loss": 0.8619, + "step": 8447 + }, + { + "epoch": 1.4365475705681892, + "grad_norm": 1.7109375, + "learning_rate": 1.0631654069250235e-05, + "loss": 0.8869, + "step": 8448 + }, + { + "epoch": 1.436719034657179, + "grad_norm": 1.671875, + "learning_rate": 1.0629851737093375e-05, + "loss": 0.815, + "step": 8449 + }, + { + "epoch": 1.4368904987461688, + "grad_norm": 1.6328125, + "learning_rate": 1.062804938439468e-05, + "loss": 0.842, + "step": 8450 + }, + { + "epoch": 1.4370619628351586, + "grad_norm": 1.5625, + "learning_rate": 1.0626247011212924e-05, + "loss": 0.8771, + "step": 8451 + }, + { + "epoch": 1.4372334269241485, + "grad_norm": 1.6640625, + "learning_rate": 1.0624444617606899e-05, + "loss": 0.865, + "step": 8452 + }, + { + "epoch": 1.4374048910131385, + "grad_norm": 1.640625, + "learning_rate": 1.0622642203635383e-05, + "loss": 0.8332, + "step": 8453 + }, + { + "epoch": 1.4375763551021283, + "grad_norm": 1.78125, + "learning_rate": 1.0620839769357158e-05, + "loss": 0.904, + "step": 8454 + }, + { + "epoch": 1.4377478191911182, + "grad_norm": 1.6953125, + "learning_rate": 1.0619037314831015e-05, + "loss": 0.8609, + "step": 8455 + }, + { + "epoch": 1.437919283280108, + "grad_norm": 1.7265625, + "learning_rate": 1.0617234840115731e-05, + "loss": 0.9157, + "step": 8456 + }, + { + "epoch": 1.4380907473690978, + "grad_norm": 1.7421875, + "learning_rate": 1.0615432345270095e-05, + "loss": 0.7958, + "step": 8457 + }, + { + "epoch": 1.4382622114580879, + "grad_norm": 1.7265625, + "learning_rate": 1.0613629830352892e-05, + "loss": 0.9063, + "step": 8458 + }, + { + "epoch": 1.4384336755470777, + "grad_norm": 1.671875, + "learning_rate": 1.061182729542291e-05, + "loss": 0.8571, + "step": 8459 + }, + { + "epoch": 1.4386051396360675, + "grad_norm": 1.59375, + "learning_rate": 1.0610024740538933e-05, + "loss": 0.843, + "step": 8460 + }, + { + "epoch": 1.4387766037250573, + "grad_norm": 1.6328125, + "learning_rate": 1.0608222165759758e-05, + "loss": 0.7849, + "step": 8461 + }, + { + "epoch": 1.4389480678140472, + "grad_norm": 1.6328125, + "learning_rate": 1.0606419571144166e-05, + "loss": 0.7812, + "step": 8462 + }, + { + "epoch": 1.439119531903037, + "grad_norm": 1.6875, + "learning_rate": 1.0604616956750947e-05, + "loss": 0.7796, + "step": 8463 + }, + { + "epoch": 1.4392909959920268, + "grad_norm": 1.7265625, + "learning_rate": 1.0602814322638893e-05, + "loss": 0.8566, + "step": 8464 + }, + { + "epoch": 1.4394624600810169, + "grad_norm": 1.7890625, + "learning_rate": 1.0601011668866793e-05, + "loss": 0.8924, + "step": 8465 + }, + { + "epoch": 1.4396339241700067, + "grad_norm": 1.6015625, + "learning_rate": 1.0599208995493437e-05, + "loss": 0.7558, + "step": 8466 + }, + { + "epoch": 1.4398053882589965, + "grad_norm": 1.6875, + "learning_rate": 1.0597406302577622e-05, + "loss": 0.858, + "step": 8467 + }, + { + "epoch": 1.4399768523479863, + "grad_norm": 1.625, + "learning_rate": 1.0595603590178138e-05, + "loss": 0.8138, + "step": 8468 + }, + { + "epoch": 1.4401483164369762, + "grad_norm": 1.65625, + 
"learning_rate": 1.0593800858353778e-05, + "loss": 0.8355, + "step": 8469 + }, + { + "epoch": 1.4403197805259662, + "grad_norm": 1.6328125, + "learning_rate": 1.0591998107163335e-05, + "loss": 0.7906, + "step": 8470 + }, + { + "epoch": 1.440491244614956, + "grad_norm": 1.640625, + "learning_rate": 1.0590195336665605e-05, + "loss": 0.8069, + "step": 8471 + }, + { + "epoch": 1.4406627087039459, + "grad_norm": 1.6953125, + "learning_rate": 1.0588392546919384e-05, + "loss": 0.8495, + "step": 8472 + }, + { + "epoch": 1.4408341727929357, + "grad_norm": 1.6484375, + "learning_rate": 1.0586589737983465e-05, + "loss": 0.8981, + "step": 8473 + }, + { + "epoch": 1.4410056368819255, + "grad_norm": 1.6640625, + "learning_rate": 1.0584786909916647e-05, + "loss": 0.8765, + "step": 8474 + }, + { + "epoch": 1.4411771009709153, + "grad_norm": 1.7265625, + "learning_rate": 1.0582984062777726e-05, + "loss": 0.7787, + "step": 8475 + }, + { + "epoch": 1.4413485650599052, + "grad_norm": 1.7421875, + "learning_rate": 1.0581181196625496e-05, + "loss": 0.8043, + "step": 8476 + }, + { + "epoch": 1.4415200291488952, + "grad_norm": 1.6875, + "learning_rate": 1.0579378311518759e-05, + "loss": 0.8697, + "step": 8477 + }, + { + "epoch": 1.441691493237885, + "grad_norm": 1.65625, + "learning_rate": 1.0577575407516318e-05, + "loss": 0.9282, + "step": 8478 + }, + { + "epoch": 1.4418629573268749, + "grad_norm": 1.796875, + "learning_rate": 1.0575772484676965e-05, + "loss": 0.9028, + "step": 8479 + }, + { + "epoch": 1.4420344214158647, + "grad_norm": 1.8359375, + "learning_rate": 1.0573969543059503e-05, + "loss": 0.8979, + "step": 8480 + }, + { + "epoch": 1.4422058855048545, + "grad_norm": 1.7109375, + "learning_rate": 1.0572166582722734e-05, + "loss": 0.7764, + "step": 8481 + }, + { + "epoch": 1.4423773495938446, + "grad_norm": 1.6796875, + "learning_rate": 1.0570363603725456e-05, + "loss": 0.8835, + "step": 8482 + }, + { + "epoch": 1.4425488136828344, + "grad_norm": 1.6953125, + "learning_rate": 1.0568560606126475e-05, + "loss": 0.7674, + "step": 8483 + }, + { + "epoch": 1.4427202777718242, + "grad_norm": 1.7890625, + "learning_rate": 1.0566757589984593e-05, + "loss": 0.9045, + "step": 8484 + }, + { + "epoch": 1.442891741860814, + "grad_norm": 1.671875, + "learning_rate": 1.0564954555358606e-05, + "loss": 0.8809, + "step": 8485 + }, + { + "epoch": 1.4430632059498039, + "grad_norm": 1.7109375, + "learning_rate": 1.056315150230733e-05, + "loss": 0.8396, + "step": 8486 + }, + { + "epoch": 1.4432346700387937, + "grad_norm": 1.7578125, + "learning_rate": 1.056134843088956e-05, + "loss": 0.89, + "step": 8487 + }, + { + "epoch": 1.4434061341277835, + "grad_norm": 1.796875, + "learning_rate": 1.0559545341164103e-05, + "loss": 0.8926, + "step": 8488 + }, + { + "epoch": 1.4435775982167733, + "grad_norm": 1.75, + "learning_rate": 1.0557742233189767e-05, + "loss": 0.8657, + "step": 8489 + }, + { + "epoch": 1.4437490623057634, + "grad_norm": 1.75, + "learning_rate": 1.0555939107025357e-05, + "loss": 0.8467, + "step": 8490 + }, + { + "epoch": 1.4439205263947532, + "grad_norm": 1.65625, + "learning_rate": 1.055413596272968e-05, + "loss": 0.8783, + "step": 8491 + }, + { + "epoch": 1.444091990483743, + "grad_norm": 1.7734375, + "learning_rate": 1.055233280036154e-05, + "loss": 0.8765, + "step": 8492 + }, + { + "epoch": 1.4442634545727329, + "grad_norm": 1.7734375, + "learning_rate": 1.0550529619979748e-05, + "loss": 0.9313, + "step": 8493 + }, + { + "epoch": 1.444434918661723, + "grad_norm": 1.71875, + "learning_rate": 
1.0548726421643112e-05, + "loss": 0.8334, + "step": 8494 + }, + { + "epoch": 1.4446063827507127, + "grad_norm": 1.765625, + "learning_rate": 1.054692320541044e-05, + "loss": 0.8327, + "step": 8495 + }, + { + "epoch": 1.4447778468397026, + "grad_norm": 1.7265625, + "learning_rate": 1.0545119971340545e-05, + "loss": 0.943, + "step": 8496 + }, + { + "epoch": 1.4449493109286924, + "grad_norm": 1.734375, + "learning_rate": 1.0543316719492233e-05, + "loss": 0.8807, + "step": 8497 + }, + { + "epoch": 1.4451207750176822, + "grad_norm": 1.703125, + "learning_rate": 1.054151344992432e-05, + "loss": 0.8186, + "step": 8498 + }, + { + "epoch": 1.445292239106672, + "grad_norm": 1.6484375, + "learning_rate": 1.0539710162695611e-05, + "loss": 0.943, + "step": 8499 + }, + { + "epoch": 1.4454637031956619, + "grad_norm": 1.796875, + "learning_rate": 1.053790685786492e-05, + "loss": 0.8957, + "step": 8500 + }, + { + "epoch": 1.4456351672846517, + "grad_norm": 1.6484375, + "learning_rate": 1.0536103535491067e-05, + "loss": 0.8395, + "step": 8501 + }, + { + "epoch": 1.4458066313736417, + "grad_norm": 1.7421875, + "learning_rate": 1.0534300195632856e-05, + "loss": 0.802, + "step": 8502 + }, + { + "epoch": 1.4459780954626316, + "grad_norm": 1.7421875, + "learning_rate": 1.0532496838349101e-05, + "loss": 0.8584, + "step": 8503 + }, + { + "epoch": 1.4461495595516214, + "grad_norm": 1.6484375, + "learning_rate": 1.0530693463698622e-05, + "loss": 0.8241, + "step": 8504 + }, + { + "epoch": 1.4463210236406112, + "grad_norm": 1.6015625, + "learning_rate": 1.0528890071740227e-05, + "loss": 0.8376, + "step": 8505 + }, + { + "epoch": 1.4464924877296013, + "grad_norm": 1.7578125, + "learning_rate": 1.0527086662532737e-05, + "loss": 0.9222, + "step": 8506 + }, + { + "epoch": 1.446663951818591, + "grad_norm": 1.8046875, + "learning_rate": 1.0525283236134968e-05, + "loss": 0.9086, + "step": 8507 + }, + { + "epoch": 1.446835415907581, + "grad_norm": 1.734375, + "learning_rate": 1.0523479792605733e-05, + "loss": 0.8853, + "step": 8508 + }, + { + "epoch": 1.4470068799965707, + "grad_norm": 1.6796875, + "learning_rate": 1.0521676332003851e-05, + "loss": 0.8754, + "step": 8509 + }, + { + "epoch": 1.4471783440855606, + "grad_norm": 1.828125, + "learning_rate": 1.051987285438814e-05, + "loss": 0.8787, + "step": 8510 + }, + { + "epoch": 1.4473498081745504, + "grad_norm": 1.6640625, + "learning_rate": 1.0518069359817418e-05, + "loss": 0.8779, + "step": 8511 + }, + { + "epoch": 1.4475212722635402, + "grad_norm": 1.7109375, + "learning_rate": 1.0516265848350502e-05, + "loss": 0.9028, + "step": 8512 + }, + { + "epoch": 1.44769273635253, + "grad_norm": 1.796875, + "learning_rate": 1.0514462320046212e-05, + "loss": 0.8451, + "step": 8513 + }, + { + "epoch": 1.44786420044152, + "grad_norm": 1.625, + "learning_rate": 1.051265877496337e-05, + "loss": 0.8443, + "step": 8514 + }, + { + "epoch": 1.44803566453051, + "grad_norm": 1.59375, + "learning_rate": 1.0510855213160793e-05, + "loss": 0.8919, + "step": 8515 + }, + { + "epoch": 1.4482071286194997, + "grad_norm": 1.7421875, + "learning_rate": 1.0509051634697305e-05, + "loss": 0.8251, + "step": 8516 + }, + { + "epoch": 1.4483785927084896, + "grad_norm": 1.7890625, + "learning_rate": 1.0507248039631726e-05, + "loss": 0.8602, + "step": 8517 + }, + { + "epoch": 1.4485500567974796, + "grad_norm": 1.71875, + "learning_rate": 1.050544442802288e-05, + "loss": 0.8148, + "step": 8518 + }, + { + "epoch": 1.4487215208864694, + "grad_norm": 1.796875, + "learning_rate": 1.0503640799929586e-05, + "loss": 
0.8857, + "step": 8519 + }, + { + "epoch": 1.4488929849754593, + "grad_norm": 1.671875, + "learning_rate": 1.050183715541067e-05, + "loss": 0.8083, + "step": 8520 + }, + { + "epoch": 1.449064449064449, + "grad_norm": 1.6875, + "learning_rate": 1.0500033494524955e-05, + "loss": 0.9437, + "step": 8521 + }, + { + "epoch": 1.449235913153439, + "grad_norm": 1.6953125, + "learning_rate": 1.0498229817331262e-05, + "loss": 0.82, + "step": 8522 + }, + { + "epoch": 1.4494073772424287, + "grad_norm": 1.71875, + "learning_rate": 1.049642612388842e-05, + "loss": 0.9085, + "step": 8523 + }, + { + "epoch": 1.4495788413314186, + "grad_norm": 1.6015625, + "learning_rate": 1.049462241425525e-05, + "loss": 0.7999, + "step": 8524 + }, + { + "epoch": 1.4497503054204084, + "grad_norm": 1.6328125, + "learning_rate": 1.0492818688490587e-05, + "loss": 0.9326, + "step": 8525 + }, + { + "epoch": 1.4499217695093984, + "grad_norm": 1.640625, + "learning_rate": 1.0491014946653248e-05, + "loss": 0.8545, + "step": 8526 + }, + { + "epoch": 1.4500932335983883, + "grad_norm": 1.625, + "learning_rate": 1.0489211188802061e-05, + "loss": 0.9042, + "step": 8527 + }, + { + "epoch": 1.450264697687378, + "grad_norm": 1.7578125, + "learning_rate": 1.0487407414995856e-05, + "loss": 0.8962, + "step": 8528 + }, + { + "epoch": 1.450436161776368, + "grad_norm": 1.6640625, + "learning_rate": 1.048560362529346e-05, + "loss": 0.833, + "step": 8529 + }, + { + "epoch": 1.4506076258653577, + "grad_norm": 1.71875, + "learning_rate": 1.0483799819753704e-05, + "loss": 0.8709, + "step": 8530 + }, + { + "epoch": 1.4507790899543478, + "grad_norm": 1.6953125, + "learning_rate": 1.048199599843541e-05, + "loss": 0.8699, + "step": 8531 + }, + { + "epoch": 1.4509505540433376, + "grad_norm": 1.703125, + "learning_rate": 1.0480192161397408e-05, + "loss": 0.855, + "step": 8532 + }, + { + "epoch": 1.4511220181323274, + "grad_norm": 1.6640625, + "learning_rate": 1.0478388308698537e-05, + "loss": 0.9085, + "step": 8533 + }, + { + "epoch": 1.4512934822213173, + "grad_norm": 1.765625, + "learning_rate": 1.0476584440397622e-05, + "loss": 0.81, + "step": 8534 + }, + { + "epoch": 1.451464946310307, + "grad_norm": 1.6796875, + "learning_rate": 1.0474780556553491e-05, + "loss": 0.8483, + "step": 8535 + }, + { + "epoch": 1.451636410399297, + "grad_norm": 1.734375, + "learning_rate": 1.0472976657224982e-05, + "loss": 0.9003, + "step": 8536 + }, + { + "epoch": 1.4518078744882867, + "grad_norm": 1.7890625, + "learning_rate": 1.047117274247092e-05, + "loss": 0.848, + "step": 8537 + }, + { + "epoch": 1.4519793385772768, + "grad_norm": 1.765625, + "learning_rate": 1.046936881235014e-05, + "loss": 0.8042, + "step": 8538 + }, + { + "epoch": 1.4521508026662666, + "grad_norm": 1.609375, + "learning_rate": 1.0467564866921478e-05, + "loss": 0.8757, + "step": 8539 + }, + { + "epoch": 1.4523222667552564, + "grad_norm": 1.6875, + "learning_rate": 1.0465760906243762e-05, + "loss": 0.8383, + "step": 8540 + }, + { + "epoch": 1.4524937308442463, + "grad_norm": 1.71875, + "learning_rate": 1.0463956930375832e-05, + "loss": 0.7948, + "step": 8541 + }, + { + "epoch": 1.452665194933236, + "grad_norm": 1.6875, + "learning_rate": 1.0462152939376518e-05, + "loss": 0.9375, + "step": 8542 + }, + { + "epoch": 1.4528366590222261, + "grad_norm": 1.625, + "learning_rate": 1.0460348933304656e-05, + "loss": 0.8774, + "step": 8543 + }, + { + "epoch": 1.453008123111216, + "grad_norm": 1.7890625, + "learning_rate": 1.0458544912219082e-05, + "loss": 0.9465, + "step": 8544 + }, + { + "epoch": 
1.4531795872002058, + "grad_norm": 1.7734375, + "learning_rate": 1.0456740876178633e-05, + "loss": 0.8724, + "step": 8545 + }, + { + "epoch": 1.4533510512891956, + "grad_norm": 1.78125, + "learning_rate": 1.0454936825242143e-05, + "loss": 0.9094, + "step": 8546 + }, + { + "epoch": 1.4535225153781854, + "grad_norm": 1.5859375, + "learning_rate": 1.0453132759468449e-05, + "loss": 0.9206, + "step": 8547 + }, + { + "epoch": 1.4536939794671753, + "grad_norm": 1.8046875, + "learning_rate": 1.0451328678916393e-05, + "loss": 0.9343, + "step": 8548 + }, + { + "epoch": 1.453865443556165, + "grad_norm": 1.6171875, + "learning_rate": 1.0449524583644806e-05, + "loss": 0.8568, + "step": 8549 + }, + { + "epoch": 1.4540369076451551, + "grad_norm": 1.7265625, + "learning_rate": 1.044772047371253e-05, + "loss": 0.8451, + "step": 8550 + }, + { + "epoch": 1.454208371734145, + "grad_norm": 1.765625, + "learning_rate": 1.0445916349178404e-05, + "loss": 0.8701, + "step": 8551 + }, + { + "epoch": 1.4543798358231348, + "grad_norm": 1.578125, + "learning_rate": 1.0444112210101264e-05, + "loss": 0.8086, + "step": 8552 + }, + { + "epoch": 1.4545512999121246, + "grad_norm": 1.6796875, + "learning_rate": 1.0442308056539956e-05, + "loss": 0.8005, + "step": 8553 + }, + { + "epoch": 1.4547227640011144, + "grad_norm": 1.640625, + "learning_rate": 1.0440503888553316e-05, + "loss": 0.8714, + "step": 8554 + }, + { + "epoch": 1.4548942280901045, + "grad_norm": 1.609375, + "learning_rate": 1.0438699706200184e-05, + "loss": 0.8887, + "step": 8555 + }, + { + "epoch": 1.4550656921790943, + "grad_norm": 1.796875, + "learning_rate": 1.0436895509539405e-05, + "loss": 0.9123, + "step": 8556 + }, + { + "epoch": 1.4552371562680841, + "grad_norm": 1.6484375, + "learning_rate": 1.0435091298629815e-05, + "loss": 0.8783, + "step": 8557 + }, + { + "epoch": 1.455408620357074, + "grad_norm": 1.71875, + "learning_rate": 1.0433287073530263e-05, + "loss": 0.897, + "step": 8558 + }, + { + "epoch": 1.4555800844460638, + "grad_norm": 1.78125, + "learning_rate": 1.0431482834299586e-05, + "loss": 0.8824, + "step": 8559 + }, + { + "epoch": 1.4557515485350536, + "grad_norm": 1.640625, + "learning_rate": 1.0429678580996627e-05, + "loss": 0.8112, + "step": 8560 + }, + { + "epoch": 1.4559230126240434, + "grad_norm": 1.75, + "learning_rate": 1.0427874313680234e-05, + "loss": 0.861, + "step": 8561 + }, + { + "epoch": 1.4560944767130335, + "grad_norm": 1.6953125, + "learning_rate": 1.0426070032409247e-05, + "loss": 0.863, + "step": 8562 + }, + { + "epoch": 1.4562659408020233, + "grad_norm": 1.703125, + "learning_rate": 1.0424265737242514e-05, + "loss": 0.8264, + "step": 8563 + }, + { + "epoch": 1.4564374048910131, + "grad_norm": 1.7734375, + "learning_rate": 1.0422461428238874e-05, + "loss": 0.8788, + "step": 8564 + }, + { + "epoch": 1.456608868980003, + "grad_norm": 1.71875, + "learning_rate": 1.0420657105457182e-05, + "loss": 0.8383, + "step": 8565 + }, + { + "epoch": 1.4567803330689928, + "grad_norm": 1.7109375, + "learning_rate": 1.0418852768956274e-05, + "loss": 0.9114, + "step": 8566 + }, + { + "epoch": 1.4569517971579828, + "grad_norm": 1.703125, + "learning_rate": 1.0417048418795e-05, + "loss": 0.8532, + "step": 8567 + }, + { + "epoch": 1.4571232612469727, + "grad_norm": 1.734375, + "learning_rate": 1.0415244055032205e-05, + "loss": 0.8919, + "step": 8568 + }, + { + "epoch": 1.4572947253359625, + "grad_norm": 1.625, + "learning_rate": 1.041343967772674e-05, + "loss": 0.9256, + "step": 8569 + }, + { + "epoch": 1.4574661894249523, + "grad_norm": 
1.671875, + "learning_rate": 1.0411635286937448e-05, + "loss": 0.8848, + "step": 8570 + }, + { + "epoch": 1.4576376535139421, + "grad_norm": 1.625, + "learning_rate": 1.0409830882723182e-05, + "loss": 0.8562, + "step": 8571 + }, + { + "epoch": 1.457809117602932, + "grad_norm": 1.7578125, + "learning_rate": 1.0408026465142787e-05, + "loss": 0.938, + "step": 8572 + }, + { + "epoch": 1.4579805816919218, + "grad_norm": 1.671875, + "learning_rate": 1.040622203425511e-05, + "loss": 0.8254, + "step": 8573 + }, + { + "epoch": 1.4581520457809118, + "grad_norm": 1.6953125, + "learning_rate": 1.0404417590119004e-05, + "loss": 0.8174, + "step": 8574 + }, + { + "epoch": 1.4583235098699017, + "grad_norm": 1.7265625, + "learning_rate": 1.0402613132793317e-05, + "loss": 0.9119, + "step": 8575 + }, + { + "epoch": 1.4584949739588915, + "grad_norm": 1.6875, + "learning_rate": 1.0400808662336906e-05, + "loss": 0.8193, + "step": 8576 + }, + { + "epoch": 1.4586664380478813, + "grad_norm": 1.7265625, + "learning_rate": 1.0399004178808609e-05, + "loss": 0.8587, + "step": 8577 + }, + { + "epoch": 1.4588379021368711, + "grad_norm": 1.6015625, + "learning_rate": 1.0397199682267283e-05, + "loss": 0.8488, + "step": 8578 + }, + { + "epoch": 1.4590093662258612, + "grad_norm": 1.7421875, + "learning_rate": 1.0395395172771779e-05, + "loss": 0.8938, + "step": 8579 + }, + { + "epoch": 1.459180830314851, + "grad_norm": 1.6796875, + "learning_rate": 1.0393590650380951e-05, + "loss": 0.8643, + "step": 8580 + }, + { + "epoch": 1.4593522944038408, + "grad_norm": 1.5859375, + "learning_rate": 1.039178611515365e-05, + "loss": 0.7745, + "step": 8581 + }, + { + "epoch": 1.4595237584928307, + "grad_norm": 1.7265625, + "learning_rate": 1.038998156714873e-05, + "loss": 0.8998, + "step": 8582 + }, + { + "epoch": 1.4596952225818205, + "grad_norm": 1.734375, + "learning_rate": 1.0388177006425042e-05, + "loss": 0.9082, + "step": 8583 + }, + { + "epoch": 1.4598666866708103, + "grad_norm": 1.6484375, + "learning_rate": 1.0386372433041438e-05, + "loss": 0.8217, + "step": 8584 + }, + { + "epoch": 1.4600381507598001, + "grad_norm": 1.6953125, + "learning_rate": 1.0384567847056776e-05, + "loss": 0.8611, + "step": 8585 + }, + { + "epoch": 1.46020961484879, + "grad_norm": 1.6640625, + "learning_rate": 1.0382763248529908e-05, + "loss": 0.7299, + "step": 8586 + }, + { + "epoch": 1.46038107893778, + "grad_norm": 1.7734375, + "learning_rate": 1.0380958637519688e-05, + "loss": 0.9224, + "step": 8587 + }, + { + "epoch": 1.4605525430267698, + "grad_norm": 1.796875, + "learning_rate": 1.0379154014084973e-05, + "loss": 0.8753, + "step": 8588 + }, + { + "epoch": 1.4607240071157597, + "grad_norm": 1.7109375, + "learning_rate": 1.0377349378284618e-05, + "loss": 0.9347, + "step": 8589 + }, + { + "epoch": 1.4608954712047495, + "grad_norm": 1.609375, + "learning_rate": 1.0375544730177477e-05, + "loss": 0.8349, + "step": 8590 + }, + { + "epoch": 1.4610669352937395, + "grad_norm": 1.75, + "learning_rate": 1.0373740069822411e-05, + "loss": 0.8822, + "step": 8591 + }, + { + "epoch": 1.4612383993827294, + "grad_norm": 1.765625, + "learning_rate": 1.0371935397278272e-05, + "loss": 0.896, + "step": 8592 + }, + { + "epoch": 1.4614098634717192, + "grad_norm": 1.7265625, + "learning_rate": 1.0370130712603919e-05, + "loss": 0.8061, + "step": 8593 + }, + { + "epoch": 1.461581327560709, + "grad_norm": 1.71875, + "learning_rate": 1.0368326015858212e-05, + "loss": 0.8535, + "step": 8594 + }, + { + "epoch": 1.4617527916496988, + "grad_norm": 1.5703125, + "learning_rate": 
1.0366521307100007e-05, + "loss": 0.773, + "step": 8595 + }, + { + "epoch": 1.4619242557386887, + "grad_norm": 1.6640625, + "learning_rate": 1.036471658638816e-05, + "loss": 0.8229, + "step": 8596 + }, + { + "epoch": 1.4620957198276785, + "grad_norm": 1.6953125, + "learning_rate": 1.036291185378153e-05, + "loss": 0.9088, + "step": 8597 + }, + { + "epoch": 1.4622671839166683, + "grad_norm": 1.7890625, + "learning_rate": 1.036110710933898e-05, + "loss": 0.8838, + "step": 8598 + }, + { + "epoch": 1.4624386480056584, + "grad_norm": 1.6953125, + "learning_rate": 1.0359302353119365e-05, + "loss": 0.8271, + "step": 8599 + }, + { + "epoch": 1.4626101120946482, + "grad_norm": 1.7109375, + "learning_rate": 1.035749758518155e-05, + "loss": 0.8883, + "step": 8600 + }, + { + "epoch": 1.462781576183638, + "grad_norm": 1.6953125, + "learning_rate": 1.0355692805584394e-05, + "loss": 0.8199, + "step": 8601 + }, + { + "epoch": 1.4629530402726278, + "grad_norm": 1.6328125, + "learning_rate": 1.0353888014386753e-05, + "loss": 0.8284, + "step": 8602 + }, + { + "epoch": 1.463124504361618, + "grad_norm": 1.65625, + "learning_rate": 1.0352083211647493e-05, + "loss": 0.8684, + "step": 8603 + }, + { + "epoch": 1.4632959684506077, + "grad_norm": 1.640625, + "learning_rate": 1.0350278397425473e-05, + "loss": 0.7954, + "step": 8604 + }, + { + "epoch": 1.4634674325395975, + "grad_norm": 1.6015625, + "learning_rate": 1.0348473571779556e-05, + "loss": 0.8125, + "step": 8605 + }, + { + "epoch": 1.4636388966285874, + "grad_norm": 1.6953125, + "learning_rate": 1.0346668734768603e-05, + "loss": 0.9514, + "step": 8606 + }, + { + "epoch": 1.4638103607175772, + "grad_norm": 1.6484375, + "learning_rate": 1.0344863886451476e-05, + "loss": 0.8339, + "step": 8607 + }, + { + "epoch": 1.463981824806567, + "grad_norm": 1.734375, + "learning_rate": 1.0343059026887039e-05, + "loss": 0.7786, + "step": 8608 + }, + { + "epoch": 1.4641532888955568, + "grad_norm": 1.6640625, + "learning_rate": 1.0341254156134157e-05, + "loss": 0.8554, + "step": 8609 + }, + { + "epoch": 1.4643247529845467, + "grad_norm": 1.5546875, + "learning_rate": 1.033944927425169e-05, + "loss": 0.8097, + "step": 8610 + }, + { + "epoch": 1.4644962170735367, + "grad_norm": 1.5625, + "learning_rate": 1.0337644381298508e-05, + "loss": 0.8734, + "step": 8611 + }, + { + "epoch": 1.4646676811625265, + "grad_norm": 1.65625, + "learning_rate": 1.033583947733347e-05, + "loss": 0.8432, + "step": 8612 + }, + { + "epoch": 1.4648391452515164, + "grad_norm": 1.6640625, + "learning_rate": 1.0334034562415438e-05, + "loss": 0.8445, + "step": 8613 + }, + { + "epoch": 1.4650106093405062, + "grad_norm": 1.6953125, + "learning_rate": 1.0332229636603283e-05, + "loss": 0.8689, + "step": 8614 + }, + { + "epoch": 1.4651820734294962, + "grad_norm": 1.671875, + "learning_rate": 1.033042469995587e-05, + "loss": 0.8642, + "step": 8615 + }, + { + "epoch": 1.465353537518486, + "grad_norm": 1.75, + "learning_rate": 1.0328619752532063e-05, + "loss": 0.989, + "step": 8616 + }, + { + "epoch": 1.465525001607476, + "grad_norm": 1.640625, + "learning_rate": 1.0326814794390728e-05, + "loss": 0.9084, + "step": 8617 + }, + { + "epoch": 1.4656964656964657, + "grad_norm": 1.6953125, + "learning_rate": 1.032500982559073e-05, + "loss": 0.8981, + "step": 8618 + }, + { + "epoch": 1.4658679297854555, + "grad_norm": 1.6328125, + "learning_rate": 1.032320484619094e-05, + "loss": 0.8218, + "step": 8619 + }, + { + "epoch": 1.4660393938744454, + "grad_norm": 1.671875, + "learning_rate": 1.032139985625022e-05, + "loss": 
0.8495, + "step": 8620 + }, + { + "epoch": 1.4662108579634352, + "grad_norm": 1.6171875, + "learning_rate": 1.0319594855827444e-05, + "loss": 0.8132, + "step": 8621 + }, + { + "epoch": 1.466382322052425, + "grad_norm": 1.7421875, + "learning_rate": 1.0317789844981474e-05, + "loss": 0.8734, + "step": 8622 + }, + { + "epoch": 1.466553786141415, + "grad_norm": 1.703125, + "learning_rate": 1.0315984823771183e-05, + "loss": 0.7964, + "step": 8623 + }, + { + "epoch": 1.466725250230405, + "grad_norm": 1.671875, + "learning_rate": 1.0314179792255436e-05, + "loss": 0.9034, + "step": 8624 + }, + { + "epoch": 1.4668967143193947, + "grad_norm": 1.71875, + "learning_rate": 1.0312374750493101e-05, + "loss": 0.8649, + "step": 8625 + }, + { + "epoch": 1.4670681784083845, + "grad_norm": 1.625, + "learning_rate": 1.031056969854305e-05, + "loss": 0.8618, + "step": 8626 + }, + { + "epoch": 1.4672396424973744, + "grad_norm": 1.6015625, + "learning_rate": 1.030876463646415e-05, + "loss": 0.8017, + "step": 8627 + }, + { + "epoch": 1.4674111065863644, + "grad_norm": 1.75, + "learning_rate": 1.0306959564315278e-05, + "loss": 0.8205, + "step": 8628 + }, + { + "epoch": 1.4675825706753542, + "grad_norm": 1.6640625, + "learning_rate": 1.0305154482155296e-05, + "loss": 0.8564, + "step": 8629 + }, + { + "epoch": 1.467754034764344, + "grad_norm": 1.75, + "learning_rate": 1.0303349390043076e-05, + "loss": 0.8954, + "step": 8630 + }, + { + "epoch": 1.467925498853334, + "grad_norm": 1.609375, + "learning_rate": 1.030154428803749e-05, + "loss": 0.8172, + "step": 8631 + }, + { + "epoch": 1.4680969629423237, + "grad_norm": 1.640625, + "learning_rate": 1.0299739176197409e-05, + "loss": 0.8819, + "step": 8632 + }, + { + "epoch": 1.4682684270313135, + "grad_norm": 1.6171875, + "learning_rate": 1.0297934054581707e-05, + "loss": 0.7962, + "step": 8633 + }, + { + "epoch": 1.4684398911203034, + "grad_norm": 1.6328125, + "learning_rate": 1.0296128923249251e-05, + "loss": 0.757, + "step": 8634 + }, + { + "epoch": 1.4686113552092934, + "grad_norm": 1.703125, + "learning_rate": 1.0294323782258917e-05, + "loss": 0.8211, + "step": 8635 + }, + { + "epoch": 1.4687828192982832, + "grad_norm": 1.7890625, + "learning_rate": 1.0292518631669575e-05, + "loss": 0.8835, + "step": 8636 + }, + { + "epoch": 1.468954283387273, + "grad_norm": 1.6953125, + "learning_rate": 1.02907134715401e-05, + "loss": 0.8118, + "step": 8637 + }, + { + "epoch": 1.469125747476263, + "grad_norm": 1.6796875, + "learning_rate": 1.0288908301929364e-05, + "loss": 0.8418, + "step": 8638 + }, + { + "epoch": 1.4692972115652527, + "grad_norm": 1.7265625, + "learning_rate": 1.0287103122896237e-05, + "loss": 0.8533, + "step": 8639 + }, + { + "epoch": 1.4694686756542428, + "grad_norm": 1.7109375, + "learning_rate": 1.0285297934499604e-05, + "loss": 0.8903, + "step": 8640 + }, + { + "epoch": 1.4696401397432326, + "grad_norm": 1.7890625, + "learning_rate": 1.0283492736798327e-05, + "loss": 0.8225, + "step": 8641 + }, + { + "epoch": 1.4698116038322224, + "grad_norm": 1.65625, + "learning_rate": 1.028168752985128e-05, + "loss": 0.9088, + "step": 8642 + }, + { + "epoch": 1.4699830679212122, + "grad_norm": 1.6015625, + "learning_rate": 1.0279882313717346e-05, + "loss": 0.7967, + "step": 8643 + }, + { + "epoch": 1.470154532010202, + "grad_norm": 1.6875, + "learning_rate": 1.0278077088455396e-05, + "loss": 0.8248, + "step": 8644 + }, + { + "epoch": 1.470325996099192, + "grad_norm": 1.6640625, + "learning_rate": 1.0276271854124303e-05, + "loss": 0.8308, + "step": 8645 + }, + { + "epoch": 
1.4704974601881817, + "grad_norm": 1.7265625, + "learning_rate": 1.0274466610782948e-05, + "loss": 0.8704, + "step": 8646 + }, + { + "epoch": 1.4706689242771718, + "grad_norm": 1.7109375, + "learning_rate": 1.0272661358490199e-05, + "loss": 0.934, + "step": 8647 + }, + { + "epoch": 1.4708403883661616, + "grad_norm": 1.875, + "learning_rate": 1.0270856097304939e-05, + "loss": 0.9382, + "step": 8648 + }, + { + "epoch": 1.4710118524551514, + "grad_norm": 1.78125, + "learning_rate": 1.026905082728604e-05, + "loss": 0.9287, + "step": 8649 + }, + { + "epoch": 1.4711833165441413, + "grad_norm": 1.671875, + "learning_rate": 1.0267245548492382e-05, + "loss": 0.8068, + "step": 8650 + }, + { + "epoch": 1.471354780633131, + "grad_norm": 1.6796875, + "learning_rate": 1.0265440260982841e-05, + "loss": 0.8128, + "step": 8651 + }, + { + "epoch": 1.4715262447221211, + "grad_norm": 1.6875, + "learning_rate": 1.0263634964816293e-05, + "loss": 0.9054, + "step": 8652 + }, + { + "epoch": 1.471697708811111, + "grad_norm": 1.6875, + "learning_rate": 1.0261829660051615e-05, + "loss": 0.8416, + "step": 8653 + }, + { + "epoch": 1.4718691729001008, + "grad_norm": 1.8125, + "learning_rate": 1.0260024346747683e-05, + "loss": 0.9562, + "step": 8654 + }, + { + "epoch": 1.4720406369890906, + "grad_norm": 1.6640625, + "learning_rate": 1.0258219024963382e-05, + "loss": 0.8248, + "step": 8655 + }, + { + "epoch": 1.4722121010780804, + "grad_norm": 1.6875, + "learning_rate": 1.0256413694757585e-05, + "loss": 0.9124, + "step": 8656 + }, + { + "epoch": 1.4723835651670703, + "grad_norm": 1.5859375, + "learning_rate": 1.0254608356189176e-05, + "loss": 0.8353, + "step": 8657 + }, + { + "epoch": 1.47255502925606, + "grad_norm": 1.75, + "learning_rate": 1.0252803009317025e-05, + "loss": 0.873, + "step": 8658 + }, + { + "epoch": 1.4727264933450501, + "grad_norm": 1.5625, + "learning_rate": 1.0250997654200014e-05, + "loss": 0.75, + "step": 8659 + }, + { + "epoch": 1.47289795743404, + "grad_norm": 1.671875, + "learning_rate": 1.0249192290897028e-05, + "loss": 0.9558, + "step": 8660 + }, + { + "epoch": 1.4730694215230298, + "grad_norm": 1.6796875, + "learning_rate": 1.0247386919466942e-05, + "loss": 0.8325, + "step": 8661 + }, + { + "epoch": 1.4732408856120196, + "grad_norm": 1.7578125, + "learning_rate": 1.0245581539968637e-05, + "loss": 0.9022, + "step": 8662 + }, + { + "epoch": 1.4734123497010094, + "grad_norm": 1.640625, + "learning_rate": 1.0243776152460993e-05, + "loss": 0.7822, + "step": 8663 + }, + { + "epoch": 1.4735838137899995, + "grad_norm": 1.625, + "learning_rate": 1.024197075700289e-05, + "loss": 0.8517, + "step": 8664 + }, + { + "epoch": 1.4737552778789893, + "grad_norm": 1.6796875, + "learning_rate": 1.0240165353653211e-05, + "loss": 0.859, + "step": 8665 + }, + { + "epoch": 1.4739267419679791, + "grad_norm": 1.6953125, + "learning_rate": 1.0238359942470834e-05, + "loss": 0.8532, + "step": 8666 + }, + { + "epoch": 1.474098206056969, + "grad_norm": 1.625, + "learning_rate": 1.0236554523514644e-05, + "loss": 0.7895, + "step": 8667 + }, + { + "epoch": 1.4742696701459588, + "grad_norm": 1.6875, + "learning_rate": 1.023474909684352e-05, + "loss": 0.8081, + "step": 8668 + }, + { + "epoch": 1.4744411342349486, + "grad_norm": 1.71875, + "learning_rate": 1.0232943662516343e-05, + "loss": 0.8918, + "step": 8669 + }, + { + "epoch": 1.4746125983239384, + "grad_norm": 1.703125, + "learning_rate": 1.0231138220591996e-05, + "loss": 0.9468, + "step": 8670 + }, + { + "epoch": 1.4747840624129283, + "grad_norm": 1.7734375, + 
"learning_rate": 1.022933277112936e-05, + "loss": 0.8936, + "step": 8671 + }, + { + "epoch": 1.4749555265019183, + "grad_norm": 1.8359375, + "learning_rate": 1.022752731418732e-05, + "loss": 1.0092, + "step": 8672 + }, + { + "epoch": 1.4751269905909081, + "grad_norm": 1.6953125, + "learning_rate": 1.0225721849824754e-05, + "loss": 0.8682, + "step": 8673 + }, + { + "epoch": 1.475298454679898, + "grad_norm": 1.6875, + "learning_rate": 1.0223916378100555e-05, + "loss": 0.8251, + "step": 8674 + }, + { + "epoch": 1.4754699187688878, + "grad_norm": 1.734375, + "learning_rate": 1.0222110899073597e-05, + "loss": 0.8766, + "step": 8675 + }, + { + "epoch": 1.4756413828578778, + "grad_norm": 1.6953125, + "learning_rate": 1.0220305412802765e-05, + "loss": 0.8304, + "step": 8676 + }, + { + "epoch": 1.4758128469468677, + "grad_norm": 1.671875, + "learning_rate": 1.0218499919346945e-05, + "loss": 0.8465, + "step": 8677 + }, + { + "epoch": 1.4759843110358575, + "grad_norm": 1.6640625, + "learning_rate": 1.0216694418765019e-05, + "loss": 0.8278, + "step": 8678 + }, + { + "epoch": 1.4761557751248473, + "grad_norm": 1.8203125, + "learning_rate": 1.0214888911115872e-05, + "loss": 0.9232, + "step": 8679 + }, + { + "epoch": 1.4763272392138371, + "grad_norm": 1.671875, + "learning_rate": 1.021308339645839e-05, + "loss": 0.864, + "step": 8680 + }, + { + "epoch": 1.476498703302827, + "grad_norm": 1.6328125, + "learning_rate": 1.0211277874851453e-05, + "loss": 0.7689, + "step": 8681 + }, + { + "epoch": 1.4766701673918168, + "grad_norm": 1.71875, + "learning_rate": 1.020947234635395e-05, + "loss": 0.9219, + "step": 8682 + }, + { + "epoch": 1.4768416314808066, + "grad_norm": 1.6328125, + "learning_rate": 1.0207666811024766e-05, + "loss": 0.8304, + "step": 8683 + }, + { + "epoch": 1.4770130955697967, + "grad_norm": 1.765625, + "learning_rate": 1.0205861268922786e-05, + "loss": 0.931, + "step": 8684 + }, + { + "epoch": 1.4771845596587865, + "grad_norm": 1.671875, + "learning_rate": 1.0204055720106891e-05, + "loss": 0.8371, + "step": 8685 + }, + { + "epoch": 1.4773560237477763, + "grad_norm": 1.6328125, + "learning_rate": 1.0202250164635977e-05, + "loss": 0.8339, + "step": 8686 + }, + { + "epoch": 1.4775274878367661, + "grad_norm": 1.71875, + "learning_rate": 1.020044460256892e-05, + "loss": 0.8499, + "step": 8687 + }, + { + "epoch": 1.4776989519257562, + "grad_norm": 1.703125, + "learning_rate": 1.0198639033964608e-05, + "loss": 0.8418, + "step": 8688 + }, + { + "epoch": 1.477870416014746, + "grad_norm": 1.640625, + "learning_rate": 1.0196833458881931e-05, + "loss": 0.8294, + "step": 8689 + }, + { + "epoch": 1.4780418801037358, + "grad_norm": 1.65625, + "learning_rate": 1.0195027877379773e-05, + "loss": 0.7643, + "step": 8690 + }, + { + "epoch": 1.4782133441927257, + "grad_norm": 1.7421875, + "learning_rate": 1.0193222289517021e-05, + "loss": 0.8708, + "step": 8691 + }, + { + "epoch": 1.4783848082817155, + "grad_norm": 1.7109375, + "learning_rate": 1.0191416695352564e-05, + "loss": 0.8079, + "step": 8692 + }, + { + "epoch": 1.4785562723707053, + "grad_norm": 1.671875, + "learning_rate": 1.0189611094945287e-05, + "loss": 0.8953, + "step": 8693 + }, + { + "epoch": 1.4787277364596951, + "grad_norm": 1.6953125, + "learning_rate": 1.0187805488354079e-05, + "loss": 0.8435, + "step": 8694 + }, + { + "epoch": 1.478899200548685, + "grad_norm": 1.609375, + "learning_rate": 1.0185999875637827e-05, + "loss": 0.7899, + "step": 8695 + }, + { + "epoch": 1.479070664637675, + "grad_norm": 1.671875, + "learning_rate": 
1.0184194256855418e-05, + "loss": 0.8195, + "step": 8696 + }, + { + "epoch": 1.4792421287266648, + "grad_norm": 1.625, + "learning_rate": 1.0182388632065738e-05, + "loss": 0.8243, + "step": 8697 + }, + { + "epoch": 1.4794135928156547, + "grad_norm": 1.78125, + "learning_rate": 1.0180583001327684e-05, + "loss": 0.9301, + "step": 8698 + }, + { + "epoch": 1.4795850569046445, + "grad_norm": 1.71875, + "learning_rate": 1.0178777364700134e-05, + "loss": 0.8632, + "step": 8699 + }, + { + "epoch": 1.4797565209936345, + "grad_norm": 1.7265625, + "learning_rate": 1.0176971722241982e-05, + "loss": 0.8346, + "step": 8700 + }, + { + "epoch": 1.4799279850826244, + "grad_norm": 1.6640625, + "learning_rate": 1.0175166074012113e-05, + "loss": 0.8498, + "step": 8701 + }, + { + "epoch": 1.4800994491716142, + "grad_norm": 1.7734375, + "learning_rate": 1.0173360420069421e-05, + "loss": 0.8662, + "step": 8702 + }, + { + "epoch": 1.480270913260604, + "grad_norm": 1.703125, + "learning_rate": 1.0171554760472796e-05, + "loss": 0.811, + "step": 8703 + }, + { + "epoch": 1.4804423773495938, + "grad_norm": 1.7265625, + "learning_rate": 1.0169749095281118e-05, + "loss": 0.9006, + "step": 8704 + }, + { + "epoch": 1.4806138414385837, + "grad_norm": 1.8046875, + "learning_rate": 1.0167943424553288e-05, + "loss": 0.9017, + "step": 8705 + }, + { + "epoch": 1.4807853055275735, + "grad_norm": 1.6171875, + "learning_rate": 1.0166137748348188e-05, + "loss": 0.8064, + "step": 8706 + }, + { + "epoch": 1.4809567696165633, + "grad_norm": 1.6640625, + "learning_rate": 1.0164332066724712e-05, + "loss": 0.8161, + "step": 8707 + }, + { + "epoch": 1.4811282337055534, + "grad_norm": 1.6796875, + "learning_rate": 1.0162526379741746e-05, + "loss": 0.8809, + "step": 8708 + }, + { + "epoch": 1.4812996977945432, + "grad_norm": 1.6484375, + "learning_rate": 1.0160720687458183e-05, + "loss": 0.7899, + "step": 8709 + }, + { + "epoch": 1.481471161883533, + "grad_norm": 1.625, + "learning_rate": 1.0158914989932915e-05, + "loss": 0.8231, + "step": 8710 + }, + { + "epoch": 1.4816426259725228, + "grad_norm": 1.6171875, + "learning_rate": 1.015710928722483e-05, + "loss": 0.8379, + "step": 8711 + }, + { + "epoch": 1.4818140900615129, + "grad_norm": 1.671875, + "learning_rate": 1.0155303579392819e-05, + "loss": 0.8668, + "step": 8712 + }, + { + "epoch": 1.4819855541505027, + "grad_norm": 1.734375, + "learning_rate": 1.0153497866495774e-05, + "loss": 0.897, + "step": 8713 + }, + { + "epoch": 1.4821570182394925, + "grad_norm": 1.7109375, + "learning_rate": 1.0151692148592584e-05, + "loss": 0.8135, + "step": 8714 + }, + { + "epoch": 1.4823284823284824, + "grad_norm": 1.7109375, + "learning_rate": 1.0149886425742145e-05, + "loss": 0.827, + "step": 8715 + }, + { + "epoch": 1.4824999464174722, + "grad_norm": 1.734375, + "learning_rate": 1.0148080698003347e-05, + "loss": 0.8225, + "step": 8716 + }, + { + "epoch": 1.482671410506462, + "grad_norm": 1.71875, + "learning_rate": 1.0146274965435074e-05, + "loss": 0.9086, + "step": 8717 + }, + { + "epoch": 1.4828428745954518, + "grad_norm": 1.78125, + "learning_rate": 1.0144469228096228e-05, + "loss": 0.8494, + "step": 8718 + }, + { + "epoch": 1.4830143386844417, + "grad_norm": 1.6484375, + "learning_rate": 1.0142663486045692e-05, + "loss": 0.79, + "step": 8719 + }, + { + "epoch": 1.4831858027734317, + "grad_norm": 1.625, + "learning_rate": 1.0140857739342365e-05, + "loss": 0.8612, + "step": 8720 + }, + { + "epoch": 1.4833572668624215, + "grad_norm": 1.6328125, + "learning_rate": 1.0139051988045136e-05, + "loss": 
0.8386, + "step": 8721 + }, + { + "epoch": 1.4835287309514114, + "grad_norm": 1.6796875, + "learning_rate": 1.0137246232212901e-05, + "loss": 0.8252, + "step": 8722 + }, + { + "epoch": 1.4837001950404012, + "grad_norm": 1.6640625, + "learning_rate": 1.0135440471904546e-05, + "loss": 0.7856, + "step": 8723 + }, + { + "epoch": 1.483871659129391, + "grad_norm": 1.7578125, + "learning_rate": 1.0133634707178968e-05, + "loss": 0.917, + "step": 8724 + }, + { + "epoch": 1.484043123218381, + "grad_norm": 1.7421875, + "learning_rate": 1.0131828938095059e-05, + "loss": 0.8445, + "step": 8725 + }, + { + "epoch": 1.4842145873073709, + "grad_norm": 1.6171875, + "learning_rate": 1.0130023164711715e-05, + "loss": 0.8146, + "step": 8726 + }, + { + "epoch": 1.4843860513963607, + "grad_norm": 1.6640625, + "learning_rate": 1.0128217387087825e-05, + "loss": 0.8103, + "step": 8727 + }, + { + "epoch": 1.4845575154853505, + "grad_norm": 1.609375, + "learning_rate": 1.0126411605282279e-05, + "loss": 0.8867, + "step": 8728 + }, + { + "epoch": 1.4847289795743404, + "grad_norm": 1.75, + "learning_rate": 1.0124605819353974e-05, + "loss": 0.96, + "step": 8729 + }, + { + "epoch": 1.4849004436633302, + "grad_norm": 1.703125, + "learning_rate": 1.0122800029361808e-05, + "loss": 0.9119, + "step": 8730 + }, + { + "epoch": 1.48507190775232, + "grad_norm": 1.703125, + "learning_rate": 1.012099423536467e-05, + "loss": 0.8755, + "step": 8731 + }, + { + "epoch": 1.48524337184131, + "grad_norm": 1.703125, + "learning_rate": 1.0119188437421456e-05, + "loss": 0.8813, + "step": 8732 + }, + { + "epoch": 1.4854148359302999, + "grad_norm": 1.7265625, + "learning_rate": 1.0117382635591055e-05, + "loss": 0.8764, + "step": 8733 + }, + { + "epoch": 1.4855863000192897, + "grad_norm": 1.7734375, + "learning_rate": 1.0115576829932365e-05, + "loss": 0.884, + "step": 8734 + }, + { + "epoch": 1.4857577641082795, + "grad_norm": 1.6640625, + "learning_rate": 1.0113771020504281e-05, + "loss": 0.8937, + "step": 8735 + }, + { + "epoch": 1.4859292281972694, + "grad_norm": 1.7734375, + "learning_rate": 1.0111965207365695e-05, + "loss": 0.8606, + "step": 8736 + }, + { + "epoch": 1.4861006922862594, + "grad_norm": 1.6171875, + "learning_rate": 1.0110159390575501e-05, + "loss": 0.8762, + "step": 8737 + }, + { + "epoch": 1.4862721563752492, + "grad_norm": 1.703125, + "learning_rate": 1.0108353570192595e-05, + "loss": 0.8707, + "step": 8738 + }, + { + "epoch": 1.486443620464239, + "grad_norm": 1.7578125, + "learning_rate": 1.010654774627587e-05, + "loss": 0.8746, + "step": 8739 + }, + { + "epoch": 1.4866150845532289, + "grad_norm": 1.640625, + "learning_rate": 1.0104741918884222e-05, + "loss": 0.8249, + "step": 8740 + }, + { + "epoch": 1.4867865486422187, + "grad_norm": 1.671875, + "learning_rate": 1.0102936088076547e-05, + "loss": 0.8829, + "step": 8741 + }, + { + "epoch": 1.4869580127312085, + "grad_norm": 1.6796875, + "learning_rate": 1.0101130253911737e-05, + "loss": 0.8185, + "step": 8742 + }, + { + "epoch": 1.4871294768201984, + "grad_norm": 1.6328125, + "learning_rate": 1.009932441644869e-05, + "loss": 0.9014, + "step": 8743 + }, + { + "epoch": 1.4873009409091884, + "grad_norm": 1.71875, + "learning_rate": 1.0097518575746301e-05, + "loss": 0.8529, + "step": 8744 + }, + { + "epoch": 1.4874724049981782, + "grad_norm": 1.6484375, + "learning_rate": 1.0095712731863463e-05, + "loss": 0.8076, + "step": 8745 + }, + { + "epoch": 1.487643869087168, + "grad_norm": 1.75, + "learning_rate": 1.009390688485907e-05, + "loss": 0.8656, + "step": 8746 + }, + { + 
"epoch": 1.4878153331761579, + "grad_norm": 1.671875, + "learning_rate": 1.009210103479202e-05, + "loss": 0.7715, + "step": 8747 + }, + { + "epoch": 1.4879867972651477, + "grad_norm": 1.7890625, + "learning_rate": 1.0090295181721209e-05, + "loss": 0.9446, + "step": 8748 + }, + { + "epoch": 1.4881582613541378, + "grad_norm": 1.640625, + "learning_rate": 1.0088489325705535e-05, + "loss": 0.8537, + "step": 8749 + }, + { + "epoch": 1.4883297254431276, + "grad_norm": 1.6875, + "learning_rate": 1.0086683466803889e-05, + "loss": 0.9564, + "step": 8750 + }, + { + "epoch": 1.4885011895321174, + "grad_norm": 1.734375, + "learning_rate": 1.0084877605075168e-05, + "loss": 0.9122, + "step": 8751 + }, + { + "epoch": 1.4886726536211072, + "grad_norm": 1.7421875, + "learning_rate": 1.0083071740578269e-05, + "loss": 0.8204, + "step": 8752 + }, + { + "epoch": 1.488844117710097, + "grad_norm": 1.7421875, + "learning_rate": 1.0081265873372088e-05, + "loss": 0.7796, + "step": 8753 + }, + { + "epoch": 1.4890155817990869, + "grad_norm": 1.7109375, + "learning_rate": 1.0079460003515522e-05, + "loss": 0.8749, + "step": 8754 + }, + { + "epoch": 1.4891870458880767, + "grad_norm": 1.65625, + "learning_rate": 1.0077654131067467e-05, + "loss": 0.8704, + "step": 8755 + }, + { + "epoch": 1.4893585099770668, + "grad_norm": 1.796875, + "learning_rate": 1.0075848256086814e-05, + "loss": 0.9248, + "step": 8756 + }, + { + "epoch": 1.4895299740660566, + "grad_norm": 1.65625, + "learning_rate": 1.0074042378632464e-05, + "loss": 0.8705, + "step": 8757 + }, + { + "epoch": 1.4897014381550464, + "grad_norm": 1.8046875, + "learning_rate": 1.0072236498763316e-05, + "loss": 0.8117, + "step": 8758 + }, + { + "epoch": 1.4898729022440362, + "grad_norm": 1.6953125, + "learning_rate": 1.0070430616538262e-05, + "loss": 0.8788, + "step": 8759 + }, + { + "epoch": 1.490044366333026, + "grad_norm": 1.7421875, + "learning_rate": 1.0068624732016201e-05, + "loss": 0.8653, + "step": 8760 + }, + { + "epoch": 1.490215830422016, + "grad_norm": 1.5859375, + "learning_rate": 1.0066818845256032e-05, + "loss": 0.7809, + "step": 8761 + }, + { + "epoch": 1.490387294511006, + "grad_norm": 1.6484375, + "learning_rate": 1.0065012956316647e-05, + "loss": 0.8333, + "step": 8762 + }, + { + "epoch": 1.4905587585999958, + "grad_norm": 1.6796875, + "learning_rate": 1.0063207065256944e-05, + "loss": 0.8556, + "step": 8763 + }, + { + "epoch": 1.4907302226889856, + "grad_norm": 1.71875, + "learning_rate": 1.006140117213582e-05, + "loss": 0.9508, + "step": 8764 + }, + { + "epoch": 1.4909016867779754, + "grad_norm": 1.6953125, + "learning_rate": 1.0059595277012173e-05, + "loss": 0.8847, + "step": 8765 + }, + { + "epoch": 1.4910731508669652, + "grad_norm": 1.8359375, + "learning_rate": 1.00577893799449e-05, + "loss": 0.9396, + "step": 8766 + }, + { + "epoch": 1.491244614955955, + "grad_norm": 1.6953125, + "learning_rate": 1.0055983480992897e-05, + "loss": 0.9099, + "step": 8767 + }, + { + "epoch": 1.4914160790449449, + "grad_norm": 1.71875, + "learning_rate": 1.0054177580215063e-05, + "loss": 0.8431, + "step": 8768 + }, + { + "epoch": 1.491587543133935, + "grad_norm": 1.7421875, + "learning_rate": 1.0052371677670291e-05, + "loss": 0.8217, + "step": 8769 + }, + { + "epoch": 1.4917590072229248, + "grad_norm": 1.640625, + "learning_rate": 1.0050565773417484e-05, + "loss": 0.8995, + "step": 8770 + }, + { + "epoch": 1.4919304713119146, + "grad_norm": 1.71875, + "learning_rate": 1.0048759867515538e-05, + "loss": 0.9212, + "step": 8771 + }, + { + "epoch": 1.4921019354009044, + 
"grad_norm": 1.671875, + "learning_rate": 1.0046953960023346e-05, + "loss": 0.8676, + "step": 8772 + }, + { + "epoch": 1.4922733994898945, + "grad_norm": 1.6796875, + "learning_rate": 1.0045148050999814e-05, + "loss": 0.8223, + "step": 8773 + }, + { + "epoch": 1.4924448635788843, + "grad_norm": 1.59375, + "learning_rate": 1.0043342140503829e-05, + "loss": 0.8115, + "step": 8774 + }, + { + "epoch": 1.492616327667874, + "grad_norm": 1.6328125, + "learning_rate": 1.0041536228594293e-05, + "loss": 0.8233, + "step": 8775 + }, + { + "epoch": 1.492787791756864, + "grad_norm": 1.625, + "learning_rate": 1.0039730315330105e-05, + "loss": 0.8075, + "step": 8776 + }, + { + "epoch": 1.4929592558458538, + "grad_norm": 1.6640625, + "learning_rate": 1.0037924400770163e-05, + "loss": 0.9386, + "step": 8777 + }, + { + "epoch": 1.4931307199348436, + "grad_norm": 1.6953125, + "learning_rate": 1.0036118484973365e-05, + "loss": 0.8208, + "step": 8778 + }, + { + "epoch": 1.4933021840238334, + "grad_norm": 1.625, + "learning_rate": 1.0034312567998605e-05, + "loss": 0.8615, + "step": 8779 + }, + { + "epoch": 1.4934736481128232, + "grad_norm": 1.7421875, + "learning_rate": 1.0032506649904783e-05, + "loss": 0.9546, + "step": 8780 + }, + { + "epoch": 1.4936451122018133, + "grad_norm": 1.65625, + "learning_rate": 1.0030700730750798e-05, + "loss": 0.8694, + "step": 8781 + }, + { + "epoch": 1.493816576290803, + "grad_norm": 1.5703125, + "learning_rate": 1.0028894810595547e-05, + "loss": 0.753, + "step": 8782 + }, + { + "epoch": 1.493988040379793, + "grad_norm": 1.6171875, + "learning_rate": 1.0027088889497924e-05, + "loss": 0.849, + "step": 8783 + }, + { + "epoch": 1.4941595044687828, + "grad_norm": 1.7421875, + "learning_rate": 1.0025282967516833e-05, + "loss": 0.8654, + "step": 8784 + }, + { + "epoch": 1.4943309685577728, + "grad_norm": 1.6484375, + "learning_rate": 1.0023477044711168e-05, + "loss": 0.8175, + "step": 8785 + }, + { + "epoch": 1.4945024326467626, + "grad_norm": 1.7265625, + "learning_rate": 1.0021671121139828e-05, + "loss": 0.8664, + "step": 8786 + }, + { + "epoch": 1.4946738967357525, + "grad_norm": 1.6953125, + "learning_rate": 1.0019865196861714e-05, + "loss": 0.8677, + "step": 8787 + }, + { + "epoch": 1.4948453608247423, + "grad_norm": 1.625, + "learning_rate": 1.0018059271935719e-05, + "loss": 0.8039, + "step": 8788 + }, + { + "epoch": 1.4950168249137321, + "grad_norm": 1.796875, + "learning_rate": 1.0016253346420742e-05, + "loss": 0.8663, + "step": 8789 + }, + { + "epoch": 1.495188289002722, + "grad_norm": 1.7109375, + "learning_rate": 1.0014447420375686e-05, + "loss": 0.8492, + "step": 8790 + }, + { + "epoch": 1.4953597530917118, + "grad_norm": 1.6796875, + "learning_rate": 1.0012641493859441e-05, + "loss": 0.8019, + "step": 8791 + }, + { + "epoch": 1.4955312171807016, + "grad_norm": 1.6640625, + "learning_rate": 1.0010835566930912e-05, + "loss": 0.8758, + "step": 8792 + }, + { + "epoch": 1.4957026812696916, + "grad_norm": 1.6953125, + "learning_rate": 1.0009029639648993e-05, + "loss": 0.8206, + "step": 8793 + }, + { + "epoch": 1.4958741453586815, + "grad_norm": 1.671875, + "learning_rate": 1.000722371207258e-05, + "loss": 0.895, + "step": 8794 + }, + { + "epoch": 1.4960456094476713, + "grad_norm": 1.640625, + "learning_rate": 1.0005417784260578e-05, + "loss": 0.8039, + "step": 8795 + }, + { + "epoch": 1.4962170735366611, + "grad_norm": 1.75, + "learning_rate": 1.0003611856271885e-05, + "loss": 0.8428, + "step": 8796 + }, + { + "epoch": 1.4963885376256512, + "grad_norm": 1.6953125, + 
"learning_rate": 1.0001805928165392e-05, + "loss": 0.8586, + "step": 8797 + }, + { + "epoch": 1.496560001714641, + "grad_norm": 1.671875, + "learning_rate": 1e-05, + "loss": 0.8142, + "step": 8798 + }, + { + "epoch": 1.4967314658036308, + "grad_norm": 1.6640625, + "learning_rate": 9.99819407183461e-06, + "loss": 0.835, + "step": 8799 + }, + { + "epoch": 1.4969029298926206, + "grad_norm": 1.7578125, + "learning_rate": 9.996388143728118e-06, + "loss": 0.8529, + "step": 8800 + }, + { + "epoch": 1.4970743939816105, + "grad_norm": 1.6953125, + "learning_rate": 9.994582215739422e-06, + "loss": 0.8341, + "step": 8801 + }, + { + "epoch": 1.4972458580706003, + "grad_norm": 1.7109375, + "learning_rate": 9.99277628792742e-06, + "loss": 0.8221, + "step": 8802 + }, + { + "epoch": 1.4974173221595901, + "grad_norm": 1.640625, + "learning_rate": 9.990970360351012e-06, + "loss": 0.8617, + "step": 8803 + }, + { + "epoch": 1.49758878624858, + "grad_norm": 1.7578125, + "learning_rate": 9.989164433069093e-06, + "loss": 0.8577, + "step": 8804 + }, + { + "epoch": 1.49776025033757, + "grad_norm": 1.859375, + "learning_rate": 9.987358506140564e-06, + "loss": 0.8556, + "step": 8805 + }, + { + "epoch": 1.4979317144265598, + "grad_norm": 1.8203125, + "learning_rate": 9.98555257962432e-06, + "loss": 0.8676, + "step": 8806 + }, + { + "epoch": 1.4981031785155496, + "grad_norm": 1.7109375, + "learning_rate": 9.983746653579262e-06, + "loss": 0.9136, + "step": 8807 + }, + { + "epoch": 1.4982746426045395, + "grad_norm": 1.6484375, + "learning_rate": 9.981940728064286e-06, + "loss": 0.9122, + "step": 8808 + }, + { + "epoch": 1.4984461066935295, + "grad_norm": 1.6796875, + "learning_rate": 9.980134803138291e-06, + "loss": 0.8316, + "step": 8809 + }, + { + "epoch": 1.4986175707825193, + "grad_norm": 1.71875, + "learning_rate": 9.978328878860174e-06, + "loss": 0.9408, + "step": 8810 + }, + { + "epoch": 1.4987890348715092, + "grad_norm": 1.6484375, + "learning_rate": 9.976522955288834e-06, + "loss": 0.7907, + "step": 8811 + }, + { + "epoch": 1.498960498960499, + "grad_norm": 1.6875, + "learning_rate": 9.974717032483169e-06, + "loss": 0.8882, + "step": 8812 + }, + { + "epoch": 1.4991319630494888, + "grad_norm": 1.6640625, + "learning_rate": 9.972911110502078e-06, + "loss": 0.787, + "step": 8813 + }, + { + "epoch": 1.4993034271384786, + "grad_norm": 1.71875, + "learning_rate": 9.971105189404455e-06, + "loss": 0.9062, + "step": 8814 + }, + { + "epoch": 1.4994748912274685, + "grad_norm": 1.7265625, + "learning_rate": 9.969299269249202e-06, + "loss": 0.8384, + "step": 8815 + }, + { + "epoch": 1.4996463553164583, + "grad_norm": 1.71875, + "learning_rate": 9.96749335009522e-06, + "loss": 0.8959, + "step": 8816 + }, + { + "epoch": 1.4998178194054483, + "grad_norm": 1.6328125, + "learning_rate": 9.9656874320014e-06, + "loss": 0.8492, + "step": 8817 + }, + { + "epoch": 1.4999892834944382, + "grad_norm": 1.6171875, + "learning_rate": 9.96388151502664e-06, + "loss": 0.7821, + "step": 8818 + }, + { + "epoch": 1.500160747583428, + "grad_norm": 1.609375, + "learning_rate": 9.96207559922984e-06, + "loss": 0.8641, + "step": 8819 + }, + { + "epoch": 1.5003322116724178, + "grad_norm": 1.7578125, + "learning_rate": 9.960269684669898e-06, + "loss": 0.8587, + "step": 8820 + }, + { + "epoch": 1.5005036757614079, + "grad_norm": 1.609375, + "learning_rate": 9.95846377140571e-06, + "loss": 0.8383, + "step": 8821 + }, + { + "epoch": 1.5006751398503977, + "grad_norm": 1.7109375, + "learning_rate": 9.956657859496175e-06, + "loss": 0.8629, + "step": 8822 + 
}, + { + "epoch": 1.5008466039393875, + "grad_norm": 1.671875, + "learning_rate": 9.954851949000189e-06, + "loss": 0.8301, + "step": 8823 + }, + { + "epoch": 1.5010180680283773, + "grad_norm": 1.640625, + "learning_rate": 9.953046039976654e-06, + "loss": 0.8623, + "step": 8824 + }, + { + "epoch": 1.5011895321173672, + "grad_norm": 1.65625, + "learning_rate": 9.951240132484464e-06, + "loss": 0.7772, + "step": 8825 + }, + { + "epoch": 1.501360996206357, + "grad_norm": 1.6484375, + "learning_rate": 9.949434226582516e-06, + "loss": 0.9161, + "step": 8826 + }, + { + "epoch": 1.5015324602953468, + "grad_norm": 1.734375, + "learning_rate": 9.947628322329714e-06, + "loss": 0.889, + "step": 8827 + }, + { + "epoch": 1.5017039243843366, + "grad_norm": 1.7421875, + "learning_rate": 9.945822419784944e-06, + "loss": 0.8393, + "step": 8828 + }, + { + "epoch": 1.5018753884733265, + "grad_norm": 1.7109375, + "learning_rate": 9.944016519007108e-06, + "loss": 0.8285, + "step": 8829 + }, + { + "epoch": 1.5020468525623165, + "grad_norm": 1.75, + "learning_rate": 9.942210620055102e-06, + "loss": 0.8412, + "step": 8830 + }, + { + "epoch": 1.5022183166513063, + "grad_norm": 1.78125, + "learning_rate": 9.94040472298783e-06, + "loss": 0.8117, + "step": 8831 + }, + { + "epoch": 1.5023897807402962, + "grad_norm": 1.75, + "learning_rate": 9.938598827864183e-06, + "loss": 0.8945, + "step": 8832 + }, + { + "epoch": 1.5025612448292862, + "grad_norm": 1.7265625, + "learning_rate": 9.936792934743059e-06, + "loss": 0.8701, + "step": 8833 + }, + { + "epoch": 1.502732708918276, + "grad_norm": 1.625, + "learning_rate": 9.934987043683356e-06, + "loss": 0.8044, + "step": 8834 + }, + { + "epoch": 1.5029041730072659, + "grad_norm": 1.765625, + "learning_rate": 9.933181154743971e-06, + "loss": 0.8532, + "step": 8835 + }, + { + "epoch": 1.5030756370962557, + "grad_norm": 1.625, + "learning_rate": 9.931375267983799e-06, + "loss": 0.8653, + "step": 8836 + }, + { + "epoch": 1.5032471011852455, + "grad_norm": 1.71875, + "learning_rate": 9.929569383461738e-06, + "loss": 0.8524, + "step": 8837 + }, + { + "epoch": 1.5034185652742353, + "grad_norm": 1.609375, + "learning_rate": 9.927763501236685e-06, + "loss": 0.8179, + "step": 8838 + }, + { + "epoch": 1.5035900293632252, + "grad_norm": 1.703125, + "learning_rate": 9.925957621367538e-06, + "loss": 0.841, + "step": 8839 + }, + { + "epoch": 1.503761493452215, + "grad_norm": 1.6796875, + "learning_rate": 9.92415174391319e-06, + "loss": 0.8189, + "step": 8840 + }, + { + "epoch": 1.5039329575412048, + "grad_norm": 1.6953125, + "learning_rate": 9.92234586893254e-06, + "loss": 0.7867, + "step": 8841 + }, + { + "epoch": 1.5041044216301949, + "grad_norm": 1.625, + "learning_rate": 9.920539996484482e-06, + "loss": 0.8511, + "step": 8842 + }, + { + "epoch": 1.5042758857191847, + "grad_norm": 1.640625, + "learning_rate": 9.918734126627914e-06, + "loss": 0.8939, + "step": 8843 + }, + { + "epoch": 1.5044473498081745, + "grad_norm": 1.7421875, + "learning_rate": 9.916928259421733e-06, + "loss": 0.8351, + "step": 8844 + }, + { + "epoch": 1.5046188138971646, + "grad_norm": 1.7109375, + "learning_rate": 9.915122394924834e-06, + "loss": 0.8679, + "step": 8845 + }, + { + "epoch": 1.5047902779861544, + "grad_norm": 1.7578125, + "learning_rate": 9.913316533196113e-06, + "loss": 0.8887, + "step": 8846 + }, + { + "epoch": 1.5049617420751442, + "grad_norm": 1.6875, + "learning_rate": 9.911510674294468e-06, + "loss": 0.8625, + "step": 8847 + }, + { + "epoch": 1.505133206164134, + "grad_norm": 1.65625, + 
"learning_rate": 9.909704818278791e-06, + "loss": 0.8188, + "step": 8848 + }, + { + "epoch": 1.5053046702531239, + "grad_norm": 1.703125, + "learning_rate": 9.907898965207982e-06, + "loss": 0.8792, + "step": 8849 + }, + { + "epoch": 1.5054761343421137, + "grad_norm": 1.7265625, + "learning_rate": 9.906093115140935e-06, + "loss": 0.8196, + "step": 8850 + }, + { + "epoch": 1.5056475984311035, + "grad_norm": 1.6640625, + "learning_rate": 9.904287268136544e-06, + "loss": 0.8907, + "step": 8851 + }, + { + "epoch": 1.5058190625200933, + "grad_norm": 1.640625, + "learning_rate": 9.902481424253704e-06, + "loss": 0.8197, + "step": 8852 + }, + { + "epoch": 1.5059905266090832, + "grad_norm": 1.6796875, + "learning_rate": 9.900675583551314e-06, + "loss": 0.9056, + "step": 8853 + }, + { + "epoch": 1.5061619906980732, + "grad_norm": 1.671875, + "learning_rate": 9.898869746088266e-06, + "loss": 0.8384, + "step": 8854 + }, + { + "epoch": 1.506333454787063, + "grad_norm": 1.6953125, + "learning_rate": 9.897063911923456e-06, + "loss": 0.9468, + "step": 8855 + }, + { + "epoch": 1.5065049188760529, + "grad_norm": 1.640625, + "learning_rate": 9.89525808111578e-06, + "loss": 0.828, + "step": 8856 + }, + { + "epoch": 1.506676382965043, + "grad_norm": 1.6171875, + "learning_rate": 9.893452253724133e-06, + "loss": 0.7832, + "step": 8857 + }, + { + "epoch": 1.5068478470540327, + "grad_norm": 1.640625, + "learning_rate": 9.891646429807409e-06, + "loss": 0.8636, + "step": 8858 + }, + { + "epoch": 1.5070193111430226, + "grad_norm": 1.71875, + "learning_rate": 9.889840609424502e-06, + "loss": 0.8303, + "step": 8859 + }, + { + "epoch": 1.5071907752320124, + "grad_norm": 1.703125, + "learning_rate": 9.888034792634308e-06, + "loss": 0.7927, + "step": 8860 + }, + { + "epoch": 1.5073622393210022, + "grad_norm": 1.7578125, + "learning_rate": 9.88622897949572e-06, + "loss": 0.8017, + "step": 8861 + }, + { + "epoch": 1.507533703409992, + "grad_norm": 1.796875, + "learning_rate": 9.884423170067639e-06, + "loss": 0.9169, + "step": 8862 + }, + { + "epoch": 1.5077051674989819, + "grad_norm": 1.6796875, + "learning_rate": 9.88261736440895e-06, + "loss": 0.8707, + "step": 8863 + }, + { + "epoch": 1.5078766315879717, + "grad_norm": 1.7265625, + "learning_rate": 9.880811562578549e-06, + "loss": 0.9191, + "step": 8864 + }, + { + "epoch": 1.5080480956769615, + "grad_norm": 1.734375, + "learning_rate": 9.879005764635334e-06, + "loss": 0.8502, + "step": 8865 + }, + { + "epoch": 1.5082195597659516, + "grad_norm": 1.6640625, + "learning_rate": 9.877199970638195e-06, + "loss": 0.8907, + "step": 8866 + }, + { + "epoch": 1.5083910238549414, + "grad_norm": 1.6640625, + "learning_rate": 9.875394180646028e-06, + "loss": 0.8445, + "step": 8867 + }, + { + "epoch": 1.5085624879439312, + "grad_norm": 1.7265625, + "learning_rate": 9.873588394717726e-06, + "loss": 0.87, + "step": 8868 + }, + { + "epoch": 1.5087339520329213, + "grad_norm": 1.703125, + "learning_rate": 9.87178261291218e-06, + "loss": 0.8434, + "step": 8869 + }, + { + "epoch": 1.508905416121911, + "grad_norm": 1.65625, + "learning_rate": 9.869976835288288e-06, + "loss": 0.7995, + "step": 8870 + }, + { + "epoch": 1.509076880210901, + "grad_norm": 1.640625, + "learning_rate": 9.868171061904941e-06, + "loss": 0.9007, + "step": 8871 + }, + { + "epoch": 1.5092483442998907, + "grad_norm": 1.828125, + "learning_rate": 9.866365292821032e-06, + "loss": 0.8768, + "step": 8872 + }, + { + "epoch": 1.5094198083888806, + "grad_norm": 1.7265625, + "learning_rate": 9.864559528095459e-06, + "loss": 
0.8668, + "step": 8873 + }, + { + "epoch": 1.5095912724778704, + "grad_norm": 1.640625, + "learning_rate": 9.862753767787106e-06, + "loss": 0.9018, + "step": 8874 + }, + { + "epoch": 1.5097627365668602, + "grad_norm": 1.6953125, + "learning_rate": 9.860948011954868e-06, + "loss": 0.8425, + "step": 8875 + }, + { + "epoch": 1.50993420065585, + "grad_norm": 1.6875, + "learning_rate": 9.859142260657639e-06, + "loss": 0.7678, + "step": 8876 + }, + { + "epoch": 1.5101056647448399, + "grad_norm": 1.6875, + "learning_rate": 9.85733651395431e-06, + "loss": 0.8245, + "step": 8877 + }, + { + "epoch": 1.51027712883383, + "grad_norm": 1.6875, + "learning_rate": 9.855530771903776e-06, + "loss": 0.8217, + "step": 8878 + }, + { + "epoch": 1.5104485929228197, + "grad_norm": 1.6953125, + "learning_rate": 9.853725034564928e-06, + "loss": 0.8625, + "step": 8879 + }, + { + "epoch": 1.5106200570118096, + "grad_norm": 1.75, + "learning_rate": 9.851919301996658e-06, + "loss": 0.8898, + "step": 8880 + }, + { + "epoch": 1.5107915211007996, + "grad_norm": 1.6875, + "learning_rate": 9.850113574257857e-06, + "loss": 0.7756, + "step": 8881 + }, + { + "epoch": 1.5109629851897894, + "grad_norm": 1.7578125, + "learning_rate": 9.848307851407414e-06, + "loss": 0.8532, + "step": 8882 + }, + { + "epoch": 1.5111344492787793, + "grad_norm": 1.6796875, + "learning_rate": 9.846502133504227e-06, + "loss": 0.8205, + "step": 8883 + }, + { + "epoch": 1.511305913367769, + "grad_norm": 1.6796875, + "learning_rate": 9.844696420607181e-06, + "loss": 0.8913, + "step": 8884 + }, + { + "epoch": 1.511477377456759, + "grad_norm": 1.65625, + "learning_rate": 9.842890712775172e-06, + "loss": 0.8602, + "step": 8885 + }, + { + "epoch": 1.5116488415457487, + "grad_norm": 1.5859375, + "learning_rate": 9.841085010067089e-06, + "loss": 0.7896, + "step": 8886 + }, + { + "epoch": 1.5118203056347386, + "grad_norm": 1.65625, + "learning_rate": 9.83927931254182e-06, + "loss": 0.8216, + "step": 8887 + }, + { + "epoch": 1.5119917697237284, + "grad_norm": 1.734375, + "learning_rate": 9.837473620258258e-06, + "loss": 0.8148, + "step": 8888 + }, + { + "epoch": 1.5121632338127182, + "grad_norm": 1.6484375, + "learning_rate": 9.835667933275292e-06, + "loss": 0.7759, + "step": 8889 + }, + { + "epoch": 1.512334697901708, + "grad_norm": 1.828125, + "learning_rate": 9.833862251651815e-06, + "loss": 0.8444, + "step": 8890 + }, + { + "epoch": 1.512506161990698, + "grad_norm": 1.625, + "learning_rate": 9.832056575446715e-06, + "loss": 0.8576, + "step": 8891 + }, + { + "epoch": 1.512677626079688, + "grad_norm": 1.5703125, + "learning_rate": 9.830250904718884e-06, + "loss": 0.8579, + "step": 8892 + }, + { + "epoch": 1.5128490901686777, + "grad_norm": 1.8046875, + "learning_rate": 9.82844523952721e-06, + "loss": 0.9643, + "step": 8893 + }, + { + "epoch": 1.5130205542576678, + "grad_norm": 1.5703125, + "learning_rate": 9.82663957993058e-06, + "loss": 0.8405, + "step": 8894 + }, + { + "epoch": 1.5131920183466576, + "grad_norm": 1.8515625, + "learning_rate": 9.824833925987888e-06, + "loss": 0.9236, + "step": 8895 + }, + { + "epoch": 1.5133634824356474, + "grad_norm": 1.6328125, + "learning_rate": 9.823028277758021e-06, + "loss": 0.8099, + "step": 8896 + }, + { + "epoch": 1.5135349465246373, + "grad_norm": 1.6953125, + "learning_rate": 9.821222635299871e-06, + "loss": 0.8192, + "step": 8897 + }, + { + "epoch": 1.513706410613627, + "grad_norm": 1.65625, + "learning_rate": 9.819416998672323e-06, + "loss": 0.8839, + "step": 8898 + }, + { + "epoch": 1.513877874702617, + 
"grad_norm": 1.6015625, + "learning_rate": 9.817611367934264e-06, + "loss": 0.7973, + "step": 8899 + }, + { + "epoch": 1.5140493387916067, + "grad_norm": 1.703125, + "learning_rate": 9.815805743144587e-06, + "loss": 0.7841, + "step": 8900 + }, + { + "epoch": 1.5142208028805966, + "grad_norm": 1.8203125, + "learning_rate": 9.814000124362178e-06, + "loss": 0.8855, + "step": 8901 + }, + { + "epoch": 1.5143922669695864, + "grad_norm": 1.671875, + "learning_rate": 9.812194511645924e-06, + "loss": 0.7829, + "step": 8902 + }, + { + "epoch": 1.5145637310585764, + "grad_norm": 1.796875, + "learning_rate": 9.810388905054715e-06, + "loss": 0.8709, + "step": 8903 + }, + { + "epoch": 1.5147351951475663, + "grad_norm": 1.7109375, + "learning_rate": 9.808583304647439e-06, + "loss": 0.8896, + "step": 8904 + }, + { + "epoch": 1.514906659236556, + "grad_norm": 1.671875, + "learning_rate": 9.80677771048298e-06, + "loss": 0.8687, + "step": 8905 + }, + { + "epoch": 1.5150781233255461, + "grad_norm": 1.6484375, + "learning_rate": 9.80497212262023e-06, + "loss": 0.7913, + "step": 8906 + }, + { + "epoch": 1.515249587414536, + "grad_norm": 1.765625, + "learning_rate": 9.803166541118072e-06, + "loss": 0.81, + "step": 8907 + }, + { + "epoch": 1.5154210515035258, + "grad_norm": 1.6796875, + "learning_rate": 9.801360966035396e-06, + "loss": 0.807, + "step": 8908 + }, + { + "epoch": 1.5155925155925156, + "grad_norm": 1.8125, + "learning_rate": 9.799555397431086e-06, + "loss": 0.8345, + "step": 8909 + }, + { + "epoch": 1.5157639796815054, + "grad_norm": 1.71875, + "learning_rate": 9.797749835364028e-06, + "loss": 0.8034, + "step": 8910 + }, + { + "epoch": 1.5159354437704953, + "grad_norm": 1.765625, + "learning_rate": 9.79594427989311e-06, + "loss": 0.7871, + "step": 8911 + }, + { + "epoch": 1.516106907859485, + "grad_norm": 1.6953125, + "learning_rate": 9.794138731077219e-06, + "loss": 0.9255, + "step": 8912 + }, + { + "epoch": 1.516278371948475, + "grad_norm": 1.734375, + "learning_rate": 9.792333188975237e-06, + "loss": 0.9397, + "step": 8913 + }, + { + "epoch": 1.5164498360374647, + "grad_norm": 1.71875, + "learning_rate": 9.790527653646053e-06, + "loss": 0.8087, + "step": 8914 + }, + { + "epoch": 1.5166213001264548, + "grad_norm": 1.703125, + "learning_rate": 9.78872212514855e-06, + "loss": 0.8213, + "step": 8915 + }, + { + "epoch": 1.5167927642154446, + "grad_norm": 1.703125, + "learning_rate": 9.786916603541614e-06, + "loss": 0.8722, + "step": 8916 + }, + { + "epoch": 1.5169642283044344, + "grad_norm": 1.7421875, + "learning_rate": 9.78511108888413e-06, + "loss": 0.885, + "step": 8917 + }, + { + "epoch": 1.5171356923934245, + "grad_norm": 1.6640625, + "learning_rate": 9.783305581234983e-06, + "loss": 0.8474, + "step": 8918 + }, + { + "epoch": 1.5173071564824143, + "grad_norm": 1.6796875, + "learning_rate": 9.781500080653057e-06, + "loss": 0.7978, + "step": 8919 + }, + { + "epoch": 1.5174786205714041, + "grad_norm": 1.6171875, + "learning_rate": 9.77969458719724e-06, + "loss": 0.7995, + "step": 8920 + }, + { + "epoch": 1.517650084660394, + "grad_norm": 1.6015625, + "learning_rate": 9.777889100926408e-06, + "loss": 0.8285, + "step": 8921 + }, + { + "epoch": 1.5178215487493838, + "grad_norm": 1.59375, + "learning_rate": 9.77608362189945e-06, + "loss": 0.8133, + "step": 8922 + }, + { + "epoch": 1.5179930128383736, + "grad_norm": 1.6484375, + "learning_rate": 9.774278150175249e-06, + "loss": 0.8127, + "step": 8923 + }, + { + "epoch": 1.5181644769273634, + "grad_norm": 1.640625, + "learning_rate": 
9.772472685812682e-06, + "loss": 0.8633, + "step": 8924 + }, + { + "epoch": 1.5183359410163533, + "grad_norm": 1.6796875, + "learning_rate": 9.770667228870643e-06, + "loss": 0.9239, + "step": 8925 + }, + { + "epoch": 1.518507405105343, + "grad_norm": 1.640625, + "learning_rate": 9.768861779408006e-06, + "loss": 0.942, + "step": 8926 + }, + { + "epoch": 1.5186788691943331, + "grad_norm": 1.671875, + "learning_rate": 9.767056337483659e-06, + "loss": 0.7934, + "step": 8927 + }, + { + "epoch": 1.518850333283323, + "grad_norm": 1.6953125, + "learning_rate": 9.765250903156482e-06, + "loss": 0.8728, + "step": 8928 + }, + { + "epoch": 1.5190217973723128, + "grad_norm": 1.625, + "learning_rate": 9.763445476485357e-06, + "loss": 0.8055, + "step": 8929 + }, + { + "epoch": 1.5191932614613028, + "grad_norm": 1.6875, + "learning_rate": 9.761640057529164e-06, + "loss": 0.8335, + "step": 8930 + }, + { + "epoch": 1.5193647255502927, + "grad_norm": 1.6875, + "learning_rate": 9.759834646346794e-06, + "loss": 0.9022, + "step": 8931 + }, + { + "epoch": 1.5195361896392825, + "grad_norm": 1.6875, + "learning_rate": 9.758029242997112e-06, + "loss": 0.8732, + "step": 8932 + }, + { + "epoch": 1.5197076537282723, + "grad_norm": 1.7890625, + "learning_rate": 9.75622384753901e-06, + "loss": 0.8624, + "step": 8933 + }, + { + "epoch": 1.5198791178172621, + "grad_norm": 1.734375, + "learning_rate": 9.754418460031365e-06, + "loss": 0.8726, + "step": 8934 + }, + { + "epoch": 1.520050581906252, + "grad_norm": 1.71875, + "learning_rate": 9.75261308053306e-06, + "loss": 0.8423, + "step": 8935 + }, + { + "epoch": 1.5202220459952418, + "grad_norm": 1.65625, + "learning_rate": 9.750807709102974e-06, + "loss": 0.8287, + "step": 8936 + }, + { + "epoch": 1.5203935100842316, + "grad_norm": 1.703125, + "learning_rate": 9.749002345799988e-06, + "loss": 0.8095, + "step": 8937 + }, + { + "epoch": 1.5205649741732215, + "grad_norm": 1.6796875, + "learning_rate": 9.747196990682979e-06, + "loss": 0.8468, + "step": 8938 + }, + { + "epoch": 1.5207364382622115, + "grad_norm": 1.71875, + "learning_rate": 9.74539164381083e-06, + "loss": 0.88, + "step": 8939 + }, + { + "epoch": 1.5209079023512013, + "grad_norm": 1.671875, + "learning_rate": 9.743586305242417e-06, + "loss": 0.8588, + "step": 8940 + }, + { + "epoch": 1.5210793664401911, + "grad_norm": 1.78125, + "learning_rate": 9.74178097503662e-06, + "loss": 0.8929, + "step": 8941 + }, + { + "epoch": 1.5212508305291812, + "grad_norm": 1.6640625, + "learning_rate": 9.739975653252317e-06, + "loss": 0.9824, + "step": 8942 + }, + { + "epoch": 1.521422294618171, + "grad_norm": 1.65625, + "learning_rate": 9.73817033994839e-06, + "loss": 0.8122, + "step": 8943 + }, + { + "epoch": 1.5215937587071608, + "grad_norm": 1.703125, + "learning_rate": 9.736365035183712e-06, + "loss": 0.824, + "step": 8944 + }, + { + "epoch": 1.5217652227961507, + "grad_norm": 1.703125, + "learning_rate": 9.734559739017162e-06, + "loss": 0.8004, + "step": 8945 + }, + { + "epoch": 1.5219366868851405, + "grad_norm": 1.671875, + "learning_rate": 9.732754451507621e-06, + "loss": 0.7771, + "step": 8946 + }, + { + "epoch": 1.5221081509741303, + "grad_norm": 1.7421875, + "learning_rate": 9.730949172713961e-06, + "loss": 0.9023, + "step": 8947 + }, + { + "epoch": 1.5222796150631202, + "grad_norm": 1.703125, + "learning_rate": 9.729143902695064e-06, + "loss": 0.8699, + "step": 8948 + }, + { + "epoch": 1.52245107915211, + "grad_norm": 1.7265625, + "learning_rate": 9.727338641509804e-06, + "loss": 0.9692, + "step": 8949 + }, + { + "epoch": 
1.5226225432410998, + "grad_norm": 2.0, + "learning_rate": 9.725533389217056e-06, + "loss": 0.8999, + "step": 8950 + }, + { + "epoch": 1.5227940073300898, + "grad_norm": 1.7109375, + "learning_rate": 9.723728145875699e-06, + "loss": 0.8602, + "step": 8951 + }, + { + "epoch": 1.5229654714190797, + "grad_norm": 1.8671875, + "learning_rate": 9.721922911544607e-06, + "loss": 0.888, + "step": 8952 + }, + { + "epoch": 1.5231369355080695, + "grad_norm": 1.7421875, + "learning_rate": 9.720117686282657e-06, + "loss": 0.8557, + "step": 8953 + }, + { + "epoch": 1.5233083995970595, + "grad_norm": 1.6015625, + "learning_rate": 9.718312470148723e-06, + "loss": 0.8532, + "step": 8954 + }, + { + "epoch": 1.5234798636860494, + "grad_norm": 1.7578125, + "learning_rate": 9.71650726320168e-06, + "loss": 0.9081, + "step": 8955 + }, + { + "epoch": 1.5236513277750392, + "grad_norm": 1.7109375, + "learning_rate": 9.714702065500401e-06, + "loss": 0.8824, + "step": 8956 + }, + { + "epoch": 1.523822791864029, + "grad_norm": 1.703125, + "learning_rate": 9.712896877103764e-06, + "loss": 0.878, + "step": 8957 + }, + { + "epoch": 1.5239942559530189, + "grad_norm": 1.8125, + "learning_rate": 9.71109169807064e-06, + "loss": 0.9402, + "step": 8958 + }, + { + "epoch": 1.5241657200420087, + "grad_norm": 1.6328125, + "learning_rate": 9.709286528459904e-06, + "loss": 0.7948, + "step": 8959 + }, + { + "epoch": 1.5243371841309985, + "grad_norm": 1.59375, + "learning_rate": 9.707481368330428e-06, + "loss": 0.6961, + "step": 8960 + }, + { + "epoch": 1.5245086482199883, + "grad_norm": 1.6796875, + "learning_rate": 9.705676217741087e-06, + "loss": 0.8052, + "step": 8961 + }, + { + "epoch": 1.5246801123089782, + "grad_norm": 1.6796875, + "learning_rate": 9.703871076750752e-06, + "loss": 0.8027, + "step": 8962 + }, + { + "epoch": 1.5248515763979682, + "grad_norm": 1.7421875, + "learning_rate": 9.702065945418295e-06, + "loss": 0.9244, + "step": 8963 + }, + { + "epoch": 1.525023040486958, + "grad_norm": 1.7265625, + "learning_rate": 9.700260823802592e-06, + "loss": 0.8807, + "step": 8964 + }, + { + "epoch": 1.5251945045759479, + "grad_norm": 1.671875, + "learning_rate": 9.698455711962511e-06, + "loss": 0.9256, + "step": 8965 + }, + { + "epoch": 1.525365968664938, + "grad_norm": 1.796875, + "learning_rate": 9.696650609956931e-06, + "loss": 0.8056, + "step": 8966 + }, + { + "epoch": 1.5255374327539277, + "grad_norm": 1.71875, + "learning_rate": 9.69484551784471e-06, + "loss": 0.807, + "step": 8967 + }, + { + "epoch": 1.5257088968429176, + "grad_norm": 1.6875, + "learning_rate": 9.693040435684727e-06, + "loss": 0.9571, + "step": 8968 + }, + { + "epoch": 1.5258803609319074, + "grad_norm": 1.671875, + "learning_rate": 9.691235363535852e-06, + "loss": 0.8819, + "step": 8969 + }, + { + "epoch": 1.5260518250208972, + "grad_norm": 1.703125, + "learning_rate": 9.689430301456954e-06, + "loss": 0.828, + "step": 8970 + }, + { + "epoch": 1.526223289109887, + "grad_norm": 1.5859375, + "learning_rate": 9.6876252495069e-06, + "loss": 0.8439, + "step": 8971 + }, + { + "epoch": 1.5263947531988769, + "grad_norm": 1.71875, + "learning_rate": 9.685820207744565e-06, + "loss": 0.8378, + "step": 8972 + }, + { + "epoch": 1.5265662172878667, + "grad_norm": 1.6640625, + "learning_rate": 9.68401517622882e-06, + "loss": 0.9048, + "step": 8973 + }, + { + "epoch": 1.5267376813768565, + "grad_norm": 1.734375, + "learning_rate": 9.682210155018526e-06, + "loss": 0.8459, + "step": 8974 + }, + { + "epoch": 1.5269091454658466, + "grad_norm": 1.71875, + "learning_rate": 
9.680405144172556e-06, + "loss": 0.79, + "step": 8975 + }, + { + "epoch": 1.5270806095548364, + "grad_norm": 1.6640625, + "learning_rate": 9.67860014374978e-06, + "loss": 0.8598, + "step": 8976 + }, + { + "epoch": 1.5272520736438262, + "grad_norm": 1.6640625, + "learning_rate": 9.676795153809065e-06, + "loss": 0.783, + "step": 8977 + }, + { + "epoch": 1.527423537732816, + "grad_norm": 1.71875, + "learning_rate": 9.674990174409275e-06, + "loss": 0.8452, + "step": 8978 + }, + { + "epoch": 1.527595001821806, + "grad_norm": 1.8359375, + "learning_rate": 9.673185205609277e-06, + "loss": 0.8368, + "step": 8979 + }, + { + "epoch": 1.527766465910796, + "grad_norm": 1.75, + "learning_rate": 9.67138024746794e-06, + "loss": 0.8541, + "step": 8980 + }, + { + "epoch": 1.5279379299997857, + "grad_norm": 1.6875, + "learning_rate": 9.669575300044133e-06, + "loss": 0.9475, + "step": 8981 + }, + { + "epoch": 1.5281093940887756, + "grad_norm": 1.7421875, + "learning_rate": 9.667770363396718e-06, + "loss": 0.8133, + "step": 8982 + }, + { + "epoch": 1.5282808581777654, + "grad_norm": 1.78125, + "learning_rate": 9.665965437584564e-06, + "loss": 0.8793, + "step": 8983 + }, + { + "epoch": 1.5284523222667552, + "grad_norm": 1.6796875, + "learning_rate": 9.664160522666534e-06, + "loss": 0.885, + "step": 8984 + }, + { + "epoch": 1.528623786355745, + "grad_norm": 1.6953125, + "learning_rate": 9.662355618701495e-06, + "loss": 0.8527, + "step": 8985 + }, + { + "epoch": 1.5287952504447349, + "grad_norm": 1.65625, + "learning_rate": 9.66055072574831e-06, + "loss": 0.8093, + "step": 8986 + }, + { + "epoch": 1.5289667145337247, + "grad_norm": 1.671875, + "learning_rate": 9.658745843865845e-06, + "loss": 0.8133, + "step": 8987 + }, + { + "epoch": 1.5291381786227147, + "grad_norm": 1.71875, + "learning_rate": 9.656940973112961e-06, + "loss": 0.8026, + "step": 8988 + }, + { + "epoch": 1.5293096427117046, + "grad_norm": 1.6875, + "learning_rate": 9.655136113548528e-06, + "loss": 0.7791, + "step": 8989 + }, + { + "epoch": 1.5294811068006944, + "grad_norm": 1.7734375, + "learning_rate": 9.6533312652314e-06, + "loss": 0.9006, + "step": 8990 + }, + { + "epoch": 1.5296525708896844, + "grad_norm": 1.6796875, + "learning_rate": 9.65152642822045e-06, + "loss": 0.9335, + "step": 8991 + }, + { + "epoch": 1.5298240349786743, + "grad_norm": 1.765625, + "learning_rate": 9.649721602574532e-06, + "loss": 0.8893, + "step": 8992 + }, + { + "epoch": 1.529995499067664, + "grad_norm": 1.6796875, + "learning_rate": 9.64791678835251e-06, + "loss": 0.8333, + "step": 8993 + }, + { + "epoch": 1.530166963156654, + "grad_norm": 1.6875, + "learning_rate": 9.64611198561325e-06, + "loss": 0.9223, + "step": 8994 + }, + { + "epoch": 1.5303384272456437, + "grad_norm": 1.640625, + "learning_rate": 9.64430719441561e-06, + "loss": 0.809, + "step": 8995 + }, + { + "epoch": 1.5305098913346336, + "grad_norm": 1.578125, + "learning_rate": 9.642502414818452e-06, + "loss": 0.805, + "step": 8996 + }, + { + "epoch": 1.5306813554236234, + "grad_norm": 1.84375, + "learning_rate": 9.640697646880636e-06, + "loss": 0.824, + "step": 8997 + }, + { + "epoch": 1.5308528195126132, + "grad_norm": 1.734375, + "learning_rate": 9.638892890661022e-06, + "loss": 0.8221, + "step": 8998 + }, + { + "epoch": 1.531024283601603, + "grad_norm": 1.59375, + "learning_rate": 9.637088146218471e-06, + "loss": 0.7839, + "step": 8999 + }, + { + "epoch": 1.531195747690593, + "grad_norm": 1.6484375, + "learning_rate": 9.635283413611846e-06, + "loss": 0.8316, + "step": 9000 + }, + { + "epoch": 
1.531367211779583, + "grad_norm": 1.734375, + "learning_rate": 9.633478692899998e-06, + "loss": 0.8849, + "step": 9001 + }, + { + "epoch": 1.5315386758685727, + "grad_norm": 1.7421875, + "learning_rate": 9.631673984141792e-06, + "loss": 0.9323, + "step": 9002 + }, + { + "epoch": 1.5317101399575628, + "grad_norm": 1.78125, + "learning_rate": 9.629869287396083e-06, + "loss": 0.8271, + "step": 9003 + }, + { + "epoch": 1.5318816040465526, + "grad_norm": 1.7734375, + "learning_rate": 9.62806460272173e-06, + "loss": 0.9224, + "step": 9004 + }, + { + "epoch": 1.5320530681355424, + "grad_norm": 1.609375, + "learning_rate": 9.626259930177592e-06, + "loss": 0.8317, + "step": 9005 + }, + { + "epoch": 1.5322245322245323, + "grad_norm": 1.7421875, + "learning_rate": 9.624455269822526e-06, + "loss": 0.8575, + "step": 9006 + }, + { + "epoch": 1.532395996313522, + "grad_norm": 1.7578125, + "learning_rate": 9.622650621715385e-06, + "loss": 0.8243, + "step": 9007 + }, + { + "epoch": 1.532567460402512, + "grad_norm": 1.734375, + "learning_rate": 9.62084598591503e-06, + "loss": 0.8792, + "step": 9008 + }, + { + "epoch": 1.5327389244915017, + "grad_norm": 1.71875, + "learning_rate": 9.619041362480314e-06, + "loss": 0.9154, + "step": 9009 + }, + { + "epoch": 1.5329103885804916, + "grad_norm": 1.9140625, + "learning_rate": 9.617236751470094e-06, + "loss": 0.9161, + "step": 9010 + }, + { + "epoch": 1.5330818526694814, + "grad_norm": 1.6328125, + "learning_rate": 9.615432152943225e-06, + "loss": 0.8368, + "step": 9011 + }, + { + "epoch": 1.5332533167584714, + "grad_norm": 1.65625, + "learning_rate": 9.613627566958567e-06, + "loss": 0.828, + "step": 9012 + }, + { + "epoch": 1.5334247808474613, + "grad_norm": 1.703125, + "learning_rate": 9.611822993574963e-06, + "loss": 0.8458, + "step": 9013 + }, + { + "epoch": 1.533596244936451, + "grad_norm": 1.6171875, + "learning_rate": 9.610018432851275e-06, + "loss": 0.7959, + "step": 9014 + }, + { + "epoch": 1.5337677090254411, + "grad_norm": 1.734375, + "learning_rate": 9.608213884846353e-06, + "loss": 0.9074, + "step": 9015 + }, + { + "epoch": 1.533939173114431, + "grad_norm": 1.65625, + "learning_rate": 9.60640934961905e-06, + "loss": 0.7942, + "step": 9016 + }, + { + "epoch": 1.5341106372034208, + "grad_norm": 1.71875, + "learning_rate": 9.604604827228225e-06, + "loss": 0.9097, + "step": 9017 + }, + { + "epoch": 1.5342821012924106, + "grad_norm": 1.6640625, + "learning_rate": 9.602800317732718e-06, + "loss": 0.886, + "step": 9018 + }, + { + "epoch": 1.5344535653814004, + "grad_norm": 1.625, + "learning_rate": 9.600995821191395e-06, + "loss": 0.7985, + "step": 9019 + }, + { + "epoch": 1.5346250294703903, + "grad_norm": 1.7578125, + "learning_rate": 9.599191337663098e-06, + "loss": 0.82, + "step": 9020 + }, + { + "epoch": 1.53479649355938, + "grad_norm": 1.7421875, + "learning_rate": 9.597386867206683e-06, + "loss": 0.9522, + "step": 9021 + }, + { + "epoch": 1.53496795764837, + "grad_norm": 1.8515625, + "learning_rate": 9.595582409880996e-06, + "loss": 0.9203, + "step": 9022 + }, + { + "epoch": 1.5351394217373597, + "grad_norm": 1.7421875, + "learning_rate": 9.593777965744894e-06, + "loss": 0.8255, + "step": 9023 + }, + { + "epoch": 1.5353108858263498, + "grad_norm": 1.78125, + "learning_rate": 9.59197353485722e-06, + "loss": 0.8641, + "step": 9024 + }, + { + "epoch": 1.5354823499153396, + "grad_norm": 1.765625, + "learning_rate": 9.590169117276825e-06, + "loss": 0.8695, + "step": 9025 + }, + { + "epoch": 1.5356538140043294, + "grad_norm": 1.7421875, + "learning_rate": 
9.588364713062555e-06, + "loss": 0.8724, + "step": 9026 + }, + { + "epoch": 1.5358252780933195, + "grad_norm": 1.828125, + "learning_rate": 9.586560322273264e-06, + "loss": 0.8957, + "step": 9027 + }, + { + "epoch": 1.5359967421823093, + "grad_norm": 1.640625, + "learning_rate": 9.584755944967798e-06, + "loss": 0.819, + "step": 9028 + }, + { + "epoch": 1.5361682062712991, + "grad_norm": 1.703125, + "learning_rate": 9.582951581205005e-06, + "loss": 0.8256, + "step": 9029 + }, + { + "epoch": 1.536339670360289, + "grad_norm": 1.8515625, + "learning_rate": 9.581147231043731e-06, + "loss": 0.8678, + "step": 9030 + }, + { + "epoch": 1.5365111344492788, + "grad_norm": 1.6953125, + "learning_rate": 9.579342894542822e-06, + "loss": 0.9207, + "step": 9031 + }, + { + "epoch": 1.5366825985382686, + "grad_norm": 1.7890625, + "learning_rate": 9.577538571761126e-06, + "loss": 0.7908, + "step": 9032 + }, + { + "epoch": 1.5368540626272584, + "grad_norm": 1.640625, + "learning_rate": 9.575734262757488e-06, + "loss": 0.8578, + "step": 9033 + }, + { + "epoch": 1.5370255267162483, + "grad_norm": 1.671875, + "learning_rate": 9.573929967590755e-06, + "loss": 0.8436, + "step": 9034 + }, + { + "epoch": 1.537196990805238, + "grad_norm": 1.734375, + "learning_rate": 9.57212568631977e-06, + "loss": 0.8757, + "step": 9035 + }, + { + "epoch": 1.5373684548942281, + "grad_norm": 1.671875, + "learning_rate": 9.570321419003375e-06, + "loss": 0.8262, + "step": 9036 + }, + { + "epoch": 1.537539918983218, + "grad_norm": 1.7265625, + "learning_rate": 9.568517165700419e-06, + "loss": 0.8632, + "step": 9037 + }, + { + "epoch": 1.5377113830722078, + "grad_norm": 1.65625, + "learning_rate": 9.56671292646974e-06, + "loss": 0.8484, + "step": 9038 + }, + { + "epoch": 1.5378828471611978, + "grad_norm": 1.765625, + "learning_rate": 9.564908701370186e-06, + "loss": 0.9305, + "step": 9039 + }, + { + "epoch": 1.5380543112501877, + "grad_norm": 1.640625, + "learning_rate": 9.563104490460599e-06, + "loss": 0.8929, + "step": 9040 + }, + { + "epoch": 1.5382257753391775, + "grad_norm": 1.6484375, + "learning_rate": 9.561300293799818e-06, + "loss": 0.8637, + "step": 9041 + }, + { + "epoch": 1.5383972394281673, + "grad_norm": 1.671875, + "learning_rate": 9.559496111446686e-06, + "loss": 0.7468, + "step": 9042 + }, + { + "epoch": 1.5385687035171571, + "grad_norm": 1.7265625, + "learning_rate": 9.557691943460046e-06, + "loss": 0.8259, + "step": 9043 + }, + { + "epoch": 1.538740167606147, + "grad_norm": 1.640625, + "learning_rate": 9.555887789898737e-06, + "loss": 0.8728, + "step": 9044 + }, + { + "epoch": 1.5389116316951368, + "grad_norm": 1.8125, + "learning_rate": 9.554083650821598e-06, + "loss": 0.858, + "step": 9045 + }, + { + "epoch": 1.5390830957841266, + "grad_norm": 1.7421875, + "learning_rate": 9.552279526287472e-06, + "loss": 0.8959, + "step": 9046 + }, + { + "epoch": 1.5392545598731164, + "grad_norm": 1.6796875, + "learning_rate": 9.550475416355199e-06, + "loss": 0.8063, + "step": 9047 + }, + { + "epoch": 1.5394260239621065, + "grad_norm": 1.65625, + "learning_rate": 9.548671321083612e-06, + "loss": 0.7583, + "step": 9048 + }, + { + "epoch": 1.5395974880510963, + "grad_norm": 1.65625, + "learning_rate": 9.546867240531553e-06, + "loss": 0.91, + "step": 9049 + }, + { + "epoch": 1.5397689521400861, + "grad_norm": 1.6953125, + "learning_rate": 9.54506317475786e-06, + "loss": 0.8993, + "step": 9050 + }, + { + "epoch": 1.5399404162290762, + "grad_norm": 1.828125, + "learning_rate": 9.54325912382137e-06, + "loss": 0.8975, + "step": 9051 + }, + 
{ + "epoch": 1.540111880318066, + "grad_norm": 1.7109375, + "learning_rate": 9.54145508778092e-06, + "loss": 0.881, + "step": 9052 + }, + { + "epoch": 1.5402833444070558, + "grad_norm": 1.609375, + "learning_rate": 9.539651066695346e-06, + "loss": 0.8008, + "step": 9053 + }, + { + "epoch": 1.5404548084960457, + "grad_norm": 1.6796875, + "learning_rate": 9.537847060623484e-06, + "loss": 0.845, + "step": 9054 + }, + { + "epoch": 1.5406262725850355, + "grad_norm": 1.7109375, + "learning_rate": 9.53604306962417e-06, + "loss": 0.8601, + "step": 9055 + }, + { + "epoch": 1.5407977366740253, + "grad_norm": 1.7421875, + "learning_rate": 9.53423909375624e-06, + "loss": 0.8381, + "step": 9056 + }, + { + "epoch": 1.5409692007630151, + "grad_norm": 1.75, + "learning_rate": 9.532435133078523e-06, + "loss": 0.9294, + "step": 9057 + }, + { + "epoch": 1.541140664852005, + "grad_norm": 1.6796875, + "learning_rate": 9.530631187649864e-06, + "loss": 0.7982, + "step": 9058 + }, + { + "epoch": 1.5413121289409948, + "grad_norm": 1.7265625, + "learning_rate": 9.528827257529085e-06, + "loss": 0.8474, + "step": 9059 + }, + { + "epoch": 1.5414835930299848, + "grad_norm": 1.703125, + "learning_rate": 9.527023342775023e-06, + "loss": 0.8985, + "step": 9060 + }, + { + "epoch": 1.5416550571189747, + "grad_norm": 1.7109375, + "learning_rate": 9.525219443446512e-06, + "loss": 0.909, + "step": 9061 + }, + { + "epoch": 1.5418265212079645, + "grad_norm": 1.625, + "learning_rate": 9.523415559602383e-06, + "loss": 0.8313, + "step": 9062 + }, + { + "epoch": 1.5419979852969545, + "grad_norm": 1.6328125, + "learning_rate": 9.521611691301465e-06, + "loss": 0.8706, + "step": 9063 + }, + { + "epoch": 1.5421694493859444, + "grad_norm": 1.7109375, + "learning_rate": 9.519807838602593e-06, + "loss": 0.8734, + "step": 9064 + }, + { + "epoch": 1.5423409134749342, + "grad_norm": 1.7265625, + "learning_rate": 9.518004001564592e-06, + "loss": 0.9018, + "step": 9065 + }, + { + "epoch": 1.542512377563924, + "grad_norm": 1.7421875, + "learning_rate": 9.5162001802463e-06, + "loss": 0.8821, + "step": 9066 + }, + { + "epoch": 1.5426838416529138, + "grad_norm": 1.65625, + "learning_rate": 9.514396374706541e-06, + "loss": 0.8449, + "step": 9067 + }, + { + "epoch": 1.5428553057419037, + "grad_norm": 1.6875, + "learning_rate": 9.512592585004144e-06, + "loss": 0.9382, + "step": 9068 + }, + { + "epoch": 1.5430267698308935, + "grad_norm": 1.609375, + "learning_rate": 9.510788811197939e-06, + "loss": 0.7869, + "step": 9069 + }, + { + "epoch": 1.5431982339198833, + "grad_norm": 1.6328125, + "learning_rate": 9.508985053346757e-06, + "loss": 0.8338, + "step": 9070 + }, + { + "epoch": 1.5433696980088731, + "grad_norm": 1.6953125, + "learning_rate": 9.507181311509416e-06, + "loss": 0.7595, + "step": 9071 + }, + { + "epoch": 1.5435411620978632, + "grad_norm": 1.609375, + "learning_rate": 9.505377585744751e-06, + "loss": 0.9012, + "step": 9072 + }, + { + "epoch": 1.543712626186853, + "grad_norm": 1.6484375, + "learning_rate": 9.503573876111581e-06, + "loss": 0.8346, + "step": 9073 + }, + { + "epoch": 1.5438840902758428, + "grad_norm": 1.6796875, + "learning_rate": 9.501770182668739e-06, + "loss": 0.8816, + "step": 9074 + }, + { + "epoch": 1.5440555543648327, + "grad_norm": 1.546875, + "learning_rate": 9.499966505475049e-06, + "loss": 0.7961, + "step": 9075 + }, + { + "epoch": 1.5442270184538227, + "grad_norm": 1.78125, + "learning_rate": 9.498162844589333e-06, + "loss": 0.8428, + "step": 9076 + }, + { + "epoch": 1.5443984825428125, + "grad_norm": 1.7421875, + 
"learning_rate": 9.496359200070416e-06, + "loss": 0.9426, + "step": 9077 + }, + { + "epoch": 1.5445699466318024, + "grad_norm": 1.6953125, + "learning_rate": 9.494555571977122e-06, + "loss": 0.823, + "step": 9078 + }, + { + "epoch": 1.5447414107207922, + "grad_norm": 1.703125, + "learning_rate": 9.492751960368274e-06, + "loss": 0.8447, + "step": 9079 + }, + { + "epoch": 1.544912874809782, + "grad_norm": 1.7265625, + "learning_rate": 9.490948365302695e-06, + "loss": 0.9213, + "step": 9080 + }, + { + "epoch": 1.5450843388987718, + "grad_norm": 1.6015625, + "learning_rate": 9.489144786839209e-06, + "loss": 0.7861, + "step": 9081 + }, + { + "epoch": 1.5452558029877617, + "grad_norm": 1.7109375, + "learning_rate": 9.487341225036634e-06, + "loss": 0.8669, + "step": 9082 + }, + { + "epoch": 1.5454272670767515, + "grad_norm": 1.6875, + "learning_rate": 9.485537679953791e-06, + "loss": 0.8379, + "step": 9083 + }, + { + "epoch": 1.5455987311657413, + "grad_norm": 1.71875, + "learning_rate": 9.483734151649503e-06, + "loss": 0.9128, + "step": 9084 + }, + { + "epoch": 1.5457701952547314, + "grad_norm": 1.7265625, + "learning_rate": 9.481930640182586e-06, + "loss": 0.8098, + "step": 9085 + }, + { + "epoch": 1.5459416593437212, + "grad_norm": 1.6015625, + "learning_rate": 9.480127145611863e-06, + "loss": 0.8269, + "step": 9086 + }, + { + "epoch": 1.546113123432711, + "grad_norm": 1.75, + "learning_rate": 9.47832366799615e-06, + "loss": 0.9293, + "step": 9087 + }, + { + "epoch": 1.546284587521701, + "grad_norm": 1.734375, + "learning_rate": 9.476520207394268e-06, + "loss": 0.8146, + "step": 9088 + }, + { + "epoch": 1.5464560516106909, + "grad_norm": 1.6953125, + "learning_rate": 9.474716763865033e-06, + "loss": 0.8419, + "step": 9089 + }, + { + "epoch": 1.5466275156996807, + "grad_norm": 1.71875, + "learning_rate": 9.472913337467263e-06, + "loss": 0.7645, + "step": 9090 + }, + { + "epoch": 1.5467989797886705, + "grad_norm": 1.7578125, + "learning_rate": 9.471109928259774e-06, + "loss": 0.9052, + "step": 9091 + }, + { + "epoch": 1.5469704438776604, + "grad_norm": 1.7578125, + "learning_rate": 9.469306536301381e-06, + "loss": 0.9394, + "step": 9092 + }, + { + "epoch": 1.5471419079666502, + "grad_norm": 1.6953125, + "learning_rate": 9.467503161650902e-06, + "loss": 0.8726, + "step": 9093 + }, + { + "epoch": 1.54731337205564, + "grad_norm": 1.703125, + "learning_rate": 9.46569980436715e-06, + "loss": 0.833, + "step": 9094 + }, + { + "epoch": 1.5474848361446298, + "grad_norm": 1.59375, + "learning_rate": 9.463896464508938e-06, + "loss": 0.7878, + "step": 9095 + }, + { + "epoch": 1.5476563002336197, + "grad_norm": 1.625, + "learning_rate": 9.462093142135081e-06, + "loss": 0.7952, + "step": 9096 + }, + { + "epoch": 1.5478277643226097, + "grad_norm": 1.65625, + "learning_rate": 9.460289837304392e-06, + "loss": 0.8398, + "step": 9097 + }, + { + "epoch": 1.5479992284115995, + "grad_norm": 1.5859375, + "learning_rate": 9.458486550075684e-06, + "loss": 0.725, + "step": 9098 + }, + { + "epoch": 1.5481706925005894, + "grad_norm": 1.59375, + "learning_rate": 9.456683280507768e-06, + "loss": 0.8511, + "step": 9099 + }, + { + "epoch": 1.5483421565895794, + "grad_norm": 1.5859375, + "learning_rate": 9.454880028659458e-06, + "loss": 0.736, + "step": 9100 + }, + { + "epoch": 1.5483421565895794, + "eval_loss": 0.8392665386199951, + "eval_runtime": 835.9126, + "eval_samples_per_second": 2.99, + "eval_steps_per_second": 2.99, + "step": 9100 + }, + { + "epoch": 1.5485136206785692, + "grad_norm": 1.71875, + "learning_rate": 
9.453076794589562e-06, + "loss": 0.8413, + "step": 9101 + }, + { + "epoch": 1.548685084767559, + "grad_norm": 1.734375, + "learning_rate": 9.451273578356892e-06, + "loss": 0.8524, + "step": 9102 + }, + { + "epoch": 1.5488565488565489, + "grad_norm": 1.6171875, + "learning_rate": 9.449470380020256e-06, + "loss": 0.7827, + "step": 9103 + }, + { + "epoch": 1.5490280129455387, + "grad_norm": 1.875, + "learning_rate": 9.447667199638467e-06, + "loss": 0.9264, + "step": 9104 + }, + { + "epoch": 1.5491994770345285, + "grad_norm": 1.71875, + "learning_rate": 9.445864037270328e-06, + "loss": 0.971, + "step": 9105 + }, + { + "epoch": 1.5493709411235184, + "grad_norm": 1.734375, + "learning_rate": 9.444060892974648e-06, + "loss": 0.864, + "step": 9106 + }, + { + "epoch": 1.5495424052125082, + "grad_norm": 1.8203125, + "learning_rate": 9.442257766810237e-06, + "loss": 0.8644, + "step": 9107 + }, + { + "epoch": 1.549713869301498, + "grad_norm": 1.59375, + "learning_rate": 9.4404546588359e-06, + "loss": 0.8537, + "step": 9108 + }, + { + "epoch": 1.549885333390488, + "grad_norm": 1.7578125, + "learning_rate": 9.438651569110444e-06, + "loss": 0.8993, + "step": 9109 + }, + { + "epoch": 1.5500567974794779, + "grad_norm": 1.6796875, + "learning_rate": 9.436848497692674e-06, + "loss": 0.8384, + "step": 9110 + }, + { + "epoch": 1.5502282615684677, + "grad_norm": 1.640625, + "learning_rate": 9.435045444641396e-06, + "loss": 0.8226, + "step": 9111 + }, + { + "epoch": 1.5503997256574578, + "grad_norm": 1.75, + "learning_rate": 9.43324241001541e-06, + "loss": 0.8532, + "step": 9112 + }, + { + "epoch": 1.5505711897464476, + "grad_norm": 1.6875, + "learning_rate": 9.431439393873527e-06, + "loss": 0.8572, + "step": 9113 + }, + { + "epoch": 1.5507426538354374, + "grad_norm": 1.65625, + "learning_rate": 9.429636396274544e-06, + "loss": 0.8009, + "step": 9114 + }, + { + "epoch": 1.5509141179244272, + "grad_norm": 1.7578125, + "learning_rate": 9.427833417277268e-06, + "loss": 0.8088, + "step": 9115 + }, + { + "epoch": 1.551085582013417, + "grad_norm": 1.65625, + "learning_rate": 9.426030456940502e-06, + "loss": 0.8109, + "step": 9116 + }, + { + "epoch": 1.551257046102407, + "grad_norm": 1.7578125, + "learning_rate": 9.42422751532304e-06, + "loss": 0.8577, + "step": 9117 + }, + { + "epoch": 1.5514285101913967, + "grad_norm": 1.7421875, + "learning_rate": 9.422424592483687e-06, + "loss": 0.9251, + "step": 9118 + }, + { + "epoch": 1.5515999742803865, + "grad_norm": 1.6953125, + "learning_rate": 9.420621688481244e-06, + "loss": 0.8698, + "step": 9119 + }, + { + "epoch": 1.5517714383693764, + "grad_norm": 1.65625, + "learning_rate": 9.418818803374506e-06, + "loss": 0.8414, + "step": 9120 + }, + { + "epoch": 1.5519429024583664, + "grad_norm": 1.71875, + "learning_rate": 9.417015937222279e-06, + "loss": 0.8404, + "step": 9121 + }, + { + "epoch": 1.5521143665473562, + "grad_norm": 1.6484375, + "learning_rate": 9.415213090083355e-06, + "loss": 0.8677, + "step": 9122 + }, + { + "epoch": 1.552285830636346, + "grad_norm": 1.7421875, + "learning_rate": 9.413410262016536e-06, + "loss": 0.8818, + "step": 9123 + }, + { + "epoch": 1.5524572947253361, + "grad_norm": 1.6640625, + "learning_rate": 9.411607453080616e-06, + "loss": 0.8228, + "step": 9124 + }, + { + "epoch": 1.552628758814326, + "grad_norm": 1.6328125, + "learning_rate": 9.409804663334395e-06, + "loss": 0.8459, + "step": 9125 + }, + { + "epoch": 1.5528002229033158, + "grad_norm": 1.703125, + "learning_rate": 9.408001892836665e-06, + "loss": 0.8643, + "step": 9126 + }, + { + 
"epoch": 1.5529716869923056, + "grad_norm": 1.671875, + "learning_rate": 9.406199141646227e-06, + "loss": 0.8558, + "step": 9127 + }, + { + "epoch": 1.5531431510812954, + "grad_norm": 1.6875, + "learning_rate": 9.404396409821863e-06, + "loss": 0.8688, + "step": 9128 + }, + { + "epoch": 1.5533146151702852, + "grad_norm": 1.6875, + "learning_rate": 9.40259369742238e-06, + "loss": 0.8741, + "step": 9129 + }, + { + "epoch": 1.553486079259275, + "grad_norm": 1.65625, + "learning_rate": 9.400791004506564e-06, + "loss": 0.826, + "step": 9130 + }, + { + "epoch": 1.553657543348265, + "grad_norm": 1.7109375, + "learning_rate": 9.39898833113321e-06, + "loss": 0.8623, + "step": 9131 + }, + { + "epoch": 1.5538290074372547, + "grad_norm": 1.7734375, + "learning_rate": 9.397185677361112e-06, + "loss": 0.925, + "step": 9132 + }, + { + "epoch": 1.5540004715262448, + "grad_norm": 30.375, + "learning_rate": 9.395383043249057e-06, + "loss": 1.2323, + "step": 9133 + }, + { + "epoch": 1.5541719356152346, + "grad_norm": 1.65625, + "learning_rate": 9.393580428855837e-06, + "loss": 0.8077, + "step": 9134 + }, + { + "epoch": 1.5543433997042244, + "grad_norm": 1.7890625, + "learning_rate": 9.391777834240245e-06, + "loss": 0.9564, + "step": 9135 + }, + { + "epoch": 1.5545148637932145, + "grad_norm": 1.734375, + "learning_rate": 9.389975259461065e-06, + "loss": 0.852, + "step": 9136 + }, + { + "epoch": 1.5546863278822043, + "grad_norm": 1.7109375, + "learning_rate": 9.388172704577092e-06, + "loss": 0.9216, + "step": 9137 + }, + { + "epoch": 1.5548577919711941, + "grad_norm": 1.75, + "learning_rate": 9.38637016964711e-06, + "loss": 0.895, + "step": 9138 + }, + { + "epoch": 1.555029256060184, + "grad_norm": 1.6640625, + "learning_rate": 9.38456765472991e-06, + "loss": 0.8907, + "step": 9139 + }, + { + "epoch": 1.5552007201491738, + "grad_norm": 1.6953125, + "learning_rate": 9.382765159884272e-06, + "loss": 0.8439, + "step": 9140 + }, + { + "epoch": 1.5553721842381636, + "grad_norm": 1.671875, + "learning_rate": 9.38096268516899e-06, + "loss": 0.8292, + "step": 9141 + }, + { + "epoch": 1.5555436483271534, + "grad_norm": 1.71875, + "learning_rate": 9.379160230642844e-06, + "loss": 0.8789, + "step": 9142 + }, + { + "epoch": 1.5557151124161432, + "grad_norm": 1.7265625, + "learning_rate": 9.37735779636462e-06, + "loss": 0.9112, + "step": 9143 + }, + { + "epoch": 1.555886576505133, + "grad_norm": 1.78125, + "learning_rate": 9.375555382393103e-06, + "loss": 0.991, + "step": 9144 + }, + { + "epoch": 1.5560580405941231, + "grad_norm": 1.7421875, + "learning_rate": 9.373752988787077e-06, + "loss": 0.856, + "step": 9145 + }, + { + "epoch": 1.556229504683113, + "grad_norm": 1.6015625, + "learning_rate": 9.371950615605326e-06, + "loss": 0.8673, + "step": 9146 + }, + { + "epoch": 1.5564009687721028, + "grad_norm": 1.703125, + "learning_rate": 9.370148262906628e-06, + "loss": 0.8611, + "step": 9147 + }, + { + "epoch": 1.5565724328610928, + "grad_norm": 1.65625, + "learning_rate": 9.368345930749767e-06, + "loss": 0.8286, + "step": 9148 + }, + { + "epoch": 1.5567438969500826, + "grad_norm": 1.6484375, + "learning_rate": 9.366543619193526e-06, + "loss": 0.8725, + "step": 9149 + }, + { + "epoch": 1.5569153610390725, + "grad_norm": 1.6875, + "learning_rate": 9.364741328296683e-06, + "loss": 0.8377, + "step": 9150 + }, + { + "epoch": 1.5570868251280623, + "grad_norm": 1.7734375, + "learning_rate": 9.362939058118015e-06, + "loss": 0.9226, + "step": 9151 + }, + { + "epoch": 1.5572582892170521, + "grad_norm": 1.671875, + "learning_rate": 
9.361136808716305e-06, + "loss": 0.9356, + "step": 9152 + }, + { + "epoch": 1.557429753306042, + "grad_norm": 1.6171875, + "learning_rate": 9.359334580150328e-06, + "loss": 0.9033, + "step": 9153 + }, + { + "epoch": 1.5576012173950318, + "grad_norm": 1.6640625, + "learning_rate": 9.357532372478863e-06, + "loss": 0.8383, + "step": 9154 + }, + { + "epoch": 1.5577726814840216, + "grad_norm": 1.6875, + "learning_rate": 9.355730185760688e-06, + "loss": 0.9208, + "step": 9155 + }, + { + "epoch": 1.5579441455730114, + "grad_norm": 1.671875, + "learning_rate": 9.353928020054576e-06, + "loss": 0.8347, + "step": 9156 + }, + { + "epoch": 1.5581156096620015, + "grad_norm": 1.71875, + "learning_rate": 9.352125875419306e-06, + "loss": 0.8323, + "step": 9157 + }, + { + "epoch": 1.5582870737509913, + "grad_norm": 1.75, + "learning_rate": 9.35032375191365e-06, + "loss": 0.864, + "step": 9158 + }, + { + "epoch": 1.5584585378399811, + "grad_norm": 1.7109375, + "learning_rate": 9.34852164959638e-06, + "loss": 0.9107, + "step": 9159 + }, + { + "epoch": 1.5586300019289712, + "grad_norm": 1.8125, + "learning_rate": 9.346719568526275e-06, + "loss": 0.8692, + "step": 9160 + }, + { + "epoch": 1.558801466017961, + "grad_norm": 1.703125, + "learning_rate": 9.344917508762104e-06, + "loss": 0.9507, + "step": 9161 + }, + { + "epoch": 1.5589729301069508, + "grad_norm": 1.75, + "learning_rate": 9.343115470362645e-06, + "loss": 0.8296, + "step": 9162 + }, + { + "epoch": 1.5591443941959406, + "grad_norm": 1.765625, + "learning_rate": 9.34131345338666e-06, + "loss": 0.8738, + "step": 9163 + }, + { + "epoch": 1.5593158582849305, + "grad_norm": 1.6171875, + "learning_rate": 9.339511457892924e-06, + "loss": 0.8605, + "step": 9164 + }, + { + "epoch": 1.5594873223739203, + "grad_norm": 1.640625, + "learning_rate": 9.337709483940208e-06, + "loss": 0.8543, + "step": 9165 + }, + { + "epoch": 1.5596587864629101, + "grad_norm": 1.75, + "learning_rate": 9.335907531587275e-06, + "loss": 0.791, + "step": 9166 + }, + { + "epoch": 1.5598302505519, + "grad_norm": 1.7265625, + "learning_rate": 9.334105600892899e-06, + "loss": 0.8977, + "step": 9167 + }, + { + "epoch": 1.5600017146408898, + "grad_norm": 1.578125, + "learning_rate": 9.33230369191585e-06, + "loss": 0.8126, + "step": 9168 + }, + { + "epoch": 1.5601731787298796, + "grad_norm": 1.625, + "learning_rate": 9.33050180471489e-06, + "loss": 0.8863, + "step": 9169 + }, + { + "epoch": 1.5603446428188696, + "grad_norm": 1.6875, + "learning_rate": 9.328699939348788e-06, + "loss": 0.8737, + "step": 9170 + }, + { + "epoch": 1.5605161069078595, + "grad_norm": 1.65625, + "learning_rate": 9.326898095876308e-06, + "loss": 0.8067, + "step": 9171 + }, + { + "epoch": 1.5606875709968493, + "grad_norm": 1.6015625, + "learning_rate": 9.325096274356219e-06, + "loss": 0.8524, + "step": 9172 + }, + { + "epoch": 1.5608590350858393, + "grad_norm": 1.7109375, + "learning_rate": 9.323294474847281e-06, + "loss": 0.8329, + "step": 9173 + }, + { + "epoch": 1.5610304991748292, + "grad_norm": 1.625, + "learning_rate": 9.321492697408259e-06, + "loss": 0.7939, + "step": 9174 + }, + { + "epoch": 1.561201963263819, + "grad_norm": 1.7890625, + "learning_rate": 9.31969094209791e-06, + "loss": 0.8204, + "step": 9175 + }, + { + "epoch": 1.5613734273528088, + "grad_norm": 1.65625, + "learning_rate": 9.317889208975004e-06, + "loss": 0.8583, + "step": 9176 + }, + { + "epoch": 1.5615448914417986, + "grad_norm": 1.59375, + "learning_rate": 9.3160874980983e-06, + "loss": 0.8618, + "step": 9177 + }, + { + "epoch": 
1.5617163555307885, + "grad_norm": 1.6171875, + "learning_rate": 9.314285809526558e-06, + "loss": 0.812, + "step": 9178 + }, + { + "epoch": 1.5618878196197783, + "grad_norm": 1.5859375, + "learning_rate": 9.312484143318538e-06, + "loss": 0.8151, + "step": 9179 + }, + { + "epoch": 1.5620592837087681, + "grad_norm": 2.03125, + "learning_rate": 9.310682499532998e-06, + "loss": 0.9445, + "step": 9180 + }, + { + "epoch": 1.562230747797758, + "grad_norm": 1.8046875, + "learning_rate": 9.308880878228698e-06, + "loss": 0.9197, + "step": 9181 + }, + { + "epoch": 1.562402211886748, + "grad_norm": 1.6796875, + "learning_rate": 9.307079279464396e-06, + "loss": 0.8719, + "step": 9182 + }, + { + "epoch": 1.5625736759757378, + "grad_norm": 1.7734375, + "learning_rate": 9.305277703298847e-06, + "loss": 0.8668, + "step": 9183 + }, + { + "epoch": 1.5627451400647276, + "grad_norm": 1.671875, + "learning_rate": 9.303476149790808e-06, + "loss": 0.797, + "step": 9184 + }, + { + "epoch": 1.5629166041537177, + "grad_norm": 1.7109375, + "learning_rate": 9.301674618999038e-06, + "loss": 0.8912, + "step": 9185 + }, + { + "epoch": 1.5630880682427075, + "grad_norm": 1.65625, + "learning_rate": 9.299873110982285e-06, + "loss": 0.8996, + "step": 9186 + }, + { + "epoch": 1.5632595323316973, + "grad_norm": 1.7265625, + "learning_rate": 9.298071625799306e-06, + "loss": 0.8605, + "step": 9187 + }, + { + "epoch": 1.5634309964206872, + "grad_norm": 1.6015625, + "learning_rate": 9.296270163508856e-06, + "loss": 0.8674, + "step": 9188 + }, + { + "epoch": 1.563602460509677, + "grad_norm": 1.7265625, + "learning_rate": 9.294468724169685e-06, + "loss": 0.886, + "step": 9189 + }, + { + "epoch": 1.5637739245986668, + "grad_norm": 1.703125, + "learning_rate": 9.292667307840546e-06, + "loss": 0.8608, + "step": 9190 + }, + { + "epoch": 1.5639453886876566, + "grad_norm": 1.703125, + "learning_rate": 9.29086591458019e-06, + "loss": 0.8972, + "step": 9191 + }, + { + "epoch": 1.5641168527766465, + "grad_norm": 1.65625, + "learning_rate": 9.289064544447366e-06, + "loss": 0.8055, + "step": 9192 + }, + { + "epoch": 1.5642883168656363, + "grad_norm": 1.71875, + "learning_rate": 9.287263197500825e-06, + "loss": 0.9147, + "step": 9193 + }, + { + "epoch": 1.5644597809546263, + "grad_norm": 1.6875, + "learning_rate": 9.285461873799315e-06, + "loss": 0.8236, + "step": 9194 + }, + { + "epoch": 1.5646312450436162, + "grad_norm": 1.6328125, + "learning_rate": 9.283660573401582e-06, + "loss": 0.827, + "step": 9195 + }, + { + "epoch": 1.564802709132606, + "grad_norm": 1.6484375, + "learning_rate": 9.28185929636638e-06, + "loss": 0.8257, + "step": 9196 + }, + { + "epoch": 1.564974173221596, + "grad_norm": 1.7421875, + "learning_rate": 9.280058042752446e-06, + "loss": 0.8898, + "step": 9197 + }, + { + "epoch": 1.5651456373105859, + "grad_norm": 1.6640625, + "learning_rate": 9.278256812618534e-06, + "loss": 0.8702, + "step": 9198 + }, + { + "epoch": 1.5653171013995757, + "grad_norm": 1.6328125, + "learning_rate": 9.276455606023382e-06, + "loss": 0.8156, + "step": 9199 + }, + { + "epoch": 1.5654885654885655, + "grad_norm": 1.71875, + "learning_rate": 9.274654423025739e-06, + "loss": 0.8452, + "step": 9200 + }, + { + "epoch": 1.5656600295775553, + "grad_norm": 1.7578125, + "learning_rate": 9.272853263684345e-06, + "loss": 0.8568, + "step": 9201 + }, + { + "epoch": 1.5658314936665452, + "grad_norm": 1.6953125, + "learning_rate": 9.271052128057946e-06, + "loss": 0.8569, + "step": 9202 + }, + { + "epoch": 1.566002957755535, + "grad_norm": 1.9375, + 
"learning_rate": 9.26925101620528e-06, + "loss": 0.8666, + "step": 9203 + }, + { + "epoch": 1.5661744218445248, + "grad_norm": 1.765625, + "learning_rate": 9.267449928185091e-06, + "loss": 0.8248, + "step": 9204 + }, + { + "epoch": 1.5663458859335146, + "grad_norm": 1.765625, + "learning_rate": 9.265648864056119e-06, + "loss": 0.8597, + "step": 9205 + }, + { + "epoch": 1.5665173500225047, + "grad_norm": 1.8671875, + "learning_rate": 9.2638478238771e-06, + "loss": 0.8764, + "step": 9206 + }, + { + "epoch": 1.5666888141114945, + "grad_norm": 1.65625, + "learning_rate": 9.26204680770678e-06, + "loss": 0.8861, + "step": 9207 + }, + { + "epoch": 1.5668602782004843, + "grad_norm": 1.65625, + "learning_rate": 9.260245815603894e-06, + "loss": 0.8443, + "step": 9208 + }, + { + "epoch": 1.5670317422894744, + "grad_norm": 1.7109375, + "learning_rate": 9.258444847627174e-06, + "loss": 0.859, + "step": 9209 + }, + { + "epoch": 1.5672032063784642, + "grad_norm": 1.640625, + "learning_rate": 9.256643903835361e-06, + "loss": 0.9118, + "step": 9210 + }, + { + "epoch": 1.567374670467454, + "grad_norm": 1.6484375, + "learning_rate": 9.254842984287187e-06, + "loss": 0.7796, + "step": 9211 + }, + { + "epoch": 1.5675461345564439, + "grad_norm": 1.5703125, + "learning_rate": 9.253042089041392e-06, + "loss": 0.7964, + "step": 9212 + }, + { + "epoch": 1.5677175986454337, + "grad_norm": 1.6015625, + "learning_rate": 9.251241218156706e-06, + "loss": 0.8026, + "step": 9213 + }, + { + "epoch": 1.5678890627344235, + "grad_norm": 1.65625, + "learning_rate": 9.249440371691863e-06, + "loss": 0.7103, + "step": 9214 + }, + { + "epoch": 1.5680605268234133, + "grad_norm": 1.7265625, + "learning_rate": 9.247639549705597e-06, + "loss": 0.8532, + "step": 9215 + }, + { + "epoch": 1.5682319909124032, + "grad_norm": 1.7421875, + "learning_rate": 9.245838752256636e-06, + "loss": 0.8715, + "step": 9216 + }, + { + "epoch": 1.568403455001393, + "grad_norm": 1.734375, + "learning_rate": 9.244037979403715e-06, + "loss": 0.9013, + "step": 9217 + }, + { + "epoch": 1.568574919090383, + "grad_norm": 1.8046875, + "learning_rate": 9.242237231205563e-06, + "loss": 0.8126, + "step": 9218 + }, + { + "epoch": 1.5687463831793729, + "grad_norm": 1.640625, + "learning_rate": 9.240436507720907e-06, + "loss": 0.816, + "step": 9219 + }, + { + "epoch": 1.5689178472683627, + "grad_norm": 1.671875, + "learning_rate": 9.238635809008482e-06, + "loss": 0.8114, + "step": 9220 + }, + { + "epoch": 1.5690893113573527, + "grad_norm": 1.796875, + "learning_rate": 9.236835135127005e-06, + "loss": 0.8607, + "step": 9221 + }, + { + "epoch": 1.5692607754463426, + "grad_norm": 1.703125, + "learning_rate": 9.235034486135207e-06, + "loss": 0.8988, + "step": 9222 + }, + { + "epoch": 1.5694322395353324, + "grad_norm": 1.71875, + "learning_rate": 9.233233862091818e-06, + "loss": 0.8391, + "step": 9223 + }, + { + "epoch": 1.5696037036243222, + "grad_norm": 1.65625, + "learning_rate": 9.231433263055558e-06, + "loss": 0.8328, + "step": 9224 + }, + { + "epoch": 1.569775167713312, + "grad_norm": 1.6484375, + "learning_rate": 9.229632689085155e-06, + "loss": 0.8381, + "step": 9225 + }, + { + "epoch": 1.5699466318023019, + "grad_norm": 1.65625, + "learning_rate": 9.22783214023933e-06, + "loss": 0.8908, + "step": 9226 + }, + { + "epoch": 1.5701180958912917, + "grad_norm": 1.7109375, + "learning_rate": 9.226031616576806e-06, + "loss": 0.8383, + "step": 9227 + }, + { + "epoch": 1.5702895599802815, + "grad_norm": 1.734375, + "learning_rate": 9.224231118156307e-06, + "loss": 0.9127, + 
"step": 9228 + }, + { + "epoch": 1.5704610240692713, + "grad_norm": 1.7578125, + "learning_rate": 9.222430645036552e-06, + "loss": 0.8773, + "step": 9229 + }, + { + "epoch": 1.5706324881582614, + "grad_norm": 1.640625, + "learning_rate": 9.220630197276262e-06, + "loss": 0.8129, + "step": 9230 + }, + { + "epoch": 1.5708039522472512, + "grad_norm": 1.6875, + "learning_rate": 9.218829774934157e-06, + "loss": 0.8681, + "step": 9231 + }, + { + "epoch": 1.570975416336241, + "grad_norm": 1.609375, + "learning_rate": 9.217029378068954e-06, + "loss": 0.8128, + "step": 9232 + }, + { + "epoch": 1.571146880425231, + "grad_norm": 1.609375, + "learning_rate": 9.21522900673937e-06, + "loss": 0.7568, + "step": 9233 + }, + { + "epoch": 1.571318344514221, + "grad_norm": 1.640625, + "learning_rate": 9.213428661004124e-06, + "loss": 0.7764, + "step": 9234 + }, + { + "epoch": 1.5714898086032107, + "grad_norm": 1.6796875, + "learning_rate": 9.211628340921932e-06, + "loss": 0.8369, + "step": 9235 + }, + { + "epoch": 1.5716612726922006, + "grad_norm": 1.734375, + "learning_rate": 9.209828046551507e-06, + "loss": 0.8511, + "step": 9236 + }, + { + "epoch": 1.5718327367811904, + "grad_norm": 1.6953125, + "learning_rate": 9.208027777951565e-06, + "loss": 0.8305, + "step": 9237 + }, + { + "epoch": 1.5720042008701802, + "grad_norm": 1.78125, + "learning_rate": 9.206227535180821e-06, + "loss": 0.8703, + "step": 9238 + }, + { + "epoch": 1.57217566495917, + "grad_norm": 1.703125, + "learning_rate": 9.204427318297987e-06, + "loss": 0.8569, + "step": 9239 + }, + { + "epoch": 1.5723471290481599, + "grad_norm": 1.7734375, + "learning_rate": 9.202627127361772e-06, + "loss": 0.8437, + "step": 9240 + }, + { + "epoch": 1.5725185931371497, + "grad_norm": 1.6796875, + "learning_rate": 9.20082696243089e-06, + "loss": 0.8106, + "step": 9241 + }, + { + "epoch": 1.5726900572261397, + "grad_norm": 1.6171875, + "learning_rate": 9.199026823564048e-06, + "loss": 0.7673, + "step": 9242 + }, + { + "epoch": 1.5728615213151296, + "grad_norm": 1.6796875, + "learning_rate": 9.197226710819961e-06, + "loss": 0.8538, + "step": 9243 + }, + { + "epoch": 1.5730329854041194, + "grad_norm": 1.734375, + "learning_rate": 9.195426624257332e-06, + "loss": 0.9001, + "step": 9244 + }, + { + "epoch": 1.5732044494931094, + "grad_norm": 1.734375, + "learning_rate": 9.19362656393487e-06, + "loss": 0.8341, + "step": 9245 + }, + { + "epoch": 1.5733759135820993, + "grad_norm": 1.640625, + "learning_rate": 9.191826529911283e-06, + "loss": 0.903, + "step": 9246 + }, + { + "epoch": 1.573547377671089, + "grad_norm": 1.6640625, + "learning_rate": 9.190026522245277e-06, + "loss": 0.8623, + "step": 9247 + }, + { + "epoch": 1.573718841760079, + "grad_norm": 1.6875, + "learning_rate": 9.188226540995555e-06, + "loss": 0.8689, + "step": 9248 + }, + { + "epoch": 1.5738903058490687, + "grad_norm": 1.6640625, + "learning_rate": 9.186426586220822e-06, + "loss": 0.8787, + "step": 9249 + }, + { + "epoch": 1.5740617699380586, + "grad_norm": 1.6953125, + "learning_rate": 9.184626657979782e-06, + "loss": 0.8006, + "step": 9250 + }, + { + "epoch": 1.5742332340270484, + "grad_norm": 1.640625, + "learning_rate": 9.182826756331137e-06, + "loss": 0.8874, + "step": 9251 + }, + { + "epoch": 1.5744046981160382, + "grad_norm": 1.703125, + "learning_rate": 9.181026881333589e-06, + "loss": 0.8036, + "step": 9252 + }, + { + "epoch": 1.574576162205028, + "grad_norm": 1.6484375, + "learning_rate": 9.179227033045836e-06, + "loss": 0.8686, + "step": 9253 + }, + { + "epoch": 1.574747626294018, + 
"grad_norm": 1.75, + "learning_rate": 9.177427211526585e-06, + "loss": 0.9301, + "step": 9254 + }, + { + "epoch": 1.574919090383008, + "grad_norm": 1.7265625, + "learning_rate": 9.17562741683453e-06, + "loss": 0.8932, + "step": 9255 + }, + { + "epoch": 1.5750905544719978, + "grad_norm": 1.640625, + "learning_rate": 9.173827649028366e-06, + "loss": 0.8502, + "step": 9256 + }, + { + "epoch": 1.5752620185609878, + "grad_norm": 1.6796875, + "learning_rate": 9.172027908166795e-06, + "loss": 0.8476, + "step": 9257 + }, + { + "epoch": 1.5754334826499776, + "grad_norm": 1.59375, + "learning_rate": 9.17022819430851e-06, + "loss": 0.8561, + "step": 9258 + }, + { + "epoch": 1.5756049467389674, + "grad_norm": 1.6484375, + "learning_rate": 9.16842850751221e-06, + "loss": 0.7854, + "step": 9259 + }, + { + "epoch": 1.5757764108279573, + "grad_norm": 1.7265625, + "learning_rate": 9.166628847836586e-06, + "loss": 0.7733, + "step": 9260 + }, + { + "epoch": 1.575947874916947, + "grad_norm": 1.8046875, + "learning_rate": 9.164829215340332e-06, + "loss": 0.8804, + "step": 9261 + }, + { + "epoch": 1.576119339005937, + "grad_norm": 1.6640625, + "learning_rate": 9.163029610082146e-06, + "loss": 0.7935, + "step": 9262 + }, + { + "epoch": 1.5762908030949268, + "grad_norm": 1.7265625, + "learning_rate": 9.161230032120714e-06, + "loss": 0.8089, + "step": 9263 + }, + { + "epoch": 1.5764622671839166, + "grad_norm": 1.59375, + "learning_rate": 9.15943048151473e-06, + "loss": 0.8143, + "step": 9264 + }, + { + "epoch": 1.5766337312729064, + "grad_norm": 1.703125, + "learning_rate": 9.157630958322884e-06, + "loss": 0.8527, + "step": 9265 + }, + { + "epoch": 1.5768051953618962, + "grad_norm": 1.71875, + "learning_rate": 9.155831462603867e-06, + "loss": 0.8017, + "step": 9266 + }, + { + "epoch": 1.5769766594508863, + "grad_norm": 1.6015625, + "learning_rate": 9.154031994416362e-06, + "loss": 0.8363, + "step": 9267 + }, + { + "epoch": 1.577148123539876, + "grad_norm": 1.7109375, + "learning_rate": 9.15223255381906e-06, + "loss": 0.8843, + "step": 9268 + }, + { + "epoch": 1.577319587628866, + "grad_norm": 1.796875, + "learning_rate": 9.150433140870642e-06, + "loss": 0.8803, + "step": 9269 + }, + { + "epoch": 1.577491051717856, + "grad_norm": 1.7421875, + "learning_rate": 9.148633755629803e-06, + "loss": 0.761, + "step": 9270 + }, + { + "epoch": 1.5776625158068458, + "grad_norm": 1.6796875, + "learning_rate": 9.146834398155223e-06, + "loss": 0.7771, + "step": 9271 + }, + { + "epoch": 1.5778339798958356, + "grad_norm": 1.5859375, + "learning_rate": 9.145035068505586e-06, + "loss": 0.7734, + "step": 9272 + }, + { + "epoch": 1.5780054439848255, + "grad_norm": 1.671875, + "learning_rate": 9.143235766739574e-06, + "loss": 0.8499, + "step": 9273 + }, + { + "epoch": 1.5781769080738153, + "grad_norm": 1.640625, + "learning_rate": 9.141436492915869e-06, + "loss": 0.8435, + "step": 9274 + }, + { + "epoch": 1.578348372162805, + "grad_norm": 1.65625, + "learning_rate": 9.139637247093156e-06, + "loss": 0.876, + "step": 9275 + }, + { + "epoch": 1.578519836251795, + "grad_norm": 1.7265625, + "learning_rate": 9.13783802933011e-06, + "loss": 0.8139, + "step": 9276 + }, + { + "epoch": 1.5786913003407848, + "grad_norm": 1.734375, + "learning_rate": 9.136038839685415e-06, + "loss": 0.935, + "step": 9277 + }, + { + "epoch": 1.5788627644297746, + "grad_norm": 1.6875, + "learning_rate": 9.134239678217744e-06, + "loss": 0.9228, + "step": 9278 + }, + { + "epoch": 1.5790342285187646, + "grad_norm": 1.734375, + "learning_rate": 9.13244054498578e-06, + 
"loss": 0.8317, + "step": 9279 + }, + { + "epoch": 1.5792056926077545, + "grad_norm": 1.7421875, + "learning_rate": 9.130641440048194e-06, + "loss": 0.8331, + "step": 9280 + }, + { + "epoch": 1.5793771566967443, + "grad_norm": 1.7109375, + "learning_rate": 9.128842363463663e-06, + "loss": 0.792, + "step": 9281 + }, + { + "epoch": 1.5795486207857343, + "grad_norm": 1.6953125, + "learning_rate": 9.127043315290867e-06, + "loss": 0.8437, + "step": 9282 + }, + { + "epoch": 1.5797200848747242, + "grad_norm": 1.59375, + "learning_rate": 9.125244295588473e-06, + "loss": 0.8124, + "step": 9283 + }, + { + "epoch": 1.579891548963714, + "grad_norm": 1.78125, + "learning_rate": 9.123445304415157e-06, + "loss": 0.8643, + "step": 9284 + }, + { + "epoch": 1.5800630130527038, + "grad_norm": 1.78125, + "learning_rate": 9.12164634182959e-06, + "loss": 0.9127, + "step": 9285 + }, + { + "epoch": 1.5802344771416936, + "grad_norm": 1.6640625, + "learning_rate": 9.119847407890446e-06, + "loss": 0.7783, + "step": 9286 + }, + { + "epoch": 1.5804059412306835, + "grad_norm": 1.8046875, + "learning_rate": 9.118048502656391e-06, + "loss": 0.8905, + "step": 9287 + }, + { + "epoch": 1.5805774053196733, + "grad_norm": 1.703125, + "learning_rate": 9.116249626186094e-06, + "loss": 0.8882, + "step": 9288 + }, + { + "epoch": 1.580748869408663, + "grad_norm": 1.671875, + "learning_rate": 9.114450778538228e-06, + "loss": 0.881, + "step": 9289 + }, + { + "epoch": 1.580920333497653, + "grad_norm": 1.6796875, + "learning_rate": 9.112651959771455e-06, + "loss": 0.863, + "step": 9290 + }, + { + "epoch": 1.581091797586643, + "grad_norm": 1.65625, + "learning_rate": 9.110853169944441e-06, + "loss": 0.8287, + "step": 9291 + }, + { + "epoch": 1.5812632616756328, + "grad_norm": 1.7109375, + "learning_rate": 9.109054409115854e-06, + "loss": 0.7872, + "step": 9292 + }, + { + "epoch": 1.5814347257646226, + "grad_norm": 1.7421875, + "learning_rate": 9.10725567734436e-06, + "loss": 0.9243, + "step": 9293 + }, + { + "epoch": 1.5816061898536127, + "grad_norm": 1.71875, + "learning_rate": 9.105456974688618e-06, + "loss": 0.905, + "step": 9294 + }, + { + "epoch": 1.5817776539426025, + "grad_norm": 1.75, + "learning_rate": 9.103658301207294e-06, + "loss": 0.899, + "step": 9295 + }, + { + "epoch": 1.5819491180315923, + "grad_norm": 1.8046875, + "learning_rate": 9.101859656959047e-06, + "loss": 0.8408, + "step": 9296 + }, + { + "epoch": 1.5821205821205822, + "grad_norm": 1.7578125, + "learning_rate": 9.100061042002539e-06, + "loss": 0.7829, + "step": 9297 + }, + { + "epoch": 1.582292046209572, + "grad_norm": 1.71875, + "learning_rate": 9.09826245639643e-06, + "loss": 0.9463, + "step": 9298 + }, + { + "epoch": 1.5824635102985618, + "grad_norm": 1.65625, + "learning_rate": 9.096463900199376e-06, + "loss": 0.8112, + "step": 9299 + }, + { + "epoch": 1.5826349743875516, + "grad_norm": 1.7265625, + "learning_rate": 9.094665373470039e-06, + "loss": 0.8755, + "step": 9300 + }, + { + "epoch": 1.5828064384765415, + "grad_norm": 1.78125, + "learning_rate": 9.092866876267074e-06, + "loss": 0.9219, + "step": 9301 + }, + { + "epoch": 1.5829779025655313, + "grad_norm": 1.6796875, + "learning_rate": 9.091068408649134e-06, + "loss": 0.8709, + "step": 9302 + }, + { + "epoch": 1.5831493666545213, + "grad_norm": 1.6484375, + "learning_rate": 9.089269970674875e-06, + "loss": 0.8616, + "step": 9303 + }, + { + "epoch": 1.5833208307435112, + "grad_norm": 1.6328125, + "learning_rate": 9.087471562402953e-06, + "loss": 0.8286, + "step": 9304 + }, + { + "epoch": 
1.583492294832501, + "grad_norm": 1.6875, + "learning_rate": 9.08567318389202e-06, + "loss": 0.8534, + "step": 9305 + }, + { + "epoch": 1.583663758921491, + "grad_norm": 1.765625, + "learning_rate": 9.083874835200727e-06, + "loss": 0.9082, + "step": 9306 + }, + { + "epoch": 1.5838352230104809, + "grad_norm": 1.7265625, + "learning_rate": 9.082076516387726e-06, + "loss": 0.8264, + "step": 9307 + }, + { + "epoch": 1.5840066870994707, + "grad_norm": 1.8515625, + "learning_rate": 9.080278227511664e-06, + "loss": 0.9196, + "step": 9308 + }, + { + "epoch": 1.5841781511884605, + "grad_norm": 1.6328125, + "learning_rate": 9.078479968631193e-06, + "loss": 0.9158, + "step": 9309 + }, + { + "epoch": 1.5843496152774503, + "grad_norm": 1.65625, + "learning_rate": 9.076681739804961e-06, + "loss": 0.8469, + "step": 9310 + }, + { + "epoch": 1.5845210793664402, + "grad_norm": 1.703125, + "learning_rate": 9.074883541091616e-06, + "loss": 0.8779, + "step": 9311 + }, + { + "epoch": 1.58469254345543, + "grad_norm": 1.546875, + "learning_rate": 9.073085372549805e-06, + "loss": 0.7402, + "step": 9312 + }, + { + "epoch": 1.5848640075444198, + "grad_norm": 1.78125, + "learning_rate": 9.071287234238166e-06, + "loss": 0.8958, + "step": 9313 + }, + { + "epoch": 1.5850354716334096, + "grad_norm": 1.625, + "learning_rate": 9.06948912621535e-06, + "loss": 0.8287, + "step": 9314 + }, + { + "epoch": 1.5852069357223997, + "grad_norm": 11.0625, + "learning_rate": 9.067691048539996e-06, + "loss": 0.9041, + "step": 9315 + }, + { + "epoch": 1.5853783998113895, + "grad_norm": 1.71875, + "learning_rate": 9.065893001270744e-06, + "loss": 0.8723, + "step": 9316 + }, + { + "epoch": 1.5855498639003793, + "grad_norm": 1.609375, + "learning_rate": 9.064094984466242e-06, + "loss": 0.8329, + "step": 9317 + }, + { + "epoch": 1.5857213279893694, + "grad_norm": 1.59375, + "learning_rate": 9.062296998185127e-06, + "loss": 0.8169, + "step": 9318 + }, + { + "epoch": 1.5858927920783592, + "grad_norm": 1.734375, + "learning_rate": 9.060499042486039e-06, + "loss": 0.9146, + "step": 9319 + }, + { + "epoch": 1.586064256167349, + "grad_norm": 1.6015625, + "learning_rate": 9.058701117427613e-06, + "loss": 0.8901, + "step": 9320 + }, + { + "epoch": 1.5862357202563389, + "grad_norm": 1.78125, + "learning_rate": 9.05690322306849e-06, + "loss": 0.8549, + "step": 9321 + }, + { + "epoch": 1.5864071843453287, + "grad_norm": 1.640625, + "learning_rate": 9.055105359467303e-06, + "loss": 0.8646, + "step": 9322 + }, + { + "epoch": 1.5865786484343185, + "grad_norm": 1.6328125, + "learning_rate": 9.053307526682694e-06, + "loss": 0.8383, + "step": 9323 + }, + { + "epoch": 1.5867501125233083, + "grad_norm": 1.65625, + "learning_rate": 9.051509724773283e-06, + "loss": 0.835, + "step": 9324 + }, + { + "epoch": 1.5869215766122982, + "grad_norm": 1.65625, + "learning_rate": 9.049711953797716e-06, + "loss": 0.8508, + "step": 9325 + }, + { + "epoch": 1.587093040701288, + "grad_norm": 1.640625, + "learning_rate": 9.04791421381462e-06, + "loss": 0.8332, + "step": 9326 + }, + { + "epoch": 1.587264504790278, + "grad_norm": 1.671875, + "learning_rate": 9.046116504882625e-06, + "loss": 0.8394, + "step": 9327 + }, + { + "epoch": 1.5874359688792679, + "grad_norm": 1.7265625, + "learning_rate": 9.044318827060364e-06, + "loss": 0.8588, + "step": 9328 + }, + { + "epoch": 1.5876074329682577, + "grad_norm": 1.8125, + "learning_rate": 9.042521180406465e-06, + "loss": 0.9078, + "step": 9329 + }, + { + "epoch": 1.5877788970572477, + "grad_norm": 1.640625, + "learning_rate": 
9.040723564979556e-06, + "loss": 0.8752, + "step": 9330 + }, + { + "epoch": 1.5879503611462376, + "grad_norm": 1.6015625, + "learning_rate": 9.038925980838262e-06, + "loss": 0.9109, + "step": 9331 + }, + { + "epoch": 1.5881218252352274, + "grad_norm": 1.703125, + "learning_rate": 9.037128428041214e-06, + "loss": 0.8279, + "step": 9332 + }, + { + "epoch": 1.5882932893242172, + "grad_norm": 1.6484375, + "learning_rate": 9.035330906647031e-06, + "loss": 0.8455, + "step": 9333 + }, + { + "epoch": 1.588464753413207, + "grad_norm": 1.6875, + "learning_rate": 9.033533416714341e-06, + "loss": 0.798, + "step": 9334 + }, + { + "epoch": 1.5886362175021969, + "grad_norm": 1.6875, + "learning_rate": 9.031735958301768e-06, + "loss": 0.9019, + "step": 9335 + }, + { + "epoch": 1.5888076815911867, + "grad_norm": 1.6328125, + "learning_rate": 9.02993853146793e-06, + "loss": 0.8193, + "step": 9336 + }, + { + "epoch": 1.5889791456801765, + "grad_norm": 1.6484375, + "learning_rate": 9.028141136271448e-06, + "loss": 0.8333, + "step": 9337 + }, + { + "epoch": 1.5891506097691663, + "grad_norm": 1.75, + "learning_rate": 9.026343772770944e-06, + "loss": 0.8714, + "step": 9338 + }, + { + "epoch": 1.5893220738581564, + "grad_norm": 1.7265625, + "learning_rate": 9.024546441025035e-06, + "loss": 0.8913, + "step": 9339 + }, + { + "epoch": 1.5894935379471462, + "grad_norm": 1.6953125, + "learning_rate": 9.022749141092341e-06, + "loss": 0.8662, + "step": 9340 + }, + { + "epoch": 1.589665002036136, + "grad_norm": 1.71875, + "learning_rate": 9.020951873031477e-06, + "loss": 0.9092, + "step": 9341 + }, + { + "epoch": 1.589836466125126, + "grad_norm": 1.75, + "learning_rate": 9.019154636901059e-06, + "loss": 0.8917, + "step": 9342 + }, + { + "epoch": 1.590007930214116, + "grad_norm": 1.640625, + "learning_rate": 9.017357432759702e-06, + "loss": 0.8658, + "step": 9343 + }, + { + "epoch": 1.5901793943031057, + "grad_norm": 1.625, + "learning_rate": 9.015560260666019e-06, + "loss": 0.8088, + "step": 9344 + }, + { + "epoch": 1.5903508583920956, + "grad_norm": 1.65625, + "learning_rate": 9.013763120678624e-06, + "loss": 0.9525, + "step": 9345 + }, + { + "epoch": 1.5905223224810854, + "grad_norm": 1.625, + "learning_rate": 9.011966012856129e-06, + "loss": 0.853, + "step": 9346 + }, + { + "epoch": 1.5906937865700752, + "grad_norm": 1.703125, + "learning_rate": 9.01016893725714e-06, + "loss": 0.8231, + "step": 9347 + }, + { + "epoch": 1.590865250659065, + "grad_norm": 1.75, + "learning_rate": 9.00837189394027e-06, + "loss": 0.8723, + "step": 9348 + }, + { + "epoch": 1.5910367147480549, + "grad_norm": 1.6796875, + "learning_rate": 9.006574882964127e-06, + "loss": 0.8768, + "step": 9349 + }, + { + "epoch": 1.5912081788370447, + "grad_norm": 1.6875, + "learning_rate": 9.004777904387318e-06, + "loss": 0.8245, + "step": 9350 + }, + { + "epoch": 1.5913796429260347, + "grad_norm": 1.703125, + "learning_rate": 9.002980958268448e-06, + "loss": 0.8931, + "step": 9351 + }, + { + "epoch": 1.5915511070150246, + "grad_norm": 1.703125, + "learning_rate": 9.001184044666125e-06, + "loss": 0.8787, + "step": 9352 + }, + { + "epoch": 1.5917225711040144, + "grad_norm": 1.625, + "learning_rate": 8.99938716363895e-06, + "loss": 0.8112, + "step": 9353 + }, + { + "epoch": 1.5918940351930044, + "grad_norm": 1.734375, + "learning_rate": 8.997590315245528e-06, + "loss": 0.9623, + "step": 9354 + }, + { + "epoch": 1.5920654992819943, + "grad_norm": 1.6484375, + "learning_rate": 8.995793499544459e-06, + "loss": 0.8516, + "step": 9355 + }, + { + "epoch": 
1.592236963370984, + "grad_norm": 1.7265625, + "learning_rate": 8.993996716594347e-06, + "loss": 0.893, + "step": 9356 + }, + { + "epoch": 1.592408427459974, + "grad_norm": 1.671875, + "learning_rate": 8.99219996645379e-06, + "loss": 0.8418, + "step": 9357 + }, + { + "epoch": 1.5925798915489637, + "grad_norm": 1.71875, + "learning_rate": 8.990403249181394e-06, + "loss": 0.8901, + "step": 9358 + }, + { + "epoch": 1.5927513556379536, + "grad_norm": 1.59375, + "learning_rate": 8.988606564835742e-06, + "loss": 0.8342, + "step": 9359 + }, + { + "epoch": 1.5929228197269434, + "grad_norm": 1.71875, + "learning_rate": 8.986809913475441e-06, + "loss": 0.8132, + "step": 9360 + }, + { + "epoch": 1.5930942838159332, + "grad_norm": 1.7109375, + "learning_rate": 8.985013295159083e-06, + "loss": 0.8687, + "step": 9361 + }, + { + "epoch": 1.593265747904923, + "grad_norm": 1.734375, + "learning_rate": 8.983216709945264e-06, + "loss": 0.8738, + "step": 9362 + }, + { + "epoch": 1.5934372119939129, + "grad_norm": 1.765625, + "learning_rate": 8.981420157892574e-06, + "loss": 0.9278, + "step": 9363 + }, + { + "epoch": 1.593608676082903, + "grad_norm": 1.6796875, + "learning_rate": 8.97962363905961e-06, + "loss": 0.8692, + "step": 9364 + }, + { + "epoch": 1.5937801401718927, + "grad_norm": 1.75, + "learning_rate": 8.977827153504963e-06, + "loss": 0.8823, + "step": 9365 + }, + { + "epoch": 1.5939516042608826, + "grad_norm": 1.6640625, + "learning_rate": 8.97603070128722e-06, + "loss": 0.8109, + "step": 9366 + }, + { + "epoch": 1.5941230683498726, + "grad_norm": 1.8046875, + "learning_rate": 8.974234282464974e-06, + "loss": 0.8447, + "step": 9367 + }, + { + "epoch": 1.5942945324388624, + "grad_norm": 1.609375, + "learning_rate": 8.97243789709681e-06, + "loss": 0.917, + "step": 9368 + }, + { + "epoch": 1.5944659965278523, + "grad_norm": 1.625, + "learning_rate": 8.970641545241314e-06, + "loss": 0.8451, + "step": 9369 + }, + { + "epoch": 1.594637460616842, + "grad_norm": 1.59375, + "learning_rate": 8.968845226957079e-06, + "loss": 0.7834, + "step": 9370 + }, + { + "epoch": 1.594808924705832, + "grad_norm": 1.78125, + "learning_rate": 8.967048942302677e-06, + "loss": 0.8885, + "step": 9371 + }, + { + "epoch": 1.5949803887948217, + "grad_norm": 1.6328125, + "learning_rate": 8.965252691336701e-06, + "loss": 0.8003, + "step": 9372 + }, + { + "epoch": 1.5951518528838116, + "grad_norm": 1.65625, + "learning_rate": 8.963456474117732e-06, + "loss": 0.7934, + "step": 9373 + }, + { + "epoch": 1.5953233169728014, + "grad_norm": 1.6484375, + "learning_rate": 8.96166029070435e-06, + "loss": 0.8281, + "step": 9374 + }, + { + "epoch": 1.5954947810617912, + "grad_norm": 1.6796875, + "learning_rate": 8.959864141155137e-06, + "loss": 0.8975, + "step": 9375 + }, + { + "epoch": 1.5956662451507813, + "grad_norm": 1.609375, + "learning_rate": 8.95806802552867e-06, + "loss": 0.8455, + "step": 9376 + }, + { + "epoch": 1.595837709239771, + "grad_norm": 1.796875, + "learning_rate": 8.95627194388353e-06, + "loss": 0.8677, + "step": 9377 + }, + { + "epoch": 1.596009173328761, + "grad_norm": 1.6796875, + "learning_rate": 8.95447589627829e-06, + "loss": 0.7717, + "step": 9378 + }, + { + "epoch": 1.596180637417751, + "grad_norm": 1.765625, + "learning_rate": 8.95267988277153e-06, + "loss": 0.8799, + "step": 9379 + }, + { + "epoch": 1.5963521015067408, + "grad_norm": 1.7109375, + "learning_rate": 8.95088390342182e-06, + "loss": 0.8059, + "step": 9380 + }, + { + "epoch": 1.5965235655957306, + "grad_norm": 1.703125, + "learning_rate": 
8.949087958287741e-06, + "loss": 0.855, + "step": 9381 + }, + { + "epoch": 1.5966950296847204, + "grad_norm": 1.65625, + "learning_rate": 8.947292047427858e-06, + "loss": 0.8773, + "step": 9382 + }, + { + "epoch": 1.5968664937737103, + "grad_norm": 1.7109375, + "learning_rate": 8.945496170900745e-06, + "loss": 0.8761, + "step": 9383 + }, + { + "epoch": 1.5970379578627, + "grad_norm": 1.7265625, + "learning_rate": 8.943700328764975e-06, + "loss": 0.9317, + "step": 9384 + }, + { + "epoch": 1.59720942195169, + "grad_norm": 1.7265625, + "learning_rate": 8.941904521079113e-06, + "loss": 0.8575, + "step": 9385 + }, + { + "epoch": 1.5973808860406797, + "grad_norm": 1.6875, + "learning_rate": 8.940108747901728e-06, + "loss": 0.845, + "step": 9386 + }, + { + "epoch": 1.5975523501296696, + "grad_norm": 1.6953125, + "learning_rate": 8.93831300929139e-06, + "loss": 0.8701, + "step": 9387 + }, + { + "epoch": 1.5977238142186596, + "grad_norm": 1.6328125, + "learning_rate": 8.936517305306663e-06, + "loss": 0.8437, + "step": 9388 + }, + { + "epoch": 1.5978952783076494, + "grad_norm": 1.671875, + "learning_rate": 8.93472163600611e-06, + "loss": 0.8959, + "step": 9389 + }, + { + "epoch": 1.5980667423966393, + "grad_norm": 1.6328125, + "learning_rate": 8.932926001448296e-06, + "loss": 0.7727, + "step": 9390 + }, + { + "epoch": 1.5982382064856293, + "grad_norm": 1.6796875, + "learning_rate": 8.931130401691784e-06, + "loss": 0.862, + "step": 9391 + }, + { + "epoch": 1.5984096705746191, + "grad_norm": 1.671875, + "learning_rate": 8.929334836795135e-06, + "loss": 0.8813, + "step": 9392 + }, + { + "epoch": 1.598581134663609, + "grad_norm": 1.6875, + "learning_rate": 8.92753930681691e-06, + "loss": 0.9302, + "step": 9393 + }, + { + "epoch": 1.5987525987525988, + "grad_norm": 1.71875, + "learning_rate": 8.925743811815665e-06, + "loss": 0.8973, + "step": 9394 + }, + { + "epoch": 1.5989240628415886, + "grad_norm": 1.6875, + "learning_rate": 8.923948351849959e-06, + "loss": 0.8164, + "step": 9395 + }, + { + "epoch": 1.5990955269305784, + "grad_norm": 1.671875, + "learning_rate": 8.922152926978349e-06, + "loss": 0.8623, + "step": 9396 + }, + { + "epoch": 1.5992669910195683, + "grad_norm": 1.7578125, + "learning_rate": 8.920357537259393e-06, + "loss": 0.7758, + "step": 9397 + }, + { + "epoch": 1.599438455108558, + "grad_norm": 1.71875, + "learning_rate": 8.918562182751642e-06, + "loss": 0.887, + "step": 9398 + }, + { + "epoch": 1.599609919197548, + "grad_norm": 1.640625, + "learning_rate": 8.916766863513648e-06, + "loss": 0.8277, + "step": 9399 + }, + { + "epoch": 1.599781383286538, + "grad_norm": 1.765625, + "learning_rate": 8.914971579603969e-06, + "loss": 0.8924, + "step": 9400 + }, + { + "epoch": 1.5999528473755278, + "grad_norm": 1.7109375, + "learning_rate": 8.913176331081151e-06, + "loss": 0.8944, + "step": 9401 + }, + { + "epoch": 1.6001243114645176, + "grad_norm": 1.609375, + "learning_rate": 8.911381118003743e-06, + "loss": 0.7726, + "step": 9402 + }, + { + "epoch": 1.6002957755535077, + "grad_norm": 1.6640625, + "learning_rate": 8.909585940430299e-06, + "loss": 0.8849, + "step": 9403 + }, + { + "epoch": 1.6004672396424975, + "grad_norm": 1.65625, + "learning_rate": 8.907790798419369e-06, + "loss": 0.8906, + "step": 9404 + }, + { + "epoch": 1.6006387037314873, + "grad_norm": 1.7265625, + "learning_rate": 8.90599569202949e-06, + "loss": 0.9278, + "step": 9405 + }, + { + "epoch": 1.6008101678204771, + "grad_norm": 1.6484375, + "learning_rate": 8.90420062131921e-06, + "loss": 0.8265, + "step": 9406 + }, + { + 
"epoch": 1.600981631909467, + "grad_norm": 1.7578125, + "learning_rate": 8.902405586347072e-06, + "loss": 0.9074, + "step": 9407 + }, + { + "epoch": 1.6011530959984568, + "grad_norm": 1.734375, + "learning_rate": 8.900610587171623e-06, + "loss": 0.9064, + "step": 9408 + }, + { + "epoch": 1.6013245600874466, + "grad_norm": 1.640625, + "learning_rate": 8.898815623851402e-06, + "loss": 0.8912, + "step": 9409 + }, + { + "epoch": 1.6014960241764364, + "grad_norm": 2.0625, + "learning_rate": 8.89702069644495e-06, + "loss": 0.828, + "step": 9410 + }, + { + "epoch": 1.6016674882654263, + "grad_norm": 1.625, + "learning_rate": 8.895225805010807e-06, + "loss": 0.7718, + "step": 9411 + }, + { + "epoch": 1.6018389523544163, + "grad_norm": 1.84375, + "learning_rate": 8.893430949607511e-06, + "loss": 0.9473, + "step": 9412 + }, + { + "epoch": 1.6020104164434061, + "grad_norm": 1.7265625, + "learning_rate": 8.891636130293598e-06, + "loss": 0.8774, + "step": 9413 + }, + { + "epoch": 1.602181880532396, + "grad_norm": 1.875, + "learning_rate": 8.889841347127607e-06, + "loss": 0.9145, + "step": 9414 + }, + { + "epoch": 1.602353344621386, + "grad_norm": 1.7890625, + "learning_rate": 8.888046600168067e-06, + "loss": 0.8802, + "step": 9415 + }, + { + "epoch": 1.6025248087103758, + "grad_norm": 1.671875, + "learning_rate": 8.886251889473519e-06, + "loss": 0.8245, + "step": 9416 + }, + { + "epoch": 1.6026962727993657, + "grad_norm": 1.6796875, + "learning_rate": 8.884457215102489e-06, + "loss": 0.8716, + "step": 9417 + }, + { + "epoch": 1.6028677368883555, + "grad_norm": 1.7421875, + "learning_rate": 8.882662577113505e-06, + "loss": 0.8579, + "step": 9418 + }, + { + "epoch": 1.6030392009773453, + "grad_norm": 1.765625, + "learning_rate": 8.880867975565106e-06, + "loss": 0.8044, + "step": 9419 + }, + { + "epoch": 1.6032106650663351, + "grad_norm": 1.6484375, + "learning_rate": 8.879073410515818e-06, + "loss": 0.7676, + "step": 9420 + }, + { + "epoch": 1.603382129155325, + "grad_norm": 1.6640625, + "learning_rate": 8.877278882024163e-06, + "loss": 0.8706, + "step": 9421 + }, + { + "epoch": 1.6035535932443148, + "grad_norm": 1.7890625, + "learning_rate": 8.875484390148674e-06, + "loss": 0.9197, + "step": 9422 + }, + { + "epoch": 1.6037250573333046, + "grad_norm": 1.6953125, + "learning_rate": 8.873689934947873e-06, + "loss": 0.9162, + "step": 9423 + }, + { + "epoch": 1.6038965214222947, + "grad_norm": 1.6953125, + "learning_rate": 8.871895516480286e-06, + "loss": 0.9041, + "step": 9424 + }, + { + "epoch": 1.6040679855112845, + "grad_norm": 1.6875, + "learning_rate": 8.870101134804432e-06, + "loss": 0.8439, + "step": 9425 + }, + { + "epoch": 1.6042394496002743, + "grad_norm": 1.703125, + "learning_rate": 8.868306789978837e-06, + "loss": 0.8439, + "step": 9426 + }, + { + "epoch": 1.6044109136892644, + "grad_norm": 1.703125, + "learning_rate": 8.866512482062019e-06, + "loss": 0.8968, + "step": 9427 + }, + { + "epoch": 1.6045823777782542, + "grad_norm": 1.6171875, + "learning_rate": 8.864718211112497e-06, + "loss": 0.7964, + "step": 9428 + }, + { + "epoch": 1.604753841867244, + "grad_norm": 1.7734375, + "learning_rate": 8.862923977188788e-06, + "loss": 0.8404, + "step": 9429 + }, + { + "epoch": 1.6049253059562338, + "grad_norm": 1.640625, + "learning_rate": 8.861129780349411e-06, + "loss": 0.8308, + "step": 9430 + }, + { + "epoch": 1.6050967700452237, + "grad_norm": 1.7109375, + "learning_rate": 8.85933562065288e-06, + "loss": 0.8702, + "step": 9431 + }, + { + "epoch": 1.6052682341342135, + "grad_norm": 1.6640625, + 
"learning_rate": 8.85754149815771e-06, + "loss": 0.8977, + "step": 9432 + }, + { + "epoch": 1.6054396982232033, + "grad_norm": 1.7109375, + "learning_rate": 8.855747412922413e-06, + "loss": 0.8239, + "step": 9433 + }, + { + "epoch": 1.6056111623121931, + "grad_norm": 1.609375, + "learning_rate": 8.853953365005503e-06, + "loss": 0.7898, + "step": 9434 + }, + { + "epoch": 1.605782626401183, + "grad_norm": 1.7890625, + "learning_rate": 8.852159354465486e-06, + "loss": 0.8764, + "step": 9435 + }, + { + "epoch": 1.605954090490173, + "grad_norm": 1.6953125, + "learning_rate": 8.850365381360879e-06, + "loss": 0.8465, + "step": 9436 + }, + { + "epoch": 1.6061255545791628, + "grad_norm": 1.7109375, + "learning_rate": 8.848571445750183e-06, + "loss": 0.8515, + "step": 9437 + }, + { + "epoch": 1.6062970186681527, + "grad_norm": 1.6640625, + "learning_rate": 8.84677754769191e-06, + "loss": 0.7793, + "step": 9438 + }, + { + "epoch": 1.6064684827571427, + "grad_norm": 1.7890625, + "learning_rate": 8.844983687244565e-06, + "loss": 0.8679, + "step": 9439 + }, + { + "epoch": 1.6066399468461325, + "grad_norm": 1.71875, + "learning_rate": 8.84318986446665e-06, + "loss": 0.8877, + "step": 9440 + }, + { + "epoch": 1.6068114109351224, + "grad_norm": 1.890625, + "learning_rate": 8.84139607941667e-06, + "loss": 0.9196, + "step": 9441 + }, + { + "epoch": 1.6069828750241122, + "grad_norm": 1.640625, + "learning_rate": 8.839602332153126e-06, + "loss": 0.7696, + "step": 9442 + }, + { + "epoch": 1.607154339113102, + "grad_norm": 1.6328125, + "learning_rate": 8.837808622734519e-06, + "loss": 0.8631, + "step": 9443 + }, + { + "epoch": 1.6073258032020918, + "grad_norm": 1.6953125, + "learning_rate": 8.83601495121935e-06, + "loss": 0.786, + "step": 9444 + }, + { + "epoch": 1.6074972672910817, + "grad_norm": 1.640625, + "learning_rate": 8.834221317666117e-06, + "loss": 0.8097, + "step": 9445 + }, + { + "epoch": 1.6076687313800715, + "grad_norm": 1.7265625, + "learning_rate": 8.832427722133315e-06, + "loss": 0.8723, + "step": 9446 + }, + { + "epoch": 1.6078401954690613, + "grad_norm": 1.796875, + "learning_rate": 8.830634164679444e-06, + "loss": 0.8215, + "step": 9447 + }, + { + "epoch": 1.6080116595580514, + "grad_norm": 1.6640625, + "learning_rate": 8.828840645362995e-06, + "loss": 0.8247, + "step": 9448 + }, + { + "epoch": 1.6081831236470412, + "grad_norm": 1.703125, + "learning_rate": 8.82704716424246e-06, + "loss": 0.8534, + "step": 9449 + }, + { + "epoch": 1.608354587736031, + "grad_norm": 1.734375, + "learning_rate": 8.825253721376341e-06, + "loss": 0.8736, + "step": 9450 + }, + { + "epoch": 1.6085260518250208, + "grad_norm": 1.7578125, + "learning_rate": 8.823460316823118e-06, + "loss": 0.9481, + "step": 9451 + }, + { + "epoch": 1.608697515914011, + "grad_norm": 1.6328125, + "learning_rate": 8.821666950641283e-06, + "loss": 0.8318, + "step": 9452 + }, + { + "epoch": 1.6088689800030007, + "grad_norm": 1.671875, + "learning_rate": 8.819873622889328e-06, + "loss": 0.9219, + "step": 9453 + }, + { + "epoch": 1.6090404440919905, + "grad_norm": 1.625, + "learning_rate": 8.818080333625737e-06, + "loss": 0.8387, + "step": 9454 + }, + { + "epoch": 1.6092119081809804, + "grad_norm": 1.5625, + "learning_rate": 8.816287082908998e-06, + "loss": 0.7747, + "step": 9455 + }, + { + "epoch": 1.6093833722699702, + "grad_norm": 1.65625, + "learning_rate": 8.814493870797594e-06, + "loss": 0.8218, + "step": 9456 + }, + { + "epoch": 1.60955483635896, + "grad_norm": 1.6171875, + "learning_rate": 8.812700697350006e-06, + "loss": 0.8, + 
"step": 9457 + }, + { + "epoch": 1.6097263004479498, + "grad_norm": 1.78125, + "learning_rate": 8.810907562624721e-06, + "loss": 0.8254, + "step": 9458 + }, + { + "epoch": 1.6098977645369397, + "grad_norm": 1.6875, + "learning_rate": 8.809114466680219e-06, + "loss": 0.8618, + "step": 9459 + }, + { + "epoch": 1.6100692286259295, + "grad_norm": 1.6875, + "learning_rate": 8.80732140957498e-06, + "loss": 0.8895, + "step": 9460 + }, + { + "epoch": 1.6102406927149195, + "grad_norm": 1.6796875, + "learning_rate": 8.80552839136748e-06, + "loss": 0.8411, + "step": 9461 + }, + { + "epoch": 1.6104121568039094, + "grad_norm": 1.6953125, + "learning_rate": 8.8037354121162e-06, + "loss": 0.8813, + "step": 9462 + }, + { + "epoch": 1.6105836208928992, + "grad_norm": 1.6484375, + "learning_rate": 8.801942471879611e-06, + "loss": 0.7896, + "step": 9463 + }, + { + "epoch": 1.6107550849818892, + "grad_norm": 1.8203125, + "learning_rate": 8.800149570716188e-06, + "loss": 0.8829, + "step": 9464 + }, + { + "epoch": 1.610926549070879, + "grad_norm": 1.640625, + "learning_rate": 8.798356708684405e-06, + "loss": 0.8807, + "step": 9465 + }, + { + "epoch": 1.611098013159869, + "grad_norm": 1.7578125, + "learning_rate": 8.796563885842737e-06, + "loss": 0.831, + "step": 9466 + }, + { + "epoch": 1.6112694772488587, + "grad_norm": 1.7734375, + "learning_rate": 8.79477110224965e-06, + "loss": 0.931, + "step": 9467 + }, + { + "epoch": 1.6114409413378485, + "grad_norm": 1.734375, + "learning_rate": 8.792978357963618e-06, + "loss": 0.8006, + "step": 9468 + }, + { + "epoch": 1.6116124054268384, + "grad_norm": 1.71875, + "learning_rate": 8.791185653043106e-06, + "loss": 0.8351, + "step": 9469 + }, + { + "epoch": 1.6117838695158282, + "grad_norm": 1.7734375, + "learning_rate": 8.789392987546581e-06, + "loss": 0.8104, + "step": 9470 + }, + { + "epoch": 1.611955333604818, + "grad_norm": 1.6796875, + "learning_rate": 8.78760036153251e-06, + "loss": 0.8881, + "step": 9471 + }, + { + "epoch": 1.6121267976938078, + "grad_norm": 1.6484375, + "learning_rate": 8.785807775059357e-06, + "loss": 0.8613, + "step": 9472 + }, + { + "epoch": 1.612298261782798, + "grad_norm": 1.6328125, + "learning_rate": 8.784015228185587e-06, + "loss": 0.8537, + "step": 9473 + }, + { + "epoch": 1.6124697258717877, + "grad_norm": 1.7578125, + "learning_rate": 8.782222720969658e-06, + "loss": 0.8582, + "step": 9474 + }, + { + "epoch": 1.6126411899607775, + "grad_norm": 1.65625, + "learning_rate": 8.78043025347003e-06, + "loss": 0.8746, + "step": 9475 + }, + { + "epoch": 1.6128126540497676, + "grad_norm": 1.703125, + "learning_rate": 8.778637825745165e-06, + "loss": 0.8207, + "step": 9476 + }, + { + "epoch": 1.6129841181387574, + "grad_norm": 1.6875, + "learning_rate": 8.77684543785352e-06, + "loss": 0.8755, + "step": 9477 + }, + { + "epoch": 1.6131555822277472, + "grad_norm": 1.609375, + "learning_rate": 8.77505308985355e-06, + "loss": 0.7851, + "step": 9478 + }, + { + "epoch": 1.613327046316737, + "grad_norm": 1.6171875, + "learning_rate": 8.773260781803713e-06, + "loss": 0.7995, + "step": 9479 + }, + { + "epoch": 1.613498510405727, + "grad_norm": 1.6796875, + "learning_rate": 8.77146851376246e-06, + "loss": 0.8759, + "step": 9480 + }, + { + "epoch": 1.6136699744947167, + "grad_norm": 1.6953125, + "learning_rate": 8.769676285788245e-06, + "loss": 0.8722, + "step": 9481 + }, + { + "epoch": 1.6138414385837065, + "grad_norm": 1.6953125, + "learning_rate": 8.76788409793952e-06, + "loss": 0.8435, + "step": 9482 + }, + { + "epoch": 1.6140129026726964, + "grad_norm": 
1.6953125, + "learning_rate": 8.766091950274735e-06, + "loss": 0.8915, + "step": 9483 + }, + { + "epoch": 1.6141843667616862, + "grad_norm": 1.65625, + "learning_rate": 8.764299842852336e-06, + "loss": 0.839, + "step": 9484 + }, + { + "epoch": 1.6143558308506762, + "grad_norm": 1.65625, + "learning_rate": 8.762507775730776e-06, + "loss": 0.8317, + "step": 9485 + }, + { + "epoch": 1.614527294939666, + "grad_norm": 1.7421875, + "learning_rate": 8.760715748968494e-06, + "loss": 0.8901, + "step": 9486 + }, + { + "epoch": 1.614698759028656, + "grad_norm": 1.7734375, + "learning_rate": 8.75892376262394e-06, + "loss": 0.8669, + "step": 9487 + }, + { + "epoch": 1.614870223117646, + "grad_norm": 1.625, + "learning_rate": 8.757131816755554e-06, + "loss": 0.7872, + "step": 9488 + }, + { + "epoch": 1.6150416872066358, + "grad_norm": 1.6640625, + "learning_rate": 8.75533991142178e-06, + "loss": 0.7714, + "step": 9489 + }, + { + "epoch": 1.6152131512956256, + "grad_norm": 1.7109375, + "learning_rate": 8.75354804668106e-06, + "loss": 0.8928, + "step": 9490 + }, + { + "epoch": 1.6153846153846154, + "grad_norm": 1.609375, + "learning_rate": 8.75175622259183e-06, + "loss": 0.8651, + "step": 9491 + }, + { + "epoch": 1.6155560794736052, + "grad_norm": 1.5625, + "learning_rate": 8.74996443921253e-06, + "loss": 0.7452, + "step": 9492 + }, + { + "epoch": 1.615727543562595, + "grad_norm": 1.6171875, + "learning_rate": 8.748172696601597e-06, + "loss": 0.6888, + "step": 9493 + }, + { + "epoch": 1.615899007651585, + "grad_norm": 1.671875, + "learning_rate": 8.746380994817467e-06, + "loss": 0.7881, + "step": 9494 + }, + { + "epoch": 1.6160704717405747, + "grad_norm": 1.65625, + "learning_rate": 8.744589333918571e-06, + "loss": 0.8082, + "step": 9495 + }, + { + "epoch": 1.6162419358295645, + "grad_norm": 1.75, + "learning_rate": 8.742797713963352e-06, + "loss": 0.8459, + "step": 9496 + }, + { + "epoch": 1.6164133999185546, + "grad_norm": 1.7421875, + "learning_rate": 8.741006135010228e-06, + "loss": 0.8487, + "step": 9497 + }, + { + "epoch": 1.6165848640075444, + "grad_norm": 1.859375, + "learning_rate": 8.739214597117634e-06, + "loss": 0.8437, + "step": 9498 + }, + { + "epoch": 1.6167563280965342, + "grad_norm": 1.7578125, + "learning_rate": 8.737423100344002e-06, + "loss": 0.8587, + "step": 9499 + }, + { + "epoch": 1.6169277921855243, + "grad_norm": 1.7421875, + "learning_rate": 8.735631644747755e-06, + "loss": 0.8818, + "step": 9500 + }, + { + "epoch": 1.6170992562745141, + "grad_norm": 1.6484375, + "learning_rate": 8.733840230387322e-06, + "loss": 0.8194, + "step": 9501 + }, + { + "epoch": 1.617270720363504, + "grad_norm": 1.6328125, + "learning_rate": 8.732048857321126e-06, + "loss": 0.8206, + "step": 9502 + }, + { + "epoch": 1.6174421844524938, + "grad_norm": 1.6953125, + "learning_rate": 8.730257525607594e-06, + "loss": 0.8871, + "step": 9503 + }, + { + "epoch": 1.6176136485414836, + "grad_norm": 1.6796875, + "learning_rate": 8.728466235305143e-06, + "loss": 0.8642, + "step": 9504 + }, + { + "epoch": 1.6177851126304734, + "grad_norm": 1.7265625, + "learning_rate": 8.726674986472195e-06, + "loss": 0.8576, + "step": 9505 + }, + { + "epoch": 1.6179565767194632, + "grad_norm": 1.671875, + "learning_rate": 8.724883779167174e-06, + "loss": 0.8776, + "step": 9506 + }, + { + "epoch": 1.618128040808453, + "grad_norm": 1.640625, + "learning_rate": 8.723092613448494e-06, + "loss": 0.8235, + "step": 9507 + }, + { + "epoch": 1.618299504897443, + "grad_norm": 1.703125, + "learning_rate": 8.721301489374575e-06, + "loss": 
0.8939, + "step": 9508 + }, + { + "epoch": 1.618470968986433, + "grad_norm": 1.703125, + "learning_rate": 8.719510407003826e-06, + "loss": 0.8792, + "step": 9509 + }, + { + "epoch": 1.6186424330754228, + "grad_norm": 1.671875, + "learning_rate": 8.717719366394665e-06, + "loss": 0.7704, + "step": 9510 + }, + { + "epoch": 1.6188138971644126, + "grad_norm": 1.7265625, + "learning_rate": 8.715928367605503e-06, + "loss": 0.8084, + "step": 9511 + }, + { + "epoch": 1.6189853612534026, + "grad_norm": 1.6796875, + "learning_rate": 8.714137410694753e-06, + "loss": 0.848, + "step": 9512 + }, + { + "epoch": 1.6191568253423925, + "grad_norm": 1.671875, + "learning_rate": 8.712346495720822e-06, + "loss": 0.8198, + "step": 9513 + }, + { + "epoch": 1.6193282894313823, + "grad_norm": 1.6171875, + "learning_rate": 8.710555622742124e-06, + "loss": 0.7781, + "step": 9514 + }, + { + "epoch": 1.6194997535203721, + "grad_norm": 1.75, + "learning_rate": 8.708764791817062e-06, + "loss": 0.9383, + "step": 9515 + }, + { + "epoch": 1.619671217609362, + "grad_norm": 1.7109375, + "learning_rate": 8.706974003004042e-06, + "loss": 0.8586, + "step": 9516 + }, + { + "epoch": 1.6198426816983518, + "grad_norm": 1.6484375, + "learning_rate": 8.705183256361468e-06, + "loss": 0.7714, + "step": 9517 + }, + { + "epoch": 1.6200141457873416, + "grad_norm": 1.609375, + "learning_rate": 8.703392551947744e-06, + "loss": 0.9317, + "step": 9518 + }, + { + "epoch": 1.6201856098763314, + "grad_norm": 1.6953125, + "learning_rate": 8.701601889821278e-06, + "loss": 0.8142, + "step": 9519 + }, + { + "epoch": 1.6203570739653212, + "grad_norm": 1.65625, + "learning_rate": 8.699811270040456e-06, + "loss": 0.8342, + "step": 9520 + }, + { + "epoch": 1.6205285380543113, + "grad_norm": 1.78125, + "learning_rate": 8.698020692663686e-06, + "loss": 0.9059, + "step": 9521 + }, + { + "epoch": 1.6207000021433011, + "grad_norm": 1.7421875, + "learning_rate": 8.696230157749365e-06, + "loss": 0.8225, + "step": 9522 + }, + { + "epoch": 1.620871466232291, + "grad_norm": 1.75, + "learning_rate": 8.694439665355887e-06, + "loss": 0.8743, + "step": 9523 + }, + { + "epoch": 1.621042930321281, + "grad_norm": 4.96875, + "learning_rate": 8.692649215541648e-06, + "loss": 0.8313, + "step": 9524 + }, + { + "epoch": 1.6212143944102708, + "grad_norm": 1.7578125, + "learning_rate": 8.690858808365041e-06, + "loss": 0.8802, + "step": 9525 + }, + { + "epoch": 1.6213858584992606, + "grad_norm": 1.6953125, + "learning_rate": 8.68906844388446e-06, + "loss": 0.7753, + "step": 9526 + }, + { + "epoch": 1.6215573225882505, + "grad_norm": 1.7734375, + "learning_rate": 8.68727812215829e-06, + "loss": 0.7983, + "step": 9527 + }, + { + "epoch": 1.6217287866772403, + "grad_norm": 1.8125, + "learning_rate": 8.685487843244927e-06, + "loss": 0.964, + "step": 9528 + }, + { + "epoch": 1.6219002507662301, + "grad_norm": 1.71875, + "learning_rate": 8.683697607202754e-06, + "loss": 0.7903, + "step": 9529 + }, + { + "epoch": 1.62207171485522, + "grad_norm": 1.7109375, + "learning_rate": 8.681907414090159e-06, + "loss": 0.8497, + "step": 9530 + }, + { + "epoch": 1.6222431789442098, + "grad_norm": 1.6015625, + "learning_rate": 8.68011726396553e-06, + "loss": 0.8279, + "step": 9531 + }, + { + "epoch": 1.6224146430331996, + "grad_norm": 1.796875, + "learning_rate": 8.678327156887243e-06, + "loss": 0.8786, + "step": 9532 + }, + { + "epoch": 1.6225861071221896, + "grad_norm": 1.78125, + "learning_rate": 8.676537092913685e-06, + "loss": 0.9326, + "step": 9533 + }, + { + "epoch": 1.6227575712111795, + 
"grad_norm": 1.875, + "learning_rate": 8.674747072103236e-06, + "loss": 0.7747, + "step": 9534 + }, + { + "epoch": 1.6229290353001693, + "grad_norm": 1.6953125, + "learning_rate": 8.672957094514278e-06, + "loss": 0.9462, + "step": 9535 + }, + { + "epoch": 1.6231004993891593, + "grad_norm": 1.703125, + "learning_rate": 8.671167160205183e-06, + "loss": 0.7966, + "step": 9536 + }, + { + "epoch": 1.6232719634781492, + "grad_norm": 1.6015625, + "learning_rate": 8.66937726923433e-06, + "loss": 0.8396, + "step": 9537 + }, + { + "epoch": 1.623443427567139, + "grad_norm": 1.8125, + "learning_rate": 8.667587421660099e-06, + "loss": 0.8396, + "step": 9538 + }, + { + "epoch": 1.6236148916561288, + "grad_norm": 1.6875, + "learning_rate": 8.665797617540857e-06, + "loss": 0.8863, + "step": 9539 + }, + { + "epoch": 1.6237863557451186, + "grad_norm": 1.5625, + "learning_rate": 8.664007856934979e-06, + "loss": 0.7412, + "step": 9540 + }, + { + "epoch": 1.6239578198341085, + "grad_norm": 1.71875, + "learning_rate": 8.662218139900836e-06, + "loss": 0.8568, + "step": 9541 + }, + { + "epoch": 1.6241292839230983, + "grad_norm": 1.59375, + "learning_rate": 8.660428466496795e-06, + "loss": 0.8518, + "step": 9542 + }, + { + "epoch": 1.6243007480120881, + "grad_norm": 1.7421875, + "learning_rate": 8.658638836781232e-06, + "loss": 0.9646, + "step": 9543 + }, + { + "epoch": 1.624472212101078, + "grad_norm": 1.71875, + "learning_rate": 8.656849250812504e-06, + "loss": 0.8214, + "step": 9544 + }, + { + "epoch": 1.624643676190068, + "grad_norm": 1.6953125, + "learning_rate": 8.65505970864898e-06, + "loss": 0.8817, + "step": 9545 + }, + { + "epoch": 1.6248151402790578, + "grad_norm": 1.625, + "learning_rate": 8.653270210349023e-06, + "loss": 0.7976, + "step": 9546 + }, + { + "epoch": 1.6249866043680476, + "grad_norm": 1.703125, + "learning_rate": 8.651480755970995e-06, + "loss": 0.892, + "step": 9547 + }, + { + "epoch": 1.6251580684570375, + "grad_norm": 1.6875, + "learning_rate": 8.649691345573259e-06, + "loss": 0.8196, + "step": 9548 + }, + { + "epoch": 1.6253295325460275, + "grad_norm": 1.765625, + "learning_rate": 8.647901979214173e-06, + "loss": 0.9425, + "step": 9549 + }, + { + "epoch": 1.6255009966350173, + "grad_norm": 1.7265625, + "learning_rate": 8.646112656952094e-06, + "loss": 0.8447, + "step": 9550 + }, + { + "epoch": 1.6256724607240072, + "grad_norm": 1.6328125, + "learning_rate": 8.644323378845378e-06, + "loss": 0.8281, + "step": 9551 + }, + { + "epoch": 1.625843924812997, + "grad_norm": 1.7265625, + "learning_rate": 8.642534144952383e-06, + "loss": 0.9119, + "step": 9552 + }, + { + "epoch": 1.6260153889019868, + "grad_norm": 1.7421875, + "learning_rate": 8.640744955331462e-06, + "loss": 0.8358, + "step": 9553 + }, + { + "epoch": 1.6261868529909767, + "grad_norm": 1.609375, + "learning_rate": 8.63895581004097e-06, + "loss": 0.8309, + "step": 9554 + }, + { + "epoch": 1.6263583170799665, + "grad_norm": 1.6640625, + "learning_rate": 8.637166709139251e-06, + "loss": 0.8626, + "step": 9555 + }, + { + "epoch": 1.6265297811689563, + "grad_norm": 1.6484375, + "learning_rate": 8.635377652684657e-06, + "loss": 0.7618, + "step": 9556 + }, + { + "epoch": 1.6267012452579461, + "grad_norm": 1.6796875, + "learning_rate": 8.633588640735536e-06, + "loss": 0.8348, + "step": 9557 + }, + { + "epoch": 1.6268727093469362, + "grad_norm": 1.6796875, + "learning_rate": 8.631799673350235e-06, + "loss": 0.9185, + "step": 9558 + }, + { + "epoch": 1.627044173435926, + "grad_norm": 1.7578125, + "learning_rate": 8.630010750587099e-06, + 
"loss": 0.9125, + "step": 9559 + }, + { + "epoch": 1.6272156375249158, + "grad_norm": 1.6875, + "learning_rate": 8.62822187250447e-06, + "loss": 0.9099, + "step": 9560 + }, + { + "epoch": 1.6273871016139059, + "grad_norm": 1.6171875, + "learning_rate": 8.626433039160693e-06, + "loss": 0.7726, + "step": 9561 + }, + { + "epoch": 1.6275585657028957, + "grad_norm": 1.9296875, + "learning_rate": 8.624644250614107e-06, + "loss": 0.863, + "step": 9562 + }, + { + "epoch": 1.6277300297918855, + "grad_norm": 1.703125, + "learning_rate": 8.622855506923051e-06, + "loss": 0.9398, + "step": 9563 + }, + { + "epoch": 1.6279014938808753, + "grad_norm": 1.7734375, + "learning_rate": 8.621066808145863e-06, + "loss": 0.8906, + "step": 9564 + }, + { + "epoch": 1.6280729579698652, + "grad_norm": 1.6640625, + "learning_rate": 8.619278154340877e-06, + "loss": 0.8521, + "step": 9565 + }, + { + "epoch": 1.628244422058855, + "grad_norm": 1.6796875, + "learning_rate": 8.617489545566437e-06, + "loss": 0.9032, + "step": 9566 + }, + { + "epoch": 1.6284158861478448, + "grad_norm": 1.671875, + "learning_rate": 8.615700981880861e-06, + "loss": 0.7931, + "step": 9567 + }, + { + "epoch": 1.6285873502368347, + "grad_norm": 1.6875, + "learning_rate": 8.61391246334249e-06, + "loss": 0.8021, + "step": 9568 + }, + { + "epoch": 1.6287588143258245, + "grad_norm": 1.734375, + "learning_rate": 8.612123990009655e-06, + "loss": 0.8412, + "step": 9569 + }, + { + "epoch": 1.6289302784148145, + "grad_norm": 1.6953125, + "learning_rate": 8.610335561940682e-06, + "loss": 0.9334, + "step": 9570 + }, + { + "epoch": 1.6291017425038044, + "grad_norm": 1.703125, + "learning_rate": 8.6085471791939e-06, + "loss": 0.8338, + "step": 9571 + }, + { + "epoch": 1.6292732065927942, + "grad_norm": 1.734375, + "learning_rate": 8.606758841827634e-06, + "loss": 0.8811, + "step": 9572 + }, + { + "epoch": 1.6294446706817842, + "grad_norm": 1.6796875, + "learning_rate": 8.604970549900208e-06, + "loss": 0.7772, + "step": 9573 + }, + { + "epoch": 1.629616134770774, + "grad_norm": 1.53125, + "learning_rate": 8.603182303469947e-06, + "loss": 0.7959, + "step": 9574 + }, + { + "epoch": 1.6297875988597639, + "grad_norm": 1.6328125, + "learning_rate": 8.601394102595169e-06, + "loss": 0.8504, + "step": 9575 + }, + { + "epoch": 1.6299590629487537, + "grad_norm": 1.640625, + "learning_rate": 8.599605947334196e-06, + "loss": 0.8329, + "step": 9576 + }, + { + "epoch": 1.6301305270377435, + "grad_norm": 1.578125, + "learning_rate": 8.59781783774535e-06, + "loss": 0.7754, + "step": 9577 + }, + { + "epoch": 1.6303019911267334, + "grad_norm": 1.71875, + "learning_rate": 8.59602977388694e-06, + "loss": 0.8126, + "step": 9578 + }, + { + "epoch": 1.6304734552157232, + "grad_norm": 1.6328125, + "learning_rate": 8.594241755817289e-06, + "loss": 0.7677, + "step": 9579 + }, + { + "epoch": 1.630644919304713, + "grad_norm": 1.5703125, + "learning_rate": 8.592453783594707e-06, + "loss": 0.8123, + "step": 9580 + }, + { + "epoch": 1.6308163833937028, + "grad_norm": 1.7421875, + "learning_rate": 8.590665857277507e-06, + "loss": 0.8539, + "step": 9581 + }, + { + "epoch": 1.6309878474826929, + "grad_norm": 1.65625, + "learning_rate": 8.588877976924e-06, + "loss": 0.7789, + "step": 9582 + }, + { + "epoch": 1.6311593115716827, + "grad_norm": 1.71875, + "learning_rate": 8.587090142592497e-06, + "loss": 0.808, + "step": 9583 + }, + { + "epoch": 1.6313307756606725, + "grad_norm": 1.8125, + "learning_rate": 8.585302354341305e-06, + "loss": 0.8538, + "step": 9584 + }, + { + "epoch": 
1.6315022397496626, + "grad_norm": 1.6171875, + "learning_rate": 8.58351461222873e-06, + "loss": 0.7721, + "step": 9585 + }, + { + "epoch": 1.6316737038386524, + "grad_norm": 1.7109375, + "learning_rate": 8.581726916313077e-06, + "loss": 0.882, + "step": 9586 + }, + { + "epoch": 1.6318451679276422, + "grad_norm": 1.6640625, + "learning_rate": 8.57993926665265e-06, + "loss": 0.855, + "step": 9587 + }, + { + "epoch": 1.632016632016632, + "grad_norm": 1.7265625, + "learning_rate": 8.578151663305751e-06, + "loss": 0.8971, + "step": 9588 + }, + { + "epoch": 1.6321880961056219, + "grad_norm": 1.6796875, + "learning_rate": 8.576364106330683e-06, + "loss": 0.8524, + "step": 9589 + }, + { + "epoch": 1.6323595601946117, + "grad_norm": 1.75, + "learning_rate": 8.574576595785742e-06, + "loss": 0.8495, + "step": 9590 + }, + { + "epoch": 1.6325310242836015, + "grad_norm": 1.7109375, + "learning_rate": 8.572789131729224e-06, + "loss": 0.8608, + "step": 9591 + }, + { + "epoch": 1.6327024883725914, + "grad_norm": 1.6875, + "learning_rate": 8.571001714219425e-06, + "loss": 0.8293, + "step": 9592 + }, + { + "epoch": 1.6328739524615812, + "grad_norm": 1.6796875, + "learning_rate": 8.569214343314645e-06, + "loss": 0.8418, + "step": 9593 + }, + { + "epoch": 1.6330454165505712, + "grad_norm": 1.609375, + "learning_rate": 8.56742701907317e-06, + "loss": 0.8288, + "step": 9594 + }, + { + "epoch": 1.633216880639561, + "grad_norm": 1.7265625, + "learning_rate": 8.565639741553296e-06, + "loss": 0.8054, + "step": 9595 + }, + { + "epoch": 1.6333883447285509, + "grad_norm": 1.6640625, + "learning_rate": 8.56385251081331e-06, + "loss": 0.8483, + "step": 9596 + }, + { + "epoch": 1.633559808817541, + "grad_norm": 1.65625, + "learning_rate": 8.562065326911502e-06, + "loss": 0.856, + "step": 9597 + }, + { + "epoch": 1.6337312729065308, + "grad_norm": 1.6328125, + "learning_rate": 8.560278189906159e-06, + "loss": 0.7193, + "step": 9598 + }, + { + "epoch": 1.6339027369955206, + "grad_norm": 1.7109375, + "learning_rate": 8.558491099855565e-06, + "loss": 0.8671, + "step": 9599 + }, + { + "epoch": 1.6340742010845104, + "grad_norm": 1.65625, + "learning_rate": 8.556704056818011e-06, + "loss": 0.8556, + "step": 9600 + }, + { + "epoch": 1.6342456651735002, + "grad_norm": 1.7265625, + "learning_rate": 8.554917060851767e-06, + "loss": 0.8631, + "step": 9601 + }, + { + "epoch": 1.63441712926249, + "grad_norm": 1.75, + "learning_rate": 8.553130112015118e-06, + "loss": 0.8055, + "step": 9602 + }, + { + "epoch": 1.6345885933514799, + "grad_norm": 1.6875, + "learning_rate": 8.551343210366346e-06, + "loss": 0.8503, + "step": 9603 + }, + { + "epoch": 1.6347600574404697, + "grad_norm": 1.734375, + "learning_rate": 8.549556355963726e-06, + "loss": 0.903, + "step": 9604 + }, + { + "epoch": 1.6349315215294595, + "grad_norm": 1.6796875, + "learning_rate": 8.547769548865537e-06, + "loss": 0.8312, + "step": 9605 + }, + { + "epoch": 1.6351029856184496, + "grad_norm": 1.7265625, + "learning_rate": 8.545982789130048e-06, + "loss": 0.8706, + "step": 9606 + }, + { + "epoch": 1.6352744497074394, + "grad_norm": 1.71875, + "learning_rate": 8.544196076815539e-06, + "loss": 0.905, + "step": 9607 + }, + { + "epoch": 1.6354459137964292, + "grad_norm": 1.5546875, + "learning_rate": 8.542409411980276e-06, + "loss": 0.7822, + "step": 9608 + }, + { + "epoch": 1.6356173778854193, + "grad_norm": 1.6953125, + "learning_rate": 8.540622794682531e-06, + "loss": 0.9522, + "step": 9609 + }, + { + "epoch": 1.635788841974409, + "grad_norm": 1.609375, + "learning_rate": 
8.538836224980574e-06, + "loss": 0.8145, + "step": 9610 + }, + { + "epoch": 1.635960306063399, + "grad_norm": 1.7734375, + "learning_rate": 8.537049702932669e-06, + "loss": 0.9594, + "step": 9611 + }, + { + "epoch": 1.6361317701523888, + "grad_norm": 1.6640625, + "learning_rate": 8.535263228597086e-06, + "loss": 0.834, + "step": 9612 + }, + { + "epoch": 1.6363032342413786, + "grad_norm": 1.8046875, + "learning_rate": 8.53347680203208e-06, + "loss": 0.9463, + "step": 9613 + }, + { + "epoch": 1.6364746983303684, + "grad_norm": 1.6328125, + "learning_rate": 8.531690423295917e-06, + "loss": 0.8724, + "step": 9614 + }, + { + "epoch": 1.6366461624193582, + "grad_norm": 1.7265625, + "learning_rate": 8.529904092446862e-06, + "loss": 0.8486, + "step": 9615 + }, + { + "epoch": 1.636817626508348, + "grad_norm": 1.71875, + "learning_rate": 8.528117809543168e-06, + "loss": 0.8814, + "step": 9616 + }, + { + "epoch": 1.6369890905973379, + "grad_norm": 1.5859375, + "learning_rate": 8.526331574643096e-06, + "loss": 0.9166, + "step": 9617 + }, + { + "epoch": 1.637160554686328, + "grad_norm": 1.71875, + "learning_rate": 8.5245453878049e-06, + "loss": 0.8858, + "step": 9618 + }, + { + "epoch": 1.6373320187753178, + "grad_norm": 1.5859375, + "learning_rate": 8.522759249086835e-06, + "loss": 0.7996, + "step": 9619 + }, + { + "epoch": 1.6375034828643076, + "grad_norm": 1.625, + "learning_rate": 8.520973158547154e-06, + "loss": 0.8665, + "step": 9620 + }, + { + "epoch": 1.6376749469532976, + "grad_norm": 1.8671875, + "learning_rate": 8.519187116244107e-06, + "loss": 0.91, + "step": 9621 + }, + { + "epoch": 1.6378464110422875, + "grad_norm": 1.703125, + "learning_rate": 8.517401122235945e-06, + "loss": 0.9372, + "step": 9622 + }, + { + "epoch": 1.6380178751312773, + "grad_norm": 1.7421875, + "learning_rate": 8.515615176580917e-06, + "loss": 0.9159, + "step": 9623 + }, + { + "epoch": 1.638189339220267, + "grad_norm": 1.734375, + "learning_rate": 8.513829279337267e-06, + "loss": 0.8983, + "step": 9624 + }, + { + "epoch": 1.638360803309257, + "grad_norm": 1.671875, + "learning_rate": 8.512043430563239e-06, + "loss": 0.8373, + "step": 9625 + }, + { + "epoch": 1.6385322673982468, + "grad_norm": 1.65625, + "learning_rate": 8.510257630317079e-06, + "loss": 0.8467, + "step": 9626 + }, + { + "epoch": 1.6387037314872366, + "grad_norm": 1.734375, + "learning_rate": 8.508471878657028e-06, + "loss": 0.9898, + "step": 9627 + }, + { + "epoch": 1.6388751955762264, + "grad_norm": 1.625, + "learning_rate": 8.506686175641324e-06, + "loss": 0.7781, + "step": 9628 + }, + { + "epoch": 1.6390466596652162, + "grad_norm": 1.6640625, + "learning_rate": 8.504900521328207e-06, + "loss": 0.7417, + "step": 9629 + }, + { + "epoch": 1.6392181237542063, + "grad_norm": 1.6171875, + "learning_rate": 8.503114915775915e-06, + "loss": 0.7881, + "step": 9630 + }, + { + "epoch": 1.639389587843196, + "grad_norm": 1.6953125, + "learning_rate": 8.501329359042683e-06, + "loss": 0.8748, + "step": 9631 + }, + { + "epoch": 1.639561051932186, + "grad_norm": 1.765625, + "learning_rate": 8.499543851186742e-06, + "loss": 0.8101, + "step": 9632 + }, + { + "epoch": 1.639732516021176, + "grad_norm": 1.828125, + "learning_rate": 8.497758392266328e-06, + "loss": 0.8246, + "step": 9633 + }, + { + "epoch": 1.6399039801101658, + "grad_norm": 1.7109375, + "learning_rate": 8.49597298233967e-06, + "loss": 0.7272, + "step": 9634 + }, + { + "epoch": 1.6400754441991556, + "grad_norm": 1.7109375, + "learning_rate": 8.494187621464997e-06, + "loss": 0.907, + "step": 9635 + }, + { 
+ "epoch": 1.6402469082881455, + "grad_norm": 1.78125, + "learning_rate": 8.492402309700535e-06, + "loss": 0.8957, + "step": 9636 + }, + { + "epoch": 1.6404183723771353, + "grad_norm": 1.7578125, + "learning_rate": 8.490617047104511e-06, + "loss": 0.9228, + "step": 9637 + }, + { + "epoch": 1.640589836466125, + "grad_norm": 1.7734375, + "learning_rate": 8.488831833735149e-06, + "loss": 0.9267, + "step": 9638 + }, + { + "epoch": 1.640761300555115, + "grad_norm": 1.59375, + "learning_rate": 8.48704666965067e-06, + "loss": 0.8438, + "step": 9639 + }, + { + "epoch": 1.6409327646441048, + "grad_norm": 1.6796875, + "learning_rate": 8.485261554909298e-06, + "loss": 0.8037, + "step": 9640 + }, + { + "epoch": 1.6411042287330946, + "grad_norm": 1.75, + "learning_rate": 8.48347648956925e-06, + "loss": 0.8805, + "step": 9641 + }, + { + "epoch": 1.6412756928220844, + "grad_norm": 1.7578125, + "learning_rate": 8.481691473688745e-06, + "loss": 0.9045, + "step": 9642 + }, + { + "epoch": 1.6414471569110745, + "grad_norm": 1.7421875, + "learning_rate": 8.479906507325997e-06, + "loss": 0.8374, + "step": 9643 + }, + { + "epoch": 1.6416186210000643, + "grad_norm": 1.6484375, + "learning_rate": 8.478121590539221e-06, + "loss": 0.7615, + "step": 9644 + }, + { + "epoch": 1.641790085089054, + "grad_norm": 1.6875, + "learning_rate": 8.476336723386632e-06, + "loss": 0.8641, + "step": 9645 + }, + { + "epoch": 1.6419615491780442, + "grad_norm": 1.7265625, + "learning_rate": 8.474551905926446e-06, + "loss": 0.89, + "step": 9646 + }, + { + "epoch": 1.642133013267034, + "grad_norm": 1.78125, + "learning_rate": 8.47276713821686e-06, + "loss": 0.7849, + "step": 9647 + }, + { + "epoch": 1.6423044773560238, + "grad_norm": 1.53125, + "learning_rate": 8.47098242031609e-06, + "loss": 0.8519, + "step": 9648 + }, + { + "epoch": 1.6424759414450136, + "grad_norm": 1.6953125, + "learning_rate": 8.469197752282343e-06, + "loss": 0.8685, + "step": 9649 + }, + { + "epoch": 1.6426474055340035, + "grad_norm": 1.6875, + "learning_rate": 8.467413134173819e-06, + "loss": 0.813, + "step": 9650 + }, + { + "epoch": 1.6428188696229933, + "grad_norm": 1.6953125, + "learning_rate": 8.465628566048724e-06, + "loss": 0.8472, + "step": 9651 + }, + { + "epoch": 1.642990333711983, + "grad_norm": 1.6796875, + "learning_rate": 8.46384404796526e-06, + "loss": 0.8151, + "step": 9652 + }, + { + "epoch": 1.643161797800973, + "grad_norm": 1.7265625, + "learning_rate": 8.462059579981624e-06, + "loss": 0.8978, + "step": 9653 + }, + { + "epoch": 1.6433332618899628, + "grad_norm": 1.7890625, + "learning_rate": 8.460275162156019e-06, + "loss": 0.8837, + "step": 9654 + }, + { + "epoch": 1.6435047259789528, + "grad_norm": 1.7421875, + "learning_rate": 8.458490794546638e-06, + "loss": 0.852, + "step": 9655 + }, + { + "epoch": 1.6436761900679426, + "grad_norm": 1.671875, + "learning_rate": 8.456706477211677e-06, + "loss": 0.819, + "step": 9656 + }, + { + "epoch": 1.6438476541569325, + "grad_norm": 1.71875, + "learning_rate": 8.454922210209332e-06, + "loss": 0.9105, + "step": 9657 + }, + { + "epoch": 1.6440191182459225, + "grad_norm": 1.6875, + "learning_rate": 8.453137993597792e-06, + "loss": 0.8514, + "step": 9658 + }, + { + "epoch": 1.6441905823349123, + "grad_norm": 1.765625, + "learning_rate": 8.451353827435247e-06, + "loss": 0.8103, + "step": 9659 + }, + { + "epoch": 1.6443620464239022, + "grad_norm": 1.6640625, + "learning_rate": 8.449569711779883e-06, + "loss": 0.8241, + "step": 9660 + }, + { + "epoch": 1.644533510512892, + "grad_norm": 1.71875, + 
"learning_rate": 8.447785646689887e-06, + "loss": 0.8696, + "step": 9661 + }, + { + "epoch": 1.6447049746018818, + "grad_norm": 1.6640625, + "learning_rate": 8.446001632223448e-06, + "loss": 0.7978, + "step": 9662 + }, + { + "epoch": 1.6448764386908716, + "grad_norm": 1.7578125, + "learning_rate": 8.444217668438748e-06, + "loss": 0.7798, + "step": 9663 + }, + { + "epoch": 1.6450479027798615, + "grad_norm": 1.796875, + "learning_rate": 8.442433755393968e-06, + "loss": 0.9269, + "step": 9664 + }, + { + "epoch": 1.6452193668688513, + "grad_norm": 1.671875, + "learning_rate": 8.440649893147289e-06, + "loss": 0.842, + "step": 9665 + }, + { + "epoch": 1.645390830957841, + "grad_norm": 1.625, + "learning_rate": 8.438866081756889e-06, + "loss": 0.8264, + "step": 9666 + }, + { + "epoch": 1.6455622950468312, + "grad_norm": 1.6796875, + "learning_rate": 8.437082321280945e-06, + "loss": 0.7968, + "step": 9667 + }, + { + "epoch": 1.645733759135821, + "grad_norm": 1.703125, + "learning_rate": 8.435298611777632e-06, + "loss": 0.8673, + "step": 9668 + }, + { + "epoch": 1.6459052232248108, + "grad_norm": 1.7890625, + "learning_rate": 8.433514953305124e-06, + "loss": 0.9696, + "step": 9669 + }, + { + "epoch": 1.6460766873138009, + "grad_norm": 1.7265625, + "learning_rate": 8.431731345921592e-06, + "loss": 0.8855, + "step": 9670 + }, + { + "epoch": 1.6462481514027907, + "grad_norm": 1.640625, + "learning_rate": 8.429947789685206e-06, + "loss": 0.822, + "step": 9671 + }, + { + "epoch": 1.6464196154917805, + "grad_norm": 1.7890625, + "learning_rate": 8.428164284654133e-06, + "loss": 0.8785, + "step": 9672 + }, + { + "epoch": 1.6465910795807703, + "grad_norm": 1.6484375, + "learning_rate": 8.426380830886544e-06, + "loss": 0.8603, + "step": 9673 + }, + { + "epoch": 1.6467625436697602, + "grad_norm": 1.6875, + "learning_rate": 8.4245974284406e-06, + "loss": 0.8427, + "step": 9674 + }, + { + "epoch": 1.64693400775875, + "grad_norm": 1.8203125, + "learning_rate": 8.422814077374468e-06, + "loss": 0.8966, + "step": 9675 + }, + { + "epoch": 1.6471054718477398, + "grad_norm": 1.6484375, + "learning_rate": 8.421030777746306e-06, + "loss": 0.8551, + "step": 9676 + }, + { + "epoch": 1.6472769359367296, + "grad_norm": 1.7421875, + "learning_rate": 8.419247529614278e-06, + "loss": 0.8953, + "step": 9677 + }, + { + "epoch": 1.6474484000257195, + "grad_norm": 1.703125, + "learning_rate": 8.41746433303654e-06, + "loss": 0.8488, + "step": 9678 + }, + { + "epoch": 1.6476198641147095, + "grad_norm": 1.640625, + "learning_rate": 8.41568118807125e-06, + "loss": 0.8468, + "step": 9679 + }, + { + "epoch": 1.6477913282036993, + "grad_norm": 1.640625, + "learning_rate": 8.41389809477656e-06, + "loss": 0.7708, + "step": 9680 + }, + { + "epoch": 1.6479627922926892, + "grad_norm": 1.75, + "learning_rate": 8.412115053210631e-06, + "loss": 0.9317, + "step": 9681 + }, + { + "epoch": 1.6481342563816792, + "grad_norm": 1.703125, + "learning_rate": 8.410332063431606e-06, + "loss": 0.8848, + "step": 9682 + }, + { + "epoch": 1.648305720470669, + "grad_norm": 1.734375, + "learning_rate": 8.408549125497638e-06, + "loss": 0.8291, + "step": 9683 + }, + { + "epoch": 1.6484771845596589, + "grad_norm": 1.703125, + "learning_rate": 8.406766239466878e-06, + "loss": 0.8033, + "step": 9684 + }, + { + "epoch": 1.6486486486486487, + "grad_norm": 1.6015625, + "learning_rate": 8.404983405397468e-06, + "loss": 0.8135, + "step": 9685 + }, + { + "epoch": 1.6488201127376385, + "grad_norm": 1.6875, + "learning_rate": 8.403200623347556e-06, + "loss": 0.8502, + 
"step": 9686 + }, + { + "epoch": 1.6489915768266283, + "grad_norm": 1.6796875, + "learning_rate": 8.401417893375286e-06, + "loss": 0.7815, + "step": 9687 + }, + { + "epoch": 1.6491630409156182, + "grad_norm": 1.6875, + "learning_rate": 8.399635215538798e-06, + "loss": 0.8002, + "step": 9688 + }, + { + "epoch": 1.649334505004608, + "grad_norm": 1.734375, + "learning_rate": 8.39785258989623e-06, + "loss": 0.8572, + "step": 9689 + }, + { + "epoch": 1.6495059690935978, + "grad_norm": 1.7578125, + "learning_rate": 8.396070016505725e-06, + "loss": 0.9321, + "step": 9690 + }, + { + "epoch": 1.6496774331825879, + "grad_norm": 1.75, + "learning_rate": 8.394287495425412e-06, + "loss": 0.7589, + "step": 9691 + }, + { + "epoch": 1.6498488972715777, + "grad_norm": 1.7265625, + "learning_rate": 8.392505026713434e-06, + "loss": 0.8176, + "step": 9692 + }, + { + "epoch": 1.6500203613605675, + "grad_norm": 1.65625, + "learning_rate": 8.390722610427923e-06, + "loss": 0.7663, + "step": 9693 + }, + { + "epoch": 1.6501918254495576, + "grad_norm": 1.71875, + "learning_rate": 8.388940246627005e-06, + "loss": 0.8577, + "step": 9694 + }, + { + "epoch": 1.6503632895385474, + "grad_norm": 1.765625, + "learning_rate": 8.387157935368811e-06, + "loss": 0.8385, + "step": 9695 + }, + { + "epoch": 1.6505347536275372, + "grad_norm": 1.84375, + "learning_rate": 8.385375676711472e-06, + "loss": 0.9336, + "step": 9696 + }, + { + "epoch": 1.650706217716527, + "grad_norm": 1.6171875, + "learning_rate": 8.38359347071311e-06, + "loss": 0.7651, + "step": 9697 + }, + { + "epoch": 1.6508776818055169, + "grad_norm": 1.828125, + "learning_rate": 8.381811317431853e-06, + "loss": 0.9214, + "step": 9698 + }, + { + "epoch": 1.6510491458945067, + "grad_norm": 1.640625, + "learning_rate": 8.380029216925824e-06, + "loss": 0.8356, + "step": 9699 + }, + { + "epoch": 1.6512206099834965, + "grad_norm": 1.6484375, + "learning_rate": 8.378247169253138e-06, + "loss": 0.8845, + "step": 9700 + }, + { + "epoch": 1.6513920740724863, + "grad_norm": 1.7421875, + "learning_rate": 8.376465174471922e-06, + "loss": 0.8047, + "step": 9701 + }, + { + "epoch": 1.6515635381614762, + "grad_norm": 1.6328125, + "learning_rate": 8.374683232640289e-06, + "loss": 0.8342, + "step": 9702 + }, + { + "epoch": 1.6517350022504662, + "grad_norm": 1.75, + "learning_rate": 8.372901343816357e-06, + "loss": 0.9709, + "step": 9703 + }, + { + "epoch": 1.651906466339456, + "grad_norm": 1.6171875, + "learning_rate": 8.371119508058245e-06, + "loss": 0.9, + "step": 9704 + }, + { + "epoch": 1.6520779304284459, + "grad_norm": 1.6328125, + "learning_rate": 8.369337725424054e-06, + "loss": 0.8676, + "step": 9705 + }, + { + "epoch": 1.652249394517436, + "grad_norm": 1.6796875, + "learning_rate": 8.3675559959719e-06, + "loss": 0.8422, + "step": 9706 + }, + { + "epoch": 1.6524208586064257, + "grad_norm": 1.6796875, + "learning_rate": 8.365774319759892e-06, + "loss": 0.8387, + "step": 9707 + }, + { + "epoch": 1.6525923226954156, + "grad_norm": 1.7109375, + "learning_rate": 8.363992696846136e-06, + "loss": 0.8843, + "step": 9708 + }, + { + "epoch": 1.6527637867844054, + "grad_norm": 1.6015625, + "learning_rate": 8.36221112728874e-06, + "loss": 0.839, + "step": 9709 + }, + { + "epoch": 1.6529352508733952, + "grad_norm": 1.703125, + "learning_rate": 8.360429611145808e-06, + "loss": 0.7779, + "step": 9710 + }, + { + "epoch": 1.653106714962385, + "grad_norm": 1.7109375, + "learning_rate": 8.35864814847544e-06, + "loss": 0.8224, + "step": 9711 + }, + { + "epoch": 1.6532781790513749, + "grad_norm": 
1.78125, + "learning_rate": 8.356866739335735e-06, + "loss": 0.8112, + "step": 9712 + }, + { + "epoch": 1.6534496431403647, + "grad_norm": 1.703125, + "learning_rate": 8.355085383784794e-06, + "loss": 0.8244, + "step": 9713 + }, + { + "epoch": 1.6536211072293545, + "grad_norm": 1.6953125, + "learning_rate": 8.353304081880713e-06, + "loss": 0.8558, + "step": 9714 + }, + { + "epoch": 1.6537925713183446, + "grad_norm": 1.6875, + "learning_rate": 8.351522833681587e-06, + "loss": 0.8791, + "step": 9715 + }, + { + "epoch": 1.6539640354073344, + "grad_norm": 1.7890625, + "learning_rate": 8.34974163924551e-06, + "loss": 0.8825, + "step": 9716 + }, + { + "epoch": 1.6541354994963242, + "grad_norm": 1.6953125, + "learning_rate": 8.347960498630574e-06, + "loss": 0.8269, + "step": 9717 + }, + { + "epoch": 1.6543069635853143, + "grad_norm": 1.6953125, + "learning_rate": 8.346179411894864e-06, + "loss": 0.8492, + "step": 9718 + }, + { + "epoch": 1.654478427674304, + "grad_norm": 1.734375, + "learning_rate": 8.34439837909647e-06, + "loss": 0.8248, + "step": 9719 + }, + { + "epoch": 1.654649891763294, + "grad_norm": 1.7265625, + "learning_rate": 8.342617400293482e-06, + "loss": 0.8771, + "step": 9720 + }, + { + "epoch": 1.6548213558522837, + "grad_norm": 1.6875, + "learning_rate": 8.34083647554398e-06, + "loss": 0.8594, + "step": 9721 + }, + { + "epoch": 1.6549928199412736, + "grad_norm": 1.671875, + "learning_rate": 8.33905560490605e-06, + "loss": 0.8927, + "step": 9722 + }, + { + "epoch": 1.6551642840302634, + "grad_norm": 1.640625, + "learning_rate": 8.337274788437769e-06, + "loss": 0.789, + "step": 9723 + }, + { + "epoch": 1.6553357481192532, + "grad_norm": 1.7109375, + "learning_rate": 8.33549402619722e-06, + "loss": 0.7861, + "step": 9724 + }, + { + "epoch": 1.655507212208243, + "grad_norm": 1.75, + "learning_rate": 8.333713318242477e-06, + "loss": 0.7888, + "step": 9725 + }, + { + "epoch": 1.6556786762972329, + "grad_norm": 1.6796875, + "learning_rate": 8.331932664631619e-06, + "loss": 0.8536, + "step": 9726 + }, + { + "epoch": 1.655850140386223, + "grad_norm": 1.6953125, + "learning_rate": 8.33015206542272e-06, + "loss": 0.7891, + "step": 9727 + }, + { + "epoch": 1.6560216044752127, + "grad_norm": 1.734375, + "learning_rate": 8.328371520673848e-06, + "loss": 0.887, + "step": 9728 + }, + { + "epoch": 1.6561930685642026, + "grad_norm": 1.7109375, + "learning_rate": 8.326591030443075e-06, + "loss": 0.8448, + "step": 9729 + }, + { + "epoch": 1.6563645326531926, + "grad_norm": 1.96875, + "learning_rate": 8.324810594788471e-06, + "loss": 0.848, + "step": 9730 + }, + { + "epoch": 1.6565359967421824, + "grad_norm": 1.7421875, + "learning_rate": 8.3230302137681e-06, + "loss": 0.9769, + "step": 9731 + }, + { + "epoch": 1.6567074608311723, + "grad_norm": 1.6484375, + "learning_rate": 8.32124988744003e-06, + "loss": 0.826, + "step": 9732 + }, + { + "epoch": 1.656878924920162, + "grad_norm": 1.84375, + "learning_rate": 8.319469615862324e-06, + "loss": 0.9164, + "step": 9733 + }, + { + "epoch": 1.657050389009152, + "grad_norm": 1.703125, + "learning_rate": 8.317689399093039e-06, + "loss": 0.8394, + "step": 9734 + }, + { + "epoch": 1.6572218530981417, + "grad_norm": 1.6796875, + "learning_rate": 8.315909237190241e-06, + "loss": 0.858, + "step": 9735 + }, + { + "epoch": 1.6573933171871316, + "grad_norm": 1.703125, + "learning_rate": 8.314129130211981e-06, + "loss": 0.8047, + "step": 9736 + }, + { + "epoch": 1.6575647812761214, + "grad_norm": 1.625, + "learning_rate": 8.31234907821632e-06, + "loss": 0.8094, + 
"step": 9737 + }, + { + "epoch": 1.6577362453651112, + "grad_norm": 1.78125, + "learning_rate": 8.31056908126131e-06, + "loss": 0.9054, + "step": 9738 + }, + { + "epoch": 1.657907709454101, + "grad_norm": 1.6953125, + "learning_rate": 8.308789139405008e-06, + "loss": 0.8816, + "step": 9739 + }, + { + "epoch": 1.658079173543091, + "grad_norm": 1.7265625, + "learning_rate": 8.30700925270546e-06, + "loss": 0.744, + "step": 9740 + }, + { + "epoch": 1.658250637632081, + "grad_norm": 1.7734375, + "learning_rate": 8.305229421220712e-06, + "loss": 0.9439, + "step": 9741 + }, + { + "epoch": 1.6584221017210707, + "grad_norm": 1.7421875, + "learning_rate": 8.303449645008815e-06, + "loss": 0.8557, + "step": 9742 + }, + { + "epoch": 1.6585935658100608, + "grad_norm": 1.7265625, + "learning_rate": 8.301669924127815e-06, + "loss": 0.8596, + "step": 9743 + }, + { + "epoch": 1.6587650298990506, + "grad_norm": 1.7734375, + "learning_rate": 8.299890258635753e-06, + "loss": 0.9133, + "step": 9744 + }, + { + "epoch": 1.6589364939880404, + "grad_norm": 1.703125, + "learning_rate": 8.298110648590673e-06, + "loss": 0.8114, + "step": 9745 + }, + { + "epoch": 1.6591079580770303, + "grad_norm": 1.65625, + "learning_rate": 8.296331094050609e-06, + "loss": 0.8435, + "step": 9746 + }, + { + "epoch": 1.65927942216602, + "grad_norm": 1.734375, + "learning_rate": 8.294551595073608e-06, + "loss": 0.8475, + "step": 9747 + }, + { + "epoch": 1.65945088625501, + "grad_norm": 1.6640625, + "learning_rate": 8.2927721517177e-06, + "loss": 0.8274, + "step": 9748 + }, + { + "epoch": 1.6596223503439997, + "grad_norm": 1.7265625, + "learning_rate": 8.290992764040922e-06, + "loss": 0.8764, + "step": 9749 + }, + { + "epoch": 1.6597938144329896, + "grad_norm": 1.6875, + "learning_rate": 8.28921343210131e-06, + "loss": 0.8514, + "step": 9750 + }, + { + "epoch": 1.6599652785219794, + "grad_norm": 1.703125, + "learning_rate": 8.287434155956885e-06, + "loss": 0.8684, + "step": 9751 + }, + { + "epoch": 1.6601367426109694, + "grad_norm": 1.7421875, + "learning_rate": 8.28565493566568e-06, + "loss": 0.9562, + "step": 9752 + }, + { + "epoch": 1.6603082066999593, + "grad_norm": 1.6796875, + "learning_rate": 8.283875771285725e-06, + "loss": 0.8543, + "step": 9753 + }, + { + "epoch": 1.660479670788949, + "grad_norm": 1.7734375, + "learning_rate": 8.282096662875042e-06, + "loss": 0.9567, + "step": 9754 + }, + { + "epoch": 1.6606511348779391, + "grad_norm": 1.640625, + "learning_rate": 8.280317610491655e-06, + "loss": 0.8085, + "step": 9755 + }, + { + "epoch": 1.660822598966929, + "grad_norm": 1.8203125, + "learning_rate": 8.27853861419359e-06, + "loss": 0.9327, + "step": 9756 + }, + { + "epoch": 1.6609940630559188, + "grad_norm": 1.7265625, + "learning_rate": 8.276759674038861e-06, + "loss": 0.8291, + "step": 9757 + }, + { + "epoch": 1.6611655271449086, + "grad_norm": 1.6640625, + "learning_rate": 8.274980790085489e-06, + "loss": 0.8373, + "step": 9758 + }, + { + "epoch": 1.6613369912338984, + "grad_norm": 1.71875, + "learning_rate": 8.273201962391488e-06, + "loss": 0.8789, + "step": 9759 + }, + { + "epoch": 1.6615084553228883, + "grad_norm": 1.734375, + "learning_rate": 8.271423191014874e-06, + "loss": 0.868, + "step": 9760 + }, + { + "epoch": 1.661679919411878, + "grad_norm": 1.7578125, + "learning_rate": 8.269644476013661e-06, + "loss": 0.8498, + "step": 9761 + }, + { + "epoch": 1.661851383500868, + "grad_norm": 1.6484375, + "learning_rate": 8.26786581744586e-06, + "loss": 0.8065, + "step": 9762 + }, + { + "epoch": 1.6620228475898577, + 
"grad_norm": 1.75, + "learning_rate": 8.26608721536947e-06, + "loss": 0.8592, + "step": 9763 + }, + { + "epoch": 1.6621943116788478, + "grad_norm": 1.78125, + "learning_rate": 8.26430866984251e-06, + "loss": 0.8404, + "step": 9764 + }, + { + "epoch": 1.6623657757678376, + "grad_norm": 1.6796875, + "learning_rate": 8.262530180922978e-06, + "loss": 0.824, + "step": 9765 + }, + { + "epoch": 1.6625372398568274, + "grad_norm": 1.796875, + "learning_rate": 8.260751748668881e-06, + "loss": 0.8971, + "step": 9766 + }, + { + "epoch": 1.6627087039458175, + "grad_norm": 1.7578125, + "learning_rate": 8.258973373138218e-06, + "loss": 0.924, + "step": 9767 + }, + { + "epoch": 1.6628801680348073, + "grad_norm": 1.765625, + "learning_rate": 8.25719505438899e-06, + "loss": 0.8984, + "step": 9768 + }, + { + "epoch": 1.6630516321237971, + "grad_norm": 1.765625, + "learning_rate": 8.255416792479192e-06, + "loss": 0.747, + "step": 9769 + }, + { + "epoch": 1.663223096212787, + "grad_norm": 1.7421875, + "learning_rate": 8.253638587466823e-06, + "loss": 0.9168, + "step": 9770 + }, + { + "epoch": 1.6633945603017768, + "grad_norm": 1.71875, + "learning_rate": 8.251860439409877e-06, + "loss": 0.8687, + "step": 9771 + }, + { + "epoch": 1.6635660243907666, + "grad_norm": 1.84375, + "learning_rate": 8.250082348366343e-06, + "loss": 0.9425, + "step": 9772 + }, + { + "epoch": 1.6637374884797564, + "grad_norm": 1.6953125, + "learning_rate": 8.248304314394218e-06, + "loss": 0.8669, + "step": 9773 + }, + { + "epoch": 1.6639089525687463, + "grad_norm": 1.7109375, + "learning_rate": 8.246526337551482e-06, + "loss": 0.8023, + "step": 9774 + }, + { + "epoch": 1.664080416657736, + "grad_norm": 1.7109375, + "learning_rate": 8.244748417896126e-06, + "loss": 0.9566, + "step": 9775 + }, + { + "epoch": 1.6642518807467261, + "grad_norm": 1.703125, + "learning_rate": 8.242970555486131e-06, + "loss": 0.8965, + "step": 9776 + }, + { + "epoch": 1.664423344835716, + "grad_norm": 1.671875, + "learning_rate": 8.241192750379484e-06, + "loss": 0.8908, + "step": 9777 + }, + { + "epoch": 1.6645948089247058, + "grad_norm": 1.6953125, + "learning_rate": 8.239415002634166e-06, + "loss": 0.8447, + "step": 9778 + }, + { + "epoch": 1.6647662730136958, + "grad_norm": 1.640625, + "learning_rate": 8.237637312308154e-06, + "loss": 0.8, + "step": 9779 + }, + { + "epoch": 1.6649377371026857, + "grad_norm": 1.7109375, + "learning_rate": 8.235859679459427e-06, + "loss": 0.8898, + "step": 9780 + }, + { + "epoch": 1.6651092011916755, + "grad_norm": 1.671875, + "learning_rate": 8.234082104145956e-06, + "loss": 0.9017, + "step": 9781 + }, + { + "epoch": 1.6652806652806653, + "grad_norm": 1.78125, + "learning_rate": 8.232304586425717e-06, + "loss": 0.8863, + "step": 9782 + }, + { + "epoch": 1.6654521293696551, + "grad_norm": 1.625, + "learning_rate": 8.230527126356684e-06, + "loss": 0.8047, + "step": 9783 + }, + { + "epoch": 1.665623593458645, + "grad_norm": 1.7265625, + "learning_rate": 8.228749723996825e-06, + "loss": 0.9204, + "step": 9784 + }, + { + "epoch": 1.6657950575476348, + "grad_norm": 1.7734375, + "learning_rate": 8.226972379404108e-06, + "loss": 0.9076, + "step": 9785 + }, + { + "epoch": 1.6659665216366246, + "grad_norm": 1.6875, + "learning_rate": 8.225195092636497e-06, + "loss": 0.8587, + "step": 9786 + }, + { + "epoch": 1.6661379857256144, + "grad_norm": 1.671875, + "learning_rate": 8.223417863751956e-06, + "loss": 0.8356, + "step": 9787 + }, + { + "epoch": 1.6663094498146045, + "grad_norm": 1.6796875, + "learning_rate": 8.221640692808451e-06, + 
"loss": 0.8909, + "step": 9788 + }, + { + "epoch": 1.6664809139035943, + "grad_norm": 1.625, + "learning_rate": 8.219863579863939e-06, + "loss": 0.7785, + "step": 9789 + }, + { + "epoch": 1.6666523779925841, + "grad_norm": 1.6484375, + "learning_rate": 8.218086524976377e-06, + "loss": 0.8028, + "step": 9790 + }, + { + "epoch": 1.6668238420815742, + "grad_norm": 1.6328125, + "learning_rate": 8.216309528203726e-06, + "loss": 0.8187, + "step": 9791 + }, + { + "epoch": 1.666995306170564, + "grad_norm": 1.75, + "learning_rate": 8.214532589603936e-06, + "loss": 0.9681, + "step": 9792 + }, + { + "epoch": 1.6671667702595538, + "grad_norm": 1.734375, + "learning_rate": 8.212755709234962e-06, + "loss": 0.8375, + "step": 9793 + }, + { + "epoch": 1.6673382343485437, + "grad_norm": 1.78125, + "learning_rate": 8.210978887154753e-06, + "loss": 0.9124, + "step": 9794 + }, + { + "epoch": 1.6675096984375335, + "grad_norm": 1.6796875, + "learning_rate": 8.209202123421262e-06, + "loss": 0.8024, + "step": 9795 + }, + { + "epoch": 1.6676811625265233, + "grad_norm": 1.625, + "learning_rate": 8.207425418092438e-06, + "loss": 0.8746, + "step": 9796 + }, + { + "epoch": 1.6678526266155131, + "grad_norm": 1.6328125, + "learning_rate": 8.205648771226215e-06, + "loss": 0.8642, + "step": 9797 + }, + { + "epoch": 1.668024090704503, + "grad_norm": 1.8125, + "learning_rate": 8.203872182880544e-06, + "loss": 0.8817, + "step": 9798 + }, + { + "epoch": 1.6681955547934928, + "grad_norm": 1.6796875, + "learning_rate": 8.202095653113365e-06, + "loss": 0.8011, + "step": 9799 + }, + { + "epoch": 1.6683670188824828, + "grad_norm": 1.7109375, + "learning_rate": 8.200319181982615e-06, + "loss": 0.8085, + "step": 9800 + }, + { + "epoch": 1.6683670188824828, + "eval_loss": 0.8372142910957336, + "eval_runtime": 835.8053, + "eval_samples_per_second": 2.99, + "eval_steps_per_second": 2.99, + "step": 9800 + }, + { + "epoch": 1.6685384829714727, + "grad_norm": 1.7109375, + "learning_rate": 8.198542769546235e-06, + "loss": 0.8591, + "step": 9801 + }, + { + "epoch": 1.6687099470604625, + "grad_norm": 1.7578125, + "learning_rate": 8.196766415862157e-06, + "loss": 0.9123, + "step": 9802 + }, + { + "epoch": 1.6688814111494525, + "grad_norm": 1.6953125, + "learning_rate": 8.194990120988317e-06, + "loss": 0.7732, + "step": 9803 + }, + { + "epoch": 1.6690528752384424, + "grad_norm": 1.6796875, + "learning_rate": 8.193213884982648e-06, + "loss": 0.8743, + "step": 9804 + }, + { + "epoch": 1.6692243393274322, + "grad_norm": 1.8203125, + "learning_rate": 8.191437707903077e-06, + "loss": 0.9104, + "step": 9805 + }, + { + "epoch": 1.669395803416422, + "grad_norm": 1.7578125, + "learning_rate": 8.189661589807532e-06, + "loss": 0.9285, + "step": 9806 + }, + { + "epoch": 1.6695672675054118, + "grad_norm": 1.703125, + "learning_rate": 8.187885530753942e-06, + "loss": 0.7954, + "step": 9807 + }, + { + "epoch": 1.6697387315944017, + "grad_norm": 1.7109375, + "learning_rate": 8.186109530800231e-06, + "loss": 0.9223, + "step": 9808 + }, + { + "epoch": 1.6699101956833915, + "grad_norm": 1.7734375, + "learning_rate": 8.184333590004315e-06, + "loss": 0.8979, + "step": 9809 + }, + { + "epoch": 1.6700816597723813, + "grad_norm": 1.625, + "learning_rate": 8.182557708424116e-06, + "loss": 0.8499, + "step": 9810 + }, + { + "epoch": 1.6702531238613711, + "grad_norm": 1.6171875, + "learning_rate": 8.180781886117554e-06, + "loss": 0.8487, + "step": 9811 + }, + { + "epoch": 1.6704245879503612, + "grad_norm": 1.7421875, + "learning_rate": 8.179006123142548e-06, + "loss": 
0.8923, + "step": 9812 + }, + { + "epoch": 1.670596052039351, + "grad_norm": 1.6953125, + "learning_rate": 8.177230419557009e-06, + "loss": 0.8345, + "step": 9813 + }, + { + "epoch": 1.6707675161283408, + "grad_norm": 1.765625, + "learning_rate": 8.17545477541885e-06, + "loss": 0.9097, + "step": 9814 + }, + { + "epoch": 1.670938980217331, + "grad_norm": 1.7421875, + "learning_rate": 8.17367919078598e-06, + "loss": 0.8291, + "step": 9815 + }, + { + "epoch": 1.6711104443063207, + "grad_norm": 1.5703125, + "learning_rate": 8.17190366571631e-06, + "loss": 0.7673, + "step": 9816 + }, + { + "epoch": 1.6712819083953105, + "grad_norm": 1.6875, + "learning_rate": 8.170128200267745e-06, + "loss": 0.8326, + "step": 9817 + }, + { + "epoch": 1.6714533724843004, + "grad_norm": 1.6875, + "learning_rate": 8.16835279449819e-06, + "loss": 0.7833, + "step": 9818 + }, + { + "epoch": 1.6716248365732902, + "grad_norm": 1.828125, + "learning_rate": 8.16657744846555e-06, + "loss": 0.8009, + "step": 9819 + }, + { + "epoch": 1.67179630066228, + "grad_norm": 1.7265625, + "learning_rate": 8.164802162227722e-06, + "loss": 0.8816, + "step": 9820 + }, + { + "epoch": 1.6719677647512698, + "grad_norm": 1.6640625, + "learning_rate": 8.163026935842604e-06, + "loss": 0.8289, + "step": 9821 + }, + { + "epoch": 1.6721392288402597, + "grad_norm": 1.7109375, + "learning_rate": 8.161251769368098e-06, + "loss": 0.9218, + "step": 9822 + }, + { + "epoch": 1.6723106929292495, + "grad_norm": 1.703125, + "learning_rate": 8.159476662862094e-06, + "loss": 0.8668, + "step": 9823 + }, + { + "epoch": 1.6724821570182395, + "grad_norm": 1.6796875, + "learning_rate": 8.157701616382487e-06, + "loss": 0.8952, + "step": 9824 + }, + { + "epoch": 1.6726536211072294, + "grad_norm": 1.71875, + "learning_rate": 8.155926629987169e-06, + "loss": 0.9246, + "step": 9825 + }, + { + "epoch": 1.6728250851962192, + "grad_norm": 1.7265625, + "learning_rate": 8.154151703734026e-06, + "loss": 0.8608, + "step": 9826 + }, + { + "epoch": 1.6729965492852092, + "grad_norm": 1.7890625, + "learning_rate": 8.152376837680947e-06, + "loss": 0.8636, + "step": 9827 + }, + { + "epoch": 1.673168013374199, + "grad_norm": 1.75, + "learning_rate": 8.150602031885818e-06, + "loss": 0.9174, + "step": 9828 + }, + { + "epoch": 1.673339477463189, + "grad_norm": 1.65625, + "learning_rate": 8.14882728640652e-06, + "loss": 0.8828, + "step": 9829 + }, + { + "epoch": 1.6735109415521787, + "grad_norm": 1.65625, + "learning_rate": 8.147052601300936e-06, + "loss": 0.8424, + "step": 9830 + }, + { + "epoch": 1.6736824056411685, + "grad_norm": 1.7421875, + "learning_rate": 8.145277976626945e-06, + "loss": 0.8268, + "step": 9831 + }, + { + "epoch": 1.6738538697301584, + "grad_norm": 1.671875, + "learning_rate": 8.143503412442423e-06, + "loss": 0.9055, + "step": 9832 + }, + { + "epoch": 1.6740253338191482, + "grad_norm": 1.7109375, + "learning_rate": 8.141728908805244e-06, + "loss": 0.854, + "step": 9833 + }, + { + "epoch": 1.674196797908138, + "grad_norm": 1.71875, + "learning_rate": 8.139954465773283e-06, + "loss": 0.8855, + "step": 9834 + }, + { + "epoch": 1.6743682619971278, + "grad_norm": 1.734375, + "learning_rate": 8.138180083404412e-06, + "loss": 0.8924, + "step": 9835 + }, + { + "epoch": 1.6745397260861177, + "grad_norm": 1.6640625, + "learning_rate": 8.1364057617565e-06, + "loss": 0.8561, + "step": 9836 + }, + { + "epoch": 1.6747111901751077, + "grad_norm": 1.7265625, + "learning_rate": 8.134631500887412e-06, + "loss": 0.9067, + "step": 9837 + }, + { + "epoch": 1.6748826542640975, + 
"grad_norm": 1.8359375, + "learning_rate": 8.132857300855016e-06, + "loss": 0.8568, + "step": 9838 + }, + { + "epoch": 1.6750541183530874, + "grad_norm": 1.7578125, + "learning_rate": 8.131083161717175e-06, + "loss": 0.8184, + "step": 9839 + }, + { + "epoch": 1.6752255824420774, + "grad_norm": 1.6328125, + "learning_rate": 8.129309083531746e-06, + "loss": 0.8091, + "step": 9840 + }, + { + "epoch": 1.6753970465310672, + "grad_norm": 1.7109375, + "learning_rate": 8.127535066356595e-06, + "loss": 0.8772, + "step": 9841 + }, + { + "epoch": 1.675568510620057, + "grad_norm": 1.5859375, + "learning_rate": 8.12576111024958e-06, + "loss": 0.7298, + "step": 9842 + }, + { + "epoch": 1.675739974709047, + "grad_norm": 1.5625, + "learning_rate": 8.123987215268551e-06, + "loss": 0.806, + "step": 9843 + }, + { + "epoch": 1.6759114387980367, + "grad_norm": 1.640625, + "learning_rate": 8.122213381471363e-06, + "loss": 0.8501, + "step": 9844 + }, + { + "epoch": 1.6760829028870265, + "grad_norm": 1.7265625, + "learning_rate": 8.120439608915866e-06, + "loss": 0.8018, + "step": 9845 + }, + { + "epoch": 1.6762543669760164, + "grad_norm": 1.78125, + "learning_rate": 8.118665897659912e-06, + "loss": 0.8389, + "step": 9846 + }, + { + "epoch": 1.6764258310650062, + "grad_norm": 1.7578125, + "learning_rate": 8.116892247761348e-06, + "loss": 0.8466, + "step": 9847 + }, + { + "epoch": 1.676597295153996, + "grad_norm": 1.703125, + "learning_rate": 8.115118659278019e-06, + "loss": 0.8985, + "step": 9848 + }, + { + "epoch": 1.676768759242986, + "grad_norm": 1.7578125, + "learning_rate": 8.113345132267765e-06, + "loss": 0.8728, + "step": 9849 + }, + { + "epoch": 1.676940223331976, + "grad_norm": 1.7421875, + "learning_rate": 8.111571666788433e-06, + "loss": 0.7813, + "step": 9850 + }, + { + "epoch": 1.6771116874209657, + "grad_norm": 1.671875, + "learning_rate": 8.10979826289786e-06, + "loss": 0.779, + "step": 9851 + }, + { + "epoch": 1.6772831515099558, + "grad_norm": 1.765625, + "learning_rate": 8.108024920653885e-06, + "loss": 0.8168, + "step": 9852 + }, + { + "epoch": 1.6774546155989456, + "grad_norm": 1.6484375, + "learning_rate": 8.106251640114342e-06, + "loss": 0.8528, + "step": 9853 + }, + { + "epoch": 1.6776260796879354, + "grad_norm": 1.71875, + "learning_rate": 8.104478421337065e-06, + "loss": 0.8275, + "step": 9854 + }, + { + "epoch": 1.6777975437769252, + "grad_norm": 1.578125, + "learning_rate": 8.102705264379884e-06, + "loss": 0.7311, + "step": 9855 + }, + { + "epoch": 1.677969007865915, + "grad_norm": 1.78125, + "learning_rate": 8.100932169300627e-06, + "loss": 0.9246, + "step": 9856 + }, + { + "epoch": 1.678140471954905, + "grad_norm": 1.640625, + "learning_rate": 8.099159136157122e-06, + "loss": 0.8039, + "step": 9857 + }, + { + "epoch": 1.6783119360438947, + "grad_norm": 1.6328125, + "learning_rate": 8.097386165007197e-06, + "loss": 0.8484, + "step": 9858 + }, + { + "epoch": 1.6784834001328846, + "grad_norm": 1.7578125, + "learning_rate": 8.095613255908674e-06, + "loss": 0.878, + "step": 9859 + }, + { + "epoch": 1.6786548642218744, + "grad_norm": 1.6953125, + "learning_rate": 8.093840408919373e-06, + "loss": 0.9227, + "step": 9860 + }, + { + "epoch": 1.6788263283108644, + "grad_norm": 1.828125, + "learning_rate": 8.092067624097116e-06, + "loss": 0.9157, + "step": 9861 + }, + { + "epoch": 1.6789977923998542, + "grad_norm": 1.6484375, + "learning_rate": 8.090294901499718e-06, + "loss": 0.8615, + "step": 9862 + }, + { + "epoch": 1.679169256488844, + "grad_norm": 1.7109375, + "learning_rate": 
8.088522241184992e-06, + "loss": 0.9062, + "step": 9863 + }, + { + "epoch": 1.6793407205778341, + "grad_norm": 1.75, + "learning_rate": 8.086749643210758e-06, + "loss": 0.8219, + "step": 9864 + }, + { + "epoch": 1.679512184666824, + "grad_norm": 1.7109375, + "learning_rate": 8.08497710763482e-06, + "loss": 0.8179, + "step": 9865 + }, + { + "epoch": 1.6796836487558138, + "grad_norm": 1.671875, + "learning_rate": 8.083204634514991e-06, + "loss": 0.7425, + "step": 9866 + }, + { + "epoch": 1.6798551128448036, + "grad_norm": 1.71875, + "learning_rate": 8.081432223909076e-06, + "loss": 0.8082, + "step": 9867 + }, + { + "epoch": 1.6800265769337934, + "grad_norm": 1.6640625, + "learning_rate": 8.079659875874883e-06, + "loss": 0.8319, + "step": 9868 + }, + { + "epoch": 1.6801980410227833, + "grad_norm": 1.8203125, + "learning_rate": 8.07788759047021e-06, + "loss": 0.8149, + "step": 9869 + }, + { + "epoch": 1.680369505111773, + "grad_norm": 1.7890625, + "learning_rate": 8.07611536775286e-06, + "loss": 0.8184, + "step": 9870 + }, + { + "epoch": 1.680540969200763, + "grad_norm": 1.7578125, + "learning_rate": 8.074343207780635e-06, + "loss": 0.8821, + "step": 9871 + }, + { + "epoch": 1.6807124332897527, + "grad_norm": 1.640625, + "learning_rate": 8.072571110611329e-06, + "loss": 0.8301, + "step": 9872 + }, + { + "epoch": 1.6808838973787428, + "grad_norm": 1.7421875, + "learning_rate": 8.070799076302735e-06, + "loss": 0.9121, + "step": 9873 + }, + { + "epoch": 1.6810553614677326, + "grad_norm": 1.65625, + "learning_rate": 8.06902710491265e-06, + "loss": 0.8864, + "step": 9874 + }, + { + "epoch": 1.6812268255567224, + "grad_norm": 1.671875, + "learning_rate": 8.067255196498862e-06, + "loss": 0.8234, + "step": 9875 + }, + { + "epoch": 1.6813982896457125, + "grad_norm": 1.796875, + "learning_rate": 8.06548335111916e-06, + "loss": 0.9018, + "step": 9876 + }, + { + "epoch": 1.6815697537347023, + "grad_norm": 1.7421875, + "learning_rate": 8.063711568831332e-06, + "loss": 0.9811, + "step": 9877 + }, + { + "epoch": 1.6817412178236921, + "grad_norm": 1.640625, + "learning_rate": 8.06193984969316e-06, + "loss": 0.8669, + "step": 9878 + }, + { + "epoch": 1.681912681912682, + "grad_norm": 1.7265625, + "learning_rate": 8.060168193762428e-06, + "loss": 0.89, + "step": 9879 + }, + { + "epoch": 1.6820841460016718, + "grad_norm": 1.6875, + "learning_rate": 8.058396601096914e-06, + "loss": 0.8921, + "step": 9880 + }, + { + "epoch": 1.6822556100906616, + "grad_norm": 1.6328125, + "learning_rate": 8.0566250717544e-06, + "loss": 0.8404, + "step": 9881 + }, + { + "epoch": 1.6824270741796514, + "grad_norm": 1.7578125, + "learning_rate": 8.05485360579266e-06, + "loss": 0.9198, + "step": 9882 + }, + { + "epoch": 1.6825985382686413, + "grad_norm": 1.6015625, + "learning_rate": 8.053082203269467e-06, + "loss": 0.8214, + "step": 9883 + }, + { + "epoch": 1.682770002357631, + "grad_norm": 1.671875, + "learning_rate": 8.051310864242598e-06, + "loss": 0.853, + "step": 9884 + }, + { + "epoch": 1.6829414664466211, + "grad_norm": 1.6875, + "learning_rate": 8.049539588769816e-06, + "loss": 0.8835, + "step": 9885 + }, + { + "epoch": 1.683112930535611, + "grad_norm": 1.7265625, + "learning_rate": 8.047768376908896e-06, + "loss": 0.7677, + "step": 9886 + }, + { + "epoch": 1.6832843946246008, + "grad_norm": 1.6640625, + "learning_rate": 8.045997228717597e-06, + "loss": 0.8603, + "step": 9887 + }, + { + "epoch": 1.6834558587135908, + "grad_norm": 1.6953125, + "learning_rate": 8.04422614425369e-06, + "loss": 0.9081, + "step": 9888 + }, + { + 
"epoch": 1.6836273228025807, + "grad_norm": 1.7578125, + "learning_rate": 8.042455123574936e-06, + "loss": 0.8705, + "step": 9889 + }, + { + "epoch": 1.6837987868915705, + "grad_norm": 1.640625, + "learning_rate": 8.040684166739088e-06, + "loss": 0.8418, + "step": 9890 + }, + { + "epoch": 1.6839702509805603, + "grad_norm": 1.7265625, + "learning_rate": 8.038913273803906e-06, + "loss": 0.8933, + "step": 9891 + }, + { + "epoch": 1.6841417150695501, + "grad_norm": 1.6640625, + "learning_rate": 8.03714244482715e-06, + "loss": 0.9062, + "step": 9892 + }, + { + "epoch": 1.68431317915854, + "grad_norm": 1.5546875, + "learning_rate": 8.035371679866569e-06, + "loss": 0.7517, + "step": 9893 + }, + { + "epoch": 1.6844846432475298, + "grad_norm": 1.671875, + "learning_rate": 8.033600978979913e-06, + "loss": 0.826, + "step": 9894 + }, + { + "epoch": 1.6846561073365196, + "grad_norm": 1.7265625, + "learning_rate": 8.031830342224935e-06, + "loss": 0.8717, + "step": 9895 + }, + { + "epoch": 1.6848275714255094, + "grad_norm": 1.6875, + "learning_rate": 8.030059769659382e-06, + "loss": 0.8776, + "step": 9896 + }, + { + "epoch": 1.6849990355144995, + "grad_norm": 1.6484375, + "learning_rate": 8.028289261340998e-06, + "loss": 0.8231, + "step": 9897 + }, + { + "epoch": 1.6851704996034893, + "grad_norm": 1.640625, + "learning_rate": 8.026518817327527e-06, + "loss": 0.8734, + "step": 9898 + }, + { + "epoch": 1.6853419636924791, + "grad_norm": 1.6875, + "learning_rate": 8.024748437676707e-06, + "loss": 0.8245, + "step": 9899 + }, + { + "epoch": 1.6855134277814692, + "grad_norm": 1.71875, + "learning_rate": 8.022978122446284e-06, + "loss": 0.8654, + "step": 9900 + }, + { + "epoch": 1.685684891870459, + "grad_norm": 1.703125, + "learning_rate": 8.021207871693984e-06, + "loss": 0.8636, + "step": 9901 + }, + { + "epoch": 1.6858563559594488, + "grad_norm": 1.6015625, + "learning_rate": 8.019437685477548e-06, + "loss": 0.9033, + "step": 9902 + }, + { + "epoch": 1.6860278200484387, + "grad_norm": 1.75, + "learning_rate": 8.017667563854706e-06, + "loss": 0.9517, + "step": 9903 + }, + { + "epoch": 1.6861992841374285, + "grad_norm": 1.6484375, + "learning_rate": 8.015897506883188e-06, + "loss": 0.8501, + "step": 9904 + }, + { + "epoch": 1.6863707482264183, + "grad_norm": 1.625, + "learning_rate": 8.014127514620726e-06, + "loss": 0.828, + "step": 9905 + }, + { + "epoch": 1.6865422123154081, + "grad_norm": 1.765625, + "learning_rate": 8.012357587125043e-06, + "loss": 0.8238, + "step": 9906 + }, + { + "epoch": 1.686713676404398, + "grad_norm": 1.734375, + "learning_rate": 8.010587724453865e-06, + "loss": 0.8278, + "step": 9907 + }, + { + "epoch": 1.6868851404933878, + "grad_norm": 1.78125, + "learning_rate": 8.008817926664912e-06, + "loss": 0.8806, + "step": 9908 + }, + { + "epoch": 1.6870566045823778, + "grad_norm": 1.6796875, + "learning_rate": 8.007048193815905e-06, + "loss": 0.8198, + "step": 9909 + }, + { + "epoch": 1.6872280686713677, + "grad_norm": 1.578125, + "learning_rate": 8.005278525964562e-06, + "loss": 0.7713, + "step": 9910 + }, + { + "epoch": 1.6873995327603575, + "grad_norm": 1.7109375, + "learning_rate": 8.003508923168596e-06, + "loss": 0.8293, + "step": 9911 + }, + { + "epoch": 1.6875709968493475, + "grad_norm": 1.7109375, + "learning_rate": 8.001739385485724e-06, + "loss": 0.8553, + "step": 9912 + }, + { + "epoch": 1.6877424609383374, + "grad_norm": 1.7109375, + "learning_rate": 7.999969912973656e-06, + "loss": 0.7869, + "step": 9913 + }, + { + "epoch": 1.6879139250273272, + "grad_norm": 1.65625, + 
"learning_rate": 7.998200505690097e-06, + "loss": 0.7464, + "step": 9914 + }, + { + "epoch": 1.688085389116317, + "grad_norm": 1.625, + "learning_rate": 7.99643116369276e-06, + "loss": 0.8032, + "step": 9915 + }, + { + "epoch": 1.6882568532053068, + "grad_norm": 1.6796875, + "learning_rate": 7.994661887039347e-06, + "loss": 0.8418, + "step": 9916 + }, + { + "epoch": 1.6884283172942967, + "grad_norm": 1.6875, + "learning_rate": 7.99289267578756e-06, + "loss": 0.7836, + "step": 9917 + }, + { + "epoch": 1.6885997813832865, + "grad_norm": 1.7734375, + "learning_rate": 7.991123529995102e-06, + "loss": 0.872, + "step": 9918 + }, + { + "epoch": 1.6887712454722763, + "grad_norm": 1.7265625, + "learning_rate": 7.989354449719671e-06, + "loss": 0.8624, + "step": 9919 + }, + { + "epoch": 1.6889427095612661, + "grad_norm": 1.7734375, + "learning_rate": 7.987585435018963e-06, + "loss": 0.8893, + "step": 9920 + }, + { + "epoch": 1.6891141736502562, + "grad_norm": 1.6953125, + "learning_rate": 7.98581648595067e-06, + "loss": 0.8371, + "step": 9921 + }, + { + "epoch": 1.689285637739246, + "grad_norm": 1.7578125, + "learning_rate": 7.984047602572486e-06, + "loss": 0.9117, + "step": 9922 + }, + { + "epoch": 1.6894571018282358, + "grad_norm": 1.734375, + "learning_rate": 7.982278784942106e-06, + "loss": 0.8182, + "step": 9923 + }, + { + "epoch": 1.6896285659172259, + "grad_norm": 1.78125, + "learning_rate": 7.980510033117208e-06, + "loss": 0.857, + "step": 9924 + }, + { + "epoch": 1.6898000300062157, + "grad_norm": 1.578125, + "learning_rate": 7.978741347155484e-06, + "loss": 0.8085, + "step": 9925 + }, + { + "epoch": 1.6899714940952055, + "grad_norm": 1.6484375, + "learning_rate": 7.976972727114615e-06, + "loss": 0.8537, + "step": 9926 + }, + { + "epoch": 1.6901429581841954, + "grad_norm": 1.78125, + "learning_rate": 7.975204173052284e-06, + "loss": 0.8811, + "step": 9927 + }, + { + "epoch": 1.6903144222731852, + "grad_norm": 1.7109375, + "learning_rate": 7.973435685026171e-06, + "loss": 0.8205, + "step": 9928 + }, + { + "epoch": 1.690485886362175, + "grad_norm": 1.671875, + "learning_rate": 7.97166726309395e-06, + "loss": 0.8305, + "step": 9929 + }, + { + "epoch": 1.6906573504511648, + "grad_norm": 1.6953125, + "learning_rate": 7.969898907313298e-06, + "loss": 0.8199, + "step": 9930 + }, + { + "epoch": 1.6908288145401547, + "grad_norm": 1.6953125, + "learning_rate": 7.968130617741887e-06, + "loss": 0.7937, + "step": 9931 + }, + { + "epoch": 1.6910002786291445, + "grad_norm": 1.71875, + "learning_rate": 7.966362394437389e-06, + "loss": 0.7968, + "step": 9932 + }, + { + "epoch": 1.6911717427181343, + "grad_norm": 1.6875, + "learning_rate": 7.964594237457469e-06, + "loss": 0.7697, + "step": 9933 + }, + { + "epoch": 1.6913432068071244, + "grad_norm": 1.6328125, + "learning_rate": 7.962826146859794e-06, + "loss": 0.8717, + "step": 9934 + }, + { + "epoch": 1.6915146708961142, + "grad_norm": 1.8671875, + "learning_rate": 7.961058122702037e-06, + "loss": 0.8174, + "step": 9935 + }, + { + "epoch": 1.691686134985104, + "grad_norm": 1.7890625, + "learning_rate": 7.959290165041848e-06, + "loss": 0.8491, + "step": 9936 + }, + { + "epoch": 1.691857599074094, + "grad_norm": 1.65625, + "learning_rate": 7.957522273936892e-06, + "loss": 0.8569, + "step": 9937 + }, + { + "epoch": 1.6920290631630839, + "grad_norm": 1.65625, + "learning_rate": 7.955754449444827e-06, + "loss": 0.8288, + "step": 9938 + }, + { + "epoch": 1.6922005272520737, + "grad_norm": 1.6640625, + "learning_rate": 7.953986691623305e-06, + "loss": 0.7834, + 
"step": 9939 + }, + { + "epoch": 1.6923719913410635, + "grad_norm": 1.734375, + "learning_rate": 7.952219000529982e-06, + "loss": 0.8607, + "step": 9940 + }, + { + "epoch": 1.6925434554300534, + "grad_norm": 1.703125, + "learning_rate": 7.950451376222508e-06, + "loss": 0.8376, + "step": 9941 + }, + { + "epoch": 1.6927149195190432, + "grad_norm": 1.7109375, + "learning_rate": 7.948683818758531e-06, + "loss": 0.7824, + "step": 9942 + }, + { + "epoch": 1.692886383608033, + "grad_norm": 1.6875, + "learning_rate": 7.946916328195701e-06, + "loss": 0.8443, + "step": 9943 + }, + { + "epoch": 1.6930578476970228, + "grad_norm": 1.6328125, + "learning_rate": 7.945148904591663e-06, + "loss": 0.8699, + "step": 9944 + }, + { + "epoch": 1.6932293117860127, + "grad_norm": 1.703125, + "learning_rate": 7.943381548004054e-06, + "loss": 0.8301, + "step": 9945 + }, + { + "epoch": 1.6934007758750027, + "grad_norm": 1.796875, + "learning_rate": 7.941614258490524e-06, + "loss": 0.9083, + "step": 9946 + }, + { + "epoch": 1.6935722399639925, + "grad_norm": 1.65625, + "learning_rate": 7.939847036108698e-06, + "loss": 0.7606, + "step": 9947 + }, + { + "epoch": 1.6937437040529824, + "grad_norm": 1.6796875, + "learning_rate": 7.938079880916219e-06, + "loss": 0.8062, + "step": 9948 + }, + { + "epoch": 1.6939151681419724, + "grad_norm": 1.640625, + "learning_rate": 7.936312792970719e-06, + "loss": 0.8749, + "step": 9949 + }, + { + "epoch": 1.6940866322309622, + "grad_norm": 1.625, + "learning_rate": 7.934545772329828e-06, + "loss": 0.8071, + "step": 9950 + }, + { + "epoch": 1.694258096319952, + "grad_norm": 1.703125, + "learning_rate": 7.93277881905118e-06, + "loss": 0.8501, + "step": 9951 + }, + { + "epoch": 1.6944295604089419, + "grad_norm": 1.8671875, + "learning_rate": 7.931011933192398e-06, + "loss": 0.8819, + "step": 9952 + }, + { + "epoch": 1.6946010244979317, + "grad_norm": 1.7109375, + "learning_rate": 7.929245114811108e-06, + "loss": 0.7806, + "step": 9953 + }, + { + "epoch": 1.6947724885869215, + "grad_norm": 1.609375, + "learning_rate": 7.927478363964933e-06, + "loss": 0.8344, + "step": 9954 + }, + { + "epoch": 1.6949439526759114, + "grad_norm": 1.8203125, + "learning_rate": 7.925711680711493e-06, + "loss": 0.9316, + "step": 9955 + }, + { + "epoch": 1.6951154167649012, + "grad_norm": 1.671875, + "learning_rate": 7.923945065108406e-06, + "loss": 0.9082, + "step": 9956 + }, + { + "epoch": 1.695286880853891, + "grad_norm": 1.7265625, + "learning_rate": 7.922178517213288e-06, + "loss": 0.8015, + "step": 9957 + }, + { + "epoch": 1.695458344942881, + "grad_norm": 1.734375, + "learning_rate": 7.920412037083757e-06, + "loss": 0.9053, + "step": 9958 + }, + { + "epoch": 1.6956298090318709, + "grad_norm": 1.6875, + "learning_rate": 7.918645624777415e-06, + "loss": 0.9377, + "step": 9959 + }, + { + "epoch": 1.6958012731208607, + "grad_norm": 1.7734375, + "learning_rate": 7.916879280351878e-06, + "loss": 0.8941, + "step": 9960 + }, + { + "epoch": 1.6959727372098508, + "grad_norm": 1.625, + "learning_rate": 7.915113003864753e-06, + "loss": 0.8981, + "step": 9961 + }, + { + "epoch": 1.6961442012988406, + "grad_norm": 1.734375, + "learning_rate": 7.913346795373643e-06, + "loss": 0.8599, + "step": 9962 + }, + { + "epoch": 1.6963156653878304, + "grad_norm": 1.6328125, + "learning_rate": 7.911580654936152e-06, + "loss": 0.7965, + "step": 9963 + }, + { + "epoch": 1.6964871294768202, + "grad_norm": 1.6640625, + "learning_rate": 7.90981458260988e-06, + "loss": 0.8015, + "step": 9964 + }, + { + "epoch": 1.69665859356581, + 
"grad_norm": 1.734375, + "learning_rate": 7.908048578452426e-06, + "loss": 0.9537, + "step": 9965 + }, + { + "epoch": 1.6968300576547999, + "grad_norm": 1.6953125, + "learning_rate": 7.906282642521384e-06, + "loss": 0.9249, + "step": 9966 + }, + { + "epoch": 1.6970015217437897, + "grad_norm": 1.6484375, + "learning_rate": 7.90451677487435e-06, + "loss": 0.8379, + "step": 9967 + }, + { + "epoch": 1.6971729858327795, + "grad_norm": 1.8203125, + "learning_rate": 7.902750975568914e-06, + "loss": 0.9602, + "step": 9968 + }, + { + "epoch": 1.6973444499217694, + "grad_norm": 1.6875, + "learning_rate": 7.90098524466267e-06, + "loss": 0.8189, + "step": 9969 + }, + { + "epoch": 1.6975159140107594, + "grad_norm": 1.7421875, + "learning_rate": 7.899219582213198e-06, + "loss": 0.8329, + "step": 9970 + }, + { + "epoch": 1.6976873780997492, + "grad_norm": 1.75, + "learning_rate": 7.897453988278087e-06, + "loss": 0.8815, + "step": 9971 + }, + { + "epoch": 1.697858842188739, + "grad_norm": 1.625, + "learning_rate": 7.895688462914919e-06, + "loss": 0.7879, + "step": 9972 + }, + { + "epoch": 1.698030306277729, + "grad_norm": 1.6328125, + "learning_rate": 7.893923006181274e-06, + "loss": 0.8663, + "step": 9973 + }, + { + "epoch": 1.698201770366719, + "grad_norm": 1.6484375, + "learning_rate": 7.892157618134729e-06, + "loss": 0.8129, + "step": 9974 + }, + { + "epoch": 1.6983732344557088, + "grad_norm": 1.6015625, + "learning_rate": 7.890392298832863e-06, + "loss": 0.8256, + "step": 9975 + }, + { + "epoch": 1.6985446985446986, + "grad_norm": 1.6796875, + "learning_rate": 7.888627048333248e-06, + "loss": 0.7792, + "step": 9976 + }, + { + "epoch": 1.6987161626336884, + "grad_norm": 1.7421875, + "learning_rate": 7.886861866693457e-06, + "loss": 0.8493, + "step": 9977 + }, + { + "epoch": 1.6988876267226782, + "grad_norm": 1.6796875, + "learning_rate": 7.885096753971056e-06, + "loss": 0.87, + "step": 9978 + }, + { + "epoch": 1.699059090811668, + "grad_norm": 1.75, + "learning_rate": 7.883331710223614e-06, + "loss": 0.9304, + "step": 9979 + }, + { + "epoch": 1.6992305549006579, + "grad_norm": 1.9453125, + "learning_rate": 7.881566735508696e-06, + "loss": 0.8853, + "step": 9980 + }, + { + "epoch": 1.6994020189896477, + "grad_norm": 1.7578125, + "learning_rate": 7.879801829883867e-06, + "loss": 0.8327, + "step": 9981 + }, + { + "epoch": 1.6995734830786378, + "grad_norm": 1.640625, + "learning_rate": 7.87803699340668e-06, + "loss": 0.7743, + "step": 9982 + }, + { + "epoch": 1.6997449471676276, + "grad_norm": 1.6875, + "learning_rate": 7.876272226134698e-06, + "loss": 0.8026, + "step": 9983 + }, + { + "epoch": 1.6999164112566174, + "grad_norm": 1.6953125, + "learning_rate": 7.874507528125476e-06, + "loss": 0.8312, + "step": 9984 + }, + { + "epoch": 1.7000878753456075, + "grad_norm": 1.65625, + "learning_rate": 7.872742899436568e-06, + "loss": 0.809, + "step": 9985 + }, + { + "epoch": 1.7002593394345973, + "grad_norm": 1.7421875, + "learning_rate": 7.870978340125524e-06, + "loss": 0.8874, + "step": 9986 + }, + { + "epoch": 1.700430803523587, + "grad_norm": 1.6171875, + "learning_rate": 7.869213850249895e-06, + "loss": 0.861, + "step": 9987 + }, + { + "epoch": 1.700602267612577, + "grad_norm": 1.75, + "learning_rate": 7.867449429867224e-06, + "loss": 0.9106, + "step": 9988 + }, + { + "epoch": 1.7007737317015668, + "grad_norm": 1.6328125, + "learning_rate": 7.865685079035058e-06, + "loss": 0.8218, + "step": 9989 + }, + { + "epoch": 1.7009451957905566, + "grad_norm": 1.65625, + "learning_rate": 7.863920797810938e-06, + 
"loss": 0.8162, + "step": 9990 + }, + { + "epoch": 1.7011166598795464, + "grad_norm": 1.6484375, + "learning_rate": 7.862156586252405e-06, + "loss": 0.8495, + "step": 9991 + }, + { + "epoch": 1.7012881239685362, + "grad_norm": 1.6796875, + "learning_rate": 7.860392444417001e-06, + "loss": 0.7733, + "step": 9992 + }, + { + "epoch": 1.701459588057526, + "grad_norm": 1.6484375, + "learning_rate": 7.858628372362253e-06, + "loss": 0.8968, + "step": 9993 + }, + { + "epoch": 1.701631052146516, + "grad_norm": 1.71875, + "learning_rate": 7.856864370145696e-06, + "loss": 0.7907, + "step": 9994 + }, + { + "epoch": 1.701802516235506, + "grad_norm": 1.75, + "learning_rate": 7.855100437824863e-06, + "loss": 0.8403, + "step": 9995 + }, + { + "epoch": 1.7019739803244958, + "grad_norm": 1.6953125, + "learning_rate": 7.853336575457281e-06, + "loss": 0.8308, + "step": 9996 + }, + { + "epoch": 1.7021454444134858, + "grad_norm": 1.78125, + "learning_rate": 7.851572783100475e-06, + "loss": 0.943, + "step": 9997 + }, + { + "epoch": 1.7023169085024756, + "grad_norm": 1.75, + "learning_rate": 7.849809060811973e-06, + "loss": 0.8719, + "step": 9998 + }, + { + "epoch": 1.7024883725914655, + "grad_norm": 1.71875, + "learning_rate": 7.848045408649295e-06, + "loss": 0.8588, + "step": 9999 + }, + { + "epoch": 1.7026598366804553, + "grad_norm": 1.6328125, + "learning_rate": 7.84628182666996e-06, + "loss": 0.82, + "step": 10000 + }, + { + "epoch": 1.702831300769445, + "grad_norm": 1.6875, + "learning_rate": 7.844518314931483e-06, + "loss": 0.901, + "step": 10001 + }, + { + "epoch": 1.703002764858435, + "grad_norm": 1.8125, + "learning_rate": 7.842754873491381e-06, + "loss": 0.7771, + "step": 10002 + }, + { + "epoch": 1.7031742289474248, + "grad_norm": 1.640625, + "learning_rate": 7.840991502407168e-06, + "loss": 0.8143, + "step": 10003 + }, + { + "epoch": 1.7033456930364146, + "grad_norm": 1.765625, + "learning_rate": 7.839228201736354e-06, + "loss": 0.8319, + "step": 10004 + }, + { + "epoch": 1.7035171571254044, + "grad_norm": 1.7421875, + "learning_rate": 7.837464971536442e-06, + "loss": 0.8533, + "step": 10005 + }, + { + "epoch": 1.7036886212143945, + "grad_norm": 1.6640625, + "learning_rate": 7.835701811864937e-06, + "loss": 0.8322, + "step": 10006 + }, + { + "epoch": 1.7038600853033843, + "grad_norm": 1.6875, + "learning_rate": 7.83393872277935e-06, + "loss": 0.9125, + "step": 10007 + }, + { + "epoch": 1.7040315493923741, + "grad_norm": 1.6875, + "learning_rate": 7.832175704337177e-06, + "loss": 0.8254, + "step": 10008 + }, + { + "epoch": 1.7042030134813642, + "grad_norm": 1.6484375, + "learning_rate": 7.830412756595917e-06, + "loss": 0.8034, + "step": 10009 + }, + { + "epoch": 1.704374477570354, + "grad_norm": 1.59375, + "learning_rate": 7.828649879613069e-06, + "loss": 0.7845, + "step": 10010 + }, + { + "epoch": 1.7045459416593438, + "grad_norm": 1.703125, + "learning_rate": 7.826887073446122e-06, + "loss": 0.8672, + "step": 10011 + }, + { + "epoch": 1.7047174057483336, + "grad_norm": 1.7265625, + "learning_rate": 7.825124338152574e-06, + "loss": 0.8432, + "step": 10012 + }, + { + "epoch": 1.7048888698373235, + "grad_norm": 1.609375, + "learning_rate": 7.823361673789909e-06, + "loss": 0.7993, + "step": 10013 + }, + { + "epoch": 1.7050603339263133, + "grad_norm": 1.65625, + "learning_rate": 7.821599080415618e-06, + "loss": 0.8738, + "step": 10014 + }, + { + "epoch": 1.7052317980153031, + "grad_norm": 1.71875, + "learning_rate": 7.81983655808718e-06, + "loss": 0.8296, + "step": 10015 + }, + { + "epoch": 
1.705403262104293, + "grad_norm": 1.640625, + "learning_rate": 7.818074106862088e-06, + "loss": 0.8741, + "step": 10016 + }, + { + "epoch": 1.7055747261932828, + "grad_norm": 1.734375, + "learning_rate": 7.816311726797813e-06, + "loss": 0.8226, + "step": 10017 + }, + { + "epoch": 1.7057461902822728, + "grad_norm": 1.734375, + "learning_rate": 7.814549417951835e-06, + "loss": 0.8786, + "step": 10018 + }, + { + "epoch": 1.7059176543712626, + "grad_norm": 1.640625, + "learning_rate": 7.81278718038163e-06, + "loss": 0.8289, + "step": 10019 + }, + { + "epoch": 1.7060891184602525, + "grad_norm": 1.609375, + "learning_rate": 7.811025014144671e-06, + "loss": 0.7693, + "step": 10020 + }, + { + "epoch": 1.7062605825492423, + "grad_norm": 1.765625, + "learning_rate": 7.809262919298428e-06, + "loss": 0.8542, + "step": 10021 + }, + { + "epoch": 1.7064320466382323, + "grad_norm": 1.7890625, + "learning_rate": 7.807500895900373e-06, + "loss": 0.9365, + "step": 10022 + }, + { + "epoch": 1.7066035107272222, + "grad_norm": 1.671875, + "learning_rate": 7.805738944007968e-06, + "loss": 0.9101, + "step": 10023 + }, + { + "epoch": 1.706774974816212, + "grad_norm": 1.6796875, + "learning_rate": 7.803977063678682e-06, + "loss": 0.8732, + "step": 10024 + }, + { + "epoch": 1.7069464389052018, + "grad_norm": 1.7578125, + "learning_rate": 7.80221525496997e-06, + "loss": 0.8287, + "step": 10025 + }, + { + "epoch": 1.7071179029941916, + "grad_norm": 1.7578125, + "learning_rate": 7.800453517939298e-06, + "loss": 0.8571, + "step": 10026 + }, + { + "epoch": 1.7072893670831815, + "grad_norm": 1.7109375, + "learning_rate": 7.798691852644118e-06, + "loss": 0.8613, + "step": 10027 + }, + { + "epoch": 1.7074608311721713, + "grad_norm": 1.6953125, + "learning_rate": 7.796930259141885e-06, + "loss": 0.8497, + "step": 10028 + }, + { + "epoch": 1.7076322952611611, + "grad_norm": 1.625, + "learning_rate": 7.795168737490054e-06, + "loss": 0.7839, + "step": 10029 + }, + { + "epoch": 1.707803759350151, + "grad_norm": 1.6484375, + "learning_rate": 7.793407287746072e-06, + "loss": 0.9061, + "step": 10030 + }, + { + "epoch": 1.707975223439141, + "grad_norm": 1.6796875, + "learning_rate": 7.791645909967387e-06, + "loss": 0.8451, + "step": 10031 + }, + { + "epoch": 1.7081466875281308, + "grad_norm": 1.7109375, + "learning_rate": 7.789884604211447e-06, + "loss": 0.8251, + "step": 10032 + }, + { + "epoch": 1.7083181516171206, + "grad_norm": 1.6953125, + "learning_rate": 7.78812337053569e-06, + "loss": 0.883, + "step": 10033 + }, + { + "epoch": 1.7084896157061107, + "grad_norm": 1.6015625, + "learning_rate": 7.78636220899756e-06, + "loss": 0.7842, + "step": 10034 + }, + { + "epoch": 1.7086610797951005, + "grad_norm": 1.703125, + "learning_rate": 7.784601119654494e-06, + "loss": 0.8951, + "step": 10035 + }, + { + "epoch": 1.7088325438840903, + "grad_norm": 1.703125, + "learning_rate": 7.782840102563925e-06, + "loss": 0.9254, + "step": 10036 + }, + { + "epoch": 1.7090040079730802, + "grad_norm": 1.6953125, + "learning_rate": 7.781079157783292e-06, + "loss": 0.8714, + "step": 10037 + }, + { + "epoch": 1.70917547206207, + "grad_norm": 1.6640625, + "learning_rate": 7.779318285370024e-06, + "loss": 0.7217, + "step": 10038 + }, + { + "epoch": 1.7093469361510598, + "grad_norm": 1.734375, + "learning_rate": 7.777557485381553e-06, + "loss": 0.8525, + "step": 10039 + }, + { + "epoch": 1.7095184002400496, + "grad_norm": 1.71875, + "learning_rate": 7.775796757875298e-06, + "loss": 0.8348, + "step": 10040 + }, + { + "epoch": 1.7096898643290395, + 
"grad_norm": 1.6875, + "learning_rate": 7.774036102908685e-06, + "loss": 0.8365, + "step": 10041 + }, + { + "epoch": 1.7098613284180293, + "grad_norm": 1.6953125, + "learning_rate": 7.772275520539136e-06, + "loss": 0.8373, + "step": 10042 + }, + { + "epoch": 1.7100327925070193, + "grad_norm": 1.640625, + "learning_rate": 7.770515010824074e-06, + "loss": 0.8351, + "step": 10043 + }, + { + "epoch": 1.7102042565960092, + "grad_norm": 1.6640625, + "learning_rate": 7.768754573820908e-06, + "loss": 0.8265, + "step": 10044 + }, + { + "epoch": 1.710375720684999, + "grad_norm": 1.796875, + "learning_rate": 7.766994209587062e-06, + "loss": 0.9264, + "step": 10045 + }, + { + "epoch": 1.710547184773989, + "grad_norm": 1.6875, + "learning_rate": 7.765233918179942e-06, + "loss": 0.8793, + "step": 10046 + }, + { + "epoch": 1.7107186488629789, + "grad_norm": 1.7109375, + "learning_rate": 7.763473699656959e-06, + "loss": 0.9455, + "step": 10047 + }, + { + "epoch": 1.7108901129519687, + "grad_norm": 1.75, + "learning_rate": 7.761713554075521e-06, + "loss": 0.8932, + "step": 10048 + }, + { + "epoch": 1.7110615770409585, + "grad_norm": 1.7421875, + "learning_rate": 7.759953481493033e-06, + "loss": 0.8766, + "step": 10049 + }, + { + "epoch": 1.7112330411299483, + "grad_norm": 1.6875, + "learning_rate": 7.758193481966901e-06, + "loss": 0.8285, + "step": 10050 + }, + { + "epoch": 1.7114045052189382, + "grad_norm": 1.6953125, + "learning_rate": 7.756433555554518e-06, + "loss": 0.8557, + "step": 10051 + }, + { + "epoch": 1.711575969307928, + "grad_norm": 1.625, + "learning_rate": 7.754673702313284e-06, + "loss": 0.8181, + "step": 10052 + }, + { + "epoch": 1.7117474333969178, + "grad_norm": 1.703125, + "learning_rate": 7.752913922300595e-06, + "loss": 0.8187, + "step": 10053 + }, + { + "epoch": 1.7119188974859076, + "grad_norm": 1.65625, + "learning_rate": 7.751154215573845e-06, + "loss": 0.8839, + "step": 10054 + }, + { + "epoch": 1.7120903615748977, + "grad_norm": 1.671875, + "learning_rate": 7.749394582190426e-06, + "loss": 0.8244, + "step": 10055 + }, + { + "epoch": 1.7122618256638875, + "grad_norm": 1.671875, + "learning_rate": 7.747635022207724e-06, + "loss": 0.8911, + "step": 10056 + }, + { + "epoch": 1.7124332897528773, + "grad_norm": 1.7265625, + "learning_rate": 7.745875535683126e-06, + "loss": 0.8736, + "step": 10057 + }, + { + "epoch": 1.7126047538418674, + "grad_norm": 1.75, + "learning_rate": 7.744116122674015e-06, + "loss": 0.8851, + "step": 10058 + }, + { + "epoch": 1.7127762179308572, + "grad_norm": 1.6953125, + "learning_rate": 7.742356783237772e-06, + "loss": 0.8383, + "step": 10059 + }, + { + "epoch": 1.712947682019847, + "grad_norm": 1.6640625, + "learning_rate": 7.740597517431775e-06, + "loss": 0.8672, + "step": 10060 + }, + { + "epoch": 1.7131191461088369, + "grad_norm": 1.734375, + "learning_rate": 7.738838325313402e-06, + "loss": 0.8921, + "step": 10061 + }, + { + "epoch": 1.7132906101978267, + "grad_norm": 1.7421875, + "learning_rate": 7.737079206940027e-06, + "loss": 0.8335, + "step": 10062 + }, + { + "epoch": 1.7134620742868165, + "grad_norm": 1.671875, + "learning_rate": 7.73532016236902e-06, + "loss": 0.8776, + "step": 10063 + }, + { + "epoch": 1.7136335383758063, + "grad_norm": 1.703125, + "learning_rate": 7.733561191657748e-06, + "loss": 0.8359, + "step": 10064 + }, + { + "epoch": 1.7138050024647962, + "grad_norm": 1.78125, + "learning_rate": 7.731802294863583e-06, + "loss": 0.883, + "step": 10065 + }, + { + "epoch": 1.713976466553786, + "grad_norm": 1.765625, + "learning_rate": 
7.730043472043884e-06, + "loss": 0.8665, + "step": 10066 + }, + { + "epoch": 1.714147930642776, + "grad_norm": 1.6953125, + "learning_rate": 7.728284723256017e-06, + "loss": 0.8383, + "step": 10067 + }, + { + "epoch": 1.7143193947317659, + "grad_norm": 1.6953125, + "learning_rate": 7.726526048557338e-06, + "loss": 0.8075, + "step": 10068 + }, + { + "epoch": 1.7144908588207557, + "grad_norm": 1.78125, + "learning_rate": 7.724767448005208e-06, + "loss": 0.8819, + "step": 10069 + }, + { + "epoch": 1.7146623229097457, + "grad_norm": 1.7421875, + "learning_rate": 7.723008921656977e-06, + "loss": 0.7742, + "step": 10070 + }, + { + "epoch": 1.7148337869987356, + "grad_norm": 1.640625, + "learning_rate": 7.72125046957e-06, + "loss": 0.8344, + "step": 10071 + }, + { + "epoch": 1.7150052510877254, + "grad_norm": 1.6875, + "learning_rate": 7.719492091801628e-06, + "loss": 0.8342, + "step": 10072 + }, + { + "epoch": 1.7151767151767152, + "grad_norm": 1.734375, + "learning_rate": 7.717733788409207e-06, + "loss": 0.8667, + "step": 10073 + }, + { + "epoch": 1.715348179265705, + "grad_norm": 1.71875, + "learning_rate": 7.71597555945008e-06, + "loss": 0.8732, + "step": 10074 + }, + { + "epoch": 1.7155196433546949, + "grad_norm": 1.671875, + "learning_rate": 7.71421740498159e-06, + "loss": 0.915, + "step": 10075 + }, + { + "epoch": 1.7156911074436847, + "grad_norm": 1.75, + "learning_rate": 7.712459325061078e-06, + "loss": 0.8567, + "step": 10076 + }, + { + "epoch": 1.7158625715326745, + "grad_norm": 1.8984375, + "learning_rate": 7.710701319745881e-06, + "loss": 0.8153, + "step": 10077 + }, + { + "epoch": 1.7160340356216643, + "grad_norm": 1.6875, + "learning_rate": 7.708943389093337e-06, + "loss": 0.8616, + "step": 10078 + }, + { + "epoch": 1.7162054997106544, + "grad_norm": 1.734375, + "learning_rate": 7.707185533160774e-06, + "loss": 0.8309, + "step": 10079 + }, + { + "epoch": 1.7163769637996442, + "grad_norm": 1.671875, + "learning_rate": 7.705427752005525e-06, + "loss": 0.8271, + "step": 10080 + }, + { + "epoch": 1.716548427888634, + "grad_norm": 1.703125, + "learning_rate": 7.703670045684919e-06, + "loss": 0.9102, + "step": 10081 + }, + { + "epoch": 1.716719891977624, + "grad_norm": 1.734375, + "learning_rate": 7.701912414256279e-06, + "loss": 0.9635, + "step": 10082 + }, + { + "epoch": 1.716891356066614, + "grad_norm": 1.6484375, + "learning_rate": 7.700154857776925e-06, + "loss": 0.8498, + "step": 10083 + }, + { + "epoch": 1.7170628201556037, + "grad_norm": 1.671875, + "learning_rate": 7.698397376304186e-06, + "loss": 0.8869, + "step": 10084 + }, + { + "epoch": 1.7172342842445936, + "grad_norm": 1.828125, + "learning_rate": 7.69663996989538e-06, + "loss": 0.8821, + "step": 10085 + }, + { + "epoch": 1.7174057483335834, + "grad_norm": 1.765625, + "learning_rate": 7.694882638607813e-06, + "loss": 0.9002, + "step": 10086 + }, + { + "epoch": 1.7175772124225732, + "grad_norm": 1.6953125, + "learning_rate": 7.693125382498804e-06, + "loss": 0.8881, + "step": 10087 + }, + { + "epoch": 1.717748676511563, + "grad_norm": 1.8359375, + "learning_rate": 7.691368201625662e-06, + "loss": 0.8636, + "step": 10088 + }, + { + "epoch": 1.7179201406005529, + "grad_norm": 1.703125, + "learning_rate": 7.689611096045698e-06, + "loss": 0.8352, + "step": 10089 + }, + { + "epoch": 1.7180916046895427, + "grad_norm": 1.71875, + "learning_rate": 7.687854065816216e-06, + "loss": 0.9081, + "step": 10090 + }, + { + "epoch": 1.7182630687785327, + "grad_norm": 1.65625, + "learning_rate": 7.686097110994516e-06, + "loss": 0.8152, + 
"step": 10091 + }, + { + "epoch": 1.7184345328675226, + "grad_norm": 1.609375, + "learning_rate": 7.684340231637905e-06, + "loss": 0.8166, + "step": 10092 + }, + { + "epoch": 1.7186059969565124, + "grad_norm": 1.59375, + "learning_rate": 7.68258342780368e-06, + "loss": 0.8214, + "step": 10093 + }, + { + "epoch": 1.7187774610455024, + "grad_norm": 1.71875, + "learning_rate": 7.680826699549136e-06, + "loss": 0.9223, + "step": 10094 + }, + { + "epoch": 1.7189489251344923, + "grad_norm": 1.6875, + "learning_rate": 7.679070046931567e-06, + "loss": 0.869, + "step": 10095 + }, + { + "epoch": 1.719120389223482, + "grad_norm": 1.6328125, + "learning_rate": 7.677313470008268e-06, + "loss": 0.8313, + "step": 10096 + }, + { + "epoch": 1.719291853312472, + "grad_norm": 1.6640625, + "learning_rate": 7.675556968836517e-06, + "loss": 0.8957, + "step": 10097 + }, + { + "epoch": 1.7194633174014617, + "grad_norm": 1.7421875, + "learning_rate": 7.673800543473608e-06, + "loss": 0.7927, + "step": 10098 + }, + { + "epoch": 1.7196347814904516, + "grad_norm": 1.625, + "learning_rate": 7.672044193976822e-06, + "loss": 0.8355, + "step": 10099 + }, + { + "epoch": 1.7198062455794414, + "grad_norm": 1.703125, + "learning_rate": 7.670287920403439e-06, + "loss": 0.819, + "step": 10100 + }, + { + "epoch": 1.7199777096684312, + "grad_norm": 1.8125, + "learning_rate": 7.668531722810742e-06, + "loss": 0.9703, + "step": 10101 + }, + { + "epoch": 1.720149173757421, + "grad_norm": 1.875, + "learning_rate": 7.666775601256006e-06, + "loss": 0.882, + "step": 10102 + }, + { + "epoch": 1.720320637846411, + "grad_norm": 1.671875, + "learning_rate": 7.665019555796502e-06, + "loss": 0.8179, + "step": 10103 + }, + { + "epoch": 1.720492101935401, + "grad_norm": 1.78125, + "learning_rate": 7.663263586489504e-06, + "loss": 0.9177, + "step": 10104 + }, + { + "epoch": 1.7206635660243907, + "grad_norm": 1.640625, + "learning_rate": 7.66150769339228e-06, + "loss": 0.7969, + "step": 10105 + }, + { + "epoch": 1.7208350301133808, + "grad_norm": 1.71875, + "learning_rate": 7.659751876562096e-06, + "loss": 0.8631, + "step": 10106 + }, + { + "epoch": 1.7210064942023706, + "grad_norm": 1.6171875, + "learning_rate": 7.657996136056216e-06, + "loss": 0.862, + "step": 10107 + }, + { + "epoch": 1.7211779582913604, + "grad_norm": 1.8046875, + "learning_rate": 7.656240471931904e-06, + "loss": 0.9143, + "step": 10108 + }, + { + "epoch": 1.7213494223803503, + "grad_norm": 1.640625, + "learning_rate": 7.654484884246412e-06, + "loss": 0.8633, + "step": 10109 + }, + { + "epoch": 1.72152088646934, + "grad_norm": 1.6328125, + "learning_rate": 7.652729373057001e-06, + "loss": 0.8728, + "step": 10110 + }, + { + "epoch": 1.72169235055833, + "grad_norm": 1.671875, + "learning_rate": 7.650973938420924e-06, + "loss": 0.7864, + "step": 10111 + }, + { + "epoch": 1.7218638146473197, + "grad_norm": 1.7265625, + "learning_rate": 7.649218580395433e-06, + "loss": 0.8282, + "step": 10112 + }, + { + "epoch": 1.7220352787363096, + "grad_norm": 1.7578125, + "learning_rate": 7.647463299037777e-06, + "loss": 0.8676, + "step": 10113 + }, + { + "epoch": 1.7222067428252994, + "grad_norm": 1.6953125, + "learning_rate": 7.6457080944052e-06, + "loss": 0.8708, + "step": 10114 + }, + { + "epoch": 1.7223782069142892, + "grad_norm": 1.703125, + "learning_rate": 7.643952966554948e-06, + "loss": 0.8541, + "step": 10115 + }, + { + "epoch": 1.7225496710032793, + "grad_norm": 1.703125, + "learning_rate": 7.642197915544263e-06, + "loss": 0.7929, + "step": 10116 + }, + { + "epoch": 
1.722721135092269, + "grad_norm": 1.625, + "learning_rate": 7.640442941430382e-06, + "loss": 0.8458, + "step": 10117 + }, + { + "epoch": 1.722892599181259, + "grad_norm": 1.71875, + "learning_rate": 7.638688044270542e-06, + "loss": 0.8399, + "step": 10118 + }, + { + "epoch": 1.723064063270249, + "grad_norm": 1.6640625, + "learning_rate": 7.636933224121977e-06, + "loss": 0.8385, + "step": 10119 + }, + { + "epoch": 1.7232355273592388, + "grad_norm": 1.671875, + "learning_rate": 7.635178481041917e-06, + "loss": 0.8743, + "step": 10120 + }, + { + "epoch": 1.7234069914482286, + "grad_norm": 1.625, + "learning_rate": 7.633423815087593e-06, + "loss": 0.8821, + "step": 10121 + }, + { + "epoch": 1.7235784555372184, + "grad_norm": 1.6640625, + "learning_rate": 7.63166922631623e-06, + "loss": 0.7966, + "step": 10122 + }, + { + "epoch": 1.7237499196262083, + "grad_norm": 1.71875, + "learning_rate": 7.62991471478505e-06, + "loss": 0.8014, + "step": 10123 + }, + { + "epoch": 1.723921383715198, + "grad_norm": 1.6171875, + "learning_rate": 7.628160280551278e-06, + "loss": 0.8135, + "step": 10124 + }, + { + "epoch": 1.724092847804188, + "grad_norm": 1.65625, + "learning_rate": 7.62640592367213e-06, + "loss": 0.8701, + "step": 10125 + }, + { + "epoch": 1.7242643118931777, + "grad_norm": 1.671875, + "learning_rate": 7.624651644204823e-06, + "loss": 0.8518, + "step": 10126 + }, + { + "epoch": 1.7244357759821676, + "grad_norm": 1.6328125, + "learning_rate": 7.62289744220657e-06, + "loss": 0.8435, + "step": 10127 + }, + { + "epoch": 1.7246072400711576, + "grad_norm": 1.6640625, + "learning_rate": 7.621143317734584e-06, + "loss": 0.8237, + "step": 10128 + }, + { + "epoch": 1.7247787041601474, + "grad_norm": 1.734375, + "learning_rate": 7.619389270846071e-06, + "loss": 0.91, + "step": 10129 + }, + { + "epoch": 1.7249501682491373, + "grad_norm": 1.6875, + "learning_rate": 7.617635301598237e-06, + "loss": 0.8793, + "step": 10130 + }, + { + "epoch": 1.7251216323381273, + "grad_norm": 1.7734375, + "learning_rate": 7.615881410048294e-06, + "loss": 0.9091, + "step": 10131 + }, + { + "epoch": 1.7252930964271171, + "grad_norm": 1.734375, + "learning_rate": 7.614127596253431e-06, + "loss": 0.8372, + "step": 10132 + }, + { + "epoch": 1.725464560516107, + "grad_norm": 1.71875, + "learning_rate": 7.612373860270852e-06, + "loss": 0.8641, + "step": 10133 + }, + { + "epoch": 1.7256360246050968, + "grad_norm": 1.65625, + "learning_rate": 7.610620202157751e-06, + "loss": 0.8591, + "step": 10134 + }, + { + "epoch": 1.7258074886940866, + "grad_norm": 1.7109375, + "learning_rate": 7.608866621971325e-06, + "loss": 0.8311, + "step": 10135 + }, + { + "epoch": 1.7259789527830764, + "grad_norm": 1.625, + "learning_rate": 7.6071131197687606e-06, + "loss": 0.8582, + "step": 10136 + }, + { + "epoch": 1.7261504168720663, + "grad_norm": 1.5546875, + "learning_rate": 7.605359695607248e-06, + "loss": 0.8795, + "step": 10137 + }, + { + "epoch": 1.726321880961056, + "grad_norm": 1.7421875, + "learning_rate": 7.603606349543973e-06, + "loss": 0.9352, + "step": 10138 + }, + { + "epoch": 1.726493345050046, + "grad_norm": 1.7734375, + "learning_rate": 7.601853081636119e-06, + "loss": 0.9355, + "step": 10139 + }, + { + "epoch": 1.726664809139036, + "grad_norm": 1.71875, + "learning_rate": 7.600099891940869e-06, + "loss": 0.8794, + "step": 10140 + }, + { + "epoch": 1.7268362732280258, + "grad_norm": 1.625, + "learning_rate": 7.598346780515396e-06, + "loss": 0.8277, + "step": 10141 + }, + { + "epoch": 1.7270077373170156, + "grad_norm": 1.71875, + 
"learning_rate": 7.596593747416885e-06, + "loss": 0.8309, + "step": 10142 + }, + { + "epoch": 1.7271792014060057, + "grad_norm": 1.703125, + "learning_rate": 7.5948407927024955e-06, + "loss": 0.8031, + "step": 10143 + }, + { + "epoch": 1.7273506654949955, + "grad_norm": 1.75, + "learning_rate": 7.593087916429407e-06, + "loss": 0.851, + "step": 10144 + }, + { + "epoch": 1.7275221295839853, + "grad_norm": 1.6484375, + "learning_rate": 7.591335118654784e-06, + "loss": 0.8321, + "step": 10145 + }, + { + "epoch": 1.7276935936729751, + "grad_norm": 1.765625, + "learning_rate": 7.589582399435791e-06, + "loss": 0.8972, + "step": 10146 + }, + { + "epoch": 1.727865057761965, + "grad_norm": 1.7109375, + "learning_rate": 7.587829758829594e-06, + "loss": 0.805, + "step": 10147 + }, + { + "epoch": 1.7280365218509548, + "grad_norm": 1.6328125, + "learning_rate": 7.586077196893353e-06, + "loss": 0.8274, + "step": 10148 + }, + { + "epoch": 1.7282079859399446, + "grad_norm": 1.734375, + "learning_rate": 7.5843247136842245e-06, + "loss": 0.926, + "step": 10149 + }, + { + "epoch": 1.7283794500289344, + "grad_norm": 1.6640625, + "learning_rate": 7.582572309259364e-06, + "loss": 0.8003, + "step": 10150 + }, + { + "epoch": 1.7285509141179243, + "grad_norm": 1.6796875, + "learning_rate": 7.5808199836759235e-06, + "loss": 0.8239, + "step": 10151 + }, + { + "epoch": 1.7287223782069143, + "grad_norm": 1.8203125, + "learning_rate": 7.579067736991053e-06, + "loss": 0.8298, + "step": 10152 + }, + { + "epoch": 1.7288938422959041, + "grad_norm": 1.6796875, + "learning_rate": 7.5773155692618995e-06, + "loss": 0.8657, + "step": 10153 + }, + { + "epoch": 1.729065306384894, + "grad_norm": 1.6015625, + "learning_rate": 7.575563480545613e-06, + "loss": 0.8066, + "step": 10154 + }, + { + "epoch": 1.729236770473884, + "grad_norm": 1.7109375, + "learning_rate": 7.573811470899325e-06, + "loss": 0.8659, + "step": 10155 + }, + { + "epoch": 1.7294082345628738, + "grad_norm": 1.703125, + "learning_rate": 7.572059540380182e-06, + "loss": 0.9177, + "step": 10156 + }, + { + "epoch": 1.7295796986518637, + "grad_norm": 1.6953125, + "learning_rate": 7.570307689045322e-06, + "loss": 0.9513, + "step": 10157 + }, + { + "epoch": 1.7297511627408535, + "grad_norm": 1.6015625, + "learning_rate": 7.568555916951877e-06, + "loss": 0.787, + "step": 10158 + }, + { + "epoch": 1.7299226268298433, + "grad_norm": 1.703125, + "learning_rate": 7.566804224156978e-06, + "loss": 0.8231, + "step": 10159 + }, + { + "epoch": 1.7300940909188331, + "grad_norm": 1.6484375, + "learning_rate": 7.565052610717757e-06, + "loss": 0.8836, + "step": 10160 + }, + { + "epoch": 1.730265555007823, + "grad_norm": 1.6171875, + "learning_rate": 7.563301076691339e-06, + "loss": 0.8266, + "step": 10161 + }, + { + "epoch": 1.7304370190968128, + "grad_norm": 1.75, + "learning_rate": 7.5615496221348495e-06, + "loss": 0.8571, + "step": 10162 + }, + { + "epoch": 1.7306084831858026, + "grad_norm": 1.75, + "learning_rate": 7.559798247105409e-06, + "loss": 0.8509, + "step": 10163 + }, + { + "epoch": 1.7307799472747927, + "grad_norm": 1.78125, + "learning_rate": 7.558046951660136e-06, + "loss": 0.8736, + "step": 10164 + }, + { + "epoch": 1.7309514113637825, + "grad_norm": 1.765625, + "learning_rate": 7.55629573585615e-06, + "loss": 0.8781, + "step": 10165 + }, + { + "epoch": 1.7311228754527723, + "grad_norm": 1.6640625, + "learning_rate": 7.554544599750559e-06, + "loss": 0.8972, + "step": 10166 + }, + { + "epoch": 1.7312943395417624, + "grad_norm": 1.734375, + "learning_rate": 
7.55279354340048e-06, + "loss": 0.8911, + "step": 10167 + }, + { + "epoch": 1.7314658036307522, + "grad_norm": 1.734375, + "learning_rate": 7.551042566863017e-06, + "loss": 0.8434, + "step": 10168 + }, + { + "epoch": 1.731637267719742, + "grad_norm": 1.703125, + "learning_rate": 7.549291670195278e-06, + "loss": 0.8142, + "step": 10169 + }, + { + "epoch": 1.7318087318087318, + "grad_norm": 1.7265625, + "learning_rate": 7.547540853454366e-06, + "loss": 0.8171, + "step": 10170 + }, + { + "epoch": 1.7319801958977217, + "grad_norm": 1.6484375, + "learning_rate": 7.5457901166973825e-06, + "loss": 0.8392, + "step": 10171 + }, + { + "epoch": 1.7321516599867115, + "grad_norm": 1.75, + "learning_rate": 7.544039459981425e-06, + "loss": 0.8475, + "step": 10172 + }, + { + "epoch": 1.7323231240757013, + "grad_norm": 1.7109375, + "learning_rate": 7.542288883363587e-06, + "loss": 0.8659, + "step": 10173 + }, + { + "epoch": 1.7324945881646912, + "grad_norm": 1.7734375, + "learning_rate": 7.540538386900966e-06, + "loss": 0.9016, + "step": 10174 + }, + { + "epoch": 1.732666052253681, + "grad_norm": 1.703125, + "learning_rate": 7.538787970650648e-06, + "loss": 0.9046, + "step": 10175 + }, + { + "epoch": 1.732837516342671, + "grad_norm": 1.6328125, + "learning_rate": 7.5370376346697235e-06, + "loss": 0.8584, + "step": 10176 + }, + { + "epoch": 1.7330089804316609, + "grad_norm": 1.71875, + "learning_rate": 7.535287379015278e-06, + "loss": 0.8298, + "step": 10177 + }, + { + "epoch": 1.7331804445206507, + "grad_norm": 1.703125, + "learning_rate": 7.53353720374439e-06, + "loss": 0.845, + "step": 10178 + }, + { + "epoch": 1.7333519086096407, + "grad_norm": 1.6796875, + "learning_rate": 7.5317871089141415e-06, + "loss": 0.7766, + "step": 10179 + }, + { + "epoch": 1.7335233726986305, + "grad_norm": 1.640625, + "learning_rate": 7.53003709458161e-06, + "loss": 0.7899, + "step": 10180 + }, + { + "epoch": 1.7336948367876204, + "grad_norm": 1.625, + "learning_rate": 7.528287160803871e-06, + "loss": 0.8287, + "step": 10181 + }, + { + "epoch": 1.7338663008766102, + "grad_norm": 1.6796875, + "learning_rate": 7.5265373076379934e-06, + "loss": 0.8382, + "step": 10182 + }, + { + "epoch": 1.7340377649656, + "grad_norm": 1.7109375, + "learning_rate": 7.524787535141049e-06, + "loss": 0.8667, + "step": 10183 + }, + { + "epoch": 1.7342092290545899, + "grad_norm": 1.703125, + "learning_rate": 7.523037843370104e-06, + "loss": 0.7994, + "step": 10184 + }, + { + "epoch": 1.7343806931435797, + "grad_norm": 1.671875, + "learning_rate": 7.521288232382221e-06, + "loss": 0.8789, + "step": 10185 + }, + { + "epoch": 1.7345521572325695, + "grad_norm": 1.6953125, + "learning_rate": 7.519538702234464e-06, + "loss": 0.8457, + "step": 10186 + }, + { + "epoch": 1.7347236213215593, + "grad_norm": 1.65625, + "learning_rate": 7.517789252983891e-06, + "loss": 0.8667, + "step": 10187 + }, + { + "epoch": 1.7348950854105494, + "grad_norm": 1.71875, + "learning_rate": 7.516039884687557e-06, + "loss": 0.8285, + "step": 10188 + }, + { + "epoch": 1.7350665494995392, + "grad_norm": 1.6328125, + "learning_rate": 7.51429059740252e-06, + "loss": 0.8443, + "step": 10189 + }, + { + "epoch": 1.735238013588529, + "grad_norm": 1.78125, + "learning_rate": 7.512541391185823e-06, + "loss": 0.8528, + "step": 10190 + }, + { + "epoch": 1.735409477677519, + "grad_norm": 1.703125, + "learning_rate": 7.51079226609452e-06, + "loss": 0.7553, + "step": 10191 + }, + { + "epoch": 1.735580941766509, + "grad_norm": 1.75, + "learning_rate": 7.5090432221856526e-06, + "loss": 0.8326, + 
"step": 10192 + }, + { + "epoch": 1.7357524058554987, + "grad_norm": 1.671875, + "learning_rate": 7.507294259516265e-06, + "loss": 0.8594, + "step": 10193 + }, + { + "epoch": 1.7359238699444886, + "grad_norm": 1.65625, + "learning_rate": 7.505545378143398e-06, + "loss": 0.8385, + "step": 10194 + }, + { + "epoch": 1.7360953340334784, + "grad_norm": 1.796875, + "learning_rate": 7.503796578124092e-06, + "loss": 0.8266, + "step": 10195 + }, + { + "epoch": 1.7362667981224682, + "grad_norm": 1.71875, + "learning_rate": 7.502047859515378e-06, + "loss": 0.8895, + "step": 10196 + }, + { + "epoch": 1.736438262211458, + "grad_norm": 1.6328125, + "learning_rate": 7.50029922237429e-06, + "loss": 0.7925, + "step": 10197 + }, + { + "epoch": 1.7366097263004479, + "grad_norm": 1.78125, + "learning_rate": 7.498550666757858e-06, + "loss": 0.8161, + "step": 10198 + }, + { + "epoch": 1.7367811903894377, + "grad_norm": 1.6875, + "learning_rate": 7.496802192723107e-06, + "loss": 0.8297, + "step": 10199 + }, + { + "epoch": 1.7369526544784277, + "grad_norm": 1.703125, + "learning_rate": 7.495053800327068e-06, + "loss": 0.8645, + "step": 10200 + }, + { + "epoch": 1.7371241185674176, + "grad_norm": 1.6484375, + "learning_rate": 7.493305489626753e-06, + "loss": 0.8653, + "step": 10201 + }, + { + "epoch": 1.7372955826564074, + "grad_norm": 1.6875, + "learning_rate": 7.491557260679183e-06, + "loss": 0.7951, + "step": 10202 + }, + { + "epoch": 1.7374670467453974, + "grad_norm": 1.6640625, + "learning_rate": 7.489809113541379e-06, + "loss": 0.7885, + "step": 10203 + }, + { + "epoch": 1.7376385108343873, + "grad_norm": 1.75, + "learning_rate": 7.488061048270352e-06, + "loss": 0.9382, + "step": 10204 + }, + { + "epoch": 1.737809974923377, + "grad_norm": 1.734375, + "learning_rate": 7.486313064923114e-06, + "loss": 0.8568, + "step": 10205 + }, + { + "epoch": 1.737981439012367, + "grad_norm": 1.6796875, + "learning_rate": 7.484565163556672e-06, + "loss": 0.8941, + "step": 10206 + }, + { + "epoch": 1.7381529031013567, + "grad_norm": 1.6328125, + "learning_rate": 7.482817344228031e-06, + "loss": 0.804, + "step": 10207 + }, + { + "epoch": 1.7383243671903466, + "grad_norm": 1.671875, + "learning_rate": 7.481069606994198e-06, + "loss": 0.8119, + "step": 10208 + }, + { + "epoch": 1.7384958312793364, + "grad_norm": 1.734375, + "learning_rate": 7.479321951912168e-06, + "loss": 0.839, + "step": 10209 + }, + { + "epoch": 1.7386672953683262, + "grad_norm": 1.640625, + "learning_rate": 7.477574379038943e-06, + "loss": 0.8308, + "step": 10210 + }, + { + "epoch": 1.738838759457316, + "grad_norm": 1.78125, + "learning_rate": 7.475826888431515e-06, + "loss": 0.8357, + "step": 10211 + }, + { + "epoch": 1.7390102235463059, + "grad_norm": 1.6875, + "learning_rate": 7.47407948014688e-06, + "loss": 0.8502, + "step": 10212 + }, + { + "epoch": 1.739181687635296, + "grad_norm": 1.6796875, + "learning_rate": 7.472332154242023e-06, + "loss": 0.8749, + "step": 10213 + }, + { + "epoch": 1.7393531517242857, + "grad_norm": 1.7265625, + "learning_rate": 7.470584910773931e-06, + "loss": 0.8231, + "step": 10214 + }, + { + "epoch": 1.7395246158132756, + "grad_norm": 1.734375, + "learning_rate": 7.4688377497995915e-06, + "loss": 0.8878, + "step": 10215 + }, + { + "epoch": 1.7396960799022656, + "grad_norm": 1.703125, + "learning_rate": 7.467090671375985e-06, + "loss": 0.9022, + "step": 10216 + }, + { + "epoch": 1.7398675439912554, + "grad_norm": 1.71875, + "learning_rate": 7.465343675560088e-06, + "loss": 0.8323, + "step": 10217 + }, + { + "epoch": 
1.7400390080802453, + "grad_norm": 1.7578125, + "learning_rate": 7.4635967624088805e-06, + "loss": 0.8797, + "step": 10218 + }, + { + "epoch": 1.740210472169235, + "grad_norm": 1.6015625, + "learning_rate": 7.461849931979332e-06, + "loss": 0.8438, + "step": 10219 + }, + { + "epoch": 1.740381936258225, + "grad_norm": 1.6796875, + "learning_rate": 7.4601031843284155e-06, + "loss": 0.903, + "step": 10220 + }, + { + "epoch": 1.7405534003472147, + "grad_norm": 1.6796875, + "learning_rate": 7.4583565195130995e-06, + "loss": 0.7802, + "step": 10221 + }, + { + "epoch": 1.7407248644362046, + "grad_norm": 1.734375, + "learning_rate": 7.456609937590347e-06, + "loss": 0.8862, + "step": 10222 + }, + { + "epoch": 1.7408963285251944, + "grad_norm": 1.7734375, + "learning_rate": 7.454863438617124e-06, + "loss": 0.849, + "step": 10223 + }, + { + "epoch": 1.7410677926141842, + "grad_norm": 1.796875, + "learning_rate": 7.453117022650387e-06, + "loss": 0.8742, + "step": 10224 + }, + { + "epoch": 1.7412392567031743, + "grad_norm": 1.6484375, + "learning_rate": 7.451370689747095e-06, + "loss": 0.8438, + "step": 10225 + }, + { + "epoch": 1.741410720792164, + "grad_norm": 1.6640625, + "learning_rate": 7.4496244399642005e-06, + "loss": 0.9366, + "step": 10226 + }, + { + "epoch": 1.741582184881154, + "grad_norm": 1.6875, + "learning_rate": 7.447878273358657e-06, + "loss": 0.8167, + "step": 10227 + }, + { + "epoch": 1.741753648970144, + "grad_norm": 1.78125, + "learning_rate": 7.4461321899874155e-06, + "loss": 0.8649, + "step": 10228 + }, + { + "epoch": 1.7419251130591338, + "grad_norm": 1.6484375, + "learning_rate": 7.444386189907418e-06, + "loss": 0.8296, + "step": 10229 + }, + { + "epoch": 1.7420965771481236, + "grad_norm": 1.65625, + "learning_rate": 7.442640273175612e-06, + "loss": 0.8709, + "step": 10230 + }, + { + "epoch": 1.7422680412371134, + "grad_norm": 1.6484375, + "learning_rate": 7.440894439848935e-06, + "loss": 0.8295, + "step": 10231 + }, + { + "epoch": 1.7424395053261033, + "grad_norm": 1.7109375, + "learning_rate": 7.439148689984327e-06, + "loss": 0.8687, + "step": 10232 + }, + { + "epoch": 1.742610969415093, + "grad_norm": 1.6640625, + "learning_rate": 7.437403023638725e-06, + "loss": 0.8493, + "step": 10233 + }, + { + "epoch": 1.742782433504083, + "grad_norm": 1.7421875, + "learning_rate": 7.43565744086906e-06, + "loss": 0.8864, + "step": 10234 + }, + { + "epoch": 1.7429538975930727, + "grad_norm": 1.78125, + "learning_rate": 7.433911941732266e-06, + "loss": 0.8556, + "step": 10235 + }, + { + "epoch": 1.7431253616820626, + "grad_norm": 1.609375, + "learning_rate": 7.4321665262852626e-06, + "loss": 0.7473, + "step": 10236 + }, + { + "epoch": 1.7432968257710526, + "grad_norm": 1.5859375, + "learning_rate": 7.430421194584978e-06, + "loss": 0.9047, + "step": 10237 + }, + { + "epoch": 1.7434682898600424, + "grad_norm": 1.65625, + "learning_rate": 7.428675946688335e-06, + "loss": 0.8799, + "step": 10238 + }, + { + "epoch": 1.7436397539490323, + "grad_norm": 1.7890625, + "learning_rate": 7.426930782652253e-06, + "loss": 0.8822, + "step": 10239 + }, + { + "epoch": 1.7438112180380223, + "grad_norm": 1.625, + "learning_rate": 7.425185702533644e-06, + "loss": 0.8088, + "step": 10240 + }, + { + "epoch": 1.7439826821270121, + "grad_norm": 1.71875, + "learning_rate": 7.423440706389427e-06, + "loss": 0.8863, + "step": 10241 + }, + { + "epoch": 1.744154146216002, + "grad_norm": 1.6640625, + "learning_rate": 7.421695794276511e-06, + "loss": 0.788, + "step": 10242 + }, + { + "epoch": 1.7443256103049918, + 
"grad_norm": 1.671875, + "learning_rate": 7.4199509662518054e-06, + "loss": 0.859, + "step": 10243 + }, + { + "epoch": 1.7444970743939816, + "grad_norm": 1.7578125, + "learning_rate": 7.418206222372213e-06, + "loss": 0.8653, + "step": 10244 + }, + { + "epoch": 1.7446685384829714, + "grad_norm": 1.7578125, + "learning_rate": 7.416461562694639e-06, + "loss": 0.8734, + "step": 10245 + }, + { + "epoch": 1.7448400025719613, + "grad_norm": 1.6953125, + "learning_rate": 7.414716987275985e-06, + "loss": 0.8341, + "step": 10246 + }, + { + "epoch": 1.745011466660951, + "grad_norm": 1.8203125, + "learning_rate": 7.412972496173143e-06, + "loss": 0.8791, + "step": 10247 + }, + { + "epoch": 1.745182930749941, + "grad_norm": 1.8203125, + "learning_rate": 7.4112280894430076e-06, + "loss": 0.8111, + "step": 10248 + }, + { + "epoch": 1.745354394838931, + "grad_norm": 1.6640625, + "learning_rate": 7.409483767142473e-06, + "loss": 0.8054, + "step": 10249 + }, + { + "epoch": 1.7455258589279208, + "grad_norm": 1.8359375, + "learning_rate": 7.4077395293284285e-06, + "loss": 0.8333, + "step": 10250 + }, + { + "epoch": 1.7456973230169106, + "grad_norm": 1.6640625, + "learning_rate": 7.405995376057758e-06, + "loss": 0.7959, + "step": 10251 + }, + { + "epoch": 1.7458687871059007, + "grad_norm": 1.6328125, + "learning_rate": 7.404251307387349e-06, + "loss": 0.8575, + "step": 10252 + }, + { + "epoch": 1.7460402511948905, + "grad_norm": 1.6015625, + "learning_rate": 7.402507323374077e-06, + "loss": 0.8788, + "step": 10253 + }, + { + "epoch": 1.7462117152838803, + "grad_norm": 1.765625, + "learning_rate": 7.400763424074824e-06, + "loss": 0.8249, + "step": 10254 + }, + { + "epoch": 1.7463831793728701, + "grad_norm": 1.6640625, + "learning_rate": 7.399019609546464e-06, + "loss": 0.9007, + "step": 10255 + }, + { + "epoch": 1.74655464346186, + "grad_norm": 1.6640625, + "learning_rate": 7.397275879845868e-06, + "loss": 0.8715, + "step": 10256 + }, + { + "epoch": 1.7467261075508498, + "grad_norm": 1.7109375, + "learning_rate": 7.395532235029908e-06, + "loss": 0.8172, + "step": 10257 + }, + { + "epoch": 1.7468975716398396, + "grad_norm": 1.7265625, + "learning_rate": 7.393788675155449e-06, + "loss": 0.922, + "step": 10258 + }, + { + "epoch": 1.7470690357288294, + "grad_norm": 1.609375, + "learning_rate": 7.392045200279354e-06, + "loss": 0.7991, + "step": 10259 + }, + { + "epoch": 1.7472404998178193, + "grad_norm": 1.7265625, + "learning_rate": 7.390301810458487e-06, + "loss": 0.8249, + "step": 10260 + }, + { + "epoch": 1.7474119639068093, + "grad_norm": 1.734375, + "learning_rate": 7.388558505749703e-06, + "loss": 0.8807, + "step": 10261 + }, + { + "epoch": 1.7475834279957991, + "grad_norm": 1.625, + "learning_rate": 7.386815286209862e-06, + "loss": 0.8831, + "step": 10262 + }, + { + "epoch": 1.747754892084789, + "grad_norm": 1.640625, + "learning_rate": 7.385072151895814e-06, + "loss": 0.854, + "step": 10263 + }, + { + "epoch": 1.747926356173779, + "grad_norm": 1.5859375, + "learning_rate": 7.383329102864411e-06, + "loss": 0.8495, + "step": 10264 + }, + { + "epoch": 1.7480978202627688, + "grad_norm": 1.6875, + "learning_rate": 7.381586139172499e-06, + "loss": 0.8507, + "step": 10265 + }, + { + "epoch": 1.7482692843517587, + "grad_norm": 1.625, + "learning_rate": 7.379843260876922e-06, + "loss": 0.8643, + "step": 10266 + }, + { + "epoch": 1.7484407484407485, + "grad_norm": 1.734375, + "learning_rate": 7.3781004680345235e-06, + "loss": 0.7792, + "step": 10267 + }, + { + "epoch": 1.7486122125297383, + "grad_norm": 1.5546875, + 
"learning_rate": 7.376357760702142e-06, + "loss": 0.7385, + "step": 10268 + }, + { + "epoch": 1.7487836766187281, + "grad_norm": 1.6796875, + "learning_rate": 7.374615138936615e-06, + "loss": 0.8638, + "step": 10269 + }, + { + "epoch": 1.748955140707718, + "grad_norm": 1.7421875, + "learning_rate": 7.372872602794772e-06, + "loss": 0.8629, + "step": 10270 + }, + { + "epoch": 1.7491266047967078, + "grad_norm": 1.6953125, + "learning_rate": 7.371130152333448e-06, + "loss": 0.8324, + "step": 10271 + }, + { + "epoch": 1.7492980688856976, + "grad_norm": 1.65625, + "learning_rate": 7.369387787609469e-06, + "loss": 0.8626, + "step": 10272 + }, + { + "epoch": 1.7494695329746877, + "grad_norm": 1.671875, + "learning_rate": 7.367645508679659e-06, + "loss": 0.8506, + "step": 10273 + }, + { + "epoch": 1.7496409970636775, + "grad_norm": 1.703125, + "learning_rate": 7.365903315600842e-06, + "loss": 0.7514, + "step": 10274 + }, + { + "epoch": 1.7498124611526673, + "grad_norm": 1.7109375, + "learning_rate": 7.364161208429838e-06, + "loss": 0.8505, + "step": 10275 + }, + { + "epoch": 1.7499839252416574, + "grad_norm": 1.65625, + "learning_rate": 7.362419187223462e-06, + "loss": 0.8002, + "step": 10276 + }, + { + "epoch": 1.7501553893306472, + "grad_norm": 1.6015625, + "learning_rate": 7.360677252038529e-06, + "loss": 0.7803, + "step": 10277 + }, + { + "epoch": 1.750326853419637, + "grad_norm": 1.71875, + "learning_rate": 7.358935402931848e-06, + "loss": 0.8861, + "step": 10278 + }, + { + "epoch": 1.7504983175086268, + "grad_norm": 1.7578125, + "learning_rate": 7.357193639960227e-06, + "loss": 0.8101, + "step": 10279 + }, + { + "epoch": 1.7506697815976167, + "grad_norm": 1.7109375, + "learning_rate": 7.355451963180477e-06, + "loss": 0.829, + "step": 10280 + }, + { + "epoch": 1.7508412456866065, + "grad_norm": 1.6875, + "learning_rate": 7.353710372649399e-06, + "loss": 0.8157, + "step": 10281 + }, + { + "epoch": 1.7510127097755963, + "grad_norm": 1.6953125, + "learning_rate": 7.351968868423789e-06, + "loss": 0.9102, + "step": 10282 + }, + { + "epoch": 1.7511841738645861, + "grad_norm": 1.6640625, + "learning_rate": 7.350227450560443e-06, + "loss": 0.8951, + "step": 10283 + }, + { + "epoch": 1.751355637953576, + "grad_norm": 1.6171875, + "learning_rate": 7.348486119116161e-06, + "loss": 0.8913, + "step": 10284 + }, + { + "epoch": 1.751527102042566, + "grad_norm": 1.671875, + "learning_rate": 7.346744874147729e-06, + "loss": 0.8598, + "step": 10285 + }, + { + "epoch": 1.7516985661315558, + "grad_norm": 1.703125, + "learning_rate": 7.345003715711938e-06, + "loss": 0.8271, + "step": 10286 + }, + { + "epoch": 1.7518700302205457, + "grad_norm": 1.6796875, + "learning_rate": 7.3432626438655726e-06, + "loss": 0.8592, + "step": 10287 + }, + { + "epoch": 1.7520414943095357, + "grad_norm": 1.6796875, + "learning_rate": 7.3415216586654184e-06, + "loss": 0.8075, + "step": 10288 + }, + { + "epoch": 1.7522129583985255, + "grad_norm": 1.6640625, + "learning_rate": 7.339780760168254e-06, + "loss": 0.8372, + "step": 10289 + }, + { + "epoch": 1.7523844224875154, + "grad_norm": 1.65625, + "learning_rate": 7.338039948430857e-06, + "loss": 0.8529, + "step": 10290 + }, + { + "epoch": 1.7525558865765052, + "grad_norm": 1.671875, + "learning_rate": 7.336299223509999e-06, + "loss": 0.7569, + "step": 10291 + }, + { + "epoch": 1.752727350665495, + "grad_norm": 1.6796875, + "learning_rate": 7.334558585462461e-06, + "loss": 0.8094, + "step": 10292 + }, + { + "epoch": 1.7528988147544848, + "grad_norm": 1.828125, + "learning_rate": 
7.332818034344999e-06, + "loss": 0.897, + "step": 10293 + }, + { + "epoch": 1.7530702788434747, + "grad_norm": 1.7578125, + "learning_rate": 7.331077570214385e-06, + "loss": 0.8805, + "step": 10294 + }, + { + "epoch": 1.7532417429324645, + "grad_norm": 1.6796875, + "learning_rate": 7.329337193127379e-06, + "loss": 0.8222, + "step": 10295 + }, + { + "epoch": 1.7534132070214543, + "grad_norm": 1.75, + "learning_rate": 7.327596903140746e-06, + "loss": 0.8482, + "step": 10296 + }, + { + "epoch": 1.7535846711104444, + "grad_norm": 1.7265625, + "learning_rate": 7.325856700311243e-06, + "loss": 0.8909, + "step": 10297 + }, + { + "epoch": 1.7537561351994342, + "grad_norm": 1.7109375, + "learning_rate": 7.32411658469562e-06, + "loss": 0.8621, + "step": 10298 + }, + { + "epoch": 1.753927599288424, + "grad_norm": 1.703125, + "learning_rate": 7.322376556350633e-06, + "loss": 0.8418, + "step": 10299 + }, + { + "epoch": 1.754099063377414, + "grad_norm": 1.671875, + "learning_rate": 7.32063661533303e-06, + "loss": 0.8549, + "step": 10300 + }, + { + "epoch": 1.7542705274664039, + "grad_norm": 1.78125, + "learning_rate": 7.318896761699557e-06, + "loss": 0.9614, + "step": 10301 + }, + { + "epoch": 1.7544419915553937, + "grad_norm": 1.6171875, + "learning_rate": 7.317156995506955e-06, + "loss": 0.8664, + "step": 10302 + }, + { + "epoch": 1.7546134556443835, + "grad_norm": 1.6953125, + "learning_rate": 7.315417316811967e-06, + "loss": 0.8552, + "step": 10303 + }, + { + "epoch": 1.7547849197333734, + "grad_norm": 1.6640625, + "learning_rate": 7.313677725671331e-06, + "loss": 0.8504, + "step": 10304 + }, + { + "epoch": 1.7549563838223632, + "grad_norm": 1.6484375, + "learning_rate": 7.311938222141779e-06, + "loss": 0.7789, + "step": 10305 + }, + { + "epoch": 1.755127847911353, + "grad_norm": 1.671875, + "learning_rate": 7.3101988062800435e-06, + "loss": 0.8515, + "step": 10306 + }, + { + "epoch": 1.7552993120003428, + "grad_norm": 1.7265625, + "learning_rate": 7.308459478142853e-06, + "loss": 0.8759, + "step": 10307 + }, + { + "epoch": 1.7554707760893327, + "grad_norm": 1.6640625, + "learning_rate": 7.3067202377869354e-06, + "loss": 0.8369, + "step": 10308 + }, + { + "epoch": 1.7556422401783225, + "grad_norm": 1.6640625, + "learning_rate": 7.304981085269012e-06, + "loss": 0.8632, + "step": 10309 + }, + { + "epoch": 1.7558137042673125, + "grad_norm": 1.765625, + "learning_rate": 7.303242020645804e-06, + "loss": 0.8468, + "step": 10310 + }, + { + "epoch": 1.7559851683563024, + "grad_norm": 1.71875, + "learning_rate": 7.30150304397403e-06, + "loss": 0.7487, + "step": 10311 + }, + { + "epoch": 1.7561566324452922, + "grad_norm": 1.71875, + "learning_rate": 7.2997641553104025e-06, + "loss": 0.8561, + "step": 10312 + }, + { + "epoch": 1.7563280965342822, + "grad_norm": 1.6328125, + "learning_rate": 7.298025354711633e-06, + "loss": 0.8081, + "step": 10313 + }, + { + "epoch": 1.756499560623272, + "grad_norm": 1.6953125, + "learning_rate": 7.296286642234434e-06, + "loss": 0.9342, + "step": 10314 + }, + { + "epoch": 1.7566710247122619, + "grad_norm": 1.7265625, + "learning_rate": 7.2945480179355075e-06, + "loss": 0.8533, + "step": 10315 + }, + { + "epoch": 1.7568424888012517, + "grad_norm": 1.71875, + "learning_rate": 7.292809481871559e-06, + "loss": 0.8413, + "step": 10316 + }, + { + "epoch": 1.7570139528902415, + "grad_norm": 1.6875, + "learning_rate": 7.291071034099285e-06, + "loss": 0.8442, + "step": 10317 + }, + { + "epoch": 1.7571854169792314, + "grad_norm": 1.8203125, + "learning_rate": 7.289332674675386e-06, + 
"loss": 0.919, + "step": 10318 + }, + { + "epoch": 1.7573568810682212, + "grad_norm": 1.7109375, + "learning_rate": 7.287594403656557e-06, + "loss": 0.829, + "step": 10319 + }, + { + "epoch": 1.757528345157211, + "grad_norm": 1.625, + "learning_rate": 7.2858562210994886e-06, + "loss": 0.8363, + "step": 10320 + }, + { + "epoch": 1.7576998092462008, + "grad_norm": 1.6875, + "learning_rate": 7.284118127060868e-06, + "loss": 0.8166, + "step": 10321 + }, + { + "epoch": 1.7578712733351909, + "grad_norm": 1.5859375, + "learning_rate": 7.282380121597384e-06, + "loss": 0.7882, + "step": 10322 + }, + { + "epoch": 1.7580427374241807, + "grad_norm": 1.6953125, + "learning_rate": 7.2806422047657165e-06, + "loss": 0.8613, + "step": 10323 + }, + { + "epoch": 1.7582142015131705, + "grad_norm": 1.609375, + "learning_rate": 7.278904376622548e-06, + "loss": 0.8299, + "step": 10324 + }, + { + "epoch": 1.7583856656021606, + "grad_norm": 1.671875, + "learning_rate": 7.277166637224553e-06, + "loss": 0.7943, + "step": 10325 + }, + { + "epoch": 1.7585571296911504, + "grad_norm": 1.8203125, + "learning_rate": 7.275428986628406e-06, + "loss": 0.9177, + "step": 10326 + }, + { + "epoch": 1.7587285937801402, + "grad_norm": 1.734375, + "learning_rate": 7.273691424890786e-06, + "loss": 0.8536, + "step": 10327 + }, + { + "epoch": 1.75890005786913, + "grad_norm": 1.8046875, + "learning_rate": 7.271953952068351e-06, + "loss": 0.8851, + "step": 10328 + }, + { + "epoch": 1.7590715219581199, + "grad_norm": 1.7421875, + "learning_rate": 7.270216568217769e-06, + "loss": 0.8791, + "step": 10329 + }, + { + "epoch": 1.7592429860471097, + "grad_norm": 1.734375, + "learning_rate": 7.2684792733957055e-06, + "loss": 0.8501, + "step": 10330 + }, + { + "epoch": 1.7594144501360995, + "grad_norm": 1.625, + "learning_rate": 7.2667420676588185e-06, + "loss": 0.8799, + "step": 10331 + }, + { + "epoch": 1.7595859142250894, + "grad_norm": 1.703125, + "learning_rate": 7.265004951063765e-06, + "loss": 0.8702, + "step": 10332 + }, + { + "epoch": 1.7597573783140792, + "grad_norm": 1.765625, + "learning_rate": 7.263267923667199e-06, + "loss": 0.9555, + "step": 10333 + }, + { + "epoch": 1.7599288424030692, + "grad_norm": 1.65625, + "learning_rate": 7.261530985525771e-06, + "loss": 0.9094, + "step": 10334 + }, + { + "epoch": 1.760100306492059, + "grad_norm": 1.71875, + "learning_rate": 7.25979413669613e-06, + "loss": 0.8269, + "step": 10335 + }, + { + "epoch": 1.760271770581049, + "grad_norm": 1.6796875, + "learning_rate": 7.258057377234922e-06, + "loss": 0.8336, + "step": 10336 + }, + { + "epoch": 1.760443234670039, + "grad_norm": 1.828125, + "learning_rate": 7.256320707198786e-06, + "loss": 0.8184, + "step": 10337 + }, + { + "epoch": 1.7606146987590288, + "grad_norm": 1.65625, + "learning_rate": 7.254584126644366e-06, + "loss": 0.8485, + "step": 10338 + }, + { + "epoch": 1.7607861628480186, + "grad_norm": 1.7109375, + "learning_rate": 7.252847635628298e-06, + "loss": 0.849, + "step": 10339 + }, + { + "epoch": 1.7609576269370084, + "grad_norm": 1.7421875, + "learning_rate": 7.251111234207211e-06, + "loss": 0.8736, + "step": 10340 + }, + { + "epoch": 1.7611290910259982, + "grad_norm": 1.6015625, + "learning_rate": 7.249374922437737e-06, + "loss": 0.877, + "step": 10341 + }, + { + "epoch": 1.761300555114988, + "grad_norm": 1.640625, + "learning_rate": 7.247638700376503e-06, + "loss": 0.8993, + "step": 10342 + }, + { + "epoch": 1.761472019203978, + "grad_norm": 1.75, + "learning_rate": 7.2459025680801365e-06, + "loss": 0.8747, + "step": 10343 + }, + { + 
"epoch": 1.7616434832929677, + "grad_norm": 1.765625, + "learning_rate": 7.2441665256052606e-06, + "loss": 0.8566, + "step": 10344 + }, + { + "epoch": 1.7618149473819575, + "grad_norm": 1.703125, + "learning_rate": 7.24243057300849e-06, + "loss": 0.9118, + "step": 10345 + }, + { + "epoch": 1.7619864114709476, + "grad_norm": 1.6796875, + "learning_rate": 7.240694710346443e-06, + "loss": 0.8452, + "step": 10346 + }, + { + "epoch": 1.7621578755599374, + "grad_norm": 1.71875, + "learning_rate": 7.238958937675731e-06, + "loss": 0.8191, + "step": 10347 + }, + { + "epoch": 1.7623293396489272, + "grad_norm": 1.6328125, + "learning_rate": 7.237223255052967e-06, + "loss": 0.8403, + "step": 10348 + }, + { + "epoch": 1.7625008037379173, + "grad_norm": 1.6015625, + "learning_rate": 7.235487662534755e-06, + "loss": 0.7737, + "step": 10349 + }, + { + "epoch": 1.7626722678269071, + "grad_norm": 1.6796875, + "learning_rate": 7.233752160177705e-06, + "loss": 0.8608, + "step": 10350 + }, + { + "epoch": 1.762843731915897, + "grad_norm": 1.7265625, + "learning_rate": 7.232016748038408e-06, + "loss": 0.8762, + "step": 10351 + }, + { + "epoch": 1.7630151960048868, + "grad_norm": 1.625, + "learning_rate": 7.230281426173469e-06, + "loss": 0.8423, + "step": 10352 + }, + { + "epoch": 1.7631866600938766, + "grad_norm": 1.7109375, + "learning_rate": 7.228546194639483e-06, + "loss": 0.8653, + "step": 10353 + }, + { + "epoch": 1.7633581241828664, + "grad_norm": 1.703125, + "learning_rate": 7.226811053493041e-06, + "loss": 0.8833, + "step": 10354 + }, + { + "epoch": 1.7635295882718562, + "grad_norm": 1.765625, + "learning_rate": 7.225076002790736e-06, + "loss": 0.8177, + "step": 10355 + }, + { + "epoch": 1.763701052360846, + "grad_norm": 1.640625, + "learning_rate": 7.223341042589151e-06, + "loss": 0.751, + "step": 10356 + }, + { + "epoch": 1.763872516449836, + "grad_norm": 1.796875, + "learning_rate": 7.221606172944869e-06, + "loss": 0.8932, + "step": 10357 + }, + { + "epoch": 1.764043980538826, + "grad_norm": 1.6796875, + "learning_rate": 7.219871393914473e-06, + "loss": 0.9326, + "step": 10358 + }, + { + "epoch": 1.7642154446278158, + "grad_norm": 1.765625, + "learning_rate": 7.218136705554541e-06, + "loss": 0.9091, + "step": 10359 + }, + { + "epoch": 1.7643869087168056, + "grad_norm": 1.7265625, + "learning_rate": 7.216402107921645e-06, + "loss": 0.7611, + "step": 10360 + }, + { + "epoch": 1.7645583728057956, + "grad_norm": 1.6171875, + "learning_rate": 7.214667601072358e-06, + "loss": 0.8232, + "step": 10361 + }, + { + "epoch": 1.7647298368947855, + "grad_norm": 1.71875, + "learning_rate": 7.2129331850632535e-06, + "loss": 0.8851, + "step": 10362 + }, + { + "epoch": 1.7649013009837753, + "grad_norm": 1.7265625, + "learning_rate": 7.21119885995089e-06, + "loss": 0.8625, + "step": 10363 + }, + { + "epoch": 1.7650727650727651, + "grad_norm": 1.6796875, + "learning_rate": 7.209464625791831e-06, + "loss": 0.8253, + "step": 10364 + }, + { + "epoch": 1.765244229161755, + "grad_norm": 1.609375, + "learning_rate": 7.207730482642641e-06, + "loss": 0.8143, + "step": 10365 + }, + { + "epoch": 1.7654156932507448, + "grad_norm": 1.6953125, + "learning_rate": 7.205996430559874e-06, + "loss": 0.8331, + "step": 10366 + }, + { + "epoch": 1.7655871573397346, + "grad_norm": 1.6953125, + "learning_rate": 7.2042624696000855e-06, + "loss": 0.9238, + "step": 10367 + }, + { + "epoch": 1.7657586214287244, + "grad_norm": 1.78125, + "learning_rate": 7.202528599819825e-06, + "loss": 0.8804, + "step": 10368 + }, + { + "epoch": 
1.7659300855177142, + "grad_norm": 1.734375, + "learning_rate": 7.200794821275641e-06, + "loss": 0.8217, + "step": 10369 + }, + { + "epoch": 1.7661015496067043, + "grad_norm": 1.7265625, + "learning_rate": 7.199061134024079e-06, + "loss": 0.8658, + "step": 10370 + }, + { + "epoch": 1.7662730136956941, + "grad_norm": 1.796875, + "learning_rate": 7.197327538121681e-06, + "loss": 0.9574, + "step": 10371 + }, + { + "epoch": 1.766444477784684, + "grad_norm": 1.640625, + "learning_rate": 7.195594033624985e-06, + "loss": 0.8551, + "step": 10372 + }, + { + "epoch": 1.766615941873674, + "grad_norm": 1.6953125, + "learning_rate": 7.193860620590532e-06, + "loss": 0.8563, + "step": 10373 + }, + { + "epoch": 1.7667874059626638, + "grad_norm": 1.59375, + "learning_rate": 7.192127299074847e-06, + "loss": 0.7884, + "step": 10374 + }, + { + "epoch": 1.7669588700516536, + "grad_norm": 1.7578125, + "learning_rate": 7.190394069134464e-06, + "loss": 0.9111, + "step": 10375 + }, + { + "epoch": 1.7671303341406435, + "grad_norm": 1.8203125, + "learning_rate": 7.188660930825911e-06, + "loss": 0.8985, + "step": 10376 + }, + { + "epoch": 1.7673017982296333, + "grad_norm": 1.6640625, + "learning_rate": 7.186927884205712e-06, + "loss": 0.8794, + "step": 10377 + }, + { + "epoch": 1.7674732623186231, + "grad_norm": 1.75, + "learning_rate": 7.185194929330388e-06, + "loss": 0.8902, + "step": 10378 + }, + { + "epoch": 1.767644726407613, + "grad_norm": 1.6171875, + "learning_rate": 7.183462066256457e-06, + "loss": 0.8045, + "step": 10379 + }, + { + "epoch": 1.7678161904966028, + "grad_norm": 1.78125, + "learning_rate": 7.1817292950404325e-06, + "loss": 0.8288, + "step": 10380 + }, + { + "epoch": 1.7679876545855926, + "grad_norm": 1.8515625, + "learning_rate": 7.179996615738828e-06, + "loss": 0.8861, + "step": 10381 + }, + { + "epoch": 1.7681591186745826, + "grad_norm": 1.671875, + "learning_rate": 7.178264028408154e-06, + "loss": 0.8331, + "step": 10382 + }, + { + "epoch": 1.7683305827635725, + "grad_norm": 1.625, + "learning_rate": 7.176531533104916e-06, + "loss": 0.7918, + "step": 10383 + }, + { + "epoch": 1.7685020468525623, + "grad_norm": 1.6953125, + "learning_rate": 7.174799129885617e-06, + "loss": 0.8849, + "step": 10384 + }, + { + "epoch": 1.7686735109415523, + "grad_norm": 1.7734375, + "learning_rate": 7.173066818806762e-06, + "loss": 0.9, + "step": 10385 + }, + { + "epoch": 1.7688449750305422, + "grad_norm": 1.6640625, + "learning_rate": 7.171334599924837e-06, + "loss": 0.8639, + "step": 10386 + }, + { + "epoch": 1.769016439119532, + "grad_norm": 1.71875, + "learning_rate": 7.1696024732963445e-06, + "loss": 0.8004, + "step": 10387 + }, + { + "epoch": 1.7691879032085218, + "grad_norm": 1.6796875, + "learning_rate": 7.1678704389777735e-06, + "loss": 0.8962, + "step": 10388 + }, + { + "epoch": 1.7693593672975116, + "grad_norm": 1.6484375, + "learning_rate": 7.16613849702561e-06, + "loss": 0.8478, + "step": 10389 + }, + { + "epoch": 1.7695308313865015, + "grad_norm": 1.71875, + "learning_rate": 7.164406647496342e-06, + "loss": 0.8258, + "step": 10390 + }, + { + "epoch": 1.7697022954754913, + "grad_norm": 1.671875, + "learning_rate": 7.162674890446453e-06, + "loss": 0.8445, + "step": 10391 + }, + { + "epoch": 1.7698737595644811, + "grad_norm": 1.6484375, + "learning_rate": 7.16094322593242e-06, + "loss": 0.7893, + "step": 10392 + }, + { + "epoch": 1.770045223653471, + "grad_norm": 1.6484375, + "learning_rate": 7.1592116540107185e-06, + "loss": 0.8214, + "step": 10393 + }, + { + "epoch": 1.770216687742461, + 
"grad_norm": 1.7265625, + "learning_rate": 7.157480174737823e-06, + "loss": 0.8983, + "step": 10394 + }, + { + "epoch": 1.7703881518314508, + "grad_norm": 1.609375, + "learning_rate": 7.155748788170202e-06, + "loss": 0.8466, + "step": 10395 + }, + { + "epoch": 1.7705596159204406, + "grad_norm": 1.6640625, + "learning_rate": 7.154017494364329e-06, + "loss": 0.8173, + "step": 10396 + }, + { + "epoch": 1.7707310800094307, + "grad_norm": 1.609375, + "learning_rate": 7.1522862933766555e-06, + "loss": 0.817, + "step": 10397 + }, + { + "epoch": 1.7709025440984205, + "grad_norm": 1.6640625, + "learning_rate": 7.150555185263653e-06, + "loss": 0.8219, + "step": 10398 + }, + { + "epoch": 1.7710740081874103, + "grad_norm": 1.6328125, + "learning_rate": 7.148824170081774e-06, + "loss": 0.8267, + "step": 10399 + }, + { + "epoch": 1.7712454722764002, + "grad_norm": 1.734375, + "learning_rate": 7.147093247887476e-06, + "loss": 0.8318, + "step": 10400 + }, + { + "epoch": 1.77141693636539, + "grad_norm": 1.6171875, + "learning_rate": 7.145362418737209e-06, + "loss": 0.8338, + "step": 10401 + }, + { + "epoch": 1.7715884004543798, + "grad_norm": 1.7421875, + "learning_rate": 7.143631682687424e-06, + "loss": 0.9461, + "step": 10402 + }, + { + "epoch": 1.7717598645433696, + "grad_norm": 1.6796875, + "learning_rate": 7.141901039794566e-06, + "loss": 0.8248, + "step": 10403 + }, + { + "epoch": 1.7719313286323595, + "grad_norm": 1.7421875, + "learning_rate": 7.140170490115078e-06, + "loss": 0.8534, + "step": 10404 + }, + { + "epoch": 1.7721027927213493, + "grad_norm": 1.78125, + "learning_rate": 7.1384400337054e-06, + "loss": 0.8852, + "step": 10405 + }, + { + "epoch": 1.7722742568103391, + "grad_norm": 1.640625, + "learning_rate": 7.1367096706219665e-06, + "loss": 0.8271, + "step": 10406 + }, + { + "epoch": 1.7724457208993292, + "grad_norm": 1.7421875, + "learning_rate": 7.134979400921214e-06, + "loss": 0.7973, + "step": 10407 + }, + { + "epoch": 1.772617184988319, + "grad_norm": 1.8046875, + "learning_rate": 7.133249224659574e-06, + "loss": 0.7925, + "step": 10408 + }, + { + "epoch": 1.7727886490773088, + "grad_norm": 1.8046875, + "learning_rate": 7.131519141893469e-06, + "loss": 0.8688, + "step": 10409 + }, + { + "epoch": 1.7729601131662989, + "grad_norm": 1.6328125, + "learning_rate": 7.1297891526793276e-06, + "loss": 0.7748, + "step": 10410 + }, + { + "epoch": 1.7731315772552887, + "grad_norm": 1.671875, + "learning_rate": 7.128059257073569e-06, + "loss": 0.8827, + "step": 10411 + }, + { + "epoch": 1.7733030413442785, + "grad_norm": 1.6328125, + "learning_rate": 7.1263294551326145e-06, + "loss": 0.8463, + "step": 10412 + }, + { + "epoch": 1.7734745054332683, + "grad_norm": 1.75, + "learning_rate": 7.1245997469128754e-06, + "loss": 0.8997, + "step": 10413 + }, + { + "epoch": 1.7736459695222582, + "grad_norm": 1.6875, + "learning_rate": 7.122870132470769e-06, + "loss": 0.8661, + "step": 10414 + }, + { + "epoch": 1.773817433611248, + "grad_norm": 1.59375, + "learning_rate": 7.121140611862699e-06, + "loss": 0.7815, + "step": 10415 + }, + { + "epoch": 1.7739888977002378, + "grad_norm": 1.7109375, + "learning_rate": 7.119411185145075e-06, + "loss": 0.8995, + "step": 10416 + }, + { + "epoch": 1.7741603617892276, + "grad_norm": 1.6484375, + "learning_rate": 7.117681852374301e-06, + "loss": 0.9213, + "step": 10417 + }, + { + "epoch": 1.7743318258782175, + "grad_norm": 1.671875, + "learning_rate": 7.1159526136067755e-06, + "loss": 0.876, + "step": 10418 + }, + { + "epoch": 1.7745032899672075, + "grad_norm": 1.7421875, 
+ "learning_rate": 7.114223468898897e-06, + "loss": 0.8722, + "step": 10419 + }, + { + "epoch": 1.7746747540561973, + "grad_norm": 1.7578125, + "learning_rate": 7.112494418307056e-06, + "loss": 0.8296, + "step": 10420 + }, + { + "epoch": 1.7748462181451872, + "grad_norm": 1.6875, + "learning_rate": 7.110765461887645e-06, + "loss": 0.807, + "step": 10421 + }, + { + "epoch": 1.7750176822341772, + "grad_norm": 1.6796875, + "learning_rate": 7.1090365996970526e-06, + "loss": 0.8302, + "step": 10422 + }, + { + "epoch": 1.775189146323167, + "grad_norm": 1.7421875, + "learning_rate": 7.107307831791663e-06, + "loss": 0.8044, + "step": 10423 + }, + { + "epoch": 1.7753606104121569, + "grad_norm": 1.6796875, + "learning_rate": 7.105579158227858e-06, + "loss": 0.8336, + "step": 10424 + }, + { + "epoch": 1.7755320745011467, + "grad_norm": 1.640625, + "learning_rate": 7.103850579062015e-06, + "loss": 0.8537, + "step": 10425 + }, + { + "epoch": 1.7757035385901365, + "grad_norm": 1.765625, + "learning_rate": 7.1021220943505124e-06, + "loss": 0.8888, + "step": 10426 + }, + { + "epoch": 1.7758750026791263, + "grad_norm": 1.7578125, + "learning_rate": 7.10039370414972e-06, + "loss": 0.877, + "step": 10427 + }, + { + "epoch": 1.7760464667681162, + "grad_norm": 1.6875, + "learning_rate": 7.098665408516004e-06, + "loss": 0.8352, + "step": 10428 + }, + { + "epoch": 1.776217930857106, + "grad_norm": 1.703125, + "learning_rate": 7.0969372075057385e-06, + "loss": 0.8651, + "step": 10429 + }, + { + "epoch": 1.7763893949460958, + "grad_norm": 1.6640625, + "learning_rate": 7.095209101175282e-06, + "loss": 0.8052, + "step": 10430 + }, + { + "epoch": 1.7765608590350859, + "grad_norm": 1.75, + "learning_rate": 7.093481089580999e-06, + "loss": 0.8881, + "step": 10431 + }, + { + "epoch": 1.7767323231240757, + "grad_norm": 1.640625, + "learning_rate": 7.0917531727792385e-06, + "loss": 0.9111, + "step": 10432 + }, + { + "epoch": 1.7769037872130655, + "grad_norm": 1.640625, + "learning_rate": 7.090025350826359e-06, + "loss": 0.8268, + "step": 10433 + }, + { + "epoch": 1.7770752513020556, + "grad_norm": 1.703125, + "learning_rate": 7.08829762377871e-06, + "loss": 0.7837, + "step": 10434 + }, + { + "epoch": 1.7772467153910454, + "grad_norm": 1.6796875, + "learning_rate": 7.086569991692641e-06, + "loss": 0.8403, + "step": 10435 + }, + { + "epoch": 1.7774181794800352, + "grad_norm": 1.703125, + "learning_rate": 7.084842454624493e-06, + "loss": 0.8457, + "step": 10436 + }, + { + "epoch": 1.777589643569025, + "grad_norm": 1.65625, + "learning_rate": 7.0831150126306125e-06, + "loss": 0.8311, + "step": 10437 + }, + { + "epoch": 1.7777611076580149, + "grad_norm": 1.71875, + "learning_rate": 7.081387665767334e-06, + "loss": 0.8601, + "step": 10438 + }, + { + "epoch": 1.7779325717470047, + "grad_norm": 1.6796875, + "learning_rate": 7.079660414090995e-06, + "loss": 0.8963, + "step": 10439 + }, + { + "epoch": 1.7781040358359945, + "grad_norm": 1.6953125, + "learning_rate": 7.077933257657927e-06, + "loss": 0.886, + "step": 10440 + }, + { + "epoch": 1.7782754999249843, + "grad_norm": 1.65625, + "learning_rate": 7.07620619652446e-06, + "loss": 0.8143, + "step": 10441 + }, + { + "epoch": 1.7784469640139742, + "grad_norm": 1.75, + "learning_rate": 7.074479230746921e-06, + "loss": 0.8514, + "step": 10442 + }, + { + "epoch": 1.7786184281029642, + "grad_norm": 1.7265625, + "learning_rate": 7.0727523603816276e-06, + "loss": 0.844, + "step": 10443 + }, + { + "epoch": 1.778789892191954, + "grad_norm": 1.671875, + "learning_rate": 
7.071025585484901e-06, + "loss": 0.822, + "step": 10444 + }, + { + "epoch": 1.7789613562809439, + "grad_norm": 1.6328125, + "learning_rate": 7.069298906113061e-06, + "loss": 0.8756, + "step": 10445 + }, + { + "epoch": 1.779132820369934, + "grad_norm": 1.71875, + "learning_rate": 7.06757232232242e-06, + "loss": 0.8601, + "step": 10446 + }, + { + "epoch": 1.7793042844589237, + "grad_norm": 1.59375, + "learning_rate": 7.065845834169288e-06, + "loss": 0.8132, + "step": 10447 + }, + { + "epoch": 1.7794757485479136, + "grad_norm": 1.734375, + "learning_rate": 7.064119441709972e-06, + "loss": 0.885, + "step": 10448 + }, + { + "epoch": 1.7796472126369034, + "grad_norm": 1.6875, + "learning_rate": 7.062393145000776e-06, + "loss": 0.8066, + "step": 10449 + }, + { + "epoch": 1.7798186767258932, + "grad_norm": 1.6640625, + "learning_rate": 7.060666944098004e-06, + "loss": 0.8237, + "step": 10450 + }, + { + "epoch": 1.779990140814883, + "grad_norm": 1.65625, + "learning_rate": 7.05894083905795e-06, + "loss": 0.8584, + "step": 10451 + }, + { + "epoch": 1.7801616049038729, + "grad_norm": 1.6640625, + "learning_rate": 7.057214829936909e-06, + "loss": 0.8027, + "step": 10452 + }, + { + "epoch": 1.7803330689928627, + "grad_norm": 1.7734375, + "learning_rate": 7.055488916791176e-06, + "loss": 0.8564, + "step": 10453 + }, + { + "epoch": 1.7805045330818525, + "grad_norm": 1.71875, + "learning_rate": 7.053763099677038e-06, + "loss": 0.9454, + "step": 10454 + }, + { + "epoch": 1.7806759971708426, + "grad_norm": 1.703125, + "learning_rate": 7.052037378650778e-06, + "loss": 0.869, + "step": 10455 + }, + { + "epoch": 1.7808474612598324, + "grad_norm": 1.6640625, + "learning_rate": 7.050311753768681e-06, + "loss": 0.9125, + "step": 10456 + }, + { + "epoch": 1.7810189253488222, + "grad_norm": 1.671875, + "learning_rate": 7.048586225087024e-06, + "loss": 0.8282, + "step": 10457 + }, + { + "epoch": 1.7811903894378123, + "grad_norm": 1.640625, + "learning_rate": 7.0468607926620845e-06, + "loss": 0.7893, + "step": 10458 + }, + { + "epoch": 1.781361853526802, + "grad_norm": 1.6875, + "learning_rate": 7.0451354565501364e-06, + "loss": 0.9097, + "step": 10459 + }, + { + "epoch": 1.781533317615792, + "grad_norm": 1.65625, + "learning_rate": 7.043410216807447e-06, + "loss": 0.8195, + "step": 10460 + }, + { + "epoch": 1.7817047817047817, + "grad_norm": 1.6171875, + "learning_rate": 7.041685073490283e-06, + "loss": 0.8134, + "step": 10461 + }, + { + "epoch": 1.7818762457937716, + "grad_norm": 1.59375, + "learning_rate": 7.039960026654911e-06, + "loss": 0.7667, + "step": 10462 + }, + { + "epoch": 1.7820477098827614, + "grad_norm": 1.703125, + "learning_rate": 7.038235076357587e-06, + "loss": 0.8867, + "step": 10463 + }, + { + "epoch": 1.7822191739717512, + "grad_norm": 1.765625, + "learning_rate": 7.036510222654571e-06, + "loss": 0.8679, + "step": 10464 + }, + { + "epoch": 1.782390638060741, + "grad_norm": 1.703125, + "learning_rate": 7.034785465602118e-06, + "loss": 0.9346, + "step": 10465 + }, + { + "epoch": 1.7825621021497309, + "grad_norm": 1.625, + "learning_rate": 7.0330608052564744e-06, + "loss": 0.7896, + "step": 10466 + }, + { + "epoch": 1.782733566238721, + "grad_norm": 1.65625, + "learning_rate": 7.0313362416738905e-06, + "loss": 0.8179, + "step": 10467 + }, + { + "epoch": 1.7829050303277107, + "grad_norm": 1.640625, + "learning_rate": 7.0296117749106105e-06, + "loss": 0.8323, + "step": 10468 + }, + { + "epoch": 1.7830764944167006, + "grad_norm": 1.7265625, + "learning_rate": 7.0278874050228775e-06, + "loss": 0.8471, 
+ "step": 10469 + }, + { + "epoch": 1.7832479585056906, + "grad_norm": 1.7890625, + "learning_rate": 7.026163132066927e-06, + "loss": 0.857, + "step": 10470 + }, + { + "epoch": 1.7834194225946804, + "grad_norm": 1.625, + "learning_rate": 7.024438956098996e-06, + "loss": 0.7765, + "step": 10471 + }, + { + "epoch": 1.7835908866836703, + "grad_norm": 1.7109375, + "learning_rate": 7.022714877175314e-06, + "loss": 0.8874, + "step": 10472 + }, + { + "epoch": 1.78376235077266, + "grad_norm": 1.6953125, + "learning_rate": 7.020990895352112e-06, + "loss": 0.8134, + "step": 10473 + }, + { + "epoch": 1.78393381486165, + "grad_norm": 1.5625, + "learning_rate": 7.019267010685615e-06, + "loss": 0.781, + "step": 10474 + }, + { + "epoch": 1.7841052789506398, + "grad_norm": 1.734375, + "learning_rate": 7.017543223232043e-06, + "loss": 0.8688, + "step": 10475 + }, + { + "epoch": 1.7842767430396296, + "grad_norm": 1.765625, + "learning_rate": 7.015819533047619e-06, + "loss": 0.886, + "step": 10476 + }, + { + "epoch": 1.7844482071286194, + "grad_norm": 1.7421875, + "learning_rate": 7.0140959401885635e-06, + "loss": 0.8504, + "step": 10477 + }, + { + "epoch": 1.7846196712176092, + "grad_norm": 1.703125, + "learning_rate": 7.012372444711078e-06, + "loss": 0.833, + "step": 10478 + }, + { + "epoch": 1.7847911353065993, + "grad_norm": 1.7734375, + "learning_rate": 7.010649046671376e-06, + "loss": 0.9199, + "step": 10479 + }, + { + "epoch": 1.784962599395589, + "grad_norm": 1.6875, + "learning_rate": 7.008925746125667e-06, + "loss": 0.8332, + "step": 10480 + }, + { + "epoch": 1.785134063484579, + "grad_norm": 1.6171875, + "learning_rate": 7.007202543130152e-06, + "loss": 0.8157, + "step": 10481 + }, + { + "epoch": 1.785305527573569, + "grad_norm": 1.671875, + "learning_rate": 7.005479437741032e-06, + "loss": 0.8156, + "step": 10482 + }, + { + "epoch": 1.7854769916625588, + "grad_norm": 1.71875, + "learning_rate": 7.003756430014502e-06, + "loss": 0.8972, + "step": 10483 + }, + { + "epoch": 1.7856484557515486, + "grad_norm": 1.6875, + "learning_rate": 7.00203352000676e-06, + "loss": 0.8747, + "step": 10484 + }, + { + "epoch": 1.7858199198405385, + "grad_norm": 1.7265625, + "learning_rate": 7.000310707773994e-06, + "loss": 0.9243, + "step": 10485 + }, + { + "epoch": 1.7859913839295283, + "grad_norm": 1.609375, + "learning_rate": 6.998587993372392e-06, + "loss": 0.7961, + "step": 10486 + }, + { + "epoch": 1.786162848018518, + "grad_norm": 1.6484375, + "learning_rate": 6.996865376858137e-06, + "loss": 0.8212, + "step": 10487 + }, + { + "epoch": 1.786334312107508, + "grad_norm": 1.765625, + "learning_rate": 6.995142858287416e-06, + "loss": 0.7968, + "step": 10488 + }, + { + "epoch": 1.7865057761964978, + "grad_norm": 1.8125, + "learning_rate": 6.993420437716396e-06, + "loss": 0.8075, + "step": 10489 + }, + { + "epoch": 1.7866772402854876, + "grad_norm": 1.6796875, + "learning_rate": 6.991698115201257e-06, + "loss": 0.8797, + "step": 10490 + }, + { + "epoch": 1.7868487043744776, + "grad_norm": 1.6328125, + "learning_rate": 6.9899758907981706e-06, + "loss": 0.818, + "step": 10491 + }, + { + "epoch": 1.7870201684634675, + "grad_norm": 1.6875, + "learning_rate": 6.988253764563307e-06, + "loss": 0.7987, + "step": 10492 + }, + { + "epoch": 1.7871916325524573, + "grad_norm": 1.703125, + "learning_rate": 6.986531736552829e-06, + "loss": 0.8601, + "step": 10493 + }, + { + "epoch": 1.787363096641447, + "grad_norm": 1.7265625, + "learning_rate": 6.984809806822897e-06, + "loss": 0.8766, + "step": 10494 + }, + { + "epoch": 
1.7875345607304371, + "grad_norm": 1.65625, + "learning_rate": 6.983087975429673e-06, + "loss": 0.914, + "step": 10495 + }, + { + "epoch": 1.787706024819427, + "grad_norm": 1.8203125, + "learning_rate": 6.98136624242931e-06, + "loss": 0.9354, + "step": 10496 + }, + { + "epoch": 1.7878774889084168, + "grad_norm": 1.734375, + "learning_rate": 6.979644607877962e-06, + "loss": 0.9315, + "step": 10497 + }, + { + "epoch": 1.7880489529974066, + "grad_norm": 1.6953125, + "learning_rate": 6.977923071831776e-06, + "loss": 0.7592, + "step": 10498 + }, + { + "epoch": 1.7882204170863965, + "grad_norm": 1.84375, + "learning_rate": 6.9762016343469e-06, + "loss": 0.9585, + "step": 10499 + }, + { + "epoch": 1.7883918811753863, + "grad_norm": 1.734375, + "learning_rate": 6.974480295479476e-06, + "loss": 0.8544, + "step": 10500 + }, + { + "epoch": 1.7883918811753863, + "eval_loss": 0.8336145877838135, + "eval_runtime": 835.9359, + "eval_samples_per_second": 2.989, + "eval_steps_per_second": 2.989, + "step": 10500 + }, + { + "epoch": 1.788563345264376, + "grad_norm": 1.765625, + "learning_rate": 6.9727590552856405e-06, + "loss": 0.8694, + "step": 10501 + }, + { + "epoch": 1.788734809353366, + "grad_norm": 1.671875, + "learning_rate": 6.971037913821533e-06, + "loss": 0.7916, + "step": 10502 + }, + { + "epoch": 1.7889062734423558, + "grad_norm": 1.6875, + "learning_rate": 6.9693168711432835e-06, + "loss": 0.8967, + "step": 10503 + }, + { + "epoch": 1.7890777375313458, + "grad_norm": 1.671875, + "learning_rate": 6.967595927307025e-06, + "loss": 0.7497, + "step": 10504 + }, + { + "epoch": 1.7892492016203356, + "grad_norm": 1.65625, + "learning_rate": 6.96587508236888e-06, + "loss": 0.8398, + "step": 10505 + }, + { + "epoch": 1.7894206657093255, + "grad_norm": 1.8125, + "learning_rate": 6.964154336384976e-06, + "loss": 0.8664, + "step": 10506 + }, + { + "epoch": 1.7895921297983155, + "grad_norm": 1.6875, + "learning_rate": 6.962433689411431e-06, + "loss": 0.8373, + "step": 10507 + }, + { + "epoch": 1.7897635938873053, + "grad_norm": 1.6796875, + "learning_rate": 6.96071314150436e-06, + "loss": 0.8511, + "step": 10508 + }, + { + "epoch": 1.7899350579762952, + "grad_norm": 1.7109375, + "learning_rate": 6.9589926927198805e-06, + "loss": 0.8263, + "step": 10509 + }, + { + "epoch": 1.790106522065285, + "grad_norm": 1.7265625, + "learning_rate": 6.9572723431141e-06, + "loss": 0.934, + "step": 10510 + }, + { + "epoch": 1.7902779861542748, + "grad_norm": 1.71875, + "learning_rate": 6.955552092743125e-06, + "loss": 0.8711, + "step": 10511 + }, + { + "epoch": 1.7904494502432646, + "grad_norm": 1.65625, + "learning_rate": 6.953831941663065e-06, + "loss": 0.836, + "step": 10512 + }, + { + "epoch": 1.7906209143322545, + "grad_norm": 1.578125, + "learning_rate": 6.9521118899300135e-06, + "loss": 0.8665, + "step": 10513 + }, + { + "epoch": 1.7907923784212443, + "grad_norm": 1.671875, + "learning_rate": 6.95039193760007e-06, + "loss": 0.816, + "step": 10514 + }, + { + "epoch": 1.790963842510234, + "grad_norm": 1.7890625, + "learning_rate": 6.948672084729328e-06, + "loss": 0.8766, + "step": 10515 + }, + { + "epoch": 1.7911353065992242, + "grad_norm": 1.640625, + "learning_rate": 6.946952331373881e-06, + "loss": 0.802, + "step": 10516 + }, + { + "epoch": 1.791306770688214, + "grad_norm": 1.6953125, + "learning_rate": 6.945232677589815e-06, + "loss": 0.8039, + "step": 10517 + }, + { + "epoch": 1.7914782347772038, + "grad_norm": 1.734375, + "learning_rate": 6.943513123433214e-06, + "loss": 0.8979, + "step": 10518 + }, + { + "epoch": 
1.7916496988661939, + "grad_norm": 1.734375, + "learning_rate": 6.94179366896016e-06, + "loss": 0.8817, + "step": 10519 + }, + { + "epoch": 1.7918211629551837, + "grad_norm": 1.6640625, + "learning_rate": 6.94007431422673e-06, + "loss": 0.8062, + "step": 10520 + }, + { + "epoch": 1.7919926270441735, + "grad_norm": 1.703125, + "learning_rate": 6.938355059289e-06, + "loss": 0.8331, + "step": 10521 + }, + { + "epoch": 1.7921640911331633, + "grad_norm": 1.6640625, + "learning_rate": 6.936635904203039e-06, + "loss": 0.812, + "step": 10522 + }, + { + "epoch": 1.7923355552221532, + "grad_norm": 1.671875, + "learning_rate": 6.934916849024922e-06, + "loss": 0.9065, + "step": 10523 + }, + { + "epoch": 1.792507019311143, + "grad_norm": 1.734375, + "learning_rate": 6.933197893810706e-06, + "loss": 0.8648, + "step": 10524 + }, + { + "epoch": 1.7926784834001328, + "grad_norm": 1.7578125, + "learning_rate": 6.931479038616453e-06, + "loss": 0.8835, + "step": 10525 + }, + { + "epoch": 1.7928499474891226, + "grad_norm": 1.6796875, + "learning_rate": 6.929760283498225e-06, + "loss": 0.893, + "step": 10526 + }, + { + "epoch": 1.7930214115781125, + "grad_norm": 1.65625, + "learning_rate": 6.928041628512074e-06, + "loss": 0.8337, + "step": 10527 + }, + { + "epoch": 1.7931928756671025, + "grad_norm": 1.7109375, + "learning_rate": 6.926323073714055e-06, + "loss": 0.8407, + "step": 10528 + }, + { + "epoch": 1.7933643397560923, + "grad_norm": 1.7265625, + "learning_rate": 6.924604619160214e-06, + "loss": 0.8472, + "step": 10529 + }, + { + "epoch": 1.7935358038450822, + "grad_norm": 1.5859375, + "learning_rate": 6.922886264906597e-06, + "loss": 0.8395, + "step": 10530 + }, + { + "epoch": 1.7937072679340722, + "grad_norm": 1.6640625, + "learning_rate": 6.921168011009247e-06, + "loss": 0.8467, + "step": 10531 + }, + { + "epoch": 1.793878732023062, + "grad_norm": 1.625, + "learning_rate": 6.919449857524201e-06, + "loss": 0.8281, + "step": 10532 + }, + { + "epoch": 1.7940501961120519, + "grad_norm": 1.71875, + "learning_rate": 6.917731804507497e-06, + "loss": 0.85, + "step": 10533 + }, + { + "epoch": 1.7942216602010417, + "grad_norm": 1.7421875, + "learning_rate": 6.916013852015165e-06, + "loss": 0.8706, + "step": 10534 + }, + { + "epoch": 1.7943931242900315, + "grad_norm": 1.71875, + "learning_rate": 6.914296000103238e-06, + "loss": 0.8551, + "step": 10535 + }, + { + "epoch": 1.7945645883790213, + "grad_norm": 1.7421875, + "learning_rate": 6.9125782488277345e-06, + "loss": 0.7838, + "step": 10536 + }, + { + "epoch": 1.7947360524680112, + "grad_norm": 1.6015625, + "learning_rate": 6.910860598244682e-06, + "loss": 0.8028, + "step": 10537 + }, + { + "epoch": 1.794907516557001, + "grad_norm": 1.6640625, + "learning_rate": 6.909143048410094e-06, + "loss": 0.8455, + "step": 10538 + }, + { + "epoch": 1.7950789806459908, + "grad_norm": 1.6640625, + "learning_rate": 6.907425599379993e-06, + "loss": 0.8057, + "step": 10539 + }, + { + "epoch": 1.7952504447349809, + "grad_norm": 1.7734375, + "learning_rate": 6.905708251210388e-06, + "loss": 0.788, + "step": 10540 + }, + { + "epoch": 1.7954219088239707, + "grad_norm": 1.8515625, + "learning_rate": 6.90399100395729e-06, + "loss": 0.8614, + "step": 10541 + }, + { + "epoch": 1.7955933729129605, + "grad_norm": 1.59375, + "learning_rate": 6.902273857676703e-06, + "loss": 0.8715, + "step": 10542 + }, + { + "epoch": 1.7957648370019506, + "grad_norm": 1.71875, + "learning_rate": 6.900556812424631e-06, + "loss": 0.9178, + "step": 10543 + }, + { + "epoch": 1.7959363010909404, + "grad_norm": 
1.6796875, + "learning_rate": 6.898839868257072e-06, + "loss": 0.7955, + "step": 10544 + }, + { + "epoch": 1.7961077651799302, + "grad_norm": 1.703125, + "learning_rate": 6.897123025230023e-06, + "loss": 0.8021, + "step": 10545 + }, + { + "epoch": 1.79627922926892, + "grad_norm": 1.609375, + "learning_rate": 6.89540628339948e-06, + "loss": 0.7967, + "step": 10546 + }, + { + "epoch": 1.7964506933579099, + "grad_norm": 1.7578125, + "learning_rate": 6.893689642821426e-06, + "loss": 0.9277, + "step": 10547 + }, + { + "epoch": 1.7966221574468997, + "grad_norm": 1.703125, + "learning_rate": 6.89197310355185e-06, + "loss": 0.8216, + "step": 10548 + }, + { + "epoch": 1.7967936215358895, + "grad_norm": 1.625, + "learning_rate": 6.890256665646735e-06, + "loss": 0.7643, + "step": 10549 + }, + { + "epoch": 1.7969650856248793, + "grad_norm": 1.7890625, + "learning_rate": 6.88854032916206e-06, + "loss": 0.8166, + "step": 10550 + }, + { + "epoch": 1.7971365497138692, + "grad_norm": 1.609375, + "learning_rate": 6.886824094153801e-06, + "loss": 0.8987, + "step": 10551 + }, + { + "epoch": 1.7973080138028592, + "grad_norm": 1.71875, + "learning_rate": 6.885107960677933e-06, + "loss": 0.8656, + "step": 10552 + }, + { + "epoch": 1.797479477891849, + "grad_norm": 1.828125, + "learning_rate": 6.883391928790423e-06, + "loss": 0.8586, + "step": 10553 + }, + { + "epoch": 1.7976509419808389, + "grad_norm": 1.7421875, + "learning_rate": 6.881675998547238e-06, + "loss": 0.9042, + "step": 10554 + }, + { + "epoch": 1.797822406069829, + "grad_norm": 1.65625, + "learning_rate": 6.879960170004341e-06, + "loss": 0.852, + "step": 10555 + }, + { + "epoch": 1.7979938701588187, + "grad_norm": 1.75, + "learning_rate": 6.878244443217693e-06, + "loss": 0.8558, + "step": 10556 + }, + { + "epoch": 1.7981653342478086, + "grad_norm": 1.7265625, + "learning_rate": 6.876528818243249e-06, + "loss": 0.9334, + "step": 10557 + }, + { + "epoch": 1.7983367983367984, + "grad_norm": 1.71875, + "learning_rate": 6.874813295136964e-06, + "loss": 0.8312, + "step": 10558 + }, + { + "epoch": 1.7985082624257882, + "grad_norm": 1.7109375, + "learning_rate": 6.873097873954783e-06, + "loss": 0.7822, + "step": 10559 + }, + { + "epoch": 1.798679726514778, + "grad_norm": 1.59375, + "learning_rate": 6.871382554752655e-06, + "loss": 0.8809, + "step": 10560 + }, + { + "epoch": 1.7988511906037679, + "grad_norm": 1.640625, + "learning_rate": 6.869667337586524e-06, + "loss": 0.8643, + "step": 10561 + }, + { + "epoch": 1.7990226546927577, + "grad_norm": 1.75, + "learning_rate": 6.8679522225123286e-06, + "loss": 0.8073, + "step": 10562 + }, + { + "epoch": 1.7991941187817475, + "grad_norm": 1.625, + "learning_rate": 6.8662372095860055e-06, + "loss": 0.8247, + "step": 10563 + }, + { + "epoch": 1.7993655828707376, + "grad_norm": 1.6171875, + "learning_rate": 6.864522298863488e-06, + "loss": 0.8408, + "step": 10564 + }, + { + "epoch": 1.7995370469597274, + "grad_norm": 1.75, + "learning_rate": 6.862807490400705e-06, + "loss": 0.8944, + "step": 10565 + }, + { + "epoch": 1.7997085110487172, + "grad_norm": 1.7734375, + "learning_rate": 6.8610927842535825e-06, + "loss": 0.8626, + "step": 10566 + }, + { + "epoch": 1.7998799751377073, + "grad_norm": 1.671875, + "learning_rate": 6.859378180478046e-06, + "loss": 0.8183, + "step": 10567 + }, + { + "epoch": 1.800051439226697, + "grad_norm": 1.7421875, + "learning_rate": 6.857663679130013e-06, + "loss": 0.8266, + "step": 10568 + }, + { + "epoch": 1.800222903315687, + "grad_norm": 1.7109375, + "learning_rate": 
6.855949280265402e-06, + "loss": 0.8345, + "step": 10569 + }, + { + "epoch": 1.8003943674046767, + "grad_norm": 1.7109375, + "learning_rate": 6.854234983940123e-06, + "loss": 0.8341, + "step": 10570 + }, + { + "epoch": 1.8005658314936666, + "grad_norm": 1.7734375, + "learning_rate": 6.8525207902100865e-06, + "loss": 0.8453, + "step": 10571 + }, + { + "epoch": 1.8007372955826564, + "grad_norm": 1.7421875, + "learning_rate": 6.850806699131198e-06, + "loss": 0.9124, + "step": 10572 + }, + { + "epoch": 1.8009087596716462, + "grad_norm": 1.625, + "learning_rate": 6.849092710759364e-06, + "loss": 0.826, + "step": 10573 + }, + { + "epoch": 1.801080223760636, + "grad_norm": 1.71875, + "learning_rate": 6.847378825150481e-06, + "loss": 0.9096, + "step": 10574 + }, + { + "epoch": 1.8012516878496259, + "grad_norm": 1.671875, + "learning_rate": 6.845665042360445e-06, + "loss": 0.8523, + "step": 10575 + }, + { + "epoch": 1.801423151938616, + "grad_norm": 1.6328125, + "learning_rate": 6.843951362445153e-06, + "loss": 0.916, + "step": 10576 + }, + { + "epoch": 1.8015946160276057, + "grad_norm": 1.7109375, + "learning_rate": 6.842237785460487e-06, + "loss": 0.8603, + "step": 10577 + }, + { + "epoch": 1.8017660801165956, + "grad_norm": 1.7265625, + "learning_rate": 6.840524311462341e-06, + "loss": 0.8717, + "step": 10578 + }, + { + "epoch": 1.8019375442055856, + "grad_norm": 1.703125, + "learning_rate": 6.838810940506595e-06, + "loss": 0.82, + "step": 10579 + }, + { + "epoch": 1.8021090082945754, + "grad_norm": 1.625, + "learning_rate": 6.837097672649126e-06, + "loss": 0.7833, + "step": 10580 + }, + { + "epoch": 1.8022804723835653, + "grad_norm": 1.8359375, + "learning_rate": 6.8353845079458195e-06, + "loss": 0.8223, + "step": 10581 + }, + { + "epoch": 1.802451936472555, + "grad_norm": 1.8203125, + "learning_rate": 6.833671446452535e-06, + "loss": 0.8815, + "step": 10582 + }, + { + "epoch": 1.802623400561545, + "grad_norm": 1.65625, + "learning_rate": 6.831958488225149e-06, + "loss": 0.8606, + "step": 10583 + }, + { + "epoch": 1.8027948646505347, + "grad_norm": 1.625, + "learning_rate": 6.8302456333195255e-06, + "loss": 0.8732, + "step": 10584 + }, + { + "epoch": 1.8029663287395246, + "grad_norm": 1.6640625, + "learning_rate": 6.828532881791528e-06, + "loss": 0.8402, + "step": 10585 + }, + { + "epoch": 1.8031377928285144, + "grad_norm": 1.6796875, + "learning_rate": 6.826820233697015e-06, + "loss": 0.8685, + "step": 10586 + }, + { + "epoch": 1.8033092569175042, + "grad_norm": 1.65625, + "learning_rate": 6.825107689091846e-06, + "loss": 0.8481, + "step": 10587 + }, + { + "epoch": 1.803480721006494, + "grad_norm": 1.65625, + "learning_rate": 6.823395248031867e-06, + "loss": 0.8237, + "step": 10588 + }, + { + "epoch": 1.803652185095484, + "grad_norm": 1.6875, + "learning_rate": 6.821682910572934e-06, + "loss": 0.9003, + "step": 10589 + }, + { + "epoch": 1.803823649184474, + "grad_norm": 1.6328125, + "learning_rate": 6.81997067677089e-06, + "loss": 0.8977, + "step": 10590 + }, + { + "epoch": 1.8039951132734637, + "grad_norm": 1.609375, + "learning_rate": 6.818258546681575e-06, + "loss": 0.7993, + "step": 10591 + }, + { + "epoch": 1.8041665773624538, + "grad_norm": 1.703125, + "learning_rate": 6.816546520360836e-06, + "loss": 0.9014, + "step": 10592 + }, + { + "epoch": 1.8043380414514436, + "grad_norm": 1.71875, + "learning_rate": 6.814834597864497e-06, + "loss": 0.8515, + "step": 10593 + }, + { + "epoch": 1.8045095055404334, + "grad_norm": 1.6953125, + "learning_rate": 6.8131227792483956e-06, + "loss": 0.9445, 
+ "step": 10594 + }, + { + "epoch": 1.8046809696294233, + "grad_norm": 1.71875, + "learning_rate": 6.8114110645683625e-06, + "loss": 0.8309, + "step": 10595 + }, + { + "epoch": 1.804852433718413, + "grad_norm": 1.6015625, + "learning_rate": 6.809699453880223e-06, + "loss": 0.8086, + "step": 10596 + }, + { + "epoch": 1.805023897807403, + "grad_norm": 1.6875, + "learning_rate": 6.807987947239796e-06, + "loss": 0.9167, + "step": 10597 + }, + { + "epoch": 1.8051953618963927, + "grad_norm": 1.6484375, + "learning_rate": 6.806276544702902e-06, + "loss": 0.8983, + "step": 10598 + }, + { + "epoch": 1.8053668259853826, + "grad_norm": 1.59375, + "learning_rate": 6.804565246325358e-06, + "loss": 0.8876, + "step": 10599 + }, + { + "epoch": 1.8055382900743724, + "grad_norm": 1.625, + "learning_rate": 6.802854052162973e-06, + "loss": 0.846, + "step": 10600 + }, + { + "epoch": 1.8057097541633624, + "grad_norm": 1.6640625, + "learning_rate": 6.801142962271556e-06, + "loss": 0.8577, + "step": 10601 + }, + { + "epoch": 1.8058812182523523, + "grad_norm": 1.609375, + "learning_rate": 6.799431976706914e-06, + "loss": 0.8461, + "step": 10602 + }, + { + "epoch": 1.806052682341342, + "grad_norm": 1.5859375, + "learning_rate": 6.797721095524847e-06, + "loss": 0.8367, + "step": 10603 + }, + { + "epoch": 1.8062241464303321, + "grad_norm": 1.734375, + "learning_rate": 6.796010318781157e-06, + "loss": 0.8702, + "step": 10604 + }, + { + "epoch": 1.806395610519322, + "grad_norm": 1.7421875, + "learning_rate": 6.794299646531631e-06, + "loss": 0.8586, + "step": 10605 + }, + { + "epoch": 1.8065670746083118, + "grad_norm": 1.6015625, + "learning_rate": 6.792589078832066e-06, + "loss": 0.8048, + "step": 10606 + }, + { + "epoch": 1.8067385386973016, + "grad_norm": 1.75, + "learning_rate": 6.79087861573825e-06, + "loss": 0.8556, + "step": 10607 + }, + { + "epoch": 1.8069100027862914, + "grad_norm": 1.59375, + "learning_rate": 6.789168257305967e-06, + "loss": 0.8026, + "step": 10608 + }, + { + "epoch": 1.8070814668752813, + "grad_norm": 1.5859375, + "learning_rate": 6.787458003590996e-06, + "loss": 0.7729, + "step": 10609 + }, + { + "epoch": 1.807252930964271, + "grad_norm": 1.734375, + "learning_rate": 6.78574785464912e-06, + "loss": 0.8806, + "step": 10610 + }, + { + "epoch": 1.807424395053261, + "grad_norm": 1.6328125, + "learning_rate": 6.784037810536108e-06, + "loss": 0.86, + "step": 10611 + }, + { + "epoch": 1.8075958591422507, + "grad_norm": 1.8046875, + "learning_rate": 6.782327871307733e-06, + "loss": 0.9071, + "step": 10612 + }, + { + "epoch": 1.8077673232312408, + "grad_norm": 1.6640625, + "learning_rate": 6.780618037019764e-06, + "loss": 0.8872, + "step": 10613 + }, + { + "epoch": 1.8079387873202306, + "grad_norm": 1.6796875, + "learning_rate": 6.778908307727962e-06, + "loss": 0.8096, + "step": 10614 + }, + { + "epoch": 1.8081102514092204, + "grad_norm": 1.71875, + "learning_rate": 6.777198683488093e-06, + "loss": 0.8722, + "step": 10615 + }, + { + "epoch": 1.8082817154982105, + "grad_norm": 1.6328125, + "learning_rate": 6.7754891643559095e-06, + "loss": 0.8332, + "step": 10616 + }, + { + "epoch": 1.8084531795872003, + "grad_norm": 1.640625, + "learning_rate": 6.773779750387166e-06, + "loss": 0.7589, + "step": 10617 + }, + { + "epoch": 1.8086246436761901, + "grad_norm": 1.7421875, + "learning_rate": 6.772070441637614e-06, + "loss": 0.8632, + "step": 10618 + }, + { + "epoch": 1.80879610776518, + "grad_norm": 1.75, + "learning_rate": 6.7703612381629994e-06, + "loss": 0.8868, + "step": 10619 + }, + { + "epoch": 
1.8089675718541698, + "grad_norm": 1.734375, + "learning_rate": 6.768652140019067e-06, + "loss": 0.8385, + "step": 10620 + }, + { + "epoch": 1.8091390359431596, + "grad_norm": 1.625, + "learning_rate": 6.7669431472615565e-06, + "loss": 0.8399, + "step": 10621 + }, + { + "epoch": 1.8093105000321494, + "grad_norm": 1.6796875, + "learning_rate": 6.765234259946204e-06, + "loss": 0.9131, + "step": 10622 + }, + { + "epoch": 1.8094819641211393, + "grad_norm": 1.78125, + "learning_rate": 6.763525478128744e-06, + "loss": 0.8054, + "step": 10623 + }, + { + "epoch": 1.809653428210129, + "grad_norm": 1.6640625, + "learning_rate": 6.7618168018649024e-06, + "loss": 0.8744, + "step": 10624 + }, + { + "epoch": 1.8098248922991191, + "grad_norm": 1.6640625, + "learning_rate": 6.760108231210411e-06, + "loss": 0.836, + "step": 10625 + }, + { + "epoch": 1.809996356388109, + "grad_norm": 1.703125, + "learning_rate": 6.758399766220992e-06, + "loss": 0.8886, + "step": 10626 + }, + { + "epoch": 1.8101678204770988, + "grad_norm": 1.78125, + "learning_rate": 6.756691406952367e-06, + "loss": 0.7741, + "step": 10627 + }, + { + "epoch": 1.8103392845660888, + "grad_norm": 1.703125, + "learning_rate": 6.754983153460244e-06, + "loss": 0.8985, + "step": 10628 + }, + { + "epoch": 1.8105107486550787, + "grad_norm": 1.7421875, + "learning_rate": 6.7532750058003395e-06, + "loss": 0.7809, + "step": 10629 + }, + { + "epoch": 1.8106822127440685, + "grad_norm": 1.7734375, + "learning_rate": 6.751566964028363e-06, + "loss": 0.9026, + "step": 10630 + }, + { + "epoch": 1.8108536768330583, + "grad_norm": 1.703125, + "learning_rate": 6.749859028200021e-06, + "loss": 0.8978, + "step": 10631 + }, + { + "epoch": 1.8110251409220481, + "grad_norm": 1.7421875, + "learning_rate": 6.7481511983710125e-06, + "loss": 0.8433, + "step": 10632 + }, + { + "epoch": 1.811196605011038, + "grad_norm": 1.640625, + "learning_rate": 6.746443474597041e-06, + "loss": 0.8048, + "step": 10633 + }, + { + "epoch": 1.8113680691000278, + "grad_norm": 1.7578125, + "learning_rate": 6.744735856933799e-06, + "loss": 0.8588, + "step": 10634 + }, + { + "epoch": 1.8115395331890176, + "grad_norm": 1.65625, + "learning_rate": 6.7430283454369795e-06, + "loss": 0.8494, + "step": 10635 + }, + { + "epoch": 1.8117109972780074, + "grad_norm": 1.6328125, + "learning_rate": 6.741320940162271e-06, + "loss": 0.8099, + "step": 10636 + }, + { + "epoch": 1.8118824613669975, + "grad_norm": 1.6640625, + "learning_rate": 6.7396136411653566e-06, + "loss": 0.827, + "step": 10637 + }, + { + "epoch": 1.8120539254559873, + "grad_norm": 1.671875, + "learning_rate": 6.737906448501923e-06, + "loss": 0.7802, + "step": 10638 + }, + { + "epoch": 1.8122253895449771, + "grad_norm": 1.65625, + "learning_rate": 6.73619936222764e-06, + "loss": 0.8083, + "step": 10639 + }, + { + "epoch": 1.8123968536339672, + "grad_norm": 1.75, + "learning_rate": 6.734492382398184e-06, + "loss": 0.8436, + "step": 10640 + }, + { + "epoch": 1.812568317722957, + "grad_norm": 1.8125, + "learning_rate": 6.7327855090692305e-06, + "loss": 0.845, + "step": 10641 + }, + { + "epoch": 1.8127397818119468, + "grad_norm": 1.6796875, + "learning_rate": 6.731078742296444e-06, + "loss": 0.8076, + "step": 10642 + }, + { + "epoch": 1.8129112459009367, + "grad_norm": 1.671875, + "learning_rate": 6.72937208213549e-06, + "loss": 0.8896, + "step": 10643 + }, + { + "epoch": 1.8130827099899265, + "grad_norm": 1.7109375, + "learning_rate": 6.727665528642029e-06, + "loss": 0.8489, + "step": 10644 + }, + { + "epoch": 1.8132541740789163, + 
"grad_norm": 1.6484375, + "learning_rate": 6.725959081871717e-06, + "loss": 0.8434, + "step": 10645 + }, + { + "epoch": 1.8134256381679061, + "grad_norm": 1.7109375, + "learning_rate": 6.724252741880208e-06, + "loss": 0.884, + "step": 10646 + }, + { + "epoch": 1.813597102256896, + "grad_norm": 1.71875, + "learning_rate": 6.722546508723154e-06, + "loss": 0.8877, + "step": 10647 + }, + { + "epoch": 1.8137685663458858, + "grad_norm": 1.75, + "learning_rate": 6.720840382456198e-06, + "loss": 0.8864, + "step": 10648 + }, + { + "epoch": 1.8139400304348758, + "grad_norm": 1.703125, + "learning_rate": 6.719134363134986e-06, + "loss": 0.8983, + "step": 10649 + }, + { + "epoch": 1.8141114945238657, + "grad_norm": 1.8125, + "learning_rate": 6.717428450815159e-06, + "loss": 0.8891, + "step": 10650 + }, + { + "epoch": 1.8142829586128555, + "grad_norm": 1.6953125, + "learning_rate": 6.715722645552349e-06, + "loss": 0.8517, + "step": 10651 + }, + { + "epoch": 1.8144544227018455, + "grad_norm": 1.546875, + "learning_rate": 6.714016947402191e-06, + "loss": 0.8189, + "step": 10652 + }, + { + "epoch": 1.8146258867908354, + "grad_norm": 1.7109375, + "learning_rate": 6.712311356420315e-06, + "loss": 0.7795, + "step": 10653 + }, + { + "epoch": 1.8147973508798252, + "grad_norm": 1.640625, + "learning_rate": 6.710605872662346e-06, + "loss": 0.8564, + "step": 10654 + }, + { + "epoch": 1.814968814968815, + "grad_norm": 1.6328125, + "learning_rate": 6.708900496183906e-06, + "loss": 0.8641, + "step": 10655 + }, + { + "epoch": 1.8151402790578048, + "grad_norm": 1.7578125, + "learning_rate": 6.707195227040612e-06, + "loss": 0.8355, + "step": 10656 + }, + { + "epoch": 1.8153117431467947, + "grad_norm": 1.71875, + "learning_rate": 6.705490065288083e-06, + "loss": 0.7409, + "step": 10657 + }, + { + "epoch": 1.8154832072357845, + "grad_norm": 1.75, + "learning_rate": 6.7037850109819294e-06, + "loss": 0.8776, + "step": 10658 + }, + { + "epoch": 1.8156546713247743, + "grad_norm": 1.78125, + "learning_rate": 6.702080064177759e-06, + "loss": 0.7984, + "step": 10659 + }, + { + "epoch": 1.8158261354137641, + "grad_norm": 1.7421875, + "learning_rate": 6.700375224931175e-06, + "loss": 0.8178, + "step": 10660 + }, + { + "epoch": 1.8159975995027542, + "grad_norm": 1.71875, + "learning_rate": 6.698670493297781e-06, + "loss": 0.7873, + "step": 10661 + }, + { + "epoch": 1.816169063591744, + "grad_norm": 1.703125, + "learning_rate": 6.696965869333176e-06, + "loss": 0.8951, + "step": 10662 + }, + { + "epoch": 1.8163405276807338, + "grad_norm": 1.71875, + "learning_rate": 6.69526135309295e-06, + "loss": 0.8174, + "step": 10663 + }, + { + "epoch": 1.8165119917697239, + "grad_norm": 1.6484375, + "learning_rate": 6.693556944632696e-06, + "loss": 0.7665, + "step": 10664 + }, + { + "epoch": 1.8166834558587137, + "grad_norm": 1.7265625, + "learning_rate": 6.691852644008e-06, + "loss": 0.8757, + "step": 10665 + }, + { + "epoch": 1.8168549199477035, + "grad_norm": 1.6796875, + "learning_rate": 6.690148451274447e-06, + "loss": 0.8875, + "step": 10666 + }, + { + "epoch": 1.8170263840366934, + "grad_norm": 1.625, + "learning_rate": 6.688444366487618e-06, + "loss": 0.7827, + "step": 10667 + }, + { + "epoch": 1.8171978481256832, + "grad_norm": 1.671875, + "learning_rate": 6.686740389703087e-06, + "loss": 0.8972, + "step": 10668 + }, + { + "epoch": 1.817369312214673, + "grad_norm": 1.6875, + "learning_rate": 6.685036520976429e-06, + "loss": 0.8712, + "step": 10669 + }, + { + "epoch": 1.8175407763036628, + "grad_norm": 1.6796875, + "learning_rate": 
6.683332760363212e-06, + "loss": 0.8629, + "step": 10670 + }, + { + "epoch": 1.8177122403926527, + "grad_norm": 1.75, + "learning_rate": 6.681629107919002e-06, + "loss": 0.8683, + "step": 10671 + }, + { + "epoch": 1.8178837044816425, + "grad_norm": 1.7109375, + "learning_rate": 6.679925563699365e-06, + "loss": 0.8437, + "step": 10672 + }, + { + "epoch": 1.8180551685706325, + "grad_norm": 1.7734375, + "learning_rate": 6.678222127759862e-06, + "loss": 0.8247, + "step": 10673 + }, + { + "epoch": 1.8182266326596224, + "grad_norm": 1.7265625, + "learning_rate": 6.676518800156039e-06, + "loss": 0.9106, + "step": 10674 + }, + { + "epoch": 1.8183980967486122, + "grad_norm": 1.578125, + "learning_rate": 6.674815580943453e-06, + "loss": 0.8461, + "step": 10675 + }, + { + "epoch": 1.8185695608376022, + "grad_norm": 1.7421875, + "learning_rate": 6.673112470177653e-06, + "loss": 0.8636, + "step": 10676 + }, + { + "epoch": 1.818741024926592, + "grad_norm": 1.8203125, + "learning_rate": 6.671409467914183e-06, + "loss": 0.8077, + "step": 10677 + }, + { + "epoch": 1.818912489015582, + "grad_norm": 1.640625, + "learning_rate": 6.669706574208585e-06, + "loss": 0.8084, + "step": 10678 + }, + { + "epoch": 1.8190839531045717, + "grad_norm": 1.640625, + "learning_rate": 6.668003789116393e-06, + "loss": 0.789, + "step": 10679 + }, + { + "epoch": 1.8192554171935615, + "grad_norm": 1.59375, + "learning_rate": 6.666301112693147e-06, + "loss": 0.8396, + "step": 10680 + }, + { + "epoch": 1.8194268812825514, + "grad_norm": 1.7265625, + "learning_rate": 6.664598544994376e-06, + "loss": 0.9069, + "step": 10681 + }, + { + "epoch": 1.8195983453715412, + "grad_norm": 1.65625, + "learning_rate": 6.662896086075607e-06, + "loss": 0.8007, + "step": 10682 + }, + { + "epoch": 1.819769809460531, + "grad_norm": 1.640625, + "learning_rate": 6.6611937359923615e-06, + "loss": 0.8241, + "step": 10683 + }, + { + "epoch": 1.8199412735495208, + "grad_norm": 1.7109375, + "learning_rate": 6.659491494800161e-06, + "loss": 0.8853, + "step": 10684 + }, + { + "epoch": 1.8201127376385107, + "grad_norm": 1.71875, + "learning_rate": 6.657789362554527e-06, + "loss": 0.8538, + "step": 10685 + }, + { + "epoch": 1.8202842017275007, + "grad_norm": 1.703125, + "learning_rate": 6.656087339310964e-06, + "loss": 0.797, + "step": 10686 + }, + { + "epoch": 1.8204556658164905, + "grad_norm": 1.71875, + "learning_rate": 6.654385425124981e-06, + "loss": 0.8743, + "step": 10687 + }, + { + "epoch": 1.8206271299054804, + "grad_norm": 1.8203125, + "learning_rate": 6.652683620052092e-06, + "loss": 0.8449, + "step": 10688 + }, + { + "epoch": 1.8207985939944704, + "grad_norm": 1.6640625, + "learning_rate": 6.6509819241477924e-06, + "loss": 0.7926, + "step": 10689 + }, + { + "epoch": 1.8209700580834602, + "grad_norm": 1.765625, + "learning_rate": 6.649280337467585e-06, + "loss": 0.8931, + "step": 10690 + }, + { + "epoch": 1.82114152217245, + "grad_norm": 1.6875, + "learning_rate": 6.6475788600669635e-06, + "loss": 0.8234, + "step": 10691 + }, + { + "epoch": 1.82131298626144, + "grad_norm": 1.625, + "learning_rate": 6.645877492001419e-06, + "loss": 0.8517, + "step": 10692 + }, + { + "epoch": 1.8214844503504297, + "grad_norm": 1.640625, + "learning_rate": 6.64417623332644e-06, + "loss": 0.8323, + "step": 10693 + }, + { + "epoch": 1.8216559144394195, + "grad_norm": 1.75, + "learning_rate": 6.6424750840975115e-06, + "loss": 0.8254, + "step": 10694 + }, + { + "epoch": 1.8218273785284094, + "grad_norm": 1.5546875, + "learning_rate": 6.640774044370113e-06, + "loss": 0.7968, + 
"step": 10695 + }, + { + "epoch": 1.8219988426173992, + "grad_norm": 1.6796875, + "learning_rate": 6.6390731141997255e-06, + "loss": 0.7962, + "step": 10696 + }, + { + "epoch": 1.822170306706389, + "grad_norm": 1.7109375, + "learning_rate": 6.6373722936418176e-06, + "loss": 0.8424, + "step": 10697 + }, + { + "epoch": 1.822341770795379, + "grad_norm": 1.6875, + "learning_rate": 6.6356715827518615e-06, + "loss": 0.9017, + "step": 10698 + }, + { + "epoch": 1.822513234884369, + "grad_norm": 1.703125, + "learning_rate": 6.633970981585323e-06, + "loss": 0.8439, + "step": 10699 + }, + { + "epoch": 1.8226846989733587, + "grad_norm": 1.8203125, + "learning_rate": 6.632270490197667e-06, + "loss": 0.8899, + "step": 10700 + }, + { + "epoch": 1.8228561630623488, + "grad_norm": 1.671875, + "learning_rate": 6.630570108644352e-06, + "loss": 0.8732, + "step": 10701 + }, + { + "epoch": 1.8230276271513386, + "grad_norm": 1.578125, + "learning_rate": 6.628869836980833e-06, + "loss": 0.7281, + "step": 10702 + }, + { + "epoch": 1.8231990912403284, + "grad_norm": 1.65625, + "learning_rate": 6.627169675262563e-06, + "loss": 0.8485, + "step": 10703 + }, + { + "epoch": 1.8233705553293182, + "grad_norm": 1.6875, + "learning_rate": 6.625469623544994e-06, + "loss": 0.8677, + "step": 10704 + }, + { + "epoch": 1.823542019418308, + "grad_norm": 1.828125, + "learning_rate": 6.623769681883565e-06, + "loss": 0.8255, + "step": 10705 + }, + { + "epoch": 1.823713483507298, + "grad_norm": 1.796875, + "learning_rate": 6.62206985033372e-06, + "loss": 0.897, + "step": 10706 + }, + { + "epoch": 1.8238849475962877, + "grad_norm": 1.6875, + "learning_rate": 6.620370128950898e-06, + "loss": 0.8786, + "step": 10707 + }, + { + "epoch": 1.8240564116852775, + "grad_norm": 1.7265625, + "learning_rate": 6.618670517790535e-06, + "loss": 0.8853, + "step": 10708 + }, + { + "epoch": 1.8242278757742674, + "grad_norm": 1.796875, + "learning_rate": 6.6169710169080585e-06, + "loss": 0.8861, + "step": 10709 + }, + { + "epoch": 1.8243993398632574, + "grad_norm": 1.625, + "learning_rate": 6.615271626358895e-06, + "loss": 0.8048, + "step": 10710 + }, + { + "epoch": 1.8245708039522472, + "grad_norm": 1.71875, + "learning_rate": 6.613572346198469e-06, + "loss": 0.8505, + "step": 10711 + }, + { + "epoch": 1.824742268041237, + "grad_norm": 1.7109375, + "learning_rate": 6.611873176482201e-06, + "loss": 0.8112, + "step": 10712 + }, + { + "epoch": 1.8249137321302271, + "grad_norm": 1.671875, + "learning_rate": 6.610174117265507e-06, + "loss": 0.8226, + "step": 10713 + }, + { + "epoch": 1.825085196219217, + "grad_norm": 1.6015625, + "learning_rate": 6.608475168603801e-06, + "loss": 0.863, + "step": 10714 + }, + { + "epoch": 1.8252566603082068, + "grad_norm": 1.765625, + "learning_rate": 6.606776330552491e-06, + "loss": 0.8547, + "step": 10715 + }, + { + "epoch": 1.8254281243971966, + "grad_norm": 1.6328125, + "learning_rate": 6.605077603166981e-06, + "loss": 0.7697, + "step": 10716 + }, + { + "epoch": 1.8255995884861864, + "grad_norm": 1.65625, + "learning_rate": 6.603378986502677e-06, + "loss": 0.8732, + "step": 10717 + }, + { + "epoch": 1.8257710525751762, + "grad_norm": 1.59375, + "learning_rate": 6.6016804806149715e-06, + "loss": 0.8357, + "step": 10718 + }, + { + "epoch": 1.825942516664166, + "grad_norm": 1.7265625, + "learning_rate": 6.5999820855592675e-06, + "loss": 0.8082, + "step": 10719 + }, + { + "epoch": 1.826113980753156, + "grad_norm": 1.6875, + "learning_rate": 6.598283801390948e-06, + "loss": 0.9118, + "step": 10720 + }, + { + "epoch": 
1.8262854448421457, + "grad_norm": 1.6796875, + "learning_rate": 6.596585628165404e-06, + "loss": 0.8519, + "step": 10721 + }, + { + "epoch": 1.8264569089311358, + "grad_norm": 1.5859375, + "learning_rate": 6.594887565938018e-06, + "loss": 0.7778, + "step": 10722 + }, + { + "epoch": 1.8266283730201256, + "grad_norm": 1.6796875, + "learning_rate": 6.593189614764171e-06, + "loss": 0.893, + "step": 10723 + }, + { + "epoch": 1.8267998371091154, + "grad_norm": 1.6640625, + "learning_rate": 6.591491774699239e-06, + "loss": 0.9023, + "step": 10724 + }, + { + "epoch": 1.8269713011981055, + "grad_norm": 1.7421875, + "learning_rate": 6.589794045798596e-06, + "loss": 0.8617, + "step": 10725 + }, + { + "epoch": 1.8271427652870953, + "grad_norm": 1.65625, + "learning_rate": 6.588096428117608e-06, + "loss": 0.8279, + "step": 10726 + }, + { + "epoch": 1.8273142293760851, + "grad_norm": 1.671875, + "learning_rate": 6.586398921711646e-06, + "loss": 0.8542, + "step": 10727 + }, + { + "epoch": 1.827485693465075, + "grad_norm": 1.734375, + "learning_rate": 6.5847015266360706e-06, + "loss": 0.8184, + "step": 10728 + }, + { + "epoch": 1.8276571575540648, + "grad_norm": 1.6796875, + "learning_rate": 6.5830042429462386e-06, + "loss": 0.81, + "step": 10729 + }, + { + "epoch": 1.8278286216430546, + "grad_norm": 1.7421875, + "learning_rate": 6.581307070697505e-06, + "loss": 0.8955, + "step": 10730 + }, + { + "epoch": 1.8280000857320444, + "grad_norm": 1.7265625, + "learning_rate": 6.579610009945225e-06, + "loss": 0.9194, + "step": 10731 + }, + { + "epoch": 1.8281715498210342, + "grad_norm": 1.59375, + "learning_rate": 6.57791306074474e-06, + "loss": 0.8302, + "step": 10732 + }, + { + "epoch": 1.828343013910024, + "grad_norm": 1.625, + "learning_rate": 6.576216223151395e-06, + "loss": 0.8272, + "step": 10733 + }, + { + "epoch": 1.8285144779990141, + "grad_norm": 1.796875, + "learning_rate": 6.57451949722053e-06, + "loss": 0.8894, + "step": 10734 + }, + { + "epoch": 1.828685942088004, + "grad_norm": 1.734375, + "learning_rate": 6.572822883007486e-06, + "loss": 0.932, + "step": 10735 + }, + { + "epoch": 1.8288574061769938, + "grad_norm": 1.640625, + "learning_rate": 6.571126380567594e-06, + "loss": 0.8385, + "step": 10736 + }, + { + "epoch": 1.8290288702659838, + "grad_norm": 1.7265625, + "learning_rate": 6.569429989956182e-06, + "loss": 0.8257, + "step": 10737 + }, + { + "epoch": 1.8292003343549736, + "grad_norm": 1.796875, + "learning_rate": 6.567733711228577e-06, + "loss": 0.9127, + "step": 10738 + }, + { + "epoch": 1.8293717984439635, + "grad_norm": 1.7265625, + "learning_rate": 6.566037544440098e-06, + "loss": 0.8548, + "step": 10739 + }, + { + "epoch": 1.8295432625329533, + "grad_norm": 1.8125, + "learning_rate": 6.564341489646068e-06, + "loss": 0.9063, + "step": 10740 + }, + { + "epoch": 1.8297147266219431, + "grad_norm": 1.75, + "learning_rate": 6.562645546901798e-06, + "loss": 0.8873, + "step": 10741 + }, + { + "epoch": 1.829886190710933, + "grad_norm": 1.6171875, + "learning_rate": 6.5609497162626044e-06, + "loss": 0.8054, + "step": 10742 + }, + { + "epoch": 1.8300576547999228, + "grad_norm": 1.71875, + "learning_rate": 6.5592539977837875e-06, + "loss": 0.8719, + "step": 10743 + }, + { + "epoch": 1.8302291188889126, + "grad_norm": 1.609375, + "learning_rate": 6.557558391520655e-06, + "loss": 0.793, + "step": 10744 + }, + { + "epoch": 1.8304005829779024, + "grad_norm": 1.6953125, + "learning_rate": 6.555862897528507e-06, + "loss": 0.8555, + "step": 10745 + }, + { + "epoch": 1.8305720470668925, + "grad_norm": 
1.671875, + "learning_rate": 6.554167515862637e-06, + "loss": 0.8055, + "step": 10746 + }, + { + "epoch": 1.8307435111558823, + "grad_norm": 1.7890625, + "learning_rate": 6.552472246578343e-06, + "loss": 0.8477, + "step": 10747 + }, + { + "epoch": 1.8309149752448721, + "grad_norm": 1.734375, + "learning_rate": 6.55077708973091e-06, + "loss": 0.8425, + "step": 10748 + }, + { + "epoch": 1.8310864393338622, + "grad_norm": 1.78125, + "learning_rate": 6.549082045375624e-06, + "loss": 0.8409, + "step": 10749 + }, + { + "epoch": 1.831257903422852, + "grad_norm": 1.6484375, + "learning_rate": 6.547387113567768e-06, + "loss": 0.8289, + "step": 10750 + }, + { + "epoch": 1.8314293675118418, + "grad_norm": 1.75, + "learning_rate": 6.54569229436262e-06, + "loss": 0.9509, + "step": 10751 + }, + { + "epoch": 1.8316008316008316, + "grad_norm": 1.6796875, + "learning_rate": 6.543997587815454e-06, + "loss": 0.8315, + "step": 10752 + }, + { + "epoch": 1.8317722956898215, + "grad_norm": 1.6875, + "learning_rate": 6.5423029939815394e-06, + "loss": 0.8989, + "step": 10753 + }, + { + "epoch": 1.8319437597788113, + "grad_norm": 1.6796875, + "learning_rate": 6.5406085129161475e-06, + "loss": 0.8576, + "step": 10754 + }, + { + "epoch": 1.8321152238678011, + "grad_norm": 1.7734375, + "learning_rate": 6.538914144674537e-06, + "loss": 0.8601, + "step": 10755 + }, + { + "epoch": 1.832286687956791, + "grad_norm": 1.671875, + "learning_rate": 6.53721988931197e-06, + "loss": 0.8074, + "step": 10756 + }, + { + "epoch": 1.8324581520457808, + "grad_norm": 1.7578125, + "learning_rate": 6.535525746883702e-06, + "loss": 0.8987, + "step": 10757 + }, + { + "epoch": 1.8326296161347708, + "grad_norm": 1.6328125, + "learning_rate": 6.533831717444984e-06, + "loss": 0.7944, + "step": 10758 + }, + { + "epoch": 1.8328010802237606, + "grad_norm": 1.7578125, + "learning_rate": 6.5321378010510675e-06, + "loss": 0.8961, + "step": 10759 + }, + { + "epoch": 1.8329725443127505, + "grad_norm": 1.7421875, + "learning_rate": 6.5304439977571945e-06, + "loss": 0.8753, + "step": 10760 + }, + { + "epoch": 1.8331440084017405, + "grad_norm": 1.609375, + "learning_rate": 6.52875030761861e-06, + "loss": 0.825, + "step": 10761 + }, + { + "epoch": 1.8333154724907303, + "grad_norm": 1.5859375, + "learning_rate": 6.527056730690547e-06, + "loss": 0.8377, + "step": 10762 + }, + { + "epoch": 1.8334869365797202, + "grad_norm": 1.59375, + "learning_rate": 6.525363267028244e-06, + "loss": 0.7996, + "step": 10763 + }, + { + "epoch": 1.83365840066871, + "grad_norm": 1.671875, + "learning_rate": 6.523669916686928e-06, + "loss": 0.8381, + "step": 10764 + }, + { + "epoch": 1.8338298647576998, + "grad_norm": 1.8125, + "learning_rate": 6.521976679721829e-06, + "loss": 0.8257, + "step": 10765 + }, + { + "epoch": 1.8340013288466896, + "grad_norm": 1.6484375, + "learning_rate": 6.520283556188166e-06, + "loss": 0.8644, + "step": 10766 + }, + { + "epoch": 1.8341727929356795, + "grad_norm": 1.7578125, + "learning_rate": 6.518590546141159e-06, + "loss": 0.8382, + "step": 10767 + }, + { + "epoch": 1.8343442570246693, + "grad_norm": 1.7578125, + "learning_rate": 6.516897649636025e-06, + "loss": 0.8607, + "step": 10768 + }, + { + "epoch": 1.8345157211136591, + "grad_norm": 1.765625, + "learning_rate": 6.5152048667279735e-06, + "loss": 0.8957, + "step": 10769 + }, + { + "epoch": 1.8346871852026492, + "grad_norm": 1.8125, + "learning_rate": 6.513512197472214e-06, + "loss": 0.9375, + "step": 10770 + }, + { + "epoch": 1.834858649291639, + "grad_norm": 1.703125, + "learning_rate": 
6.511819641923951e-06, + "loss": 0.8961, + "step": 10771 + }, + { + "epoch": 1.8350301133806288, + "grad_norm": 1.6171875, + "learning_rate": 6.510127200138385e-06, + "loss": 0.8588, + "step": 10772 + }, + { + "epoch": 1.8352015774696189, + "grad_norm": 1.7890625, + "learning_rate": 6.50843487217071e-06, + "loss": 0.8759, + "step": 10773 + }, + { + "epoch": 1.8353730415586087, + "grad_norm": 1.6796875, + "learning_rate": 6.506742658076124e-06, + "loss": 0.8354, + "step": 10774 + }, + { + "epoch": 1.8355445056475985, + "grad_norm": 1.7265625, + "learning_rate": 6.505050557909816e-06, + "loss": 0.7801, + "step": 10775 + }, + { + "epoch": 1.8357159697365883, + "grad_norm": 1.859375, + "learning_rate": 6.503358571726968e-06, + "loss": 0.8814, + "step": 10776 + }, + { + "epoch": 1.8358874338255782, + "grad_norm": 1.6953125, + "learning_rate": 6.501666699582769e-06, + "loss": 0.8261, + "step": 10777 + }, + { + "epoch": 1.836058897914568, + "grad_norm": 1.828125, + "learning_rate": 6.499974941532387e-06, + "loss": 0.8936, + "step": 10778 + }, + { + "epoch": 1.8362303620035578, + "grad_norm": 1.75, + "learning_rate": 6.498283297631004e-06, + "loss": 0.8725, + "step": 10779 + }, + { + "epoch": 1.8364018260925477, + "grad_norm": 1.7421875, + "learning_rate": 6.496591767933789e-06, + "loss": 0.7837, + "step": 10780 + }, + { + "epoch": 1.8365732901815375, + "grad_norm": 1.71875, + "learning_rate": 6.4949003524959055e-06, + "loss": 0.8273, + "step": 10781 + }, + { + "epoch": 1.8367447542705273, + "grad_norm": 1.8046875, + "learning_rate": 6.493209051372522e-06, + "loss": 0.8467, + "step": 10782 + }, + { + "epoch": 1.8369162183595173, + "grad_norm": 1.71875, + "learning_rate": 6.491517864618799e-06, + "loss": 0.8545, + "step": 10783 + }, + { + "epoch": 1.8370876824485072, + "grad_norm": 1.6484375, + "learning_rate": 6.4898267922898884e-06, + "loss": 0.8044, + "step": 10784 + }, + { + "epoch": 1.837259146537497, + "grad_norm": 1.6953125, + "learning_rate": 6.488135834440945e-06, + "loss": 0.8334, + "step": 10785 + }, + { + "epoch": 1.837430610626487, + "grad_norm": 1.6875, + "learning_rate": 6.4864449911271165e-06, + "loss": 0.884, + "step": 10786 + }, + { + "epoch": 1.8376020747154769, + "grad_norm": 1.8046875, + "learning_rate": 6.484754262403547e-06, + "loss": 0.8961, + "step": 10787 + }, + { + "epoch": 1.8377735388044667, + "grad_norm": 1.5859375, + "learning_rate": 6.483063648325383e-06, + "loss": 0.7778, + "step": 10788 + }, + { + "epoch": 1.8379450028934565, + "grad_norm": 1.7421875, + "learning_rate": 6.48137314894775e-06, + "loss": 0.8881, + "step": 10789 + }, + { + "epoch": 1.8381164669824464, + "grad_norm": 1.7109375, + "learning_rate": 6.479682764325792e-06, + "loss": 0.8727, + "step": 10790 + }, + { + "epoch": 1.8382879310714362, + "grad_norm": 1.6953125, + "learning_rate": 6.477992494514633e-06, + "loss": 0.8266, + "step": 10791 + }, + { + "epoch": 1.838459395160426, + "grad_norm": 1.703125, + "learning_rate": 6.476302339569405e-06, + "loss": 0.8603, + "step": 10792 + }, + { + "epoch": 1.8386308592494158, + "grad_norm": 1.6796875, + "learning_rate": 6.474612299545225e-06, + "loss": 0.7457, + "step": 10793 + }, + { + "epoch": 1.8388023233384057, + "grad_norm": 1.6796875, + "learning_rate": 6.472922374497211e-06, + "loss": 0.8563, + "step": 10794 + }, + { + "epoch": 1.8389737874273957, + "grad_norm": 1.59375, + "learning_rate": 6.471232564480483e-06, + "loss": 0.7924, + "step": 10795 + }, + { + "epoch": 1.8391452515163855, + "grad_norm": 1.6875, + "learning_rate": 6.469542869550147e-06, + 
"loss": 0.883, + "step": 10796 + }, + { + "epoch": 1.8393167156053754, + "grad_norm": 1.6640625, + "learning_rate": 6.467853289761315e-06, + "loss": 0.9064, + "step": 10797 + }, + { + "epoch": 1.8394881796943654, + "grad_norm": 1.7265625, + "learning_rate": 6.466163825169086e-06, + "loss": 0.9077, + "step": 10798 + }, + { + "epoch": 1.8396596437833552, + "grad_norm": 1.6875, + "learning_rate": 6.464474475828563e-06, + "loss": 0.8597, + "step": 10799 + }, + { + "epoch": 1.839831107872345, + "grad_norm": 1.71875, + "learning_rate": 6.4627852417948425e-06, + "loss": 0.8379, + "step": 10800 + }, + { + "epoch": 1.8400025719613349, + "grad_norm": 1.7421875, + "learning_rate": 6.461096123123013e-06, + "loss": 0.8876, + "step": 10801 + }, + { + "epoch": 1.8401740360503247, + "grad_norm": 1.7109375, + "learning_rate": 6.459407119868165e-06, + "loss": 0.8891, + "step": 10802 + }, + { + "epoch": 1.8403455001393145, + "grad_norm": 1.6640625, + "learning_rate": 6.4577182320853836e-06, + "loss": 0.8582, + "step": 10803 + }, + { + "epoch": 1.8405169642283044, + "grad_norm": 1.6796875, + "learning_rate": 6.456029459829751e-06, + "loss": 0.8022, + "step": 10804 + }, + { + "epoch": 1.8406884283172942, + "grad_norm": 1.71875, + "learning_rate": 6.454340803156341e-06, + "loss": 0.8657, + "step": 10805 + }, + { + "epoch": 1.840859892406284, + "grad_norm": 1.6171875, + "learning_rate": 6.452652262120231e-06, + "loss": 0.8819, + "step": 10806 + }, + { + "epoch": 1.841031356495274, + "grad_norm": 1.6484375, + "learning_rate": 6.450963836776487e-06, + "loss": 0.816, + "step": 10807 + }, + { + "epoch": 1.8412028205842639, + "grad_norm": 1.7578125, + "learning_rate": 6.449275527180178e-06, + "loss": 0.9231, + "step": 10808 + }, + { + "epoch": 1.8413742846732537, + "grad_norm": 1.7578125, + "learning_rate": 6.447587333386365e-06, + "loss": 0.9352, + "step": 10809 + }, + { + "epoch": 1.8415457487622438, + "grad_norm": 1.65625, + "learning_rate": 6.445899255450106e-06, + "loss": 0.8779, + "step": 10810 + }, + { + "epoch": 1.8417172128512336, + "grad_norm": 1.671875, + "learning_rate": 6.444211293426457e-06, + "loss": 0.8487, + "step": 10811 + }, + { + "epoch": 1.8418886769402234, + "grad_norm": 1.7421875, + "learning_rate": 6.442523447370466e-06, + "loss": 0.8554, + "step": 10812 + }, + { + "epoch": 1.8420601410292132, + "grad_norm": 1.7109375, + "learning_rate": 6.440835717337182e-06, + "loss": 0.9101, + "step": 10813 + }, + { + "epoch": 1.842231605118203, + "grad_norm": 1.796875, + "learning_rate": 6.439148103381647e-06, + "loss": 0.8077, + "step": 10814 + }, + { + "epoch": 1.8424030692071929, + "grad_norm": 1.78125, + "learning_rate": 6.437460605558902e-06, + "loss": 0.8836, + "step": 10815 + }, + { + "epoch": 1.8425745332961827, + "grad_norm": 1.640625, + "learning_rate": 6.435773223923982e-06, + "loss": 0.8436, + "step": 10816 + }, + { + "epoch": 1.8427459973851725, + "grad_norm": 1.8125, + "learning_rate": 6.434085958531918e-06, + "loss": 0.8848, + "step": 10817 + }, + { + "epoch": 1.8429174614741624, + "grad_norm": 1.6953125, + "learning_rate": 6.432398809437739e-06, + "loss": 0.8224, + "step": 10818 + }, + { + "epoch": 1.8430889255631524, + "grad_norm": 1.6796875, + "learning_rate": 6.43071177669647e-06, + "loss": 0.8838, + "step": 10819 + }, + { + "epoch": 1.8432603896521422, + "grad_norm": 1.6953125, + "learning_rate": 6.429024860363128e-06, + "loss": 0.8179, + "step": 10820 + }, + { + "epoch": 1.843431853741132, + "grad_norm": 1.6796875, + "learning_rate": 6.427338060492734e-06, + "loss": 0.8074, + "step": 
10821 + }, + { + "epoch": 1.843603317830122, + "grad_norm": 1.71875, + "learning_rate": 6.4256513771403e-06, + "loss": 0.9034, + "step": 10822 + }, + { + "epoch": 1.843774781919112, + "grad_norm": 1.7578125, + "learning_rate": 6.4239648103608384e-06, + "loss": 0.8656, + "step": 10823 + }, + { + "epoch": 1.8439462460081018, + "grad_norm": 1.7265625, + "learning_rate": 6.4222783602093465e-06, + "loss": 0.8631, + "step": 10824 + }, + { + "epoch": 1.8441177100970916, + "grad_norm": 1.734375, + "learning_rate": 6.420592026740829e-06, + "loss": 0.7848, + "step": 10825 + }, + { + "epoch": 1.8442891741860814, + "grad_norm": 1.6484375, + "learning_rate": 6.418905810010285e-06, + "loss": 0.7932, + "step": 10826 + }, + { + "epoch": 1.8444606382750712, + "grad_norm": 1.71875, + "learning_rate": 6.4172197100727075e-06, + "loss": 0.7858, + "step": 10827 + }, + { + "epoch": 1.844632102364061, + "grad_norm": 1.671875, + "learning_rate": 6.4155337269830855e-06, + "loss": 0.8253, + "step": 10828 + }, + { + "epoch": 1.8448035664530509, + "grad_norm": 1.7265625, + "learning_rate": 6.413847860796407e-06, + "loss": 0.8208, + "step": 10829 + }, + { + "epoch": 1.8449750305420407, + "grad_norm": 1.6484375, + "learning_rate": 6.412162111567655e-06, + "loss": 0.8985, + "step": 10830 + }, + { + "epoch": 1.8451464946310308, + "grad_norm": 1.6640625, + "learning_rate": 6.410476479351806e-06, + "loss": 0.8778, + "step": 10831 + }, + { + "epoch": 1.8453179587200206, + "grad_norm": 1.6171875, + "learning_rate": 6.408790964203837e-06, + "loss": 0.8216, + "step": 10832 + }, + { + "epoch": 1.8454894228090104, + "grad_norm": 1.7265625, + "learning_rate": 6.407105566178717e-06, + "loss": 0.7939, + "step": 10833 + }, + { + "epoch": 1.8456608868980005, + "grad_norm": 1.71875, + "learning_rate": 6.405420285331414e-06, + "loss": 0.8711, + "step": 10834 + }, + { + "epoch": 1.8458323509869903, + "grad_norm": 1.65625, + "learning_rate": 6.4037351217168965e-06, + "loss": 0.8093, + "step": 10835 + }, + { + "epoch": 1.84600381507598, + "grad_norm": 1.7109375, + "learning_rate": 6.402050075390112e-06, + "loss": 0.8094, + "step": 10836 + }, + { + "epoch": 1.84617527916497, + "grad_norm": 1.7109375, + "learning_rate": 6.400365146406027e-06, + "loss": 0.8507, + "step": 10837 + }, + { + "epoch": 1.8463467432539598, + "grad_norm": 1.6015625, + "learning_rate": 6.398680334819587e-06, + "loss": 0.867, + "step": 10838 + }, + { + "epoch": 1.8465182073429496, + "grad_norm": 1.6796875, + "learning_rate": 6.396995640685744e-06, + "loss": 0.8413, + "step": 10839 + }, + { + "epoch": 1.8466896714319394, + "grad_norm": 1.6640625, + "learning_rate": 6.395311064059442e-06, + "loss": 0.8135, + "step": 10840 + }, + { + "epoch": 1.8468611355209292, + "grad_norm": 1.65625, + "learning_rate": 6.393626604995617e-06, + "loss": 0.8094, + "step": 10841 + }, + { + "epoch": 1.847032599609919, + "grad_norm": 1.796875, + "learning_rate": 6.391942263549211e-06, + "loss": 0.8335, + "step": 10842 + }, + { + "epoch": 1.847204063698909, + "grad_norm": 1.7890625, + "learning_rate": 6.390258039775155e-06, + "loss": 0.8797, + "step": 10843 + }, + { + "epoch": 1.847375527787899, + "grad_norm": 1.6796875, + "learning_rate": 6.388573933728376e-06, + "loss": 0.8325, + "step": 10844 + }, + { + "epoch": 1.8475469918768888, + "grad_norm": 1.828125, + "learning_rate": 6.386889945463801e-06, + "loss": 1.0021, + "step": 10845 + }, + { + "epoch": 1.8477184559658788, + "grad_norm": 1.6796875, + "learning_rate": 6.385206075036352e-06, + "loss": 0.853, + "step": 10846 + }, + { + "epoch": 
1.8478899200548686, + "grad_norm": 1.609375, + "learning_rate": 6.383522322500943e-06, + "loss": 0.7751, + "step": 10847 + }, + { + "epoch": 1.8480613841438585, + "grad_norm": 1.6953125, + "learning_rate": 6.381838687912489e-06, + "loss": 0.913, + "step": 10848 + }, + { + "epoch": 1.8482328482328483, + "grad_norm": 1.703125, + "learning_rate": 6.3801551713259015e-06, + "loss": 0.8545, + "step": 10849 + }, + { + "epoch": 1.848404312321838, + "grad_norm": 1.7578125, + "learning_rate": 6.3784717727960844e-06, + "loss": 0.8677, + "step": 10850 + }, + { + "epoch": 1.848575776410828, + "grad_norm": 1.7421875, + "learning_rate": 6.376788492377941e-06, + "loss": 0.7775, + "step": 10851 + }, + { + "epoch": 1.8487472404998178, + "grad_norm": 1.625, + "learning_rate": 6.375105330126368e-06, + "loss": 0.7807, + "step": 10852 + }, + { + "epoch": 1.8489187045888076, + "grad_norm": 1.71875, + "learning_rate": 6.373422286096259e-06, + "loss": 0.8227, + "step": 10853 + }, + { + "epoch": 1.8490901686777974, + "grad_norm": 1.7109375, + "learning_rate": 6.371739360342507e-06, + "loss": 0.9536, + "step": 10854 + }, + { + "epoch": 1.8492616327667875, + "grad_norm": 1.7265625, + "learning_rate": 6.3700565529199965e-06, + "loss": 0.7772, + "step": 10855 + }, + { + "epoch": 1.8494330968557773, + "grad_norm": 1.640625, + "learning_rate": 6.3683738638836125e-06, + "loss": 0.8825, + "step": 10856 + }, + { + "epoch": 1.849604560944767, + "grad_norm": 1.6171875, + "learning_rate": 6.366691293288229e-06, + "loss": 0.8269, + "step": 10857 + }, + { + "epoch": 1.8497760250337572, + "grad_norm": 1.7578125, + "learning_rate": 6.365008841188729e-06, + "loss": 0.9076, + "step": 10858 + }, + { + "epoch": 1.849947489122747, + "grad_norm": 1.6875, + "learning_rate": 6.363326507639978e-06, + "loss": 0.9056, + "step": 10859 + }, + { + "epoch": 1.8501189532117368, + "grad_norm": 1.71875, + "learning_rate": 6.36164429269684e-06, + "loss": 0.9067, + "step": 10860 + }, + { + "epoch": 1.8502904173007266, + "grad_norm": 1.6640625, + "learning_rate": 6.359962196414185e-06, + "loss": 0.7929, + "step": 10861 + }, + { + "epoch": 1.8504618813897165, + "grad_norm": 1.5859375, + "learning_rate": 6.358280218846871e-06, + "loss": 0.766, + "step": 10862 + }, + { + "epoch": 1.8506333454787063, + "grad_norm": 1.59375, + "learning_rate": 6.356598360049751e-06, + "loss": 0.8052, + "step": 10863 + }, + { + "epoch": 1.850804809567696, + "grad_norm": 1.8671875, + "learning_rate": 6.354916620077677e-06, + "loss": 0.8696, + "step": 10864 + }, + { + "epoch": 1.850976273656686, + "grad_norm": 1.75, + "learning_rate": 6.353234998985499e-06, + "loss": 0.8394, + "step": 10865 + }, + { + "epoch": 1.8511477377456758, + "grad_norm": 1.7109375, + "learning_rate": 6.3515534968280604e-06, + "loss": 0.8423, + "step": 10866 + }, + { + "epoch": 1.8513192018346658, + "grad_norm": 1.6796875, + "learning_rate": 6.3498721136601994e-06, + "loss": 0.8579, + "step": 10867 + }, + { + "epoch": 1.8514906659236556, + "grad_norm": 1.703125, + "learning_rate": 6.348190849536755e-06, + "loss": 0.7728, + "step": 10868 + }, + { + "epoch": 1.8516621300126455, + "grad_norm": 1.5859375, + "learning_rate": 6.346509704512563e-06, + "loss": 0.779, + "step": 10869 + }, + { + "epoch": 1.8518335941016355, + "grad_norm": 1.65625, + "learning_rate": 6.344828678642444e-06, + "loss": 0.7862, + "step": 10870 + }, + { + "epoch": 1.8520050581906253, + "grad_norm": 1.6640625, + "learning_rate": 6.343147771981225e-06, + "loss": 0.825, + "step": 10871 + }, + { + "epoch": 1.8521765222796152, + 
"grad_norm": 1.625, + "learning_rate": 6.341466984583728e-06, + "loss": 0.8207, + "step": 10872 + }, + { + "epoch": 1.852347986368605, + "grad_norm": 1.6953125, + "learning_rate": 6.339786316504769e-06, + "loss": 0.8482, + "step": 10873 + }, + { + "epoch": 1.8525194504575948, + "grad_norm": 1.609375, + "learning_rate": 6.338105767799161e-06, + "loss": 0.854, + "step": 10874 + }, + { + "epoch": 1.8526909145465846, + "grad_norm": 1.7265625, + "learning_rate": 6.336425338521712e-06, + "loss": 0.8701, + "step": 10875 + }, + { + "epoch": 1.8528623786355745, + "grad_norm": 1.8046875, + "learning_rate": 6.33474502872723e-06, + "loss": 0.9802, + "step": 10876 + }, + { + "epoch": 1.8530338427245643, + "grad_norm": 1.7109375, + "learning_rate": 6.333064838470515e-06, + "loss": 0.8136, + "step": 10877 + }, + { + "epoch": 1.853205306813554, + "grad_norm": 1.6640625, + "learning_rate": 6.331384767806365e-06, + "loss": 0.808, + "step": 10878 + }, + { + "epoch": 1.853376770902544, + "grad_norm": 1.7421875, + "learning_rate": 6.3297048167895705e-06, + "loss": 0.9327, + "step": 10879 + }, + { + "epoch": 1.853548234991534, + "grad_norm": 1.6328125, + "learning_rate": 6.328024985474924e-06, + "loss": 0.8691, + "step": 10880 + }, + { + "epoch": 1.8537196990805238, + "grad_norm": 1.6796875, + "learning_rate": 6.326345273917214e-06, + "loss": 0.7949, + "step": 10881 + }, + { + "epoch": 1.8538911631695136, + "grad_norm": 1.65625, + "learning_rate": 6.324665682171214e-06, + "loss": 0.8325, + "step": 10882 + }, + { + "epoch": 1.8540626272585037, + "grad_norm": 1.796875, + "learning_rate": 6.322986210291705e-06, + "loss": 0.8751, + "step": 10883 + }, + { + "epoch": 1.8542340913474935, + "grad_norm": 1.7265625, + "learning_rate": 6.321306858333463e-06, + "loss": 0.8528, + "step": 10884 + }, + { + "epoch": 1.8544055554364833, + "grad_norm": 1.640625, + "learning_rate": 6.319627626351258e-06, + "loss": 0.8821, + "step": 10885 + }, + { + "epoch": 1.8545770195254732, + "grad_norm": 1.75, + "learning_rate": 6.317948514399854e-06, + "loss": 0.8082, + "step": 10886 + }, + { + "epoch": 1.854748483614463, + "grad_norm": 1.7890625, + "learning_rate": 6.3162695225340155e-06, + "loss": 0.8664, + "step": 10887 + }, + { + "epoch": 1.8549199477034528, + "grad_norm": 1.6875, + "learning_rate": 6.314590650808498e-06, + "loss": 0.8479, + "step": 10888 + }, + { + "epoch": 1.8550914117924426, + "grad_norm": 1.765625, + "learning_rate": 6.312911899278059e-06, + "loss": 0.8752, + "step": 10889 + }, + { + "epoch": 1.8552628758814325, + "grad_norm": 1.71875, + "learning_rate": 6.311233267997446e-06, + "loss": 0.8617, + "step": 10890 + }, + { + "epoch": 1.8554343399704223, + "grad_norm": 1.65625, + "learning_rate": 6.309554757021408e-06, + "loss": 0.8035, + "step": 10891 + }, + { + "epoch": 1.8556058040594123, + "grad_norm": 1.625, + "learning_rate": 6.307876366404687e-06, + "loss": 0.8573, + "step": 10892 + }, + { + "epoch": 1.8557772681484022, + "grad_norm": 1.6875, + "learning_rate": 6.30619809620202e-06, + "loss": 0.923, + "step": 10893 + }, + { + "epoch": 1.855948732237392, + "grad_norm": 1.7109375, + "learning_rate": 6.304519946468142e-06, + "loss": 0.7693, + "step": 10894 + }, + { + "epoch": 1.856120196326382, + "grad_norm": 1.640625, + "learning_rate": 6.302841917257785e-06, + "loss": 0.824, + "step": 10895 + }, + { + "epoch": 1.8562916604153719, + "grad_norm": 1.6171875, + "learning_rate": 6.301164008625674e-06, + "loss": 0.7486, + "step": 10896 + }, + { + "epoch": 1.8564631245043617, + "grad_norm": 1.734375, + "learning_rate": 
6.299486220626534e-06, + "loss": 0.9049, + "step": 10897 + }, + { + "epoch": 1.8566345885933515, + "grad_norm": 1.765625, + "learning_rate": 6.297808553315084e-06, + "loss": 0.8944, + "step": 10898 + }, + { + "epoch": 1.8568060526823413, + "grad_norm": 1.7109375, + "learning_rate": 6.296131006746037e-06, + "loss": 0.9116, + "step": 10899 + }, + { + "epoch": 1.8569775167713312, + "grad_norm": 1.7734375, + "learning_rate": 6.294453580974106e-06, + "loss": 0.9053, + "step": 10900 + }, + { + "epoch": 1.857148980860321, + "grad_norm": 1.7421875, + "learning_rate": 6.2927762760539975e-06, + "loss": 0.8544, + "step": 10901 + }, + { + "epoch": 1.8573204449493108, + "grad_norm": 2.25, + "learning_rate": 6.291099092040414e-06, + "loss": 0.8509, + "step": 10902 + }, + { + "epoch": 1.8574919090383006, + "grad_norm": 1.78125, + "learning_rate": 6.289422028988057e-06, + "loss": 0.815, + "step": 10903 + }, + { + "epoch": 1.8576633731272907, + "grad_norm": 1.6484375, + "learning_rate": 6.287745086951621e-06, + "loss": 0.7843, + "step": 10904 + }, + { + "epoch": 1.8578348372162805, + "grad_norm": 1.734375, + "learning_rate": 6.286068265985795e-06, + "loss": 0.8522, + "step": 10905 + }, + { + "epoch": 1.8580063013052703, + "grad_norm": 1.765625, + "learning_rate": 6.284391566145269e-06, + "loss": 0.8605, + "step": 10906 + }, + { + "epoch": 1.8581777653942604, + "grad_norm": 1.8046875, + "learning_rate": 6.282714987484725e-06, + "loss": 0.8064, + "step": 10907 + }, + { + "epoch": 1.8583492294832502, + "grad_norm": 1.7109375, + "learning_rate": 6.281038530058843e-06, + "loss": 0.832, + "step": 10908 + }, + { + "epoch": 1.85852069357224, + "grad_norm": 1.640625, + "learning_rate": 6.2793621939223e-06, + "loss": 0.7933, + "step": 10909 + }, + { + "epoch": 1.8586921576612299, + "grad_norm": 1.671875, + "learning_rate": 6.277685979129766e-06, + "loss": 0.8531, + "step": 10910 + }, + { + "epoch": 1.8588636217502197, + "grad_norm": 1.6953125, + "learning_rate": 6.276009885735909e-06, + "loss": 0.8221, + "step": 10911 + }, + { + "epoch": 1.8590350858392095, + "grad_norm": 1.7265625, + "learning_rate": 6.274333913795392e-06, + "loss": 0.8708, + "step": 10912 + }, + { + "epoch": 1.8592065499281993, + "grad_norm": 1.78125, + "learning_rate": 6.2726580633628775e-06, + "loss": 0.8046, + "step": 10913 + }, + { + "epoch": 1.8593780140171892, + "grad_norm": 1.671875, + "learning_rate": 6.270982334493017e-06, + "loss": 0.8399, + "step": 10914 + }, + { + "epoch": 1.859549478106179, + "grad_norm": 1.734375, + "learning_rate": 6.26930672724047e-06, + "loss": 0.9163, + "step": 10915 + }, + { + "epoch": 1.859720942195169, + "grad_norm": 1.6171875, + "learning_rate": 6.267631241659875e-06, + "loss": 0.9097, + "step": 10916 + }, + { + "epoch": 1.8598924062841589, + "grad_norm": 1.71875, + "learning_rate": 6.26595587780588e-06, + "loss": 0.8519, + "step": 10917 + }, + { + "epoch": 1.8600638703731487, + "grad_norm": 1.75, + "learning_rate": 6.2642806357331244e-06, + "loss": 0.9399, + "step": 10918 + }, + { + "epoch": 1.8602353344621387, + "grad_norm": 1.6875, + "learning_rate": 6.262605515496245e-06, + "loss": 0.8465, + "step": 10919 + }, + { + "epoch": 1.8604067985511286, + "grad_norm": 1.6953125, + "learning_rate": 6.260930517149875e-06, + "loss": 0.7655, + "step": 10920 + }, + { + "epoch": 1.8605782626401184, + "grad_norm": 1.6328125, + "learning_rate": 6.2592556407486394e-06, + "loss": 0.9003, + "step": 10921 + }, + { + "epoch": 1.8607497267291082, + "grad_norm": 1.703125, + "learning_rate": 6.257580886347162e-06, + "loss": 
0.8566, + "step": 10922 + }, + { + "epoch": 1.860921190818098, + "grad_norm": 1.640625, + "learning_rate": 6.255906254000067e-06, + "loss": 0.839, + "step": 10923 + }, + { + "epoch": 1.8610926549070879, + "grad_norm": 1.5859375, + "learning_rate": 6.254231743761967e-06, + "loss": 0.7622, + "step": 10924 + }, + { + "epoch": 1.8612641189960777, + "grad_norm": 1.546875, + "learning_rate": 6.252557355687476e-06, + "loss": 0.7633, + "step": 10925 + }, + { + "epoch": 1.8614355830850675, + "grad_norm": 1.6484375, + "learning_rate": 6.250883089831202e-06, + "loss": 0.809, + "step": 10926 + }, + { + "epoch": 1.8616070471740573, + "grad_norm": 1.6953125, + "learning_rate": 6.2492089462477515e-06, + "loss": 0.862, + "step": 10927 + }, + { + "epoch": 1.8617785112630474, + "grad_norm": 1.7734375, + "learning_rate": 6.247534924991716e-06, + "loss": 0.8738, + "step": 10928 + }, + { + "epoch": 1.8619499753520372, + "grad_norm": 1.71875, + "learning_rate": 6.245861026117699e-06, + "loss": 0.8735, + "step": 10929 + }, + { + "epoch": 1.862121439441027, + "grad_norm": 1.6875, + "learning_rate": 6.244187249680287e-06, + "loss": 0.8735, + "step": 10930 + }, + { + "epoch": 1.862292903530017, + "grad_norm": 1.6953125, + "learning_rate": 6.242513595734075e-06, + "loss": 0.8638, + "step": 10931 + }, + { + "epoch": 1.862464367619007, + "grad_norm": 1.6796875, + "learning_rate": 6.240840064333644e-06, + "loss": 0.8454, + "step": 10932 + }, + { + "epoch": 1.8626358317079967, + "grad_norm": 1.7265625, + "learning_rate": 6.239166655533575e-06, + "loss": 0.8941, + "step": 10933 + }, + { + "epoch": 1.8628072957969866, + "grad_norm": 1.6015625, + "learning_rate": 6.237493369388441e-06, + "loss": 0.7839, + "step": 10934 + }, + { + "epoch": 1.8629787598859764, + "grad_norm": 1.796875, + "learning_rate": 6.235820205952818e-06, + "loss": 0.8964, + "step": 10935 + }, + { + "epoch": 1.8631502239749662, + "grad_norm": 1.75, + "learning_rate": 6.2341471652812734e-06, + "loss": 0.8891, + "step": 10936 + }, + { + "epoch": 1.863321688063956, + "grad_norm": 1.6640625, + "learning_rate": 6.2324742474283695e-06, + "loss": 0.8195, + "step": 10937 + }, + { + "epoch": 1.8634931521529459, + "grad_norm": 1.6875, + "learning_rate": 6.23080145244867e-06, + "loss": 0.7823, + "step": 10938 + }, + { + "epoch": 1.8636646162419357, + "grad_norm": 1.609375, + "learning_rate": 6.229128780396727e-06, + "loss": 0.8456, + "step": 10939 + }, + { + "epoch": 1.8638360803309257, + "grad_norm": 1.6171875, + "learning_rate": 6.227456231327094e-06, + "loss": 0.8666, + "step": 10940 + }, + { + "epoch": 1.8640075444199156, + "grad_norm": 1.8125, + "learning_rate": 6.225783805294319e-06, + "loss": 0.8293, + "step": 10941 + }, + { + "epoch": 1.8641790085089054, + "grad_norm": 1.6953125, + "learning_rate": 6.224111502352947e-06, + "loss": 0.9055, + "step": 10942 + }, + { + "epoch": 1.8643504725978954, + "grad_norm": 1.6875, + "learning_rate": 6.222439322557516e-06, + "loss": 0.7858, + "step": 10943 + }, + { + "epoch": 1.8645219366868853, + "grad_norm": 1.734375, + "learning_rate": 6.2207672659625665e-06, + "loss": 0.9471, + "step": 10944 + }, + { + "epoch": 1.864693400775875, + "grad_norm": 1.7265625, + "learning_rate": 6.219095332622626e-06, + "loss": 0.9263, + "step": 10945 + }, + { + "epoch": 1.864864864864865, + "grad_norm": 1.6875, + "learning_rate": 6.217423522592223e-06, + "loss": 0.8449, + "step": 10946 + }, + { + "epoch": 1.8650363289538547, + "grad_norm": 1.8359375, + "learning_rate": 6.215751835925885e-06, + "loss": 0.8745, + "step": 10947 + }, + { + 
"epoch": 1.8652077930428446, + "grad_norm": 1.578125, + "learning_rate": 6.2140802726781294e-06, + "loss": 0.8291, + "step": 10948 + }, + { + "epoch": 1.8653792571318344, + "grad_norm": 1.7109375, + "learning_rate": 6.2124088329034715e-06, + "loss": 0.7714, + "step": 10949 + }, + { + "epoch": 1.8655507212208242, + "grad_norm": 1.828125, + "learning_rate": 6.210737516656427e-06, + "loss": 0.849, + "step": 10950 + }, + { + "epoch": 1.865722185309814, + "grad_norm": 1.703125, + "learning_rate": 6.2090663239915e-06, + "loss": 0.8369, + "step": 10951 + }, + { + "epoch": 1.865893649398804, + "grad_norm": 1.71875, + "learning_rate": 6.207395254963193e-06, + "loss": 0.8538, + "step": 10952 + }, + { + "epoch": 1.866065113487794, + "grad_norm": 1.734375, + "learning_rate": 6.205724309626011e-06, + "loss": 0.8414, + "step": 10953 + }, + { + "epoch": 1.8662365775767837, + "grad_norm": 1.78125, + "learning_rate": 6.204053488034446e-06, + "loss": 0.8347, + "step": 10954 + }, + { + "epoch": 1.8664080416657738, + "grad_norm": 1.6015625, + "learning_rate": 6.2023827902429915e-06, + "loss": 0.7904, + "step": 10955 + }, + { + "epoch": 1.8665795057547636, + "grad_norm": 1.65625, + "learning_rate": 6.200712216306134e-06, + "loss": 0.8911, + "step": 10956 + }, + { + "epoch": 1.8667509698437534, + "grad_norm": 1.7265625, + "learning_rate": 6.1990417662783574e-06, + "loss": 0.8466, + "step": 10957 + }, + { + "epoch": 1.8669224339327433, + "grad_norm": 1.6484375, + "learning_rate": 6.197371440214144e-06, + "loss": 0.8515, + "step": 10958 + }, + { + "epoch": 1.867093898021733, + "grad_norm": 1.6875, + "learning_rate": 6.195701238167966e-06, + "loss": 0.8596, + "step": 10959 + }, + { + "epoch": 1.867265362110723, + "grad_norm": 1.6796875, + "learning_rate": 6.194031160194296e-06, + "loss": 0.8283, + "step": 10960 + }, + { + "epoch": 1.8674368261997127, + "grad_norm": 1.6484375, + "learning_rate": 6.192361206347603e-06, + "loss": 0.7804, + "step": 10961 + }, + { + "epoch": 1.8676082902887026, + "grad_norm": 1.6796875, + "learning_rate": 6.190691376682349e-06, + "loss": 0.8363, + "step": 10962 + }, + { + "epoch": 1.8677797543776924, + "grad_norm": 1.640625, + "learning_rate": 6.189021671252993e-06, + "loss": 0.8353, + "step": 10963 + }, + { + "epoch": 1.8679512184666824, + "grad_norm": 1.6484375, + "learning_rate": 6.187352090113992e-06, + "loss": 0.8868, + "step": 10964 + }, + { + "epoch": 1.8681226825556723, + "grad_norm": 1.765625, + "learning_rate": 6.185682633319796e-06, + "loss": 0.8438, + "step": 10965 + }, + { + "epoch": 1.868294146644662, + "grad_norm": 1.6796875, + "learning_rate": 6.184013300924852e-06, + "loss": 0.8665, + "step": 10966 + }, + { + "epoch": 1.868465610733652, + "grad_norm": 1.7578125, + "learning_rate": 6.1823440929836055e-06, + "loss": 0.8625, + "step": 10967 + }, + { + "epoch": 1.868637074822642, + "grad_norm": 1.6875, + "learning_rate": 6.180675009550492e-06, + "loss": 0.8351, + "step": 10968 + }, + { + "epoch": 1.8688085389116318, + "grad_norm": 1.7890625, + "learning_rate": 6.179006050679947e-06, + "loss": 0.8879, + "step": 10969 + }, + { + "epoch": 1.8689800030006216, + "grad_norm": 1.6484375, + "learning_rate": 6.177337216426407e-06, + "loss": 0.8025, + "step": 10970 + }, + { + "epoch": 1.8691514670896114, + "grad_norm": 1.7109375, + "learning_rate": 6.175668506844294e-06, + "loss": 0.8274, + "step": 10971 + }, + { + "epoch": 1.8693229311786013, + "grad_norm": 1.71875, + "learning_rate": 6.173999921988032e-06, + "loss": 0.8356, + "step": 10972 + }, + { + "epoch": 1.869494395267591, + 
"grad_norm": 1.6640625, + "learning_rate": 6.172331461912044e-06, + "loss": 0.7581, + "step": 10973 + }, + { + "epoch": 1.869665859356581, + "grad_norm": 1.6875, + "learning_rate": 6.170663126670737e-06, + "loss": 0.8047, + "step": 10974 + }, + { + "epoch": 1.8698373234455707, + "grad_norm": 1.703125, + "learning_rate": 6.1689949163185245e-06, + "loss": 0.809, + "step": 10975 + }, + { + "epoch": 1.8700087875345606, + "grad_norm": 1.6875, + "learning_rate": 6.167326830909815e-06, + "loss": 0.785, + "step": 10976 + }, + { + "epoch": 1.8701802516235506, + "grad_norm": 1.6015625, + "learning_rate": 6.1656588704990085e-06, + "loss": 0.7857, + "step": 10977 + }, + { + "epoch": 1.8703517157125404, + "grad_norm": 1.6875, + "learning_rate": 6.163991035140506e-06, + "loss": 0.8535, + "step": 10978 + }, + { + "epoch": 1.8705231798015303, + "grad_norm": 1.8046875, + "learning_rate": 6.162323324888702e-06, + "loss": 0.8995, + "step": 10979 + }, + { + "epoch": 1.8706946438905203, + "grad_norm": 1.6875, + "learning_rate": 6.160655739797985e-06, + "loss": 0.7442, + "step": 10980 + }, + { + "epoch": 1.8708661079795101, + "grad_norm": 1.6875, + "learning_rate": 6.158988279922741e-06, + "loss": 0.9137, + "step": 10981 + }, + { + "epoch": 1.8710375720685, + "grad_norm": 1.703125, + "learning_rate": 6.157320945317353e-06, + "loss": 0.8714, + "step": 10982 + }, + { + "epoch": 1.8712090361574898, + "grad_norm": 1.765625, + "learning_rate": 6.1556537360362014e-06, + "loss": 0.9099, + "step": 10983 + }, + { + "epoch": 1.8713805002464796, + "grad_norm": 1.7109375, + "learning_rate": 6.153986652133657e-06, + "loss": 0.8458, + "step": 10984 + }, + { + "epoch": 1.8715519643354694, + "grad_norm": 1.578125, + "learning_rate": 6.152319693664091e-06, + "loss": 0.8526, + "step": 10985 + }, + { + "epoch": 1.8717234284244593, + "grad_norm": 1.6640625, + "learning_rate": 6.150652860681869e-06, + "loss": 0.8533, + "step": 10986 + }, + { + "epoch": 1.871894892513449, + "grad_norm": 1.8125, + "learning_rate": 6.148986153241352e-06, + "loss": 0.9229, + "step": 10987 + }, + { + "epoch": 1.872066356602439, + "grad_norm": 1.796875, + "learning_rate": 6.147319571396897e-06, + "loss": 0.8749, + "step": 10988 + }, + { + "epoch": 1.872237820691429, + "grad_norm": 1.7890625, + "learning_rate": 6.14565311520286e-06, + "loss": 0.8774, + "step": 10989 + }, + { + "epoch": 1.8724092847804188, + "grad_norm": 1.734375, + "learning_rate": 6.143986784713588e-06, + "loss": 0.948, + "step": 10990 + }, + { + "epoch": 1.8725807488694086, + "grad_norm": 1.640625, + "learning_rate": 6.142320579983427e-06, + "loss": 0.8464, + "step": 10991 + }, + { + "epoch": 1.8727522129583987, + "grad_norm": 1.6796875, + "learning_rate": 6.14065450106672e-06, + "loss": 0.8901, + "step": 10992 + }, + { + "epoch": 1.8729236770473885, + "grad_norm": 1.7578125, + "learning_rate": 6.138988548017802e-06, + "loss": 0.805, + "step": 10993 + }, + { + "epoch": 1.8730951411363783, + "grad_norm": 1.7578125, + "learning_rate": 6.137322720891007e-06, + "loss": 0.823, + "step": 10994 + }, + { + "epoch": 1.8732666052253681, + "grad_norm": 1.671875, + "learning_rate": 6.135657019740663e-06, + "loss": 0.8371, + "step": 10995 + }, + { + "epoch": 1.873438069314358, + "grad_norm": 1.7421875, + "learning_rate": 6.133991444621097e-06, + "loss": 0.9085, + "step": 10996 + }, + { + "epoch": 1.8736095334033478, + "grad_norm": 1.6640625, + "learning_rate": 6.132325995586628e-06, + "loss": 0.8843, + "step": 10997 + }, + { + "epoch": 1.8737809974923376, + "grad_norm": 1.796875, + "learning_rate": 
6.130660672691571e-06, + "loss": 0.9687, + "step": 10998 + }, + { + "epoch": 1.8739524615813274, + "grad_norm": 1.71875, + "learning_rate": 6.128995475990241e-06, + "loss": 0.8697, + "step": 10999 + }, + { + "epoch": 1.8741239256703173, + "grad_norm": 1.71875, + "learning_rate": 6.127330405536943e-06, + "loss": 0.8857, + "step": 11000 + }, + { + "epoch": 1.8742953897593073, + "grad_norm": 1.7421875, + "learning_rate": 6.125665461385986e-06, + "loss": 0.8626, + "step": 11001 + }, + { + "epoch": 1.8744668538482971, + "grad_norm": 1.6640625, + "learning_rate": 6.124000643591667e-06, + "loss": 0.7529, + "step": 11002 + }, + { + "epoch": 1.874638317937287, + "grad_norm": 1.703125, + "learning_rate": 6.122335952208283e-06, + "loss": 0.8251, + "step": 11003 + }, + { + "epoch": 1.874809782026277, + "grad_norm": 1.703125, + "learning_rate": 6.120671387290125e-06, + "loss": 0.8308, + "step": 11004 + }, + { + "epoch": 1.8749812461152668, + "grad_norm": 1.7265625, + "learning_rate": 6.1190069488914806e-06, + "loss": 0.8574, + "step": 11005 + }, + { + "epoch": 1.8751527102042567, + "grad_norm": 1.4921875, + "learning_rate": 6.117342637066635e-06, + "loss": 0.7836, + "step": 11006 + }, + { + "epoch": 1.8753241742932465, + "grad_norm": 1.71875, + "learning_rate": 6.115678451869866e-06, + "loss": 0.8602, + "step": 11007 + }, + { + "epoch": 1.8754956383822363, + "grad_norm": 1.6328125, + "learning_rate": 6.114014393355453e-06, + "loss": 0.8605, + "step": 11008 + }, + { + "epoch": 1.8756671024712261, + "grad_norm": 1.703125, + "learning_rate": 6.112350461577661e-06, + "loss": 0.8752, + "step": 11009 + }, + { + "epoch": 1.875838566560216, + "grad_norm": 1.671875, + "learning_rate": 6.110686656590761e-06, + "loss": 0.754, + "step": 11010 + }, + { + "epoch": 1.8760100306492058, + "grad_norm": 1.625, + "learning_rate": 6.109022978449013e-06, + "loss": 0.8233, + "step": 11011 + }, + { + "epoch": 1.8761814947381956, + "grad_norm": 1.6796875, + "learning_rate": 6.107359427206679e-06, + "loss": 0.8455, + "step": 11012 + }, + { + "epoch": 1.8763529588271857, + "grad_norm": 1.6953125, + "learning_rate": 6.105696002918012e-06, + "loss": 0.7755, + "step": 11013 + }, + { + "epoch": 1.8765244229161755, + "grad_norm": 1.71875, + "learning_rate": 6.104032705637264e-06, + "loss": 0.8127, + "step": 11014 + }, + { + "epoch": 1.8766958870051653, + "grad_norm": 1.6875, + "learning_rate": 6.102369535418679e-06, + "loss": 0.8543, + "step": 11015 + }, + { + "epoch": 1.8768673510941554, + "grad_norm": 1.8046875, + "learning_rate": 6.100706492316499e-06, + "loss": 0.9207, + "step": 11016 + }, + { + "epoch": 1.8770388151831452, + "grad_norm": 1.6953125, + "learning_rate": 6.099043576384966e-06, + "loss": 0.8285, + "step": 11017 + }, + { + "epoch": 1.877210279272135, + "grad_norm": 1.71875, + "learning_rate": 6.097380787678311e-06, + "loss": 0.7908, + "step": 11018 + }, + { + "epoch": 1.8773817433611248, + "grad_norm": 1.7734375, + "learning_rate": 6.095718126250769e-06, + "loss": 0.8402, + "step": 11019 + }, + { + "epoch": 1.8775532074501147, + "grad_norm": 1.921875, + "learning_rate": 6.094055592156557e-06, + "loss": 0.8721, + "step": 11020 + }, + { + "epoch": 1.8777246715391045, + "grad_norm": 1.71875, + "learning_rate": 6.092393185449901e-06, + "loss": 0.8221, + "step": 11021 + }, + { + "epoch": 1.8778961356280943, + "grad_norm": 1.6484375, + "learning_rate": 6.090730906185016e-06, + "loss": 0.9318, + "step": 11022 + }, + { + "epoch": 1.8780675997170841, + "grad_norm": 1.734375, + "learning_rate": 6.089068754416118e-06, + "loss": 
0.8173, + "step": 11023 + }, + { + "epoch": 1.878239063806074, + "grad_norm": 1.796875, + "learning_rate": 6.087406730197414e-06, + "loss": 0.8182, + "step": 11024 + }, + { + "epoch": 1.878410527895064, + "grad_norm": 1.65625, + "learning_rate": 6.085744833583111e-06, + "loss": 0.9004, + "step": 11025 + }, + { + "epoch": 1.8785819919840538, + "grad_norm": 1.7109375, + "learning_rate": 6.08408306462741e-06, + "loss": 0.897, + "step": 11026 + }, + { + "epoch": 1.8787534560730437, + "grad_norm": 1.6796875, + "learning_rate": 6.082421423384505e-06, + "loss": 0.8675, + "step": 11027 + }, + { + "epoch": 1.8789249201620337, + "grad_norm": 1.6171875, + "learning_rate": 6.0807599099085915e-06, + "loss": 0.7529, + "step": 11028 + }, + { + "epoch": 1.8790963842510235, + "grad_norm": 1.671875, + "learning_rate": 6.079098524253853e-06, + "loss": 0.8412, + "step": 11029 + }, + { + "epoch": 1.8792678483400134, + "grad_norm": 1.6015625, + "learning_rate": 6.077437266474478e-06, + "loss": 0.85, + "step": 11030 + }, + { + "epoch": 1.8794393124290032, + "grad_norm": 1.796875, + "learning_rate": 6.075776136624649e-06, + "loss": 0.9143, + "step": 11031 + }, + { + "epoch": 1.879610776517993, + "grad_norm": 1.6953125, + "learning_rate": 6.074115134758532e-06, + "loss": 0.8095, + "step": 11032 + }, + { + "epoch": 1.8797822406069828, + "grad_norm": 1.59375, + "learning_rate": 6.0724542609303035e-06, + "loss": 0.7707, + "step": 11033 + }, + { + "epoch": 1.8799537046959727, + "grad_norm": 1.703125, + "learning_rate": 6.070793515194133e-06, + "loss": 0.8904, + "step": 11034 + }, + { + "epoch": 1.8801251687849625, + "grad_norm": 1.6640625, + "learning_rate": 6.069132897604182e-06, + "loss": 0.8905, + "step": 11035 + }, + { + "epoch": 1.8802966328739523, + "grad_norm": 1.7421875, + "learning_rate": 6.06747240821461e-06, + "loss": 0.8963, + "step": 11036 + }, + { + "epoch": 1.8804680969629424, + "grad_norm": 1.65625, + "learning_rate": 6.065812047079569e-06, + "loss": 0.9203, + "step": 11037 + }, + { + "epoch": 1.8806395610519322, + "grad_norm": 1.7421875, + "learning_rate": 6.064151814253214e-06, + "loss": 0.9206, + "step": 11038 + }, + { + "epoch": 1.880811025140922, + "grad_norm": 1.7578125, + "learning_rate": 6.062491709789688e-06, + "loss": 0.8701, + "step": 11039 + }, + { + "epoch": 1.880982489229912, + "grad_norm": 1.6796875, + "learning_rate": 6.060831733743136e-06, + "loss": 0.8292, + "step": 11040 + }, + { + "epoch": 1.881153953318902, + "grad_norm": 1.7421875, + "learning_rate": 6.059171886167694e-06, + "loss": 0.8637, + "step": 11041 + }, + { + "epoch": 1.8813254174078917, + "grad_norm": 1.7734375, + "learning_rate": 6.0575121671174985e-06, + "loss": 0.9126, + "step": 11042 + }, + { + "epoch": 1.8814968814968815, + "grad_norm": 1.6875, + "learning_rate": 6.055852576646677e-06, + "loss": 0.8704, + "step": 11043 + }, + { + "epoch": 1.8816683455858714, + "grad_norm": 1.703125, + "learning_rate": 6.0541931148093525e-06, + "loss": 0.9013, + "step": 11044 + }, + { + "epoch": 1.8818398096748612, + "grad_norm": 1.6796875, + "learning_rate": 6.052533781659651e-06, + "loss": 0.8459, + "step": 11045 + }, + { + "epoch": 1.882011273763851, + "grad_norm": 1.8046875, + "learning_rate": 6.050874577251686e-06, + "loss": 0.9123, + "step": 11046 + }, + { + "epoch": 1.8821827378528408, + "grad_norm": 1.65625, + "learning_rate": 6.049215501639574e-06, + "loss": 0.8889, + "step": 11047 + }, + { + "epoch": 1.8823542019418307, + "grad_norm": 1.6328125, + "learning_rate": 6.04755655487742e-06, + "loss": 0.7668, + "step": 11048 + }, 
+ { + "epoch": 1.8825256660308207, + "grad_norm": 1.6640625, + "learning_rate": 6.0458977370193316e-06, + "loss": 0.809, + "step": 11049 + }, + { + "epoch": 1.8826971301198105, + "grad_norm": 1.6953125, + "learning_rate": 6.044239048119407e-06, + "loss": 0.8047, + "step": 11050 + }, + { + "epoch": 1.8828685942088004, + "grad_norm": 1.703125, + "learning_rate": 6.042580488231744e-06, + "loss": 0.7854, + "step": 11051 + }, + { + "epoch": 1.8830400582977904, + "grad_norm": 1.7578125, + "learning_rate": 6.040922057410432e-06, + "loss": 0.9012, + "step": 11052 + }, + { + "epoch": 1.8832115223867802, + "grad_norm": 1.625, + "learning_rate": 6.039263755709561e-06, + "loss": 0.8462, + "step": 11053 + }, + { + "epoch": 1.88338298647577, + "grad_norm": 1.6953125, + "learning_rate": 6.037605583183217e-06, + "loss": 0.8311, + "step": 11054 + }, + { + "epoch": 1.88355445056476, + "grad_norm": 1.734375, + "learning_rate": 6.035947539885472e-06, + "loss": 0.9026, + "step": 11055 + }, + { + "epoch": 1.8837259146537497, + "grad_norm": 1.890625, + "learning_rate": 6.034289625870405e-06, + "loss": 0.8557, + "step": 11056 + }, + { + "epoch": 1.8838973787427395, + "grad_norm": 1.7734375, + "learning_rate": 6.032631841192088e-06, + "loss": 0.8124, + "step": 11057 + }, + { + "epoch": 1.8840688428317294, + "grad_norm": 1.59375, + "learning_rate": 6.030974185904586e-06, + "loss": 0.8698, + "step": 11058 + }, + { + "epoch": 1.8842403069207192, + "grad_norm": 1.6953125, + "learning_rate": 6.029316660061961e-06, + "loss": 0.9407, + "step": 11059 + }, + { + "epoch": 1.884411771009709, + "grad_norm": 1.609375, + "learning_rate": 6.027659263718273e-06, + "loss": 0.7825, + "step": 11060 + }, + { + "epoch": 1.8845832350986989, + "grad_norm": 1.7109375, + "learning_rate": 6.026001996927574e-06, + "loss": 0.843, + "step": 11061 + }, + { + "epoch": 1.884754699187689, + "grad_norm": 1.703125, + "learning_rate": 6.0243448597439154e-06, + "loss": 0.9317, + "step": 11062 + }, + { + "epoch": 1.8849261632766787, + "grad_norm": 1.609375, + "learning_rate": 6.0226878522213385e-06, + "loss": 0.8107, + "step": 11063 + }, + { + "epoch": 1.8850976273656685, + "grad_norm": 1.6875, + "learning_rate": 6.02103097441389e-06, + "loss": 0.7993, + "step": 11064 + }, + { + "epoch": 1.8852690914546586, + "grad_norm": 1.71875, + "learning_rate": 6.0193742263756115e-06, + "loss": 0.8842, + "step": 11065 + }, + { + "epoch": 1.8854405555436484, + "grad_norm": 1.75, + "learning_rate": 6.017717608160522e-06, + "loss": 0.8449, + "step": 11066 + }, + { + "epoch": 1.8856120196326382, + "grad_norm": 1.78125, + "learning_rate": 6.0160611198226595e-06, + "loss": 0.8286, + "step": 11067 + }, + { + "epoch": 1.885783483721628, + "grad_norm": 1.671875, + "learning_rate": 6.014404761416044e-06, + "loss": 0.8558, + "step": 11068 + }, + { + "epoch": 1.885954947810618, + "grad_norm": 1.6484375, + "learning_rate": 6.012748532994699e-06, + "loss": 0.7994, + "step": 11069 + }, + { + "epoch": 1.8861264118996077, + "grad_norm": 1.6796875, + "learning_rate": 6.011092434612639e-06, + "loss": 0.8989, + "step": 11070 + }, + { + "epoch": 1.8862978759885975, + "grad_norm": 1.6796875, + "learning_rate": 6.009436466323873e-06, + "loss": 0.8535, + "step": 11071 + }, + { + "epoch": 1.8864693400775874, + "grad_norm": 1.6328125, + "learning_rate": 6.007780628182413e-06, + "loss": 0.846, + "step": 11072 + }, + { + "epoch": 1.8866408041665772, + "grad_norm": 1.703125, + "learning_rate": 6.00612492024226e-06, + "loss": 0.8123, + "step": 11073 + }, + { + "epoch": 1.8868122682555672, + 
"grad_norm": 1.6484375, + "learning_rate": 6.004469342557413e-06, + "loss": 0.7837, + "step": 11074 + }, + { + "epoch": 1.886983732344557, + "grad_norm": 1.6015625, + "learning_rate": 6.002813895181865e-06, + "loss": 0.83, + "step": 11075 + }, + { + "epoch": 1.887155196433547, + "grad_norm": 1.6953125, + "learning_rate": 6.00115857816961e-06, + "loss": 0.8554, + "step": 11076 + }, + { + "epoch": 1.887326660522537, + "grad_norm": 1.7421875, + "learning_rate": 5.999503391574635e-06, + "loss": 0.8573, + "step": 11077 + }, + { + "epoch": 1.8874981246115268, + "grad_norm": 1.6953125, + "learning_rate": 5.9978483354509155e-06, + "loss": 0.7935, + "step": 11078 + }, + { + "epoch": 1.8876695887005166, + "grad_norm": 1.765625, + "learning_rate": 5.99619340985243e-06, + "loss": 0.8641, + "step": 11079 + }, + { + "epoch": 1.8878410527895064, + "grad_norm": 1.6171875, + "learning_rate": 5.9945386148331565e-06, + "loss": 0.8151, + "step": 11080 + }, + { + "epoch": 1.8880125168784962, + "grad_norm": 1.625, + "learning_rate": 5.992883950447062e-06, + "loss": 0.8537, + "step": 11081 + }, + { + "epoch": 1.888183980967486, + "grad_norm": 1.703125, + "learning_rate": 5.99122941674811e-06, + "loss": 0.8656, + "step": 11082 + }, + { + "epoch": 1.888355445056476, + "grad_norm": 1.6875, + "learning_rate": 5.989575013790264e-06, + "loss": 0.8776, + "step": 11083 + }, + { + "epoch": 1.8885269091454657, + "grad_norm": 1.8125, + "learning_rate": 5.987920741627479e-06, + "loss": 0.8657, + "step": 11084 + }, + { + "epoch": 1.8886983732344556, + "grad_norm": 1.6484375, + "learning_rate": 5.986266600313706e-06, + "loss": 0.9305, + "step": 11085 + }, + { + "epoch": 1.8888698373234456, + "grad_norm": 1.7890625, + "learning_rate": 5.984612589902893e-06, + "loss": 0.9352, + "step": 11086 + }, + { + "epoch": 1.8890413014124354, + "grad_norm": 1.6328125, + "learning_rate": 5.982958710448984e-06, + "loss": 0.8567, + "step": 11087 + }, + { + "epoch": 1.8892127655014253, + "grad_norm": 1.71875, + "learning_rate": 5.9813049620059206e-06, + "loss": 0.8659, + "step": 11088 + }, + { + "epoch": 1.8893842295904153, + "grad_norm": 1.734375, + "learning_rate": 5.979651344627633e-06, + "loss": 0.8445, + "step": 11089 + }, + { + "epoch": 1.8895556936794051, + "grad_norm": 1.84375, + "learning_rate": 5.977997858368055e-06, + "loss": 0.7571, + "step": 11090 + }, + { + "epoch": 1.889727157768395, + "grad_norm": 1.6953125, + "learning_rate": 5.976344503281113e-06, + "loss": 0.7836, + "step": 11091 + }, + { + "epoch": 1.8898986218573848, + "grad_norm": 1.5703125, + "learning_rate": 5.974691279420727e-06, + "loss": 0.8175, + "step": 11092 + }, + { + "epoch": 1.8900700859463746, + "grad_norm": 1.671875, + "learning_rate": 5.973038186840816e-06, + "loss": 0.8845, + "step": 11093 + }, + { + "epoch": 1.8902415500353644, + "grad_norm": 1.7734375, + "learning_rate": 5.971385225595294e-06, + "loss": 0.8002, + "step": 11094 + }, + { + "epoch": 1.8904130141243543, + "grad_norm": 1.671875, + "learning_rate": 5.969732395738071e-06, + "loss": 0.8699, + "step": 11095 + }, + { + "epoch": 1.890584478213344, + "grad_norm": 1.7109375, + "learning_rate": 5.968079697323052e-06, + "loss": 0.7574, + "step": 11096 + }, + { + "epoch": 1.890755942302334, + "grad_norm": 1.65625, + "learning_rate": 5.966427130404136e-06, + "loss": 0.8268, + "step": 11097 + }, + { + "epoch": 1.890927406391324, + "grad_norm": 1.7109375, + "learning_rate": 5.964774695035219e-06, + "loss": 0.8457, + "step": 11098 + }, + { + "epoch": 1.8910988704803138, + "grad_norm": 1.96875, + 
"learning_rate": 5.963122391270195e-06, + "loss": 0.9298, + "step": 11099 + }, + { + "epoch": 1.8912703345693036, + "grad_norm": 1.7265625, + "learning_rate": 5.961470219162955e-06, + "loss": 0.8533, + "step": 11100 + }, + { + "epoch": 1.8914417986582936, + "grad_norm": 1.765625, + "learning_rate": 5.959818178767376e-06, + "loss": 0.9166, + "step": 11101 + }, + { + "epoch": 1.8916132627472835, + "grad_norm": 1.6875, + "learning_rate": 5.9581662701373386e-06, + "loss": 0.8582, + "step": 11102 + }, + { + "epoch": 1.8917847268362733, + "grad_norm": 1.78125, + "learning_rate": 5.9565144933267205e-06, + "loss": 0.7822, + "step": 11103 + }, + { + "epoch": 1.8919561909252631, + "grad_norm": 1.7421875, + "learning_rate": 5.9548628483893915e-06, + "loss": 0.9107, + "step": 11104 + }, + { + "epoch": 1.892127655014253, + "grad_norm": 1.796875, + "learning_rate": 5.953211335379217e-06, + "loss": 0.9657, + "step": 11105 + }, + { + "epoch": 1.8922991191032428, + "grad_norm": 1.6796875, + "learning_rate": 5.951559954350059e-06, + "loss": 0.9031, + "step": 11106 + }, + { + "epoch": 1.8924705831922326, + "grad_norm": 1.6171875, + "learning_rate": 5.949908705355778e-06, + "loss": 0.7995, + "step": 11107 + }, + { + "epoch": 1.8926420472812224, + "grad_norm": 1.7578125, + "learning_rate": 5.948257588450224e-06, + "loss": 0.8743, + "step": 11108 + }, + { + "epoch": 1.8928135113702123, + "grad_norm": 1.671875, + "learning_rate": 5.946606603687246e-06, + "loss": 0.8933, + "step": 11109 + }, + { + "epoch": 1.8929849754592023, + "grad_norm": 1.7109375, + "learning_rate": 5.944955751120691e-06, + "loss": 0.8185, + "step": 11110 + }, + { + "epoch": 1.8931564395481921, + "grad_norm": 1.6875, + "learning_rate": 5.943305030804403e-06, + "loss": 0.8492, + "step": 11111 + }, + { + "epoch": 1.893327903637182, + "grad_norm": 1.7109375, + "learning_rate": 5.94165444279221e-06, + "loss": 0.8794, + "step": 11112 + }, + { + "epoch": 1.893499367726172, + "grad_norm": 1.6796875, + "learning_rate": 5.94000398713795e-06, + "loss": 0.8339, + "step": 11113 + }, + { + "epoch": 1.8936708318151618, + "grad_norm": 1.625, + "learning_rate": 5.938353663895447e-06, + "loss": 0.8516, + "step": 11114 + }, + { + "epoch": 1.8938422959041517, + "grad_norm": 1.65625, + "learning_rate": 5.936703473118526e-06, + "loss": 0.8399, + "step": 11115 + }, + { + "epoch": 1.8940137599931415, + "grad_norm": 1.6484375, + "learning_rate": 5.935053414861005e-06, + "loss": 0.7824, + "step": 11116 + }, + { + "epoch": 1.8941852240821313, + "grad_norm": 1.6875, + "learning_rate": 5.933403489176701e-06, + "loss": 0.8342, + "step": 11117 + }, + { + "epoch": 1.8943566881711211, + "grad_norm": 1.640625, + "learning_rate": 5.931753696119419e-06, + "loss": 0.8584, + "step": 11118 + }, + { + "epoch": 1.894528152260111, + "grad_norm": 1.7109375, + "learning_rate": 5.930104035742972e-06, + "loss": 0.8126, + "step": 11119 + }, + { + "epoch": 1.8946996163491008, + "grad_norm": 1.6796875, + "learning_rate": 5.928454508101156e-06, + "loss": 0.8786, + "step": 11120 + }, + { + "epoch": 1.8948710804380906, + "grad_norm": 1.671875, + "learning_rate": 5.926805113247772e-06, + "loss": 0.8117, + "step": 11121 + }, + { + "epoch": 1.8950425445270807, + "grad_norm": 1.71875, + "learning_rate": 5.9251558512366115e-06, + "loss": 0.8673, + "step": 11122 + }, + { + "epoch": 1.8952140086160705, + "grad_norm": 1.6328125, + "learning_rate": 5.923506722121467e-06, + "loss": 0.898, + "step": 11123 + }, + { + "epoch": 1.8953854727050603, + "grad_norm": 1.609375, + "learning_rate": 
5.9218577259561146e-06, + "loss": 0.7834, + "step": 11124 + }, + { + "epoch": 1.8955569367940504, + "grad_norm": 1.7265625, + "learning_rate": 5.920208862794339e-06, + "loss": 0.8138, + "step": 11125 + }, + { + "epoch": 1.8957284008830402, + "grad_norm": 1.703125, + "learning_rate": 5.9185601326899145e-06, + "loss": 0.8742, + "step": 11126 + }, + { + "epoch": 1.89589986497203, + "grad_norm": 1.7734375, + "learning_rate": 5.9169115356966135e-06, + "loss": 0.8483, + "step": 11127 + }, + { + "epoch": 1.8960713290610198, + "grad_norm": 1.7265625, + "learning_rate": 5.915263071868203e-06, + "loss": 0.8284, + "step": 11128 + }, + { + "epoch": 1.8962427931500097, + "grad_norm": 1.6640625, + "learning_rate": 5.913614741258446e-06, + "loss": 0.8855, + "step": 11129 + }, + { + "epoch": 1.8964142572389995, + "grad_norm": 1.7734375, + "learning_rate": 5.911966543921101e-06, + "loss": 0.8719, + "step": 11130 + }, + { + "epoch": 1.8965857213279893, + "grad_norm": 1.671875, + "learning_rate": 5.91031847990992e-06, + "loss": 0.8622, + "step": 11131 + }, + { + "epoch": 1.8967571854169791, + "grad_norm": 1.6875, + "learning_rate": 5.908670549278655e-06, + "loss": 0.832, + "step": 11132 + }, + { + "epoch": 1.896928649505969, + "grad_norm": 1.703125, + "learning_rate": 5.907022752081047e-06, + "loss": 0.8111, + "step": 11133 + }, + { + "epoch": 1.897100113594959, + "grad_norm": 1.625, + "learning_rate": 5.905375088370842e-06, + "loss": 0.8047, + "step": 11134 + }, + { + "epoch": 1.8972715776839488, + "grad_norm": 1.7421875, + "learning_rate": 5.903727558201776e-06, + "loss": 0.9245, + "step": 11135 + }, + { + "epoch": 1.8974430417729387, + "grad_norm": 1.703125, + "learning_rate": 5.902080161627577e-06, + "loss": 0.9039, + "step": 11136 + }, + { + "epoch": 1.8976145058619287, + "grad_norm": 1.6796875, + "learning_rate": 5.900432898701977e-06, + "loss": 0.7891, + "step": 11137 + }, + { + "epoch": 1.8977859699509185, + "grad_norm": 1.6796875, + "learning_rate": 5.898785769478695e-06, + "loss": 0.8917, + "step": 11138 + }, + { + "epoch": 1.8979574340399084, + "grad_norm": 1.7109375, + "learning_rate": 5.897138774011455e-06, + "loss": 0.8707, + "step": 11139 + }, + { + "epoch": 1.8981288981288982, + "grad_norm": 1.7109375, + "learning_rate": 5.8954919123539675e-06, + "loss": 0.8902, + "step": 11140 + }, + { + "epoch": 1.898300362217888, + "grad_norm": 1.6796875, + "learning_rate": 5.893845184559948e-06, + "loss": 0.8544, + "step": 11141 + }, + { + "epoch": 1.8984718263068778, + "grad_norm": 1.6328125, + "learning_rate": 5.892198590683096e-06, + "loss": 0.811, + "step": 11142 + }, + { + "epoch": 1.8986432903958677, + "grad_norm": 1.6328125, + "learning_rate": 5.890552130777119e-06, + "loss": 0.8141, + "step": 11143 + }, + { + "epoch": 1.8988147544848575, + "grad_norm": 1.65625, + "learning_rate": 5.88890580489571e-06, + "loss": 0.6992, + "step": 11144 + }, + { + "epoch": 1.8989862185738473, + "grad_norm": 1.6875, + "learning_rate": 5.887259613092564e-06, + "loss": 0.7467, + "step": 11145 + }, + { + "epoch": 1.8991576826628374, + "grad_norm": 1.671875, + "learning_rate": 5.885613555421372e-06, + "loss": 0.8146, + "step": 11146 + }, + { + "epoch": 1.8993291467518272, + "grad_norm": 1.734375, + "learning_rate": 5.883967631935813e-06, + "loss": 0.7893, + "step": 11147 + }, + { + "epoch": 1.899500610840817, + "grad_norm": 1.859375, + "learning_rate": 5.882321842689569e-06, + "loss": 0.8589, + "step": 11148 + }, + { + "epoch": 1.899672074929807, + "grad_norm": 1.6171875, + "learning_rate": 5.880676187736316e-06, + 
"loss": 0.7366, + "step": 11149 + }, + { + "epoch": 1.8998435390187969, + "grad_norm": 1.6875, + "learning_rate": 5.8790306671297234e-06, + "loss": 0.8711, + "step": 11150 + }, + { + "epoch": 1.9000150031077867, + "grad_norm": 1.6484375, + "learning_rate": 5.87738528092346e-06, + "loss": 0.8846, + "step": 11151 + }, + { + "epoch": 1.9001864671967765, + "grad_norm": 1.65625, + "learning_rate": 5.875740029171185e-06, + "loss": 0.8274, + "step": 11152 + }, + { + "epoch": 1.9003579312857664, + "grad_norm": 1.78125, + "learning_rate": 5.87409491192656e-06, + "loss": 0.7669, + "step": 11153 + }, + { + "epoch": 1.9005293953747562, + "grad_norm": 1.6328125, + "learning_rate": 5.872449929243236e-06, + "loss": 0.8362, + "step": 11154 + }, + { + "epoch": 1.900700859463746, + "grad_norm": 1.7265625, + "learning_rate": 5.870805081174862e-06, + "loss": 0.8204, + "step": 11155 + }, + { + "epoch": 1.9008723235527358, + "grad_norm": 1.7109375, + "learning_rate": 5.869160367775084e-06, + "loss": 0.7579, + "step": 11156 + }, + { + "epoch": 1.9010437876417257, + "grad_norm": 1.7578125, + "learning_rate": 5.86751578909754e-06, + "loss": 0.8474, + "step": 11157 + }, + { + "epoch": 1.9012152517307155, + "grad_norm": 1.6875, + "learning_rate": 5.865871345195875e-06, + "loss": 0.8747, + "step": 11158 + }, + { + "epoch": 1.9013867158197055, + "grad_norm": 1.640625, + "learning_rate": 5.8642270361237066e-06, + "loss": 0.8889, + "step": 11159 + }, + { + "epoch": 1.9015581799086954, + "grad_norm": 1.71875, + "learning_rate": 5.8625828619346695e-06, + "loss": 0.8505, + "step": 11160 + }, + { + "epoch": 1.9017296439976852, + "grad_norm": 1.7578125, + "learning_rate": 5.860938822682385e-06, + "loss": 0.9267, + "step": 11161 + }, + { + "epoch": 1.9019011080866752, + "grad_norm": 1.78125, + "learning_rate": 5.859294918420473e-06, + "loss": 0.8676, + "step": 11162 + }, + { + "epoch": 1.902072572175665, + "grad_norm": 1.6015625, + "learning_rate": 5.857651149202545e-06, + "loss": 0.8717, + "step": 11163 + }, + { + "epoch": 1.9022440362646549, + "grad_norm": 1.6796875, + "learning_rate": 5.856007515082212e-06, + "loss": 0.8625, + "step": 11164 + }, + { + "epoch": 1.9024155003536447, + "grad_norm": 1.640625, + "learning_rate": 5.8543640161130765e-06, + "loss": 0.7168, + "step": 11165 + }, + { + "epoch": 1.9025869644426345, + "grad_norm": 1.671875, + "learning_rate": 5.852720652348744e-06, + "loss": 0.826, + "step": 11166 + }, + { + "epoch": 1.9027584285316244, + "grad_norm": 1.6953125, + "learning_rate": 5.851077423842807e-06, + "loss": 0.81, + "step": 11167 + }, + { + "epoch": 1.9029298926206142, + "grad_norm": 1.671875, + "learning_rate": 5.8494343306488595e-06, + "loss": 0.9127, + "step": 11168 + }, + { + "epoch": 1.903101356709604, + "grad_norm": 1.7734375, + "learning_rate": 5.847791372820493e-06, + "loss": 0.9623, + "step": 11169 + }, + { + "epoch": 1.9032728207985938, + "grad_norm": 1.6171875, + "learning_rate": 5.846148550411279e-06, + "loss": 0.8097, + "step": 11170 + }, + { + "epoch": 1.9034442848875839, + "grad_norm": 1.8125, + "learning_rate": 5.8445058634748055e-06, + "loss": 0.8239, + "step": 11171 + }, + { + "epoch": 1.9036157489765737, + "grad_norm": 1.578125, + "learning_rate": 5.842863312064642e-06, + "loss": 0.7607, + "step": 11172 + }, + { + "epoch": 1.9037872130655635, + "grad_norm": 1.7734375, + "learning_rate": 5.841220896234358e-06, + "loss": 0.8831, + "step": 11173 + }, + { + "epoch": 1.9039586771545536, + "grad_norm": 1.6875, + "learning_rate": 5.839578616037525e-06, + "loss": 0.8309, + "step": 11174 
+ }, + { + "epoch": 1.9041301412435434, + "grad_norm": 1.734375, + "learning_rate": 5.837936471527701e-06, + "loss": 0.8537, + "step": 11175 + }, + { + "epoch": 1.9043016053325332, + "grad_norm": 1.6484375, + "learning_rate": 5.836294462758441e-06, + "loss": 0.7846, + "step": 11176 + }, + { + "epoch": 1.904473069421523, + "grad_norm": 1.71875, + "learning_rate": 5.834652589783295e-06, + "loss": 0.8714, + "step": 11177 + }, + { + "epoch": 1.9046445335105129, + "grad_norm": 1.828125, + "learning_rate": 5.833010852655815e-06, + "loss": 0.9001, + "step": 11178 + }, + { + "epoch": 1.9048159975995027, + "grad_norm": 1.65625, + "learning_rate": 5.8313692514295416e-06, + "loss": 0.8025, + "step": 11179 + }, + { + "epoch": 1.9049874616884925, + "grad_norm": 1.6875, + "learning_rate": 5.829727786158011e-06, + "loss": 0.88, + "step": 11180 + }, + { + "epoch": 1.9051589257774824, + "grad_norm": 1.6328125, + "learning_rate": 5.828086456894769e-06, + "loss": 0.8556, + "step": 11181 + }, + { + "epoch": 1.9053303898664722, + "grad_norm": 1.734375, + "learning_rate": 5.826445263693335e-06, + "loss": 0.8384, + "step": 11182 + }, + { + "epoch": 1.9055018539554622, + "grad_norm": 1.765625, + "learning_rate": 5.824804206607235e-06, + "loss": 0.8997, + "step": 11183 + }, + { + "epoch": 1.905673318044452, + "grad_norm": 1.7578125, + "learning_rate": 5.823163285689992e-06, + "loss": 0.8814, + "step": 11184 + }, + { + "epoch": 1.9058447821334419, + "grad_norm": 1.65625, + "learning_rate": 5.821522500995125e-06, + "loss": 0.8122, + "step": 11185 + }, + { + "epoch": 1.906016246222432, + "grad_norm": 1.6171875, + "learning_rate": 5.819881852576141e-06, + "loss": 0.7983, + "step": 11186 + }, + { + "epoch": 1.9061877103114218, + "grad_norm": 1.765625, + "learning_rate": 5.818241340486554e-06, + "loss": 0.8247, + "step": 11187 + }, + { + "epoch": 1.9063591744004116, + "grad_norm": 1.5625, + "learning_rate": 5.8166009647798616e-06, + "loss": 0.7229, + "step": 11188 + }, + { + "epoch": 1.9065306384894014, + "grad_norm": 1.671875, + "learning_rate": 5.8149607255095665e-06, + "loss": 0.7919, + "step": 11189 + }, + { + "epoch": 1.9067021025783912, + "grad_norm": 1.6796875, + "learning_rate": 5.813320622729159e-06, + "loss": 0.8322, + "step": 11190 + }, + { + "epoch": 1.906873566667381, + "grad_norm": 1.703125, + "learning_rate": 5.811680656492134e-06, + "loss": 0.8727, + "step": 11191 + }, + { + "epoch": 1.9070450307563709, + "grad_norm": 1.6796875, + "learning_rate": 5.810040826851978e-06, + "loss": 0.793, + "step": 11192 + }, + { + "epoch": 1.9072164948453607, + "grad_norm": 1.703125, + "learning_rate": 5.808401133862165e-06, + "loss": 0.7886, + "step": 11193 + }, + { + "epoch": 1.9073879589343505, + "grad_norm": 1.7265625, + "learning_rate": 5.8067615775761746e-06, + "loss": 0.8129, + "step": 11194 + }, + { + "epoch": 1.9075594230233406, + "grad_norm": 1.7890625, + "learning_rate": 5.8051221580474786e-06, + "loss": 0.9707, + "step": 11195 + }, + { + "epoch": 1.9077308871123304, + "grad_norm": 1.6328125, + "learning_rate": 5.803482875329543e-06, + "loss": 0.7983, + "step": 11196 + }, + { + "epoch": 1.9079023512013202, + "grad_norm": 1.6875, + "learning_rate": 5.801843729475836e-06, + "loss": 0.7935, + "step": 11197 + }, + { + "epoch": 1.9080738152903103, + "grad_norm": 1.6171875, + "learning_rate": 5.800204720539813e-06, + "loss": 0.8917, + "step": 11198 + }, + { + "epoch": 1.9082452793793, + "grad_norm": 1.6171875, + "learning_rate": 5.798565848574931e-06, + "loss": 0.799, + "step": 11199 + }, + { + "epoch": 
1.90841674346829, + "grad_norm": 1.7734375, + "learning_rate": 5.796927113634637e-06, + "loss": 0.8679, + "step": 11200 + }, + { + "epoch": 1.90841674346829, + "eval_loss": 0.8319188952445984, + "eval_runtime": 835.9538, + "eval_samples_per_second": 2.989, + "eval_steps_per_second": 2.989, + "step": 11200 + }, + { + "epoch": 1.9085882075572798, + "grad_norm": 1.75, + "learning_rate": 5.795288515772377e-06, + "loss": 0.8503, + "step": 11201 + }, + { + "epoch": 1.9087596716462696, + "grad_norm": 1.71875, + "learning_rate": 5.7936500550415934e-06, + "loss": 0.8268, + "step": 11202 + }, + { + "epoch": 1.9089311357352594, + "grad_norm": 1.9453125, + "learning_rate": 5.792011731495719e-06, + "loss": 0.9486, + "step": 11203 + }, + { + "epoch": 1.9091025998242492, + "grad_norm": 1.6171875, + "learning_rate": 5.7903735451881935e-06, + "loss": 0.823, + "step": 11204 + }, + { + "epoch": 1.909274063913239, + "grad_norm": 1.6953125, + "learning_rate": 5.788735496172435e-06, + "loss": 0.8247, + "step": 11205 + }, + { + "epoch": 1.9094455280022289, + "grad_norm": 1.953125, + "learning_rate": 5.7870975845018685e-06, + "loss": 0.8613, + "step": 11206 + }, + { + "epoch": 1.909616992091219, + "grad_norm": 1.6640625, + "learning_rate": 5.785459810229914e-06, + "loss": 0.8429, + "step": 11207 + }, + { + "epoch": 1.9097884561802088, + "grad_norm": 1.7734375, + "learning_rate": 5.783822173409988e-06, + "loss": 0.8999, + "step": 11208 + }, + { + "epoch": 1.9099599202691986, + "grad_norm": 1.734375, + "learning_rate": 5.782184674095495e-06, + "loss": 0.8341, + "step": 11209 + }, + { + "epoch": 1.9101313843581886, + "grad_norm": 1.671875, + "learning_rate": 5.780547312339844e-06, + "loss": 0.8422, + "step": 11210 + }, + { + "epoch": 1.9103028484471785, + "grad_norm": 1.75, + "learning_rate": 5.778910088196435e-06, + "loss": 0.8455, + "step": 11211 + }, + { + "epoch": 1.9104743125361683, + "grad_norm": 1.703125, + "learning_rate": 5.77727300171866e-06, + "loss": 0.9009, + "step": 11212 + }, + { + "epoch": 1.910645776625158, + "grad_norm": 1.6953125, + "learning_rate": 5.775636052959916e-06, + "loss": 0.9527, + "step": 11213 + }, + { + "epoch": 1.910817240714148, + "grad_norm": 1.5859375, + "learning_rate": 5.773999241973587e-06, + "loss": 0.8251, + "step": 11214 + }, + { + "epoch": 1.9109887048031378, + "grad_norm": 1.6953125, + "learning_rate": 5.7723625688130565e-06, + "loss": 0.7843, + "step": 11215 + }, + { + "epoch": 1.9111601688921276, + "grad_norm": 1.6015625, + "learning_rate": 5.770726033531704e-06, + "loss": 0.8589, + "step": 11216 + }, + { + "epoch": 1.9113316329811174, + "grad_norm": 1.703125, + "learning_rate": 5.769089636182901e-06, + "loss": 0.8252, + "step": 11217 + }, + { + "epoch": 1.9115030970701072, + "grad_norm": 1.7265625, + "learning_rate": 5.767453376820016e-06, + "loss": 0.8801, + "step": 11218 + }, + { + "epoch": 1.9116745611590973, + "grad_norm": 1.6640625, + "learning_rate": 5.765817255496414e-06, + "loss": 0.8598, + "step": 11219 + }, + { + "epoch": 1.911846025248087, + "grad_norm": 1.609375, + "learning_rate": 5.764181272265456e-06, + "loss": 0.8349, + "step": 11220 + }, + { + "epoch": 1.912017489337077, + "grad_norm": 1.5859375, + "learning_rate": 5.762545427180499e-06, + "loss": 0.8272, + "step": 11221 + }, + { + "epoch": 1.912188953426067, + "grad_norm": 1.671875, + "learning_rate": 5.760909720294892e-06, + "loss": 0.8555, + "step": 11222 + }, + { + "epoch": 1.9123604175150568, + "grad_norm": 1.59375, + "learning_rate": 5.759274151661981e-06, + "loss": 0.8261, + "step": 11223 + }, + 
{ + "epoch": 1.9125318816040466, + "grad_norm": 1.7265625, + "learning_rate": 5.757638721335111e-06, + "loss": 0.901, + "step": 11224 + }, + { + "epoch": 1.9127033456930365, + "grad_norm": 1.7109375, + "learning_rate": 5.756003429367615e-06, + "loss": 0.8994, + "step": 11225 + }, + { + "epoch": 1.9128748097820263, + "grad_norm": 1.71875, + "learning_rate": 5.7543682758128295e-06, + "loss": 0.8293, + "step": 11226 + }, + { + "epoch": 1.9130462738710161, + "grad_norm": 1.6640625, + "learning_rate": 5.752733260724086e-06, + "loss": 0.8326, + "step": 11227 + }, + { + "epoch": 1.913217737960006, + "grad_norm": 1.7109375, + "learning_rate": 5.751098384154701e-06, + "loss": 0.8523, + "step": 11228 + }, + { + "epoch": 1.9133892020489958, + "grad_norm": 1.6953125, + "learning_rate": 5.749463646157998e-06, + "loss": 0.8387, + "step": 11229 + }, + { + "epoch": 1.9135606661379856, + "grad_norm": 1.7109375, + "learning_rate": 5.74782904678729e-06, + "loss": 0.7988, + "step": 11230 + }, + { + "epoch": 1.9137321302269756, + "grad_norm": 1.75, + "learning_rate": 5.74619458609589e-06, + "loss": 0.7845, + "step": 11231 + }, + { + "epoch": 1.9139035943159655, + "grad_norm": 1.640625, + "learning_rate": 5.744560264137102e-06, + "loss": 0.9016, + "step": 11232 + }, + { + "epoch": 1.9140750584049553, + "grad_norm": 1.7265625, + "learning_rate": 5.7429260809642295e-06, + "loss": 0.8691, + "step": 11233 + }, + { + "epoch": 1.9142465224939453, + "grad_norm": 1.75, + "learning_rate": 5.741292036630568e-06, + "loss": 0.8374, + "step": 11234 + }, + { + "epoch": 1.9144179865829352, + "grad_norm": 1.6328125, + "learning_rate": 5.739658131189405e-06, + "loss": 0.8578, + "step": 11235 + }, + { + "epoch": 1.914589450671925, + "grad_norm": 1.7265625, + "learning_rate": 5.738024364694039e-06, + "loss": 0.9524, + "step": 11236 + }, + { + "epoch": 1.9147609147609148, + "grad_norm": 1.6796875, + "learning_rate": 5.736390737197745e-06, + "loss": 0.8665, + "step": 11237 + }, + { + "epoch": 1.9149323788499046, + "grad_norm": 1.7421875, + "learning_rate": 5.734757248753811e-06, + "loss": 0.8449, + "step": 11238 + }, + { + "epoch": 1.9151038429388945, + "grad_norm": 1.765625, + "learning_rate": 5.7331238994155e-06, + "loss": 0.9447, + "step": 11239 + }, + { + "epoch": 1.9152753070278843, + "grad_norm": 1.5859375, + "learning_rate": 5.7314906892360855e-06, + "loss": 0.8613, + "step": 11240 + }, + { + "epoch": 1.9154467711168741, + "grad_norm": 1.7109375, + "learning_rate": 5.729857618268832e-06, + "loss": 0.8167, + "step": 11241 + }, + { + "epoch": 1.915618235205864, + "grad_norm": 1.7265625, + "learning_rate": 5.728224686567004e-06, + "loss": 0.9106, + "step": 11242 + }, + { + "epoch": 1.915789699294854, + "grad_norm": 1.796875, + "learning_rate": 5.726591894183855e-06, + "loss": 0.7939, + "step": 11243 + }, + { + "epoch": 1.9159611633838438, + "grad_norm": 1.65625, + "learning_rate": 5.724959241172634e-06, + "loss": 0.7932, + "step": 11244 + }, + { + "epoch": 1.9161326274728336, + "grad_norm": 1.7421875, + "learning_rate": 5.723326727586593e-06, + "loss": 0.8271, + "step": 11245 + }, + { + "epoch": 1.9163040915618237, + "grad_norm": 1.828125, + "learning_rate": 5.721694353478971e-06, + "loss": 0.9315, + "step": 11246 + }, + { + "epoch": 1.9164755556508135, + "grad_norm": 1.6796875, + "learning_rate": 5.720062118903006e-06, + "loss": 0.8342, + "step": 11247 + }, + { + "epoch": 1.9166470197398033, + "grad_norm": 1.6484375, + "learning_rate": 5.718430023911932e-06, + "loss": 0.8443, + "step": 11248 + }, + { + "epoch": 
1.9168184838287932, + "grad_norm": 1.734375, + "learning_rate": 5.7167980685589785e-06, + "loss": 0.8486, + "step": 11249 + }, + { + "epoch": 1.916989947917783, + "grad_norm": 1.6171875, + "learning_rate": 5.715166252897373e-06, + "loss": 0.8103, + "step": 11250 + }, + { + "epoch": 1.9171614120067728, + "grad_norm": 1.71875, + "learning_rate": 5.713534576980328e-06, + "loss": 0.8686, + "step": 11251 + }, + { + "epoch": 1.9173328760957626, + "grad_norm": 1.796875, + "learning_rate": 5.711903040861055e-06, + "loss": 0.8277, + "step": 11252 + }, + { + "epoch": 1.9175043401847525, + "grad_norm": 1.6953125, + "learning_rate": 5.7102716445927785e-06, + "loss": 0.8404, + "step": 11253 + }, + { + "epoch": 1.9176758042737423, + "grad_norm": 1.8671875, + "learning_rate": 5.708640388228697e-06, + "loss": 0.9247, + "step": 11254 + }, + { + "epoch": 1.9178472683627321, + "grad_norm": 1.671875, + "learning_rate": 5.707009271822011e-06, + "loss": 0.7726, + "step": 11255 + }, + { + "epoch": 1.9180187324517222, + "grad_norm": 1.703125, + "learning_rate": 5.705378295425919e-06, + "loss": 0.8043, + "step": 11256 + }, + { + "epoch": 1.918190196540712, + "grad_norm": 1.6953125, + "learning_rate": 5.703747459093611e-06, + "loss": 0.924, + "step": 11257 + }, + { + "epoch": 1.9183616606297018, + "grad_norm": 1.7109375, + "learning_rate": 5.702116762878278e-06, + "loss": 0.8636, + "step": 11258 + }, + { + "epoch": 1.9185331247186919, + "grad_norm": 1.640625, + "learning_rate": 5.700486206833103e-06, + "loss": 0.8355, + "step": 11259 + }, + { + "epoch": 1.9187045888076817, + "grad_norm": 1.6171875, + "learning_rate": 5.698855791011262e-06, + "loss": 0.817, + "step": 11260 + }, + { + "epoch": 1.9188760528966715, + "grad_norm": 1.5859375, + "learning_rate": 5.697225515465934e-06, + "loss": 0.8022, + "step": 11261 + }, + { + "epoch": 1.9190475169856613, + "grad_norm": 1.6953125, + "learning_rate": 5.69559538025028e-06, + "loss": 0.9162, + "step": 11262 + }, + { + "epoch": 1.9192189810746512, + "grad_norm": 1.6875, + "learning_rate": 5.693965385417471e-06, + "loss": 0.8811, + "step": 11263 + }, + { + "epoch": 1.919390445163641, + "grad_norm": 1.59375, + "learning_rate": 5.692335531020665e-06, + "loss": 0.7753, + "step": 11264 + }, + { + "epoch": 1.9195619092526308, + "grad_norm": 1.7578125, + "learning_rate": 5.69070581711302e-06, + "loss": 0.8442, + "step": 11265 + }, + { + "epoch": 1.9197333733416206, + "grad_norm": 1.7109375, + "learning_rate": 5.689076243747684e-06, + "loss": 0.8683, + "step": 11266 + }, + { + "epoch": 1.9199048374306105, + "grad_norm": 1.6953125, + "learning_rate": 5.687446810977806e-06, + "loss": 0.8505, + "step": 11267 + }, + { + "epoch": 1.9200763015196005, + "grad_norm": 1.6796875, + "learning_rate": 5.6858175188565266e-06, + "loss": 0.8469, + "step": 11268 + }, + { + "epoch": 1.9202477656085903, + "grad_norm": 1.6953125, + "learning_rate": 5.684188367436984e-06, + "loss": 0.8282, + "step": 11269 + }, + { + "epoch": 1.9204192296975802, + "grad_norm": 1.796875, + "learning_rate": 5.68255935677231e-06, + "loss": 0.8958, + "step": 11270 + }, + { + "epoch": 1.9205906937865702, + "grad_norm": 1.609375, + "learning_rate": 5.680930486915633e-06, + "loss": 0.7198, + "step": 11271 + }, + { + "epoch": 1.92076215787556, + "grad_norm": 1.6796875, + "learning_rate": 5.679301757920078e-06, + "loss": 0.8596, + "step": 11272 + }, + { + "epoch": 1.9209336219645499, + "grad_norm": 1.6953125, + "learning_rate": 5.677673169838762e-06, + "loss": 0.8341, + "step": 11273 + }, + { + "epoch": 1.9211050860535397, + 
"grad_norm": 1.71875, + "learning_rate": 5.676044722724801e-06, + "loss": 0.843, + "step": 11274 + }, + { + "epoch": 1.9212765501425295, + "grad_norm": 1.7109375, + "learning_rate": 5.674416416631304e-06, + "loss": 0.8879, + "step": 11275 + }, + { + "epoch": 1.9214480142315193, + "grad_norm": 1.6328125, + "learning_rate": 5.6727882516113765e-06, + "loss": 0.823, + "step": 11276 + }, + { + "epoch": 1.9216194783205092, + "grad_norm": 1.7734375, + "learning_rate": 5.671160227718118e-06, + "loss": 0.9125, + "step": 11277 + }, + { + "epoch": 1.921790942409499, + "grad_norm": 1.7109375, + "learning_rate": 5.669532345004627e-06, + "loss": 0.875, + "step": 11278 + }, + { + "epoch": 1.9219624064984888, + "grad_norm": 1.71875, + "learning_rate": 5.667904603523992e-06, + "loss": 0.8969, + "step": 11279 + }, + { + "epoch": 1.9221338705874789, + "grad_norm": 1.6953125, + "learning_rate": 5.666277003329301e-06, + "loss": 0.9196, + "step": 11280 + }, + { + "epoch": 1.9223053346764687, + "grad_norm": 1.6640625, + "learning_rate": 5.664649544473636e-06, + "loss": 0.9005, + "step": 11281 + }, + { + "epoch": 1.9224767987654585, + "grad_norm": 1.71875, + "learning_rate": 5.6630222270100755e-06, + "loss": 0.8406, + "step": 11282 + }, + { + "epoch": 1.9226482628544486, + "grad_norm": 1.6171875, + "learning_rate": 5.661395050991691e-06, + "loss": 0.8144, + "step": 11283 + }, + { + "epoch": 1.9228197269434384, + "grad_norm": 1.625, + "learning_rate": 5.659768016471556e-06, + "loss": 0.7749, + "step": 11284 + }, + { + "epoch": 1.9229911910324282, + "grad_norm": 1.671875, + "learning_rate": 5.658141123502726e-06, + "loss": 0.8384, + "step": 11285 + }, + { + "epoch": 1.923162655121418, + "grad_norm": 1.6328125, + "learning_rate": 5.656514372138265e-06, + "loss": 0.8448, + "step": 11286 + }, + { + "epoch": 1.9233341192104079, + "grad_norm": 1.6328125, + "learning_rate": 5.654887762431224e-06, + "loss": 0.8417, + "step": 11287 + }, + { + "epoch": 1.9235055832993977, + "grad_norm": 1.78125, + "learning_rate": 5.653261294434655e-06, + "loss": 0.8934, + "step": 11288 + }, + { + "epoch": 1.9236770473883875, + "grad_norm": 1.65625, + "learning_rate": 5.651634968201606e-06, + "loss": 0.9038, + "step": 11289 + }, + { + "epoch": 1.9238485114773773, + "grad_norm": 1.6328125, + "learning_rate": 5.65000878378511e-06, + "loss": 0.8409, + "step": 11290 + }, + { + "epoch": 1.9240199755663672, + "grad_norm": 1.65625, + "learning_rate": 5.648382741238212e-06, + "loss": 0.8006, + "step": 11291 + }, + { + "epoch": 1.9241914396553572, + "grad_norm": 1.7109375, + "learning_rate": 5.64675684061394e-06, + "loss": 0.8432, + "step": 11292 + }, + { + "epoch": 1.924362903744347, + "grad_norm": 1.7109375, + "learning_rate": 5.645131081965319e-06, + "loss": 0.8412, + "step": 11293 + }, + { + "epoch": 1.9245343678333369, + "grad_norm": 1.734375, + "learning_rate": 5.6435054653453735e-06, + "loss": 0.8535, + "step": 11294 + }, + { + "epoch": 1.924705831922327, + "grad_norm": 1.7265625, + "learning_rate": 5.64187999080712e-06, + "loss": 0.8394, + "step": 11295 + }, + { + "epoch": 1.9248772960113167, + "grad_norm": 1.6796875, + "learning_rate": 5.6402546584035744e-06, + "loss": 0.8817, + "step": 11296 + }, + { + "epoch": 1.9250487601003066, + "grad_norm": 1.6640625, + "learning_rate": 5.6386294681877375e-06, + "loss": 0.8096, + "step": 11297 + }, + { + "epoch": 1.9252202241892964, + "grad_norm": 1.7421875, + "learning_rate": 5.637004420212617e-06, + "loss": 0.8154, + "step": 11298 + }, + { + "epoch": 1.9253916882782862, + "grad_norm": 1.5859375, + 
"learning_rate": 5.635379514531212e-06, + "loss": 0.7501, + "step": 11299 + }, + { + "epoch": 1.925563152367276, + "grad_norm": 1.6875, + "learning_rate": 5.633754751196516e-06, + "loss": 0.8901, + "step": 11300 + }, + { + "epoch": 1.9257346164562659, + "grad_norm": 1.6640625, + "learning_rate": 5.632130130261522e-06, + "loss": 0.8765, + "step": 11301 + }, + { + "epoch": 1.9259060805452557, + "grad_norm": 1.59375, + "learning_rate": 5.63050565177921e-06, + "loss": 0.8111, + "step": 11302 + }, + { + "epoch": 1.9260775446342455, + "grad_norm": 1.6953125, + "learning_rate": 5.628881315802563e-06, + "loss": 0.8233, + "step": 11303 + }, + { + "epoch": 1.9262490087232356, + "grad_norm": 1.71875, + "learning_rate": 5.627257122384558e-06, + "loss": 0.8096, + "step": 11304 + }, + { + "epoch": 1.9264204728122254, + "grad_norm": 1.71875, + "learning_rate": 5.625633071578163e-06, + "loss": 0.8586, + "step": 11305 + }, + { + "epoch": 1.9265919369012152, + "grad_norm": 1.734375, + "learning_rate": 5.624009163436345e-06, + "loss": 0.8614, + "step": 11306 + }, + { + "epoch": 1.9267634009902053, + "grad_norm": 1.6796875, + "learning_rate": 5.622385398012067e-06, + "loss": 0.8562, + "step": 11307 + }, + { + "epoch": 1.926934865079195, + "grad_norm": 1.6484375, + "learning_rate": 5.620761775358287e-06, + "loss": 0.7839, + "step": 11308 + }, + { + "epoch": 1.927106329168185, + "grad_norm": 1.625, + "learning_rate": 5.619138295527955e-06, + "loss": 0.8728, + "step": 11309 + }, + { + "epoch": 1.9272777932571747, + "grad_norm": 1.6796875, + "learning_rate": 5.617514958574021e-06, + "loss": 0.84, + "step": 11310 + }, + { + "epoch": 1.9274492573461646, + "grad_norm": 1.7109375, + "learning_rate": 5.615891764549426e-06, + "loss": 0.9072, + "step": 11311 + }, + { + "epoch": 1.9276207214351544, + "grad_norm": 1.75, + "learning_rate": 5.61426871350711e-06, + "loss": 0.8774, + "step": 11312 + }, + { + "epoch": 1.9277921855241442, + "grad_norm": 1.703125, + "learning_rate": 5.612645805500008e-06, + "loss": 0.8319, + "step": 11313 + }, + { + "epoch": 1.927963649613134, + "grad_norm": 1.7109375, + "learning_rate": 5.611023040581045e-06, + "loss": 0.8352, + "step": 11314 + }, + { + "epoch": 1.9281351137021239, + "grad_norm": 1.7109375, + "learning_rate": 5.60940041880315e-06, + "loss": 0.7795, + "step": 11315 + }, + { + "epoch": 1.928306577791114, + "grad_norm": 1.6328125, + "learning_rate": 5.607777940219239e-06, + "loss": 0.8453, + "step": 11316 + }, + { + "epoch": 1.9284780418801037, + "grad_norm": 1.765625, + "learning_rate": 5.606155604882231e-06, + "loss": 0.9062, + "step": 11317 + }, + { + "epoch": 1.9286495059690936, + "grad_norm": 1.5859375, + "learning_rate": 5.604533412845032e-06, + "loss": 0.8051, + "step": 11318 + }, + { + "epoch": 1.9288209700580836, + "grad_norm": 1.71875, + "learning_rate": 5.602911364160557e-06, + "loss": 0.848, + "step": 11319 + }, + { + "epoch": 1.9289924341470734, + "grad_norm": 1.71875, + "learning_rate": 5.601289458881693e-06, + "loss": 0.888, + "step": 11320 + }, + { + "epoch": 1.9291638982360633, + "grad_norm": 1.71875, + "learning_rate": 5.599667697061346e-06, + "loss": 0.7835, + "step": 11321 + }, + { + "epoch": 1.929335362325053, + "grad_norm": 1.875, + "learning_rate": 5.598046078752406e-06, + "loss": 0.8079, + "step": 11322 + }, + { + "epoch": 1.929506826414043, + "grad_norm": 1.7421875, + "learning_rate": 5.596424604007758e-06, + "loss": 0.7706, + "step": 11323 + }, + { + "epoch": 1.9296782905030327, + "grad_norm": 1.6015625, + "learning_rate": 5.594803272880287e-06, + "loss": 
0.8445, + "step": 11324 + }, + { + "epoch": 1.9298497545920226, + "grad_norm": 1.6640625, + "learning_rate": 5.59318208542287e-06, + "loss": 0.7663, + "step": 11325 + }, + { + "epoch": 1.9300212186810124, + "grad_norm": 1.734375, + "learning_rate": 5.591561041688378e-06, + "loss": 0.8155, + "step": 11326 + }, + { + "epoch": 1.9301926827700022, + "grad_norm": 1.625, + "learning_rate": 5.589940141729683e-06, + "loss": 0.7547, + "step": 11327 + }, + { + "epoch": 1.9303641468589923, + "grad_norm": 1.7578125, + "learning_rate": 5.588319385599645e-06, + "loss": 0.8048, + "step": 11328 + }, + { + "epoch": 1.930535610947982, + "grad_norm": 1.7890625, + "learning_rate": 5.586698773351122e-06, + "loss": 0.8576, + "step": 11329 + }, + { + "epoch": 1.930707075036972, + "grad_norm": 1.7109375, + "learning_rate": 5.585078305036975e-06, + "loss": 0.9041, + "step": 11330 + }, + { + "epoch": 1.930878539125962, + "grad_norm": 1.765625, + "learning_rate": 5.583457980710053e-06, + "loss": 0.8994, + "step": 11331 + }, + { + "epoch": 1.9310500032149518, + "grad_norm": 1.6953125, + "learning_rate": 5.581837800423193e-06, + "loss": 0.8983, + "step": 11332 + }, + { + "epoch": 1.9312214673039416, + "grad_norm": 1.71875, + "learning_rate": 5.580217764229241e-06, + "loss": 0.8648, + "step": 11333 + }, + { + "epoch": 1.9313929313929314, + "grad_norm": 1.6640625, + "learning_rate": 5.578597872181031e-06, + "loss": 0.7956, + "step": 11334 + }, + { + "epoch": 1.9315643954819213, + "grad_norm": 1.8515625, + "learning_rate": 5.5769781243313915e-06, + "loss": 0.9276, + "step": 11335 + }, + { + "epoch": 1.931735859570911, + "grad_norm": 1.6640625, + "learning_rate": 5.575358520733153e-06, + "loss": 0.7656, + "step": 11336 + }, + { + "epoch": 1.931907323659901, + "grad_norm": 1.6328125, + "learning_rate": 5.573739061439134e-06, + "loss": 0.826, + "step": 11337 + }, + { + "epoch": 1.9320787877488907, + "grad_norm": 1.640625, + "learning_rate": 5.572119746502152e-06, + "loss": 0.8464, + "step": 11338 + }, + { + "epoch": 1.9322502518378806, + "grad_norm": 1.6796875, + "learning_rate": 5.57050057597502e-06, + "loss": 0.8301, + "step": 11339 + }, + { + "epoch": 1.9324217159268706, + "grad_norm": 1.640625, + "learning_rate": 5.568881549910543e-06, + "loss": 0.9023, + "step": 11340 + }, + { + "epoch": 1.9325931800158604, + "grad_norm": 1.5390625, + "learning_rate": 5.567262668361525e-06, + "loss": 0.7431, + "step": 11341 + }, + { + "epoch": 1.9327646441048503, + "grad_norm": 1.609375, + "learning_rate": 5.5656439313807675e-06, + "loss": 0.884, + "step": 11342 + }, + { + "epoch": 1.9329361081938403, + "grad_norm": 1.765625, + "learning_rate": 5.564025339021055e-06, + "loss": 0.7854, + "step": 11343 + }, + { + "epoch": 1.9331075722828301, + "grad_norm": 1.734375, + "learning_rate": 5.562406891335181e-06, + "loss": 0.8516, + "step": 11344 + }, + { + "epoch": 1.93327903637182, + "grad_norm": 1.7734375, + "learning_rate": 5.560788588375925e-06, + "loss": 0.8559, + "step": 11345 + }, + { + "epoch": 1.9334505004608098, + "grad_norm": 1.59375, + "learning_rate": 5.559170430196074e-06, + "loss": 0.7799, + "step": 11346 + }, + { + "epoch": 1.9336219645497996, + "grad_norm": 1.6484375, + "learning_rate": 5.557552416848398e-06, + "loss": 0.7802, + "step": 11347 + }, + { + "epoch": 1.9337934286387894, + "grad_norm": 1.6953125, + "learning_rate": 5.555934548385665e-06, + "loss": 0.8102, + "step": 11348 + }, + { + "epoch": 1.9339648927277793, + "grad_norm": 1.7578125, + "learning_rate": 5.554316824860642e-06, + "loss": 0.8421, + "step": 11349 + }, 
+ { + "epoch": 1.934136356816769, + "grad_norm": 1.6953125, + "learning_rate": 5.5526992463260884e-06, + "loss": 0.8531, + "step": 11350 + }, + { + "epoch": 1.934307820905759, + "grad_norm": 1.6171875, + "learning_rate": 5.55108181283476e-06, + "loss": 0.7784, + "step": 11351 + }, + { + "epoch": 1.9344792849947487, + "grad_norm": 1.515625, + "learning_rate": 5.549464524439406e-06, + "loss": 0.8066, + "step": 11352 + }, + { + "epoch": 1.9346507490837388, + "grad_norm": 1.6640625, + "learning_rate": 5.547847381192772e-06, + "loss": 0.8316, + "step": 11353 + }, + { + "epoch": 1.9348222131727286, + "grad_norm": 1.6484375, + "learning_rate": 5.546230383147606e-06, + "loss": 0.8106, + "step": 11354 + }, + { + "epoch": 1.9349936772617184, + "grad_norm": 1.6640625, + "learning_rate": 5.544613530356633e-06, + "loss": 0.8452, + "step": 11355 + }, + { + "epoch": 1.9351651413507085, + "grad_norm": 1.75, + "learning_rate": 5.54299682287259e-06, + "loss": 0.9008, + "step": 11356 + }, + { + "epoch": 1.9353366054396983, + "grad_norm": 1.6953125, + "learning_rate": 5.541380260748206e-06, + "loss": 0.8208, + "step": 11357 + }, + { + "epoch": 1.9355080695286881, + "grad_norm": 1.6875, + "learning_rate": 5.539763844036201e-06, + "loss": 0.8263, + "step": 11358 + }, + { + "epoch": 1.935679533617678, + "grad_norm": 1.71875, + "learning_rate": 5.538147572789292e-06, + "loss": 0.8089, + "step": 11359 + }, + { + "epoch": 1.9358509977066678, + "grad_norm": 1.6796875, + "learning_rate": 5.5365314470601915e-06, + "loss": 0.8441, + "step": 11360 + }, + { + "epoch": 1.9360224617956576, + "grad_norm": 1.765625, + "learning_rate": 5.534915466901609e-06, + "loss": 0.7927, + "step": 11361 + }, + { + "epoch": 1.9361939258846474, + "grad_norm": 1.625, + "learning_rate": 5.533299632366248e-06, + "loss": 0.8619, + "step": 11362 + }, + { + "epoch": 1.9363653899736373, + "grad_norm": 1.671875, + "learning_rate": 5.531683943506806e-06, + "loss": 0.7727, + "step": 11363 + }, + { + "epoch": 1.936536854062627, + "grad_norm": 1.734375, + "learning_rate": 5.530068400375976e-06, + "loss": 0.7626, + "step": 11364 + }, + { + "epoch": 1.9367083181516171, + "grad_norm": 1.6484375, + "learning_rate": 5.528453003026448e-06, + "loss": 0.8035, + "step": 11365 + }, + { + "epoch": 1.936879782240607, + "grad_norm": 1.71875, + "learning_rate": 5.526837751510905e-06, + "loss": 0.8574, + "step": 11366 + }, + { + "epoch": 1.9370512463295968, + "grad_norm": 1.7734375, + "learning_rate": 5.525222645882029e-06, + "loss": 0.9693, + "step": 11367 + }, + { + "epoch": 1.9372227104185868, + "grad_norm": 1.703125, + "learning_rate": 5.523607686192492e-06, + "loss": 0.8168, + "step": 11368 + }, + { + "epoch": 1.9373941745075767, + "grad_norm": 1.6953125, + "learning_rate": 5.521992872494965e-06, + "loss": 0.8063, + "step": 11369 + }, + { + "epoch": 1.9375656385965665, + "grad_norm": 1.828125, + "learning_rate": 5.5203782048421115e-06, + "loss": 0.8824, + "step": 11370 + }, + { + "epoch": 1.9377371026855563, + "grad_norm": 1.546875, + "learning_rate": 5.518763683286596e-06, + "loss": 0.7815, + "step": 11371 + }, + { + "epoch": 1.9379085667745461, + "grad_norm": 1.6796875, + "learning_rate": 5.5171493078810704e-06, + "loss": 0.8088, + "step": 11372 + }, + { + "epoch": 1.938080030863536, + "grad_norm": 1.5859375, + "learning_rate": 5.515535078678187e-06, + "loss": 0.8024, + "step": 11373 + }, + { + "epoch": 1.9382514949525258, + "grad_norm": 1.6484375, + "learning_rate": 5.51392099573059e-06, + "loss": 0.7431, + "step": 11374 + }, + { + "epoch": 
1.9384229590415156, + "grad_norm": 1.8671875, + "learning_rate": 5.512307059090925e-06, + "loss": 0.7968, + "step": 11375 + }, + { + "epoch": 1.9385944231305055, + "grad_norm": 1.671875, + "learning_rate": 5.5106932688118245e-06, + "loss": 0.7958, + "step": 11376 + }, + { + "epoch": 1.9387658872194955, + "grad_norm": 2.640625, + "learning_rate": 5.509079624945926e-06, + "loss": 0.8416, + "step": 11377 + }, + { + "epoch": 1.9389373513084853, + "grad_norm": 1.6328125, + "learning_rate": 5.507466127545847e-06, + "loss": 0.8436, + "step": 11378 + }, + { + "epoch": 1.9391088153974751, + "grad_norm": 1.84375, + "learning_rate": 5.505852776664215e-06, + "loss": 0.8292, + "step": 11379 + }, + { + "epoch": 1.9392802794864652, + "grad_norm": 1.6875, + "learning_rate": 5.504239572353649e-06, + "loss": 0.8329, + "step": 11380 + }, + { + "epoch": 1.939451743575455, + "grad_norm": 1.640625, + "learning_rate": 5.5026265146667605e-06, + "loss": 0.7997, + "step": 11381 + }, + { + "epoch": 1.9396232076644448, + "grad_norm": 1.6328125, + "learning_rate": 5.501013603656155e-06, + "loss": 0.7976, + "step": 11382 + }, + { + "epoch": 1.9397946717534347, + "grad_norm": 1.6796875, + "learning_rate": 5.499400839374438e-06, + "loss": 0.8052, + "step": 11383 + }, + { + "epoch": 1.9399661358424245, + "grad_norm": 1.7578125, + "learning_rate": 5.497788221874203e-06, + "loss": 0.9537, + "step": 11384 + }, + { + "epoch": 1.9401375999314143, + "grad_norm": 1.6484375, + "learning_rate": 5.496175751208052e-06, + "loss": 0.8053, + "step": 11385 + }, + { + "epoch": 1.9403090640204042, + "grad_norm": 1.6171875, + "learning_rate": 5.494563427428569e-06, + "loss": 0.8405, + "step": 11386 + }, + { + "epoch": 1.940480528109394, + "grad_norm": 1.671875, + "learning_rate": 5.492951250588341e-06, + "loss": 0.8469, + "step": 11387 + }, + { + "epoch": 1.9406519921983838, + "grad_norm": 1.625, + "learning_rate": 5.491339220739947e-06, + "loss": 0.797, + "step": 11388 + }, + { + "epoch": 1.9408234562873738, + "grad_norm": 1.6953125, + "learning_rate": 5.489727337935955e-06, + "loss": 0.8585, + "step": 11389 + }, + { + "epoch": 1.9409949203763637, + "grad_norm": 1.7890625, + "learning_rate": 5.488115602228938e-06, + "loss": 0.9122, + "step": 11390 + }, + { + "epoch": 1.9411663844653535, + "grad_norm": 1.625, + "learning_rate": 5.486504013671463e-06, + "loss": 0.792, + "step": 11391 + }, + { + "epoch": 1.9413378485543435, + "grad_norm": 1.7265625, + "learning_rate": 5.484892572316088e-06, + "loss": 0.8458, + "step": 11392 + }, + { + "epoch": 1.9415093126433334, + "grad_norm": 1.8203125, + "learning_rate": 5.483281278215366e-06, + "loss": 0.9417, + "step": 11393 + }, + { + "epoch": 1.9416807767323232, + "grad_norm": 1.6484375, + "learning_rate": 5.481670131421853e-06, + "loss": 0.8389, + "step": 11394 + }, + { + "epoch": 1.941852240821313, + "grad_norm": 1.640625, + "learning_rate": 5.4800591319880905e-06, + "loss": 0.8018, + "step": 11395 + }, + { + "epoch": 1.9420237049103029, + "grad_norm": 1.671875, + "learning_rate": 5.47844827996662e-06, + "loss": 0.861, + "step": 11396 + }, + { + "epoch": 1.9421951689992927, + "grad_norm": 1.671875, + "learning_rate": 5.476837575409978e-06, + "loss": 0.8843, + "step": 11397 + }, + { + "epoch": 1.9423666330882825, + "grad_norm": 1.7265625, + "learning_rate": 5.475227018370695e-06, + "loss": 0.8738, + "step": 11398 + }, + { + "epoch": 1.9425380971772723, + "grad_norm": 1.59375, + "learning_rate": 5.473616608901298e-06, + "loss": 0.8311, + "step": 11399 + }, + { + "epoch": 1.9427095612662622, + 
"grad_norm": 1.71875, + "learning_rate": 5.4720063470543115e-06, + "loss": 0.8529, + "step": 11400 + }, + { + "epoch": 1.9428810253552522, + "grad_norm": 1.71875, + "learning_rate": 5.470396232882242e-06, + "loss": 0.8726, + "step": 11401 + }, + { + "epoch": 1.943052489444242, + "grad_norm": 1.6953125, + "learning_rate": 5.468786266437611e-06, + "loss": 0.8968, + "step": 11402 + }, + { + "epoch": 1.9432239535332319, + "grad_norm": 1.640625, + "learning_rate": 5.467176447772924e-06, + "loss": 0.864, + "step": 11403 + }, + { + "epoch": 1.943395417622222, + "grad_norm": 1.625, + "learning_rate": 5.4655667769406805e-06, + "loss": 0.9435, + "step": 11404 + }, + { + "epoch": 1.9435668817112117, + "grad_norm": 1.6015625, + "learning_rate": 5.46395725399338e-06, + "loss": 0.7874, + "step": 11405 + }, + { + "epoch": 1.9437383458002016, + "grad_norm": 1.6640625, + "learning_rate": 5.462347878983516e-06, + "loss": 0.8414, + "step": 11406 + }, + { + "epoch": 1.9439098098891914, + "grad_norm": 1.6328125, + "learning_rate": 5.460738651963573e-06, + "loss": 0.7489, + "step": 11407 + }, + { + "epoch": 1.9440812739781812, + "grad_norm": 1.6171875, + "learning_rate": 5.459129572986037e-06, + "loss": 0.7888, + "step": 11408 + }, + { + "epoch": 1.944252738067171, + "grad_norm": 1.765625, + "learning_rate": 5.457520642103385e-06, + "loss": 0.8051, + "step": 11409 + }, + { + "epoch": 1.9444242021561609, + "grad_norm": 1.8046875, + "learning_rate": 5.4559118593680905e-06, + "loss": 0.9306, + "step": 11410 + }, + { + "epoch": 1.9445956662451507, + "grad_norm": 1.5703125, + "learning_rate": 5.454303224832624e-06, + "loss": 0.7945, + "step": 11411 + }, + { + "epoch": 1.9447671303341405, + "grad_norm": 1.8125, + "learning_rate": 5.4526947385494445e-06, + "loss": 0.8615, + "step": 11412 + }, + { + "epoch": 1.9449385944231306, + "grad_norm": 1.75, + "learning_rate": 5.451086400571013e-06, + "loss": 0.8689, + "step": 11413 + }, + { + "epoch": 1.9451100585121204, + "grad_norm": 1.6953125, + "learning_rate": 5.449478210949784e-06, + "loss": 0.8464, + "step": 11414 + }, + { + "epoch": 1.9452815226011102, + "grad_norm": 1.734375, + "learning_rate": 5.447870169738205e-06, + "loss": 0.8437, + "step": 11415 + }, + { + "epoch": 1.9454529866901002, + "grad_norm": 1.6484375, + "learning_rate": 5.446262276988722e-06, + "loss": 0.8332, + "step": 11416 + }, + { + "epoch": 1.94562445077909, + "grad_norm": 1.6171875, + "learning_rate": 5.444654532753775e-06, + "loss": 0.8617, + "step": 11417 + }, + { + "epoch": 1.94579591486808, + "grad_norm": 1.625, + "learning_rate": 5.443046937085795e-06, + "loss": 0.8854, + "step": 11418 + }, + { + "epoch": 1.9459673789570697, + "grad_norm": 1.6328125, + "learning_rate": 5.441439490037217e-06, + "loss": 0.8435, + "step": 11419 + }, + { + "epoch": 1.9461388430460596, + "grad_norm": 1.7734375, + "learning_rate": 5.439832191660462e-06, + "loss": 0.8938, + "step": 11420 + }, + { + "epoch": 1.9463103071350494, + "grad_norm": 1.7890625, + "learning_rate": 5.438225042007949e-06, + "loss": 0.9738, + "step": 11421 + }, + { + "epoch": 1.9464817712240392, + "grad_norm": 1.734375, + "learning_rate": 5.436618041132099e-06, + "loss": 0.8825, + "step": 11422 + }, + { + "epoch": 1.946653235313029, + "grad_norm": 1.703125, + "learning_rate": 5.435011189085318e-06, + "loss": 0.9542, + "step": 11423 + }, + { + "epoch": 1.9468246994020189, + "grad_norm": 1.6953125, + "learning_rate": 5.433404485920011e-06, + "loss": 0.8039, + "step": 11424 + }, + { + "epoch": 1.946996163491009, + "grad_norm": 1.7421875, + 
"learning_rate": 5.431797931688581e-06, + "loss": 0.7727, + "step": 11425 + }, + { + "epoch": 1.9471676275799987, + "grad_norm": 1.7265625, + "learning_rate": 5.430191526443423e-06, + "loss": 0.8357, + "step": 11426 + }, + { + "epoch": 1.9473390916689886, + "grad_norm": 1.6796875, + "learning_rate": 5.428585270236925e-06, + "loss": 0.8357, + "step": 11427 + }, + { + "epoch": 1.9475105557579786, + "grad_norm": 1.7890625, + "learning_rate": 5.426979163121479e-06, + "loss": 0.8168, + "step": 11428 + }, + { + "epoch": 1.9476820198469684, + "grad_norm": 1.6953125, + "learning_rate": 5.425373205149461e-06, + "loss": 0.8457, + "step": 11429 + }, + { + "epoch": 1.9478534839359583, + "grad_norm": 1.734375, + "learning_rate": 5.42376739637325e-06, + "loss": 0.8166, + "step": 11430 + }, + { + "epoch": 1.948024948024948, + "grad_norm": 1.6640625, + "learning_rate": 5.4221617368452165e-06, + "loss": 0.8057, + "step": 11431 + }, + { + "epoch": 1.948196412113938, + "grad_norm": 1.7421875, + "learning_rate": 5.420556226617727e-06, + "loss": 0.7438, + "step": 11432 + }, + { + "epoch": 1.9483678762029277, + "grad_norm": 1.6015625, + "learning_rate": 5.4189508657431444e-06, + "loss": 0.7822, + "step": 11433 + }, + { + "epoch": 1.9485393402919176, + "grad_norm": 1.75, + "learning_rate": 5.417345654273827e-06, + "loss": 0.8917, + "step": 11434 + }, + { + "epoch": 1.9487108043809074, + "grad_norm": 1.671875, + "learning_rate": 5.415740592262122e-06, + "loss": 0.7634, + "step": 11435 + }, + { + "epoch": 1.9488822684698972, + "grad_norm": 1.65625, + "learning_rate": 5.41413567976038e-06, + "loss": 0.8146, + "step": 11436 + }, + { + "epoch": 1.9490537325588873, + "grad_norm": 1.7421875, + "learning_rate": 5.4125309168209394e-06, + "loss": 0.9364, + "step": 11437 + }, + { + "epoch": 1.949225196647877, + "grad_norm": 1.625, + "learning_rate": 5.410926303496142e-06, + "loss": 0.8589, + "step": 11438 + }, + { + "epoch": 1.949396660736867, + "grad_norm": 1.671875, + "learning_rate": 5.4093218398383154e-06, + "loss": 0.8342, + "step": 11439 + }, + { + "epoch": 1.9495681248258567, + "grad_norm": 1.8046875, + "learning_rate": 5.407717525899793e-06, + "loss": 0.8523, + "step": 11440 + }, + { + "epoch": 1.9497395889148468, + "grad_norm": 1.6171875, + "learning_rate": 5.4061133617328956e-06, + "loss": 0.8945, + "step": 11441 + }, + { + "epoch": 1.9499110530038366, + "grad_norm": 1.6953125, + "learning_rate": 5.40450934738994e-06, + "loss": 0.7959, + "step": 11442 + }, + { + "epoch": 1.9500825170928264, + "grad_norm": 1.65625, + "learning_rate": 5.402905482923239e-06, + "loss": 0.9032, + "step": 11443 + }, + { + "epoch": 1.9502539811818163, + "grad_norm": 1.6875, + "learning_rate": 5.4013017683851034e-06, + "loss": 0.8463, + "step": 11444 + }, + { + "epoch": 1.950425445270806, + "grad_norm": 1.7734375, + "learning_rate": 5.399698203827834e-06, + "loss": 0.8543, + "step": 11445 + }, + { + "epoch": 1.950596909359796, + "grad_norm": 1.7265625, + "learning_rate": 5.398094789303731e-06, + "loss": 0.9256, + "step": 11446 + }, + { + "epoch": 1.9507683734487857, + "grad_norm": 1.6640625, + "learning_rate": 5.396491524865084e-06, + "loss": 0.7521, + "step": 11447 + }, + { + "epoch": 1.9509398375377756, + "grad_norm": 1.75, + "learning_rate": 5.394888410564184e-06, + "loss": 0.8547, + "step": 11448 + }, + { + "epoch": 1.9511113016267654, + "grad_norm": 1.609375, + "learning_rate": 5.393285446453314e-06, + "loss": 0.801, + "step": 11449 + }, + { + "epoch": 1.9512827657157554, + "grad_norm": 1.7109375, + "learning_rate": 
5.391682632584752e-06, + "loss": 0.8614, + "step": 11450 + }, + { + "epoch": 1.9514542298047453, + "grad_norm": 1.6875, + "learning_rate": 5.390079969010773e-06, + "loss": 0.8522, + "step": 11451 + }, + { + "epoch": 1.951625693893735, + "grad_norm": 1.671875, + "learning_rate": 5.388477455783645e-06, + "loss": 0.8962, + "step": 11452 + }, + { + "epoch": 1.9517971579827251, + "grad_norm": 1.75, + "learning_rate": 5.386875092955633e-06, + "loss": 0.8499, + "step": 11453 + }, + { + "epoch": 1.951968622071715, + "grad_norm": 1.734375, + "learning_rate": 5.385272880578996e-06, + "loss": 0.9148, + "step": 11454 + }, + { + "epoch": 1.9521400861607048, + "grad_norm": 1.8828125, + "learning_rate": 5.383670818705987e-06, + "loss": 0.792, + "step": 11455 + }, + { + "epoch": 1.9523115502496946, + "grad_norm": 1.6875, + "learning_rate": 5.382068907388856e-06, + "loss": 0.8232, + "step": 11456 + }, + { + "epoch": 1.9524830143386844, + "grad_norm": 1.6171875, + "learning_rate": 5.3804671466798486e-06, + "loss": 0.8675, + "step": 11457 + }, + { + "epoch": 1.9526544784276743, + "grad_norm": 1.71875, + "learning_rate": 5.378865536631203e-06, + "loss": 0.8182, + "step": 11458 + }, + { + "epoch": 1.952825942516664, + "grad_norm": 1.6796875, + "learning_rate": 5.377264077295152e-06, + "loss": 0.8782, + "step": 11459 + }, + { + "epoch": 1.952997406605654, + "grad_norm": 1.734375, + "learning_rate": 5.375662768723929e-06, + "loss": 0.7995, + "step": 11460 + }, + { + "epoch": 1.9531688706946437, + "grad_norm": 1.625, + "learning_rate": 5.374061610969756e-06, + "loss": 0.8436, + "step": 11461 + }, + { + "epoch": 1.9533403347836338, + "grad_norm": 1.71875, + "learning_rate": 5.372460604084854e-06, + "loss": 0.8517, + "step": 11462 + }, + { + "epoch": 1.9535117988726236, + "grad_norm": 1.6484375, + "learning_rate": 5.370859748121437e-06, + "loss": 0.8209, + "step": 11463 + }, + { + "epoch": 1.9536832629616134, + "grad_norm": 1.8046875, + "learning_rate": 5.369259043131716e-06, + "loss": 0.8313, + "step": 11464 + }, + { + "epoch": 1.9538547270506035, + "grad_norm": 1.6640625, + "learning_rate": 5.367658489167894e-06, + "loss": 0.8333, + "step": 11465 + }, + { + "epoch": 1.9540261911395933, + "grad_norm": 1.6640625, + "learning_rate": 5.366058086282173e-06, + "loss": 0.8558, + "step": 11466 + }, + { + "epoch": 1.9541976552285831, + "grad_norm": 1.8046875, + "learning_rate": 5.364457834526749e-06, + "loss": 0.8054, + "step": 11467 + }, + { + "epoch": 1.954369119317573, + "grad_norm": 1.6875, + "learning_rate": 5.362857733953809e-06, + "loss": 0.8077, + "step": 11468 + }, + { + "epoch": 1.9545405834065628, + "grad_norm": 1.6015625, + "learning_rate": 5.361257784615546e-06, + "loss": 0.8044, + "step": 11469 + }, + { + "epoch": 1.9547120474955526, + "grad_norm": 1.7578125, + "learning_rate": 5.3596579865641286e-06, + "loss": 0.9041, + "step": 11470 + }, + { + "epoch": 1.9548835115845424, + "grad_norm": 1.6484375, + "learning_rate": 5.358058339851739e-06, + "loss": 0.8412, + "step": 11471 + }, + { + "epoch": 1.9550549756735323, + "grad_norm": 1.6640625, + "learning_rate": 5.356458844530546e-06, + "loss": 0.8495, + "step": 11472 + }, + { + "epoch": 1.955226439762522, + "grad_norm": 1.6328125, + "learning_rate": 5.354859500652717e-06, + "loss": 0.8629, + "step": 11473 + }, + { + "epoch": 1.9553979038515121, + "grad_norm": 1.7421875, + "learning_rate": 5.3532603082704115e-06, + "loss": 0.8612, + "step": 11474 + }, + { + "epoch": 1.955569367940502, + "grad_norm": 1.8125, + "learning_rate": 5.351661267435786e-06, + "loss": 
0.8293, + "step": 11475 + }, + { + "epoch": 1.9557408320294918, + "grad_norm": 1.7421875, + "learning_rate": 5.350062378200989e-06, + "loss": 0.8635, + "step": 11476 + }, + { + "epoch": 1.9559122961184818, + "grad_norm": 1.7109375, + "learning_rate": 5.348463640618169e-06, + "loss": 0.8071, + "step": 11477 + }, + { + "epoch": 1.9560837602074717, + "grad_norm": 1.6640625, + "learning_rate": 5.346865054739463e-06, + "loss": 0.8663, + "step": 11478 + }, + { + "epoch": 1.9562552242964615, + "grad_norm": 1.7890625, + "learning_rate": 5.345266620617011e-06, + "loss": 0.9402, + "step": 11479 + }, + { + "epoch": 1.9564266883854513, + "grad_norm": 1.7890625, + "learning_rate": 5.343668338302944e-06, + "loss": 0.8633, + "step": 11480 + }, + { + "epoch": 1.9565981524744411, + "grad_norm": 1.734375, + "learning_rate": 5.34207020784939e-06, + "loss": 0.8151, + "step": 11481 + }, + { + "epoch": 1.956769616563431, + "grad_norm": 1.703125, + "learning_rate": 5.340472229308465e-06, + "loss": 0.8865, + "step": 11482 + }, + { + "epoch": 1.9569410806524208, + "grad_norm": 1.671875, + "learning_rate": 5.338874402732284e-06, + "loss": 0.7956, + "step": 11483 + }, + { + "epoch": 1.9571125447414106, + "grad_norm": 1.59375, + "learning_rate": 5.337276728172962e-06, + "loss": 0.7595, + "step": 11484 + }, + { + "epoch": 1.9572840088304004, + "grad_norm": 1.671875, + "learning_rate": 5.335679205682604e-06, + "loss": 0.8831, + "step": 11485 + }, + { + "epoch": 1.9574554729193905, + "grad_norm": 1.71875, + "learning_rate": 5.3340818353133105e-06, + "loss": 0.8479, + "step": 11486 + }, + { + "epoch": 1.9576269370083803, + "grad_norm": 1.59375, + "learning_rate": 5.33248461711718e-06, + "loss": 0.8728, + "step": 11487 + }, + { + "epoch": 1.9577984010973701, + "grad_norm": 1.703125, + "learning_rate": 5.330887551146301e-06, + "loss": 0.952, + "step": 11488 + }, + { + "epoch": 1.9579698651863602, + "grad_norm": 1.6875, + "learning_rate": 5.329290637452762e-06, + "loss": 0.8699, + "step": 11489 + }, + { + "epoch": 1.95814132927535, + "grad_norm": 1.6484375, + "learning_rate": 5.327693876088643e-06, + "loss": 0.8186, + "step": 11490 + }, + { + "epoch": 1.9583127933643398, + "grad_norm": 1.8046875, + "learning_rate": 5.3260972671060215e-06, + "loss": 0.8201, + "step": 11491 + }, + { + "epoch": 1.9584842574533297, + "grad_norm": 1.734375, + "learning_rate": 5.324500810556971e-06, + "loss": 0.8688, + "step": 11492 + }, + { + "epoch": 1.9586557215423195, + "grad_norm": 1.65625, + "learning_rate": 5.322904506493554e-06, + "loss": 0.8751, + "step": 11493 + }, + { + "epoch": 1.9588271856313093, + "grad_norm": 1.7890625, + "learning_rate": 5.321308354967828e-06, + "loss": 0.8815, + "step": 11494 + }, + { + "epoch": 1.9589986497202991, + "grad_norm": 1.71875, + "learning_rate": 5.319712356031859e-06, + "loss": 0.8465, + "step": 11495 + }, + { + "epoch": 1.959170113809289, + "grad_norm": 1.703125, + "learning_rate": 5.318116509737694e-06, + "loss": 0.807, + "step": 11496 + }, + { + "epoch": 1.9593415778982788, + "grad_norm": 1.8046875, + "learning_rate": 5.31652081613738e-06, + "loss": 0.8358, + "step": 11497 + }, + { + "epoch": 1.9595130419872688, + "grad_norm": 1.6875, + "learning_rate": 5.314925275282959e-06, + "loss": 0.8653, + "step": 11498 + }, + { + "epoch": 1.9596845060762587, + "grad_norm": 1.6796875, + "learning_rate": 5.313329887226467e-06, + "loss": 0.8799, + "step": 11499 + }, + { + "epoch": 1.9598559701652485, + "grad_norm": 1.796875, + "learning_rate": 5.311734652019935e-06, + "loss": 0.8724, + "step": 11500 + }, + { + 
"epoch": 1.9600274342542385, + "grad_norm": 1.78125, + "learning_rate": 5.310139569715391e-06, + "loss": 0.8704, + "step": 11501 + }, + { + "epoch": 1.9601988983432284, + "grad_norm": 1.6171875, + "learning_rate": 5.308544640364856e-06, + "loss": 0.7946, + "step": 11502 + }, + { + "epoch": 1.9603703624322182, + "grad_norm": 1.6640625, + "learning_rate": 5.306949864020348e-06, + "loss": 0.7768, + "step": 11503 + }, + { + "epoch": 1.960541826521208, + "grad_norm": 1.828125, + "learning_rate": 5.305355240733881e-06, + "loss": 0.7978, + "step": 11504 + }, + { + "epoch": 1.9607132906101978, + "grad_norm": 1.609375, + "learning_rate": 5.303760770557452e-06, + "loss": 0.8094, + "step": 11505 + }, + { + "epoch": 1.9608847546991877, + "grad_norm": 1.6484375, + "learning_rate": 5.302166453543069e-06, + "loss": 0.8126, + "step": 11506 + }, + { + "epoch": 1.9610562187881775, + "grad_norm": 1.6484375, + "learning_rate": 5.30057228974273e-06, + "loss": 0.8152, + "step": 11507 + }, + { + "epoch": 1.9612276828771673, + "grad_norm": 1.6640625, + "learning_rate": 5.2989782792084236e-06, + "loss": 0.8209, + "step": 11508 + }, + { + "epoch": 1.9613991469661571, + "grad_norm": 1.6796875, + "learning_rate": 5.297384421992138e-06, + "loss": 0.8824, + "step": 11509 + }, + { + "epoch": 1.9615706110551472, + "grad_norm": 1.6875, + "learning_rate": 5.2957907181458554e-06, + "loss": 0.8342, + "step": 11510 + }, + { + "epoch": 1.961742075144137, + "grad_norm": 1.6640625, + "learning_rate": 5.294197167721552e-06, + "loss": 0.8246, + "step": 11511 + }, + { + "epoch": 1.9619135392331268, + "grad_norm": 1.6875, + "learning_rate": 5.292603770771199e-06, + "loss": 0.8105, + "step": 11512 + }, + { + "epoch": 1.9620850033221169, + "grad_norm": 1.546875, + "learning_rate": 5.291010527346763e-06, + "loss": 0.747, + "step": 11513 + }, + { + "epoch": 1.9622564674111067, + "grad_norm": 1.734375, + "learning_rate": 5.289417437500207e-06, + "loss": 0.8496, + "step": 11514 + }, + { + "epoch": 1.9624279315000965, + "grad_norm": 1.765625, + "learning_rate": 5.287824501283487e-06, + "loss": 0.8033, + "step": 11515 + }, + { + "epoch": 1.9625993955890864, + "grad_norm": 1.6640625, + "learning_rate": 5.286231718748553e-06, + "loss": 0.7926, + "step": 11516 + }, + { + "epoch": 1.9627708596780762, + "grad_norm": 1.6875, + "learning_rate": 5.284639089947355e-06, + "loss": 0.8827, + "step": 11517 + }, + { + "epoch": 1.962942323767066, + "grad_norm": 1.6796875, + "learning_rate": 5.28304661493183e-06, + "loss": 0.7797, + "step": 11518 + }, + { + "epoch": 1.9631137878560558, + "grad_norm": 1.6484375, + "learning_rate": 5.281454293753921e-06, + "loss": 0.827, + "step": 11519 + }, + { + "epoch": 1.9632852519450457, + "grad_norm": 1.640625, + "learning_rate": 5.279862126465552e-06, + "loss": 0.8803, + "step": 11520 + }, + { + "epoch": 1.9634567160340355, + "grad_norm": 1.6875, + "learning_rate": 5.278270113118655e-06, + "loss": 0.8489, + "step": 11521 + }, + { + "epoch": 1.9636281801230255, + "grad_norm": 1.7109375, + "learning_rate": 5.276678253765151e-06, + "loss": 0.8641, + "step": 11522 + }, + { + "epoch": 1.9637996442120154, + "grad_norm": 1.6796875, + "learning_rate": 5.275086548456954e-06, + "loss": 0.8714, + "step": 11523 + }, + { + "epoch": 1.9639711083010052, + "grad_norm": 1.8046875, + "learning_rate": 5.273494997245978e-06, + "loss": 0.9374, + "step": 11524 + }, + { + "epoch": 1.9641425723899952, + "grad_norm": 1.6875, + "learning_rate": 5.2719036001841295e-06, + "loss": 0.8625, + "step": 11525 + }, + { + "epoch": 1.964314036478985, + 
"grad_norm": 1.75, + "learning_rate": 5.270312357323307e-06, + "loss": 0.8165, + "step": 11526 + }, + { + "epoch": 1.9644855005679749, + "grad_norm": 1.6640625, + "learning_rate": 5.268721268715413e-06, + "loss": 0.8819, + "step": 11527 + }, + { + "epoch": 1.9646569646569647, + "grad_norm": 1.7265625, + "learning_rate": 5.2671303344123315e-06, + "loss": 0.8243, + "step": 11528 + }, + { + "epoch": 1.9648284287459545, + "grad_norm": 1.65625, + "learning_rate": 5.265539554465953e-06, + "loss": 0.8084, + "step": 11529 + }, + { + "epoch": 1.9649998928349444, + "grad_norm": 1.7265625, + "learning_rate": 5.263948928928157e-06, + "loss": 0.8177, + "step": 11530 + }, + { + "epoch": 1.9651713569239342, + "grad_norm": 1.7265625, + "learning_rate": 5.26235845785082e-06, + "loss": 0.8604, + "step": 11531 + }, + { + "epoch": 1.965342821012924, + "grad_norm": 1.78125, + "learning_rate": 5.260768141285814e-06, + "loss": 0.877, + "step": 11532 + }, + { + "epoch": 1.9655142851019138, + "grad_norm": 1.671875, + "learning_rate": 5.259177979285003e-06, + "loss": 0.8761, + "step": 11533 + }, + { + "epoch": 1.9656857491909037, + "grad_norm": 1.671875, + "learning_rate": 5.257587971900253e-06, + "loss": 0.8233, + "step": 11534 + }, + { + "epoch": 1.9658572132798937, + "grad_norm": 1.7734375, + "learning_rate": 5.255998119183419e-06, + "loss": 0.9104, + "step": 11535 + }, + { + "epoch": 1.9660286773688835, + "grad_norm": 1.6796875, + "learning_rate": 5.254408421186348e-06, + "loss": 0.8313, + "step": 11536 + }, + { + "epoch": 1.9662001414578734, + "grad_norm": 1.7734375, + "learning_rate": 5.25281887796089e-06, + "loss": 0.8515, + "step": 11537 + }, + { + "epoch": 1.9663716055468634, + "grad_norm": 1.7265625, + "learning_rate": 5.251229489558889e-06, + "loss": 0.8842, + "step": 11538 + }, + { + "epoch": 1.9665430696358532, + "grad_norm": 1.7421875, + "learning_rate": 5.249640256032172e-06, + "loss": 0.8579, + "step": 11539 + }, + { + "epoch": 1.966714533724843, + "grad_norm": 1.8046875, + "learning_rate": 5.248051177432575e-06, + "loss": 0.8923, + "step": 11540 + }, + { + "epoch": 1.9668859978138329, + "grad_norm": 1.7109375, + "learning_rate": 5.246462253811923e-06, + "loss": 0.8346, + "step": 11541 + }, + { + "epoch": 1.9670574619028227, + "grad_norm": 1.640625, + "learning_rate": 5.244873485222038e-06, + "loss": 0.8458, + "step": 11542 + }, + { + "epoch": 1.9672289259918125, + "grad_norm": 1.765625, + "learning_rate": 5.243284871714733e-06, + "loss": 0.8551, + "step": 11543 + }, + { + "epoch": 1.9674003900808024, + "grad_norm": 1.734375, + "learning_rate": 5.2416964133418216e-06, + "loss": 0.8916, + "step": 11544 + }, + { + "epoch": 1.9675718541697922, + "grad_norm": 1.7734375, + "learning_rate": 5.240108110155108e-06, + "loss": 0.8748, + "step": 11545 + }, + { + "epoch": 1.967743318258782, + "grad_norm": 1.6953125, + "learning_rate": 5.238519962206392e-06, + "loss": 0.8406, + "step": 11546 + }, + { + "epoch": 1.967914782347772, + "grad_norm": 1.75, + "learning_rate": 5.236931969547472e-06, + "loss": 0.8873, + "step": 11547 + }, + { + "epoch": 1.9680862464367619, + "grad_norm": 1.7578125, + "learning_rate": 5.2353441322301356e-06, + "loss": 0.8211, + "step": 11548 + }, + { + "epoch": 1.9682577105257517, + "grad_norm": 1.8671875, + "learning_rate": 5.2337564503061685e-06, + "loss": 0.8973, + "step": 11549 + }, + { + "epoch": 1.9684291746147418, + "grad_norm": 1.7890625, + "learning_rate": 5.232168923827352e-06, + "loss": 0.8692, + "step": 11550 + }, + { + "epoch": 1.9686006387037316, + "grad_norm": 1.734375, + 
"learning_rate": 5.230581552845462e-06, + "loss": 0.9202, + "step": 11551 + }, + { + "epoch": 1.9687721027927214, + "grad_norm": 1.78125, + "learning_rate": 5.228994337412266e-06, + "loss": 0.8969, + "step": 11552 + }, + { + "epoch": 1.9689435668817112, + "grad_norm": 1.609375, + "learning_rate": 5.227407277579531e-06, + "loss": 0.8916, + "step": 11553 + }, + { + "epoch": 1.969115030970701, + "grad_norm": 2.265625, + "learning_rate": 5.225820373399018e-06, + "loss": 0.8081, + "step": 11554 + }, + { + "epoch": 1.969286495059691, + "grad_norm": 1.671875, + "learning_rate": 5.224233624922479e-06, + "loss": 0.7951, + "step": 11555 + }, + { + "epoch": 1.9694579591486807, + "grad_norm": 1.703125, + "learning_rate": 5.222647032201664e-06, + "loss": 0.8401, + "step": 11556 + }, + { + "epoch": 1.9696294232376705, + "grad_norm": 1.7109375, + "learning_rate": 5.221060595288321e-06, + "loss": 0.8383, + "step": 11557 + }, + { + "epoch": 1.9698008873266604, + "grad_norm": 1.8046875, + "learning_rate": 5.219474314234187e-06, + "loss": 0.8254, + "step": 11558 + }, + { + "epoch": 1.9699723514156504, + "grad_norm": 1.6953125, + "learning_rate": 5.217888189090999e-06, + "loss": 0.8489, + "step": 11559 + }, + { + "epoch": 1.9701438155046402, + "grad_norm": 1.6484375, + "learning_rate": 5.216302219910484e-06, + "loss": 0.8711, + "step": 11560 + }, + { + "epoch": 1.97031527959363, + "grad_norm": 1.8125, + "learning_rate": 5.2147164067443715e-06, + "loss": 0.8555, + "step": 11561 + }, + { + "epoch": 1.9704867436826201, + "grad_norm": 1.7109375, + "learning_rate": 5.2131307496443724e-06, + "loss": 0.8815, + "step": 11562 + }, + { + "epoch": 1.97065820777161, + "grad_norm": 1.8515625, + "learning_rate": 5.211545248662205e-06, + "loss": 0.8769, + "step": 11563 + }, + { + "epoch": 1.9708296718605998, + "grad_norm": 1.703125, + "learning_rate": 5.20995990384958e-06, + "loss": 0.7547, + "step": 11564 + }, + { + "epoch": 1.9710011359495896, + "grad_norm": 1.7578125, + "learning_rate": 5.2083747152581975e-06, + "loss": 0.9023, + "step": 11565 + }, + { + "epoch": 1.9711726000385794, + "grad_norm": 1.6953125, + "learning_rate": 5.206789682939761e-06, + "loss": 0.8359, + "step": 11566 + }, + { + "epoch": 1.9713440641275692, + "grad_norm": 1.6484375, + "learning_rate": 5.205204806945963e-06, + "loss": 0.7655, + "step": 11567 + }, + { + "epoch": 1.971515528216559, + "grad_norm": 1.6328125, + "learning_rate": 5.203620087328491e-06, + "loss": 0.7909, + "step": 11568 + }, + { + "epoch": 1.971686992305549, + "grad_norm": 1.71875, + "learning_rate": 5.202035524139029e-06, + "loss": 0.8972, + "step": 11569 + }, + { + "epoch": 1.9718584563945387, + "grad_norm": 1.6171875, + "learning_rate": 5.200451117429257e-06, + "loss": 0.8527, + "step": 11570 + }, + { + "epoch": 1.9720299204835288, + "grad_norm": 1.703125, + "learning_rate": 5.198866867250847e-06, + "loss": 0.8325, + "step": 11571 + }, + { + "epoch": 1.9722013845725186, + "grad_norm": 1.6484375, + "learning_rate": 5.197282773655463e-06, + "loss": 0.8403, + "step": 11572 + }, + { + "epoch": 1.9723728486615084, + "grad_norm": 1.6796875, + "learning_rate": 5.195698836694783e-06, + "loss": 0.816, + "step": 11573 + }, + { + "epoch": 1.9725443127504985, + "grad_norm": 1.7421875, + "learning_rate": 5.19411505642045e-06, + "loss": 0.8329, + "step": 11574 + }, + { + "epoch": 1.9727157768394883, + "grad_norm": 1.703125, + "learning_rate": 5.192531432884122e-06, + "loss": 0.955, + "step": 11575 + }, + { + "epoch": 1.9728872409284781, + "grad_norm": 1.703125, + "learning_rate": 
5.1909479661374475e-06, + "loss": 0.8504, + "step": 11576 + }, + { + "epoch": 1.973058705017468, + "grad_norm": 1.6796875, + "learning_rate": 5.18936465623207e-06, + "loss": 0.8312, + "step": 11577 + }, + { + "epoch": 1.9732301691064578, + "grad_norm": 1.84375, + "learning_rate": 5.1877815032196255e-06, + "loss": 0.8168, + "step": 11578 + }, + { + "epoch": 1.9734016331954476, + "grad_norm": 1.6875, + "learning_rate": 5.186198507151747e-06, + "loss": 0.8544, + "step": 11579 + }, + { + "epoch": 1.9735730972844374, + "grad_norm": 1.7109375, + "learning_rate": 5.184615668080063e-06, + "loss": 0.8645, + "step": 11580 + }, + { + "epoch": 1.9737445613734272, + "grad_norm": 1.6640625, + "learning_rate": 5.183032986056195e-06, + "loss": 0.7389, + "step": 11581 + }, + { + "epoch": 1.973916025462417, + "grad_norm": 1.671875, + "learning_rate": 5.181450461131759e-06, + "loss": 0.7831, + "step": 11582 + }, + { + "epoch": 1.9740874895514071, + "grad_norm": 1.671875, + "learning_rate": 5.17986809335837e-06, + "loss": 0.881, + "step": 11583 + }, + { + "epoch": 1.974258953640397, + "grad_norm": 1.6640625, + "learning_rate": 5.178285882787637e-06, + "loss": 0.8336, + "step": 11584 + }, + { + "epoch": 1.9744304177293868, + "grad_norm": 1.7578125, + "learning_rate": 5.176703829471155e-06, + "loss": 0.8578, + "step": 11585 + }, + { + "epoch": 1.9746018818183768, + "grad_norm": 1.6015625, + "learning_rate": 5.175121933460523e-06, + "loss": 0.8177, + "step": 11586 + }, + { + "epoch": 1.9747733459073666, + "grad_norm": 1.7265625, + "learning_rate": 5.173540194807334e-06, + "loss": 0.853, + "step": 11587 + }, + { + "epoch": 1.9749448099963565, + "grad_norm": 1.703125, + "learning_rate": 5.171958613563168e-06, + "loss": 0.8128, + "step": 11588 + }, + { + "epoch": 1.9751162740853463, + "grad_norm": 1.65625, + "learning_rate": 5.170377189779618e-06, + "loss": 0.8597, + "step": 11589 + }, + { + "epoch": 1.9752877381743361, + "grad_norm": 1.75, + "learning_rate": 5.1687959235082554e-06, + "loss": 0.7888, + "step": 11590 + }, + { + "epoch": 1.975459202263326, + "grad_norm": 1.734375, + "learning_rate": 5.1672148148006494e-06, + "loss": 0.8659, + "step": 11591 + }, + { + "epoch": 1.9756306663523158, + "grad_norm": 1.6796875, + "learning_rate": 5.165633863708368e-06, + "loss": 0.8922, + "step": 11592 + }, + { + "epoch": 1.9758021304413056, + "grad_norm": 1.6875, + "learning_rate": 5.164053070282968e-06, + "loss": 0.9071, + "step": 11593 + }, + { + "epoch": 1.9759735945302954, + "grad_norm": 1.703125, + "learning_rate": 5.162472434576011e-06, + "loss": 0.9287, + "step": 11594 + }, + { + "epoch": 1.9761450586192855, + "grad_norm": 1.765625, + "learning_rate": 5.160891956639042e-06, + "loss": 0.8846, + "step": 11595 + }, + { + "epoch": 1.9763165227082753, + "grad_norm": 1.6640625, + "learning_rate": 5.159311636523614e-06, + "loss": 0.8229, + "step": 11596 + }, + { + "epoch": 1.9764879867972651, + "grad_norm": 1.625, + "learning_rate": 5.1577314742812575e-06, + "loss": 0.8095, + "step": 11597 + }, + { + "epoch": 1.9766594508862552, + "grad_norm": 1.7421875, + "learning_rate": 5.1561514699635126e-06, + "loss": 0.8743, + "step": 11598 + }, + { + "epoch": 1.976830914975245, + "grad_norm": 1.65625, + "learning_rate": 5.1545716236219075e-06, + "loss": 0.7827, + "step": 11599 + }, + { + "epoch": 1.9770023790642348, + "grad_norm": 1.7109375, + "learning_rate": 5.152991935307969e-06, + "loss": 0.9119, + "step": 11600 + }, + { + "epoch": 1.9771738431532246, + "grad_norm": 1.6796875, + "learning_rate": 5.151412405073215e-06, + "loss": 
0.8324, + "step": 11601 + }, + { + "epoch": 1.9773453072422145, + "grad_norm": 1.6484375, + "learning_rate": 5.1498330329691605e-06, + "loss": 0.8356, + "step": 11602 + }, + { + "epoch": 1.9775167713312043, + "grad_norm": 1.640625, + "learning_rate": 5.148253819047315e-06, + "loss": 0.833, + "step": 11603 + }, + { + "epoch": 1.9776882354201941, + "grad_norm": 1.65625, + "learning_rate": 5.146674763359183e-06, + "loss": 0.8032, + "step": 11604 + }, + { + "epoch": 1.977859699509184, + "grad_norm": 1.7890625, + "learning_rate": 5.145095865956262e-06, + "loss": 0.8332, + "step": 11605 + }, + { + "epoch": 1.9780311635981738, + "grad_norm": 1.7421875, + "learning_rate": 5.1435171268900474e-06, + "loss": 0.8906, + "step": 11606 + }, + { + "epoch": 1.9782026276871638, + "grad_norm": 1.640625, + "learning_rate": 5.1419385462120264e-06, + "loss": 0.9111, + "step": 11607 + }, + { + "epoch": 1.9783740917761536, + "grad_norm": 1.65625, + "learning_rate": 5.140360123973683e-06, + "loss": 0.8105, + "step": 11608 + }, + { + "epoch": 1.9785455558651435, + "grad_norm": 1.671875, + "learning_rate": 5.138781860226496e-06, + "loss": 0.8314, + "step": 11609 + }, + { + "epoch": 1.9787170199541335, + "grad_norm": 1.6328125, + "learning_rate": 5.137203755021938e-06, + "loss": 0.7603, + "step": 11610 + }, + { + "epoch": 1.9788884840431233, + "grad_norm": 1.671875, + "learning_rate": 5.135625808411478e-06, + "loss": 0.874, + "step": 11611 + }, + { + "epoch": 1.9790599481321132, + "grad_norm": 1.7265625, + "learning_rate": 5.134048020446578e-06, + "loss": 0.8629, + "step": 11612 + }, + { + "epoch": 1.979231412221103, + "grad_norm": 1.8125, + "learning_rate": 5.1324703911786946e-06, + "loss": 0.8188, + "step": 11613 + }, + { + "epoch": 1.9794028763100928, + "grad_norm": 1.7109375, + "learning_rate": 5.1308929206592805e-06, + "loss": 0.7701, + "step": 11614 + }, + { + "epoch": 1.9795743403990826, + "grad_norm": 1.703125, + "learning_rate": 5.129315608939786e-06, + "loss": 0.8955, + "step": 11615 + }, + { + "epoch": 1.9797458044880725, + "grad_norm": 1.7421875, + "learning_rate": 5.1277384560716495e-06, + "loss": 0.8836, + "step": 11616 + }, + { + "epoch": 1.9799172685770623, + "grad_norm": 1.703125, + "learning_rate": 5.126161462106308e-06, + "loss": 0.9262, + "step": 11617 + }, + { + "epoch": 1.9800887326660521, + "grad_norm": 1.7578125, + "learning_rate": 5.124584627095196e-06, + "loss": 0.8944, + "step": 11618 + }, + { + "epoch": 1.9802601967550422, + "grad_norm": 1.6484375, + "learning_rate": 5.123007951089743e-06, + "loss": 0.8255, + "step": 11619 + }, + { + "epoch": 1.980431660844032, + "grad_norm": 1.6796875, + "learning_rate": 5.12143143414136e-06, + "loss": 0.8353, + "step": 11620 + }, + { + "epoch": 1.9806031249330218, + "grad_norm": 1.7578125, + "learning_rate": 5.119855076301468e-06, + "loss": 0.8852, + "step": 11621 + }, + { + "epoch": 1.9807745890220119, + "grad_norm": 1.671875, + "learning_rate": 5.118278877621481e-06, + "loss": 0.8293, + "step": 11622 + }, + { + "epoch": 1.9809460531110017, + "grad_norm": 1.65625, + "learning_rate": 5.116702838152803e-06, + "loss": 0.8015, + "step": 11623 + }, + { + "epoch": 1.9811175171999915, + "grad_norm": 1.7265625, + "learning_rate": 5.115126957946832e-06, + "loss": 0.8779, + "step": 11624 + }, + { + "epoch": 1.9812889812889813, + "grad_norm": 1.8046875, + "learning_rate": 5.113551237054968e-06, + "loss": 0.869, + "step": 11625 + }, + { + "epoch": 1.9814604453779712, + "grad_norm": 1.796875, + "learning_rate": 5.111975675528598e-06, + "loss": 0.8304, + "step": 
11626 + }, + { + "epoch": 1.981631909466961, + "grad_norm": 1.6171875, + "learning_rate": 5.110400273419104e-06, + "loss": 0.7922, + "step": 11627 + }, + { + "epoch": 1.9818033735559508, + "grad_norm": 1.625, + "learning_rate": 5.108825030777873e-06, + "loss": 0.8331, + "step": 11628 + }, + { + "epoch": 1.9819748376449406, + "grad_norm": 1.7421875, + "learning_rate": 5.107249947656276e-06, + "loss": 0.8969, + "step": 11629 + }, + { + "epoch": 1.9821463017339305, + "grad_norm": 1.7265625, + "learning_rate": 5.105675024105683e-06, + "loss": 0.8615, + "step": 11630 + }, + { + "epoch": 1.9823177658229203, + "grad_norm": 1.734375, + "learning_rate": 5.104100260177462e-06, + "loss": 0.8852, + "step": 11631 + }, + { + "epoch": 1.9824892299119103, + "grad_norm": 1.7890625, + "learning_rate": 5.102525655922963e-06, + "loss": 0.8795, + "step": 11632 + }, + { + "epoch": 1.9826606940009002, + "grad_norm": 1.6171875, + "learning_rate": 5.100951211393545e-06, + "loss": 0.8602, + "step": 11633 + }, + { + "epoch": 1.98283215808989, + "grad_norm": 1.71875, + "learning_rate": 5.099376926640558e-06, + "loss": 0.9049, + "step": 11634 + }, + { + "epoch": 1.98300362217888, + "grad_norm": 1.6484375, + "learning_rate": 5.097802801715341e-06, + "loss": 0.8136, + "step": 11635 + }, + { + "epoch": 1.9831750862678699, + "grad_norm": 1.6484375, + "learning_rate": 5.096228836669234e-06, + "loss": 0.8509, + "step": 11636 + }, + { + "epoch": 1.9833465503568597, + "grad_norm": 1.703125, + "learning_rate": 5.094655031553572e-06, + "loss": 0.8705, + "step": 11637 + }, + { + "epoch": 1.9835180144458495, + "grad_norm": 1.5703125, + "learning_rate": 5.093081386419682e-06, + "loss": 0.731, + "step": 11638 + }, + { + "epoch": 1.9836894785348393, + "grad_norm": 1.7109375, + "learning_rate": 5.091507901318885e-06, + "loss": 0.8829, + "step": 11639 + }, + { + "epoch": 1.9838609426238292, + "grad_norm": 1.734375, + "learning_rate": 5.0899345763024986e-06, + "loss": 0.8311, + "step": 11640 + }, + { + "epoch": 1.984032406712819, + "grad_norm": 1.71875, + "learning_rate": 5.088361411421836e-06, + "loss": 0.8326, + "step": 11641 + }, + { + "epoch": 1.9842038708018088, + "grad_norm": 1.6328125, + "learning_rate": 5.086788406728207e-06, + "loss": 0.8074, + "step": 11642 + }, + { + "epoch": 1.9843753348907986, + "grad_norm": 1.6640625, + "learning_rate": 5.085215562272901e-06, + "loss": 0.8817, + "step": 11643 + }, + { + "epoch": 1.9845467989797887, + "grad_norm": 1.7265625, + "learning_rate": 5.0836428781072266e-06, + "loss": 0.8249, + "step": 11644 + }, + { + "epoch": 1.9847182630687785, + "grad_norm": 1.734375, + "learning_rate": 5.082070354282473e-06, + "loss": 0.8485, + "step": 11645 + }, + { + "epoch": 1.9848897271577683, + "grad_norm": 1.75, + "learning_rate": 5.080497990849922e-06, + "loss": 0.9151, + "step": 11646 + }, + { + "epoch": 1.9850611912467584, + "grad_norm": 1.765625, + "learning_rate": 5.078925787860858e-06, + "loss": 0.7656, + "step": 11647 + }, + { + "epoch": 1.9852326553357482, + "grad_norm": 1.6484375, + "learning_rate": 5.077353745366555e-06, + "loss": 0.8011, + "step": 11648 + }, + { + "epoch": 1.985404119424738, + "grad_norm": 1.703125, + "learning_rate": 5.075781863418283e-06, + "loss": 0.8813, + "step": 11649 + }, + { + "epoch": 1.9855755835137279, + "grad_norm": 1.6171875, + "learning_rate": 5.074210142067309e-06, + "loss": 0.8026, + "step": 11650 + }, + { + "epoch": 1.9857470476027177, + "grad_norm": 1.7421875, + "learning_rate": 5.072638581364888e-06, + "loss": 0.8569, + "step": 11651 + }, + { + "epoch": 
1.9859185116917075, + "grad_norm": 1.7578125, + "learning_rate": 5.071067181362279e-06, + "loss": 0.926, + "step": 11652 + }, + { + "epoch": 1.9860899757806973, + "grad_norm": 1.71875, + "learning_rate": 5.069495942110731e-06, + "loss": 0.8279, + "step": 11653 + }, + { + "epoch": 1.9862614398696872, + "grad_norm": 1.6953125, + "learning_rate": 5.06792486366149e-06, + "loss": 0.8042, + "step": 11654 + }, + { + "epoch": 1.986432903958677, + "grad_norm": 1.765625, + "learning_rate": 5.066353946065787e-06, + "loss": 0.8291, + "step": 11655 + }, + { + "epoch": 1.986604368047667, + "grad_norm": 1.6640625, + "learning_rate": 5.064783189374862e-06, + "loss": 0.8628, + "step": 11656 + }, + { + "epoch": 1.9867758321366569, + "grad_norm": 1.671875, + "learning_rate": 5.063212593639939e-06, + "loss": 0.8537, + "step": 11657 + }, + { + "epoch": 1.9869472962256467, + "grad_norm": 1.6875, + "learning_rate": 5.061642158912246e-06, + "loss": 0.8203, + "step": 11658 + }, + { + "epoch": 1.9871187603146367, + "grad_norm": 1.7578125, + "learning_rate": 5.060071885242996e-06, + "loss": 0.8631, + "step": 11659 + }, + { + "epoch": 1.9872902244036266, + "grad_norm": 1.765625, + "learning_rate": 5.0585017726834065e-06, + "loss": 0.801, + "step": 11660 + }, + { + "epoch": 1.9874616884926164, + "grad_norm": 1.6484375, + "learning_rate": 5.056931821284681e-06, + "loss": 0.87, + "step": 11661 + }, + { + "epoch": 1.9876331525816062, + "grad_norm": 1.8046875, + "learning_rate": 5.055362031098021e-06, + "loss": 0.8322, + "step": 11662 + }, + { + "epoch": 1.987804616670596, + "grad_norm": 1.671875, + "learning_rate": 5.053792402174627e-06, + "loss": 0.7921, + "step": 11663 + }, + { + "epoch": 1.9879760807595859, + "grad_norm": 1.7421875, + "learning_rate": 5.0522229345656885e-06, + "loss": 0.8372, + "step": 11664 + } + ], + "logging_steps": 1, + "max_steps": 17496, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 5832, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.4421122547400573e+19, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}