diff --git "a/checkpoint-1000/trainer_state.json" "b/checkpoint-1000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-1000/trainer_state.json" @@ -0,0 +1,7021 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1000.0, + "eval_steps": 500, + "global_step": 1000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 1.0, + "grad_norm": 0.33083319664001465, + "learning_rate": 2e-05, + "loss": 1.2008, + "step": 1 + }, + { + "epoch": 2.0, + "grad_norm": 0.3303728997707367, + "learning_rate": 4e-05, + "loss": 1.2064, + "step": 2 + }, + { + "epoch": 3.0, + "grad_norm": 0.3344730734825134, + "learning_rate": 6e-05, + "loss": 1.1978, + "step": 3 + }, + { + "epoch": 4.0, + "grad_norm": 0.3437633514404297, + "learning_rate": 8e-05, + "loss": 1.1916, + "step": 4 + }, + { + "epoch": 5.0, + "grad_norm": 0.35260695219039917, + "learning_rate": 0.0001, + "loss": 1.1857, + "step": 5 + }, + { + "epoch": 6.0, + "grad_norm": 0.35693618655204773, + "learning_rate": 0.00012, + "loss": 1.1802, + "step": 6 + }, + { + "epoch": 7.0, + "grad_norm": 0.370437353849411, + "learning_rate": 0.00014000000000000001, + "loss": 1.1727, + "step": 7 + }, + { + "epoch": 8.0, + "grad_norm": 0.4025161564350128, + "learning_rate": 0.00016, + "loss": 1.1517, + "step": 8 + }, + { + "epoch": 9.0, + "grad_norm": 0.4241160750389099, + "learning_rate": 0.00017999999999999998, + "loss": 1.1291, + "step": 9 + }, + { + "epoch": 10.0, + "grad_norm": 0.4702780544757843, + "learning_rate": 0.0002, + "loss": 1.1016, + "step": 10 + }, + { + "epoch": 11.0, + "grad_norm": 0.4980923533439636, + "learning_rate": 0.00022, + "loss": 1.0659, + "step": 11 + }, + { + "epoch": 12.0, + "grad_norm": 0.49099475145339966, + "learning_rate": 0.00024, + "loss": 1.0292, + "step": 12 + }, + { + "epoch": 13.0, + "grad_norm": 0.3541075587272644, + "learning_rate": 0.00026000000000000003, + "loss": 0.9999, + "step": 13 + }, + { + "epoch": 14.0, + "grad_norm": 0.5386999845504761, + "learning_rate": 0.00028000000000000003, + "loss": 0.9961, + "step": 14 + }, + { + "epoch": 15.0, + "grad_norm": 0.7703613042831421, + "learning_rate": 0.0003, + "loss": 0.9971, + "step": 15 + }, + { + "epoch": 16.0, + "grad_norm": 0.76753169298172, + "learning_rate": 0.00032, + "loss": 0.9816, + "step": 16 + }, + { + "epoch": 17.0, + "grad_norm": 0.6905381679534912, + "learning_rate": 0.00034, + "loss": 0.9583, + "step": 17 + }, + { + "epoch": 18.0, + "grad_norm": 0.47108712792396545, + "learning_rate": 0.00035999999999999997, + "loss": 0.9283, + "step": 18 + }, + { + "epoch": 19.0, + "grad_norm": 0.23705267906188965, + "learning_rate": 0.00038, + "loss": 0.9124, + "step": 19 + }, + { + "epoch": 20.0, + "grad_norm": 0.40022554993629456, + "learning_rate": 0.0004, + "loss": 0.9014, + "step": 20 + }, + { + "epoch": 21.0, + "grad_norm": 0.46119189262390137, + "learning_rate": 0.00042, + "loss": 0.9008, + "step": 21 + }, + { + "epoch": 22.0, + "grad_norm": 0.3977269232273102, + "learning_rate": 0.00044, + "loss": 0.8831, + "step": 22 + }, + { + "epoch": 23.0, + "grad_norm": 0.317619651556015, + "learning_rate": 0.00046, + "loss": 0.8675, + "step": 23 + }, + { + "epoch": 24.0, + "grad_norm": 0.21384184062480927, + "learning_rate": 0.00048, + "loss": 0.8475, + "step": 24 + }, + { + "epoch": 25.0, + "grad_norm": 0.2396264374256134, + "learning_rate": 0.0005, + "loss": 0.8411, + "step": 25 + }, + { + "epoch": 26.0, + "grad_norm": 0.32538554072380066, + "learning_rate": 0.0005200000000000001, + "loss": 0.8369, + "step": 26 + }, + { + "epoch": 27.0, + "grad_norm": 0.30055898427963257, + "learning_rate": 0.00054, + "loss": 0.8208, + "step": 27 + }, + { + "epoch": 28.0, + "grad_norm": 0.24997705221176147, + "learning_rate": 0.0005600000000000001, + "loss": 0.8115, + "step": 28 + }, + { + "epoch": 29.0, + "grad_norm": 0.17621903121471405, + "learning_rate": 0.00058, + "loss": 0.7968, + "step": 29 + }, + { + "epoch": 30.0, + "grad_norm": 0.2194773107767105, + "learning_rate": 0.0006, + "loss": 0.7909, + "step": 30 + }, + { + "epoch": 31.0, + "grad_norm": 0.2789114713668823, + "learning_rate": 0.00062, + "loss": 0.7807, + "step": 31 + }, + { + "epoch": 32.0, + "grad_norm": 0.2547268569469452, + "learning_rate": 0.00064, + "loss": 0.7746, + "step": 32 + }, + { + "epoch": 33.0, + "grad_norm": 0.1717722862958908, + "learning_rate": 0.00066, + "loss": 0.7638, + "step": 33 + }, + { + "epoch": 34.0, + "grad_norm": 0.1685272604227066, + "learning_rate": 0.00068, + "loss": 0.7537, + "step": 34 + }, + { + "epoch": 35.0, + "grad_norm": 0.1899716705083847, + "learning_rate": 0.0007, + "loss": 0.7477, + "step": 35 + }, + { + "epoch": 36.0, + "grad_norm": 0.17042605578899384, + "learning_rate": 0.0007199999999999999, + "loss": 0.7332, + "step": 36 + }, + { + "epoch": 37.0, + "grad_norm": 0.12942233681678772, + "learning_rate": 0.00074, + "loss": 0.7308, + "step": 37 + }, + { + "epoch": 38.0, + "grad_norm": 0.17319905757904053, + "learning_rate": 0.00076, + "loss": 0.7168, + "step": 38 + }, + { + "epoch": 39.0, + "grad_norm": 0.2035578042268753, + "learning_rate": 0.0007800000000000001, + "loss": 0.7101, + "step": 39 + }, + { + "epoch": 40.0, + "grad_norm": 0.12170140445232391, + "learning_rate": 0.0008, + "loss": 0.6984, + "step": 40 + }, + { + "epoch": 41.0, + "grad_norm": 0.15441377460956573, + "learning_rate": 0.00082, + "loss": 0.689, + "step": 41 + }, + { + "epoch": 42.0, + "grad_norm": 0.2036806046962738, + "learning_rate": 0.00084, + "loss": 0.6795, + "step": 42 + }, + { + "epoch": 43.0, + "grad_norm": 0.1200585812330246, + "learning_rate": 0.00086, + "loss": 0.674, + "step": 43 + }, + { + "epoch": 44.0, + "grad_norm": 0.17891642451286316, + "learning_rate": 0.00088, + "loss": 0.6666, + "step": 44 + }, + { + "epoch": 45.0, + "grad_norm": 0.15115144848823547, + "learning_rate": 0.0009000000000000001, + "loss": 0.6556, + "step": 45 + }, + { + "epoch": 46.0, + "grad_norm": 0.1360883265733719, + "learning_rate": 0.00092, + "loss": 0.6455, + "step": 46 + }, + { + "epoch": 47.0, + "grad_norm": 0.15223605930805206, + "learning_rate": 0.00094, + "loss": 0.6417, + "step": 47 + }, + { + "epoch": 48.0, + "grad_norm": 0.1136261597275734, + "learning_rate": 0.00096, + "loss": 0.6329, + "step": 48 + }, + { + "epoch": 49.0, + "grad_norm": 0.14590047299861908, + "learning_rate": 0.00098, + "loss": 0.6202, + "step": 49 + }, + { + "epoch": 50.0, + "grad_norm": 0.1152251586318016, + "learning_rate": 0.001, + "loss": 0.61, + "step": 50 + }, + { + "epoch": 51.0, + "grad_norm": 0.13655897974967957, + "learning_rate": 0.00102, + "loss": 0.5998, + "step": 51 + }, + { + "epoch": 52.0, + "grad_norm": 0.12263485044240952, + "learning_rate": 0.0010400000000000001, + "loss": 0.5932, + "step": 52 + }, + { + "epoch": 53.0, + "grad_norm": 0.13639847934246063, + "learning_rate": 0.0010600000000000002, + "loss": 0.5835, + "step": 53 + }, + { + "epoch": 54.0, + "grad_norm": 0.11983615159988403, + "learning_rate": 0.00108, + "loss": 0.5726, + "step": 54 + }, + { + "epoch": 55.0, + "grad_norm": 0.10973749309778214, + "learning_rate": 0.0011, + "loss": 0.5625, + "step": 55 + }, + { + "epoch": 56.0, + "grad_norm": 0.12474089860916138, + "learning_rate": 0.0011200000000000001, + "loss": 0.5522, + "step": 56 + }, + { + "epoch": 57.0, + "grad_norm": 0.13365118205547333, + "learning_rate": 0.00114, + "loss": 0.5432, + "step": 57 + }, + { + "epoch": 58.0, + "grad_norm": 0.11922751367092133, + "learning_rate": 0.00116, + "loss": 0.5289, + "step": 58 + }, + { + "epoch": 59.0, + "grad_norm": 0.12625685334205627, + "learning_rate": 0.00118, + "loss": 0.5248, + "step": 59 + }, + { + "epoch": 60.0, + "grad_norm": 0.16271014511585236, + "learning_rate": 0.0012, + "loss": 0.5091, + "step": 60 + }, + { + "epoch": 61.0, + "grad_norm": 0.17841878533363342, + "learning_rate": 0.00122, + "loss": 0.4991, + "step": 61 + }, + { + "epoch": 62.0, + "grad_norm": 0.22032824158668518, + "learning_rate": 0.00124, + "loss": 0.4945, + "step": 62 + }, + { + "epoch": 63.0, + "grad_norm": 0.22950832545757294, + "learning_rate": 0.00126, + "loss": 0.4829, + "step": 63 + }, + { + "epoch": 64.0, + "grad_norm": 0.26760056614875793, + "learning_rate": 0.00128, + "loss": 0.4725, + "step": 64 + }, + { + "epoch": 65.0, + "grad_norm": 0.2522161602973938, + "learning_rate": 0.0013000000000000002, + "loss": 0.4595, + "step": 65 + }, + { + "epoch": 66.0, + "grad_norm": 0.24825534224510193, + "learning_rate": 0.00132, + "loss": 0.4498, + "step": 66 + }, + { + "epoch": 67.0, + "grad_norm": 0.192153200507164, + "learning_rate": 0.00134, + "loss": 0.4397, + "step": 67 + }, + { + "epoch": 68.0, + "grad_norm": 0.15480364859104156, + "learning_rate": 0.00136, + "loss": 0.4315, + "step": 68 + }, + { + "epoch": 69.0, + "grad_norm": 0.1302233189344406, + "learning_rate": 0.00138, + "loss": 0.4166, + "step": 69 + }, + { + "epoch": 70.0, + "grad_norm": 0.12858335673809052, + "learning_rate": 0.0014, + "loss": 0.4061, + "step": 70 + }, + { + "epoch": 71.0, + "grad_norm": 0.14819584786891937, + "learning_rate": 0.00142, + "loss": 0.3977, + "step": 71 + }, + { + "epoch": 72.0, + "grad_norm": 0.20153498649597168, + "learning_rate": 0.0014399999999999999, + "loss": 0.391, + "step": 72 + }, + { + "epoch": 73.0, + "grad_norm": 0.22449156641960144, + "learning_rate": 0.00146, + "loss": 0.3788, + "step": 73 + }, + { + "epoch": 74.0, + "grad_norm": 0.20828841626644135, + "learning_rate": 0.00148, + "loss": 0.3685, + "step": 74 + }, + { + "epoch": 75.0, + "grad_norm": 0.20556601881980896, + "learning_rate": 0.0015, + "loss": 0.3578, + "step": 75 + }, + { + "epoch": 76.0, + "grad_norm": 0.14805461466312408, + "learning_rate": 0.00152, + "loss": 0.3496, + "step": 76 + }, + { + "epoch": 77.0, + "grad_norm": 0.1653750240802765, + "learning_rate": 0.0015400000000000001, + "loss": 0.3421, + "step": 77 + }, + { + "epoch": 78.0, + "grad_norm": 0.15252411365509033, + "learning_rate": 0.0015600000000000002, + "loss": 0.3307, + "step": 78 + }, + { + "epoch": 79.0, + "grad_norm": 0.17012618482112885, + "learning_rate": 0.00158, + "loss": 0.3219, + "step": 79 + }, + { + "epoch": 80.0, + "grad_norm": 0.22152607142925262, + "learning_rate": 0.0016, + "loss": 0.3131, + "step": 80 + }, + { + "epoch": 81.0, + "grad_norm": 0.33657050132751465, + "learning_rate": 0.0016200000000000001, + "loss": 0.3072, + "step": 81 + }, + { + "epoch": 82.0, + "grad_norm": 0.6131005883216858, + "learning_rate": 0.00164, + "loss": 0.3065, + "step": 82 + }, + { + "epoch": 83.0, + "grad_norm": 0.8047762513160706, + "learning_rate": 0.00166, + "loss": 0.3261, + "step": 83 + }, + { + "epoch": 84.0, + "grad_norm": 0.2988828420639038, + "learning_rate": 0.00168, + "loss": 0.2857, + "step": 84 + }, + { + "epoch": 85.0, + "grad_norm": 0.4759994149208069, + "learning_rate": 0.0017, + "loss": 0.2885, + "step": 85 + }, + { + "epoch": 86.0, + "grad_norm": 0.29689377546310425, + "learning_rate": 0.00172, + "loss": 0.2734, + "step": 86 + }, + { + "epoch": 87.0, + "grad_norm": 0.3681173026561737, + "learning_rate": 0.00174, + "loss": 0.2711, + "step": 87 + }, + { + "epoch": 88.0, + "grad_norm": 0.23025013506412506, + "learning_rate": 0.00176, + "loss": 0.2591, + "step": 88 + }, + { + "epoch": 89.0, + "grad_norm": 0.3065023124217987, + "learning_rate": 0.0017800000000000001, + "loss": 0.2552, + "step": 89 + }, + { + "epoch": 90.0, + "grad_norm": 0.20948432385921478, + "learning_rate": 0.0018000000000000002, + "loss": 0.2454, + "step": 90 + }, + { + "epoch": 91.0, + "grad_norm": 0.303070068359375, + "learning_rate": 0.00182, + "loss": 0.2402, + "step": 91 + }, + { + "epoch": 92.0, + "grad_norm": 0.17348457872867584, + "learning_rate": 0.00184, + "loss": 0.2282, + "step": 92 + }, + { + "epoch": 93.0, + "grad_norm": 0.2221260964870453, + "learning_rate": 0.00186, + "loss": 0.227, + "step": 93 + }, + { + "epoch": 94.0, + "grad_norm": 0.2136717438697815, + "learning_rate": 0.00188, + "loss": 0.217, + "step": 94 + }, + { + "epoch": 95.0, + "grad_norm": 0.18598757684230804, + "learning_rate": 0.0019, + "loss": 0.2093, + "step": 95 + }, + { + "epoch": 96.0, + "grad_norm": 0.2381504476070404, + "learning_rate": 0.00192, + "loss": 0.2075, + "step": 96 + }, + { + "epoch": 97.0, + "grad_norm": 0.16604942083358765, + "learning_rate": 0.0019399999999999999, + "loss": 0.1968, + "step": 97 + }, + { + "epoch": 98.0, + "grad_norm": 0.1756654977798462, + "learning_rate": 0.00196, + "loss": 0.1914, + "step": 98 + }, + { + "epoch": 99.0, + "grad_norm": 0.18636329472064972, + "learning_rate": 0.00198, + "loss": 0.1871, + "step": 99 + }, + { + "epoch": 100.0, + "grad_norm": 0.15465092658996582, + "learning_rate": 0.002, + "loss": 0.1811, + "step": 100 + }, + { + "epoch": 101.0, + "grad_norm": 0.17197373509407043, + "learning_rate": 0.001997777777777778, + "loss": 0.1776, + "step": 101 + }, + { + "epoch": 102.0, + "grad_norm": 0.1587797999382019, + "learning_rate": 0.0019955555555555555, + "loss": 0.1711, + "step": 102 + }, + { + "epoch": 103.0, + "grad_norm": 0.16476012766361237, + "learning_rate": 0.0019933333333333335, + "loss": 0.1672, + "step": 103 + }, + { + "epoch": 104.0, + "grad_norm": 0.15602949261665344, + "learning_rate": 0.001991111111111111, + "loss": 0.1614, + "step": 104 + }, + { + "epoch": 105.0, + "grad_norm": 0.14582307636737823, + "learning_rate": 0.001988888888888889, + "loss": 0.1538, + "step": 105 + }, + { + "epoch": 106.0, + "grad_norm": 0.15502554178237915, + "learning_rate": 0.0019866666666666665, + "loss": 0.1507, + "step": 106 + }, + { + "epoch": 107.0, + "grad_norm": 0.16572409868240356, + "learning_rate": 0.0019844444444444445, + "loss": 0.1479, + "step": 107 + }, + { + "epoch": 108.0, + "grad_norm": 0.12365152686834335, + "learning_rate": 0.0019822222222222225, + "loss": 0.1416, + "step": 108 + }, + { + "epoch": 109.0, + "grad_norm": 0.18411065638065338, + "learning_rate": 0.00198, + "loss": 0.1401, + "step": 109 + }, + { + "epoch": 110.0, + "grad_norm": 0.15501633286476135, + "learning_rate": 0.001977777777777778, + "loss": 0.1352, + "step": 110 + }, + { + "epoch": 111.0, + "grad_norm": 0.151006817817688, + "learning_rate": 0.0019755555555555555, + "loss": 0.1297, + "step": 111 + }, + { + "epoch": 112.0, + "grad_norm": 0.1714586317539215, + "learning_rate": 0.0019733333333333334, + "loss": 0.1287, + "step": 112 + }, + { + "epoch": 113.0, + "grad_norm": 0.14063534140586853, + "learning_rate": 0.001971111111111111, + "loss": 0.124, + "step": 113 + }, + { + "epoch": 114.0, + "grad_norm": 0.15380504727363586, + "learning_rate": 0.001968888888888889, + "loss": 0.1213, + "step": 114 + }, + { + "epoch": 115.0, + "grad_norm": 0.13425365090370178, + "learning_rate": 0.0019666666666666665, + "loss": 0.1164, + "step": 115 + }, + { + "epoch": 116.0, + "grad_norm": 0.16011053323745728, + "learning_rate": 0.0019644444444444444, + "loss": 0.1153, + "step": 116 + }, + { + "epoch": 117.0, + "grad_norm": 0.12487512826919556, + "learning_rate": 0.0019622222222222224, + "loss": 0.1102, + "step": 117 + }, + { + "epoch": 118.0, + "grad_norm": 0.16518144309520721, + "learning_rate": 0.00196, + "loss": 0.1094, + "step": 118 + }, + { + "epoch": 119.0, + "grad_norm": 0.12795649468898773, + "learning_rate": 0.001957777777777778, + "loss": 0.1053, + "step": 119 + }, + { + "epoch": 120.0, + "grad_norm": 0.17986378073692322, + "learning_rate": 0.0019555555555555554, + "loss": 0.1028, + "step": 120 + }, + { + "epoch": 121.0, + "grad_norm": 0.12250766158103943, + "learning_rate": 0.0019533333333333334, + "loss": 0.0997, + "step": 121 + }, + { + "epoch": 122.0, + "grad_norm": 0.16212934255599976, + "learning_rate": 0.0019511111111111111, + "loss": 0.1001, + "step": 122 + }, + { + "epoch": 123.0, + "grad_norm": 0.11872928589582443, + "learning_rate": 0.001948888888888889, + "loss": 0.0955, + "step": 123 + }, + { + "epoch": 124.0, + "grad_norm": 0.14130988717079163, + "learning_rate": 0.0019466666666666669, + "loss": 0.0928, + "step": 124 + }, + { + "epoch": 125.0, + "grad_norm": 0.12492092698812485, + "learning_rate": 0.0019444444444444444, + "loss": 0.0901, + "step": 125 + }, + { + "epoch": 126.0, + "grad_norm": 0.1664874404668808, + "learning_rate": 0.0019422222222222224, + "loss": 0.0897, + "step": 126 + }, + { + "epoch": 127.0, + "grad_norm": 0.13373196125030518, + "learning_rate": 0.0019399999999999999, + "loss": 0.0873, + "step": 127 + }, + { + "epoch": 128.0, + "grad_norm": 0.15403704345226288, + "learning_rate": 0.0019377777777777778, + "loss": 0.0842, + "step": 128 + }, + { + "epoch": 129.0, + "grad_norm": 0.12807263433933258, + "learning_rate": 0.0019355555555555556, + "loss": 0.0815, + "step": 129 + }, + { + "epoch": 130.0, + "grad_norm": 0.11713800579309464, + "learning_rate": 0.0019333333333333333, + "loss": 0.0798, + "step": 130 + }, + { + "epoch": 131.0, + "grad_norm": 0.11321233212947845, + "learning_rate": 0.001931111111111111, + "loss": 0.0783, + "step": 131 + }, + { + "epoch": 132.0, + "grad_norm": 0.11122141778469086, + "learning_rate": 0.0019288888888888888, + "loss": 0.0762, + "step": 132 + }, + { + "epoch": 133.0, + "grad_norm": 0.11468366533517838, + "learning_rate": 0.0019266666666666668, + "loss": 0.0744, + "step": 133 + }, + { + "epoch": 134.0, + "grad_norm": 0.10164441913366318, + "learning_rate": 0.0019244444444444443, + "loss": 0.0715, + "step": 134 + }, + { + "epoch": 135.0, + "grad_norm": 0.1132010817527771, + "learning_rate": 0.0019222222222222223, + "loss": 0.0704, + "step": 135 + }, + { + "epoch": 136.0, + "grad_norm": 0.10453721135854721, + "learning_rate": 0.00192, + "loss": 0.0683, + "step": 136 + }, + { + "epoch": 137.0, + "grad_norm": 0.11656288057565689, + "learning_rate": 0.0019177777777777778, + "loss": 0.0671, + "step": 137 + }, + { + "epoch": 138.0, + "grad_norm": 0.10022811591625214, + "learning_rate": 0.0019155555555555555, + "loss": 0.0633, + "step": 138 + }, + { + "epoch": 139.0, + "grad_norm": 0.10532992333173752, + "learning_rate": 0.0019133333333333333, + "loss": 0.0652, + "step": 139 + }, + { + "epoch": 140.0, + "grad_norm": 0.1101190447807312, + "learning_rate": 0.0019111111111111113, + "loss": 0.0641, + "step": 140 + }, + { + "epoch": 141.0, + "grad_norm": 0.12256909906864166, + "learning_rate": 0.001908888888888889, + "loss": 0.0618, + "step": 141 + }, + { + "epoch": 142.0, + "grad_norm": 0.10809922963380814, + "learning_rate": 0.0019066666666666668, + "loss": 0.0604, + "step": 142 + }, + { + "epoch": 143.0, + "grad_norm": 0.10069157928228378, + "learning_rate": 0.0019044444444444445, + "loss": 0.0577, + "step": 143 + }, + { + "epoch": 144.0, + "grad_norm": 0.10927508026361465, + "learning_rate": 0.0019022222222222222, + "loss": 0.0568, + "step": 144 + }, + { + "epoch": 145.0, + "grad_norm": 0.11600210517644882, + "learning_rate": 0.0019, + "loss": 0.0569, + "step": 145 + }, + { + "epoch": 146.0, + "grad_norm": 0.12598258256912231, + "learning_rate": 0.0018977777777777777, + "loss": 0.0556, + "step": 146 + }, + { + "epoch": 147.0, + "grad_norm": 0.11476030200719833, + "learning_rate": 0.0018955555555555557, + "loss": 0.0537, + "step": 147 + }, + { + "epoch": 148.0, + "grad_norm": 0.13637323677539825, + "learning_rate": 0.0018933333333333335, + "loss": 0.0555, + "step": 148 + }, + { + "epoch": 149.0, + "grad_norm": 0.11303841322660446, + "learning_rate": 0.0018911111111111112, + "loss": 0.0514, + "step": 149 + }, + { + "epoch": 150.0, + "grad_norm": 0.12342188507318497, + "learning_rate": 0.001888888888888889, + "loss": 0.052, + "step": 150 + }, + { + "epoch": 151.0, + "grad_norm": 0.10494454950094223, + "learning_rate": 0.0018866666666666667, + "loss": 0.0496, + "step": 151 + }, + { + "epoch": 152.0, + "grad_norm": 0.18033015727996826, + "learning_rate": 0.0018844444444444444, + "loss": 0.0506, + "step": 152 + }, + { + "epoch": 153.0, + "grad_norm": 0.1556997448205948, + "learning_rate": 0.0018822222222222222, + "loss": 0.0493, + "step": 153 + }, + { + "epoch": 154.0, + "grad_norm": 0.14510443806648254, + "learning_rate": 0.00188, + "loss": 0.0488, + "step": 154 + }, + { + "epoch": 155.0, + "grad_norm": 0.18192656338214874, + "learning_rate": 0.001877777777777778, + "loss": 0.0489, + "step": 155 + }, + { + "epoch": 156.0, + "grad_norm": 0.11847592890262604, + "learning_rate": 0.0018755555555555557, + "loss": 0.0463, + "step": 156 + }, + { + "epoch": 157.0, + "grad_norm": 0.15888793766498566, + "learning_rate": 0.0018733333333333334, + "loss": 0.0462, + "step": 157 + }, + { + "epoch": 158.0, + "grad_norm": 0.12652742862701416, + "learning_rate": 0.0018711111111111112, + "loss": 0.0451, + "step": 158 + }, + { + "epoch": 159.0, + "grad_norm": 0.16675762832164764, + "learning_rate": 0.001868888888888889, + "loss": 0.0446, + "step": 159 + }, + { + "epoch": 160.0, + "grad_norm": 0.10220520198345184, + "learning_rate": 0.0018666666666666666, + "loss": 0.0422, + "step": 160 + }, + { + "epoch": 161.0, + "grad_norm": 0.11163638532161713, + "learning_rate": 0.0018644444444444444, + "loss": 0.0424, + "step": 161 + }, + { + "epoch": 162.0, + "grad_norm": 0.11750265210866928, + "learning_rate": 0.0018622222222222224, + "loss": 0.0418, + "step": 162 + }, + { + "epoch": 163.0, + "grad_norm": 0.09098522365093231, + "learning_rate": 0.00186, + "loss": 0.0395, + "step": 163 + }, + { + "epoch": 164.0, + "grad_norm": 0.10419722646474838, + "learning_rate": 0.0018577777777777779, + "loss": 0.0395, + "step": 164 + }, + { + "epoch": 165.0, + "grad_norm": 0.09210154414176941, + "learning_rate": 0.0018555555555555556, + "loss": 0.0382, + "step": 165 + }, + { + "epoch": 166.0, + "grad_norm": 0.10829305648803711, + "learning_rate": 0.0018533333333333334, + "loss": 0.0379, + "step": 166 + }, + { + "epoch": 167.0, + "grad_norm": 0.08274272084236145, + "learning_rate": 0.001851111111111111, + "loss": 0.0375, + "step": 167 + }, + { + "epoch": 168.0, + "grad_norm": 0.08142054080963135, + "learning_rate": 0.0018488888888888888, + "loss": 0.0363, + "step": 168 + }, + { + "epoch": 169.0, + "grad_norm": 0.1034446507692337, + "learning_rate": 0.0018466666666666668, + "loss": 0.037, + "step": 169 + }, + { + "epoch": 170.0, + "grad_norm": 0.08171215653419495, + "learning_rate": 0.0018444444444444446, + "loss": 0.0365, + "step": 170 + }, + { + "epoch": 171.0, + "grad_norm": 0.1313827484846115, + "learning_rate": 0.0018422222222222223, + "loss": 0.0372, + "step": 171 + }, + { + "epoch": 172.0, + "grad_norm": 0.08655200153589249, + "learning_rate": 0.00184, + "loss": 0.0352, + "step": 172 + }, + { + "epoch": 173.0, + "grad_norm": 0.1611122488975525, + "learning_rate": 0.0018377777777777778, + "loss": 0.0355, + "step": 173 + }, + { + "epoch": 174.0, + "grad_norm": 0.07916305959224701, + "learning_rate": 0.0018355555555555556, + "loss": 0.0338, + "step": 174 + }, + { + "epoch": 175.0, + "grad_norm": 0.14963340759277344, + "learning_rate": 0.0018333333333333333, + "loss": 0.0355, + "step": 175 + }, + { + "epoch": 176.0, + "grad_norm": 0.08377885818481445, + "learning_rate": 0.0018311111111111113, + "loss": 0.0327, + "step": 176 + }, + { + "epoch": 177.0, + "grad_norm": 0.1263081133365631, + "learning_rate": 0.0018288888888888888, + "loss": 0.0339, + "step": 177 + }, + { + "epoch": 178.0, + "grad_norm": 0.09036055207252502, + "learning_rate": 0.0018266666666666668, + "loss": 0.0319, + "step": 178 + }, + { + "epoch": 179.0, + "grad_norm": 0.08474431186914444, + "learning_rate": 0.0018244444444444445, + "loss": 0.0312, + "step": 179 + }, + { + "epoch": 180.0, + "grad_norm": 0.1496538668870926, + "learning_rate": 0.0018222222222222223, + "loss": 0.0325, + "step": 180 + }, + { + "epoch": 181.0, + "grad_norm": 0.0777578204870224, + "learning_rate": 0.00182, + "loss": 0.0301, + "step": 181 + }, + { + "epoch": 182.0, + "grad_norm": 0.14861616492271423, + "learning_rate": 0.0018177777777777778, + "loss": 0.0316, + "step": 182 + }, + { + "epoch": 183.0, + "grad_norm": 0.07743405550718307, + "learning_rate": 0.0018155555555555557, + "loss": 0.0302, + "step": 183 + }, + { + "epoch": 184.0, + "grad_norm": 0.12615865468978882, + "learning_rate": 0.0018133333333333332, + "loss": 0.0307, + "step": 184 + }, + { + "epoch": 185.0, + "grad_norm": 0.09123097360134125, + "learning_rate": 0.0018111111111111112, + "loss": 0.0283, + "step": 185 + }, + { + "epoch": 186.0, + "grad_norm": 0.08183681964874268, + "learning_rate": 0.001808888888888889, + "loss": 0.0288, + "step": 186 + }, + { + "epoch": 187.0, + "grad_norm": 0.12389379739761353, + "learning_rate": 0.0018066666666666667, + "loss": 0.029, + "step": 187 + }, + { + "epoch": 188.0, + "grad_norm": 0.07837918400764465, + "learning_rate": 0.0018044444444444445, + "loss": 0.0269, + "step": 188 + }, + { + "epoch": 189.0, + "grad_norm": 0.13364242017269135, + "learning_rate": 0.0018022222222222222, + "loss": 0.0289, + "step": 189 + }, + { + "epoch": 190.0, + "grad_norm": 0.06742086261510849, + "learning_rate": 0.0018000000000000002, + "loss": 0.0265, + "step": 190 + }, + { + "epoch": 191.0, + "grad_norm": 0.12599071860313416, + "learning_rate": 0.0017977777777777777, + "loss": 0.0281, + "step": 191 + }, + { + "epoch": 192.0, + "grad_norm": 0.0692119225859642, + "learning_rate": 0.0017955555555555557, + "loss": 0.0259, + "step": 192 + }, + { + "epoch": 193.0, + "grad_norm": 0.08335267007350922, + "learning_rate": 0.0017933333333333332, + "loss": 0.0256, + "step": 193 + }, + { + "epoch": 194.0, + "grad_norm": 0.09375467896461487, + "learning_rate": 0.0017911111111111112, + "loss": 0.0254, + "step": 194 + }, + { + "epoch": 195.0, + "grad_norm": 0.0680534839630127, + "learning_rate": 0.001788888888888889, + "loss": 0.0252, + "step": 195 + }, + { + "epoch": 196.0, + "grad_norm": 0.09214572608470917, + "learning_rate": 0.0017866666666666667, + "loss": 0.0259, + "step": 196 + }, + { + "epoch": 197.0, + "grad_norm": 0.07533039897680283, + "learning_rate": 0.0017844444444444446, + "loss": 0.0249, + "step": 197 + }, + { + "epoch": 198.0, + "grad_norm": 0.06569315493106842, + "learning_rate": 0.0017822222222222222, + "loss": 0.0245, + "step": 198 + }, + { + "epoch": 199.0, + "grad_norm": 0.11427601426839828, + "learning_rate": 0.0017800000000000001, + "loss": 0.0248, + "step": 199 + }, + { + "epoch": 200.0, + "grad_norm": 0.06313782185316086, + "learning_rate": 0.0017777777777777776, + "loss": 0.0237, + "step": 200 + }, + { + "epoch": 201.0, + "grad_norm": 0.08156254887580872, + "learning_rate": 0.0017755555555555556, + "loss": 0.0237, + "step": 201 + }, + { + "epoch": 202.0, + "grad_norm": 0.06546188145875931, + "learning_rate": 0.0017733333333333334, + "loss": 0.0229, + "step": 202 + }, + { + "epoch": 203.0, + "grad_norm": 0.06551296263933182, + "learning_rate": 0.001771111111111111, + "loss": 0.0227, + "step": 203 + }, + { + "epoch": 204.0, + "grad_norm": 0.06212465465068817, + "learning_rate": 0.001768888888888889, + "loss": 0.0224, + "step": 204 + }, + { + "epoch": 205.0, + "grad_norm": 0.06709878146648407, + "learning_rate": 0.0017666666666666666, + "loss": 0.0225, + "step": 205 + }, + { + "epoch": 206.0, + "grad_norm": 0.06637589633464813, + "learning_rate": 0.0017644444444444446, + "loss": 0.0228, + "step": 206 + }, + { + "epoch": 207.0, + "grad_norm": 0.07172456383705139, + "learning_rate": 0.001762222222222222, + "loss": 0.0218, + "step": 207 + }, + { + "epoch": 208.0, + "grad_norm": 0.06298528611660004, + "learning_rate": 0.00176, + "loss": 0.0219, + "step": 208 + }, + { + "epoch": 209.0, + "grad_norm": 0.06474582105875015, + "learning_rate": 0.001757777777777778, + "loss": 0.0205, + "step": 209 + }, + { + "epoch": 210.0, + "grad_norm": 0.064134880900383, + "learning_rate": 0.0017555555555555556, + "loss": 0.0211, + "step": 210 + }, + { + "epoch": 211.0, + "grad_norm": 0.06449710577726364, + "learning_rate": 0.0017533333333333335, + "loss": 0.0207, + "step": 211 + }, + { + "epoch": 212.0, + "grad_norm": 0.05953633040189743, + "learning_rate": 0.001751111111111111, + "loss": 0.0202, + "step": 212 + }, + { + "epoch": 213.0, + "grad_norm": 0.06264209747314453, + "learning_rate": 0.001748888888888889, + "loss": 0.0209, + "step": 213 + }, + { + "epoch": 214.0, + "grad_norm": 0.056829385459423065, + "learning_rate": 0.0017466666666666665, + "loss": 0.0203, + "step": 214 + }, + { + "epoch": 215.0, + "grad_norm": 0.06606360524892807, + "learning_rate": 0.0017444444444444445, + "loss": 0.0207, + "step": 215 + }, + { + "epoch": 216.0, + "grad_norm": 0.05813910439610481, + "learning_rate": 0.001742222222222222, + "loss": 0.02, + "step": 216 + }, + { + "epoch": 217.0, + "grad_norm": 0.06723108887672424, + "learning_rate": 0.00174, + "loss": 0.0192, + "step": 217 + }, + { + "epoch": 218.0, + "grad_norm": 0.05759569630026817, + "learning_rate": 0.001737777777777778, + "loss": 0.0194, + "step": 218 + }, + { + "epoch": 219.0, + "grad_norm": 0.07146891206502914, + "learning_rate": 0.0017355555555555555, + "loss": 0.0198, + "step": 219 + }, + { + "epoch": 220.0, + "grad_norm": 0.06193268671631813, + "learning_rate": 0.0017333333333333335, + "loss": 0.0196, + "step": 220 + }, + { + "epoch": 221.0, + "grad_norm": 0.05935683101415634, + "learning_rate": 0.001731111111111111, + "loss": 0.0185, + "step": 221 + }, + { + "epoch": 222.0, + "grad_norm": 0.05822020396590233, + "learning_rate": 0.001728888888888889, + "loss": 0.0188, + "step": 222 + }, + { + "epoch": 223.0, + "grad_norm": 0.05631418898701668, + "learning_rate": 0.0017266666666666667, + "loss": 0.0182, + "step": 223 + }, + { + "epoch": 224.0, + "grad_norm": 0.05941098555922508, + "learning_rate": 0.0017244444444444445, + "loss": 0.0185, + "step": 224 + }, + { + "epoch": 225.0, + "grad_norm": 0.060537729412317276, + "learning_rate": 0.0017222222222222224, + "loss": 0.0184, + "step": 225 + }, + { + "epoch": 226.0, + "grad_norm": 0.05600609630346298, + "learning_rate": 0.00172, + "loss": 0.0182, + "step": 226 + }, + { + "epoch": 227.0, + "grad_norm": 0.06475334614515305, + "learning_rate": 0.001717777777777778, + "loss": 0.0173, + "step": 227 + }, + { + "epoch": 228.0, + "grad_norm": 0.06878049671649933, + "learning_rate": 0.0017155555555555555, + "loss": 0.0179, + "step": 228 + }, + { + "epoch": 229.0, + "grad_norm": 0.07705635577440262, + "learning_rate": 0.0017133333333333334, + "loss": 0.0176, + "step": 229 + }, + { + "epoch": 230.0, + "grad_norm": 0.07096253335475922, + "learning_rate": 0.0017111111111111112, + "loss": 0.0174, + "step": 230 + }, + { + "epoch": 231.0, + "grad_norm": 0.0696563646197319, + "learning_rate": 0.001708888888888889, + "loss": 0.0178, + "step": 231 + }, + { + "epoch": 232.0, + "grad_norm": 0.08657005429267883, + "learning_rate": 0.0017066666666666669, + "loss": 0.0176, + "step": 232 + }, + { + "epoch": 233.0, + "grad_norm": 0.05430256947875023, + "learning_rate": 0.0017044444444444444, + "loss": 0.0166, + "step": 233 + }, + { + "epoch": 234.0, + "grad_norm": 0.09091652184724808, + "learning_rate": 0.0017022222222222224, + "loss": 0.0171, + "step": 234 + }, + { + "epoch": 235.0, + "grad_norm": 0.060334715992212296, + "learning_rate": 0.0017, + "loss": 0.0172, + "step": 235 + }, + { + "epoch": 236.0, + "grad_norm": 0.08081120997667313, + "learning_rate": 0.0016977777777777779, + "loss": 0.0163, + "step": 236 + }, + { + "epoch": 237.0, + "grad_norm": 0.05501524358987808, + "learning_rate": 0.0016955555555555556, + "loss": 0.0161, + "step": 237 + }, + { + "epoch": 238.0, + "grad_norm": 0.05665547773241997, + "learning_rate": 0.0016933333333333334, + "loss": 0.0161, + "step": 238 + }, + { + "epoch": 239.0, + "grad_norm": 0.06293457001447678, + "learning_rate": 0.0016911111111111111, + "loss": 0.0165, + "step": 239 + }, + { + "epoch": 240.0, + "grad_norm": 0.0509144552052021, + "learning_rate": 0.0016888888888888889, + "loss": 0.0156, + "step": 240 + }, + { + "epoch": 241.0, + "grad_norm": 0.05950072035193443, + "learning_rate": 0.0016866666666666668, + "loss": 0.0161, + "step": 241 + }, + { + "epoch": 242.0, + "grad_norm": 0.06005380302667618, + "learning_rate": 0.0016844444444444444, + "loss": 0.0159, + "step": 242 + }, + { + "epoch": 243.0, + "grad_norm": 0.05741410329937935, + "learning_rate": 0.0016822222222222223, + "loss": 0.0157, + "step": 243 + }, + { + "epoch": 244.0, + "grad_norm": 0.06560762971639633, + "learning_rate": 0.00168, + "loss": 0.0151, + "step": 244 + }, + { + "epoch": 245.0, + "grad_norm": 0.05892657861113548, + "learning_rate": 0.0016777777777777778, + "loss": 0.0158, + "step": 245 + }, + { + "epoch": 246.0, + "grad_norm": 0.07045791298151016, + "learning_rate": 0.0016755555555555556, + "loss": 0.0155, + "step": 246 + }, + { + "epoch": 247.0, + "grad_norm": 0.055328112095594406, + "learning_rate": 0.0016733333333333333, + "loss": 0.0149, + "step": 247 + }, + { + "epoch": 248.0, + "grad_norm": 0.06157761439681053, + "learning_rate": 0.0016711111111111113, + "loss": 0.0149, + "step": 248 + }, + { + "epoch": 249.0, + "grad_norm": 0.07325014472007751, + "learning_rate": 0.0016688888888888888, + "loss": 0.0149, + "step": 249 + }, + { + "epoch": 250.0, + "grad_norm": 0.055287521332502365, + "learning_rate": 0.0016666666666666668, + "loss": 0.0152, + "step": 250 + }, + { + "epoch": 251.0, + "grad_norm": 0.09518066048622131, + "learning_rate": 0.0016644444444444445, + "loss": 0.0162, + "step": 251 + }, + { + "epoch": 252.0, + "grad_norm": 0.05666990950703621, + "learning_rate": 0.0016622222222222223, + "loss": 0.0143, + "step": 252 + }, + { + "epoch": 253.0, + "grad_norm": 0.11963475495576859, + "learning_rate": 0.00166, + "loss": 0.0162, + "step": 253 + }, + { + "epoch": 254.0, + "grad_norm": 0.05737292766571045, + "learning_rate": 0.0016577777777777778, + "loss": 0.0143, + "step": 254 + }, + { + "epoch": 255.0, + "grad_norm": 0.06973493844270706, + "learning_rate": 0.0016555555555555555, + "loss": 0.0147, + "step": 255 + }, + { + "epoch": 256.0, + "grad_norm": 0.07595274597406387, + "learning_rate": 0.0016533333333333333, + "loss": 0.0144, + "step": 256 + }, + { + "epoch": 257.0, + "grad_norm": 0.06479303538799286, + "learning_rate": 0.0016511111111111112, + "loss": 0.0146, + "step": 257 + }, + { + "epoch": 258.0, + "grad_norm": 0.08630603551864624, + "learning_rate": 0.001648888888888889, + "loss": 0.0148, + "step": 258 + }, + { + "epoch": 259.0, + "grad_norm": 0.050509463995695114, + "learning_rate": 0.0016466666666666667, + "loss": 0.014, + "step": 259 + }, + { + "epoch": 260.0, + "grad_norm": 0.07676823437213898, + "learning_rate": 0.0016444444444444445, + "loss": 0.0146, + "step": 260 + }, + { + "epoch": 261.0, + "grad_norm": 0.06755410879850388, + "learning_rate": 0.0016422222222222222, + "loss": 0.0142, + "step": 261 + }, + { + "epoch": 262.0, + "grad_norm": 0.055984627455472946, + "learning_rate": 0.00164, + "loss": 0.0137, + "step": 262 + }, + { + "epoch": 263.0, + "grad_norm": 0.06891737133264542, + "learning_rate": 0.0016377777777777777, + "loss": 0.0141, + "step": 263 + }, + { + "epoch": 264.0, + "grad_norm": 0.06404928863048553, + "learning_rate": 0.0016355555555555557, + "loss": 0.014, + "step": 264 + }, + { + "epoch": 265.0, + "grad_norm": 0.06907233595848083, + "learning_rate": 0.0016333333333333334, + "loss": 0.0139, + "step": 265 + }, + { + "epoch": 266.0, + "grad_norm": 0.07640419155359268, + "learning_rate": 0.0016311111111111112, + "loss": 0.0139, + "step": 266 + }, + { + "epoch": 267.0, + "grad_norm": 0.047443002462387085, + "learning_rate": 0.001628888888888889, + "loss": 0.0127, + "step": 267 + }, + { + "epoch": 268.0, + "grad_norm": 0.10862824320793152, + "learning_rate": 0.0016266666666666667, + "loss": 0.0139, + "step": 268 + }, + { + "epoch": 269.0, + "grad_norm": 0.054732754826545715, + "learning_rate": 0.0016244444444444444, + "loss": 0.0135, + "step": 269 + }, + { + "epoch": 270.0, + "grad_norm": 0.06462664902210236, + "learning_rate": 0.0016222222222222222, + "loss": 0.0124, + "step": 270 + }, + { + "epoch": 271.0, + "grad_norm": 0.0714699774980545, + "learning_rate": 0.0016200000000000001, + "loss": 0.0127, + "step": 271 + }, + { + "epoch": 272.0, + "grad_norm": 0.05089233070611954, + "learning_rate": 0.0016177777777777779, + "loss": 0.0125, + "step": 272 + }, + { + "epoch": 273.0, + "grad_norm": 0.1049375906586647, + "learning_rate": 0.0016155555555555556, + "loss": 0.0142, + "step": 273 + }, + { + "epoch": 274.0, + "grad_norm": 0.052508674561977386, + "learning_rate": 0.0016133333333333334, + "loss": 0.0125, + "step": 274 + }, + { + "epoch": 275.0, + "grad_norm": 0.05343548581004143, + "learning_rate": 0.0016111111111111111, + "loss": 0.0127, + "step": 275 + }, + { + "epoch": 276.0, + "grad_norm": 0.0628175362944603, + "learning_rate": 0.0016088888888888889, + "loss": 0.0127, + "step": 276 + }, + { + "epoch": 277.0, + "grad_norm": 0.05540168285369873, + "learning_rate": 0.0016066666666666666, + "loss": 0.013, + "step": 277 + }, + { + "epoch": 278.0, + "grad_norm": 0.06312675029039383, + "learning_rate": 0.0016044444444444444, + "loss": 0.0127, + "step": 278 + }, + { + "epoch": 279.0, + "grad_norm": 0.04804935306310654, + "learning_rate": 0.0016022222222222223, + "loss": 0.0123, + "step": 279 + }, + { + "epoch": 280.0, + "grad_norm": 0.07451438903808594, + "learning_rate": 0.0016, + "loss": 0.0137, + "step": 280 + }, + { + "epoch": 281.0, + "grad_norm": 0.06311290711164474, + "learning_rate": 0.0015977777777777778, + "loss": 0.0129, + "step": 281 + }, + { + "epoch": 282.0, + "grad_norm": 0.04818476364016533, + "learning_rate": 0.0015955555555555556, + "loss": 0.0114, + "step": 282 + }, + { + "epoch": 283.0, + "grad_norm": 0.05734807625412941, + "learning_rate": 0.0015933333333333333, + "loss": 0.012, + "step": 283 + }, + { + "epoch": 284.0, + "grad_norm": 0.05613243579864502, + "learning_rate": 0.001591111111111111, + "loss": 0.0122, + "step": 284 + }, + { + "epoch": 285.0, + "grad_norm": 0.05182022973895073, + "learning_rate": 0.0015888888888888888, + "loss": 0.0114, + "step": 285 + }, + { + "epoch": 286.0, + "grad_norm": 0.058295805007219315, + "learning_rate": 0.0015866666666666668, + "loss": 0.0117, + "step": 286 + }, + { + "epoch": 287.0, + "grad_norm": 0.04197324812412262, + "learning_rate": 0.0015844444444444445, + "loss": 0.0117, + "step": 287 + }, + { + "epoch": 288.0, + "grad_norm": 0.05144956335425377, + "learning_rate": 0.0015822222222222223, + "loss": 0.0113, + "step": 288 + }, + { + "epoch": 289.0, + "grad_norm": 0.0465347096323967, + "learning_rate": 0.00158, + "loss": 0.0119, + "step": 289 + }, + { + "epoch": 290.0, + "grad_norm": 0.04576906934380531, + "learning_rate": 0.0015777777777777778, + "loss": 0.0112, + "step": 290 + }, + { + "epoch": 291.0, + "grad_norm": 0.040685515850782394, + "learning_rate": 0.0015755555555555557, + "loss": 0.0114, + "step": 291 + }, + { + "epoch": 292.0, + "grad_norm": 0.04453601688146591, + "learning_rate": 0.0015733333333333333, + "loss": 0.0113, + "step": 292 + }, + { + "epoch": 293.0, + "grad_norm": 0.053638309240341187, + "learning_rate": 0.0015711111111111112, + "loss": 0.0114, + "step": 293 + }, + { + "epoch": 294.0, + "grad_norm": 0.04391471669077873, + "learning_rate": 0.001568888888888889, + "loss": 0.0112, + "step": 294 + }, + { + "epoch": 295.0, + "grad_norm": 0.045375898480415344, + "learning_rate": 0.0015666666666666667, + "loss": 0.0113, + "step": 295 + }, + { + "epoch": 296.0, + "grad_norm": 0.04697088524699211, + "learning_rate": 0.0015644444444444445, + "loss": 0.0104, + "step": 296 + }, + { + "epoch": 297.0, + "grad_norm": 0.04696211591362953, + "learning_rate": 0.0015622222222222222, + "loss": 0.011, + "step": 297 + }, + { + "epoch": 298.0, + "grad_norm": 0.05330660939216614, + "learning_rate": 0.0015600000000000002, + "loss": 0.0108, + "step": 298 + }, + { + "epoch": 299.0, + "grad_norm": 0.047231633216142654, + "learning_rate": 0.0015577777777777777, + "loss": 0.0109, + "step": 299 + }, + { + "epoch": 300.0, + "grad_norm": 0.06341907382011414, + "learning_rate": 0.0015555555555555557, + "loss": 0.0107, + "step": 300 + }, + { + "epoch": 301.0, + "grad_norm": 0.044753991067409515, + "learning_rate": 0.0015533333333333332, + "loss": 0.0104, + "step": 301 + }, + { + "epoch": 302.0, + "grad_norm": 0.056486308574676514, + "learning_rate": 0.0015511111111111112, + "loss": 0.0111, + "step": 302 + }, + { + "epoch": 303.0, + "grad_norm": 0.04415280744433403, + "learning_rate": 0.001548888888888889, + "loss": 0.0103, + "step": 303 + }, + { + "epoch": 304.0, + "grad_norm": 0.049498315900564194, + "learning_rate": 0.0015466666666666667, + "loss": 0.0111, + "step": 304 + }, + { + "epoch": 305.0, + "grad_norm": 0.04238919913768768, + "learning_rate": 0.0015444444444444446, + "loss": 0.0106, + "step": 305 + }, + { + "epoch": 306.0, + "grad_norm": 0.04967685788869858, + "learning_rate": 0.0015422222222222222, + "loss": 0.0105, + "step": 306 + }, + { + "epoch": 307.0, + "grad_norm": 0.043164417147636414, + "learning_rate": 0.0015400000000000001, + "loss": 0.0103, + "step": 307 + }, + { + "epoch": 308.0, + "grad_norm": 0.05318186432123184, + "learning_rate": 0.0015377777777777777, + "loss": 0.0101, + "step": 308 + }, + { + "epoch": 309.0, + "grad_norm": 0.05574382469058037, + "learning_rate": 0.0015355555555555556, + "loss": 0.0102, + "step": 309 + }, + { + "epoch": 310.0, + "grad_norm": 0.036135945469141006, + "learning_rate": 0.0015333333333333334, + "loss": 0.0097, + "step": 310 + }, + { + "epoch": 311.0, + "grad_norm": 0.07383669912815094, + "learning_rate": 0.0015311111111111111, + "loss": 0.0104, + "step": 311 + }, + { + "epoch": 312.0, + "grad_norm": 0.04554266110062599, + "learning_rate": 0.001528888888888889, + "loss": 0.0099, + "step": 312 + }, + { + "epoch": 313.0, + "grad_norm": 0.07572053372859955, + "learning_rate": 0.0015266666666666666, + "loss": 0.0109, + "step": 313 + }, + { + "epoch": 314.0, + "grad_norm": 0.05518485605716705, + "learning_rate": 0.0015244444444444446, + "loss": 0.0103, + "step": 314 + }, + { + "epoch": 315.0, + "grad_norm": 0.07998452335596085, + "learning_rate": 0.0015222222222222221, + "loss": 0.0105, + "step": 315 + }, + { + "epoch": 316.0, + "grad_norm": 0.058478306978940964, + "learning_rate": 0.00152, + "loss": 0.0094, + "step": 316 + }, + { + "epoch": 317.0, + "grad_norm": 0.04713175818324089, + "learning_rate": 0.0015177777777777776, + "loss": 0.0099, + "step": 317 + }, + { + "epoch": 318.0, + "grad_norm": 0.08784548938274384, + "learning_rate": 0.0015155555555555556, + "loss": 0.0111, + "step": 318 + }, + { + "epoch": 319.0, + "grad_norm": 0.0438971072435379, + "learning_rate": 0.0015133333333333335, + "loss": 0.0097, + "step": 319 + }, + { + "epoch": 320.0, + "grad_norm": 0.0707026869058609, + "learning_rate": 0.001511111111111111, + "loss": 0.01, + "step": 320 + }, + { + "epoch": 321.0, + "grad_norm": 0.04169069975614548, + "learning_rate": 0.001508888888888889, + "loss": 0.0095, + "step": 321 + }, + { + "epoch": 322.0, + "grad_norm": 0.04208659380674362, + "learning_rate": 0.0015066666666666666, + "loss": 0.0091, + "step": 322 + }, + { + "epoch": 323.0, + "grad_norm": 0.04683458432555199, + "learning_rate": 0.0015044444444444445, + "loss": 0.0098, + "step": 323 + }, + { + "epoch": 324.0, + "grad_norm": 0.0388614684343338, + "learning_rate": 0.001502222222222222, + "loss": 0.0093, + "step": 324 + }, + { + "epoch": 325.0, + "grad_norm": 0.04117365926504135, + "learning_rate": 0.0015, + "loss": 0.0091, + "step": 325 + }, + { + "epoch": 326.0, + "grad_norm": 0.042431872338056564, + "learning_rate": 0.001497777777777778, + "loss": 0.0098, + "step": 326 + }, + { + "epoch": 327.0, + "grad_norm": 0.04213636368513107, + "learning_rate": 0.0014955555555555555, + "loss": 0.0094, + "step": 327 + }, + { + "epoch": 328.0, + "grad_norm": 0.03774186596274376, + "learning_rate": 0.0014933333333333335, + "loss": 0.0088, + "step": 328 + }, + { + "epoch": 329.0, + "grad_norm": 0.040397752076387405, + "learning_rate": 0.001491111111111111, + "loss": 0.0093, + "step": 329 + }, + { + "epoch": 330.0, + "grad_norm": 0.04346088692545891, + "learning_rate": 0.001488888888888889, + "loss": 0.0092, + "step": 330 + }, + { + "epoch": 331.0, + "grad_norm": 0.03852611780166626, + "learning_rate": 0.0014866666666666665, + "loss": 0.0086, + "step": 331 + }, + { + "epoch": 332.0, + "grad_norm": 0.052108846604824066, + "learning_rate": 0.0014844444444444445, + "loss": 0.01, + "step": 332 + }, + { + "epoch": 333.0, + "grad_norm": 0.047736842185258865, + "learning_rate": 0.0014822222222222224, + "loss": 0.009, + "step": 333 + }, + { + "epoch": 334.0, + "grad_norm": 0.042882874608039856, + "learning_rate": 0.00148, + "loss": 0.0087, + "step": 334 + }, + { + "epoch": 335.0, + "grad_norm": 0.04501271992921829, + "learning_rate": 0.001477777777777778, + "loss": 0.0089, + "step": 335 + }, + { + "epoch": 336.0, + "grad_norm": 0.04159759730100632, + "learning_rate": 0.0014755555555555555, + "loss": 0.0091, + "step": 336 + }, + { + "epoch": 337.0, + "grad_norm": 0.04928553104400635, + "learning_rate": 0.0014733333333333334, + "loss": 0.0093, + "step": 337 + }, + { + "epoch": 338.0, + "grad_norm": 0.04055558145046234, + "learning_rate": 0.001471111111111111, + "loss": 0.0088, + "step": 338 + }, + { + "epoch": 339.0, + "grad_norm": 0.04683249071240425, + "learning_rate": 0.001468888888888889, + "loss": 0.0088, + "step": 339 + }, + { + "epoch": 340.0, + "grad_norm": 0.05597413331270218, + "learning_rate": 0.0014666666666666667, + "loss": 0.0096, + "step": 340 + }, + { + "epoch": 341.0, + "grad_norm": 0.04371971637010574, + "learning_rate": 0.0014644444444444444, + "loss": 0.0087, + "step": 341 + }, + { + "epoch": 342.0, + "grad_norm": 0.07671177387237549, + "learning_rate": 0.0014622222222222224, + "loss": 0.0091, + "step": 342 + }, + { + "epoch": 343.0, + "grad_norm": 0.03556251898407936, + "learning_rate": 0.00146, + "loss": 0.0084, + "step": 343 + }, + { + "epoch": 344.0, + "grad_norm": 0.05906197428703308, + "learning_rate": 0.0014577777777777779, + "loss": 0.0088, + "step": 344 + }, + { + "epoch": 345.0, + "grad_norm": 0.060906119644641876, + "learning_rate": 0.0014555555555555554, + "loss": 0.0089, + "step": 345 + }, + { + "epoch": 346.0, + "grad_norm": 0.036098964512348175, + "learning_rate": 0.0014533333333333334, + "loss": 0.0083, + "step": 346 + }, + { + "epoch": 347.0, + "grad_norm": 0.07324780523777008, + "learning_rate": 0.0014511111111111111, + "loss": 0.0087, + "step": 347 + }, + { + "epoch": 348.0, + "grad_norm": 0.05585562065243721, + "learning_rate": 0.0014488888888888889, + "loss": 0.0084, + "step": 348 + }, + { + "epoch": 349.0, + "grad_norm": 0.053688161075115204, + "learning_rate": 0.0014466666666666668, + "loss": 0.0087, + "step": 349 + }, + { + "epoch": 350.0, + "grad_norm": 0.06806395202875137, + "learning_rate": 0.0014444444444444444, + "loss": 0.0091, + "step": 350 + }, + { + "epoch": 351.0, + "grad_norm": 0.052333708852529526, + "learning_rate": 0.0014422222222222223, + "loss": 0.0083, + "step": 351 + }, + { + "epoch": 352.0, + "grad_norm": 0.06652957201004028, + "learning_rate": 0.0014399999999999999, + "loss": 0.0084, + "step": 352 + }, + { + "epoch": 353.0, + "grad_norm": 0.09658028930425644, + "learning_rate": 0.0014377777777777778, + "loss": 0.0095, + "step": 353 + }, + { + "epoch": 354.0, + "grad_norm": 0.03924448788166046, + "learning_rate": 0.0014355555555555556, + "loss": 0.0083, + "step": 354 + }, + { + "epoch": 355.0, + "grad_norm": 0.12581630051136017, + "learning_rate": 0.0014333333333333333, + "loss": 0.0102, + "step": 355 + }, + { + "epoch": 356.0, + "grad_norm": 0.04559926316142082, + "learning_rate": 0.001431111111111111, + "loss": 0.0084, + "step": 356 + }, + { + "epoch": 357.0, + "grad_norm": 0.08532512933015823, + "learning_rate": 0.0014288888888888888, + "loss": 0.0089, + "step": 357 + }, + { + "epoch": 358.0, + "grad_norm": 0.05258309841156006, + "learning_rate": 0.0014266666666666668, + "loss": 0.0088, + "step": 358 + }, + { + "epoch": 359.0, + "grad_norm": 0.050833553075790405, + "learning_rate": 0.0014244444444444443, + "loss": 0.0083, + "step": 359 + }, + { + "epoch": 360.0, + "grad_norm": 0.07133360952138901, + "learning_rate": 0.0014222222222222223, + "loss": 0.0086, + "step": 360 + }, + { + "epoch": 361.0, + "grad_norm": 0.04288196563720703, + "learning_rate": 0.00142, + "loss": 0.0081, + "step": 361 + }, + { + "epoch": 362.0, + "grad_norm": 0.050019655376672745, + "learning_rate": 0.0014177777777777778, + "loss": 0.0081, + "step": 362 + }, + { + "epoch": 363.0, + "grad_norm": 0.04713081568479538, + "learning_rate": 0.0014155555555555555, + "loss": 0.0078, + "step": 363 + }, + { + "epoch": 364.0, + "grad_norm": 0.03391753509640694, + "learning_rate": 0.0014133333333333333, + "loss": 0.0079, + "step": 364 + }, + { + "epoch": 365.0, + "grad_norm": 0.05800512805581093, + "learning_rate": 0.0014111111111111112, + "loss": 0.0083, + "step": 365 + }, + { + "epoch": 366.0, + "grad_norm": 0.04860348999500275, + "learning_rate": 0.001408888888888889, + "loss": 0.008, + "step": 366 + }, + { + "epoch": 367.0, + "grad_norm": 0.04747091606259346, + "learning_rate": 0.0014066666666666667, + "loss": 0.008, + "step": 367 + }, + { + "epoch": 368.0, + "grad_norm": 0.05328553542494774, + "learning_rate": 0.0014044444444444445, + "loss": 0.008, + "step": 368 + }, + { + "epoch": 369.0, + "grad_norm": 0.033220142126083374, + "learning_rate": 0.0014022222222222222, + "loss": 0.0072, + "step": 369 + }, + { + "epoch": 370.0, + "grad_norm": 0.03936196118593216, + "learning_rate": 0.0014, + "loss": 0.0072, + "step": 370 + }, + { + "epoch": 371.0, + "grad_norm": 0.035857848823070526, + "learning_rate": 0.0013977777777777777, + "loss": 0.0077, + "step": 371 + }, + { + "epoch": 372.0, + "grad_norm": 0.04019852727651596, + "learning_rate": 0.0013955555555555557, + "loss": 0.0074, + "step": 372 + }, + { + "epoch": 373.0, + "grad_norm": 0.03362793102860451, + "learning_rate": 0.0013933333333333334, + "loss": 0.0076, + "step": 373 + }, + { + "epoch": 374.0, + "grad_norm": 0.04266555234789848, + "learning_rate": 0.0013911111111111112, + "loss": 0.0077, + "step": 374 + }, + { + "epoch": 375.0, + "grad_norm": 0.0359489843249321, + "learning_rate": 0.001388888888888889, + "loss": 0.0072, + "step": 375 + }, + { + "epoch": 376.0, + "grad_norm": 0.05567077919840813, + "learning_rate": 0.0013866666666666667, + "loss": 0.0082, + "step": 376 + }, + { + "epoch": 377.0, + "grad_norm": 0.044729381799697876, + "learning_rate": 0.0013844444444444444, + "loss": 0.0075, + "step": 377 + }, + { + "epoch": 378.0, + "grad_norm": 0.0389576219022274, + "learning_rate": 0.0013822222222222222, + "loss": 0.0075, + "step": 378 + }, + { + "epoch": 379.0, + "grad_norm": 0.03594253212213516, + "learning_rate": 0.00138, + "loss": 0.0071, + "step": 379 + }, + { + "epoch": 380.0, + "grad_norm": 0.03960775211453438, + "learning_rate": 0.001377777777777778, + "loss": 0.0077, + "step": 380 + }, + { + "epoch": 381.0, + "grad_norm": 0.036085862666368484, + "learning_rate": 0.0013755555555555556, + "loss": 0.0075, + "step": 381 + }, + { + "epoch": 382.0, + "grad_norm": 0.04432854801416397, + "learning_rate": 0.0013733333333333334, + "loss": 0.0077, + "step": 382 + }, + { + "epoch": 383.0, + "grad_norm": 0.04181389883160591, + "learning_rate": 0.0013711111111111111, + "loss": 0.0074, + "step": 383 + }, + { + "epoch": 384.0, + "grad_norm": 0.04760071262717247, + "learning_rate": 0.0013688888888888889, + "loss": 0.0075, + "step": 384 + }, + { + "epoch": 385.0, + "grad_norm": 0.03643626347184181, + "learning_rate": 0.0013666666666666666, + "loss": 0.0074, + "step": 385 + }, + { + "epoch": 386.0, + "grad_norm": 0.03681834042072296, + "learning_rate": 0.0013644444444444444, + "loss": 0.0072, + "step": 386 + }, + { + "epoch": 387.0, + "grad_norm": 0.053312405943870544, + "learning_rate": 0.0013622222222222223, + "loss": 0.0081, + "step": 387 + }, + { + "epoch": 388.0, + "grad_norm": 0.0378199964761734, + "learning_rate": 0.00136, + "loss": 0.0071, + "step": 388 + }, + { + "epoch": 389.0, + "grad_norm": 0.035717807710170746, + "learning_rate": 0.0013577777777777778, + "loss": 0.0072, + "step": 389 + }, + { + "epoch": 390.0, + "grad_norm": 0.051514606922864914, + "learning_rate": 0.0013555555555555556, + "loss": 0.0077, + "step": 390 + }, + { + "epoch": 391.0, + "grad_norm": 0.039895568042993546, + "learning_rate": 0.0013533333333333333, + "loss": 0.0072, + "step": 391 + }, + { + "epoch": 392.0, + "grad_norm": 0.03736487403512001, + "learning_rate": 0.001351111111111111, + "loss": 0.0068, + "step": 392 + }, + { + "epoch": 393.0, + "grad_norm": 0.034880317747592926, + "learning_rate": 0.0013488888888888888, + "loss": 0.0071, + "step": 393 + }, + { + "epoch": 394.0, + "grad_norm": 0.053845588117837906, + "learning_rate": 0.0013466666666666668, + "loss": 0.0077, + "step": 394 + }, + { + "epoch": 395.0, + "grad_norm": 0.034650277346372604, + "learning_rate": 0.0013444444444444445, + "loss": 0.007, + "step": 395 + }, + { + "epoch": 396.0, + "grad_norm": 0.06803309917449951, + "learning_rate": 0.0013422222222222223, + "loss": 0.0079, + "step": 396 + }, + { + "epoch": 397.0, + "grad_norm": 0.03568552806973457, + "learning_rate": 0.00134, + "loss": 0.0068, + "step": 397 + }, + { + "epoch": 398.0, + "grad_norm": 0.04094022884964943, + "learning_rate": 0.0013377777777777778, + "loss": 0.0073, + "step": 398 + }, + { + "epoch": 399.0, + "grad_norm": 0.045098766684532166, + "learning_rate": 0.0013355555555555555, + "loss": 0.0072, + "step": 399 + }, + { + "epoch": 400.0, + "grad_norm": 0.03938360884785652, + "learning_rate": 0.0013333333333333333, + "loss": 0.0073, + "step": 400 + }, + { + "epoch": 401.0, + "grad_norm": 0.03678111359477043, + "learning_rate": 0.0013311111111111113, + "loss": 0.0066, + "step": 401 + }, + { + "epoch": 402.0, + "grad_norm": 0.03792262822389603, + "learning_rate": 0.0013288888888888888, + "loss": 0.0066, + "step": 402 + }, + { + "epoch": 403.0, + "grad_norm": 0.04037335887551308, + "learning_rate": 0.0013266666666666667, + "loss": 0.0069, + "step": 403 + }, + { + "epoch": 404.0, + "grad_norm": 0.0448298305273056, + "learning_rate": 0.0013244444444444445, + "loss": 0.0072, + "step": 404 + }, + { + "epoch": 405.0, + "grad_norm": 0.029835334047675133, + "learning_rate": 0.0013222222222222222, + "loss": 0.0067, + "step": 405 + }, + { + "epoch": 406.0, + "grad_norm": 0.03127991408109665, + "learning_rate": 0.00132, + "loss": 0.0066, + "step": 406 + }, + { + "epoch": 407.0, + "grad_norm": 0.034645188599824905, + "learning_rate": 0.0013177777777777777, + "loss": 0.0065, + "step": 407 + }, + { + "epoch": 408.0, + "grad_norm": 0.03312946483492851, + "learning_rate": 0.0013155555555555557, + "loss": 0.0066, + "step": 408 + }, + { + "epoch": 409.0, + "grad_norm": 0.03247128427028656, + "learning_rate": 0.0013133333333333332, + "loss": 0.0064, + "step": 409 + }, + { + "epoch": 410.0, + "grad_norm": 0.03561067953705788, + "learning_rate": 0.0013111111111111112, + "loss": 0.0064, + "step": 410 + }, + { + "epoch": 411.0, + "grad_norm": 0.03821596875786781, + "learning_rate": 0.001308888888888889, + "loss": 0.0066, + "step": 411 + }, + { + "epoch": 412.0, + "grad_norm": 0.0356701985001564, + "learning_rate": 0.0013066666666666667, + "loss": 0.0065, + "step": 412 + }, + { + "epoch": 413.0, + "grad_norm": 0.04700474441051483, + "learning_rate": 0.0013044444444444444, + "loss": 0.0066, + "step": 413 + }, + { + "epoch": 414.0, + "grad_norm": 0.03856738656759262, + "learning_rate": 0.0013022222222222222, + "loss": 0.0067, + "step": 414 + }, + { + "epoch": 415.0, + "grad_norm": 0.03348975256085396, + "learning_rate": 0.0013000000000000002, + "loss": 0.0065, + "step": 415 + }, + { + "epoch": 416.0, + "grad_norm": 0.03193169832229614, + "learning_rate": 0.0012977777777777777, + "loss": 0.0064, + "step": 416 + }, + { + "epoch": 417.0, + "grad_norm": 0.0403468981385231, + "learning_rate": 0.0012955555555555557, + "loss": 0.0063, + "step": 417 + }, + { + "epoch": 418.0, + "grad_norm": 0.03923949599266052, + "learning_rate": 0.0012933333333333332, + "loss": 0.0068, + "step": 418 + }, + { + "epoch": 419.0, + "grad_norm": 0.03874557837843895, + "learning_rate": 0.0012911111111111111, + "loss": 0.007, + "step": 419 + }, + { + "epoch": 420.0, + "grad_norm": 0.03878943994641304, + "learning_rate": 0.001288888888888889, + "loss": 0.0066, + "step": 420 + }, + { + "epoch": 421.0, + "grad_norm": 0.030541859567165375, + "learning_rate": 0.0012866666666666666, + "loss": 0.0061, + "step": 421 + }, + { + "epoch": 422.0, + "grad_norm": 0.05509382113814354, + "learning_rate": 0.0012844444444444446, + "loss": 0.0068, + "step": 422 + }, + { + "epoch": 423.0, + "grad_norm": 0.03676342964172363, + "learning_rate": 0.0012822222222222221, + "loss": 0.007, + "step": 423 + }, + { + "epoch": 424.0, + "grad_norm": 0.03279677405953407, + "learning_rate": 0.00128, + "loss": 0.0062, + "step": 424 + }, + { + "epoch": 425.0, + "grad_norm": 0.03973347321152687, + "learning_rate": 0.0012777777777777776, + "loss": 0.0066, + "step": 425 + }, + { + "epoch": 426.0, + "grad_norm": 0.03546801954507828, + "learning_rate": 0.0012755555555555556, + "loss": 0.0065, + "step": 426 + }, + { + "epoch": 427.0, + "grad_norm": 0.031479015946388245, + "learning_rate": 0.0012733333333333333, + "loss": 0.0062, + "step": 427 + }, + { + "epoch": 428.0, + "grad_norm": 0.033816102892160416, + "learning_rate": 0.001271111111111111, + "loss": 0.0064, + "step": 428 + }, + { + "epoch": 429.0, + "grad_norm": 0.03433229401707649, + "learning_rate": 0.001268888888888889, + "loss": 0.0067, + "step": 429 + }, + { + "epoch": 430.0, + "grad_norm": 0.03628786653280258, + "learning_rate": 0.0012666666666666666, + "loss": 0.0064, + "step": 430 + }, + { + "epoch": 431.0, + "grad_norm": 0.02654869668185711, + "learning_rate": 0.0012644444444444446, + "loss": 0.006, + "step": 431 + }, + { + "epoch": 432.0, + "grad_norm": 0.037869714200496674, + "learning_rate": 0.001262222222222222, + "loss": 0.0065, + "step": 432 + }, + { + "epoch": 433.0, + "grad_norm": 0.04116172716021538, + "learning_rate": 0.00126, + "loss": 0.0067, + "step": 433 + }, + { + "epoch": 434.0, + "grad_norm": 0.036912932991981506, + "learning_rate": 0.001257777777777778, + "loss": 0.0065, + "step": 434 + }, + { + "epoch": 435.0, + "grad_norm": 0.032851189374923706, + "learning_rate": 0.0012555555555555555, + "loss": 0.0057, + "step": 435 + }, + { + "epoch": 436.0, + "grad_norm": 0.03754141926765442, + "learning_rate": 0.0012533333333333335, + "loss": 0.0064, + "step": 436 + }, + { + "epoch": 437.0, + "grad_norm": 0.0499282069504261, + "learning_rate": 0.001251111111111111, + "loss": 0.0064, + "step": 437 + }, + { + "epoch": 438.0, + "grad_norm": 0.03528120741248131, + "learning_rate": 0.001248888888888889, + "loss": 0.0061, + "step": 438 + }, + { + "epoch": 439.0, + "grad_norm": 0.04098303243517876, + "learning_rate": 0.0012466666666666665, + "loss": 0.0064, + "step": 439 + }, + { + "epoch": 440.0, + "grad_norm": 0.05273183062672615, + "learning_rate": 0.0012444444444444445, + "loss": 0.0059, + "step": 440 + }, + { + "epoch": 441.0, + "grad_norm": 0.029608091339468956, + "learning_rate": 0.001242222222222222, + "loss": 0.0057, + "step": 441 + }, + { + "epoch": 442.0, + "grad_norm": 0.03589300811290741, + "learning_rate": 0.00124, + "loss": 0.0058, + "step": 442 + }, + { + "epoch": 443.0, + "grad_norm": 0.03696886822581291, + "learning_rate": 0.001237777777777778, + "loss": 0.0059, + "step": 443 + }, + { + "epoch": 444.0, + "grad_norm": 0.04584373161196709, + "learning_rate": 0.0012355555555555555, + "loss": 0.0063, + "step": 444 + }, + { + "epoch": 445.0, + "grad_norm": 0.03926507383584976, + "learning_rate": 0.0012333333333333335, + "loss": 0.006, + "step": 445 + }, + { + "epoch": 446.0, + "grad_norm": 0.0737149715423584, + "learning_rate": 0.001231111111111111, + "loss": 0.0065, + "step": 446 + }, + { + "epoch": 447.0, + "grad_norm": 0.040384162217378616, + "learning_rate": 0.001228888888888889, + "loss": 0.006, + "step": 447 + }, + { + "epoch": 448.0, + "grad_norm": 0.048789847642183304, + "learning_rate": 0.0012266666666666667, + "loss": 0.006, + "step": 448 + }, + { + "epoch": 449.0, + "grad_norm": 0.04522663727402687, + "learning_rate": 0.0012244444444444445, + "loss": 0.0064, + "step": 449 + }, + { + "epoch": 450.0, + "grad_norm": 0.044181939214468, + "learning_rate": 0.0012222222222222224, + "loss": 0.0061, + "step": 450 + }, + { + "epoch": 451.0, + "grad_norm": 0.04393507167696953, + "learning_rate": 0.00122, + "loss": 0.0062, + "step": 451 + }, + { + "epoch": 452.0, + "grad_norm": 0.0377420112490654, + "learning_rate": 0.001217777777777778, + "loss": 0.0059, + "step": 452 + }, + { + "epoch": 453.0, + "grad_norm": 0.027778957039117813, + "learning_rate": 0.0012155555555555554, + "loss": 0.0056, + "step": 453 + }, + { + "epoch": 454.0, + "grad_norm": 0.03586387261748314, + "learning_rate": 0.0012133333333333334, + "loss": 0.0059, + "step": 454 + }, + { + "epoch": 455.0, + "grad_norm": 0.041379209607839584, + "learning_rate": 0.0012111111111111112, + "loss": 0.0059, + "step": 455 + }, + { + "epoch": 456.0, + "grad_norm": 0.037975508719682693, + "learning_rate": 0.001208888888888889, + "loss": 0.0061, + "step": 456 + }, + { + "epoch": 457.0, + "grad_norm": 0.03542089834809303, + "learning_rate": 0.0012066666666666669, + "loss": 0.0055, + "step": 457 + }, + { + "epoch": 458.0, + "grad_norm": 0.035748131573200226, + "learning_rate": 0.0012044444444444444, + "loss": 0.006, + "step": 458 + }, + { + "epoch": 459.0, + "grad_norm": 0.03112563118338585, + "learning_rate": 0.0012022222222222224, + "loss": 0.0058, + "step": 459 + }, + { + "epoch": 460.0, + "grad_norm": 0.047486674040555954, + "learning_rate": 0.0012, + "loss": 0.0062, + "step": 460 + }, + { + "epoch": 461.0, + "grad_norm": 0.03399772197008133, + "learning_rate": 0.0011977777777777779, + "loss": 0.0059, + "step": 461 + }, + { + "epoch": 462.0, + "grad_norm": 0.043101608753204346, + "learning_rate": 0.0011955555555555556, + "loss": 0.0059, + "step": 462 + }, + { + "epoch": 463.0, + "grad_norm": 0.026961272582411766, + "learning_rate": 0.0011933333333333334, + "loss": 0.0057, + "step": 463 + }, + { + "epoch": 464.0, + "grad_norm": 0.03507644310593605, + "learning_rate": 0.001191111111111111, + "loss": 0.0059, + "step": 464 + }, + { + "epoch": 465.0, + "grad_norm": 0.03533957153558731, + "learning_rate": 0.0011888888888888889, + "loss": 0.0059, + "step": 465 + }, + { + "epoch": 466.0, + "grad_norm": 0.03447294607758522, + "learning_rate": 0.0011866666666666668, + "loss": 0.0059, + "step": 466 + }, + { + "epoch": 467.0, + "grad_norm": 0.03277276083827019, + "learning_rate": 0.0011844444444444443, + "loss": 0.0061, + "step": 467 + }, + { + "epoch": 468.0, + "grad_norm": 0.03529715910553932, + "learning_rate": 0.0011822222222222223, + "loss": 0.0057, + "step": 468 + }, + { + "epoch": 469.0, + "grad_norm": 0.03415314108133316, + "learning_rate": 0.00118, + "loss": 0.0057, + "step": 469 + }, + { + "epoch": 470.0, + "grad_norm": 0.0367075614631176, + "learning_rate": 0.0011777777777777778, + "loss": 0.0058, + "step": 470 + }, + { + "epoch": 471.0, + "grad_norm": 0.04992802441120148, + "learning_rate": 0.0011755555555555556, + "loss": 0.0058, + "step": 471 + }, + { + "epoch": 472.0, + "grad_norm": 0.02544887363910675, + "learning_rate": 0.0011733333333333333, + "loss": 0.0051, + "step": 472 + }, + { + "epoch": 473.0, + "grad_norm": 0.035531774163246155, + "learning_rate": 0.0011711111111111113, + "loss": 0.0062, + "step": 473 + }, + { + "epoch": 474.0, + "grad_norm": 0.02878675051033497, + "learning_rate": 0.0011688888888888888, + "loss": 0.0052, + "step": 474 + }, + { + "epoch": 475.0, + "grad_norm": 0.05629320815205574, + "learning_rate": 0.0011666666666666668, + "loss": 0.0062, + "step": 475 + }, + { + "epoch": 476.0, + "grad_norm": 0.03907129168510437, + "learning_rate": 0.0011644444444444445, + "loss": 0.0058, + "step": 476 + }, + { + "epoch": 477.0, + "grad_norm": 0.05472861975431442, + "learning_rate": 0.0011622222222222223, + "loss": 0.0056, + "step": 477 + }, + { + "epoch": 478.0, + "grad_norm": 0.03535694256424904, + "learning_rate": 0.00116, + "loss": 0.0054, + "step": 478 + }, + { + "epoch": 479.0, + "grad_norm": 0.03546389192342758, + "learning_rate": 0.0011577777777777778, + "loss": 0.0058, + "step": 479 + }, + { + "epoch": 480.0, + "grad_norm": 0.027603119611740112, + "learning_rate": 0.0011555555555555555, + "loss": 0.0052, + "step": 480 + }, + { + "epoch": 481.0, + "grad_norm": 0.03660883754491806, + "learning_rate": 0.0011533333333333333, + "loss": 0.0057, + "step": 481 + }, + { + "epoch": 482.0, + "grad_norm": 0.030513163655996323, + "learning_rate": 0.0011511111111111112, + "loss": 0.0053, + "step": 482 + }, + { + "epoch": 483.0, + "grad_norm": 0.03554394096136093, + "learning_rate": 0.001148888888888889, + "loss": 0.0057, + "step": 483 + }, + { + "epoch": 484.0, + "grad_norm": 0.037891678512096405, + "learning_rate": 0.0011466666666666667, + "loss": 0.0054, + "step": 484 + }, + { + "epoch": 485.0, + "grad_norm": 0.04184143990278244, + "learning_rate": 0.0011444444444444445, + "loss": 0.0056, + "step": 485 + }, + { + "epoch": 486.0, + "grad_norm": 0.03347189724445343, + "learning_rate": 0.0011422222222222222, + "loss": 0.0054, + "step": 486 + }, + { + "epoch": 487.0, + "grad_norm": 0.03891591727733612, + "learning_rate": 0.00114, + "loss": 0.0053, + "step": 487 + }, + { + "epoch": 488.0, + "grad_norm": 0.030163027346134186, + "learning_rate": 0.0011377777777777777, + "loss": 0.0054, + "step": 488 + }, + { + "epoch": 489.0, + "grad_norm": 0.03170597180724144, + "learning_rate": 0.0011355555555555557, + "loss": 0.0053, + "step": 489 + }, + { + "epoch": 490.0, + "grad_norm": 0.027653338387608528, + "learning_rate": 0.0011333333333333334, + "loss": 0.0049, + "step": 490 + }, + { + "epoch": 491.0, + "grad_norm": 0.025576811283826828, + "learning_rate": 0.0011311111111111112, + "loss": 0.0049, + "step": 491 + }, + { + "epoch": 492.0, + "grad_norm": 0.02671181410551071, + "learning_rate": 0.001128888888888889, + "loss": 0.0051, + "step": 492 + }, + { + "epoch": 493.0, + "grad_norm": 0.031090857461094856, + "learning_rate": 0.0011266666666666667, + "loss": 0.0054, + "step": 493 + }, + { + "epoch": 494.0, + "grad_norm": 0.030311353504657745, + "learning_rate": 0.0011244444444444444, + "loss": 0.0053, + "step": 494 + }, + { + "epoch": 495.0, + "grad_norm": 0.03606722131371498, + "learning_rate": 0.0011222222222222222, + "loss": 0.0053, + "step": 495 + }, + { + "epoch": 496.0, + "grad_norm": 0.035778749734163284, + "learning_rate": 0.0011200000000000001, + "loss": 0.0053, + "step": 496 + }, + { + "epoch": 497.0, + "grad_norm": 0.03796238452196121, + "learning_rate": 0.0011177777777777779, + "loss": 0.0054, + "step": 497 + }, + { + "epoch": 498.0, + "grad_norm": 0.02831469103693962, + "learning_rate": 0.0011155555555555556, + "loss": 0.005, + "step": 498 + }, + { + "epoch": 499.0, + "grad_norm": 0.0282357819378376, + "learning_rate": 0.0011133333333333334, + "loss": 0.005, + "step": 499 + }, + { + "epoch": 500.0, + "grad_norm": 0.04182511568069458, + "learning_rate": 0.0011111111111111111, + "loss": 0.0056, + "step": 500 + }, + { + "epoch": 501.0, + "grad_norm": 0.02399536222219467, + "learning_rate": 0.0011088888888888889, + "loss": 0.0047, + "step": 501 + }, + { + "epoch": 502.0, + "grad_norm": 0.033601414412260056, + "learning_rate": 0.0011066666666666666, + "loss": 0.0051, + "step": 502 + }, + { + "epoch": 503.0, + "grad_norm": 0.033893782645463943, + "learning_rate": 0.0011044444444444444, + "loss": 0.0055, + "step": 503 + }, + { + "epoch": 504.0, + "grad_norm": 0.030596988275647163, + "learning_rate": 0.0011022222222222223, + "loss": 0.0048, + "step": 504 + }, + { + "epoch": 505.0, + "grad_norm": 0.03259752318263054, + "learning_rate": 0.0011, + "loss": 0.0051, + "step": 505 + }, + { + "epoch": 506.0, + "grad_norm": 0.02722361497581005, + "learning_rate": 0.0010977777777777778, + "loss": 0.0045, + "step": 506 + }, + { + "epoch": 507.0, + "grad_norm": 0.03016485832631588, + "learning_rate": 0.0010955555555555556, + "loss": 0.005, + "step": 507 + }, + { + "epoch": 508.0, + "grad_norm": 0.02929559536278248, + "learning_rate": 0.0010933333333333333, + "loss": 0.0049, + "step": 508 + }, + { + "epoch": 509.0, + "grad_norm": 0.041284140199422836, + "learning_rate": 0.001091111111111111, + "loss": 0.005, + "step": 509 + }, + { + "epoch": 510.0, + "grad_norm": 0.026360472664237022, + "learning_rate": 0.0010888888888888888, + "loss": 0.005, + "step": 510 + }, + { + "epoch": 511.0, + "grad_norm": 0.03568575158715248, + "learning_rate": 0.0010866666666666668, + "loss": 0.0048, + "step": 511 + }, + { + "epoch": 512.0, + "grad_norm": 0.030765000730752945, + "learning_rate": 0.0010844444444444445, + "loss": 0.0048, + "step": 512 + }, + { + "epoch": 513.0, + "grad_norm": 0.032683007419109344, + "learning_rate": 0.0010822222222222223, + "loss": 0.0052, + "step": 513 + }, + { + "epoch": 514.0, + "grad_norm": 0.025469932705163956, + "learning_rate": 0.00108, + "loss": 0.0046, + "step": 514 + }, + { + "epoch": 515.0, + "grad_norm": 0.0416124053299427, + "learning_rate": 0.0010777777777777778, + "loss": 0.0051, + "step": 515 + }, + { + "epoch": 516.0, + "grad_norm": 0.03848906606435776, + "learning_rate": 0.0010755555555555557, + "loss": 0.0052, + "step": 516 + }, + { + "epoch": 517.0, + "grad_norm": 0.04426151141524315, + "learning_rate": 0.0010733333333333333, + "loss": 0.0052, + "step": 517 + }, + { + "epoch": 518.0, + "grad_norm": 0.03802550211548805, + "learning_rate": 0.0010711111111111112, + "loss": 0.0049, + "step": 518 + }, + { + "epoch": 519.0, + "grad_norm": 0.025775237008929253, + "learning_rate": 0.001068888888888889, + "loss": 0.0049, + "step": 519 + }, + { + "epoch": 520.0, + "grad_norm": 0.04428073391318321, + "learning_rate": 0.0010666666666666667, + "loss": 0.0052, + "step": 520 + }, + { + "epoch": 521.0, + "grad_norm": 0.033617082983255386, + "learning_rate": 0.0010644444444444445, + "loss": 0.0051, + "step": 521 + }, + { + "epoch": 522.0, + "grad_norm": 0.033705078065395355, + "learning_rate": 0.0010622222222222222, + "loss": 0.0048, + "step": 522 + }, + { + "epoch": 523.0, + "grad_norm": 0.04792787879705429, + "learning_rate": 0.0010600000000000002, + "loss": 0.0049, + "step": 523 + }, + { + "epoch": 524.0, + "grad_norm": 0.02900075912475586, + "learning_rate": 0.0010577777777777777, + "loss": 0.0047, + "step": 524 + }, + { + "epoch": 525.0, + "grad_norm": 0.054417647421360016, + "learning_rate": 0.0010555555555555557, + "loss": 0.0054, + "step": 525 + }, + { + "epoch": 526.0, + "grad_norm": 0.03227232024073601, + "learning_rate": 0.0010533333333333332, + "loss": 0.005, + "step": 526 + }, + { + "epoch": 527.0, + "grad_norm": 0.03752639517188072, + "learning_rate": 0.0010511111111111112, + "loss": 0.0053, + "step": 527 + }, + { + "epoch": 528.0, + "grad_norm": 0.029628194868564606, + "learning_rate": 0.001048888888888889, + "loss": 0.0049, + "step": 528 + }, + { + "epoch": 529.0, + "grad_norm": 0.03387615829706192, + "learning_rate": 0.0010466666666666667, + "loss": 0.005, + "step": 529 + }, + { + "epoch": 530.0, + "grad_norm": 0.033868737518787384, + "learning_rate": 0.0010444444444444446, + "loss": 0.0048, + "step": 530 + }, + { + "epoch": 531.0, + "grad_norm": 0.035898059606552124, + "learning_rate": 0.0010422222222222222, + "loss": 0.005, + "step": 531 + }, + { + "epoch": 532.0, + "grad_norm": 0.040057726204395294, + "learning_rate": 0.0010400000000000001, + "loss": 0.005, + "step": 532 + }, + { + "epoch": 533.0, + "grad_norm": 0.03613459691405296, + "learning_rate": 0.0010377777777777777, + "loss": 0.005, + "step": 533 + }, + { + "epoch": 534.0, + "grad_norm": 0.034286290407180786, + "learning_rate": 0.0010355555555555556, + "loss": 0.005, + "step": 534 + }, + { + "epoch": 535.0, + "grad_norm": 0.040484048426151276, + "learning_rate": 0.0010333333333333334, + "loss": 0.005, + "step": 535 + }, + { + "epoch": 536.0, + "grad_norm": 0.03549760952591896, + "learning_rate": 0.0010311111111111111, + "loss": 0.0049, + "step": 536 + }, + { + "epoch": 537.0, + "grad_norm": 0.03199818730354309, + "learning_rate": 0.001028888888888889, + "loss": 0.0048, + "step": 537 + }, + { + "epoch": 538.0, + "grad_norm": 0.045031916350126266, + "learning_rate": 0.0010266666666666666, + "loss": 0.005, + "step": 538 + }, + { + "epoch": 539.0, + "grad_norm": 0.03412579745054245, + "learning_rate": 0.0010244444444444446, + "loss": 0.0051, + "step": 539 + }, + { + "epoch": 540.0, + "grad_norm": 0.02991371974349022, + "learning_rate": 0.0010222222222222221, + "loss": 0.0046, + "step": 540 + }, + { + "epoch": 541.0, + "grad_norm": 0.025920365005731583, + "learning_rate": 0.00102, + "loss": 0.0044, + "step": 541 + }, + { + "epoch": 542.0, + "grad_norm": 0.04434429481625557, + "learning_rate": 0.0010177777777777776, + "loss": 0.005, + "step": 542 + }, + { + "epoch": 543.0, + "grad_norm": 0.03925777226686478, + "learning_rate": 0.0010155555555555556, + "loss": 0.005, + "step": 543 + }, + { + "epoch": 544.0, + "grad_norm": 0.028992939740419388, + "learning_rate": 0.0010133333333333335, + "loss": 0.0044, + "step": 544 + }, + { + "epoch": 545.0, + "grad_norm": 0.050765953958034515, + "learning_rate": 0.001011111111111111, + "loss": 0.005, + "step": 545 + }, + { + "epoch": 546.0, + "grad_norm": 0.0336458683013916, + "learning_rate": 0.001008888888888889, + "loss": 0.0047, + "step": 546 + }, + { + "epoch": 547.0, + "grad_norm": 0.03314169868826866, + "learning_rate": 0.0010066666666666666, + "loss": 0.0047, + "step": 547 + }, + { + "epoch": 548.0, + "grad_norm": 0.048472099006175995, + "learning_rate": 0.0010044444444444445, + "loss": 0.0049, + "step": 548 + }, + { + "epoch": 549.0, + "grad_norm": 0.03656391799449921, + "learning_rate": 0.001002222222222222, + "loss": 0.0049, + "step": 549 + }, + { + "epoch": 550.0, + "grad_norm": 0.046296313405036926, + "learning_rate": 0.001, + "loss": 0.0049, + "step": 550 + }, + { + "epoch": 551.0, + "grad_norm": 0.028759324923157692, + "learning_rate": 0.0009977777777777778, + "loss": 0.0047, + "step": 551 + }, + { + "epoch": 552.0, + "grad_norm": 0.031202584505081177, + "learning_rate": 0.0009955555555555555, + "loss": 0.0048, + "step": 552 + }, + { + "epoch": 553.0, + "grad_norm": 0.040073346346616745, + "learning_rate": 0.0009933333333333333, + "loss": 0.005, + "step": 553 + }, + { + "epoch": 554.0, + "grad_norm": 0.05222450569272041, + "learning_rate": 0.0009911111111111112, + "loss": 0.0047, + "step": 554 + }, + { + "epoch": 555.0, + "grad_norm": 0.03518804907798767, + "learning_rate": 0.000988888888888889, + "loss": 0.0046, + "step": 555 + }, + { + "epoch": 556.0, + "grad_norm": 0.04386880621314049, + "learning_rate": 0.0009866666666666667, + "loss": 0.0049, + "step": 556 + }, + { + "epoch": 557.0, + "grad_norm": 0.034030430018901825, + "learning_rate": 0.0009844444444444445, + "loss": 0.0049, + "step": 557 + }, + { + "epoch": 558.0, + "grad_norm": 0.024780094623565674, + "learning_rate": 0.0009822222222222222, + "loss": 0.0046, + "step": 558 + }, + { + "epoch": 559.0, + "grad_norm": 0.027013640850782394, + "learning_rate": 0.00098, + "loss": 0.0046, + "step": 559 + }, + { + "epoch": 560.0, + "grad_norm": 0.030160361900925636, + "learning_rate": 0.0009777777777777777, + "loss": 0.0043, + "step": 560 + }, + { + "epoch": 561.0, + "grad_norm": 0.034773167222738266, + "learning_rate": 0.0009755555555555556, + "loss": 0.005, + "step": 561 + }, + { + "epoch": 562.0, + "grad_norm": 0.031707510352134705, + "learning_rate": 0.0009733333333333334, + "loss": 0.0046, + "step": 562 + }, + { + "epoch": 563.0, + "grad_norm": 0.028600016608834267, + "learning_rate": 0.0009711111111111112, + "loss": 0.0044, + "step": 563 + }, + { + "epoch": 564.0, + "grad_norm": 0.04488474503159523, + "learning_rate": 0.0009688888888888889, + "loss": 0.0046, + "step": 564 + }, + { + "epoch": 565.0, + "grad_norm": 0.027938274666666985, + "learning_rate": 0.0009666666666666667, + "loss": 0.0045, + "step": 565 + }, + { + "epoch": 566.0, + "grad_norm": 0.036352016031742096, + "learning_rate": 0.0009644444444444444, + "loss": 0.0045, + "step": 566 + }, + { + "epoch": 567.0, + "grad_norm": 0.03465161472558975, + "learning_rate": 0.0009622222222222222, + "loss": 0.0045, + "step": 567 + }, + { + "epoch": 568.0, + "grad_norm": 0.0352412685751915, + "learning_rate": 0.00096, + "loss": 0.0047, + "step": 568 + }, + { + "epoch": 569.0, + "grad_norm": 0.03907673805952072, + "learning_rate": 0.0009577777777777778, + "loss": 0.0049, + "step": 569 + }, + { + "epoch": 570.0, + "grad_norm": 0.0324881337583065, + "learning_rate": 0.0009555555555555556, + "loss": 0.0045, + "step": 570 + }, + { + "epoch": 571.0, + "grad_norm": 0.03322600945830345, + "learning_rate": 0.0009533333333333334, + "loss": 0.0046, + "step": 571 + }, + { + "epoch": 572.0, + "grad_norm": 0.04286476597189903, + "learning_rate": 0.0009511111111111111, + "loss": 0.0045, + "step": 572 + }, + { + "epoch": 573.0, + "grad_norm": 0.038839150220155716, + "learning_rate": 0.0009488888888888889, + "loss": 0.0046, + "step": 573 + }, + { + "epoch": 574.0, + "grad_norm": 0.027261720970273018, + "learning_rate": 0.0009466666666666667, + "loss": 0.0043, + "step": 574 + }, + { + "epoch": 575.0, + "grad_norm": 0.0263065192848444, + "learning_rate": 0.0009444444444444445, + "loss": 0.0041, + "step": 575 + }, + { + "epoch": 576.0, + "grad_norm": 0.03773610666394234, + "learning_rate": 0.0009422222222222222, + "loss": 0.0048, + "step": 576 + }, + { + "epoch": 577.0, + "grad_norm": 0.03849232941865921, + "learning_rate": 0.00094, + "loss": 0.005, + "step": 577 + }, + { + "epoch": 578.0, + "grad_norm": 0.04704838618636131, + "learning_rate": 0.0009377777777777778, + "loss": 0.0049, + "step": 578 + }, + { + "epoch": 579.0, + "grad_norm": 0.029075607657432556, + "learning_rate": 0.0009355555555555556, + "loss": 0.0044, + "step": 579 + }, + { + "epoch": 580.0, + "grad_norm": 0.026013720780611038, + "learning_rate": 0.0009333333333333333, + "loss": 0.0043, + "step": 580 + }, + { + "epoch": 581.0, + "grad_norm": 0.03620687872171402, + "learning_rate": 0.0009311111111111112, + "loss": 0.0046, + "step": 581 + }, + { + "epoch": 582.0, + "grad_norm": 0.03129187971353531, + "learning_rate": 0.0009288888888888889, + "loss": 0.0044, + "step": 582 + }, + { + "epoch": 583.0, + "grad_norm": 0.022100580856204033, + "learning_rate": 0.0009266666666666667, + "loss": 0.004, + "step": 583 + }, + { + "epoch": 584.0, + "grad_norm": 0.03407726436853409, + "learning_rate": 0.0009244444444444444, + "loss": 0.0045, + "step": 584 + }, + { + "epoch": 585.0, + "grad_norm": 0.030179621651768684, + "learning_rate": 0.0009222222222222223, + "loss": 0.0045, + "step": 585 + }, + { + "epoch": 586.0, + "grad_norm": 0.03250374644994736, + "learning_rate": 0.00092, + "loss": 0.0048, + "step": 586 + }, + { + "epoch": 587.0, + "grad_norm": 0.030906520783901215, + "learning_rate": 0.0009177777777777778, + "loss": 0.0043, + "step": 587 + }, + { + "epoch": 588.0, + "grad_norm": 0.03283608704805374, + "learning_rate": 0.0009155555555555556, + "loss": 0.0044, + "step": 588 + }, + { + "epoch": 589.0, + "grad_norm": 0.029278622940182686, + "learning_rate": 0.0009133333333333334, + "loss": 0.0044, + "step": 589 + }, + { + "epoch": 590.0, + "grad_norm": 0.02297147922217846, + "learning_rate": 0.0009111111111111111, + "loss": 0.0041, + "step": 590 + }, + { + "epoch": 591.0, + "grad_norm": 0.023177258670330048, + "learning_rate": 0.0009088888888888889, + "loss": 0.0039, + "step": 591 + }, + { + "epoch": 592.0, + "grad_norm": 0.030108338221907616, + "learning_rate": 0.0009066666666666666, + "loss": 0.0043, + "step": 592 + }, + { + "epoch": 593.0, + "grad_norm": 0.02184971235692501, + "learning_rate": 0.0009044444444444445, + "loss": 0.0038, + "step": 593 + }, + { + "epoch": 594.0, + "grad_norm": 0.027614466845989227, + "learning_rate": 0.0009022222222222222, + "loss": 0.0041, + "step": 594 + }, + { + "epoch": 595.0, + "grad_norm": 0.038632676005363464, + "learning_rate": 0.0009000000000000001, + "loss": 0.0044, + "step": 595 + }, + { + "epoch": 596.0, + "grad_norm": 0.032530948519706726, + "learning_rate": 0.0008977777777777778, + "loss": 0.0043, + "step": 596 + }, + { + "epoch": 597.0, + "grad_norm": 0.028832774609327316, + "learning_rate": 0.0008955555555555556, + "loss": 0.0044, + "step": 597 + }, + { + "epoch": 598.0, + "grad_norm": 0.02461000345647335, + "learning_rate": 0.0008933333333333333, + "loss": 0.0042, + "step": 598 + }, + { + "epoch": 599.0, + "grad_norm": 0.028294360265135765, + "learning_rate": 0.0008911111111111111, + "loss": 0.0043, + "step": 599 + }, + { + "epoch": 600.0, + "grad_norm": 0.023488713428378105, + "learning_rate": 0.0008888888888888888, + "loss": 0.004, + "step": 600 + }, + { + "epoch": 601.0, + "grad_norm": 0.03520805388689041, + "learning_rate": 0.0008866666666666667, + "loss": 0.0038, + "step": 601 + }, + { + "epoch": 602.0, + "grad_norm": 0.03618593141436577, + "learning_rate": 0.0008844444444444445, + "loss": 0.0045, + "step": 602 + }, + { + "epoch": 603.0, + "grad_norm": 0.02912176214158535, + "learning_rate": 0.0008822222222222223, + "loss": 0.0042, + "step": 603 + }, + { + "epoch": 604.0, + "grad_norm": 0.02343122847378254, + "learning_rate": 0.00088, + "loss": 0.0038, + "step": 604 + }, + { + "epoch": 605.0, + "grad_norm": 0.032638318836688995, + "learning_rate": 0.0008777777777777778, + "loss": 0.0041, + "step": 605 + }, + { + "epoch": 606.0, + "grad_norm": 0.03873821347951889, + "learning_rate": 0.0008755555555555555, + "loss": 0.0045, + "step": 606 + }, + { + "epoch": 607.0, + "grad_norm": 0.04660570248961449, + "learning_rate": 0.0008733333333333333, + "loss": 0.0044, + "step": 607 + }, + { + "epoch": 608.0, + "grad_norm": 0.03421563282608986, + "learning_rate": 0.000871111111111111, + "loss": 0.004, + "step": 608 + }, + { + "epoch": 609.0, + "grad_norm": 0.02049202285706997, + "learning_rate": 0.000868888888888889, + "loss": 0.0038, + "step": 609 + }, + { + "epoch": 610.0, + "grad_norm": 0.0303183626383543, + "learning_rate": 0.0008666666666666667, + "loss": 0.004, + "step": 610 + }, + { + "epoch": 611.0, + "grad_norm": 0.03326858580112457, + "learning_rate": 0.0008644444444444445, + "loss": 0.004, + "step": 611 + }, + { + "epoch": 612.0, + "grad_norm": 0.030689280480146408, + "learning_rate": 0.0008622222222222222, + "loss": 0.0041, + "step": 612 + }, + { + "epoch": 613.0, + "grad_norm": 0.029986605048179626, + "learning_rate": 0.00086, + "loss": 0.0041, + "step": 613 + }, + { + "epoch": 614.0, + "grad_norm": 0.02606850303709507, + "learning_rate": 0.0008577777777777777, + "loss": 0.004, + "step": 614 + }, + { + "epoch": 615.0, + "grad_norm": 0.034229837357997894, + "learning_rate": 0.0008555555555555556, + "loss": 0.004, + "step": 615 + }, + { + "epoch": 616.0, + "grad_norm": 0.029757648706436157, + "learning_rate": 0.0008533333333333334, + "loss": 0.0043, + "step": 616 + }, + { + "epoch": 617.0, + "grad_norm": 0.03645173832774162, + "learning_rate": 0.0008511111111111112, + "loss": 0.0044, + "step": 617 + }, + { + "epoch": 618.0, + "grad_norm": 0.03808034211397171, + "learning_rate": 0.0008488888888888889, + "loss": 0.0041, + "step": 618 + }, + { + "epoch": 619.0, + "grad_norm": 0.02224326692521572, + "learning_rate": 0.0008466666666666667, + "loss": 0.0038, + "step": 619 + }, + { + "epoch": 620.0, + "grad_norm": 0.03940601274371147, + "learning_rate": 0.0008444444444444444, + "loss": 0.0039, + "step": 620 + }, + { + "epoch": 621.0, + "grad_norm": 0.049832046031951904, + "learning_rate": 0.0008422222222222222, + "loss": 0.0044, + "step": 621 + }, + { + "epoch": 622.0, + "grad_norm": 0.022758588194847107, + "learning_rate": 0.00084, + "loss": 0.0039, + "step": 622 + }, + { + "epoch": 623.0, + "grad_norm": 0.049774039536714554, + "learning_rate": 0.0008377777777777778, + "loss": 0.0045, + "step": 623 + }, + { + "epoch": 624.0, + "grad_norm": 0.03903564065694809, + "learning_rate": 0.0008355555555555556, + "loss": 0.0042, + "step": 624 + }, + { + "epoch": 625.0, + "grad_norm": 0.029349060729146004, + "learning_rate": 0.0008333333333333334, + "loss": 0.0042, + "step": 625 + }, + { + "epoch": 626.0, + "grad_norm": 0.034631963819265366, + "learning_rate": 0.0008311111111111111, + "loss": 0.0043, + "step": 626 + }, + { + "epoch": 627.0, + "grad_norm": 0.04115751013159752, + "learning_rate": 0.0008288888888888889, + "loss": 0.0044, + "step": 627 + }, + { + "epoch": 628.0, + "grad_norm": 0.03835492208600044, + "learning_rate": 0.0008266666666666666, + "loss": 0.0038, + "step": 628 + }, + { + "epoch": 629.0, + "grad_norm": 0.027649203315377235, + "learning_rate": 0.0008244444444444445, + "loss": 0.0042, + "step": 629 + }, + { + "epoch": 630.0, + "grad_norm": 0.028530064970254898, + "learning_rate": 0.0008222222222222222, + "loss": 0.0041, + "step": 630 + }, + { + "epoch": 631.0, + "grad_norm": 0.029089566320180893, + "learning_rate": 0.00082, + "loss": 0.0038, + "step": 631 + }, + { + "epoch": 632.0, + "grad_norm": 0.04255674034357071, + "learning_rate": 0.0008177777777777778, + "loss": 0.0042, + "step": 632 + }, + { + "epoch": 633.0, + "grad_norm": 0.024558911100029945, + "learning_rate": 0.0008155555555555556, + "loss": 0.0039, + "step": 633 + }, + { + "epoch": 634.0, + "grad_norm": 0.031170202419161797, + "learning_rate": 0.0008133333333333333, + "loss": 0.0041, + "step": 634 + }, + { + "epoch": 635.0, + "grad_norm": 0.029109520837664604, + "learning_rate": 0.0008111111111111111, + "loss": 0.0038, + "step": 635 + }, + { + "epoch": 636.0, + "grad_norm": 0.02705140970647335, + "learning_rate": 0.0008088888888888889, + "loss": 0.0038, + "step": 636 + }, + { + "epoch": 637.0, + "grad_norm": 0.023048389703035355, + "learning_rate": 0.0008066666666666667, + "loss": 0.0038, + "step": 637 + }, + { + "epoch": 638.0, + "grad_norm": 0.02259679324924946, + "learning_rate": 0.0008044444444444444, + "loss": 0.0036, + "step": 638 + }, + { + "epoch": 639.0, + "grad_norm": 0.029809147119522095, + "learning_rate": 0.0008022222222222222, + "loss": 0.0039, + "step": 639 + }, + { + "epoch": 640.0, + "grad_norm": 0.028109390288591385, + "learning_rate": 0.0008, + "loss": 0.0038, + "step": 640 + }, + { + "epoch": 641.0, + "grad_norm": 0.04414813220500946, + "learning_rate": 0.0007977777777777778, + "loss": 0.0042, + "step": 641 + }, + { + "epoch": 642.0, + "grad_norm": 0.030944792553782463, + "learning_rate": 0.0007955555555555555, + "loss": 0.0038, + "step": 642 + }, + { + "epoch": 643.0, + "grad_norm": 0.023884311318397522, + "learning_rate": 0.0007933333333333334, + "loss": 0.0036, + "step": 643 + }, + { + "epoch": 644.0, + "grad_norm": 0.024393659085035324, + "learning_rate": 0.0007911111111111111, + "loss": 0.0038, + "step": 644 + }, + { + "epoch": 645.0, + "grad_norm": 0.02105238474905491, + "learning_rate": 0.0007888888888888889, + "loss": 0.0036, + "step": 645 + }, + { + "epoch": 646.0, + "grad_norm": 0.02686360850930214, + "learning_rate": 0.0007866666666666666, + "loss": 0.0039, + "step": 646 + }, + { + "epoch": 647.0, + "grad_norm": 0.03692441061139107, + "learning_rate": 0.0007844444444444445, + "loss": 0.0042, + "step": 647 + }, + { + "epoch": 648.0, + "grad_norm": 0.026083093136548996, + "learning_rate": 0.0007822222222222222, + "loss": 0.0037, + "step": 648 + }, + { + "epoch": 649.0, + "grad_norm": 0.021342594176530838, + "learning_rate": 0.0007800000000000001, + "loss": 0.0036, + "step": 649 + }, + { + "epoch": 650.0, + "grad_norm": 0.024524180218577385, + "learning_rate": 0.0007777777777777778, + "loss": 0.0039, + "step": 650 + }, + { + "epoch": 651.0, + "grad_norm": 0.028572745621204376, + "learning_rate": 0.0007755555555555556, + "loss": 0.0038, + "step": 651 + }, + { + "epoch": 652.0, + "grad_norm": 0.043239813297986984, + "learning_rate": 0.0007733333333333333, + "loss": 0.0041, + "step": 652 + }, + { + "epoch": 653.0, + "grad_norm": 0.021039173007011414, + "learning_rate": 0.0007711111111111111, + "loss": 0.0036, + "step": 653 + }, + { + "epoch": 654.0, + "grad_norm": 0.0256752148270607, + "learning_rate": 0.0007688888888888888, + "loss": 0.0037, + "step": 654 + }, + { + "epoch": 655.0, + "grad_norm": 0.033442508429288864, + "learning_rate": 0.0007666666666666667, + "loss": 0.0041, + "step": 655 + }, + { + "epoch": 656.0, + "grad_norm": 0.023301295936107635, + "learning_rate": 0.0007644444444444445, + "loss": 0.0038, + "step": 656 + }, + { + "epoch": 657.0, + "grad_norm": 0.02558085508644581, + "learning_rate": 0.0007622222222222223, + "loss": 0.0039, + "step": 657 + }, + { + "epoch": 658.0, + "grad_norm": 0.02924611233174801, + "learning_rate": 0.00076, + "loss": 0.0038, + "step": 658 + }, + { + "epoch": 659.0, + "grad_norm": 0.02991502359509468, + "learning_rate": 0.0007577777777777778, + "loss": 0.0035, + "step": 659 + }, + { + "epoch": 660.0, + "grad_norm": 0.03482845798134804, + "learning_rate": 0.0007555555555555555, + "loss": 0.004, + "step": 660 + }, + { + "epoch": 661.0, + "grad_norm": 0.02580106444656849, + "learning_rate": 0.0007533333333333333, + "loss": 0.0036, + "step": 661 + }, + { + "epoch": 662.0, + "grad_norm": 0.020824357867240906, + "learning_rate": 0.000751111111111111, + "loss": 0.0033, + "step": 662 + }, + { + "epoch": 663.0, + "grad_norm": 0.02536211535334587, + "learning_rate": 0.000748888888888889, + "loss": 0.0037, + "step": 663 + }, + { + "epoch": 664.0, + "grad_norm": 0.05054183304309845, + "learning_rate": 0.0007466666666666667, + "loss": 0.0045, + "step": 664 + }, + { + "epoch": 665.0, + "grad_norm": 0.022909749299287796, + "learning_rate": 0.0007444444444444445, + "loss": 0.0036, + "step": 665 + }, + { + "epoch": 666.0, + "grad_norm": 0.027930857613682747, + "learning_rate": 0.0007422222222222222, + "loss": 0.0038, + "step": 666 + }, + { + "epoch": 667.0, + "grad_norm": 0.022380666807293892, + "learning_rate": 0.00074, + "loss": 0.0036, + "step": 667 + }, + { + "epoch": 668.0, + "grad_norm": 0.032786887139081955, + "learning_rate": 0.0007377777777777777, + "loss": 0.0036, + "step": 668 + }, + { + "epoch": 669.0, + "grad_norm": 0.03477782383561134, + "learning_rate": 0.0007355555555555555, + "loss": 0.0041, + "step": 669 + }, + { + "epoch": 670.0, + "grad_norm": 0.029481414705514908, + "learning_rate": 0.0007333333333333333, + "loss": 0.0038, + "step": 670 + }, + { + "epoch": 671.0, + "grad_norm": 0.039676692336797714, + "learning_rate": 0.0007311111111111112, + "loss": 0.0039, + "step": 671 + }, + { + "epoch": 672.0, + "grad_norm": 0.049726180732250214, + "learning_rate": 0.0007288888888888889, + "loss": 0.0042, + "step": 672 + }, + { + "epoch": 673.0, + "grad_norm": 0.02737841010093689, + "learning_rate": 0.0007266666666666667, + "loss": 0.0036, + "step": 673 + }, + { + "epoch": 674.0, + "grad_norm": 0.02903469279408455, + "learning_rate": 0.0007244444444444444, + "loss": 0.0037, + "step": 674 + }, + { + "epoch": 675.0, + "grad_norm": 0.036345984786748886, + "learning_rate": 0.0007222222222222222, + "loss": 0.004, + "step": 675 + }, + { + "epoch": 676.0, + "grad_norm": 0.05052892118692398, + "learning_rate": 0.0007199999999999999, + "loss": 0.0041, + "step": 676 + }, + { + "epoch": 677.0, + "grad_norm": 0.024914775043725967, + "learning_rate": 0.0007177777777777778, + "loss": 0.0036, + "step": 677 + }, + { + "epoch": 678.0, + "grad_norm": 0.053874969482421875, + "learning_rate": 0.0007155555555555555, + "loss": 0.0039, + "step": 678 + }, + { + "epoch": 679.0, + "grad_norm": 0.029548736289143562, + "learning_rate": 0.0007133333333333334, + "loss": 0.0035, + "step": 679 + }, + { + "epoch": 680.0, + "grad_norm": 0.030973508954048157, + "learning_rate": 0.0007111111111111111, + "loss": 0.0038, + "step": 680 + }, + { + "epoch": 681.0, + "grad_norm": 0.021659094840288162, + "learning_rate": 0.0007088888888888889, + "loss": 0.0035, + "step": 681 + }, + { + "epoch": 682.0, + "grad_norm": 0.05033154413104057, + "learning_rate": 0.0007066666666666666, + "loss": 0.0042, + "step": 682 + }, + { + "epoch": 683.0, + "grad_norm": 0.024461543187499046, + "learning_rate": 0.0007044444444444445, + "loss": 0.0035, + "step": 683 + }, + { + "epoch": 684.0, + "grad_norm": 0.03515414148569107, + "learning_rate": 0.0007022222222222222, + "loss": 0.0036, + "step": 684 + }, + { + "epoch": 685.0, + "grad_norm": 0.024221835657954216, + "learning_rate": 0.0007, + "loss": 0.0035, + "step": 685 + }, + { + "epoch": 686.0, + "grad_norm": 0.0397065207362175, + "learning_rate": 0.0006977777777777778, + "loss": 0.0035, + "step": 686 + }, + { + "epoch": 687.0, + "grad_norm": 0.06702705472707748, + "learning_rate": 0.0006955555555555556, + "loss": 0.0042, + "step": 687 + }, + { + "epoch": 688.0, + "grad_norm": 0.0224533099681139, + "learning_rate": 0.0006933333333333333, + "loss": 0.0034, + "step": 688 + }, + { + "epoch": 689.0, + "grad_norm": 0.029705343768000603, + "learning_rate": 0.0006911111111111111, + "loss": 0.0035, + "step": 689 + }, + { + "epoch": 690.0, + "grad_norm": 0.05282840505242348, + "learning_rate": 0.000688888888888889, + "loss": 0.0043, + "step": 690 + }, + { + "epoch": 691.0, + "grad_norm": 0.04364459589123726, + "learning_rate": 0.0006866666666666667, + "loss": 0.0042, + "step": 691 + }, + { + "epoch": 692.0, + "grad_norm": 0.025150645524263382, + "learning_rate": 0.0006844444444444444, + "loss": 0.0034, + "step": 692 + }, + { + "epoch": 693.0, + "grad_norm": 0.03731248155236244, + "learning_rate": 0.0006822222222222222, + "loss": 0.0039, + "step": 693 + }, + { + "epoch": 694.0, + "grad_norm": 0.04981468245387077, + "learning_rate": 0.00068, + "loss": 0.0043, + "step": 694 + }, + { + "epoch": 695.0, + "grad_norm": 0.03002479299902916, + "learning_rate": 0.0006777777777777778, + "loss": 0.0039, + "step": 695 + }, + { + "epoch": 696.0, + "grad_norm": 0.024293873459100723, + "learning_rate": 0.0006755555555555555, + "loss": 0.0035, + "step": 696 + }, + { + "epoch": 697.0, + "grad_norm": 0.042657673358917236, + "learning_rate": 0.0006733333333333334, + "loss": 0.0038, + "step": 697 + }, + { + "epoch": 698.0, + "grad_norm": 0.05678756162524223, + "learning_rate": 0.0006711111111111111, + "loss": 0.0042, + "step": 698 + }, + { + "epoch": 699.0, + "grad_norm": 0.024007895961403847, + "learning_rate": 0.0006688888888888889, + "loss": 0.0035, + "step": 699 + }, + { + "epoch": 700.0, + "grad_norm": 0.041944634169340134, + "learning_rate": 0.0006666666666666666, + "loss": 0.0038, + "step": 700 + }, + { + "epoch": 701.0, + "grad_norm": 0.04064980521798134, + "learning_rate": 0.0006644444444444444, + "loss": 0.0039, + "step": 701 + }, + { + "epoch": 702.0, + "grad_norm": 0.02716059610247612, + "learning_rate": 0.0006622222222222222, + "loss": 0.0036, + "step": 702 + }, + { + "epoch": 703.0, + "grad_norm": 0.023927675560116768, + "learning_rate": 0.00066, + "loss": 0.0035, + "step": 703 + }, + { + "epoch": 704.0, + "grad_norm": 0.03374261036515236, + "learning_rate": 0.0006577777777777779, + "loss": 0.0035, + "step": 704 + }, + { + "epoch": 705.0, + "grad_norm": 0.026420941576361656, + "learning_rate": 0.0006555555555555556, + "loss": 0.0037, + "step": 705 + }, + { + "epoch": 706.0, + "grad_norm": 0.03130070120096207, + "learning_rate": 0.0006533333333333333, + "loss": 0.0037, + "step": 706 + }, + { + "epoch": 707.0, + "grad_norm": 0.024263912811875343, + "learning_rate": 0.0006511111111111111, + "loss": 0.0034, + "step": 707 + }, + { + "epoch": 708.0, + "grad_norm": 0.027501242235302925, + "learning_rate": 0.0006488888888888888, + "loss": 0.0038, + "step": 708 + }, + { + "epoch": 709.0, + "grad_norm": 0.02567414566874504, + "learning_rate": 0.0006466666666666666, + "loss": 0.0035, + "step": 709 + }, + { + "epoch": 710.0, + "grad_norm": 0.024132689461112022, + "learning_rate": 0.0006444444444444444, + "loss": 0.0036, + "step": 710 + }, + { + "epoch": 711.0, + "grad_norm": 0.024011000990867615, + "learning_rate": 0.0006422222222222223, + "loss": 0.0036, + "step": 711 + }, + { + "epoch": 712.0, + "grad_norm": 0.02344098687171936, + "learning_rate": 0.00064, + "loss": 0.0036, + "step": 712 + }, + { + "epoch": 713.0, + "grad_norm": 0.023972654715180397, + "learning_rate": 0.0006377777777777778, + "loss": 0.0037, + "step": 713 + }, + { + "epoch": 714.0, + "grad_norm": 0.029682451859116554, + "learning_rate": 0.0006355555555555555, + "loss": 0.0036, + "step": 714 + }, + { + "epoch": 715.0, + "grad_norm": 0.02595234103500843, + "learning_rate": 0.0006333333333333333, + "loss": 0.0034, + "step": 715 + }, + { + "epoch": 716.0, + "grad_norm": 0.023837080225348473, + "learning_rate": 0.000631111111111111, + "loss": 0.0036, + "step": 716 + }, + { + "epoch": 717.0, + "grad_norm": 0.026442958042025566, + "learning_rate": 0.000628888888888889, + "loss": 0.0036, + "step": 717 + }, + { + "epoch": 718.0, + "grad_norm": 0.024803321808576584, + "learning_rate": 0.0006266666666666668, + "loss": 0.0036, + "step": 718 + }, + { + "epoch": 719.0, + "grad_norm": 0.023358143866062164, + "learning_rate": 0.0006244444444444445, + "loss": 0.0035, + "step": 719 + }, + { + "epoch": 720.0, + "grad_norm": 0.026576291769742966, + "learning_rate": 0.0006222222222222223, + "loss": 0.0033, + "step": 720 + }, + { + "epoch": 721.0, + "grad_norm": 0.030121877789497375, + "learning_rate": 0.00062, + "loss": 0.0035, + "step": 721 + }, + { + "epoch": 722.0, + "grad_norm": 0.02631719410419464, + "learning_rate": 0.0006177777777777777, + "loss": 0.0038, + "step": 722 + }, + { + "epoch": 723.0, + "grad_norm": 0.027765462175011635, + "learning_rate": 0.0006155555555555555, + "loss": 0.0034, + "step": 723 + }, + { + "epoch": 724.0, + "grad_norm": 0.031482163816690445, + "learning_rate": 0.0006133333333333334, + "loss": 0.0035, + "step": 724 + }, + { + "epoch": 725.0, + "grad_norm": 0.023726513609290123, + "learning_rate": 0.0006111111111111112, + "loss": 0.0033, + "step": 725 + }, + { + "epoch": 726.0, + "grad_norm": 0.02765974961221218, + "learning_rate": 0.000608888888888889, + "loss": 0.0034, + "step": 726 + }, + { + "epoch": 727.0, + "grad_norm": 0.03566696122288704, + "learning_rate": 0.0006066666666666667, + "loss": 0.0036, + "step": 727 + }, + { + "epoch": 728.0, + "grad_norm": 0.028009934350848198, + "learning_rate": 0.0006044444444444445, + "loss": 0.0035, + "step": 728 + }, + { + "epoch": 729.0, + "grad_norm": 0.02318991906940937, + "learning_rate": 0.0006022222222222222, + "loss": 0.0032, + "step": 729 + }, + { + "epoch": 730.0, + "grad_norm": 0.022554708644747734, + "learning_rate": 0.0006, + "loss": 0.0035, + "step": 730 + }, + { + "epoch": 731.0, + "grad_norm": 0.02474828064441681, + "learning_rate": 0.0005977777777777778, + "loss": 0.0033, + "step": 731 + }, + { + "epoch": 732.0, + "grad_norm": 0.0323016531765461, + "learning_rate": 0.0005955555555555556, + "loss": 0.0034, + "step": 732 + }, + { + "epoch": 733.0, + "grad_norm": 0.019244784489274025, + "learning_rate": 0.0005933333333333334, + "loss": 0.0033, + "step": 733 + }, + { + "epoch": 734.0, + "grad_norm": 0.021869376301765442, + "learning_rate": 0.0005911111111111112, + "loss": 0.0032, + "step": 734 + }, + { + "epoch": 735.0, + "grad_norm": 0.01879352703690529, + "learning_rate": 0.0005888888888888889, + "loss": 0.0032, + "step": 735 + }, + { + "epoch": 736.0, + "grad_norm": 0.026631703600287437, + "learning_rate": 0.0005866666666666667, + "loss": 0.0033, + "step": 736 + }, + { + "epoch": 737.0, + "grad_norm": 0.028665434569120407, + "learning_rate": 0.0005844444444444444, + "loss": 0.0035, + "step": 737 + }, + { + "epoch": 738.0, + "grad_norm": 0.01596708409488201, + "learning_rate": 0.0005822222222222223, + "loss": 0.003, + "step": 738 + }, + { + "epoch": 739.0, + "grad_norm": 0.017112715169787407, + "learning_rate": 0.00058, + "loss": 0.0031, + "step": 739 + }, + { + "epoch": 740.0, + "grad_norm": 0.0246286503970623, + "learning_rate": 0.0005777777777777778, + "loss": 0.0034, + "step": 740 + }, + { + "epoch": 741.0, + "grad_norm": 0.01566813327372074, + "learning_rate": 0.0005755555555555556, + "loss": 0.003, + "step": 741 + }, + { + "epoch": 742.0, + "grad_norm": 0.02360912226140499, + "learning_rate": 0.0005733333333333334, + "loss": 0.0033, + "step": 742 + }, + { + "epoch": 743.0, + "grad_norm": 0.024110812693834305, + "learning_rate": 0.0005711111111111111, + "loss": 0.0033, + "step": 743 + }, + { + "epoch": 744.0, + "grad_norm": 0.020427672192454338, + "learning_rate": 0.0005688888888888889, + "loss": 0.0032, + "step": 744 + }, + { + "epoch": 745.0, + "grad_norm": 0.025975676253437996, + "learning_rate": 0.0005666666666666667, + "loss": 0.0031, + "step": 745 + }, + { + "epoch": 746.0, + "grad_norm": 0.022355573251843452, + "learning_rate": 0.0005644444444444445, + "loss": 0.0033, + "step": 746 + }, + { + "epoch": 747.0, + "grad_norm": 0.04759243130683899, + "learning_rate": 0.0005622222222222222, + "loss": 0.0037, + "step": 747 + }, + { + "epoch": 748.0, + "grad_norm": 0.030733415856957436, + "learning_rate": 0.0005600000000000001, + "loss": 0.0033, + "step": 748 + }, + { + "epoch": 749.0, + "grad_norm": 0.02027864381670952, + "learning_rate": 0.0005577777777777778, + "loss": 0.0033, + "step": 749 + }, + { + "epoch": 750.0, + "grad_norm": 0.028928080573678017, + "learning_rate": 0.0005555555555555556, + "loss": 0.0031, + "step": 750 + }, + { + "epoch": 751.0, + "grad_norm": 0.03321721404790878, + "learning_rate": 0.0005533333333333333, + "loss": 0.0034, + "step": 751 + }, + { + "epoch": 752.0, + "grad_norm": 0.036787249147892, + "learning_rate": 0.0005511111111111112, + "loss": 0.0034, + "step": 752 + }, + { + "epoch": 753.0, + "grad_norm": 0.021165387704968452, + "learning_rate": 0.0005488888888888889, + "loss": 0.0031, + "step": 753 + }, + { + "epoch": 754.0, + "grad_norm": 0.025929953902959824, + "learning_rate": 0.0005466666666666667, + "loss": 0.0036, + "step": 754 + }, + { + "epoch": 755.0, + "grad_norm": 0.0214844960719347, + "learning_rate": 0.0005444444444444444, + "loss": 0.0031, + "step": 755 + }, + { + "epoch": 756.0, + "grad_norm": 0.030406184494495392, + "learning_rate": 0.0005422222222222223, + "loss": 0.0034, + "step": 756 + }, + { + "epoch": 757.0, + "grad_norm": 0.0321720615029335, + "learning_rate": 0.00054, + "loss": 0.0035, + "step": 757 + }, + { + "epoch": 758.0, + "grad_norm": 0.026725683361291885, + "learning_rate": 0.0005377777777777779, + "loss": 0.0032, + "step": 758 + }, + { + "epoch": 759.0, + "grad_norm": 0.031080015003681183, + "learning_rate": 0.0005355555555555556, + "loss": 0.0035, + "step": 759 + }, + { + "epoch": 760.0, + "grad_norm": 0.023731930181384087, + "learning_rate": 0.0005333333333333334, + "loss": 0.0031, + "step": 760 + }, + { + "epoch": 761.0, + "grad_norm": 0.026623785495758057, + "learning_rate": 0.0005311111111111111, + "loss": 0.0032, + "step": 761 + }, + { + "epoch": 762.0, + "grad_norm": 0.042063500732183456, + "learning_rate": 0.0005288888888888889, + "loss": 0.0032, + "step": 762 + }, + { + "epoch": 763.0, + "grad_norm": 0.021555962041020393, + "learning_rate": 0.0005266666666666666, + "loss": 0.0031, + "step": 763 + }, + { + "epoch": 764.0, + "grad_norm": 0.026008032262325287, + "learning_rate": 0.0005244444444444445, + "loss": 0.0034, + "step": 764 + }, + { + "epoch": 765.0, + "grad_norm": 0.01928178407251835, + "learning_rate": 0.0005222222222222223, + "loss": 0.003, + "step": 765 + }, + { + "epoch": 766.0, + "grad_norm": 0.02430052123963833, + "learning_rate": 0.0005200000000000001, + "loss": 0.0031, + "step": 766 + }, + { + "epoch": 767.0, + "grad_norm": 0.028112513944506645, + "learning_rate": 0.0005177777777777778, + "loss": 0.0033, + "step": 767 + }, + { + "epoch": 768.0, + "grad_norm": 0.019901221618056297, + "learning_rate": 0.0005155555555555556, + "loss": 0.003, + "step": 768 + }, + { + "epoch": 769.0, + "grad_norm": 0.02807488478720188, + "learning_rate": 0.0005133333333333333, + "loss": 0.0031, + "step": 769 + }, + { + "epoch": 770.0, + "grad_norm": 0.027293385937809944, + "learning_rate": 0.0005111111111111111, + "loss": 0.0033, + "step": 770 + }, + { + "epoch": 771.0, + "grad_norm": 0.022820137441158295, + "learning_rate": 0.0005088888888888888, + "loss": 0.003, + "step": 771 + }, + { + "epoch": 772.0, + "grad_norm": 0.028339603915810585, + "learning_rate": 0.0005066666666666668, + "loss": 0.0033, + "step": 772 + }, + { + "epoch": 773.0, + "grad_norm": 0.027798311784863472, + "learning_rate": 0.0005044444444444445, + "loss": 0.0032, + "step": 773 + }, + { + "epoch": 774.0, + "grad_norm": 0.037513189017772675, + "learning_rate": 0.0005022222222222223, + "loss": 0.0031, + "step": 774 + }, + { + "epoch": 775.0, + "grad_norm": 0.029608087614178658, + "learning_rate": 0.0005, + "loss": 0.0031, + "step": 775 + }, + { + "epoch": 776.0, + "grad_norm": 0.02258935756981373, + "learning_rate": 0.0004977777777777778, + "loss": 0.0034, + "step": 776 + }, + { + "epoch": 777.0, + "grad_norm": 0.03222902864217758, + "learning_rate": 0.0004955555555555556, + "loss": 0.0037, + "step": 777 + }, + { + "epoch": 778.0, + "grad_norm": 0.028507541865110397, + "learning_rate": 0.0004933333333333334, + "loss": 0.0033, + "step": 778 + }, + { + "epoch": 779.0, + "grad_norm": 0.026021234691143036, + "learning_rate": 0.0004911111111111111, + "loss": 0.0031, + "step": 779 + }, + { + "epoch": 780.0, + "grad_norm": 0.03054329752922058, + "learning_rate": 0.0004888888888888889, + "loss": 0.0033, + "step": 780 + }, + { + "epoch": 781.0, + "grad_norm": 0.025316089391708374, + "learning_rate": 0.0004866666666666667, + "loss": 0.0033, + "step": 781 + }, + { + "epoch": 782.0, + "grad_norm": 0.02000274695456028, + "learning_rate": 0.00048444444444444446, + "loss": 0.003, + "step": 782 + }, + { + "epoch": 783.0, + "grad_norm": 0.03106926940381527, + "learning_rate": 0.0004822222222222222, + "loss": 0.0033, + "step": 783 + }, + { + "epoch": 784.0, + "grad_norm": 0.02422090247273445, + "learning_rate": 0.00048, + "loss": 0.0032, + "step": 784 + }, + { + "epoch": 785.0, + "grad_norm": 0.03346557170152664, + "learning_rate": 0.0004777777777777778, + "loss": 0.0033, + "step": 785 + }, + { + "epoch": 786.0, + "grad_norm": 0.016884582117199898, + "learning_rate": 0.00047555555555555556, + "loss": 0.003, + "step": 786 + }, + { + "epoch": 787.0, + "grad_norm": 0.023125050589442253, + "learning_rate": 0.00047333333333333336, + "loss": 0.0031, + "step": 787 + }, + { + "epoch": 788.0, + "grad_norm": 0.015507596544921398, + "learning_rate": 0.0004711111111111111, + "loss": 0.003, + "step": 788 + }, + { + "epoch": 789.0, + "grad_norm": 0.02069436013698578, + "learning_rate": 0.0004688888888888889, + "loss": 0.0032, + "step": 789 + }, + { + "epoch": 790.0, + "grad_norm": 0.022422535344958305, + "learning_rate": 0.00046666666666666666, + "loss": 0.0032, + "step": 790 + }, + { + "epoch": 791.0, + "grad_norm": 0.02150949463248253, + "learning_rate": 0.00046444444444444446, + "loss": 0.003, + "step": 791 + }, + { + "epoch": 792.0, + "grad_norm": 0.03762350231409073, + "learning_rate": 0.0004622222222222222, + "loss": 0.0034, + "step": 792 + }, + { + "epoch": 793.0, + "grad_norm": 0.018060874193906784, + "learning_rate": 0.00046, + "loss": 0.003, + "step": 793 + }, + { + "epoch": 794.0, + "grad_norm": 0.023126404732465744, + "learning_rate": 0.0004577777777777778, + "loss": 0.003, + "step": 794 + }, + { + "epoch": 795.0, + "grad_norm": 0.024747442454099655, + "learning_rate": 0.00045555555555555556, + "loss": 0.0031, + "step": 795 + }, + { + "epoch": 796.0, + "grad_norm": 0.03307885304093361, + "learning_rate": 0.0004533333333333333, + "loss": 0.003, + "step": 796 + }, + { + "epoch": 797.0, + "grad_norm": 0.025574902072548866, + "learning_rate": 0.0004511111111111111, + "loss": 0.003, + "step": 797 + }, + { + "epoch": 798.0, + "grad_norm": 0.018072202801704407, + "learning_rate": 0.0004488888888888889, + "loss": 0.003, + "step": 798 + }, + { + "epoch": 799.0, + "grad_norm": 0.02339911088347435, + "learning_rate": 0.00044666666666666666, + "loss": 0.0031, + "step": 799 + }, + { + "epoch": 800.0, + "grad_norm": 0.02469002641737461, + "learning_rate": 0.0004444444444444444, + "loss": 0.003, + "step": 800 + }, + { + "epoch": 801.0, + "grad_norm": 0.023919183760881424, + "learning_rate": 0.00044222222222222227, + "loss": 0.0034, + "step": 801 + }, + { + "epoch": 802.0, + "grad_norm": 0.018128234893083572, + "learning_rate": 0.00044, + "loss": 0.0028, + "step": 802 + }, + { + "epoch": 803.0, + "grad_norm": 0.024188602343201637, + "learning_rate": 0.00043777777777777776, + "loss": 0.0031, + "step": 803 + }, + { + "epoch": 804.0, + "grad_norm": 0.02027260698378086, + "learning_rate": 0.0004355555555555555, + "loss": 0.0032, + "step": 804 + }, + { + "epoch": 805.0, + "grad_norm": 0.019797317683696747, + "learning_rate": 0.00043333333333333337, + "loss": 0.003, + "step": 805 + }, + { + "epoch": 806.0, + "grad_norm": 0.027181904762983322, + "learning_rate": 0.0004311111111111111, + "loss": 0.0033, + "step": 806 + }, + { + "epoch": 807.0, + "grad_norm": 0.03041798062622547, + "learning_rate": 0.00042888888888888886, + "loss": 0.0031, + "step": 807 + }, + { + "epoch": 808.0, + "grad_norm": 0.025036826729774475, + "learning_rate": 0.0004266666666666667, + "loss": 0.003, + "step": 808 + }, + { + "epoch": 809.0, + "grad_norm": 0.02821156196296215, + "learning_rate": 0.00042444444444444447, + "loss": 0.003, + "step": 809 + }, + { + "epoch": 810.0, + "grad_norm": 0.01684625819325447, + "learning_rate": 0.0004222222222222222, + "loss": 0.0028, + "step": 810 + }, + { + "epoch": 811.0, + "grad_norm": 0.014117077924311161, + "learning_rate": 0.00042, + "loss": 0.0027, + "step": 811 + }, + { + "epoch": 812.0, + "grad_norm": 0.028561661019921303, + "learning_rate": 0.0004177777777777778, + "loss": 0.003, + "step": 812 + }, + { + "epoch": 813.0, + "grad_norm": 0.022630490362644196, + "learning_rate": 0.00041555555555555557, + "loss": 0.003, + "step": 813 + }, + { + "epoch": 814.0, + "grad_norm": 0.031984347850084305, + "learning_rate": 0.0004133333333333333, + "loss": 0.0033, + "step": 814 + }, + { + "epoch": 815.0, + "grad_norm": 0.02748434990644455, + "learning_rate": 0.0004111111111111111, + "loss": 0.0031, + "step": 815 + }, + { + "epoch": 816.0, + "grad_norm": 0.02543744631111622, + "learning_rate": 0.0004088888888888889, + "loss": 0.003, + "step": 816 + }, + { + "epoch": 817.0, + "grad_norm": 0.020958127453923225, + "learning_rate": 0.00040666666666666667, + "loss": 0.003, + "step": 817 + }, + { + "epoch": 818.0, + "grad_norm": 0.029978320002555847, + "learning_rate": 0.00040444444444444447, + "loss": 0.0033, + "step": 818 + }, + { + "epoch": 819.0, + "grad_norm": 0.03185059130191803, + "learning_rate": 0.0004022222222222222, + "loss": 0.0033, + "step": 819 + }, + { + "epoch": 820.0, + "grad_norm": 0.015984434634447098, + "learning_rate": 0.0004, + "loss": 0.0029, + "step": 820 + }, + { + "epoch": 821.0, + "grad_norm": 0.016744885593652725, + "learning_rate": 0.00039777777777777777, + "loss": 0.0028, + "step": 821 + }, + { + "epoch": 822.0, + "grad_norm": 0.02187785878777504, + "learning_rate": 0.00039555555555555557, + "loss": 0.0029, + "step": 822 + }, + { + "epoch": 823.0, + "grad_norm": 0.013766797259449959, + "learning_rate": 0.0003933333333333333, + "loss": 0.0027, + "step": 823 + }, + { + "epoch": 824.0, + "grad_norm": 0.04106425866484642, + "learning_rate": 0.0003911111111111111, + "loss": 0.0034, + "step": 824 + }, + { + "epoch": 825.0, + "grad_norm": 0.03544626384973526, + "learning_rate": 0.0003888888888888889, + "loss": 0.003, + "step": 825 + }, + { + "epoch": 826.0, + "grad_norm": 0.023083612322807312, + "learning_rate": 0.00038666666666666667, + "loss": 0.0031, + "step": 826 + }, + { + "epoch": 827.0, + "grad_norm": 0.01776996999979019, + "learning_rate": 0.0003844444444444444, + "loss": 0.0029, + "step": 827 + }, + { + "epoch": 828.0, + "grad_norm": 0.029645999893546104, + "learning_rate": 0.0003822222222222223, + "loss": 0.0032, + "step": 828 + }, + { + "epoch": 829.0, + "grad_norm": 0.024389177560806274, + "learning_rate": 0.00038, + "loss": 0.003, + "step": 829 + }, + { + "epoch": 830.0, + "grad_norm": 0.03170039877295494, + "learning_rate": 0.00037777777777777777, + "loss": 0.0033, + "step": 830 + }, + { + "epoch": 831.0, + "grad_norm": 0.020817887037992477, + "learning_rate": 0.0003755555555555555, + "loss": 0.0032, + "step": 831 + }, + { + "epoch": 832.0, + "grad_norm": 0.01924346759915352, + "learning_rate": 0.0003733333333333334, + "loss": 0.0028, + "step": 832 + }, + { + "epoch": 833.0, + "grad_norm": 0.01933015137910843, + "learning_rate": 0.0003711111111111111, + "loss": 0.0029, + "step": 833 + }, + { + "epoch": 834.0, + "grad_norm": 0.02916400507092476, + "learning_rate": 0.00036888888888888887, + "loss": 0.0031, + "step": 834 + }, + { + "epoch": 835.0, + "grad_norm": 0.020619528368115425, + "learning_rate": 0.00036666666666666667, + "loss": 0.003, + "step": 835 + }, + { + "epoch": 836.0, + "grad_norm": 0.025901442393660545, + "learning_rate": 0.00036444444444444447, + "loss": 0.0032, + "step": 836 + }, + { + "epoch": 837.0, + "grad_norm": 0.02749483659863472, + "learning_rate": 0.0003622222222222222, + "loss": 0.0031, + "step": 837 + }, + { + "epoch": 838.0, + "grad_norm": 0.01978285051882267, + "learning_rate": 0.00035999999999999997, + "loss": 0.0028, + "step": 838 + }, + { + "epoch": 839.0, + "grad_norm": 0.023931678384542465, + "learning_rate": 0.00035777777777777777, + "loss": 0.0029, + "step": 839 + }, + { + "epoch": 840.0, + "grad_norm": 0.016439393162727356, + "learning_rate": 0.00035555555555555557, + "loss": 0.0028, + "step": 840 + }, + { + "epoch": 841.0, + "grad_norm": 0.024643810465931892, + "learning_rate": 0.0003533333333333333, + "loss": 0.0029, + "step": 841 + }, + { + "epoch": 842.0, + "grad_norm": 0.025052694603800774, + "learning_rate": 0.0003511111111111111, + "loss": 0.0029, + "step": 842 + }, + { + "epoch": 843.0, + "grad_norm": 0.02013804018497467, + "learning_rate": 0.0003488888888888889, + "loss": 0.0029, + "step": 843 + }, + { + "epoch": 844.0, + "grad_norm": 0.019899819046258926, + "learning_rate": 0.00034666666666666667, + "loss": 0.0029, + "step": 844 + }, + { + "epoch": 845.0, + "grad_norm": 0.020912861451506615, + "learning_rate": 0.0003444444444444445, + "loss": 0.0029, + "step": 845 + }, + { + "epoch": 846.0, + "grad_norm": 0.015246815979480743, + "learning_rate": 0.0003422222222222222, + "loss": 0.0028, + "step": 846 + }, + { + "epoch": 847.0, + "grad_norm": 0.026435496285557747, + "learning_rate": 0.00034, + "loss": 0.0032, + "step": 847 + }, + { + "epoch": 848.0, + "grad_norm": 0.019885210320353508, + "learning_rate": 0.00033777777777777777, + "loss": 0.0028, + "step": 848 + }, + { + "epoch": 849.0, + "grad_norm": 0.020924292504787445, + "learning_rate": 0.0003355555555555556, + "loss": 0.0029, + "step": 849 + }, + { + "epoch": 850.0, + "grad_norm": 0.019253870472311974, + "learning_rate": 0.0003333333333333333, + "loss": 0.0029, + "step": 850 + }, + { + "epoch": 851.0, + "grad_norm": 0.019240371882915497, + "learning_rate": 0.0003311111111111111, + "loss": 0.0028, + "step": 851 + }, + { + "epoch": 852.0, + "grad_norm": 0.021707167848944664, + "learning_rate": 0.0003288888888888889, + "loss": 0.003, + "step": 852 + }, + { + "epoch": 853.0, + "grad_norm": 0.020251473411917686, + "learning_rate": 0.0003266666666666667, + "loss": 0.003, + "step": 853 + }, + { + "epoch": 854.0, + "grad_norm": 0.020360205322504044, + "learning_rate": 0.0003244444444444444, + "loss": 0.0028, + "step": 854 + }, + { + "epoch": 855.0, + "grad_norm": 0.019384529441595078, + "learning_rate": 0.0003222222222222222, + "loss": 0.0029, + "step": 855 + }, + { + "epoch": 856.0, + "grad_norm": 0.02220081351697445, + "learning_rate": 0.00032, + "loss": 0.0032, + "step": 856 + }, + { + "epoch": 857.0, + "grad_norm": 0.02192023955285549, + "learning_rate": 0.0003177777777777778, + "loss": 0.0029, + "step": 857 + }, + { + "epoch": 858.0, + "grad_norm": 0.020040197297930717, + "learning_rate": 0.0003155555555555555, + "loss": 0.0029, + "step": 858 + }, + { + "epoch": 859.0, + "grad_norm": 0.022478275001049042, + "learning_rate": 0.0003133333333333334, + "loss": 0.0031, + "step": 859 + }, + { + "epoch": 860.0, + "grad_norm": 0.017191395163536072, + "learning_rate": 0.0003111111111111111, + "loss": 0.0027, + "step": 860 + }, + { + "epoch": 861.0, + "grad_norm": 0.025374887511134148, + "learning_rate": 0.0003088888888888889, + "loss": 0.0031, + "step": 861 + }, + { + "epoch": 862.0, + "grad_norm": 0.01610608585178852, + "learning_rate": 0.0003066666666666667, + "loss": 0.0027, + "step": 862 + }, + { + "epoch": 863.0, + "grad_norm": 0.019191846251487732, + "learning_rate": 0.0003044444444444445, + "loss": 0.0027, + "step": 863 + }, + { + "epoch": 864.0, + "grad_norm": 0.021266650408506393, + "learning_rate": 0.0003022222222222222, + "loss": 0.0029, + "step": 864 + }, + { + "epoch": 865.0, + "grad_norm": 0.024204988032579422, + "learning_rate": 0.0003, + "loss": 0.003, + "step": 865 + }, + { + "epoch": 866.0, + "grad_norm": 0.018310649320483208, + "learning_rate": 0.0002977777777777778, + "loss": 0.0027, + "step": 866 + }, + { + "epoch": 867.0, + "grad_norm": 0.017516782507300377, + "learning_rate": 0.0002955555555555556, + "loss": 0.0029, + "step": 867 + }, + { + "epoch": 868.0, + "grad_norm": 0.021248290315270424, + "learning_rate": 0.0002933333333333333, + "loss": 0.0029, + "step": 868 + }, + { + "epoch": 869.0, + "grad_norm": 0.01870272122323513, + "learning_rate": 0.00029111111111111113, + "loss": 0.0028, + "step": 869 + }, + { + "epoch": 870.0, + "grad_norm": 0.02665873058140278, + "learning_rate": 0.0002888888888888889, + "loss": 0.003, + "step": 870 + }, + { + "epoch": 871.0, + "grad_norm": 0.02477414719760418, + "learning_rate": 0.0002866666666666667, + "loss": 0.003, + "step": 871 + }, + { + "epoch": 872.0, + "grad_norm": 0.023296542465686798, + "learning_rate": 0.0002844444444444444, + "loss": 0.0031, + "step": 872 + }, + { + "epoch": 873.0, + "grad_norm": 0.03521310165524483, + "learning_rate": 0.00028222222222222223, + "loss": 0.0029, + "step": 873 + }, + { + "epoch": 874.0, + "grad_norm": 0.020849304273724556, + "learning_rate": 0.00028000000000000003, + "loss": 0.0028, + "step": 874 + }, + { + "epoch": 875.0, + "grad_norm": 0.023307524621486664, + "learning_rate": 0.0002777777777777778, + "loss": 0.003, + "step": 875 + }, + { + "epoch": 876.0, + "grad_norm": 0.01824437826871872, + "learning_rate": 0.0002755555555555556, + "loss": 0.0025, + "step": 876 + }, + { + "epoch": 877.0, + "grad_norm": 0.02158845216035843, + "learning_rate": 0.00027333333333333333, + "loss": 0.0029, + "step": 877 + }, + { + "epoch": 878.0, + "grad_norm": 0.02377997152507305, + "learning_rate": 0.00027111111111111113, + "loss": 0.0029, + "step": 878 + }, + { + "epoch": 879.0, + "grad_norm": 0.024584239348769188, + "learning_rate": 0.00026888888888888893, + "loss": 0.0031, + "step": 879 + }, + { + "epoch": 880.0, + "grad_norm": 0.016272418200969696, + "learning_rate": 0.0002666666666666667, + "loss": 0.0027, + "step": 880 + }, + { + "epoch": 881.0, + "grad_norm": 0.01953684352338314, + "learning_rate": 0.00026444444444444443, + "loss": 0.0027, + "step": 881 + }, + { + "epoch": 882.0, + "grad_norm": 0.023698432371020317, + "learning_rate": 0.00026222222222222223, + "loss": 0.003, + "step": 882 + }, + { + "epoch": 883.0, + "grad_norm": 0.021787166595458984, + "learning_rate": 0.00026000000000000003, + "loss": 0.003, + "step": 883 + }, + { + "epoch": 884.0, + "grad_norm": 0.026674091815948486, + "learning_rate": 0.0002577777777777778, + "loss": 0.0027, + "step": 884 + }, + { + "epoch": 885.0, + "grad_norm": 0.02600809372961521, + "learning_rate": 0.00025555555555555553, + "loss": 0.0028, + "step": 885 + }, + { + "epoch": 886.0, + "grad_norm": 0.0214143767952919, + "learning_rate": 0.0002533333333333334, + "loss": 0.0027, + "step": 886 + }, + { + "epoch": 887.0, + "grad_norm": 0.01773553155362606, + "learning_rate": 0.00025111111111111113, + "loss": 0.0028, + "step": 887 + }, + { + "epoch": 888.0, + "grad_norm": 0.016479508951306343, + "learning_rate": 0.0002488888888888889, + "loss": 0.0027, + "step": 888 + }, + { + "epoch": 889.0, + "grad_norm": 0.026842381805181503, + "learning_rate": 0.0002466666666666667, + "loss": 0.0031, + "step": 889 + }, + { + "epoch": 890.0, + "grad_norm": 0.029731744900345802, + "learning_rate": 0.00024444444444444443, + "loss": 0.0028, + "step": 890 + }, + { + "epoch": 891.0, + "grad_norm": 0.018305297940969467, + "learning_rate": 0.00024222222222222223, + "loss": 0.0028, + "step": 891 + }, + { + "epoch": 892.0, + "grad_norm": 0.020879078656435013, + "learning_rate": 0.00024, + "loss": 0.0027, + "step": 892 + }, + { + "epoch": 893.0, + "grad_norm": 0.025136977434158325, + "learning_rate": 0.00023777777777777778, + "loss": 0.0027, + "step": 893 + }, + { + "epoch": 894.0, + "grad_norm": 0.02135421149432659, + "learning_rate": 0.00023555555555555556, + "loss": 0.0028, + "step": 894 + }, + { + "epoch": 895.0, + "grad_norm": 0.018956074491143227, + "learning_rate": 0.00023333333333333333, + "loss": 0.0026, + "step": 895 + }, + { + "epoch": 896.0, + "grad_norm": 0.024466995149850845, + "learning_rate": 0.0002311111111111111, + "loss": 0.0028, + "step": 896 + }, + { + "epoch": 897.0, + "grad_norm": 0.024282222613692284, + "learning_rate": 0.0002288888888888889, + "loss": 0.0028, + "step": 897 + }, + { + "epoch": 898.0, + "grad_norm": 0.02457410655915737, + "learning_rate": 0.00022666666666666666, + "loss": 0.0028, + "step": 898 + }, + { + "epoch": 899.0, + "grad_norm": 0.025316430255770683, + "learning_rate": 0.00022444444444444446, + "loss": 0.0028, + "step": 899 + }, + { + "epoch": 900.0, + "grad_norm": 0.015992436558008194, + "learning_rate": 0.0002222222222222222, + "loss": 0.0027, + "step": 900 + }, + { + "epoch": 901.0, + "grad_norm": 0.01664648950099945, + "learning_rate": 0.00022, + "loss": 0.0026, + "step": 901 + }, + { + "epoch": 902.0, + "grad_norm": 0.019682608544826508, + "learning_rate": 0.00021777777777777776, + "loss": 0.0027, + "step": 902 + }, + { + "epoch": 903.0, + "grad_norm": 0.024491267278790474, + "learning_rate": 0.00021555555555555556, + "loss": 0.0031, + "step": 903 + }, + { + "epoch": 904.0, + "grad_norm": 0.024832140654325485, + "learning_rate": 0.00021333333333333336, + "loss": 0.0027, + "step": 904 + }, + { + "epoch": 905.0, + "grad_norm": 0.019126785919070244, + "learning_rate": 0.0002111111111111111, + "loss": 0.0027, + "step": 905 + }, + { + "epoch": 906.0, + "grad_norm": 0.024714525789022446, + "learning_rate": 0.0002088888888888889, + "loss": 0.003, + "step": 906 + }, + { + "epoch": 907.0, + "grad_norm": 0.018799038603901863, + "learning_rate": 0.00020666666666666666, + "loss": 0.0025, + "step": 907 + }, + { + "epoch": 908.0, + "grad_norm": 0.014316687360405922, + "learning_rate": 0.00020444444444444446, + "loss": 0.0025, + "step": 908 + }, + { + "epoch": 909.0, + "grad_norm": 0.0304707158356905, + "learning_rate": 0.00020222222222222223, + "loss": 0.0028, + "step": 909 + }, + { + "epoch": 910.0, + "grad_norm": 0.018442662432789803, + "learning_rate": 0.0002, + "loss": 0.0026, + "step": 910 + }, + { + "epoch": 911.0, + "grad_norm": 0.02719203568994999, + "learning_rate": 0.00019777777777777778, + "loss": 0.003, + "step": 911 + }, + { + "epoch": 912.0, + "grad_norm": 0.02310093119740486, + "learning_rate": 0.00019555555555555556, + "loss": 0.0028, + "step": 912 + }, + { + "epoch": 913.0, + "grad_norm": 0.017705217003822327, + "learning_rate": 0.00019333333333333333, + "loss": 0.0027, + "step": 913 + }, + { + "epoch": 914.0, + "grad_norm": 0.017214365303516388, + "learning_rate": 0.00019111111111111114, + "loss": 0.0026, + "step": 914 + }, + { + "epoch": 915.0, + "grad_norm": 0.020950743928551674, + "learning_rate": 0.00018888888888888888, + "loss": 0.0027, + "step": 915 + }, + { + "epoch": 916.0, + "grad_norm": 0.018532825633883476, + "learning_rate": 0.0001866666666666667, + "loss": 0.0026, + "step": 916 + }, + { + "epoch": 917.0, + "grad_norm": 0.014188375324010849, + "learning_rate": 0.00018444444444444443, + "loss": 0.0028, + "step": 917 + }, + { + "epoch": 918.0, + "grad_norm": 0.025212949141860008, + "learning_rate": 0.00018222222222222224, + "loss": 0.003, + "step": 918 + }, + { + "epoch": 919.0, + "grad_norm": 0.02256765589118004, + "learning_rate": 0.00017999999999999998, + "loss": 0.0027, + "step": 919 + }, + { + "epoch": 920.0, + "grad_norm": 0.015511687844991684, + "learning_rate": 0.00017777777777777779, + "loss": 0.0027, + "step": 920 + }, + { + "epoch": 921.0, + "grad_norm": 0.02182592637836933, + "learning_rate": 0.00017555555555555556, + "loss": 0.0029, + "step": 921 + }, + { + "epoch": 922.0, + "grad_norm": 0.016262182965874672, + "learning_rate": 0.00017333333333333334, + "loss": 0.0026, + "step": 922 + }, + { + "epoch": 923.0, + "grad_norm": 0.0173965897411108, + "learning_rate": 0.0001711111111111111, + "loss": 0.0027, + "step": 923 + }, + { + "epoch": 924.0, + "grad_norm": 0.022845404222607613, + "learning_rate": 0.00016888888888888889, + "loss": 0.0028, + "step": 924 + }, + { + "epoch": 925.0, + "grad_norm": 0.019500279799103737, + "learning_rate": 0.00016666666666666666, + "loss": 0.0025, + "step": 925 + }, + { + "epoch": 926.0, + "grad_norm": 0.033996641635894775, + "learning_rate": 0.00016444444444444446, + "loss": 0.003, + "step": 926 + }, + { + "epoch": 927.0, + "grad_norm": 0.02097196690738201, + "learning_rate": 0.0001622222222222222, + "loss": 0.0027, + "step": 927 + }, + { + "epoch": 928.0, + "grad_norm": 0.027539506554603577, + "learning_rate": 0.00016, + "loss": 0.0033, + "step": 928 + }, + { + "epoch": 929.0, + "grad_norm": 0.015689266845583916, + "learning_rate": 0.00015777777777777776, + "loss": 0.0026, + "step": 929 + }, + { + "epoch": 930.0, + "grad_norm": 0.020680051296949387, + "learning_rate": 0.00015555555555555556, + "loss": 0.0028, + "step": 930 + }, + { + "epoch": 931.0, + "grad_norm": 0.02494923025369644, + "learning_rate": 0.00015333333333333334, + "loss": 0.0029, + "step": 931 + }, + { + "epoch": 932.0, + "grad_norm": 0.028578734025359154, + "learning_rate": 0.0001511111111111111, + "loss": 0.003, + "step": 932 + }, + { + "epoch": 933.0, + "grad_norm": 0.029307426884770393, + "learning_rate": 0.0001488888888888889, + "loss": 0.0029, + "step": 933 + }, + { + "epoch": 934.0, + "grad_norm": 0.02381393313407898, + "learning_rate": 0.00014666666666666666, + "loss": 0.0028, + "step": 934 + }, + { + "epoch": 935.0, + "grad_norm": 0.013117119669914246, + "learning_rate": 0.00014444444444444444, + "loss": 0.0026, + "step": 935 + }, + { + "epoch": 936.0, + "grad_norm": 0.028397388756275177, + "learning_rate": 0.0001422222222222222, + "loss": 0.0027, + "step": 936 + }, + { + "epoch": 937.0, + "grad_norm": 0.021766725927591324, + "learning_rate": 0.00014000000000000001, + "loss": 0.0028, + "step": 937 + }, + { + "epoch": 938.0, + "grad_norm": 0.019310174509882927, + "learning_rate": 0.0001377777777777778, + "loss": 0.0027, + "step": 938 + }, + { + "epoch": 939.0, + "grad_norm": 0.0176254715770483, + "learning_rate": 0.00013555555555555556, + "loss": 0.0027, + "step": 939 + }, + { + "epoch": 940.0, + "grad_norm": 0.026609305292367935, + "learning_rate": 0.00013333333333333334, + "loss": 0.0029, + "step": 940 + }, + { + "epoch": 941.0, + "grad_norm": 0.01736409030854702, + "learning_rate": 0.00013111111111111111, + "loss": 0.0027, + "step": 941 + }, + { + "epoch": 942.0, + "grad_norm": 0.020236855372786522, + "learning_rate": 0.0001288888888888889, + "loss": 0.0028, + "step": 942 + }, + { + "epoch": 943.0, + "grad_norm": 0.01565195992588997, + "learning_rate": 0.0001266666666666667, + "loss": 0.0028, + "step": 943 + }, + { + "epoch": 944.0, + "grad_norm": 0.02295234240591526, + "learning_rate": 0.00012444444444444444, + "loss": 0.0028, + "step": 944 + }, + { + "epoch": 945.0, + "grad_norm": 0.016273394227027893, + "learning_rate": 0.00012222222222222221, + "loss": 0.0025, + "step": 945 + }, + { + "epoch": 946.0, + "grad_norm": 0.021817076951265335, + "learning_rate": 0.00012, + "loss": 0.0028, + "step": 946 + }, + { + "epoch": 947.0, + "grad_norm": 0.02048509754240513, + "learning_rate": 0.00011777777777777778, + "loss": 0.0026, + "step": 947 + }, + { + "epoch": 948.0, + "grad_norm": 0.024927016347646713, + "learning_rate": 0.00011555555555555555, + "loss": 0.0028, + "step": 948 + }, + { + "epoch": 949.0, + "grad_norm": 0.014938845299184322, + "learning_rate": 0.00011333333333333333, + "loss": 0.0026, + "step": 949 + }, + { + "epoch": 950.0, + "grad_norm": 0.018117714673280716, + "learning_rate": 0.0001111111111111111, + "loss": 0.0027, + "step": 950 + }, + { + "epoch": 951.0, + "grad_norm": 0.020745469257235527, + "learning_rate": 0.00010888888888888888, + "loss": 0.0027, + "step": 951 + }, + { + "epoch": 952.0, + "grad_norm": 0.013773414306342602, + "learning_rate": 0.00010666666666666668, + "loss": 0.0025, + "step": 952 + }, + { + "epoch": 953.0, + "grad_norm": 0.023852935060858727, + "learning_rate": 0.00010444444444444445, + "loss": 0.0028, + "step": 953 + }, + { + "epoch": 954.0, + "grad_norm": 0.023745089769363403, + "learning_rate": 0.00010222222222222223, + "loss": 0.0028, + "step": 954 + }, + { + "epoch": 955.0, + "grad_norm": 0.027273166924715042, + "learning_rate": 0.0001, + "loss": 0.003, + "step": 955 + }, + { + "epoch": 956.0, + "grad_norm": 0.020344140008091927, + "learning_rate": 9.777777777777778e-05, + "loss": 0.003, + "step": 956 + }, + { + "epoch": 957.0, + "grad_norm": 0.017448484897613525, + "learning_rate": 9.555555555555557e-05, + "loss": 0.0027, + "step": 957 + }, + { + "epoch": 958.0, + "grad_norm": 0.016183484345674515, + "learning_rate": 9.333333333333334e-05, + "loss": 0.0026, + "step": 958 + }, + { + "epoch": 959.0, + "grad_norm": 0.01649181731045246, + "learning_rate": 9.111111111111112e-05, + "loss": 0.0027, + "step": 959 + }, + { + "epoch": 960.0, + "grad_norm": 0.024920279160141945, + "learning_rate": 8.888888888888889e-05, + "loss": 0.0027, + "step": 960 + }, + { + "epoch": 961.0, + "grad_norm": 0.013914654962718487, + "learning_rate": 8.666666666666667e-05, + "loss": 0.0024, + "step": 961 + }, + { + "epoch": 962.0, + "grad_norm": 0.021497434005141258, + "learning_rate": 8.444444444444444e-05, + "loss": 0.0026, + "step": 962 + }, + { + "epoch": 963.0, + "grad_norm": 0.019979996606707573, + "learning_rate": 8.222222222222223e-05, + "loss": 0.0029, + "step": 963 + }, + { + "epoch": 964.0, + "grad_norm": 0.02189183235168457, + "learning_rate": 8e-05, + "loss": 0.0028, + "step": 964 + }, + { + "epoch": 965.0, + "grad_norm": 0.015944723039865494, + "learning_rate": 7.777777777777778e-05, + "loss": 0.0026, + "step": 965 + }, + { + "epoch": 966.0, + "grad_norm": 0.01600065268576145, + "learning_rate": 7.555555555555556e-05, + "loss": 0.0025, + "step": 966 + }, + { + "epoch": 967.0, + "grad_norm": 0.020630113780498505, + "learning_rate": 7.333333333333333e-05, + "loss": 0.0027, + "step": 967 + }, + { + "epoch": 968.0, + "grad_norm": 0.014975383877754211, + "learning_rate": 7.11111111111111e-05, + "loss": 0.0025, + "step": 968 + }, + { + "epoch": 969.0, + "grad_norm": 0.016374630853533745, + "learning_rate": 6.88888888888889e-05, + "loss": 0.0026, + "step": 969 + }, + { + "epoch": 970.0, + "grad_norm": 0.019182894378900528, + "learning_rate": 6.666666666666667e-05, + "loss": 0.0027, + "step": 970 + }, + { + "epoch": 971.0, + "grad_norm": 0.024381978437304497, + "learning_rate": 6.444444444444444e-05, + "loss": 0.0028, + "step": 971 + }, + { + "epoch": 972.0, + "grad_norm": 0.019862636923789978, + "learning_rate": 6.222222222222222e-05, + "loss": 0.0026, + "step": 972 + }, + { + "epoch": 973.0, + "grad_norm": 0.019189875572919846, + "learning_rate": 6e-05, + "loss": 0.0025, + "step": 973 + }, + { + "epoch": 974.0, + "grad_norm": 0.012594843283295631, + "learning_rate": 5.7777777777777776e-05, + "loss": 0.0024, + "step": 974 + }, + { + "epoch": 975.0, + "grad_norm": 0.01766464300453663, + "learning_rate": 5.555555555555555e-05, + "loss": 0.0025, + "step": 975 + }, + { + "epoch": 976.0, + "grad_norm": 0.016625959426164627, + "learning_rate": 5.333333333333334e-05, + "loss": 0.0025, + "step": 976 + }, + { + "epoch": 977.0, + "grad_norm": 0.01728684827685356, + "learning_rate": 5.1111111111111115e-05, + "loss": 0.0027, + "step": 977 + }, + { + "epoch": 978.0, + "grad_norm": 0.017300087958574295, + "learning_rate": 4.888888888888889e-05, + "loss": 0.0026, + "step": 978 + }, + { + "epoch": 979.0, + "grad_norm": 0.024032112210989, + "learning_rate": 4.666666666666667e-05, + "loss": 0.0028, + "step": 979 + }, + { + "epoch": 980.0, + "grad_norm": 0.018220216035842896, + "learning_rate": 4.4444444444444447e-05, + "loss": 0.0026, + "step": 980 + }, + { + "epoch": 981.0, + "grad_norm": 0.015681209042668343, + "learning_rate": 4.222222222222222e-05, + "loss": 0.0025, + "step": 981 + }, + { + "epoch": 982.0, + "grad_norm": 0.0216491911560297, + "learning_rate": 4e-05, + "loss": 0.0029, + "step": 982 + }, + { + "epoch": 983.0, + "grad_norm": 0.021082593128085136, + "learning_rate": 3.777777777777778e-05, + "loss": 0.0025, + "step": 983 + }, + { + "epoch": 984.0, + "grad_norm": 0.02241634391248226, + "learning_rate": 3.555555555555555e-05, + "loss": 0.0026, + "step": 984 + }, + { + "epoch": 985.0, + "grad_norm": 0.018041379749774933, + "learning_rate": 3.3333333333333335e-05, + "loss": 0.0025, + "step": 985 + }, + { + "epoch": 986.0, + "grad_norm": 0.01738720014691353, + "learning_rate": 3.111111111111111e-05, + "loss": 0.0025, + "step": 986 + }, + { + "epoch": 987.0, + "grad_norm": 0.02450176514685154, + "learning_rate": 2.8888888888888888e-05, + "loss": 0.0026, + "step": 987 + }, + { + "epoch": 988.0, + "grad_norm": 0.016677524894475937, + "learning_rate": 2.666666666666667e-05, + "loss": 0.0025, + "step": 988 + }, + { + "epoch": 989.0, + "grad_norm": 0.014843937940895557, + "learning_rate": 2.4444444444444445e-05, + "loss": 0.0025, + "step": 989 + }, + { + "epoch": 990.0, + "grad_norm": 0.017436960712075233, + "learning_rate": 2.2222222222222223e-05, + "loss": 0.0027, + "step": 990 + }, + { + "epoch": 991.0, + "grad_norm": 0.02031978964805603, + "learning_rate": 2e-05, + "loss": 0.0026, + "step": 991 + }, + { + "epoch": 992.0, + "grad_norm": 0.018474267795681953, + "learning_rate": 1.7777777777777777e-05, + "loss": 0.0026, + "step": 992 + }, + { + "epoch": 993.0, + "grad_norm": 0.021300526335835457, + "learning_rate": 1.5555555555555555e-05, + "loss": 0.0026, + "step": 993 + }, + { + "epoch": 994.0, + "grad_norm": 0.0179448164999485, + "learning_rate": 1.3333333333333335e-05, + "loss": 0.0028, + "step": 994 + }, + { + "epoch": 995.0, + "grad_norm": 0.024742020294070244, + "learning_rate": 1.1111111111111112e-05, + "loss": 0.0028, + "step": 995 + }, + { + "epoch": 996.0, + "grad_norm": 0.02364485338330269, + "learning_rate": 8.888888888888888e-06, + "loss": 0.0028, + "step": 996 + }, + { + "epoch": 997.0, + "grad_norm": 0.021595612168312073, + "learning_rate": 6.6666666666666675e-06, + "loss": 0.0027, + "step": 997 + }, + { + "epoch": 998.0, + "grad_norm": 0.020215950906276703, + "learning_rate": 4.444444444444444e-06, + "loss": 0.0026, + "step": 998 + }, + { + "epoch": 999.0, + "grad_norm": 0.02067585475742817, + "learning_rate": 2.222222222222222e-06, + "loss": 0.0027, + "step": 999 + }, + { + "epoch": 1000.0, + "grad_norm": 0.01625109650194645, + "learning_rate": 0.0, + "loss": 0.0026, + "step": 1000 + } + ], + "logging_steps": 1, + "max_steps": 1000, + "num_input_tokens_seen": 0, + "num_train_epochs": 1000, + "save_steps": 500, + "total_flos": 5525183397888000.0, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +}